Source code for concept_formation.cobweb3

"""
The Cobweb3 module contains the :class:`Cobweb3Tree` and :class:`Cobweb3Node`
classes, which extend the traditional Cobweb capabilities to support numeric
values on attributes.
"""

from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from __future__ import division
from random import normalvariate
from math import sqrt
from math import pi
from math import exp
from math import log

from concept_formation.cobweb import CobwebNode
from concept_formation.cobweb import CobwebTree
from concept_formation.continuous_value import ContinuousValue
from concept_formation.utils import isNumber
from concept_formation.utils import weighted_choice
from concept_formation.utils import most_likely_choice

cv_key = "#ContinuousValue#"


[docs]class Cobweb3Tree(CobwebTree):
    """
    The Cobweb3Tree contains the knowledge base of a partiucluar instance of
    the Cobweb/3 algorithm and can be used to fit and categorize instances.
    Cobweb/3's main difference over Cobweb is the ability to handle numerical
    attributes by applying an assumption that they should follow a normal
    distribution. For the purposes of Cobweb/3's core algorithms a numeric
    attribute is any value where ``isinstance(instance[attr], Number)`` returns
    ``True``.

    The scaling parameter determines whether online normalization of continuous
    attributes is used, and to what standard deviation the values are scaled
    to. Scaling divides the std of each attribute by the std of the attribute
    in the root divided by the scaling constant (i.e., :math:`\\sigma_{root} /
    scaling` when making category utility calculations.  Scaling is useful to
    balance the weight of different numerical attributes, without scaling the
    magnitude of numerical attributes can affect category utility calculation
    meaning numbers that are naturally larger will recieve preference in the
    category utility calculation.

    :param scaling: The number of standard deviations numeric attributes
        are scaled to. By default this value is 0.5 (half a standard
        deviation), which is the max std of nominal values. If disabiling
        scaling is desirable, then it can be set to False or None.
    :type scaling: a float greater than 0.0, None, or False
    :param inner_attr_scaling: Whether to use the inner most attribute name
        when scaling numeric attributes. For example, if `('attr', '?o1')` was
        an attribute, then the inner most attribute would be 'attr'. When using
        inner most attributes, some objects might have multiple attributes
        (i.e., 'attr' for different objects) that contribute to the scaling.
    :param inner_attr_scaling: boolean
    """

    def __init__(self, scaling=0.5, inner_attr_scaling=True):
        """
        The tree constructor.
        """
        self.root = Cobweb3Node()
        self.root.tree = self
        self.scaling = scaling
        self.inner_attr_scaling = inner_attr_scaling
        self.attr_scales = {}

[docs]    def clear(self):
        """
        Clears the concepts of the tree, but maintains the scaling parameter.
        """
        self.root = Cobweb3Node()
        self.root.tree = self
        self.attr_scales = {}

[docs]    def get_inner_attr(self, attr):
        """
        Extracts the inner most attribute name from the provided attribute, if
        the attribute is a tuple and inner_attr_scaling is on. Otherwise it
        just returns the attribute. This is used to for normalizing attributes.

        >>> t = Cobweb3Tree()
        >>> t.get_inner_attr(('a', '?object1'))
        'a'
        >>> t.get_inner_attr('a')
        'a'
        """
        if isinstance(attr, tuple) and self.inner_attr_scaling:
            return attr[0]
        else:
            return attr

[docs]    def update_scales(self, instance):
        """
        Reads through all the attributes in an instance and updates the
        tree scales object so that the attributes can be properly scaled.
        """
        for attr in instance:
            if isNumber(instance[attr]):
                inner_attr = self.get_inner_attr(attr)
                if inner_attr not in self.attr_scales:
                    self.attr_scales[inner_attr] = ContinuousValue()
                self.attr_scales[inner_attr].update(instance[attr])

[docs]    def cobweb(self, instance):
        """
        A modification of the cobweb function to update the scales object
        first, so that attribute values can be properly scaled.
        """
        self.update_scales(instance)
        return super(Cobweb3Tree, self).cobweb(instance)

[docs]    def ifit(self, instance):
        """
        Incrementally fit a new instance into the tree and return its resulting
        concept.

        The cobweb3 version of the :meth:`CobwebTree.ifit` function. This
        version keeps track of all of the continuous

        :param instance: An instance to be categorized into the tree.
        :type instance:  :ref:`Instance<instance-rep>`
        :return: A concept describing the instance
        :rtype: Cobweb3Node

        .. seealso:: :meth:`CobwebTree.cobweb`
        """
        self._sanity_check_instance(instance)
        return self.cobweb(instance)


[docs]class Cobweb3Node(CobwebNode):
    """
    A Cobweb3Node represents a concept within the knoweldge base of a
    particular :class:`Cobweb3Tree`. Each node contians a probability table
    that can be used to calculate the probability of different attributes given
    the concept that the node represents.

    In general the :meth:`Cobweb3Tree.ifit`, :meth:`Cobweb3Tree.categorize`
    functions should be used to initially interface with the Cobweb/3 knowledge
    base and then the returned concept can be used to calculate probabilities of
    certain attributes or determine concept labels.
    """

[docs]    def increment_counts(self, instance):
        """
        Increment the counts at the current node according to the specified
        instance.

        Cobweb3Node uses a modified version of
        :meth:`CobwebNode.increment_counts
        <concept_formation.cobweb.CobwebNode.increment_counts>` that handles
        numerical attributes properly. Any attribute value where
        ``isinstance(instance[attr], Number)`` returns ``True`` will be treated
        as a numerical attribute and included under an assumption that the
        number should follow a normal distribution.

        .. warning:: If a numeric attribute is found in an instance with the
            name of a previously nominal attribute, or vice versa, this
            function will raise an exception. See: :class:`NumericToNominal
            <concept_formation.preprocessor.NumericToNominal>` for a way to fix
            this error.
        
        :param instance: A new instances to incorporate into the node.
        :type instance: :ref:`Instance<instance-rep>`

        """
        self.count += 1 
            
        for attr in instance:
            self.av_counts[attr] = self.av_counts.setdefault(attr,{})

            if isNumber(instance[attr]):
                if cv_key not in self.av_counts[attr]:
                    self.av_counts[attr][cv_key] = ContinuousValue()
                self.av_counts[attr][cv_key].update(instance[attr])
            else:
                prior_count = self.av_counts[attr].get(instance[attr], 0)
                self.av_counts[attr][instance[attr]] = prior_count + 1

[docs]    def update_counts_from_node(self, node):
        """
        Increments the counts of the current node by the amount in the specified
        node, modified to handle numbers.

        .. warning:: If a numeric attribute is found in an instance with the
            name of a previously nominal attribute, or vice versa, this
            function will raise an exception. See: :class:`NumericToNominal
            <concept_formation.preprocessor.NumericToNominal>` for a way to fix
            this error.

        :param node: Another node from the same Cobweb3Tree
        :type node: Cobweb3Node
        """
        self.count += node.count
        for attr in node.attrs('all'):
            self.av_counts[attr] = self.av_counts.setdefault(attr, {})
            for val in node.av_counts[attr]:
                if val == cv_key:
                    self.av_counts[attr][val] = self.av_counts[attr].get(val, ContinuousValue())
                    self.av_counts[attr][val].combine(node.av_counts[attr][val])
                else:
                    self.av_counts[attr][val] = (self.av_counts[attr].get(val, 0) +
                                         node.av_counts[attr][val])

[docs]    def expected_correct_guesses(self):
        """
        Returns the number of attribute values that would be correctly guessed
        in the current concept. This extension supports both nominal and
        numeric attribute values. 
        
        The typical Cobweb/3 calculation for correct guesses is:

        .. math::

            P(A_i = V_{ij})^2 = \\frac{1}{2 * \\sqrt{\\pi} * \\sigma}

        However, this does not take into account situations when 
        :math:`P(A_i) < 1.0`. Additionally, the original formulation set
        :math:`\\sigma` to have a user specified minimum value. However, for
        small lower bounds, this lets cobweb achieve more than 1 expected
        correct guess per attribute, which is impossible for nominal attributes
        (and does not really make sense for continuous either). This causes
        problems when both nominal and continuous values are being used
        together; i.e., continuous attributes will get higher preference. 

        To account for this we use a modified equation:

        .. math::

            P(A_i = V_{ij})^2 = P(A_i)^2 * \\frac{1}{2 * \\sqrt{\\pi} * \\sigma}

        The key change here is that we multiply by :math:`P(A_i)^2`. 
        Further, instead of bounding :math:`\\sigma` by a user specified lower
        bound (often called acuity), we add some independent, normally
        distributed noise to sigma: 
        :math:`\\sigma = \\sqrt{\\sigma^2 + \\sigma_{noise}^2}`, where 
        :math:`\\sigma_{noise} = \\frac{1}{2 * \\sqrt{\\pi}}`. 
        This ensures the expected correct guesses never exceeds 1. From a
        theoretical point of view, it basically is an assumption that there is
        some independent, normally distributed measurement error that is added
        to the estimated error of the attribute (`<https://en.wikipedia.org/wi
        ki/Sum_of_normally_distributed_random_variables>`_). It is possible
        that there is additional measurement error, but the value is chosen so
        as to yield a sensical upper bound on the expected correct guesses.

        :return: The number of attribute values that would be correctly guessed
            in the current concept.
        :rtype: float
        """
        correct_guesses = 0.0
        attr_count = 0

        for attr in self.attrs():
            attr_count += 1

            for val in self.av_counts[attr]:
                if val == cv_key:
                    scale = 1.0
                    if self.tree is not None and self.tree.scaling:
                        inner_attr = self.tree.get_inner_attr(attr)
                        if inner_attr in self.tree.attr_scales:
                            scale = ((1/self.tree.scaling) *
                                     self.tree.attr_scales[inner_attr].unbiased_std())

                    # we basically add noise to the std and adjust the
                    # normalizing constant to ensure the probability of a
                    # particular value never exceeds 1.
                    cv = self.av_counts[attr][cv_key]
                    std = sqrt(cv.scaled_unbiased_std(scale) *
                               cv.scaled_unbiased_std(scale) +
                               (1 / (4 * pi)))
                    prob_attr = cv.num / self.count
                    correct_guesses += ((prob_attr * prob_attr) * 
                                        (1/(2 * sqrt(pi) * std)))
                else:
                    prob = (self.av_counts[attr][val]) / self.count
                    correct_guesses += (prob * prob)

        return correct_guesses / attr_count

[docs]    def pretty_print(self, depth=0):
        """
        Print the categorization tree

        The string formatting inserts tab characters to align child nodes of the
        same depth. Numerical values are printed with their means and standard
        deviations.
        
        :param depth: The current depth in the print, intended to be called recursively
        :type depth: int
        :return: a formated string displaying the tree and its children
        :rtype: str
        """
        ret = str(('\t' * depth) + "|-")

        attributes = []

        for attr in self.attrs('all'):
            values = []
            for val in self.av_counts[attr]:
                values.append("'" + str(val) + "': " +
                              str(self.av_counts[attr][val]))

            attributes.append("'" + str(attr) + "': {" + ", ".join(values)
                              + "}")
                  
        ret += "{" + ", ".join(attributes) + "}: " + str(self.count) + '\n'
        
        for c in self.children:
            ret += c.pretty_print(depth+1)

        return ret

[docs]    def get_weighted_values(self, attr, allow_none=True):
        """
        Return a list of weighted choices for an attribute based on the node's
        probability table.

        This calculation will include an option for the change that an attribute
        is missing from an instance all together. This is useful for probability
        and sampling calculations. If the attribute has never appeared in the
        tree then it will return a 100% chance of None.

        :param attr: an attribute of an instance
        :type attr: :ref:`Attribute<attributes>`
        :param allow_none: whether attributes in the nodes probability table
            can be inferred to be missing. If False, then None will not be
            cosidered as a possible value.
        :type allow_none: Boolean
        :return: a list of weighted choices for attr's value
        :rtype: [(:ref:`Value<values>`, float), (:ref:`Value<values>`, float), ...]
        """
        choices = []
        if attr not in self.av_counts:
            choices.append((None, 1.0))
            return choices

        val_count = 0
        for val in self.av_counts[attr]:
            if val == cv_key:
                count = self.av_counts[attr][val].num
            else:
                count = self.av_counts[attr][val]
            choices.append((val, count / self.count))
            val_count += count

        if allow_none:
            choices.append((None, ((self.count - val_count) / self.count)))

        return choices

[docs]    def predict(self, attr, choice_fn="most likely", allow_none=True):
        """
        Predict the value of an attribute, using the provided strategy.

        If the attribute is a nominal then this function behaves the same as
        :meth:`CobwebNode.predict <concept_formation.cobweb.CobwebNode.predict>`.
        If the attribute is numeric then the mean value from the
        :class:`ContinuousValue<concept_formation.cv_key.ContinuousValue>` is chosen.

        :param attr: an attribute of an instance.
        :type attr: :ref:`Attribute<attributes>`
        :param allow_none: whether attributes not in the instance can be
            inferred to be missing. If False, then all attributes will be
            inferred with some value.
        :type allow_none: Boolean
        :return: The most likely value for the given attribute in the node's 
            probability table.
        :rtype: :ref:`Value<values>`

        .. seealso :meth:`Cobweb3Node.sample`
        """
        if choice_fn == "most likely" or choice_fn == "m":
            choose = most_likely_choice
        elif choice_fn == "sampled" or choice_fn == "s":
            choose = weighted_choice
        else:
            raise Exception("Unknown choice_fn")

        if attr not in self.av_counts:
            return None

        choices = self.get_weighted_values(attr, allow_none)
        val = choose(choices)

        if val == cv_key:
            if choice_fn == "most likely" or choice_fn == "m":
                val = self.av_counts[attr][val].mean
            elif choice_fn == "sampled" or choice_fn == "s":
                val = normalvariate(self.av_counts[attr][val].unbiased_mean(),
                                    self.av_counts[attr][val].unbiased_std())
            else:
                raise Exception("Unknown choice_fn")

        return val

[docs]    def probability(self, attr, val):
        """
        Returns the probability of a particular attribute value at the current
        concept.

        This takes into account the possibilities that an attribute can take
        any of the values available at the root, or be missing.

        For numerical attributes it returns the integral of the product of two
        gaussians. One gaussian has :math:`\\mu = val` and :math:`\\sigma =
        \\sigma_{noise} = \\frac{1}{2 * \\sqrt{\\pi}}` (where
        :math:`\\sigma_{noise}` is from
        :meth:`Cobweb3Node.expected_correct_guesses
        <concept_formation.cobweb3.Cobweb3Node.expected_correct_guesses>` and
        ensures the probability or expected correct guesses never exceeds 1).
        The second gaussian has the mean ad std values from the current concept
        with additional gaussian noise (independent and normally distributed
        noise with :math:`\\sigma_{noise} = \\frac{1}{2 * \\sqrt{\\pi}}`).

        The integral of this gaussian product is another gaussian with
        :math:`\\mu` equal to the concept attribut mean and :math:`\\sigma =
        \\sqrt{\\sigma_{attr}^2 + 2 * \\sigma_{noise}^2}` or, slightly
        simplified, :math:`\\sigma =
        \\sqrt{\\sigma_{attr}^2 + 2 * \\frac{1}{2 * \\pi}}`.

        :param attr: an attribute of an instance
        :type attr: :ref:`Attribute<attributes>`
        :param val: a value for the given attribute
        :type val: :ref:`Value<values>`
        :return: The probability of attr having the value val in the current
            concept.
        :rtype: float
        """
        if val is None:
            c = 0.0
            if attr in self.av_counts:
                c = sum([self.av_counts[attr][v].num if v == cv_key
                         else self.av_counts[attr][v] for v in
                         self.av_counts[attr]])
            return (self.count - c) / self.count

        if attr in self.av_counts and isNumber(val):
            if cv_key not in self.av_counts[attr]:
                return 0.0

            prob_attr = self.av_counts[attr][cv_key].num / self.count
            if self.tree is not None and self.tree.scaling:
                inner_attr = self.tree.get_inner_attr(attr)
                scale = ((1/self.tree.scaling) *
                         self.tree.attr_scales[inner_attr].unbiased_std())

                if scale == 0:
                    scale = 1
                shift = self.tree.attr_scales[inner_attr].mean
                val = (val - shift) / scale
            else:
                scale = 1.0
                shift = 0.0

            mean = (self.av_counts[attr][cv_key].mean - shift) / scale
            ostd = self.av_counts[attr][cv_key].scaled_unbiased_std(scale)
            std = sqrt(ostd * ostd + (1 / (2 * pi)))
            p = (prob_attr *
                 (1/(sqrt(2*pi) * std)) *
                 exp(-((val - mean) * (val - mean)) / (2.0 * std * std)))
            return p

        if attr in self.av_counts and val in self.av_counts[attr]:
            return self.av_counts[attr][val] / self.count

        return 0.0

[docs]    def log_likelihood(self, child_leaf):
        """
        Returns the log-likelihood of a leaf contained within the current
        concept. Note, if the leaf contains multiple instances, then it is
        treated as if it contained just a single instance (this function is
        just called multiple times for each instance in the leaf).
        """
        ll = 0
        for attr in set(self.attrs()).union(set(child_leaf.attrs())):
            vals = set([None])
            if attr in self.av_counts:
                vals.update(self.av_counts[attr])
            if attr in child_leaf.av_counts:
                vals.update(child_leaf.av_counts[attr])

            for val in vals:
                if val == cv_key:
                    if (attr in self.av_counts and cv_key in
                            self.av_counts[attr] and attr in
                            child_leaf.av_counts and cv_key in
                            child_leaf.av_counts[attr]):

                        n1 = self.av_counts[attr][cv_key]
                        n2 = child_leaf.av_counts[attr][cv_key]
                        pn1 = n1.num / self.count
                        pn2 = n2.num / child_leaf.count
                        p = pn1 * pn2 * n1.integral_of_gaussian_product(n2)
                        if p > 0:
                            ll += log(p)
                        else:
                            raise Exception("p should be greater than 0")
                else:
                    op = child_leaf.probability(attr, val)
                    if op > 0:
                        p = self.probability(attr, val) * op
                        if p > 0:
                            ll += log(p)
                        else:
                            raise Exception("p must be greater than 0")
        return ll

[docs]    def is_exact_match(self, instance):
        """
        Returns true if the concept exactly matches the instance.

        :param instance: The instance currently being categorized
        :type instance: :ref:`Instance<instance-rep>`
        :return: whether the instance perfectly matches the concept
        :rtype: boolean

        .. seealso:: :meth:`CobwebNode.get_best_operation`
        """
        for attr in set(instance).union(set(self.attrs())):
            if attr[0] == '_':
                continue
            if attr in instance and attr not in self.av_counts:
                return False
            if attr in self.av_counts and attr not in instance:
                return False
            if attr in self.av_counts and attr in instance:
                if (isNumber(instance[attr]) and
                        cv_key not in self.av_counts[attr]):
                    return False
                if (isNumber(instance[attr]) and cv_key in
                        self.av_counts[attr]):
                    if (len(self.av_counts[attr]) != 1 or
                            self.av_counts[attr][cv_key].num != self.count):
                        return False
                    if (not self.av_counts[attr][cv_key].unbiased_std() ==
                            0.0):
                        return False
                    if (not self.av_counts[attr][cv_key].unbiased_mean() ==
                            instance[attr]):
                        return False
                elif not instance[attr] in self.av_counts[attr]:
                    return False
                elif not self.av_counts[attr][instance[attr]] == self.count:
                    return False
        return True

[docs]    def output_json(self):
        """
        Outputs the categorization tree in JSON form. 

        This is a modification of the :meth:`CobwebNode.output_json
        <concept_formation.cobweb.CobwebNode.output_json>` to handle numeric
        values.

        :return: an object that contains all of the structural information of
            the node and its children
        :rtype: obj
        """
        output = {}
        if "_guid" in self.av_counts:
            for guid in self.av_counts['_guid']:
                output['guid'] = guid
        output["name"] = "Concept" + str(self.concept_id)
        output["size"] = self.count
        output["children"] = []

        temp = {}
        for attr in self.attrs('all'):
            temp[str(attr)] = {}

            for val in self.av_counts[attr]:
                if val == cv_key:
                    temp[str(attr)][cv_key] = self.av_counts[attr][val].output_json()
                else:
                    temp[str(attr)][str(val)] = self.av_counts[attr][val]

        for child in self.children:
            output["children"].append(child.output_json())

        output["counts"] = temp

        return output