Source code for concept_formation.cobweb3

"""
The Cobweb3 module contains the :class:`Cobweb3Tree` and :class:`Cobweb3Node`
classes, which extend the traditional Cobweb capabilities to support numeric
values on attributes.
"""

from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from __future__ import division
from random import normalvariate
from math import sqrt
from math import pi
from math import exp
from math import log

from concept_formation.cobweb import CobwebNode
from concept_formation.cobweb import CobwebTree
from concept_formation.continuous_value import ContinuousValue
from concept_formation.utils import isNumber
from concept_formation.utils import weighted_choice
from concept_formation.utils import most_likely_choice

cv_key = "#ContinuousValue#"


[docs]class Cobweb3Tree(CobwebTree): """ The Cobweb3Tree contains the knowledge base of a partiucluar instance of the Cobweb/3 algorithm and can be used to fit and categorize instances. Cobweb/3's main difference over Cobweb is the ability to handle numerical attributes by applying an assumption that they should follow a normal distribution. For the purposes of Cobweb/3's core algorithms a numeric attribute is any value where ``isinstance(instance[attr], Number)`` returns ``True``. The scaling parameter determines whether online normalization of continuous attributes is used, and to what standard deviation the values are scaled to. Scaling divides the std of each attribute by the std of the attribute in the root divided by the scaling constant (i.e., :math:`\\sigma_{root} / scaling` when making category utility calculations. Scaling is useful to balance the weight of different numerical attributes, without scaling the magnitude of numerical attributes can affect category utility calculation meaning numbers that are naturally larger will recieve preference in the category utility calculation. :param scaling: The number of standard deviations numeric attributes are scaled to. By default this value is 0.5 (half a standard deviation), which is the max std of nominal values. If disabiling scaling is desirable, then it can be set to False or None. :type scaling: a float greater than 0.0, None, or False :param inner_attr_scaling: Whether to use the inner most attribute name when scaling numeric attributes. For example, if `('attr', '?o1')` was an attribute, then the inner most attribute would be 'attr'. When using inner most attributes, some objects might have multiple attributes (i.e., 'attr' for different objects) that contribute to the scaling. :param inner_attr_scaling: boolean """ def __init__(self, scaling=0.5, inner_attr_scaling=True): """ The tree constructor. """ self.root = Cobweb3Node() self.root.tree = self self.scaling = scaling self.inner_attr_scaling = inner_attr_scaling self.attr_scales = {}
[docs] def clear(self): """ Clears the concepts of the tree, but maintains the scaling parameter. """ self.root = Cobweb3Node() self.root.tree = self self.attr_scales = {}
[docs] def get_inner_attr(self, attr): """ Extracts the inner most attribute name from the provided attribute, if the attribute is a tuple and inner_attr_scaling is on. Otherwise it just returns the attribute. This is used to for normalizing attributes. >>> t = Cobweb3Tree() >>> t.get_inner_attr(('a', '?object1')) 'a' >>> t.get_inner_attr('a') 'a' """ if isinstance(attr, tuple) and self.inner_attr_scaling: return attr[0] else: return attr
[docs] def update_scales(self, instance): """ Reads through all the attributes in an instance and updates the tree scales object so that the attributes can be properly scaled. """ for attr in instance: if isNumber(instance[attr]): inner_attr = self.get_inner_attr(attr) if inner_attr not in self.attr_scales: self.attr_scales[inner_attr] = ContinuousValue() self.attr_scales[inner_attr].update(instance[attr])
[docs] def cobweb(self, instance): """ A modification of the cobweb function to update the scales object first, so that attribute values can be properly scaled. """ self.update_scales(instance) return super(Cobweb3Tree, self).cobweb(instance)
[docs] def ifit(self, instance): """ Incrementally fit a new instance into the tree and return its resulting concept. The cobweb3 version of the :meth:`CobwebTree.ifit` function. This version keeps track of all of the continuous :param instance: An instance to be categorized into the tree. :type instance: :ref:`Instance<instance-rep>` :return: A concept describing the instance :rtype: Cobweb3Node .. seealso:: :meth:`CobwebTree.cobweb` """ self._sanity_check_instance(instance) return self.cobweb(instance)
[docs]class Cobweb3Node(CobwebNode): """ A Cobweb3Node represents a concept within the knoweldge base of a particular :class:`Cobweb3Tree`. Each node contians a probability table that can be used to calculate the probability of different attributes given the concept that the node represents. In general the :meth:`Cobweb3Tree.ifit`, :meth:`Cobweb3Tree.categorize` functions should be used to initially interface with the Cobweb/3 knowledge base and then the returned concept can be used to calculate probabilities of certain attributes or determine concept labels. """
[docs] def increment_counts(self, instance): """ Increment the counts at the current node according to the specified instance. Cobweb3Node uses a modified version of :meth:`CobwebNode.increment_counts <concept_formation.cobweb.CobwebNode.increment_counts>` that handles numerical attributes properly. Any attribute value where ``isinstance(instance[attr], Number)`` returns ``True`` will be treated as a numerical attribute and included under an assumption that the number should follow a normal distribution. .. warning:: If a numeric attribute is found in an instance with the name of a previously nominal attribute, or vice versa, this function will raise an exception. See: :class:`NumericToNominal <concept_formation.preprocessor.NumericToNominal>` for a way to fix this error. :param instance: A new instances to incorporate into the node. :type instance: :ref:`Instance<instance-rep>` """ self.count += 1 for attr in instance: self.av_counts[attr] = self.av_counts.setdefault(attr,{}) if isNumber(instance[attr]): if cv_key not in self.av_counts[attr]: self.av_counts[attr][cv_key] = ContinuousValue() self.av_counts[attr][cv_key].update(instance[attr]) else: prior_count = self.av_counts[attr].get(instance[attr], 0) self.av_counts[attr][instance[attr]] = prior_count + 1
[docs] def update_counts_from_node(self, node): """ Increments the counts of the current node by the amount in the specified node, modified to handle numbers. .. warning:: If a numeric attribute is found in an instance with the name of a previously nominal attribute, or vice versa, this function will raise an exception. See: :class:`NumericToNominal <concept_formation.preprocessor.NumericToNominal>` for a way to fix this error. :param node: Another node from the same Cobweb3Tree :type node: Cobweb3Node """ self.count += node.count for attr in node.attrs('all'): self.av_counts[attr] = self.av_counts.setdefault(attr, {}) for val in node.av_counts[attr]: if val == cv_key: self.av_counts[attr][val] = self.av_counts[attr].get(val, ContinuousValue()) self.av_counts[attr][val].combine(node.av_counts[attr][val]) else: self.av_counts[attr][val] = (self.av_counts[attr].get(val, 0) + node.av_counts[attr][val])
[docs] def expected_correct_guesses(self): """ Returns the number of attribute values that would be correctly guessed in the current concept. This extension supports both nominal and numeric attribute values. The typical Cobweb/3 calculation for correct guesses is: .. math:: P(A_i = V_{ij})^2 = \\frac{1}{2 * \\sqrt{\\pi} * \\sigma} However, this does not take into account situations when :math:`P(A_i) < 1.0`. Additionally, the original formulation set :math:`\\sigma` to have a user specified minimum value. However, for small lower bounds, this lets cobweb achieve more than 1 expected correct guess per attribute, which is impossible for nominal attributes (and does not really make sense for continuous either). This causes problems when both nominal and continuous values are being used together; i.e., continuous attributes will get higher preference. To account for this we use a modified equation: .. math:: P(A_i = V_{ij})^2 = P(A_i)^2 * \\frac{1}{2 * \\sqrt{\\pi} * \\sigma} The key change here is that we multiply by :math:`P(A_i)^2`. Further, instead of bounding :math:`\\sigma` by a user specified lower bound (often called acuity), we add some independent, normally distributed noise to sigma: :math:`\\sigma = \\sqrt{\\sigma^2 + \\sigma_{noise}^2}`, where :math:`\\sigma_{noise} = \\frac{1}{2 * \\sqrt{\\pi}}`. This ensures the expected correct guesses never exceeds 1. From a theoretical point of view, it basically is an assumption that there is some independent, normally distributed measurement error that is added to the estimated error of the attribute (`<https://en.wikipedia.org/wi ki/Sum_of_normally_distributed_random_variables>`_). It is possible that there is additional measurement error, but the value is chosen so as to yield a sensical upper bound on the expected correct guesses. :return: The number of attribute values that would be correctly guessed in the current concept. :rtype: float """ correct_guesses = 0.0 attr_count = 0 for attr in self.attrs(): attr_count += 1 for val in self.av_counts[attr]: if val == cv_key: scale = 1.0 if self.tree is not None and self.tree.scaling: inner_attr = self.tree.get_inner_attr(attr) if inner_attr in self.tree.attr_scales: scale = ((1/self.tree.scaling) * self.tree.attr_scales[inner_attr].unbiased_std()) # we basically add noise to the std and adjust the # normalizing constant to ensure the probability of a # particular value never exceeds 1. cv = self.av_counts[attr][cv_key] std = sqrt(cv.scaled_unbiased_std(scale) * cv.scaled_unbiased_std(scale) + (1 / (4 * pi))) prob_attr = cv.num / self.count correct_guesses += ((prob_attr * prob_attr) * (1/(2 * sqrt(pi) * std))) else: prob = (self.av_counts[attr][val]) / self.count correct_guesses += (prob * prob) return correct_guesses / attr_count
[docs] def pretty_print(self, depth=0): """ Print the categorization tree The string formatting inserts tab characters to align child nodes of the same depth. Numerical values are printed with their means and standard deviations. :param depth: The current depth in the print, intended to be called recursively :type depth: int :return: a formated string displaying the tree and its children :rtype: str """ ret = str(('\t' * depth) + "|-") attributes = [] for attr in self.attrs('all'): values = [] for val in self.av_counts[attr]: values.append("'" + str(val) + "': " + str(self.av_counts[attr][val])) attributes.append("'" + str(attr) + "': {" + ", ".join(values) + "}") ret += "{" + ", ".join(attributes) + "}: " + str(self.count) + '\n' for c in self.children: ret += c.pretty_print(depth+1) return ret
[docs] def get_weighted_values(self, attr, allow_none=True): """ Return a list of weighted choices for an attribute based on the node's probability table. This calculation will include an option for the change that an attribute is missing from an instance all together. This is useful for probability and sampling calculations. If the attribute has never appeared in the tree then it will return a 100% chance of None. :param attr: an attribute of an instance :type attr: :ref:`Attribute<attributes>` :param allow_none: whether attributes in the nodes probability table can be inferred to be missing. If False, then None will not be cosidered as a possible value. :type allow_none: Boolean :return: a list of weighted choices for attr's value :rtype: [(:ref:`Value<values>`, float), (:ref:`Value<values>`, float), ...] """ choices = [] if attr not in self.av_counts: choices.append((None, 1.0)) return choices val_count = 0 for val in self.av_counts[attr]: if val == cv_key: count = self.av_counts[attr][val].num else: count = self.av_counts[attr][val] choices.append((val, count / self.count)) val_count += count if allow_none: choices.append((None, ((self.count - val_count) / self.count))) return choices
[docs] def predict(self, attr, choice_fn="most likely", allow_none=True): """ Predict the value of an attribute, using the provided strategy. If the attribute is a nominal then this function behaves the same as :meth:`CobwebNode.predict <concept_formation.cobweb.CobwebNode.predict>`. If the attribute is numeric then the mean value from the :class:`ContinuousValue<concept_formation.cv_key.ContinuousValue>` is chosen. :param attr: an attribute of an instance. :type attr: :ref:`Attribute<attributes>` :param allow_none: whether attributes not in the instance can be inferred to be missing. If False, then all attributes will be inferred with some value. :type allow_none: Boolean :return: The most likely value for the given attribute in the node's probability table. :rtype: :ref:`Value<values>` .. seealso :meth:`Cobweb3Node.sample` """ if choice_fn == "most likely" or choice_fn == "m": choose = most_likely_choice elif choice_fn == "sampled" or choice_fn == "s": choose = weighted_choice else: raise Exception("Unknown choice_fn") if attr not in self.av_counts: return None choices = self.get_weighted_values(attr, allow_none) val = choose(choices) if val == cv_key: if choice_fn == "most likely" or choice_fn == "m": val = self.av_counts[attr][val].mean elif choice_fn == "sampled" or choice_fn == "s": val = normalvariate(self.av_counts[attr][val].unbiased_mean(), self.av_counts[attr][val].unbiased_std()) else: raise Exception("Unknown choice_fn") return val
[docs] def probability(self, attr, val): """ Returns the probability of a particular attribute value at the current concept. This takes into account the possibilities that an attribute can take any of the values available at the root, or be missing. For numerical attributes it returns the integral of the product of two gaussians. One gaussian has :math:`\\mu = val` and :math:`\\sigma = \\sigma_{noise} = \\frac{1}{2 * \\sqrt{\\pi}}` (where :math:`\\sigma_{noise}` is from :meth:`Cobweb3Node.expected_correct_guesses <concept_formation.cobweb3.Cobweb3Node.expected_correct_guesses>` and ensures the probability or expected correct guesses never exceeds 1). The second gaussian has the mean ad std values from the current concept with additional gaussian noise (independent and normally distributed noise with :math:`\\sigma_{noise} = \\frac{1}{2 * \\sqrt{\\pi}}`). The integral of this gaussian product is another gaussian with :math:`\\mu` equal to the concept attribut mean and :math:`\\sigma = \\sqrt{\\sigma_{attr}^2 + 2 * \\sigma_{noise}^2}` or, slightly simplified, :math:`\\sigma = \\sqrt{\\sigma_{attr}^2 + 2 * \\frac{1}{2 * \\pi}}`. :param attr: an attribute of an instance :type attr: :ref:`Attribute<attributes>` :param val: a value for the given attribute :type val: :ref:`Value<values>` :return: The probability of attr having the value val in the current concept. :rtype: float """ if val is None: c = 0.0 if attr in self.av_counts: c = sum([self.av_counts[attr][v].num if v == cv_key else self.av_counts[attr][v] for v in self.av_counts[attr]]) return (self.count - c) / self.count if attr in self.av_counts and isNumber(val): if cv_key not in self.av_counts[attr]: return 0.0 prob_attr = self.av_counts[attr][cv_key].num / self.count if self.tree is not None and self.tree.scaling: inner_attr = self.tree.get_inner_attr(attr) scale = ((1/self.tree.scaling) * self.tree.attr_scales[inner_attr].unbiased_std()) if scale == 0: scale = 1 shift = self.tree.attr_scales[inner_attr].mean val = (val - shift) / scale else: scale = 1.0 shift = 0.0 mean = (self.av_counts[attr][cv_key].mean - shift) / scale ostd = self.av_counts[attr][cv_key].scaled_unbiased_std(scale) std = sqrt(ostd * ostd + (1 / (2 * pi))) p = (prob_attr * (1/(sqrt(2*pi) * std)) * exp(-((val - mean) * (val - mean)) / (2.0 * std * std))) return p if attr in self.av_counts and val in self.av_counts[attr]: return self.av_counts[attr][val] / self.count return 0.0
[docs] def log_likelihood(self, child_leaf): """ Returns the log-likelihood of a leaf contained within the current concept. Note, if the leaf contains multiple instances, then it is treated as if it contained just a single instance (this function is just called multiple times for each instance in the leaf). """ ll = 0 for attr in set(self.attrs()).union(set(child_leaf.attrs())): vals = set([None]) if attr in self.av_counts: vals.update(self.av_counts[attr]) if attr in child_leaf.av_counts: vals.update(child_leaf.av_counts[attr]) for val in vals: if val == cv_key: if (attr in self.av_counts and cv_key in self.av_counts[attr] and attr in child_leaf.av_counts and cv_key in child_leaf.av_counts[attr]): n1 = self.av_counts[attr][cv_key] n2 = child_leaf.av_counts[attr][cv_key] pn1 = n1.num / self.count pn2 = n2.num / child_leaf.count p = pn1 * pn2 * n1.integral_of_gaussian_product(n2) if p > 0: ll += log(p) else: raise Exception("p should be greater than 0") else: op = child_leaf.probability(attr, val) if op > 0: p = self.probability(attr, val) * op if p > 0: ll += log(p) else: raise Exception("p must be greater than 0") return ll
[docs] def is_exact_match(self, instance): """ Returns true if the concept exactly matches the instance. :param instance: The instance currently being categorized :type instance: :ref:`Instance<instance-rep>` :return: whether the instance perfectly matches the concept :rtype: boolean .. seealso:: :meth:`CobwebNode.get_best_operation` """ for attr in set(instance).union(set(self.attrs())): if attr[0] == '_': continue if attr in instance and attr not in self.av_counts: return False if attr in self.av_counts and attr not in instance: return False if attr in self.av_counts and attr in instance: if (isNumber(instance[attr]) and cv_key not in self.av_counts[attr]): return False if (isNumber(instance[attr]) and cv_key in self.av_counts[attr]): if (len(self.av_counts[attr]) != 1 or self.av_counts[attr][cv_key].num != self.count): return False if (not self.av_counts[attr][cv_key].unbiased_std() == 0.0): return False if (not self.av_counts[attr][cv_key].unbiased_mean() == instance[attr]): return False elif not instance[attr] in self.av_counts[attr]: return False elif not self.av_counts[attr][instance[attr]] == self.count: return False return True
[docs] def output_json(self): """ Outputs the categorization tree in JSON form. This is a modification of the :meth:`CobwebNode.output_json <concept_formation.cobweb.CobwebNode.output_json>` to handle numeric values. :return: an object that contains all of the structural information of the node and its children :rtype: obj """ output = {} if "_guid" in self.av_counts: for guid in self.av_counts['_guid']: output['guid'] = guid output["name"] = "Concept" + str(self.concept_id) output["size"] = self.count output["children"] = [] temp = {} for attr in self.attrs('all'): temp[str(attr)] = {} for val in self.av_counts[attr]: if val == cv_key: temp[str(attr)][cv_key] = self.av_counts[attr][val].output_json() else: temp[str(attr)][str(val)] = self.av_counts[attr][val] for child in self.children: output["children"].append(child.output_json()) output["counts"] = temp return output