Source code for concept_formation.cobweb

"""
The Cobweb module contains the :class:`CobwebTree` and :class:`CobwebNode`
classes which are used to achieve the basic Cobweb functionality.
"""

from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from __future__ import division
from random import shuffle
from random import random
from math import log

from concept_formation.utils import weighted_choice
from concept_formation.utils import most_likely_choice


[docs]class CobwebTree(object): """ The CobwebTree contains the knoweldge base of a partiucluar instance of the cobweb algorithm and can be used to fit and categorize instances. """ def __init__(self): """ The tree constructor. """ self.root = CobwebNode() self.root.tree = self
[docs] def clear(self): """ Clears the concepts of the tree. """ self.root = CobwebNode() self.root.tree = self
def __str__(self): return str(self.root) def _sanity_check_instance(self, instance): for attr in instance: if instance[attr] is None: raise ValueError("Attributes with value None should" " be manually removed.") try: hash(attr) attr[0] except: raise ValueError('Invalid attribute: '+str(attr) + ' of type: '+str(type(attr)) + ' in instance: '+str(instance) + ',\n'+type(self).__name__ + ' only works with hashable ' + 'and subscriptable attributes' + ' (e.g., strings).') try: hash(instance[attr]) except: raise ValueError('Invalid value: '+str(instance[attr]) + ' of type: '+str(type(instance[attr])) + ' in instance: '+str(instance) + ',\n'+type(self).__name__ + ' only works with hashable values.')
[docs] def ifit(self, instance): """ Incrementally fit a new instance into the tree and return its resulting concept. The instance is passed down the cobweb tree and updates each node to incorporate the instance. **This process modifies the tree's knowledge** for a non-modifying version of labeling use the :meth:`CobwebTree.categorize` function. :param instance: An instance to be categorized into the tree. :type instance: :ref:`Instance<instance-rep>` :return: A concept describing the instance :rtype: CobwebNode .. seealso:: :meth:`CobwebTree.cobweb` """ self._sanity_check_instance(instance) return self.cobweb(instance)
[docs] def fit(self, instances, iterations=1, randomize_first=True): """ Fit a collection of instances into the tree. This is a batch version of the ifit function that takes a collection of instances and categorizes all of them. The instances can be incorporated multiple times to burn in the tree with prior knowledge. Each iteration of fitting uses a randomized order but the first pass can be done in the original order of the list if desired, this is useful for initializing the tree with specific prior experience. :param instances: a collection of instances :type instances: [:ref:`Instance<instance-rep>`, :ref:`Instance<instance-rep>`, ...] :param iterations: number of times the list of instances should be fit. :type iterations: int :param randomize_first: whether or not the first iteration of fitting should be done in a random order or in the list's original order. :type randomize_first: bool """ instances = [i for i in instances] for x in range(iterations): if x == 0 and randomize_first: shuffle(instances) for i in instances: self.ifit(i) shuffle(instances)
[docs] def cobweb(self, instance): """ The core cobweb algorithm used in fitting and categorization. In the general case, the cobweb algorith entertains a number of sorting operations for the instance and then commits to the operation that maximizes the :meth:`category utility <CobwebNode.category_utility>` of the tree at the current node and then recurses. At each node the alogrithm first calculates the category utility of inserting the instance at each of the node's children, keeping the best two (see: :meth:`CobwebNode.two_best_children <CobwebNode.two_best_children>`), and then calculates the category_utility of performing other operations using the best two children (see: :meth:`CobwebNode.get_best_operation <CobwebNode.get_best_operation>`), commiting to whichever operation results in the highest category utility. In the case of ties an operation is chosen at random. In the base case, i.e. a leaf node, the algorithm checks to see if the current leaf is an exact match to the current node. If it is, then the instance is inserted and the leaf is returned. Otherwise, a new leaf is created. .. note:: This function is equivalent to calling :meth:`CobwebTree.ifit` but its better to call ifit because it is the polymorphic method siganture between the different cobweb family algorithms. :param instance: an instance to incorporate into the tree :type instance: :ref:`Instance<instance-rep>` :return: a concept describing the instance :rtype: CobwebNode .. seealso:: :meth:`CobwebTree.ifit`, :meth:`CobwebTree.categorize` """ current = self.root while current: # the current.count == 0 here is for the initially empty tree. if not current.children and (current.is_exact_match(instance) or current.count == 0): # print("leaf match") current.increment_counts(instance) break elif not current.children: # print("fringe split") new = current.__class__(current) current.parent = new new.children.append(current) if new.parent: new.parent.children.remove(current) new.parent.children.append(new) else: self.root = new new.increment_counts(instance) current = new.create_new_child(instance) break else: best1_cu, best1, best2 = current.two_best_children(instance) _, best_action = current.get_best_operation(instance, best1, best2, best1_cu) # print(best_action) if best_action == 'best': current.increment_counts(instance) current = best1 elif best_action == 'new': current.increment_counts(instance) current = current.create_new_child(instance) break elif best_action == 'merge': current.increment_counts(instance) new_child = current.merge(best1, best2) current = new_child elif best_action == 'split': current.split(best1) else: raise Exception('Best action choice "' + best_action + '" not a recognized option. This should be' ' impossible...') return current
def _cobweb_categorize(self, instance): """ A cobweb specific version of categorize, not inteded to be externally called. .. seealso:: :meth:`CobwebTree.categorize` """ current = self.root while current: if not current.children: return current _, best1, best2 = current.two_best_children(instance) current = best1
[docs] def infer_missing(self, instance, choice_fn="most likely", allow_none=True): """ Given a tree and an instance, returns a new instance with attribute values picked using the specified choice function (either "most likely" or "sampled"). .. todo:: write some kind of test for this. :param instance: an instance to be completed. :type instance: :ref:`Instance<instance-rep>` :param choice_fn: a string specifying the choice function to use, either "most likely" or "sampled". :type choice_fn: a string :param allow_none: whether attributes not in the instance can be inferred to be missing. If False, then all attributes will be inferred with some value. :type allow_none: Boolean :return: A completed instance :rtype: :ref:`Instance<instance-rep>` """ self._sanity_check_instance(instance) temp_instance = {a: instance[a] for a in instance} concept = self._cobweb_categorize(temp_instance) for attr in concept.attrs('all'): if attr in temp_instance: continue val = concept.predict(attr, choice_fn, allow_none) if val is not None: temp_instance[attr] = val return temp_instance
[docs] def categorize(self, instance): """ Sort an instance in the categorization tree and return its resulting concept. The instance is passed down the categorization tree according to the normal cobweb algorithm except using only the best operator and without modifying nodes' probability tables. **This process does not modify the tree's knowledge** for a modifying version of labeling use the :meth:`CobwebTree.ifit` function :param instance: an instance to be categorized into the tree. :type instance: :ref:`Instance<instance-rep>` :return: A concept describing the instance :rtype: CobwebNode .. seealso:: :meth:`CobwebTree.cobweb` """ self._sanity_check_instance(instance) return self._cobweb_categorize(instance)
[docs]class CobwebNode(object): """ A CobwebNode represents a concept within the knoweldge base of a particular :class:`CobwebTree`. Each node contains a probability table that can be used to calculate the probability of different attributes given the concept that the node represents. In general the :meth:`CobwebTree.ifit`, :meth:`CobwebTree.categorize` functions should be used to initially interface with the Cobweb knowledge base and then the returned concept can be used to calculate probabilities of certain attributes or determine concept labels. This constructor creates a CobwebNode with default values. It can also be used as a copy constructor to "deepcopy" a node, including all references to other parts of the original node's CobwebTree. :param otherNode: Another concept node to deepcopy. :type otherNode: CobwebNode """ # a counter used to generate unique concept names. _counter = 0 def __init__(self, otherNode=None): """Create a new CobwebNode""" self.concept_id = self.gensym() self.count = 0.0 self.av_counts = {} self.children = [] self.parent = None self.tree = None if otherNode: self.tree = otherNode.tree self.parent = otherNode.parent self.update_counts_from_node(otherNode) for child in otherNode.children: self.children.append(self.__class__(child))
[docs] def shallow_copy(self): """ Create a shallow copy of the current node (and not its children) This can be used to copy only the information relevant to the node's probability table without maintaining reference to other elements of the tree, except for the root which is necessary to calculate category utility. """ temp = self.__class__() temp.tree = self.tree temp.parent = self.parent temp.update_counts_from_node(self) return temp
[docs] def attrs(self, attr_filter=None): """ Iterates over the attributes present in the node's attribute-value table with the option to filter certain types. By default the filter will ignore hidden attributes and yield all others. If the string 'all' is provided then all attributes will be yielded. In neither of those cases the filter will be interpreted as a function that returns true if an attribute should be yielded and false otherwise. """ if attr_filter is None: return filter(lambda x: x[0] != "_", self.av_counts) elif attr_filter == 'all': return self.av_counts else: return filter(attr_filter, self.av_counts)
[docs] def increment_counts(self, instance): """ Increment the counts at the current node according to the specified instance. :param instance: A new instances to incorporate into the node. :type instance: :ref:`Instance<instance-rep>` """ self.count += 1 for attr in instance: if attr not in self.av_counts: self.av_counts[attr] = {} if instance[attr] not in self.av_counts[attr]: self.av_counts[attr][instance[attr]] = 0 self.av_counts[attr][instance[attr]] += 1
[docs] def update_counts_from_node(self, node): """ Increments the counts of the current node by the amount in the specified node. This function is used as part of copying nodes and in merging nodes. :param node: Another node from the same CobwebTree :type node: CobwebNode """ self.count += node.count for attr in node.attrs('all'): if attr not in self.av_counts: self.av_counts[attr] = {} for val in node.av_counts[attr]: if val not in self.av_counts[attr]: self.av_counts[attr][val] = 0 self.av_counts[attr][val] += node.av_counts[attr][val]
[docs] def expected_correct_guesses(self): """ Returns the number of correct guesses that are expected from the given concept. This is the sum of the probability of each attribute value squared. This function is used in calculating category utility. :return: the number of correct guesses that are expected from the given concept. :rtype: float """ correct_guesses = 0.0 attr_count = 0 for attr in self.attrs(): attr_count += 1 if attr in self.av_counts: for val in self.av_counts[attr]: prob = (self.av_counts[attr][val]) / self.count correct_guesses += (prob * prob) return correct_guesses / attr_count
[docs] def category_utility(self): """ Return the category utility of a particular division of a concept into its children. Category utility is always calculated in reference to a parent node and its own children. This is used as the heuristic to guide the concept formation process. Category Utility is calculated as: .. math:: CU(\\{C_1, C_2, \\cdots, C_n\\}) = \\frac{1}{n} \\sum_{k=1}^n P(C_k) \\left[ \\sum_i \\sum_j P(A_i = V_{ij} | C_k)^2 \\right] - \\sum_i \\sum_j P(A_i = V_{ij})^2 where :math:`n` is the numer of children concepts to the current node, :math:`P(C_k)` is the probability of a concept given the current node, :math:`P(A_i = V_{ij} | C_k)` is the probability of a particular attribute value given the concept :math:`C_k`, and :math:`P(A_i = V_{ij})` is the probability of a particular attribute value given the current node. In general this is used as an internal function of the cobweb algorithm but there may be times when it would be useful to call outside of the algorithm itself. :return: The category utility of the current node with respect to its children. :rtype: float """ if len(self.children) == 0: return 0.0 child_correct_guesses = 0.0 for child in self.children: p_of_child = child.count / self.count child_correct_guesses += (p_of_child * child.expected_correct_guesses()) return ((child_correct_guesses - self.expected_correct_guesses()) / len(self.children))
[docs] def get_best_operation(self, instance, best1, best2, best1_cu, possible_ops=["best", "new", "merge", "split"]): """ Given an instance, the two best children based on category utility and a set of possible operations, find the operation that produces the highest category utility, and then return the category utility and name for the best operation. In the case of ties, an operator is randomly chosen. Given the following starting tree the results of the 4 standard Cobweb operations are shown below: .. image:: images/Original.png :width: 200px :align: center * **Best** - Categorize the instance to child with the best category utility. This results in a recurisve call to :meth:`cobweb <concept_formation.cobweb.CobwebTree.cobweb>`. .. image:: images/Best.png :width: 200px :align: center * **New** - Create a new child node to the current node and add the instance there. See: :meth:`create_new_child <concept_formation.cobweb.CobwebNode.create_new_child>`. .. image:: images/New.png :width: 200px :align: center * **Merge** - Take the two best children, create a new node as their mutual parent and add the instance there. See: :meth:`merge <concept_formation.cobweb.CobwebNode.merge>`. .. image:: images/Merge.png :width: 200px :align: center * **Split** - Take the best node and promote its children to be children of the current node and recurse on the current node. See: :meth:`split <concept_formation.cobweb.CobwebNode.split>` .. image:: images/Split.png :width: 200px :align: center Each operation is entertained and the resultant category utility is used to pick which operation to perform. The list of operations to entertain can be controlled with the possible_ops parameter. For example, when performing categorization without modifying knoweldge only the best and new operators are used. :param instance: The instance currently being categorized :type instance: :ref:`Instance<instance-rep>` :param best1: A tuple containing the relative cu of the best child and the child itself, as determined by :meth:`CobwebNode.two_best_children`. :type best1: (float, CobwebNode) :param best2: A tuple containing the relative cu of the second best child and the child itself, as determined by :meth:`CobwebNode.two_best_children`. :type best2: (float, CobwebNode) :param possible_ops: A list of operations from ["best", "new", "merge", "split"] to entertain. :type possible_ops: ["best", "new", "merge", "split"] :return: A tuple of the category utility of the best operation and the name of the best operation. :rtype: (cu_bestOp, name_bestOp) """ if not best1: raise ValueError("Need at least one best child.") operations = [] if "best" in possible_ops: operations.append((best1_cu, random(), "best")) if "new" in possible_ops: operations.append((self.cu_for_new_child(instance), random(), 'new')) if "merge" in possible_ops and len(self.children) > 2 and best2: operations.append((self.cu_for_merge(best1, best2, instance), random(), 'merge')) if "split" in possible_ops and len(best1.children) > 0: operations.append((self.cu_for_split(best1), random(), 'split')) operations.sort(reverse=True) # print(operations) best_op = (operations[0][0], operations[0][2]) # print(best_op) return best_op
[docs] def two_best_children(self, instance): """ Calculates the category utility of inserting the instance into each of this node's children and returns the best two. In the event of ties children are sorted first by category utility, then by their size, then by a random value. :param instance: The instance currently being categorized :type instance: :ref:`Instance<instance-rep>` :return: the category utility and indices for the two best children (the second tuple will be ``None`` if there is only 1 child). :rtype: ((cu_best1,index_best1),(cu_best2,index_best2)) """ if len(self.children) == 0: raise Exception("No children!") children_relative_cu = [(self.relative_cu_for_insert(child, instance), child.count, random(), child) for child in self.children] children_relative_cu.sort(reverse=True) # Convert the relative CU's of the two best children into CU scores # that can be compared with the other operations. const = self.compute_relative_CU_const(instance) best1 = children_relative_cu[0][3] best1_relative_cu = children_relative_cu[0][0] best1_cu = (best1_relative_cu / (self.count+1) / len(self.children) + const) best2 = None if len(children_relative_cu) > 1: best2 = children_relative_cu[1][3] return best1_cu, best1, best2
[docs] def compute_relative_CU_const(self, instance): """ Computes the constant value that is used to convert between CU and relative CU scores. The constant value is basically the category utility that results from adding the instance to the root, but none of the children. It can be computed directly as: .. math:: const = \\frac{1}{n} \\sum_{k=1}^{n} \\left[ \\frac{C_k.count}{count + 1} \\sum_i \\sum_j P(A_i = V_{ij} | C)^2 \\right] - \\sum_i \\sum_j P(A_i = V_{ij} | UpdatedRoot)^2 where :math:`n` is the number of children of the root, :math:`C_k` is child :math:`k`, :math:`C_k.count` is the number of instances stored in child :math:`C_k`, :math:`count` is the number of instances stored in the root. Finally, :math:`UpdatedRoot` is a copy of the root that has been updated with the counts of the instance. :param instance: The instance currently being categorized :type instance: :ref:`Instance<instance-rep>` :return: The value of the constant used to relativize the CU. :rtype: float """ temp = self.shallow_copy() temp.increment_counts(instance) ec_root_u = temp.expected_correct_guesses() const = 0 for c in self.children: const += ((c.count / (self.count + 1)) * c.expected_correct_guesses()) const -= ec_root_u const /= len(self.children) return const
[docs] def relative_cu_for_insert(self, child, instance): """ Computes a relative CU score for each insert operation. The relative CU score is more efficient to calculate for each insert operation and is guranteed to have the same rank ordering as the CU score so it can be used to determine which insert operation is best. The relative CU can be computed from the CU using the following transformation. .. math:: relative_cu(cu) = (cu - const) * n * (count + 1) where :math:`const` is the one returned by :meth:`CobwebNode.compute_relative_CU_const`, :math:`n` is the number of children of the current node, and :math:`count` is the number of instances stored in the current node (the root). The particular :math:`const` value was chosen to make the calculation of the relative cu scores for each insert operation efficient. When computing the CU for inserting the instance into a particular child, the terms in the formula above can be expanded and many of the intermediate calculations cancel out. After these cancelations, computing the relative CU for inserting into a particular child :math:`C_i` reduces to: relative_cu_for_insert(C_i) = (C_i.count + 1) * \\sum_i \\sum_j P(A_i = V_{ij}| UpdatedC_i)^2 - (C_i.count) * \\sum_i \\sum_j P(A_i = V_{ij}| C_i)^2 where :math:`UpdatedC_i` is a copy of :math:`C_i` that has been updated with the counts from the given instance. By computing relative_CU scores instead of CU scores for each insert operation, the time complexity of the underlying Cobweb algorithm is reduced from :math:`O(B^2 \\times log_B(n) \\times AV)` to :math:`O(B \\times log_B(n) \\times AV)` where :math:`B` is the average branching factor of the tree, :math`n` is the number of instances being categorized, :math:`A` is the average number of attributes per instance, and :math:`V` is the average number of values per attribute. :param child: a child of the current node :type child: CobwebNode :param instance: The instance currently being categorized :type instance: :ref:`Instance<instance-rep>` :return: the category utility of adding the instance to the given node :rtype: float """ temp = child.shallow_copy() temp.increment_counts(instance) return ((child.count + 1) * temp.expected_correct_guesses() - child.count * child.expected_correct_guesses())
[docs] def cu_for_insert(self, child, instance): """ Compute the category utility of adding the instance to the specified child. This operation does not actually insert the instance into the child it only calculates what the result of the insertion would be. For the actual insertion function see: :meth:`CobwebNode.increment_counts` This is the function used to determine the best children for each of the other operations. :param child: a child of the current node :type child: CobwebNode :param instance: The instance currently being categorized :type instance: :ref:`Instance<instance-rep>` :return: the category utility of adding the instance to the given node :rtype: float .. seealso:: :meth:`CobwebNode.two_best_children` and :meth:`CobwebNode.get_best_operation` """ temp = self.shallow_copy() temp.increment_counts(instance) for c in self.children: temp_child = c.shallow_copy() temp.children.append(temp_child) temp_child.parent = temp if c == child: temp_child.increment_counts(instance) return temp.category_utility()
[docs] def create_new_child(self, instance): """ Create a new child (to the current node) with the counts initialized by the *given instance*. This is the operation used for creating a new child to a node and adding the instance to it. :param instance: The instance currently being categorized :type instance: :ref:`Instance<instance-rep>` :return: The new child :rtype: CobwebNode """ new_child = self.__class__() new_child.parent = self new_child.tree = self.tree new_child.increment_counts(instance) self.children.append(new_child) return new_child
[docs] def create_child_with_current_counts(self): """ Create a new child (to the current node) with the counts initialized by the *current node's counts*. This operation is used in the speical case of a fringe split when a new node is created at a leaf. :return: The new child :rtype: CobwebNode """ if self.count > 0: new = self.__class__(self) new.parent = self new.tree = self.tree self.children.append(new) return new
[docs] def cu_for_new_child(self, instance): """ Return the category utility for creating a new child using the particular instance. This operation does not actually create the child it only calculates what the result of creating it would be. For the actual new function see: :meth:`CobwebNode.create_new_child`. :param instance: The instance currently being categorized :type instance: :ref:`Instance<instance-rep>` :return: the category utility of adding the instance to a new child. :rtype: float .. seealso:: :meth:`CobwebNode.get_best_operation` """ temp = self.shallow_copy() for c in self.children: temp.children.append(c.shallow_copy()) # temp = self.shallow_copy() temp.increment_counts(instance) temp.create_new_child(instance) return temp.category_utility()
[docs] def merge(self, best1, best2): """ Merge the two specified nodes. A merge operation introduces a new node to be the merger of the the two given nodes. This new node becomes a child of the current node and the two given nodes become children of the new node. :param best1: The child of the current node with the best category utility :type best1: CobwebNode :param best2: The child of the current node with the second best category utility :type best2: CobwebNode :return: The new child node that was created by the merge :rtype: CobwebNode """ new_child = self.__class__() new_child.parent = self new_child.tree = self.tree new_child.update_counts_from_node(best1) new_child.update_counts_from_node(best2) best1.parent = new_child # best1.tree = new_child.tree best2.parent = new_child # best2.tree = new_child.tree new_child.children.append(best1) new_child.children.append(best2) self.children.remove(best1) self.children.remove(best2) self.children.append(new_child) return new_child
[docs] def cu_for_merge(self, best1, best2, instance): """ Return the category utility for merging the two best children. This does not actually merge the two children it only calculates what the result of the merge would be. For the actual merge operation see: :meth:`CobwebNode.merge` :param best1: The child of the current node with the best category utility :type best1: CobwebNode :param best2: The child of the current node with the second best category utility :type best2: CobwebNode :param instance: The instance currently being categorized :type instance: :ref:`Instance<instance-rep>` :return: The category utility that would result from merging best1 and best2. :rtype: float .. seealso:: :meth:`CobwebNode.get_best_operation` """ temp = self.shallow_copy() temp.increment_counts(instance) new_child = self.__class__() new_child.tree = self.tree new_child.parent = temp new_child.update_counts_from_node(best1) new_child.update_counts_from_node(best2) new_child.increment_counts(instance) temp.children.append(new_child) for c in self.children: if c == best1 or c == best2: continue temp_child = c.shallow_copy() temp.children.append(temp_child) return temp.category_utility()
[docs] def split(self, best): """ Split the best node and promote its children A split operation removes a child node and promotes its children to be children of the current node. Split operations result in a recursive call of cobweb on the current node so this function does not return anything. :param best: The child node to be split :type best: CobwebNode """ self.children.remove(best) for child in best.children: child.parent = self child.tree = self.tree self.children.append(child)
[docs] def cu_for_fringe_split(self, instance): """ Return the category utility of performing a fringe split (i.e., adding a leaf to a leaf). A "fringe split" is essentially a new operation performed at a leaf. It is necessary to have the distinction because unlike a normal split a fringe split must also push the parent down to maintain a proper tree structure. This is useful for identifying unnecessary fringe splits, when the two leaves are essentially identical. It can be used to keep the tree from growing and to increase the tree's predictive accuracy. :param instance: The instance currently being categorized :type instance: :ref:`Instance<instance-rep>` :return: the category utility of fringe splitting at the current node. :rtype: float .. seealso:: :meth:`CobwebNode.get_best_operation` """ temp = self.shallow_copy() temp.create_child_with_current_counts() temp.increment_counts(instance) temp.create_new_child(instance) return temp.category_utility()
[docs] def cu_for_split(self, best): """ Return the category utility for splitting the best child. This does not actually split the child it only calculates what the result of the split would be. For the actual split operation see: :meth:`CobwebNode.split`. Unlike the category utility calculations for the other operations split does not need the instance because splits trigger a recursive call on the current node. :param best: The child of the current node with the best category utility :type best: CobwebNode :return: The category utility that would result from splitting best :rtype: float .. seealso:: :meth:`CobwebNode.get_best_operation` """ temp = self.shallow_copy() for c in self.children + best.children: if c == best: continue temp_child = c.shallow_copy() temp.children.append(temp_child) return temp.category_utility()
[docs] def is_exact_match(self, instance): """ Returns true if the concept exactly matches the instance. :param instance: The instance currently being categorized :type instance: :ref:`Instance<instance-rep>` :return: whether the instance perfectly matches the concept :rtype: boolean .. seealso:: :meth:`CobwebNode.get_best_operation` """ for attr in set(instance).union(set(self.attrs())): if attr[0] == '_': continue if attr in instance and attr not in self.av_counts: return False if attr in self.av_counts and attr not in instance: return False if attr in self.av_counts and attr in instance: if instance[attr] not in self.av_counts[attr]: return False if not self.av_counts[attr][instance[attr]] == self.count: return False return True
def __hash__(self): """ The basic hash function. This hashes the concept name, which is generated to be unique across concepts. """ return hash("CobwebNode" + str(self.concept_id))
[docs] def gensym(self): """ Generate a unique id and increment the class _counter. This is used to create a unique name for every concept. As long as the class _counter variable is never externally altered these keys will remain unique. """ self.__class__._counter += 1 return self.__class__._counter
# return str(self.__class__._counter) def __str__(self): """ Call :meth:`CobwebNode.pretty_print` """ return self.pretty_print()
[docs] def pretty_print(self, depth=0): """ Print the categorization tree The string formatting inserts tab characters to align child nodes of the same depth. :param depth: The current depth in the print, intended to be called recursively :type depth: int :return: a formated string displaying the tree and its children :rtype: str """ ret = str(('\t' * depth) + "|-" + str(self.av_counts) + ":" + str(self.count) + '\n') for c in self.children: ret += c.pretty_print(depth+1) return ret
[docs] def depth(self): """ Returns the depth of the current node in its tree :return: the depth of the current node in its tree :rtype: int """ if self.parent: return 1 + self.parent.depth() return 0
[docs] def is_parent(self, other_concept): """ Return True if this concept is a parent of other_concept :return: ``True`` if this concept is a parent of other_concept else ``False`` :rtype: bool """ temp = other_concept while temp is not None: if temp == self: return True try: temp = temp.parent except: print(temp) assert False return False
[docs] def num_concepts(self): """ Return the number of concepts contained below the current node in the tree. When called on the :attr:`CobwebTree.root` this is the number of nodes in the whole tree. :return: the number of concepts below this concept. :rtype: int """ children_count = 0 for c in self.children: children_count += c.num_concepts() return 1 + children_count
[docs] def output_json(self): """ Outputs the categorization tree in JSON form :return: an object that contains all of the structural information of the node and its children :rtype: obj """ output = {} output['name'] = "Concept" + str(self.concept_id) output['size'] = self.count output['children'] = [] temp = {} for attr in self.attrs('all'): for value in self.av_counts[attr]: temp[str(attr)] = {str(value): self.av_counts[attr][value] for value in self.av_counts[attr]} # temp[attr + " = " + str(value)] = self.av_counts[attr][value] for child in self.children: output["children"].append(child.output_json()) output['counts'] = temp return output
[docs] def get_weighted_values(self, attr, allow_none=True): """ Return a list of weighted choices for an attribute based on the node's probability table. This calculation will include an option for the change that an attribute is missing from an instance all together. This is useful for probability and sampling calculations. If the attribute has never appeared in the tree then it will return a 100% chance of None. :param attr: an attribute of an instance :type attr: :ref:`Attribute<attributes>` :param allow_none: whether attributes in the nodes probability table can be inferred to be missing. If False, then None will not be cosidered as a possible value. :type allow_none: Boolean :return: a list of weighted choices for attr's value :rtype: [(:ref:`Value<values>`, float), (:ref:`Value<values>`, float), ...] """ choices = [] if attr not in self.av_counts: choices.append((None, 1.0)) return choices val_count = 0 for val in self.av_counts[attr]: count = self.av_counts[attr][val] choices.append((val, count / self.count)) val_count += count if allow_none: choices.append((None, ((self.count - val_count) / self.count))) return choices
[docs] def predict(self, attr, choice_fn="most likely", allow_none=True): """ Predict the value of an attribute, using the specified choice function (either the "most likely" value or a "sampled" value). :param attr: an attribute of an instance. :type attr: :ref:`Attribute<attributes>` :param choice_fn: a string specifying the choice function to use, either "most likely" or "sampled". :type choice_fn: a string :param allow_none: whether attributes not in the instance can be inferred to be missing. If False, then all attributes will be inferred with some value. :type allow_none: Boolean :return: The most likely value for the given attribute in the node's probability table. :rtype: :ref:`Value<values>` """ if choice_fn == "most likely" or choice_fn == "m": choose = most_likely_choice elif choice_fn == "sampled" or choice_fn == "s": choose = weighted_choice else: raise Exception("Unknown choice_fn") if attr not in self.av_counts: return None choices = self.get_weighted_values(attr, allow_none) val = choose(choices) return val
[docs] def probability(self, attr, val): """ Returns the probability of a particular attribute value at the current concept. This takes into account the possibilities that an attribute can take any of the values available at the root, or be missing. If you you want to check if the probability that an attribute is missing, then check for the probability that the val is ``None``. :param attr: an attribute of an instance :type attr: :ref:`Attribute<attributes>` :param val: a value for the given attribute or None :type val: :ref:`Value<values>` :return: The probability of attr having the value val in the current concept. :rtype: float """ if val is None: c = 0.0 if attr in self.av_counts: c = sum([self.av_counts[attr][v] for v in self.av_counts[attr]]) return (self.count - c) / self.count if attr in self.av_counts and val in self.av_counts[attr]: return self.av_counts[attr][val] / self.count return 0.0
[docs] def log_likelihood(self, child_leaf): """ Returns the log-likelihood of a leaf contained within the current concept. Note, if the leaf contains multiple instances, then it is treated as if it contained just a single instance (this function is just called multiple times for each instance in the leaf). """ ll = 0 for attr in set(self.attrs()).union(set(child_leaf.attrs())): vals = set([None]) if attr in self.av_counts: vals.update(self.av_counts[attr]) if attr in child_leaf.av_counts: vals.update(child_leaf.av_counts[attr]) for val in vals: op = child_leaf.probability(attr, val) if op > 0: p = self.probability(attr, val) * op if p >= 0: ll += log(p) else: raise Exception("Should always be greater than 0") return ll