Fast Example

In [1]: from pprint import pprint

In [2]: from concept_formation.trestle import TrestleTree

In [3]: from concept_formation.cluster import cluster

# Data is stored in a list of dictionaries whose values can be nominal,
# numeric, or component. Tuple keys denote relational attributes.
In [4]: data = [{'f1': 'v1', #nominal value
   ...:          'f2': 2.6, #numeric value
   ...:          '?f3': {'sub-feature1': 'v1'}, # component value
   ...:          '?f4': {'sub-feature1': 'v1'}, # component value
   ...:          ('some-relation','?f3','?f4'): True #relational attribute
   ...:         },
   ...:         {'f1': 'v1', #nominal value
   ...:          'f2': 2.8, #numeric value
   ...:          '?f3': {'sub-feature1': 'v2'}, # component value
   ...:          '?f4': {'sub-feature1': 'v1'}, # component value
   ...:          ('some-relation','?f3','?f4'): True #relational attribute
   ...:         }]
   ...: 

# Data can be clustered with a TrestleTree, which supports all data types, or
# with a more specific tree (CobwebTree or Cobweb3Tree) that supports a subset
# of data types (CobwebTree supports only nominal values; Cobweb3Tree supports
# only nominal and numeric values).
In [5]: tree = TrestleTree()

In [6]: tree.fit(data)

# Trees can be printed in plaintext or exported in JSON format
In [7]: print(tree)
|-{'f1': {'v1': 2}, 'f2': {'#ContinuousValue#': 2.7000 (0.1772) [2]}, '('sub-feature1', u'?o2')': {'v1': 2}, '('some-relation', u'?o1', u'?o2')': {'True': 2}, '('sub-feature1', u'?o1')': {'v1': 1, 'v2': 1}}: 2.0
	|-{'f1': {'v1': 1}, 'f2': {'#ContinuousValue#': 2.8000 (0.0000) [1]}, '('some-relation', u'?o1', u'?o2')': {'True': 1}, '('sub-feature1', u'?o2')': {'v1': 1}, '('sub-feature1', u'?o1')': {'v2': 1}}: 1.0
	|-{'f1': {'v1': 1}, 'f2': {'#ContinuousValue#': 2.6000 (0.0000) [1]}, '('sub-feature1', u'?o2')': {'v1': 1}, '('some-relation', u'?o1', u'?o2')': {'True': 1}, '('sub-feature1', u'?o1')': {'v1': 1}}: 1.0


In [8]: pprint(tree.root.output_json())
{u'children': [{u'children': [],
                u'counts': {"('some-relation', u'?o1', u'?o2')": {'True': 1},
                            "('sub-feature1', u'?o1')": {'v2': 1},
                            "('sub-feature1', u'?o2')": {'v1': 1},
                            'f1': {'v1': 1},
                            'f2': {u'#ContinuousValue#': {u'mean': 2.8,
                                                          u'n': 1.0,
                                                          u'std': 0.0}}},
                u'name': u'Concept247011',
                u'size': 1.0},
               {u'children': [],
                u'counts': {"('some-relation', u'?o1', u'?o2')": {'True': 1},
                            "('sub-feature1', u'?o1')": {'v1': 1},
                            "('sub-feature1', u'?o2')": {'v1': 1},
                            'f1': {'v1': 1},
                            'f2': {u'#ContinuousValue#': {u'mean': 2.6,
                                                          u'n': 1.0,
                                                          u'std': 0.0}}},
                u'name': u'Concept247022',
                u'size': 1.0}],
 u'counts': {"('some-relation', u'?o1', u'?o2')": {'True': 2},
             "('sub-feature1', u'?o1')": {'v1': 1, 'v2': 1},
             "('sub-feature1', u'?o2')": {'v1': 2},
             'f1': {'v1': 2},
             'f2': {u'#ContinuousValue#': {u'mean': 2.7,
                                           u'n': 2.0,
                                           u'std': 0.17724538509055154}}},
 u'name': u'Concept247021',
 u'size': 2.0}

# Trees can also be used to infer missing attributes of new data points.
In [9]: new = {'f2': 2.6, '?f3': {'sub-feature1': 'v1'},
   ...:        '?f4': {'sub-feature1': 'v1'}}
   ...: 

# Here we see that 'f1' and 'some-relation' are inferred.
In [10]: pprint(tree.infer_missing(new))
{'?f3': {'sub-feature1': 'v1'},
 '?f4': {'sub-feature1': 'v1'},
 'f1': 'v1',
 'f2': 2.6,
 ('some-relation', '?f3', '?f4'): True}

# They can also be used to predict specific attribute values
In [11]: concept = tree.categorize(new)

In [12]: print(concept.predict('f1'))
v1

# Or to get the probability of a particular attribute value
In [13]: print(concept.probability('f1', 'v1'))
1.0

# Trees can also be used to produce flat clusterings
In [14]: new_tree = TrestleTree()

In [15]: clustering = cluster(new_tree, data)

In [16]: print(clustering)
[[u'Concept247047', u'Concept247058']]