In [1]: from pprint import pprint
In [2]: from concept_formation.trestle import TrestleTree
In [3]: from concept_formation.cluster import cluster
# Data is stored in a list of dictionaries where values can be nominal,
# numeric, or component (a nested dictionary). Tuple keys denote relations.
In [4]: data = [{'f1': 'v1', #nominal value
...: 'f2': 2.6, #numeric value
...: '?f3': {'sub-feature1': 'v1'}, # component value
...: '?f4': {'sub-feature1': 'v1'}, # component value
...: ('some-relation','?f3','?f4'): True #relational attribute
...: },
...: {'f1': 'v1', #nominal value
...: 'f2': 2.8, #numeric value
...: '?f3': {'sub-feature1': 'v2'}, # component value
...: '?f4': {'sub-feature1': 'v1'}, # component value
...: ('some-relation','?f3','?f4'): True #relational attribute
...: }]
...:
# Data can be clustered with a TrestleTree, which supports all data types, or
# with a more specific tree (CobwebTree or Cobweb3Tree) that supports a subset
# of data types (CobwebTree supports only nominal values; Cobweb3Tree supports
# only nominal and numeric values).
In [5]: tree = TrestleTree()
In [6]: tree.fit(data)
# Trees can be printed in plaintext or exported in JSON format
In [7]: print(tree)
|-{'f1': {'v1': 2}, 'f2': {'#ContinuousValue#': 2.7000 (0.1772) [2]}, '('sub-feature1', u'?o2')': {'v1': 2}, '('some-relation', u'?o1', u'?o2')': {'True': 2}, '('sub-feature1', u'?o1')': {'v1': 1, 'v2': 1}}: 2.0
|-{'f1': {'v1': 1}, 'f2': {'#ContinuousValue#': 2.8000 (0.0000) [1]}, '('some-relation', u'?o1', u'?o2')': {'True': 1}, '('sub-feature1', u'?o2')': {'v1': 1}, '('sub-feature1', u'?o1')': {'v2': 1}}: 1.0
|-{'f1': {'v1': 1}, 'f2': {'#ContinuousValue#': 2.6000 (0.0000) [1]}, '('sub-feature1', u'?o2')': {'v1': 1}, '('some-relation', u'?o1', u'?o2')': {'True': 1}, '('sub-feature1', u'?o1')': {'v1': 1}}: 1.0
In [8]: pprint(tree.root.output_json())
{u'children': [{u'children': [],
u'counts': {"('some-relation', u'?o1', u'?o2')": {'True': 1},
"('sub-feature1', u'?o1')": {'v2': 1},
"('sub-feature1', u'?o2')": {'v1': 1},
'f1': {'v1': 1},
'f2': {u'#ContinuousValue#': {u'mean': 2.8,
u'n': 1.0,
u'std': 0.0}}},
u'name': u'Concept247011',
u'size': 1.0},
{u'children': [],
u'counts': {"('some-relation', u'?o1', u'?o2')": {'True': 1},
"('sub-feature1', u'?o1')": {'v1': 1},
"('sub-feature1', u'?o2')": {'v1': 1},
'f1': {'v1': 1},
'f2': {u'#ContinuousValue#': {u'mean': 2.6,
u'n': 1.0,
u'std': 0.0}}},
u'name': u'Concept247022',
u'size': 1.0}],
u'counts': {"('some-relation', u'?o1', u'?o2')": {'True': 2},
"('sub-feature1', u'?o1')": {'v1': 1, 'v2': 1},
"('sub-feature1', u'?o2')": {'v1': 2},
'f1': {'v1': 2},
'f2': {u'#ContinuousValue#': {u'mean': 2.7,
u'n': 2.0,
u'std': 0.17724538509055154}}},
u'name': u'Concept247021',
u'size': 2.0}
# Trees can also be used to infer missing attributes of new data points.
In [9]: new = {'f2': 2.6, '?f3': {'sub-feature1': 'v1'},
...: '?f4': {'sub-feature1': 'v1'}}
...:
# Here we see that 'f1' and 'some-relation' are inferred.
In [10]: pprint(tree.infer_missing(new))
{'?f3': {'sub-feature1': 'v1'},
'?f4': {'sub-feature1': 'v1'},
'f1': 'v1',
'f2': 2.6,
('some-relation', '?f3', '?f4'): True}
# Trees can also be used to predict specific attribute values: categorize the
# instance into a concept, then query that concept.
In [11]: concept = tree.categorize(new)
In [12]: print(concept.predict('f1'))
v1
# Or to get the probability of a particular attribute value
In [13]: print(concept.probability('f1', 'v1'))
1.0
# Trees can also be used to produce flat clusterings
In [14]: new_tree = TrestleTree()
In [15]: clustering = cluster(new_tree, data)
In [16]: print(clustering)
[[u'Concept247047', u'Concept247058']]