Source code for concept_formation.datasets

"""
The dataset module has functions for loading a variety of datasets that
are properly formatted for use with CobwebTrees and their derivatives.
"""

from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from __future__ import division
from os.path import dirname
from os.path import join
import json

from concept_formation.data_files.generate_quadruped import generate_animals


def _load_json(filename, num_instances=None):
    """
    Loads a line-oriented json file from the ``data_files`` directory and
    returns the parsed instances as a list.

    The file is expected to hold a json array with one element per line.
    Each line is parsed on its own after the surrounding array punctuation
    (``[``, ``]``, ``,``) and whitespace have been stripped from its ends.

    :param filename: name of the file inside the ``data_files`` directory
        that sits next to this module.
    :type filename: str
    :param num_instances: maximum number of instances to return; ``None``
        (the default) returns every instance in the file.
    :type num_instances: int or None
    :return: the parsed instances, in file order
    :rtype: list
    """
    module_path = dirname(__file__)
    output = []
    with open(join(module_path, 'data_files', filename)) as dat:
        for lin in dat:
            # Count parsed instances rather than raw line numbers so that
            # exactly ``num_instances`` items come back (``>`` on the line
            # index used to let one extra instance through).
            if num_instances is not None and len(output) >= num_instances:
                break
            payload = lin.strip('[], \t\r\n')
            # A line that only held array punctuation (e.g. a lone "[" or
            # "]") or was blank carries no instance; json.loads('') would
            # raise, so skip it.
            if not payload:
                continue
            output.append(json.loads(payload))
    return output


def _load_file(filename):
    """
    Reads the rows of a file in the ``data_files`` directory and returns
    them as a list of strings with the line terminator removed.

    :param filename: name of the file inside the ``data_files`` directory
        that sits next to this module.
    :type filename: str
    :return: one string per row, in file order
    :rtype: list
    """
    module_path = dirname(__file__)
    with open(join(module_path, 'data_files', filename)) as dat:
        # Strip only the newline: blindly dropping the last character
        # (``row[:-1]``) would eat real data from a final row that is not
        # newline-terminated.
        output = [row.rstrip('\n') for row in dat]
    return output


def load_forest_fires(num_instances=None):
    """
    Load the forest fires dataset.

    This is an example of instances with :ref:`Nominal<val-nom>` and
    :ref:`Numeric<val-num>` values and :ref:`Constant<attr-const>` attributes.
    This dataset contains 517 instances.

    This dataset was downloaded from the `UCI machine learning repository
    <http://archive.ics.uci.edu/ml/datasets/Forest+Fires>`__. We processed the
    data to be in dictionary format with human readable labels.

    :param num_instances: limit forwarded to ``_load_json`` to cap how many
        instances are read; ``None`` (the default) reads the whole file.
    :type num_instances: int or None
    :return: the parsed instances from ``forest_fires.json``
    :rtype: list

    >>> import pprint
    >>> data = load_forest_fires(num_instances=1)
    >>> pprint.pprint(data[0])
    {'DC': 94.3,
     'DMC': 26.2,
     'FFMC': 86.2,
     'ISI': 5.1,
     'RH': 51.0,
     'area': 0.0,
     'day': 'fri',
     'month': 'mar',
     'rain': 0.0,
     'temp': 8.2,
     'wind': 6.7,
     'x-axis': 7.0,
     'y-axis': 5.0}
    """
    return _load_json('forest_fires.json', num_instances)
def load_congressional_voting(num_instances=None):
    """
    Load the voting dataset.

    This is an example of instances with only :ref:`Nominal<val-nom>` values
    and :ref:`Constant<attr-const>` attributes but some attributes are
    occasionally missing. This dataset contains 435 instances.

    This dataset was downloaded from the `UCI machine learning repository
    <http://archive.ics.uci.edu/ml/datasets/Congressional+Voting+Records>`__.
    We processed the data to be in dictionary format with human readable
    labels.

    :param num_instances: limit forwarded to ``_load_json`` to cap how many
        instances are read; ``None`` (the default) reads the whole file.
    :type num_instances: int or None
    :return: the parsed instances from ``congressional_voting.json``
    :rtype: list

    >>> import pprint
    >>> data = load_congressional_voting(num_instances=1)
    >>> pprint.pprint(data[0])
    {'Class Name': 'republican',
     'adoption-of-the-budget-resolution': 'n',
     'aid-to-nicaraguan-contras': 'n',
     'anti-satellite-test-ban': 'n',
     'crime': 'y',
     'duty-free-exports': 'n',
     'education-spending': 'y',
     'el-salvador-aid': 'y',
     'export-administration-act-south-africa': 'y',
     'handicapped-infants': 'n',
     'immigration': 'y',
     'mx-missile': 'n',
     'physician-fee-freeze': 'y',
     'religious-groups-in-schools': 'y',
     'superfund-right-to-sue': 'y',
     'water-project-cost-sharing': 'y'}
    """
    return _load_json('congressional_voting.json', num_instances)
def load_iris(num_instances=None):
    """
    Load the iris dataset.

    This is an example of instances with :ref:`Nominal<val-nom>` and
    :ref:`Numeric<val-num>` values and :ref:`Constant<attr-const>` attributes.
    This dataset contains 150 instances.

    This dataset was downloaded from the `UCI machine learning repository
    <https://archive.ics.uci.edu/ml/datasets/Iris>`__. We processed the data
    to be in dictionary format with human readable labels.

    :param num_instances: limit forwarded to ``_load_json`` to cap how many
        instances are read; ``None`` (the default) reads the whole file.
    :type num_instances: int or None
    :return: the parsed instances from ``iris.json``
    :rtype: list

    >>> import pprint
    >>> data = load_iris(num_instances=1)
    >>> pprint.pprint(data[0])
    {'class': 'Iris-setosa',
     'petal length': 1.4,
     'petal width': 0.2,
     'sepal length': 5.1,
     'sepal width': 3.5}
    """
    return _load_json('iris.json', num_instances)
def load_mushroom(num_instances=None):
    """
    Load the mushroom dataset.

    This is an example of instances with only :ref:`Nominal<val-nom>` values
    and :ref:`Constant<attr-const>` attributes. This dataset contains 8124
    instances.

    This dataset was downloaded from the `UCI machine learning repository
    <https://archive.ics.uci.edu/ml/datasets/Mushroom>`__. We processed the
    data to be in dictionary format with human readable labels.

    :param num_instances: limit forwarded to ``_load_json`` to cap how many
        instances are read; ``None`` (the default) reads the whole file.
    :type num_instances: int or None
    :return: the parsed instances from ``mushrooms.json``
    :rtype: list

    >>> import pprint
    >>> data = load_mushroom(num_instances=1)
    >>> pprint.pprint(data[0])
    {'bruises?': 'yes',
     'cap-color': 'brown',
     'cap-shape': 'convex',
     'cap-surface': 'smooth',
     'classification': 'poisonous',
     'gill-attachment': 'free',
     'gill-color': 'black',
     'gill-size': 'narrow',
     'gill-spacing': 'closed',
     'habitat': 'urban',
     'odor': 'pungent',
     'population': 'scattered',
     'ring-number': 'one',
     'ring-type': 'pendant',
     'spore-print-color': 'black',
     'stalk-color-above-ring': 'white',
     'stalk-color-below-ring': 'white',
     'stalk-root': 'equal',
     'stalk-shape': 'enlarging',
     'stalk-surface-above-ring': 'smooth',
     'stalk-surface-below-ring': 'smooth',
     'veil-color': 'white',
     'veil-type': 'partial'}
    """
    return _load_json('mushrooms.json', num_instances)
def load_rb_com_11(num_instances=None):
    """
    Load the RumbleBlocks, Center of Mass Level 11, dataset.

    This is an example of instances with all the attribute and value types
    described in the :ref:`instance-rep`. This dataset contains 251 instances.

    :param num_instances: limit forwarded to ``_load_json`` to cap how many
        instances are read; ``None`` (the default) reads the whole file.
        Optional so that existing zero-argument calls keep working; it
        mirrors the signature of the other RumbleBlocks loaders.
    :type num_instances: int or None
    :return: the parsed instances from ``rb_com_11_continuous.json``
    :rtype: list

    >>> import pprint
    >>> data = load_rb_com_11()
    >>> pprint.pprint(data[0])
    {'_guid': 'ea022d3d-5c9e-46d7-be23-8ea718fe7816',
     '_human_cluster_label': '0',
     'component0': {'b': 1.0, 'l': 0.0, 'r': 1.0, 't': 2.0, 'type': 'cube0'},
     'component1': {'b': 3.0, 'l': 2.0, 'r': 3.0, 't': 4.0, 'type': 'cube0'},
     'component14': {'b': 4.0, 'l': 1.0, 'r': 4.0, 't': 5.0, 'type': 'ufoo0'},
     'component2': {'b': 1.0, 'l': 1.0, 'r': 4.0, 't': 2.0, 'type': 'plat0'},
     'component3': {'b': 2.0, 'l': 1.0, 'r': 4.0, 't': 3.0, 'type': 'plat0'},
     'component4': {'b': 0.0, 'l': 0.0, 'r': 5.0, 't': 1.0, 'type': 'rect0'}}
    """
    return _load_json('rb_com_11_continuous.json', num_instances)
def load_rb_s_07(num_instances=None):
    """
    Load the RumbleBlocks, Symmetry Level 7, dataset.

    This is an example of instances with all the attribute and value types
    described in the :ref:`instance-rep`. This dataset contains 141 instances.

    :param num_instances: limit forwarded to ``_load_json`` to cap how many
        instances are read; ``None`` (the default) reads the whole file.
    :type num_instances: int or None
    :return: the parsed instances from ``rb_s_07_continuous.json``
    :rtype: list

    >>> import pprint
    >>> data = load_rb_s_07(num_instances=1)
    >>> pprint.pprint(data[0])
    {'_guid': '660ac76d-93b3-4ce7-8a15-a3213e9103f5',
     'component0': {'b': 0.0, 'l': 0.0, 'r': 3.0, 't': 1.0, 'type': 'plat0'},
     'component1': {'b': 1.0, 'l': 1.0, 'r': 2.0, 't': 4.0, 'type': 'plat90'},
     'component8': {'b': 4.0, 'l': 0.0, 'r': 3.0, 't': 5.0, 'type': 'ufoo0'},
     'success': '0'}
    """
    return _load_json('rb_s_07_continuous.json', num_instances)
def load_rb_s_13(num_instances=None):
    """
    Load the RumbleBlocks, Symmetry Level 13, dataset.

    This is an example of instances with all the attribute and value types
    described in the :ref:`instance-rep`. This dataset contains 249 instances.

    :param num_instances: limit forwarded to ``_load_json`` to cap how many
        instances are read; ``None`` (the default) reads the whole file.
    :type num_instances: int or None
    :return: the parsed instances from ``rb_s_13_continuous.json``
    :rtype: list

    >>> import pprint
    >>> data = load_rb_s_13(num_instances=1)
    >>> pprint.pprint(data[0])
    {'_guid': '684b4ce5-0f55-481c-ae9a-1474de8418ea',
     '_human_cluster_label': '0',
     'component0': {'b': 3.0, 'l': 2.0, 'r': 3.0, 't': 4.0, 'type': 'cube0'},
     'component1': {'b': 4.0, 'l': 2.0, 'r': 3.0, 't': 5.0, 'type': 'cube0'},
     'component14': {'b': 0.0, 'l': 0.0, 'r': 4.0, 't': 1.0, 'type': 'trap0'},
     'component15': {'b': 5.0, 'l': 1.0, 'r': 3.0, 't': 6.0, 'type': 'ufoo0'},
     'component2': {'b': 1.0, 'l': 0.0, 'r': 3.0, 't': 2.0, 'type': 'plat0'},
     'component3': {'b': 2.0, 'l': 0.0, 'r': 3.0, 't': 3.0, 'type': 'plat0'}}
    """
    return _load_json('rb_s_13_continuous.json', num_instances)
def load_rb_wb_03(num_instances=None):
    """
    Load the RumbleBlocks, Wide Base Level 03, dataset.

    This is an example of instances with all the attribute and value types
    described in the :ref:`instance-rep`. This dataset contains 254 instances.

    :param num_instances: limit forwarded to ``_load_json`` to cap how many
        instances are read; ``None`` (the default) reads the whole file.
    :type num_instances: int or None
    :return: the parsed instances from ``rb_wb_03_continuous.json``
    :rtype: list

    >>> import pprint
    >>> data = load_rb_wb_03(num_instances=1)
    >>> pprint.pprint(data[0])
    {'_guid': 'aa5eff72-0572-4eff-a007-3def9a82ba5b',
     '_human_cluster_label': '0',
     'component0': {'b': 2.0, 'l': 2.0, 'r': 3.0, 't': 3.0, 'type': 'cube0'},
     'component1': {'b': 2.0, 'l': 3.0, 'r': 4.0, 't': 3.0, 'type': 'cube0'},
     'component11': {'b': 3.0, 'l': 1.0, 'r': 4.0, 't': 4.0, 'type': 'ufoo0'},
     'component2': {'b': 1.0, 'l': 2.0, 'r': 5.0, 't': 2.0, 'type': 'plat0'},
     'component3': {'b': 0.0, 'l': 0.0, 'r': 5.0, 't': 1.0, 'type': 'rect0'}}
    """
    return _load_json('rb_wb_03_continuous.json', num_instances)
def load_rb_s_07_human_predictions():
    """
    Load the Human Predictions Data for the RumbleBlocks, Symmetry Level 7,
    dataset.

    This is data collected from mechanical turk, where workers were tasked
    with predicting a concept label (success) given a picture of the tower.
    The first element contains labels for the data and subsequent rows
    contain the actual data. This dataset contains 601 instances.

    :return: the rows of ``human_s_07_success_predictions.csv`` as returned
        by ``_load_file``, one comma-separated string per row
    :rtype: list

    >>> import pprint
    >>> data = load_rb_s_07_human_predictions()
    >>> pprint.pprint(data[0:2])
    ['user_id,instance_guid,time,order,prediction,correctness',
     '1,2fda0bde-95a7-4bda-9851-785275c3f56d,2015-02-15 '
     '19:21:14.327344+00:00,1,0,1']
    """
    return _load_file('human_s_07_success_predictions.csv')
def load_quadruped(num_instances):
    """
    Returns a randomly generated quadruped dataset of size `num_instances`
    using the procedure employed in:

        Gennari, J. H., Langley, P., & Fisher, D. H. (1989). Models of
        incremental concept formation. Artificial Intelligence, 40, 11-61.

    This dataset contains four kinds of quadruped animals: dogs, cats, horses,
    and giraffes. The type of each component is included as a hidden variable,
    so that structure mapping can be tested. Additionally, the type of animal
    (e.g., dog) is also included as a hidden variable.

    :param num_instances: number of animals requested; passed straight to
        ``generate_animals``.
    :type num_instances: int
    :return: whatever ``generate_animals`` produces for that size (a list of
        nested dictionaries in the example below)

    >>> import pprint
    >>> import random
    >>> random.seed(0)
    >>> data = load_quadruped(10)
    >>> print(len(data))
    10
    >>> pprint.pprint(data[0:1])
    [{'_type': 'giraffe',
      'head': {'_type': 'head',
               'axisX': 1,
               'axisY': -0.23376215459531377,
               'axisZ': 0,
               'height': 19.069373148228724,
               'locationX': 71.71171645023995,
               'locationY': 0,
               'locationZ': 49.26645266304532,
               'radius': 4.05626484907961,
               'texture': 177.5670433982545},
      'leg1': {'_type': 'leg1',
               'axisX': 0.25279896094692916,
               'axisY': 0,
               'axisZ': -1,
               'height': 60.13197726212744,
               'locationX': 35.29119556606559,
               'locationY': 12.845931778870957,
               'locationZ': -42.91192040993468,
               'radius': 3.597944849223721,
               'texture': 179.23727389536953},
      'leg2': {'_type': 'leg2',
               'axisX': 0,
               'axisY': 0,
               'axisZ': -1,
               'height': 60.13197726212744,
               'locationX': 35.29119556606559,
               'locationY': -12.845931778870957,
               'locationZ': -42.91192040993468,
               'radius': 2.009043416794043,
               'texture': 174.58392827108403},
      'leg3': {'_type': 'leg3',
               'axisX': 0,
               'axisY': 0,
               'axisZ': -1,
               'height': 60.13197726212744,
               'locationX': -35.29119556606559,
               'locationY': 12.845931778870957,
               'locationZ': -42.91192040993468,
               'radius': 2.348946587645933,
               'texture': 178.9283460962157},
      'leg4': {'_type': 'leg4',
               'axisX': 0.28802829434429883,
               'axisY': 0,
               'axisZ': -1,
               'height': 60.13197726212744,
               'locationX': -35.29119556606559,
               'locationY': -12.845931778870957,
               'locationZ': -42.91192040993468,
               'radius': 2.9029316087251233,
               'texture': 171.86316987918838},
      'neck': {'_type': 'neck',
               'axisX': 1,
               'axisY': 0,
               'axisZ': 1,
               'height': 51.49861653022255,
               'locationX': 53.50145600815277,
               'locationY': 0,
               'locationZ': 31.05619222095814,
               'radius': 7.87732253394808,
               'texture': 177.14627952379485},
      'tail': {'_type': 'tail',
               'axisX': -1,
               'axisY': 0.24883477194257322,
               'axisZ': -0.531438665320418,
               'height': 20.918101962779517,
               'locationX': -49.66428916935166,
               'locationY': 0,
               'locationZ': 0,
               'radius': 0.9455145384298446,
               'texture': 177.24907471005645},
      'torso': {'_type': 'torso',
                'axisX': 1,
                'axisY': 0,
                'axisZ': 0,
                'height': 70.58239113213118,
                'locationX': 0,
                'locationY': 0,
                'locationZ': 0,
                'radius': 12.845931778870957,
                'texture': 171.2283287965781}}]
    """
    return generate_animals(num_instances)
def load_molecule(num_instances=None):
    """
    Load a dataset of 101 molecules from the pubchem database.

    This dataset was downloaded from the `Pubchem database
    <https://www.ncbi.nlm.nih.gov/pccompound>`__. We used a custom `molfile
    parser <https://github.com/eharpste/molparser>`__ to process the data to
    be in dictionary format with human readable labels.

    :param num_instances: limit forwarded to ``_load_json`` to cap how many
        instances are read; ``None`` (the default) reads the whole file.
    :type num_instances: int or None
    :return: the parsed instances from ``molecule.json``
    :rtype: list

    >>> import pprint
    >>> data = load_molecule()
    >>> pprint.pprint(data[3])
    {'(bond Single Not_stereo ?atom0001 ?atom0003)': True,
     '(bond Single Not_stereo ?atom0001 ?atom0014)': True,
     '(bond Single Not_stereo ?atom0002 ?atom0004)': True,
     '(bond Single Not_stereo ?atom0002 ?atom0012)': True,
     '(bond Single Not_stereo ?atom0002 ?atom0013)': True,
     '(bond Single Not_stereo ?atom0003 ?atom0004)': True,
     '(bond Single Not_stereo ?atom0003 ?atom0005)': True,
     '(bond Single Not_stereo ?atom0003 ?atom0006)': True,
     '(bond Single Not_stereo ?atom0004 ?atom0007)': True,
     '(bond Single Not_stereo ?atom0004 ?atom0008)': True,
     '(bond Single Not_stereo ?atom0005 ?atom0009)': True,
     '(bond Single Not_stereo ?atom0005 ?atom0010)': True,
     '(bond Single Not_stereo ?atom0005 ?atom0011)': True,
     '?atom0001': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'O',
                   'valence': 'no marking',
                   'x': 2.5369,
                   'y': 0.75,
                   'z': 0.0},
     '?atom0002': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'N',
                   'valence': 'no marking',
                   'x': 5.135,
                   'y': 0.25,
                   'z': 0.0},
     '?atom0003': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'unmarked',
                   'symbol': 'C',
                   'valence': 'no marking',
                   'x': 3.403,
                   'y': 0.25,
                   'z': 0.0},
     '?atom0004': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'C',
                   'valence': 'no marking',
                   'x': 4.269,
                   'y': 0.75,
                   'z': 0.0},
     '?atom0005': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'C',
                   'valence': 'no marking',
                   'x': 3.403,
                   'y': -0.75,
                   'z': 0.0},
     '?atom0006': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 3.403,
                   'y': 1.1,
                   'z': 0.0},
     '?atom0007': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 4.6675,
                   'y': 1.225,
                   'z': 0.0},
     '?atom0008': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 3.8705,
                   'y': 1.225,
                   'z': 0.0},
     '?atom0009': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 2.783,
                   'y': -0.75,
                   'z': 0.0},
     '?atom0010': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 3.403,
                   'y': -1.37,
                   'z': 0.0},
     '?atom0011': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 4.023,
                   'y': -0.75,
                   'z': 0.0},
     '?atom0012': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 5.672,
                   'y': 0.56,
                   'z': 0.0},
     '?atom0013': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 5.135,
                   'y': -0.37,
                   'z': 0.0},
     '?atom0014': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 2.0,
                   'y': 0.44,
                   'z': 0.0},
     '_name': '4',
     '_software': '-OEChem-03201502492D',
     '_version': 'V2000',
     'chiral': True}
    """
    return _load_json('molecule.json', num_instances)