# Source code for concept_formation.datasets

"""
The dataset module has functions for loading a variety of datasets that
are properly formatted for use with CobwebTrees and their derivatives.
"""

from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from __future__ import division
from os.path import dirname
from os.path import join
import json

from concept_formation.data_files.generate_quadruped import generate_animals


def _load_json(filename, num_instances=None):
    """
    Loads a line-delimited json array file from the ``data_files`` directory
    and returns the list of python objects generated from parsing each line.

    The file is read one line at a time; the array punctuation surrounding
    each entry (``[``, ``]``, ``,`` and the newline) is stripped from both
    ends of the line and the remainder is parsed as a single json document.
    Lines that are empty after stripping are skipped.

    :param filename: name of the file, resolved relative to the
        ``data_files`` directory that sits next to this module
    :param num_instances: maximum number of instances to return, ``None``
        (the default) returns every instance in the file
    :return: a list with one parsed object per non-empty line
    """
    module_path = dirname(__file__)
    output = []
    with open(join(module_path, 'data_files', filename)) as dat:
        for lin in dat:
            # Stop as soon as the requested number of instances has been
            # collected, so exactly ``num_instances`` objects are returned
            # (the previous ``idx > num_instances`` test returned one extra).
            if num_instances is not None and len(output) >= num_instances:
                break
            text = lin.strip('[],\n')
            if not text.strip():
                # A bare bracket or blank line carries no instance.
                continue
            output.append(json.loads(text))
    return output


def _load_file(filename):
    """
    Reads the rows of a file from the ``data_files`` directory and returns
    them as a list of strings with the line terminator removed.

    :param filename: name of the file, resolved relative to the
        ``data_files`` directory that sits next to this module
    :return: a list containing one string per line of the file
    """
    module_path = dirname(__file__)
    with open(join(module_path, 'data_files', filename)) as dat:
        # Strip only the newline: slicing off the last character blindly
        # would truncate the final row when the file lacks a trailing
        # newline.
        output = [row.rstrip('\n') for row in dat]
    return output


def load_forest_fires(num_instances=None):
    """
    Load the forest fires dataset.

    This is an example of instances with :ref:`Nominal<val-nom>` and
    :ref:`Numeric<val-num>` values and :ref:`Constant<attr-const>` attributes.
    This dataset contains 517 instances.

    This dataset was downloaded from the `UCI machine learning repository
    <http://archive.ics.uci.edu/ml/datasets/Forest+Fires>`__.
    We processed the data to be in dictionary format with human readable
    labels.

    :param num_instances: forwarded to ``_load_json`` to limit how many
        instances are read from ``forest_fires.json``; ``None`` (the
        default) loads every instance
    :return: a list of instance dictionaries

    >>> import pprint
    >>> data = load_forest_fires(num_instances=1)
    >>> pprint.pprint(data[0])
    {'DC': 94.3,
     'DMC': 26.2,
     'FFMC': 86.2,
     'ISI': 5.1,
     'RH': 51.0,
     'area': 0.0,
     'day': 'fri',
     'month': 'mar',
     'rain': 0.0,
     'temp': 8.2,
     'wind': 6.7,
     'x-axis': 7.0,
     'y-axis': 5.0}

    """
    return _load_json('forest_fires.json', num_instances)


def load_congressional_voting(num_instances=None):
    """
    Load the voting dataset.

    This is an example of instances with only :ref:`Nominal<val-nom>` values
    and :ref:`Constant<attr-const>` attributes but some attributes are
    occasionally missing. This dataset contains 435 instances.

    This dataset was downloaded from the `UCI machine learning repository
    <http://archive.ics.uci.edu/ml/datasets/Congressional+Voting+Records>`__.
    We processed the data to be in dictionary format with human readable
    labels.

    :param num_instances: forwarded to ``_load_json`` to limit how many
        instances are read from ``congressional_voting.json``; ``None``
        (the default) loads every instance
    :return: a list of instance dictionaries

    >>> import pprint
    >>> data = load_congressional_voting(num_instances=1)
    >>> pprint.pprint(data[0])
    {'Class Name': 'republican',
     'adoption-of-the-budget-resolution': 'n',
     'aid-to-nicaraguan-contras': 'n',
     'anti-satellite-test-ban': 'n',
     'crime': 'y',
     'duty-free-exports': 'n',
     'education-spending': 'y',
     'el-salvador-aid': 'y',
     'export-administration-act-south-africa': 'y',
     'handicapped-infants': 'n',
     'immigration': 'y',
     'mx-missile': 'n',
     'physician-fee-freeze': 'y',
     'religious-groups-in-schools': 'y',
     'superfund-right-to-sue': 'y',
     'water-project-cost-sharing': 'y'}

    """
    return _load_json('congressional_voting.json', num_instances)


def load_iris(num_instances=None):
    """
    Load the iris dataset.

    This is an example of instances with :ref:`Nominal<val-nom>` and
    :ref:`Numeric<val-num>` values and :ref:`Constant<attr-const>` attributes.
    This dataset contains 150 instances.

    This dataset was downloaded from the `UCI machine learning repository
    <https://archive.ics.uci.edu/ml/datasets/Iris>`__. We processed the data
    to be in dictionary format with human readable labels.

    :param num_instances: forwarded to ``_load_json`` to limit how many
        instances are read from ``iris.json``; ``None`` (the default) loads
        every instance
    :return: a list of instance dictionaries

    >>> import pprint
    >>> data = load_iris(num_instances=1)
    >>> pprint.pprint(data[0])
    {'class': 'Iris-setosa',
     'petal length': 1.4,
     'petal width': 0.2,
     'sepal length': 5.1,
     'sepal width': 3.5}

    """
    return _load_json('iris.json', num_instances)


def load_mushroom(num_instances=None):
    """
    Load the mushroom dataset.

    This is an example of instances with only :ref:`Nominal<val-nom>` values
    and :ref:`Constant<attr-const>` attributes. This dataset contains 8124
    instances.

    This dataset was downloaded from the `UCI machine learning repository
    <https://archive.ics.uci.edu/ml/datasets/Mushroom>`__. We processed the
    data to be in dictionary format with human readable labels.

    :param num_instances: forwarded to ``_load_json`` to limit how many
        instances are read from ``mushrooms.json``; ``None`` (the default)
        loads every instance
    :return: a list of instance dictionaries

    >>> import pprint
    >>> data = load_mushroom(num_instances=1)
    >>> pprint.pprint(data[0])
    {'bruises?': 'yes',
     'cap-color': 'brown',
     'cap-shape': 'convex',
     'cap-surface': 'smooth',
     'classification': 'poisonous',
     'gill-attachment': 'free',
     'gill-color': 'black',
     'gill-size': 'narrow',
     'gill-spacing': 'closed',
     'habitat': 'urban',
     'odor': 'pungent',
     'population': 'scattered',
     'ring-number': 'one',
     'ring-type': 'pendant',
     'spore-print-color': 'black',
     'stalk-color-above-ring': 'white',
     'stalk-color-below-ring': 'white',
     'stalk-root': 'equal',
     'stalk-shape': 'enlarging',
     'stalk-surface-above-ring': 'smooth',
     'stalk-surface-below-ring': 'smooth',
     'veil-color': 'white',
     'veil-type': 'partial'}
    """
    return _load_json('mushrooms.json', num_instances)


def load_rb_com_11(num_instances=None):
    """
    Load the RumbleBlocks, Center of Mass Level 11, dataset.

    This is an example of instances with all the attribute and value types
    described in the :ref:`instance-rep`. This dataset contains 251 instances.

    :param num_instances: forwarded to ``_load_json`` to limit how many
        instances are read from ``rb_com_11_continuous.json``; ``None``
        (the default) loads every instance, which matches the behavior of
        calling this function with no arguments
    :return: a list of instance dictionaries

    >>> import pprint
    >>> data = load_rb_com_11(num_instances=1)
    >>> pprint.pprint(data[0])
    {'_guid': 'ea022d3d-5c9e-46d7-be23-8ea718fe7816',
     '_human_cluster_label': '0',
     'component0': {'b': 1.0, 'l': 0.0, 'r': 1.0, 't': 2.0, 'type': 'cube0'},
     'component1': {'b': 3.0, 'l': 2.0, 'r': 3.0, 't': 4.0, 'type': 'cube0'},
     'component14': {'b': 4.0, 'l': 1.0, 'r': 4.0, 't': 5.0, 'type': 'ufoo0'},
     'component2': {'b': 1.0, 'l': 1.0, 'r': 4.0, 't': 2.0, 'type': 'plat0'},
     'component3': {'b': 2.0, 'l': 1.0, 'r': 4.0, 't': 3.0, 'type': 'plat0'},
     'component4': {'b': 0.0, 'l': 0.0, 'r': 5.0, 't': 1.0, 'type': 'rect0'}}
    """
    return _load_json('rb_com_11_continuous.json', num_instances)


def load_rb_s_07(num_instances=None):
    """
    Load the RumbleBlocks, Symmetry Level 7, dataset.

    This is an example of instances with all the attribute and value types
    described in the :ref:`instance-rep`. This dataset contains 141 instances.

    :param num_instances: forwarded to ``_load_json`` to limit how many
        instances are read from ``rb_s_07_continuous.json``; ``None`` (the
        default) loads every instance
    :return: a list of instance dictionaries

    >>> import pprint
    >>> data = load_rb_s_07(num_instances=1)
    >>> pprint.pprint(data[0])
    {'_guid': '660ac76d-93b3-4ce7-8a15-a3213e9103f5',
     'component0': {'b': 0.0, 'l': 0.0, 'r': 3.0, 't': 1.0, 'type': 'plat0'},
     'component1': {'b': 1.0, 'l': 1.0, 'r': 2.0, 't': 4.0, 'type': 'plat90'},
     'component8': {'b': 4.0, 'l': 0.0, 'r': 3.0, 't': 5.0, 'type': 'ufoo0'},
     'success': '0'}
    """
    return _load_json('rb_s_07_continuous.json', num_instances)


def load_rb_s_13(num_instances=None):
    """
    Load the RumbleBlocks, Symmetry Level 13, dataset.

    This is an example of instances with all the attribute and value types
    described in the :ref:`instance-rep`. This dataset contains 249 instances.

    :param num_instances: forwarded to ``_load_json`` to limit how many
        instances are read from ``rb_s_13_continuous.json``; ``None`` (the
        default) loads every instance
    :return: a list of instance dictionaries

    >>> import pprint
    >>> data = load_rb_s_13(num_instances=1)
    >>> pprint.pprint(data[0])
    {'_guid': '684b4ce5-0f55-481c-ae9a-1474de8418ea',
     '_human_cluster_label': '0',
     'component0': {'b': 3.0, 'l': 2.0, 'r': 3.0, 't': 4.0, 'type': 'cube0'},
     'component1': {'b': 4.0, 'l': 2.0, 'r': 3.0, 't': 5.0, 'type': 'cube0'},
     'component14': {'b': 0.0, 'l': 0.0, 'r': 4.0, 't': 1.0, 'type': 'trap0'},
     'component15': {'b': 5.0, 'l': 1.0, 'r': 3.0, 't': 6.0, 'type': 'ufoo0'},
     'component2': {'b': 1.0, 'l': 0.0, 'r': 3.0, 't': 2.0, 'type': 'plat0'},
     'component3': {'b': 2.0, 'l': 0.0, 'r': 3.0, 't': 3.0, 'type': 'plat0'}}
    """
    return _load_json('rb_s_13_continuous.json', num_instances)


def load_rb_wb_03(num_instances=None):
    """
    Load the RumbleBlocks, Wide Base Level 03, dataset.

    This is an example of instances with all the attribute and value types
    described in the :ref:`instance-rep`. This dataset contains 254 instances.

    :param num_instances: forwarded to ``_load_json`` to limit how many
        instances are read from ``rb_wb_03_continuous.json``; ``None`` (the
        default) loads every instance
    :return: a list of instance dictionaries

    >>> import pprint
    >>> data = load_rb_wb_03(num_instances=1)
    >>> pprint.pprint(data[0])
    {'_guid': 'aa5eff72-0572-4eff-a007-3def9a82ba5b',
     '_human_cluster_label': '0',
     'component0': {'b': 2.0, 'l': 2.0, 'r': 3.0, 't': 3.0, 'type': 'cube0'},
     'component1': {'b': 2.0, 'l': 3.0, 'r': 4.0, 't': 3.0, 'type': 'cube0'},
     'component11': {'b': 3.0, 'l': 1.0, 'r': 4.0, 't': 4.0, 'type': 'ufoo0'},
     'component2': {'b': 1.0, 'l': 2.0, 'r': 5.0, 't': 2.0, 'type': 'plat0'},
     'component3': {'b': 0.0, 'l': 0.0, 'r': 5.0, 't': 1.0, 'type': 'rect0'}}
    """
    return _load_json('rb_wb_03_continuous.json', num_instances)


def load_rb_s_07_human_predictions():
    """
    Load the Human Predictions Data for the RumbleBlocks, Symmetry Level 7,
    dataset.

    This is data collected from mechanical turk, where workers were tasked with
    predicting a concept label (success) given a picture of the tower. The
    first element contains the column labels for the data and subsequent rows
    contain the actual data. This dataset contains 601 instances.

    :return: a list of strings, one per row of
        ``human_s_07_success_predictions.csv``; rows are returned as raw
        comma separated text and are not split into fields

    >>> import pprint
    >>> data = load_rb_s_07_human_predictions()
    >>> pprint.pprint(data[0:2])
    ['user_id,instance_guid,time,order,prediction,correctness',
     '1,2fda0bde-95a7-4bda-9851-785275c3f56d,2015-02-15 '
     '19:21:14.327344+00:00,1,0,1']
    """
    return _load_file('human_s_07_success_predictions.csv')


def load_quadruped(num_instances):
    """
    Returns a randomly generated quadruped dataset of size `num_instances`
    using the procedure employed in:

    Gennari, J. H., Langley, P., & Fisher, D. H. (1989). Models of incremental
    concept formation. Artificial Intelligence, 40, 11-61.

    This dataset contains four kinds of quadruped animals: dogs, cats, horses,
    and giraffes. The type of each component is included as a hidden variable,
    so that structure mapping can be tested. Additionally, the type of animal
    (e.g., dog) is also included as a hidden variable.

    :param num_instances: number of animals to generate; passed straight to
        ``generate_animals``
    :return: whatever ``generate_animals`` produces, shown in the example
        below as a list of nested instance dictionaries

    >>> import pprint
    >>> import random
    >>> random.seed(0)
    >>> data = load_quadruped(10)
    >>> print(len(data))
    10
    >>> pprint.pprint(data[0:1])
    [{'_type': 'giraffe',
      'head': {'_type': 'head',
               'axisX': 1,
               'axisY': -0.23376215459531377,
               'axisZ': 0,
               'height': 19.069373148228724,
               'locationX': 71.71171645023995,
               'locationY': 0,
               'locationZ': 49.26645266304532,
               'radius': 4.05626484907961,
               'texture': 177.5670433982545},
      'leg1': {'_type': 'leg1',
               'axisX': 0.25279896094692916,
               'axisY': 0,
               'axisZ': -1,
               'height': 60.13197726212744,
               'locationX': 35.29119556606559,
               'locationY': 12.845931778870957,
               'locationZ': -42.91192040993468,
               'radius': 3.597944849223721,
               'texture': 179.23727389536953},
      'leg2': {'_type': 'leg2',
               'axisX': 0,
               'axisY': 0,
               'axisZ': -1,
               'height': 60.13197726212744,
               'locationX': 35.29119556606559,
               'locationY': -12.845931778870957,
               'locationZ': -42.91192040993468,
               'radius': 2.009043416794043,
               'texture': 174.58392827108403},
      'leg3': {'_type': 'leg3',
               'axisX': 0,
               'axisY': 0,
               'axisZ': -1,
               'height': 60.13197726212744,
               'locationX': -35.29119556606559,
               'locationY': 12.845931778870957,
               'locationZ': -42.91192040993468,
               'radius': 2.348946587645933,
               'texture': 178.9283460962157},
      'leg4': {'_type': 'leg4',
               'axisX': 0.28802829434429883,
               'axisY': 0,
               'axisZ': -1,
               'height': 60.13197726212744,
               'locationX': -35.29119556606559,
               'locationY': -12.845931778870957,
               'locationZ': -42.91192040993468,
               'radius': 2.9029316087251233,
               'texture': 171.86316987918838},
      'neck': {'_type': 'neck',
               'axisX': 1,
               'axisY': 0,
               'axisZ': 1,
               'height': 51.49861653022255,
               'locationX': 53.50145600815277,
               'locationY': 0,
               'locationZ': 31.05619222095814,
               'radius': 7.87732253394808,
               'texture': 177.14627952379485},
      'tail': {'_type': 'tail',
               'axisX': -1,
               'axisY': 0.24883477194257322,
               'axisZ': -0.531438665320418,
               'height': 20.918101962779517,
               'locationX': -49.66428916935166,
               'locationY': 0,
               'locationZ': 0,
               'radius': 0.9455145384298446,
               'texture': 177.24907471005645},
      'torso': {'_type': 'torso',
                'axisX': 1,
                'axisY': 0,
                'axisZ': 0,
                'height': 70.58239113213118,
                'locationX': 0,
                'locationY': 0,
                'locationZ': 0,
                'radius': 12.845931778870957,
                'texture': 171.2283287965781}}]
    """
    return generate_animals(num_instances)


def load_molecule(num_instances=None):
    """
    Load a dataset of 101 molecules from the pubchem database.

    This dataset was downloaded from the `Pubchem database
    <https://www.ncbi.nlm.nih.gov/pccompound>`__. We used a custom `molfile
    parser <https://github.com/eharpste/molparser>`__ to process the data to
    be in dictionary format with human readable labels.

    :param num_instances: forwarded to ``_load_json`` to limit how many
        instances are read from ``molecule.json``; ``None`` (the default)
        loads every instance
    :return: a list of instance dictionaries

    >>> import pprint
    >>> data = load_molecule()
    >>> pprint.pprint(data[3])
    {'(bond Single Not_stereo ?atom0001 ?atom0003)': True,
     '(bond Single Not_stereo ?atom0001 ?atom0014)': True,
     '(bond Single Not_stereo ?atom0002 ?atom0004)': True,
     '(bond Single Not_stereo ?atom0002 ?atom0012)': True,
     '(bond Single Not_stereo ?atom0002 ?atom0013)': True,
     '(bond Single Not_stereo ?atom0003 ?atom0004)': True,
     '(bond Single Not_stereo ?atom0003 ?atom0005)': True,
     '(bond Single Not_stereo ?atom0003 ?atom0006)': True,
     '(bond Single Not_stereo ?atom0004 ?atom0007)': True,
     '(bond Single Not_stereo ?atom0004 ?atom0008)': True,
     '(bond Single Not_stereo ?atom0005 ?atom0009)': True,
     '(bond Single Not_stereo ?atom0005 ?atom0010)': True,
     '(bond Single Not_stereo ?atom0005 ?atom0011)': True,
     '?atom0001': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'O',
                   'valence': 'no marking',
                   'x': 2.5369,
                   'y': 0.75,
                   'z': 0.0},
     '?atom0002': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'N',
                   'valence': 'no marking',
                   'x': 5.135,
                   'y': 0.25,
                   'z': 0.0},
     '?atom0003': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'unmarked',
                   'symbol': 'C',
                   'valence': 'no marking',
                   'x': 3.403,
                   'y': 0.25,
                   'z': 0.0},
     '?atom0004': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'C',
                   'valence': 'no marking',
                   'x': 4.269,
                   'y': 0.75,
                   'z': 0.0},
     '?atom0005': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'C',
                   'valence': 'no marking',
                   'x': 3.403,
                   'y': -0.75,
                   'z': 0.0},
     '?atom0006': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 3.403,
                   'y': 1.1,
                   'z': 0.0},
     '?atom0007': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 4.6675,
                   'y': 1.225,
                   'z': 0.0},
     '?atom0008': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 3.8705,
                   'y': 1.225,
                   'z': 0.0},
     '?atom0009': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 2.783,
                   'y': -0.75,
                   'z': 0.0},
     '?atom0010': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 3.403,
                   'y': -1.37,
                   'z': 0.0},
     '?atom0011': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 4.023,
                   'y': -0.75,
                   'z': 0.0},
     '?atom0012': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 5.672,
                   'y': 0.56,
                   'z': 0.0},
     '?atom0013': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 5.135,
                   'y': -0.37,
                   'z': 0.0},
     '?atom0014': {'charge': 'outside_limits',
                   'hydrogen_count': 'H0',
                   'mass_diff': '0',
                   'stereo_parity': 'not_stereo',
                   'symbol': 'H',
                   'valence': 'no marking',
                   'x': 2.0,
                   'y': 0.44,
                   'z': 0.0},
     '_name': '4',
     '_software': '-OEChem-03201502492D',
     '_version': 'V2000',
     'chiral': True}
    """
    return _load_json('molecule.json', num_instances)