# Source code for concept_formation.preprocessor

"""
This module contains a number of preprocessors that can be used on various
forms of raw input data to convert an instance into a shape that Trestle would
better understand. Almost all preprocessors preserve the original semantics of
an instance and are mainly being used to prep for Trestle's internal operations.

Two abstract preprocessors are defined:

* :class:`Preprocessor` - Defines the general structure of a preprocessor.
* :class:`Pipeline` - Allows for chaining a collection of preprocessors together.

Trestle's normal implementation uses a standard pipeline of preprocessors that
run in the following order:

#. :class:`SubComponentProcessor` - Pulls any sub-components present in the
   instance to the top level of the instance and adds ``has-component``
   relations to preserve semantics.
#. :class:`Flattener` - Flattens component instances into a number of tuples
   (i.e. ``(attr,component)``) for faster hashing and access.
#. :class:`StructureMapper<concept_formation.structure_mapper.StructureMapper>` 
    - Gives any variables unique names so they can be renamed in matching without 
    colliding, and matches instances to the root concept.

The remaining preprocessors are helper classes designed to support data that is
not stored in Trestle's conventional representation:

* :class:`Tuplizer` - Looks for relation attributes denoted as strings (i.e.
  ``'(relation e1 e1)'``) and replaces the string attribute name with the
  equivalent tuple representation of the relation.
* :class:`ListProcessor` - Searches for list values, extracts their elements
  into their own objects and replaces the list with ordering and element-of
  relations. Intended to preserve the semantics of a list in JSON representation.
* :class:`ObjectVariablizer` - Looks for component objects within an instance
  and variablizes their names by prepending a ``'?'``.
* :class:`NumericToNominal` - Converts numeric values to nominal ones.
* :class:`NominalToNumeric` - Converts nominal values to numeric ones.
"""

from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from __future__ import division

from copy import deepcopy
from numbers import Number
import collections

# Module-wide counter behind default_gensym(): it is incremented on every call
# there and set back to 0 by _reset_gensym().
_gensym_counter = 0


[docs]def get_attribute_components(attribute, vars_only=True):
    """
    Gets component names out of an attribute

    >>> from pprint import pprint
    >>> attr = ('a', ('sub1', '?c1'))
    >>> get_attribute_components(attr)
    {'?c1'}

    >>> attr = '?c1'
    >>> get_attribute_components(attr)
    {'?c1'}

    >>> attr = ('a', ('sub1', 'c1'))
    >>> get_attribute_components(attr)
    set()

    >>> attr = 'c1'
    >>> get_attribute_components(attr)
    set()
    """
    names = set()

    if vars_only is not True and attribute[0] != '_':
        names.add(attribute)

    if isinstance(attribute, tuple):
        for ele in attribute:
            if isinstance(ele, tuple):
                for name in get_attribute_components(ele, vars_only):
                    names.add(name)
            else:
                if ((vars_only is not True or (len(ele) > 0 and ele[0] == '?'))
                    and (ele != '_' and len(ele) > 0 and ele[0] != '_')):
                    names.add(ele)

    elif ((vars_only is not True and attribute[0] != '_') or
          attribute[0] == '?'):
        names.add(attribute)

    return names


def default_gensym():
    """
    Generates unique names for renaming apart objects.

    Each call increments the module-level ``_gensym_counter`` and returns its
    new value prefixed with ``'?o'``, so the generated names are variables.

    :return: a unique object name
    :rtype: str of the form ``'?o' + str(counter)``
    """
    global _gensym_counter
    _gensym_counter += 1
    return '?o' + str(_gensym_counter)


def _reset_gensym():
    """
    Puts the gensym counter back to 0.

    This exists so that doctests produce predictable object names; it should
    not be called during normal operation.
    """
    # Rebinds the module-level name directly.
    globals()['_gensym_counter'] = 0


class Preprocessor(object):
    """
    A template class that defines the functions a preprocessor class should
    implement. In particular, a preprocessor should transform an instance and
    implement a function for undoing this transformation.

    Subclasses override :meth:`transform` and :meth:`undo_transform`; the
    batch variants are built on top of those two and need no overriding.
    """

    def transform(self, instance):
        """
        Transforms an instance.

        :raises NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError("Class must implement transform")

    def undo_transform(self, instance):
        """
        Undoes a transformation to an instance.

        :raises NotImplementedError: always, unless overridden by a subclass.
        """
        raise NotImplementedError("Class must implement undo_transform")

    def batch_transform(self, instances):
        """
        Transforms a collection of instances.

        :param instances: an iterable of instances
        :return: the transformed instances, in iteration order
        :rtype: list
        """
        return [self.transform(instance) for instance in instances]

    def batch_undo(self, instances):
        """
        Undoes transformation for a collection of instances

        :param instances: an iterable of transformed instances
        :return: the restored instances, in iteration order
        :rtype: list
        """
        return [self.undo_transform(instance) for instance in instances]


class OneWayPreprocessor(Preprocessor):
    """
    A template class that defines a transformation function that only works in
    the forward direction. If undo_transform is called then a shallow copy of
    the given instance is returned: a new dictionary holding the same
    attribute/value pairs.
    """

    def undo_transform(self, instance):
        """
        Returns a new dictionary with the instance's attributes and values;
        nothing is undone.
        """
        return {attr: instance[attr] for attr in instance}


class Pipeline(Preprocessor):
    """
    A special preprocessor class used to chain together many preprocessors.
    Supports the same transform and undo_transform functions as a regular
    preprocessor.

    :param preprocessors: the preprocessors to chain, given in the order in
        which their ``transform`` should be applied.
    """

    def __init__(self, *preprocessors):
        self.preprocessors = preprocessors

    def transform(self, instance):
        """
        Apply a series of transformations to the instance.

        The output of each preprocessor is fed to the next one.
        """
        for pp in self.preprocessors:
            instance = pp.transform(instance)
        return instance

    def undo_transform(self, instance):
        """
        Undo the series of transformations done to the instance.

        The preprocessors are visited in reverse order so that the last
        transformation applied is the first one undone.
        """
        for pp in reversed(self.preprocessors):
            instance = pp.undo_transform(instance)
        return instance


class Tuplizer(Preprocessor):
    """
    Converts all string versions of relations into tuples.

    Relation attributes are expected to be specified as a string enclosed in
    ``(`` ``)`` with values delimited by whitespace. We conventionally use a
    prefix notation for relations ``(related a b)`` but this preprocessor
    should be flexible enough to handle postfix and prefix.

    Values that parse as floats are stored as floats inside the tuple. When
    the transformation is undone they are written back with ``str``, so a
    value written as ``12`` comes back as ``12.0``.

    This is a helper function preprocessor and so is not part of
    :class:`StructureMapper
    <concept_formation.structure_mapper.StructureMapper>`'s standard pipeline.

    >>> tuplizer = Tuplizer()
    >>> instance = {'(foo1 o1 (foo2 o2 o3))': True}
    >>> print(tuplizer.transform(instance))
    {('foo1', 'o1', ('foo2', 'o2', 'o3')): True}
    >>> print(tuplizer.undo_transform(tuplizer.transform(instance)))
    {'(foo1 o1 (foo2 o2 o3))': True}

    >>> instance = {'(place x1 12.4 9.6 (div width 18.2))':True}
    >>> tuplizer = Tuplizer()
    >>> tuplizer.transform(instance)
    {('place', 'x1', 12.4, 9.6, ('div', 'width', 18.2)): True}
    >>> print(tuplizer.undo_transform(tuplizer.transform(instance)))
    {'(place x1 12.4 9.6 (div width 18.2))': True}
    """

    def transform(self, instance):
        """
        Convert all string specified relations into tuples.
        """
        return {self._tuplize_relation(attr): instance[attr]
                for attr in instance}

    def undo_transform(self, instance):
        """
        Convert tuple relations back into their string forms.
        """
        return {self._stringify_relation(attr): instance[attr]
                for attr in instance}

    def _tuplize_relation(self, relation):
        """
        Converts a string formatted relation into a tuplized relation.

        Attributes that are already tuples, or that do not start with ``(``,
        are returned unchanged.

        :param relation: The relational attribute formatted as a string
        :type relation: string
        :return: A new relational attribute in tuple format
        :rtype: tuple

        >>> relation = '(foo1 o1 (foo2 o2 o3))'
        >>> tuplizer = Tuplizer()
        >>> tuplizer._tuplize_relation(relation)
        ('foo1', 'o1', ('foo2', 'o2', 'o3'))
        >>> tuplizer._tuplize_relation('((a b)  c)')
        (('a', 'b'), 'c')
        >>> tuplizer._tuplize_relation('plain')
        'plain'
        """
        # Empty or non-indexable attributes cannot be relations either, so
        # they are passed through instead of raising.
        try:
            if isinstance(relation, tuple) or relation[0] != '(':
                return relation
        except (TypeError, IndexError):
            return relation

        # stack[0] collects the finished top-level tuple; every '(' pushes a
        # new list that is turned into a tuple by its matching ')'.
        stack = [[]]

        # split() without an argument collapses runs of whitespace, so
        # doubled spaces do not produce empty tokens.
        for token in relation.split():
            # A token may open several nested relations at once: '((a'.
            while token.startswith('('):
                stack.append([])
                token = token[1:]

            closes = 0
            while token.endswith(')'):
                closes += 1
                token = token[:-1]

            # The token is empty for '()' or for a ')' standing on its own.
            if token:
                try:
                    value = float(token)
                except ValueError:
                    value = token
                stack[-1].append(value)

            for _ in range(closes):
                finished = tuple(stack.pop())
                stack[-1].append(finished)

        return tuple(stack[-1][-1])

    def _stringify_relation(self, relation):
        """
        Converts a tupleized relation into a string formatted relation.

        Numeric elements are written with ``str`` so that tuples produced by
        :meth:`_tuplize_relation` can always be turned back into strings.

        >>> relation = ('foo1', 'o1', ('foo2', 'o2', 'o3'))
        >>> tuplizer = Tuplizer()
        >>> tuplizer._stringify_relation(relation)
        '(foo1 o1 (foo2 o2 o3))'
        >>> tuplizer._stringify_relation(('div', 'width', 18.2))
        '(div width 18.2)'
        """
        if not isinstance(relation, tuple):
            return relation

        parts = []
        for ele in relation:
            if isinstance(ele, tuple):
                parts.append(self._stringify_relation(ele))
            elif isinstance(ele, Number):
                # join() only accepts strings.
                parts.append(str(ele))
            else:
                parts.append(ele)
        return "(" + " ".join(parts) + ")"


def rename_relation(relation, mapping):
    """
    Takes a tuplized relational attribute (e.g., ``('before', 'o1', 'o2')``)
    and a mapping and renames the components based on the mapping.

    Every element is looked up in the mapping first, so a nested tuple that is
    itself a key of the mapping is replaced as a whole. Nested tuples that are
    not keys are renamed recursively, and all other elements are kept.

    :param relation: The relational attribute containing components to be
        renamed
    :type relation: :ref:`Relation Attribute<attr-rel>`
    :param mapping: A dictionary of mappings between component names
    :type mapping: dict
    :return: A new relational attribute with components renamed
    :rtype: tuple

    >>> relation = ('foo1', 'o1', ('foo2', 'o2', 'o3'))
    >>> mapping = {'o1': 'o100', 'o2': 'o200', 'o3': 'o300'}
    >>> rename_relation(relation, mapping)
    ('foo1', 'o100', ('foo2', 'o200', 'o300'))

    >>> relation = ('foo1', ('o1', ('o2', 'o3')))
    >>> mapping = {('o1', ('o2', 'o3')): 'o100'}
    >>> rename_relation(relation, mapping)
    ('foo1', 'o100')
    """
    renamed = []
    for ele in relation:
        if ele in mapping:
            renamed.append(mapping[ele])
        elif isinstance(ele, tuple):
            renamed.append(rename_relation(ele, mapping))
        else:
            renamed.append(ele)
    return tuple(renamed)


class NameStandardizer(Preprocessor):
    """
    A preprocessor that standardizes apart object names.

    Given an instance rename all the components so they have unique names.

    .. warning:: relations cannot have dictionaries as values (i.e., cannot be
        subobjects).
    .. warning:: relations can only exist at the top level, not in sub-objects.

    This will rename component attributes as well as any occurrence of the
    component's name within relation attributes. This renaming is necessary to
    allow for a search between possible mappings without collisions.

    This is the first operation in :class:`StructureMapper
    <concept_formation.structure_mapper.StructureMapper>`'s standard
    pipeline.

    :param gensym: a function that returns unique object names (str) on each
        call. If None, then :func:`default_gensym` is used, which keeps a
        global object counter.
    :type gensym: a function

    # Reset the symbol generator for doctesting purposes.
    >>> _reset_gensym()
    >>> import pprint
    >>> instance = {'nominal': 'v1', 'numeric': 2.3, 'c1': {'a1': 'v1'}, '?c2':
    ...             {'a2': 'v2', '?c3': {'a3': 'v3'}}, '(relation1 c1 ?c2)':
    ...             True, 'lists': [{'c1': {'inner': 'val'}}, 's2', 's3'],
    ...             '(relation2 (a1 c1) (relation3 (a3 (?c3 ?c2))))': 4.3,
    ...             ('relation4', '?c2', '?c4'):True}
    >>> tuplizer = Tuplizer()
    >>> instance = tuplizer.transform(instance)
    >>> std = NameStandardizer()
    >>> std.undo_transform(instance)
    Traceback (most recent call last):
        ...
    Exception: Must call transform before undo_transform!
    >>> new_i = std.transform(instance)
    >>> old_i = std.undo_transform(new_i)
    >>> pprint.pprint(instance)
    {'?c2': {'?c3': {'a3': 'v3'}, 'a2': 'v2'},
     'c1': {'a1': 'v1'},
     'lists': [{'c1': {'inner': 'val'}}, 's2', 's3'],
     'nominal': 'v1',
     'numeric': 2.3,
     ('relation1', 'c1', '?c2'): True,
     ('relation2', ('a1', 'c1'), ('relation3', ('a3', ('?c3', '?c2')))): 4.3,
     ('relation4', '?c2', '?c4'): True}
    >>> pprint.pprint(new_i)
    {'?o1': {'?o2': {'a3': 'v3'}, 'a2': 'v2'},
     'c1': {'a1': 'v1'},
     'lists': [{'c1': {'inner': 'val'}}, 's2', 's3'],
     'nominal': 'v1',
     'numeric': 2.3,
     ('relation1', 'c1', '?o1'): True,
     ('relation2', ('a1', 'c1'), ('relation3', ('a3', ('?o2', '?o1')))): 4.3,
     ('relation4', '?o1', '?o3'): True}
    >>> pprint.pprint(old_i)
    {'?c2': {'?c3': {'a3': 'v3'}, 'a2': 'v2'},
     'c1': {'a1': 'v1'},
     'lists': [{'c1': {'inner': 'val'}}, 's2', 's3'],
     'nominal': 'v1',
     'numeric': 2.3,
     ('relation1', 'c1', '?c2'): True,
     ('relation2', ('a1', 'c1'), ('relation3', ('a3', ('?c3', '?c2')))): 4.3,
     ('relation4', '?c2', '?c4'): True}

    Variable attributes with plain (non-component) values are restored too.

    >>> _reset_gensym()
    >>> std = NameStandardizer()
    >>> std.transform({'?c1': 'v1'})
    {'?o1': 'v1'}
    >>> std.undo_transform({'?o1': 'v1'})
    {'?c1': 'v1'}
    """

    def __init__(self, gensym=None):
        # Generated name -> original name, filled in by transform().
        self.reverse_mapping = None
        if gensym:
            self.gensym = gensym
        else:
            self.gensym = default_gensym

    def transform(self, instance):
        """
        Performs the standardize apart transformation.

        The mapping that was used is remembered (inverted) so that
        :meth:`undo_transform` can restore the original names. Each call
        replaces the mapping remembered by the previous one.
        """
        mapping = {}
        new_instance = self._standardize(instance, mapping)
        self.reverse_mapping = {mapping[o]: o for o in mapping}
        return new_instance

    def undo_transform(self, instance):
        """
        Undoes the standardize apart transformation.

        :raises Exception: if :meth:`transform` has not been called yet.
        """
        if self.reverse_mapping is None:
            raise Exception("Must call transform before undo_transform!")

        return self._undo_standardize(instance)

    def _undo_standardize(self, instance):
        """
        Renames generated names back to the original ones, recursing into
        dictionary values and into dictionaries stored inside list values.
        """
        new_instance = {}

        for attr in instance:
            value = instance[attr]

            name = attr
            if attr in self.reverse_mapping:
                name = self.reverse_mapping[attr]
                if isinstance(name, tuple):
                    name = name[0]

            if isinstance(value, dict):
                new_instance[name] = self._undo_standardize(value)
            elif isinstance(value, list):
                new_instance[name] = [self._undo_standardize(ele) if
                                      isinstance(ele, dict) else ele for ele in
                                      value]
            elif isinstance(attr, tuple):
                temp_rel = rename_relation(attr, self.reverse_mapping)
                new_instance[temp_rel] = value
            else:
                # Store under the restored name, not the generated one, so
                # that variable attributes with plain values round-trip.
                new_instance[name] = value

        return new_instance

    def _standardize(self, instance, mapping=None, prefix=None):
        """
        Given an instance rename all the components so they
        have unique names.

        .. warning:: relations cannot have dictionaries as values (i.e.,
            cannot be subobjects).
        .. warning:: relations can only exist at the top level, not in
            sub-objects.

        This will rename component attributes as well as any occurrence of the
        component's name within relation attributes. This renaming is necessary
        to allow for a search between possible mappings without collisions.

        :param instance: An instance to be named apart.
        :param mapping: An existing mapping to add new mappings to; used for
            recursive calls. A fresh mapping is created when it is None.
        :param prefix: Not used by this method; it is only handed on to
            recursive calls and is kept so existing callers keep working.
        :type instance: :ref:`Instance<instance-rep>`
        :return: an instance with component attributes renamed
        :rtype: :ref:`Instance<instance-rep>`

        # Reset the symbol generator for doctesting purposes.
        >>> _reset_gensym()
        >>> import pprint
        >>> instance = {'nominal': 'v1', 'numeric': 2.3, '?c1': {'a1': 'v1'}, 'c2': {'a2': 'v2', 'c3': {'a3': 'v3'}}, '(relation1 ?c1 c2)': True, 'lists': ['s1', 's2', 's3'], '(relation2 (a1 ?c1) (relation3 (a3 (c2 c3))))': 4.3}
        >>> tuplizer = Tuplizer()
        >>> instance = tuplizer.transform(instance)
        >>> std = NameStandardizer()
        >>> standard = std.transform(instance)
        >>> pprint.pprint(standard)
        {'?o1': {'a1': 'v1'},
         'c2': {'a2': 'v2', 'c3': {'a3': 'v3'}},
         'lists': ['s1', 's2', 's3'],
         'nominal': 'v1',
         'numeric': 2.3,
         ('relation1', '?o1', 'c2'): True,
         ('relation2', ('a1', '?o1'), ('relation3', ('a3', ('c2', 'c3')))): 4.3}
        """
        # A default of {} would be shared between all calls that omit it.
        if mapping is None:
            mapping = {}

        new_instance = {}
        relations = []

        # Strings and tuples cannot be compared with each other, hence the
        # key function; sorting keeps the generated names reproducible.
        for attr in sorted(instance, key=str):

            # Only plain (non-tuple) attributes starting with '?' are
            # variables; attributes that cannot be indexed are not.
            try:
                is_variable = (not isinstance(attr, tuple) and
                               attr[0] == '?')
            except (TypeError, IndexError, KeyError):
                is_variable = False

            name = attr
            if is_variable:
                if attr not in mapping:
                    mapping[attr] = self.gensym()
                name = mapping[attr]

            value = instance[attr]

            if isinstance(value, dict):
                value = self._standardize(value, mapping, name)
            elif isinstance(value, list):
                value = [self._standardize(ele, mapping, name) if
                         isinstance(ele, dict) else ele for ele in value]

            if isinstance(name, tuple):
                # Sorted so that several new variables in one relation always
                # receive their generated names in the same order.
                for o in sorted(get_attribute_components(name)):
                    if o not in mapping:
                        mapping[o] = self.gensym()
                relations.append((name, value))
            else:
                new_instance[name] = value

        # Relations are renamed last, once every variable has a mapping.
        for relation, val in relations:
            temp_rel = rename_relation(relation, mapping)
            new_instance[temp_rel] = val

        return new_instance


class Flattener(Preprocessor):
    """
    Flattens subobject attributes.

    Takes an instance that has already been standardized apart and flattens it.

    .. warning:: important to note that relations can only exist at the top
        level, not within subobjects. If they do exist than this function will
        return incorrect results.

    Hierarchy is represented with periods between variable names in the
    flattened attributes. However, this process converts the attributes with
    periods in them into a tuple of objects with an attribute as the last
    element, this is more efficient for later processing.

    This is the third and final operation in :class:`StructureMapper
    <concept_formation.structure_mapper.StructureMapper>`'s standard
    pipeline.

    >>> import pprint
    >>> flattener = Flattener()
    >>> instance = {'a': 1, 'c1': {'b': 1, '_c': 2}}
    >>> pprint.pprint(instance)
    {'a': 1, 'c1': {'_c': 2, 'b': 1}}
    >>> instance = flattener.transform(instance)
    >>> pprint.pprint(instance)
    {'a': 1, ('_', ('_c', 'c1')): 2, ('b', 'c1'): 1}
    >>> instance = flattener.undo_transform(instance)
    >>> pprint.pprint(instance)
    {'a': 1, 'c1': {'_c': 2, 'b': 1}}

    >>> instance = {'l1': {'l2': {'l3': {'l4': 1}}}}
    >>> pprint.pprint(instance)
    {'l1': {'l2': {'l3': {'l4': 1}}}}
    >>> instance = flattener.transform(instance)
    >>> pprint.pprint(instance)
    {('l4', ('l3', ('l2', 'l1'))): 1}
    >>> instance = flattener.undo_transform(instance)
    >>> pprint.pprint(instance)
    {'l1': {'l2': {'l3': {'l4': 1}}}}
    """

    def transform(self, instance):
        """
        Perform the flattening procedure.
        """
        return self._flatten(instance)

    def undo_transform(self, instance):
        """
        Undo the flattening procedure.
        """
        return self._structurize(instance)

    def _get_path(self, attr):
        """
        Unfolds a flattened attr to get the path

        The path lists the names from the outermost object down to the
        attribute itself. The ``'_'`` markers that flattening adds in front of
        hidden attributes are left out.

        >>> import pprint
        >>> flattener = Flattener()
        >>> attribute = ('x', ('o1', ('o2', 'o3')))
        >>> path = flattener._get_path(attribute)
        >>> pprint.pprint(path)
        ['o3', 'o2', 'o1', 'x']
        """
        path = []
        curr = attr
        # Each pair is (inner name, rest); the innermost rest is the
        # outermost object.
        while isinstance(curr, tuple) and len(curr) == 2:
            head, curr = curr
            if head != '_':
                path.append(head)
        path.append(curr)
        path.reverse()
        return path

    def _structurize(self, instance):
        """
        This undoes the flattening process. In particular, it takes an instance
        that has unary relations and it unpacks them into structured objects
        and returns the fully structured object.

        Only tuple attributes of length two are unpacked; every other
        attribute is copied over as it is.

        >>> import pprint
        >>> flattener = Flattener()
        >>> instance = {('l4', ('l3', ('l2', 'l1'))): 1}
        >>> pprint.pprint(instance)
        {('l4', ('l3', ('l2', 'l1'))): 1}
        >>> instance = flattener._structurize(instance)
        >>> pprint.pprint(instance)
        {'l1': {'l2': {'l3': {'l4': 1}}}}
        """
        structured = {}
        for attr, value in instance.items():
            if isinstance(attr, tuple) and len(attr) == 2:
                path = self._get_path(attr)
                node = structured
                # Walk down the path, creating the objects that are missing.
                for step in path[:-1]:
                    node = node.setdefault(step, {})
                node[path[-1]] = value
            else:
                structured[attr] = value

        return structured

    def _flatten(self, instance, outer_attr=None):
        """
        Takes an instance with dictionary attributes and and flattens it, so
        that there are no more dictionary attributes.

        .. warning:: important to note that relations can only exist at the
            top level, not within subobjects. If they do exist than this
            function will return incorrect results.

        To eliminate structure, the inner most attributes are pulled up to the
        top level and renamed as tuples that contain information about the
        structure.

        :param instance: An instance to be flattened.
        :type instance: instance
        :param outer_attr: the flattened name of the object that contains
            ``instance``, or None at the top level; used for recursive calls.
        :return: A copy of the instance flattend
        :rtype: :ref:`flattened instance <flattened-instance>`

        >>> import pprint
        >>> flattener = Flattener()
        >>> instance = {'a': 1, 'c1': {'b': 1, '_c': 2}}
        >>> flat = flattener.transform(instance)
        >>> pprint.pprint(flat)
        {'a': 1, ('_', ('_c', 'c1')): 2, ('b', 'c1'): 1}
        >>> instance = {'l1': {'l2': {'l3': {'l4': 1}}}}
        >>> pprint.pprint(instance)
        {'l1': {'l2': {'l3': {'l4': 1}}}}
        >>> instance = flattener._flatten(instance)
        >>> pprint.pprint(instance)
        {('l4', ('l3', ('l2', 'l1'))): 1}
        >>> instance = {'?check0': {'Position': {'X': 1.5990001, 'Y': -7.05200052}, 'Type': 'Checkpoint'}, '?cube01': {'Bounds': {'X': 1.7420001, 'Y': 1.751}, 'Name': 'cube01', 'Position': {'X': -12.0840006, 'Y': -7.1050005}, 'Rotation': {'Z': 0.0}, 'Type': 'cube'}, '?cube02': {'Bounds': {'X': 1.7420001, 'Y': 1.751}, 'Name': 'cube02', 'Position': {'X': -4.662, 'Y': -7.1050005}, 'Rotation': {'Z': 0.0}, 'Type': 'cube'}, 'Goal': {'Position': {'X': 8.599, 'Y': 0.715000033}, 'Type': 'Goal'}}
        >>> flat = flattener.transform(instance)
        >>> pprint.pprint(flat)
        {('Name', '?cube01'): 'cube01',
         ('Name', '?cube02'): 'cube02',
         ('Type', '?check0'): 'Checkpoint',
         ('Type', '?cube01'): 'cube',
         ('Type', '?cube02'): 'cube',
         ('Type', 'Goal'): 'Goal',
         ('X', ('Bounds', '?cube01')): 1.7420001,
         ('X', ('Bounds', '?cube02')): 1.7420001,
         ('X', ('Position', '?check0')): 1.5990001,
         ('X', ('Position', '?cube01')): -12.0840006,
         ('X', ('Position', '?cube02')): -4.662,
         ('X', ('Position', 'Goal')): 8.599,
         ('Y', ('Bounds', '?cube01')): 1.751,
         ('Y', ('Bounds', '?cube02')): 1.751,
         ('Y', ('Position', '?check0')): -7.05200052,
         ('Y', ('Position', '?cube01')): -7.1050005,
         ('Y', ('Position', '?cube02')): -7.1050005,
         ('Y', ('Position', 'Goal')): 0.715000033,
         ('Z', ('Rotation', '?cube01')): 0.0,
         ('Z', ('Rotation', '?cube02')): 0.0}
        """
        flat = {}
        for attr, value in instance.items():
            key = attr
            if outer_attr is not None:
                if attr[0] == "_":
                    # Hidden attributes get a leading '_' marker around the
                    # flattened name.
                    key = ('_', (attr, outer_attr))
                else:
                    key = (attr, outer_attr)

            if isinstance(value, dict):
                # Sub-objects contribute their own flattened attributes.
                flat.update(self._flatten(value, key))
            else:
                flat[key] = value
        return flat


class ListProcessor(Preprocessor):
    """
    Preprocesses out the lists, converting them into objects and relations.

    This preprocessor is a pipeline of two operations. First it extracts
    elements from any lists in the instance and makes them their own
    subcomponents with unique names. Second it removes the lists altogether and
    replaces them with a series of relations that both express that
    subcomponents are elements of the list and the order that they existed in.
    These two operations transform the list in a way that preserves the
    semantics of the original list but makes them compatible with Trestle's
    understanding of component objects.

    None of the list operations are part of :class:`StructureMapper
    <concept_formation.structure_mapper.StructureMapper>`'s standard
    pipeline.

    .. warning:: The ListProcessor's undo_transform function is not
        guaranteed to be deterministic and attempts a best guess at a partial
        ordering.  In most cases this will be fine but in complex instances
        with multiple lists and user defined ordering relations it can break
        down. If an ordering cannot be determined then ordering relations are
        left in place.

    # Reset the symbol generator for doctesting purposes.
    >>> _reset_gensym()
    >>> import pprint
    >>> instance = {"att1": "val1", "list1":["a", "b", "a", "c", "d"]}
    >>> lp = ListProcessor()
    >>> instance = lp.transform(instance)
    >>> pprint.pprint(instance)
    {'?o1': {'val': 'a'},
     '?o2': {'val': 'b'},
     '?o3': {'val': 'a'},
     '?o4': {'val': 'c'},
     '?o5': {'val': 'd'},
     'att1': 'val1',
     'list1': {},
     ('has-element', 'list1', '?o1'): True,
     ('has-element', 'list1', '?o2'): True,
     ('has-element', 'list1', '?o3'): True,
     ('has-element', 'list1', '?o4'): True,
     ('has-element', 'list1', '?o5'): True,
     ('ordered-list', 'list1', '?o1', '?o2'): True,
     ('ordered-list', 'list1', '?o2', '?o3'): True,
     ('ordered-list', 'list1', '?o3', '?o4'): True,
     ('ordered-list', 'list1', '?o4', '?o5'): True}

    >>> instance = lp.undo_transform(instance)
    >>> pprint.pprint(instance)
    {'att1': 'val1', 'list1': ['a', 'b', 'a', 'c', 'd']}

    # Reset the symbol generator for doctesting purposes.
    >>> _reset_gensym()
    >>> instance = {'l1': ['a', {'in1': 3, 'in2': 4}, {'ag': 'b', 'ah': 'c'}, 12, 'again']}
    >>> lp = ListProcessor()
    >>> instance = lp.transform(instance)
    >>> pprint.pprint(instance)
    {'?o1': {'val': 'a'},
     '?o2': {'in1': 3, 'in2': 4},
     '?o3': {'ag': 'b', 'ah': 'c'},
     '?o4': {'val': 12},
     '?o5': {'val': 'again'},
     'l1': {},
     ('has-element', 'l1', '?o1'): True,
     ('has-element', 'l1', '?o2'): True,
     ('has-element', 'l1', '?o3'): True,
     ('has-element', 'l1', '?o4'): True,
     ('has-element', 'l1', '?o5'): True,
     ('ordered-list', 'l1', '?o1', '?o2'): True,
     ('ordered-list', 'l1', '?o2', '?o3'): True,
     ('ordered-list', 'l1', '?o3', '?o4'): True,
     ('ordered-list', 'l1', '?o4', '?o5'): True}

    >>> instance = lp.undo_transform(instance)
    >>> pprint.pprint(instance)
    {'l1': ['a', {'in1': 3, 'in2': 4}, {'ag': 'b', 'ah': 'c'}, 12, 'again']}

    # Reset the symbol generator for doctesting purposes.
    >>> _reset_gensym()
    >>> instance = {'tta': 'alpha', 'ttb':{'tlist': ['a', 'b', {'sub-a': 'c', 'sub-sub': {'s': 'd', 'sslist': ['w', 'x', 'y', {'issue': 'here'}]}}, 'g']}}
    >>> pprint.pprint(instance)
    {'tta': 'alpha',
     'ttb': {'tlist': ['a',
                       'b',
                       {'sub-a': 'c',
                        'sub-sub': {'s': 'd',
                                    'sslist': ['w', 'x', 'y', {'issue': 'here'}]}},
                       'g']}}

    >>> lp = ListProcessor()
    >>> instance = lp.transform(instance)
    >>> pprint.pprint(instance)
    {'tta': 'alpha',
     'ttb': {'?o1': {'val': 'a'},
             '?o2': {'val': 'b'},
             '?o3': {'sub-a': 'c',
                     'sub-sub': {'?o4': {'val': 'w'},
                                 '?o5': {'val': 'x'},
                                 '?o6': {'val': 'y'},
                                 '?o7': {'issue': 'here'},
                                 's': 'd',
                                 'sslist': {}}},
             '?o8': {'val': 'g'},
             'tlist': {}},
     ('has-element', ('sslist', ('sub-sub', ('?o3', 'ttb'))), '?o4'): True,
     ('has-element', ('sslist', ('sub-sub', ('?o3', 'ttb'))), '?o5'): True,
     ('has-element', ('sslist', ('sub-sub', ('?o3', 'ttb'))), '?o6'): True,
     ('has-element', ('sslist', ('sub-sub', ('?o3', 'ttb'))), '?o7'): True,
     ('has-element', ('tlist', 'ttb'), '?o1'): True,
     ('has-element', ('tlist', 'ttb'), '?o2'): True,
     ('has-element', ('tlist', 'ttb'), '?o3'): True,
     ('has-element', ('tlist', 'ttb'), '?o8'): True,
     ('ordered-list', ('sslist', ('sub-sub', ('?o3', 'ttb'))), '?o4', '?o5'): True,
     ('ordered-list', ('sslist', ('sub-sub', ('?o3', 'ttb'))), '?o5', '?o6'): True,
     ('ordered-list', ('sslist', ('sub-sub', ('?o3', 'ttb'))), '?o6', '?o7'): True,
     ('ordered-list', ('tlist', 'ttb'), '?o1', '?o2'): True,
     ('ordered-list', ('tlist', 'ttb'), '?o2', '?o3'): True,
     ('ordered-list', ('tlist', 'ttb'), '?o3', '?o8'): True}

    >>> instance = lp.undo_transform(instance)
    >>> pprint.pprint(instance)
    {'tta': 'alpha',
     'ttb': {'tlist': ['a',
                       'b',
                       {'sub-a': 'c',
                        'sub-sub': {'s': 'd',
                                    'sslist': ['w', 'x', 'y', {'issue': 'here'}]}},
                       'g']}}

    """

    def __init__(self):
        # Both steps are delegated to a two-stage pipeline: element
        # extraction first, list-to-relation conversion second.
        self.processor = Pipeline(ExtractListElements(), ListsToRelations())

    def transform(self, instance):
        """
        Extract list elements and replace lists with ordering relations.
        """
        return self.processor.transform(instance)

    def undo_transform(self, instance):
        """
        Attempt to reconstruct lists from ordering relations and add extracted
        list elements back to constructed lists.
        """
        return self.processor.undo_transform(instance)


class ExtractListElements(Preprocessor):
    """
    A pre-processor that extracts the elements of lists into their own objects

    Every list found in an instance has each of its elements moved into a new
    object that is stored beside the list (in the same enclosing object) under
    a generated name; the list itself is replaced by the list of generated
    names. Elements that are not dictionaries are wrapped as
    ``{'val': element}``. Lists stored under an attribute whose name starts
    with ``'_'`` are not extracted; they are replaced by their ``str()``.

    This is a first subprocess of the :class:`ListProcessor
    <concept_formation.preprocessor.ListProcessor>`. None of the list operations
    are part of :class:`StructureMapper
    <concept_formation.structure_mapper.StructureMapper>`'s standard pipeline.

    # Reset the symbol generator for doctesting purposes.
    >>> _reset_gensym()
    >>> import pprint
    >>> instance = {"a": "n", "list1": ["test", {"p": "q", "j": "k"}, {"n": "m"}]}
    >>> pp = ExtractListElements()
    >>> instance = pp.transform(instance)
    >>> pprint.pprint(instance)
    {'?o1': {'val': 'test'},
     '?o2': {'j': 'k', 'p': 'q'},
     '?o3': {'n': 'm'},
     'a': 'n',
     'list1': ['?o1', '?o2', '?o3']}

    # Reset the symbol generator for doctesting purposes.
    >>> _reset_gensym()
    >>> import pprint
    >>> instance = {"att1": "V1", 'subobj': {"list1": ["a", "b", "c", {"B": "C", "D": "E"}]}}
    >>> pprint.pprint(instance)
    {'att1': 'V1', 'subobj': {'list1': ['a', 'b', 'c', {'B': 'C', 'D': 'E'}]}}
    >>> pp = ExtractListElements()
    >>> instance = pp.transform(instance)
    >>> pprint.pprint(instance)
    {'att1': 'V1',
     'subobj': {'?o1': {'val': 'a'},
                '?o2': {'val': 'b'},
                '?o3': {'val': 'c'},
                '?o4': {'B': 'C', 'D': 'E'},
                'list1': ['?o1', '?o2', '?o3', '?o4']}}
    >>> instance = pp.undo_transform(instance)
    >>> pprint.pprint(instance)
    {'att1': 'V1', 'subobj': {'list1': ['a', 'b', 'c', {'B': 'C', 'D': 'E'}]}}

    :param gensym: A zero-argument callable returning a fresh attribute name
        on every call. Any falsy value selects the module's default generator.
    :type gensym: callable
    """
    def __init__(self, gensym=None):
        self.gensym = gensym or default_gensym

    def transform(self, instance):
        """
        Find all lists in an instance and extract their elements into their own
        objects, stored beside the list they came from.

        :param instance: The instance to convert; it is not modified.
        :return: A new instance in which every list holds generated names.
        """
        return self._extract(instance)

    def undo_transform(self, instance):
        """
        Undoes the list element extraction operation.

        :param instance: An instance in extracted form; it is not modified.
        :return: A new instance with list elements placed back in their lists.
        """
        return self._undo_extract(instance)

    @staticmethod
    def _is_hidden(attr):
        """
        Return True when ``attr`` is a string starting with ``'_'`` or a tuple
        whose first item equals ``'_'``. Empty or non-subscriptable keys are
        simply not hidden (instead of raising).
        """
        if isinstance(attr, str):
            return attr.startswith('_')
        if isinstance(attr, tuple):
            return len(attr) > 0 and attr[0] == '_'
        return False

    @staticmethod
    def _lookup_element(instance, name):
        """
        Return the extracted object that ``name`` refers to inside
        ``instance``, or None when ``name`` is not the key of a dictionary
        value (unknown name, unhashable element, or a scalar attribute).
        """
        try:
            target = instance[name]
        except (KeyError, TypeError):
            return None
        return target if isinstance(target, dict) else None

    def _undo_extract(self, instance):
        """
        Reverses the list element extraction process.

        List entries that name an extracted object are replaced by that object
        (recursively restored), and the object is removed from the enclosing
        level. An object consisting solely of a ``'val'`` key is unwrapped to
        the bare value. Entries that do not name an extracted object are kept
        in the list unchanged.
        """
        restored = {}
        consumed = set()

        # Pass 1: rebuild every list and remember which names were used up.
        for attr, value in instance.items():
            if not isinstance(value, list):
                continue

            rebuilt = []
            for name in value:
                target = self._lookup_element(instance, name)
                if target is None:
                    rebuilt.append(name)
                    continue

                consumed.add(name)
                obj = self._undo_extract(target)
                # Only a pure {'val': x} wrapper is unwrapped; a dictionary
                # element that merely contains a 'val' key stays a dictionary.
                if len(obj) == 1 and "val" in obj:
                    rebuilt.append(obj["val"])
                else:
                    rebuilt.append(obj)
            restored[attr] = rebuilt

        # Pass 2: copy everything that is neither a list nor a consumed element.
        for attr, value in instance.items():
            if isinstance(value, list) or attr in consumed:
                continue
            if isinstance(value, dict):
                restored[attr] = self._undo_extract(value)
            else:
                restored[attr] = value

        return restored

    def _extract(self, instance):
        """
        Unlike the utils.extract_components function this one will extract ALL
        elements into their own objects not just object literals
        """
        extracted = {}
        for attr, value in instance.items():
            if isinstance(value, dict):
                extracted[attr] = self._extract(value)
            elif not isinstance(value, list):
                extracted[attr] = value
            elif self._is_hidden(attr):
                extracted[attr] = str(value)
            else:
                names = []
                for element in value:
                    if isinstance(element, dict):
                        wrapped = deepcopy(element)
                    else:
                        wrapped = {"val": element}

                    # The name is generated before recursing so an element is
                    # always numbered ahead of anything nested inside it.
                    name = self.gensym()
                    extracted[name] = self._extract(wrapped)
                    names.append(name)
                extracted[attr] = names

        return extracted


class ListsToRelations(Preprocessor):
    """
    Converts an object with lists into an object with sub-objects and list
    relations.

    Each list is replaced by an empty placeholder object. Its members are
    recorded as ``('has-element', list, element)`` relations and the order of
    adjacent members as ``('ordered-list', list, first, second)`` relations,
    all stored at the top level of the instance. A list nested inside a
    sub-object is named by an ``(attribute, parent)`` tuple. An empty list
    produces no relations, so it cannot be told apart from an empty object
    when undoing.

    This is a second subprocess of the :class:`ListProcessor
    <concept_formation.preprocessor.ListProcessor>`. None of the list
    operations are part of :class:`StructureMapper
    <concept_formation.structure_mapper.StructureMapper>`'s standard pipeline.

    # Reset the symbol generator for doctesting purposes.
    >>> _reset_gensym()
    >>> ltr = ListsToRelations()
    >>> import pprint
    >>> instance = {"list1": ['a', 'b', 'c']}
    >>> instance = ltr.transform(instance)
    >>> pprint.pprint(instance)
    {'list1': {},
     ('has-element', 'list1', 'a'): True,
     ('has-element', 'list1', 'b'): True,
     ('has-element', 'list1', 'c'): True,
     ('ordered-list', 'list1', 'a', 'b'): True,
     ('ordered-list', 'list1', 'b', 'c'): True}

    >>> instance = {"list1": ['a', 'b', 'c'], "list2": ['w', 'x', 'y', 'z']}
    >>> instance = ltr.transform(instance)
    >>> pprint.pprint(instance)
    {'list1': {},
     'list2': {},
     ('has-element', 'list1', 'a'): True,
     ('has-element', 'list1', 'b'): True,
     ('has-element', 'list1', 'c'): True,
     ('has-element', 'list2', 'w'): True,
     ('has-element', 'list2', 'x'): True,
     ('has-element', 'list2', 'y'): True,
     ('has-element', 'list2', 'z'): True,
     ('ordered-list', 'list1', 'a', 'b'): True,
     ('ordered-list', 'list1', 'b', 'c'): True,
     ('ordered-list', 'list2', 'w', 'x'): True,
     ('ordered-list', 'list2', 'x', 'y'): True,
     ('ordered-list', 'list2', 'y', 'z'): True}

    # Reset the symbol generator for doctesting purposes.
    >>> _reset_gensym()
    >>> ltr = ListsToRelations()
    >>> import pprint
    >>> instance = {'o1': {"list1":['c','b','a']}}
    >>> instance = ltr.transform(instance)
    >>> pprint.pprint(instance)
    {'o1': {'list1': {}},
     ('has-element', ('list1', 'o1'), 'a'): True,
     ('has-element', ('list1', 'o1'), 'b'): True,
     ('has-element', ('list1', 'o1'), 'c'): True,
     ('ordered-list', ('list1', 'o1'), 'b', 'a'): True,
     ('ordered-list', ('list1', 'o1'), 'c', 'b'): True}

    >>> instance = ltr.undo_transform(instance)
    >>> pprint.pprint(instance)
    {'o1': {'list1': ['c', 'b', 'a']}}
    """
    def transform(self, instance):
        """
        Replace every list in the instance with a placeholder object plus
        ``has-element`` and ``ordered-list`` relations.
        """
        return self._lists_to_relations(instance)

    def undo_transform(self, instance):
        """
        Traverse the instance and turn each set of totally ordered list
        relations into a list.

        If there is a cycle or a partial ordering, then the relations are not
        converted and left as they are.
        """
        return self._relations_to_lists(instance)

    def _relations_to_lists(self, instance, path=None):
        """
        Rebuild lists from the list relations found at the top level of
        ``instance``.

        Relations belonging to a list that cannot be rebuilt (cycle, partial
        ordering, ordering without members, or a placeholder location that
        does not exist) are copied to the result unchanged.

        :param instance: The instance holding the relations; it is not
            modified.
        :param path: Unused; kept only so the signature stays compatible.
        :return: A new instance.
        """
        new_instance = {}
        members = {}    # list name -> elements, in the order encountered
        links = {}      # list name -> (first, second) ordering pairs
        originals = {}  # list name -> [(relation, value), ...] for restoring

        for attr, value in instance.items():
            is_tuple = isinstance(attr, tuple)
            if is_tuple and len(attr) == 3 and attr[0] == 'has-element':
                _, lname, element = attr
                members.setdefault(lname, []).append(element)
            elif is_tuple and len(attr) == 4 and attr[0] == 'ordered-list':
                _, lname, first, second = attr
                links.setdefault(lname, []).append((first, second))
            else:
                new_instance[attr] = value
                continue
            originals.setdefault(lname, []).append((attr, value))

        for lname, relations in originals.items():
            rebuilt = self._chain_elements(members.get(lname, []),
                                           links.get(lname, []))
            placed = (rebuilt is not None and
                      self._place_list(new_instance, lname, rebuilt))
            if not placed:
                # Leave this list in relational form.
                for attr, value in relations:
                    new_instance[attr] = value

        return new_instance

    def _chain_elements(self, members, links):
        """
        Arrange ``members`` into a single sequence using the ``(first,
        second)`` pairs in ``links``.

        :return: The ordered list, or None when the pairs do not describe a
            total order over exactly these members (some member is never
            reached, or some pair is left unused, e.g. because of a cycle).
        """
        if not members:
            return None

        remaining = list(members)
        unused = list(links)
        chain = [remaining.pop(0)]

        progress = True
        while remaining and progress:
            progress = False

            # Grow the chain at the front for as long as possible.
            extended = True
            while extended:
                extended = False
                for link in unused:
                    before, after = link
                    if after == chain[0] and before in remaining:
                        remaining.remove(before)
                        chain.insert(0, before)
                        unused.remove(link)
                        extended = progress = True
                        break

            # Grow the chain at the end for as long as possible.
            extended = True
            while extended:
                extended = False
                for link in unused:
                    before, after = link
                    if before == chain[-1] and after in remaining:
                        remaining.remove(after)
                        chain.append(after)
                        unused.remove(link)
                        extended = progress = True
                        break

        if remaining or unused:
            return None
        return chain

    def _place_list(self, new_instance, lname, values):
        """
        Store ``values`` in ``new_instance`` at the location named by
        ``lname``.

        Nested objects along the way are copied before being written to, so
        dictionaries shared with the original instance are not mutated.

        :return: False (and no change) when an intermediate object on the path
            is missing or is not a dictionary, True otherwise.
        """
        keys = self._get_path(lname)

        # Resolve the whole path first so a failure leaves nothing half-done.
        node = new_instance
        for key in keys[:-1]:
            node = node.get(key)
            if not isinstance(node, dict):
                return False

        node = new_instance
        for key in keys[:-1]:
            clone = dict(node[key])
            node[key] = clone
            node = clone
        node[keys[-1]] = values
        return True

    def _get_path(self, path):
        """
        Turn a nested ``(attribute, parent)`` list name into the list of keys
        leading to it, outermost first. Any value that is not a 2-tuple is
        treated as a single key.
        """
        if isinstance(path, tuple) and len(path) == 2:
            return self._get_path(path[1]) + [path[0]]
        return [path]

    def _lists_to_relations(self, instance, current=None, top_level=None):
        """
        Build a copy of ``instance`` in which lists are replaced by empty
        placeholders, writing the describing relations into ``top_level``.

        :param instance: The (sub-)object to convert.
        :param current: Name of the enclosing object, or None at the top.
        :param top_level: Dictionary receiving the relations; defaults to the
            object being built by the outermost call.
        :return: The converted (sub-)object.
        """
        new_instance = {}
        if top_level is None:
            top_level = new_instance

        for attr, value in instance.items():
            lname = attr if current is None else (attr, current)

            if isinstance(value, list):
                new_instance[attr] = {}
                last = len(value) - 1
                for idx, element in enumerate(value):
                    if idx < last:
                        # NOTE: ordering relations record str(element) while
                        # has-element relations record the element itself, so
                        # the two only agree for string elements.
                        rel = ("ordered-list", lname, str(element),
                               str(value[idx + 1]))
                        top_level[rel] = True
                    top_level[("has-element", lname, element)] = True

            elif isinstance(value, dict):
                new_instance[attr] = self._lists_to_relations(value, lname,
                                                              top_level)
            else:
                new_instance[attr] = value

        return new_instance


class SubComponentProcessor(Preprocessor):
    """
    Takes a flattened instance and moves sub-objects (not sub-attributes) to be
    top-level objects and adds has-component relations to preserve semantics.

    This process is primarily used to improve matching by having all sub-
    component objects exist as their own top level objects with relations
    describing their original position in the hierarchy. This allows the
    structure mapper to partially match against subobjects.

    This is the second operation in :class:`TrestleTree
    <concept_formation.trestle.TrestleTree>`'s standard pipeline (after
    flattening).

    .. warning:: This assumes that the :class:`NameStandardizer
        <concept_formation.preprocessor.NameStandardizer>` has been run on the
        instance first otherwise there can be name collisions.

    # Reset the symbol generator for doctesting purposes.
    >>> _reset_gensym()
    >>> import pprint
    >>> flattener = Flattener()
    >>> psc = SubComponentProcessor()
    >>> instance = {"a1": "v1", "?sub1": {"a2": "v2", "a3": 3},
    ...             "?sub2": {"a4": "v4", "?subsub1": {"a5": "v5", "a6": "v6"},
    ...                       "?subsub2":{"?subsubsub": {"a8": "V8"}, "a7": 7}},
    ...             ('ordered-list', ('list1', ('?o2', '?o1')), 'b', 'a'):
    ...             True}
    >>> pprint.pprint(instance)
    {'?sub1': {'a2': 'v2', 'a3': 3},
     '?sub2': {'?subsub1': {'a5': 'v5', 'a6': 'v6'},
               '?subsub2': {'?subsubsub': {'a8': 'V8'}, 'a7': 7},
               'a4': 'v4'},
     'a1': 'v1',
     ('ordered-list', ('list1', ('?o2', '?o1')), 'b', 'a'): True}
    >>> instance = psc.transform(flattener.transform(instance))
    >>> pprint.pprint(instance)
    {'a1': 'v1',
     ('a2', '?sub1'): 'v2',
     ('a3', '?sub1'): 3,
     ('a4', '?sub2'): 'v4',
     ('a5', '?subsub1'): 'v5',
     ('a6', '?subsub1'): 'v6',
     ('a7', '?subsub2'): 7,
     ('a8', '?subsubsub'): 'V8',
     ('has-component', '?o1', '?o2'): True,
     ('has-component', '?sub2', '?subsub1'): True,
     ('has-component', '?sub2', '?subsub2'): True,
     ('has-component', '?subsub2', '?subsubsub'): True,
     ('ordered-list', ('list1', '?o2'), 'b', 'a'): True}
    >>> instance = psc.undo_transform(instance)
    >>> instance = flattener.undo_transform(instance)
    >>> pprint.pprint(instance)
    {'?sub1': {'a2': 'v2', 'a3': 3},
     '?sub2': {'?subsub1': {'a5': 'v5', 'a6': 'v6'},
               '?subsub2': {'?subsubsub': {'a8': 'V8'}, 'a7': 7},
               'a4': 'v4'},
     'a1': 'v1',
     ('ordered-list', ('list1', ('?o2', '?o1')), 'b', 'a'): True}
    """

    def transform(self, instance):
        """
        Traverse the instance for objects that contain subobjects and extract
        the subobjects to be their own objects at the top level of the
        instance.
        """
        return self._extract_sub_objects(instance)

    def undo_transform(self, instance):
        """
        Removes the has-component relations by adding the elements as
        subobjects.

        If an object is a child in multiple has-component relationships then
        it is left in relational form (i.e., it cannot be expressed in
        sub-object form).
        """
        return self._embed_sub_objects(instance)

    @staticmethod
    def _is_variable(name):
        """Return True when ``name`` is a string that starts with ``'?'``."""
        return isinstance(name, str) and name.startswith('?')

    @staticmethod
    def _is_has_component(attr):
        """Return True for a ``('has-component', parent, child)`` tuple."""
        return (isinstance(attr, tuple) and len(attr) == 3 and
                attr[0] == 'has-component')

    def _embed_sub_objects(self, instance):
        """
        Rewrite attribute names so that components with exactly one parent are
        nested under that parent again, dropping the relation that described
        the nesting. Relations of children with several parents are kept.
        """
        parents = {}
        for attr in instance:
            if self._is_has_component(attr):
                parents.setdefault(attr[2], []).append(attr[1])

        # Only an unambiguous parent can be expressed as nesting.
        so_mapping = {child: (child, owners[0])
                      for child, owners in parents.items()
                      if len(owners) == 1}

        embedded = {}
        for attr, value in instance.items():
            if self._is_has_component(attr) and attr[2] in so_mapping:
                continue
            embedded[self._rename_embedding(attr, so_mapping)] = value
        return embedded

    def _rename_embedding(self, attr, so_mapping):
        """
        Return ``attr`` with every component named in ``so_mapping`` replaced
        by its nested ``(child, parent)`` form, applied recursively.

        NOTE: cyclic has-component relations are not detected here and would
        recurse without bound.
        """
        if attr in so_mapping:
            nested = so_mapping[attr]
            if isinstance(nested, tuple) and len(nested) == 2:
                child, parent = nested
                return (child, self._rename_embedding(parent, so_mapping))
            return nested

        if not isinstance(attr, tuple):
            return attr

        if len(attr) == 2:
            return (attr[0], self._rename_embedding(attr[1], so_mapping))

        return tuple(self._rename_embedding(part, so_mapping)
                     for part in attr)

    def _extract_sub_objects(self, instance):
        """
        Build a new instance in which each attribute name is shortened to
        refer to its innermost component, with the removed nesting recorded as
        has-component relations.
        """
        new_instance = {}
        for attr, value in instance.items():
            for relation in self._get_has_components(attr):
                new_instance[relation] = True
            new_instance[self._extract_attr(attr)] = value
        return new_instance

    def _extract_attr(self, attr):
        """
        Shorten a flattened attribute name: a 2-tuple whose first item is a
        variable collapses to that variable, any other 2-tuple keeps its first
        item and shortens the second, and tuples of other lengths are
        shortened item by item.
        """
        if not isinstance(attr, tuple):
            return attr

        if len(attr) != 2:
            return tuple(self._extract_attr(part) for part in attr)

        outer, inner = attr
        if self._is_variable(outer):
            return outer
        return (outer, self._extract_attr(inner))

    def _get_has_components(self, attr):
        """
        Collect the ``('has-component', parent, child)`` relations implied by
        the variable nesting inside a flattened attribute name.
        """
        if not isinstance(attr, tuple):
            return []

        if len(attr) != 2:
            relations = []
            for part in attr:
                relations.extend(self._get_has_components(part))
            return relations

        child = None
        parent = None
        relations = []

        # Walk down the (name, rest) chain. The tuple check stops the walk at
        # a plain name; without it a two-character string would be unpacked as
        # if it were a pair and a non-sized value would raise.
        while isinstance(attr, tuple) and len(attr) == 2:
            name, attr = attr
            if self._is_variable(name):
                if child is not None:
                    relations.append(('has-component', parent, child))
                child = name
                parent = self._extract_attr(attr)

        if child is not None and self._is_variable(attr):
            relations.append(('has-component', parent, child))

        return relations


class ObjectVariablizer(OneWayPreprocessor):
    """
    Converts all attributes with dictionary values into variables by adding a
    question mark.

    Attribute names beginning with `?` are treated as bindable variables while
    all other attribute names are considered constants. This process searches
    through an instance and variablizes attributes that might not have been
    defined this way in the original data.

    This is a helper function preprocessor and so is not part of
    :class:`StructureMapper
    <concept_formation.structure_mapper.StructureMapper>`'s standard pipeline.

    >>> from pprint import pprint
    >>> instance = {"ob1":{"myX":12.4,"myY":13.1,"myType":"square"},"ob2":{"myX":9.5,"myY":12.6,"myType":"rect"}}
    >>> ov = ObjectVariablizer()
    >>> instance = ov.transform(instance)
    >>> pprint(instance)
    {'?ob1': {'myType': 'square', 'myX': 12.4, 'myY': 13.1},
     '?ob2': {'myType': 'rect', 'myX': 9.5, 'myY': 12.6}}
    >>> instance = ov.undo_transform(instance)
    >>> pprint(instance)
    {'?ob1': {'myType': 'square', 'myX': 12.4, 'myY': 13.1},
     '?ob2': {'myType': 'rect', 'myX': 9.5, 'myY': 12.6}}
    >>> instance = {"p1":{"x":12,"y":3},"p2":{"x":5,"y":14},"p3":{"x":4,"y":18},"setttings":{"x_lab":"height","y_lab":"age"}}
    >>> ov = ObjectVariablizer("p1","p2","p3")
    >>> instance = ov.transform(instance)
    >>> pprint(instance)
    {'?p1': {'x': 12, 'y': 3},
     '?p2': {'x': 5, 'y': 14},
     '?p3': {'x': 4, 'y': 18},
     'setttings': {'x_lab': 'height', 'y_lab': 'age'}}

    :param attrs: A list of specific attribute names to variablize. If left
        empty then all variables will be converted.
    :type attrs: strings
    """
    def __init__(self, *attrs):
        self.targets = attrs if attrs else None

    def transform(self, instance):
        """
        Variablize target attributes.
        """
        return self._variablize(instance)

    def _variablize(self, instance, mapping=None, prefix=None):
        """
        Return a copy of ``instance`` in which targeted string attributes
        holding dictionaries are renamed to start with ``'?'``.

        Relational (tuple) attributes are passed through ``rename_relation``
        together with the renames made at this level and at enclosing levels.
        NOTE(review): ``rename_relation`` is defined outside this class; it is
        assumed to substitute relation components found in the mapping --
        confirm against its definition.

        :param instance: The (sub-)object to process; it is not modified.
        :param mapping: Renames inherited from enclosing objects, or None.
        :param prefix: Name of the enclosing object chain. It is only threaded
            through the recursion and does not influence the result.
        :return: The variablized (sub-)object.
        """
        targets = self.targets

        # Pass 1: decide every rename at this level up front, so that
        # relations see the same mapping whatever the attribute order is.
        local = {}
        for attr, value in instance.items():
            if not isinstance(attr, str) or not isinstance(value, dict):
                continue
            if targets is not None and attr not in targets:
                continue
            local[attr] = attr if attr[:1] == '?' else '?' + attr

        renames = dict(mapping) if mapping else {}
        renames.update(local)

        # Pass 2: build the new object; relations are added last.
        new_instance = {}
        relations = []
        for attr, value in instance.items():
            if isinstance(attr, tuple):
                relations.append(attr)
            elif attr in local:
                child_prefix = attr if prefix is None else (attr, prefix)
                new_instance[local[attr]] = self._variablize(value, renames,
                                                             child_prefix)
            else:
                new_instance[attr] = value

        for rel in relations:
            new_instance[rename_relation(rel, renames)] = instance[rel]

        return new_instance


class NumericToNominal(OneWayPreprocessor):
    """
    Converts numeric values to nominal ones.

    :class:`Cobweb3 <concept_formation.cobweb3.Cobweb3Tree>` and
    :class:`Trestle <concept_formation.trestle.TrestleTree>` will treat
    anything that passes ``isinstance(instance[attr],Number)`` as a numerical
    value. Because of how they store numerical distribution information, if
    either algorithm encounters a numerical value where it previously saw a
    nominal one it will throw an error. This preprocessor is provided as a way
    to address that problem by unifying the value types of attributes across an
    instance.

    This is a helper function preprocessor and so is not part of
    :class:`StructureMapper
    <concept_formation.structure_mapper.StructureMapper>`'s standard pipeline.

    >>> import pprint
    >>> ntn = NumericToNominal()
    >>> instance = {"x":12.5,"y":9,"z":"top"}
    >>> instance = ntn.transform(instance)
    >>> pprint.pprint(instance)
    {'x': '12.5', 'y': '9', 'z': 'top'}

    >>> ntn = NumericToNominal("y")
    >>> instance = {"x":12.5,"y":9,"z":"top"}
    >>> instance = ntn.transform(instance)
    >>> pprint.pprint(instance)
    {'x': 12.5, 'y': '9', 'z': 'top'}

    :param attrs: A list of specific attributes to convert. If left empty all
        numeric values will be converted.
    :type attrs: strings
    """
    def __init__(self, *attrs):
        # None means "no restriction": every attribute is a candidate.
        self.targets = attrs if attrs else None

    def transform(self, instance):
        """
        Transform target attribute values to nominal if they are numeric.

        Dictionary values are processed recursively with the same targets;
        everything else is copied unchanged.
        """
        targets = self.targets
        converted = {}

        for attr, value in instance.items():
            selected = targets is None or attr in targets
            if selected and isinstance(value, Number):
                converted[attr] = str(value)
            elif isinstance(value, dict):
                converted[attr] = self.transform(value)
            else:
                converted[attr] = value

        return converted


class NominalToNumeric(OneWayPreprocessor):
    """
    Converts nominal values to numeric ones.

    :class:`Cobweb3 <concept_formation.cobweb3.Cobweb3Tree>` and
    :class:`Trestle <concept_formation.trestle.TrestleTree>` will treat
    anything that passes ``isinstance(instance[attr],Number)`` as a numerical
    value. Because of how they store numerical distribution information, if
    either algorithm encounters a numerical value where it previously saw a
    nominal one it will throw an error. This preprocessor is provided as a way
    to address that problem by unifying the value types of attributes across an
    instance.

    Because parsing numbers is a less automatic function than casting things to
    strings this preprocessor has an extra parameter compared to
    :class:`NumericToNominal`. The on_fail parameter determines what should be
    done in the event of a parsing error and provides 3 options:

    * ``'break'`` - Simply re-raises the error that caused the problem and
      fails. **(Default)**
    * ``'drop'``  - Drops any attributes that fail to parse. They would be
      treated as missing by categorization.
    * ``'zero'``  - Replaces any problem values with ``0.0``.

    Component (dictionary) values are never parsed as numbers; they are
    processed recursively instead.

    This is a helper function preprocessor and so is not part of
    :class:`StructureMapper
    <concept_formation.structure_mapper.StructureMapper>`'s standard pipeline.

    >>> import pprint
    >>> ntn = NominalToNumeric()
    >>> instance = {"a":"123","b":"12.1241","c":"134"}
    >>> instance = ntn.transform(instance)
    >>> pprint.pprint(instance)
    {'a': 123.0, 'b': 12.1241, 'c': 134.0}

    >>> ntn = NominalToNumeric(on_fail='break')
    >>> instance = {"a":"123","b":"12.1241","c":"bad"}
    >>> instance = ntn.transform(instance)
    Traceback (most recent call last):
        ...
    ValueError: could not convert string to float: 'bad'

    >>> ntn = NominalToNumeric(on_fail="drop")
    >>> instance = {"a":"123","b":"12.1241","c":"bad"}
    >>> instance = ntn.transform(instance)
    >>> pprint.pprint(instance)
    {'a': 123.0, 'b': 12.1241}

    >>> ntn = NominalToNumeric(on_fail="zero")
    >>> instance = {"a":"123","b":"12.1241","c":"bad"}
    >>> instance = ntn.transform(instance)
    >>> pprint.pprint(instance)
    {'a': 123.0, 'b': 12.1241, 'c': 0.0}

    >>> ntn = NominalToNumeric("break","a","b")
    >>> instance = {"a":"123","b":"12.1241","c":"bad"}
    >>> instance = ntn.transform(instance)
    >>> pprint.pprint(instance)
    {'a': 123.0, 'b': 12.1241, 'c': 'bad'}

    :param on_fail: defines what should be done in the event of a numerical parse error
    :type on_fail: 'break', 'drop', or 'zero'
    :param attrs: A list of specific attributes to convert. If left empty all
        non-component values will be converted.
    :type attrs: strings
    """

    def __init__(self, on_fail='break', *attrs):
        self.targets = attrs if attrs else None

        # An unrecognised policy silently falls back to the default.
        if on_fail not in ("break", "drop", "zero"):
            on_fail = "break"
        self.on_fail = on_fail

    def transform(self, instance):
        """
        Transform target attribute values to numeric if they are valid nominals.

        :param instance: The instance to convert; it is not modified.
        :return: A new instance with the targeted values parsed by ``float``.
        :raises ValueError, TypeError: when a value cannot be parsed and
            ``on_fail`` is ``'break'``.
        """
        targets = self.targets
        converted = {}

        for attr, value in instance.items():
            # Components come first: otherwise, with no explicit targets,
            # every dictionary value would be handed to float() and fail.
            if isinstance(value, dict):
                converted[attr] = self.transform(value)
                continue

            if targets is not None and attr not in targets:
                converted[attr] = value
                continue

            try:
                converted[attr] = float(value)
            except (ValueError, TypeError):
                # TypeError covers values float() cannot accept at all
                # (e.g. None or a list), so the policy applies to them too.
                if self.on_fail == "break":
                    raise
                if self.on_fail == "zero":
                    converted[attr] = 0.0
                # "drop": the attribute is simply left out.

        return converted


class Sanitizer(OneWayPreprocessor):
    """
    This is a preprocessor that sanitizes instances to adhere to the general
    expectations of either Cobweb, Cobweb3 or Trestle. In general this
    means enforcing that attribute keys are either of type str or tuple and
    that relational tuples contain only values of str or tuple. The main
    reason for having this preprocessor is because many other things are valid
    dictionary keys in python and it's possible to have weird behavior as a
    result.


    >>> from pprint import pprint
    >>> instance = {'a1':'v1','a2':2,'a3':{'aa1':'1','aa2':2},1:'v2',len:'v3',('r1',2,'r3'):'v4',('r4','r5'):{'aa3':4,3:'v6'}}
    >>> pprint(instance)
    {<built-in function len>: 'v3',
     1: 'v2',
     'a1': 'v1',
     'a2': 2,
     'a3': {'aa1': '1', 'aa2': 2},
     ('r1', 2, 'r3'): 'v4',
     ('r4', 'r5'): {3: 'v6', 'aa3': 4}}
    >>> san = Sanitizer('cobweb')
    >>> inst = san.transform(instance)
    >>> pprint(inst)
    {'1': 'v2',
     '<built-in function len>': 'v3',
     'a1': 'v1',
     'a2': 2,
     'a3': "{'aa1':'1','aa2':2}",
     ('r1', 2, 'r3'): 'v4',
     ('r4', 'r5'): "{3:'v6','aa3':4}"}
    >>> san = Sanitizer('trestle')
    >>> inst = san.transform(instance)
    >>> pprint(inst)
    {'1': 'v2',
     '<built-in function len>': 'v3',
     'a1': 'v1',
     'a2': 2,
     'a3': {'aa1': '1', 'aa2': 2},
     ('r1', '2', 'r3'): 'v4',
     ('r4', 'r5'): {'3': 'v6', 'aa3': 4}}

    :param spec: Which algorithm's expectations to enforce: ``'trestle'``,
        ``'cobweb'`` or ``'cobweb3'`` (case-insensitive).
    :type spec: str
    :raises ValueError: if ``spec`` is not one of the supported names.
    """

    def __init__(self, spec='trestle'):
        normalized = spec.lower()
        if normalized not in ['trestle', 'cobweb', 'cobweb3']:
            raise ValueError("Invalid Spec: must be one of: 'trestle','cobweb','cobweb3'")
        # Store the normalized form: later checks compare against the
        # lower-case names, so 'Trestle' must behave like 'trestle'.
        self.spec = normalized

    def transform(self, instance):
        """
        Return a sanitized copy of the instance.
        """
        return self._sanitize(instance)

    def _cob_str(self, d):
        """
        Calling str on a dictionary is not guaranteed to print its keys
        deterministically so we need this function to ensure any stringified
        subobjects will be treated the same.

        Strings are wrapped in single quotes (without escaping), dictionaries
        are rendered with their keys sorted by ``str(key)``, and anything else
        goes through ``str``.
        """
        if isinstance(d, str):
            return "'" + d + "'"
        if isinstance(d, dict):
            items = [self._cob_str(k) + ':' + self._cob_str(d[k])
                     for k in sorted(d.keys(), key=str)]
            return '{' + ','.join(items) + '}'
        return str(d)

    def _sanitize_tuple(self, tup):
        """
        Return a copy of ``tup`` in which every item is a str or a (sanitized)
        tuple; other items are converted with ``str``.
        """
        ret = []
        for v in tup:
            if isinstance(v, str):
                ret.append(v)
            elif isinstance(v, tuple):
                ret.append(self._sanitize_tuple(v))
            else:
                ret.append(str(v))
        return tuple(ret)

    def _sanitize_value(self, val):
        """
        Sanitize an attribute value: hashable values are kept, dictionaries
        are sanitized recursively under the trestle spec, and every other
        value is replaced by its deterministic string form.
        """
        # collections.Hashable was removed in Python 3.10; the ABC lives in
        # collections.abc.
        from collections.abc import Hashable

        if isinstance(val, Hashable):
            return val
        if self.spec == 'trestle' and isinstance(val, dict):
            return self._sanitize(val)
        return self._cob_str(val)

    def _sanitize(self, instance):
        """
        Build the sanitized copy of ``instance``.

        String keys are kept; tuple keys are kept as they are, except under
        the trestle spec where their items are sanitized too; every other key
        is replaced by ``str(key)``. A message is printed when a converted key
        collides with a key already present in the instance.
        """
        ret = {}
        for attr, val in instance.items():
            if isinstance(attr, str):
                key = attr
            elif isinstance(attr, tuple):
                if self.spec == 'trestle':
                    key = self._sanitize_tuple(attr)
                    if key != attr and key in instance:
                        print('Sanitizing',attr,'is clobbering an existing vlaue')
                else:
                    key = attr
            else:
                key = str(attr)
                if key in instance:
                    print('Santitizing',attr,'is clobbering an existing value')

            ret[key] = self._sanitize_value(val)
        return ret