# Source code for concept_formation.utils

"""
The utils module contains a number of utility functions used by other modules.
"""

from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from __future__ import division
from numbers import Number
from random import uniform
from random import random
from math import sqrt
from math import isnan


# Lookup table of correction factors, keyed by sample size n (2 through 29).
# The c4(n) function reads from this table to apply corrections to estimates
# of std.
c4n_table = {
    2: 0.7978845608028654,
    3: 0.886226925452758,
    4: 0.9213177319235613,
    5: 0.9399856029866254,
    6: 0.9515328619481445,
    7: 0.9593687886998328,
    8: 0.9650304561473722,
    9: 0.9693106997139539,
    10: 0.9726592741215884,
    11: 0.9753500771452293,
    12: 0.9775593518547722,
    13: 0.9794056043142177,
    14: 0.9809714367555161,
    15: 0.9823161771626504,
    16: 0.9834835316158412,
    17: 0.9845064054718315,
    18: 0.985410043808079,
    19: 0.9862141368601935,
    20: 0.9869342675246552,
    21: 0.9875829288261562,
    22: 0.9881702533158311,
    23: 0.988704545233999,
    24: 0.9891926749585048,
    25: 0.9896403755857028,
    26: 0.9900524688409107,
    27: 0.990433039209448,
    28: 0.9907855696217323,
    29: 0.9911130482419843,
}

def c4(n):
    """
    Returns the correction factor used to unbias estimates of standard
    deviation in low sample sizes. This implementation is based on a lookup
    table for n in [2-29] and returns 1.0 for values >= 30.

    :param n: the sample size; values below 30 must be keys of the lookup
        table
    :return: the correction factor for a sample of size n
    :rtype: float
    :raises ValueError: if n is less than or equal to 1
    :raises KeyError: if n is below 30 but is not a key of the lookup table
        (for example a non-integral value such as 2.5)

    >>> c4(3)
    0.886226925452758
    """
    if n <= 1:
        raise ValueError("Cannot apply correction for a sample size of 1.")

    # Only sample sizes below 30 are tabulated; everything else is treated
    # as needing no correction.
    if n < 30:
        return c4n_table[n]
    return 1.0

def isNumber(n):
    """
    Check if a value is a number that should be handled differently
    than nominals.

    Booleans are not treated as numbers even though ``bool`` is a numeric
    type in Python, and NaN values are rejected as well.

    :param n: the value to check
    :return: True if n is a non-boolean, non-NaN number, False otherwise
    :rtype: bool

    >>> isNumber(3.5)
    True
    >>> isNumber(True)
    False
    >>> isNumber(float('nan'))
    False
    >>> isNumber('3')
    False
    """
    if isinstance(n, bool) or not isinstance(n, Number):
        return False

    # NaN compares unequal to itself. Testing this directly avoids
    # math.isnan, which has to convert its argument to a float and therefore
    # raises OverflowError for very large ints and TypeError for complex
    # values.
    return bool(n == n)

def mean(values):
    """
    Computes the mean of a list of values.

    This is primarily included to reduce dependency on external math libraries
    like numpy in the core algorithm.

    :param values: a list of numbers
    :type values: list
    :return: the mean of the list of values
    :rtype: float
    :raises ValueError: if values is empty

    >>> mean([600, 470, 170, 430, 300])
    394.0
    """
    count = len(values)
    if count <= 0:
        raise ValueError("Length of list must be greater than 0.")

    # Convert the total to float so the result is a float for any input type.
    return float(sum(values)) / count

def std(values):
    """
    Computes the standard deviation of a list of values.

    The sum of squared deviations from the mean is divided by the number of
    values (not by the number of values minus one), i.e. this is the
    population standard deviation.

    This is primarily included to reduce dependency on external math libraries
    like numpy in the core algorithm.

    :param values: a list of numbers
    :type values: list
    :return: the standard deviation of the list of values
    :rtype: float
    :raises ValueError: if values is empty

    >>> std([600, 470, 170, 430, 300])
    147.32277488562318
    """
    count = len(values)
    if count <= 0:
        raise ValueError("Length of list must be greater than 0.")

    center = mean(values)
    squared_deviations = sum((v - center) * (v - center) for v in values)
    variance = float(squared_deviations) / count
    return sqrt(variance)

def weighted_choice(choices):
    """
    Given a list of tuples [(val, prob),...(val, prob)], return a
    randomly chosen value where the choice is weighted by prob.

    The weights do not need to sum to 1; a single random threshold is drawn
    between 0 and the sum of the weights.

    :param choices: A list of tuples
    :type choices: [(val, prob),...(val, prob)]
    :return: A choice sampled from the list according to the weightings
    :rtype: val
    :raises ValueError: if no value can be selected, e.g. when choices is
        empty or all of the weights are zero

    >>> from random import seed
    >>> seed(1234)
    >>> options = [('a',.25),('b',.12),('c',.46),('d',.07)]
    >>> weighted_choice(options)
    'd'
    >>> weighted_choice(options)
    'c'
    >>> weighted_choice(options)
    'a'

    .. seealso:: :meth:`CobwebNode.sample <concept_formation.cobweb.CobwebNode.sample>`
    """
    total = sum(w for _, w in choices)
    r = uniform(0, total)

    # Walk the cumulative weights and return the first value whose running
    # total exceeds the random threshold.
    upto = 0
    for c, w in choices:
        if upto + w > r:
            return c
        upto += w

    # Raised explicitly rather than via ``assert`` so that the failure is not
    # stripped (and None silently returned) when Python runs with -O.
    raise ValueError("Cannot sample from choices: the list must be non-empty "
                     "and the weights must have a positive total.")

def most_likely_choice(choices):
    """
    Given a list of tuples [(val, prob),...(val, prob)], returns the
    value with the highest probability. Ties are randomly broken.

    >>> options = [('a',.25),('b',.12),('c',.46),('d',.07)]
    >>> most_likely_choice(options)
    'c'
    >>> most_likely_choice(options)
    'c'
    >>> most_likely_choice(options)
    'c'

    :param choices: A list of tuples
    :type choices: [(val, prob),...(val, prob)]
    :return: the val with the highest prob
    :rtype: val
    :raises ValueError: if choices is empty
    """
    # Attach a random number to every entry so that entries with equal
    # probability are ordered randomly.
    candidates = [(prob, random(), val) for val, prob in choices]
    if not candidates:
        raise ValueError("Cannot pick the most likely choice from an empty "
                         "list.")

    # Only the best entry is needed, so take the maximum instead of sorting.
    # The key leaves val out of the comparison so the values themselves never
    # have to be orderable.
    return max(candidates, key=lambda entry: entry[:2])[2]