Predicting with Noisy Nominal and Numeric FeaturesΒΆ

import numpy as np
import matplotlib.pyplot as plt
from random import choice
from random import shuffle
from random import random

from concept_formation.cobweb3 import Cobweb3Tree
from concept_formation.cluster import cluster

def run_clust_exp(nominal_noise=0, numeric_noise=0, scaling=False):
    data = []

    for i in range(60):
        x = {}
        x['_label'] = "G1"

        if random() >= nominal_noise:
            x['f1'] = "G1f1"
        else:
            x['f1'] = choice(['G2f1', 'G3f1'])

        if random() >= nominal_noise:
            x['f2'] = choice(["G1f2a", "G1f2b"])
        else:
            x['f2'] = choice(["G2f2a", "G2f2b","G3f2a", "G3f2b"])

        if random() >= numeric_noise:
            x['f3'] = np.random.normal(4,1,1)[0]
        else:
            x['f3'] = choice([np.random.normal(10,1,1)[0],np.random.normal(16,1,1)[0]])

        if random() >= numeric_noise:
            x['f4'] = np.random.normal(20,2,1)[0]
        else:
            x['f4'] = choice([np.random.normal(32,2,1)[0],np.random.normal(44,2,1)[0]])

        data.append(x)

    for i in range(60):
        x = {}
        x['_label'] = "G2"

        if random() >= nominal_noise:
            x['f1'] = "G2f1"
        else:
            x['f1'] = choice(["G2f1", "G3f1"])

        if random() >= nominal_noise:
            x['f2'] = choice(["G2f2a", "G2f2b"])
        else:
            x['f2'] = choice(["G1f2a", "G1f2b", "G3f2a", "G3f2b"])

        if random() >= numeric_noise:
            x['f3'] = np.random.normal(10,1,1)[0]
        else:
            x['f3'] = choice([np.random.normal(4,1,1)[0],np.random.normal(16,1,1)[0]])

        if random() >= numeric_noise:
            x['f4'] = np.random.normal(32,2,1)[0]
        else:
            x['f4'] = choice([np.random.normal(20,2,1)[0],np.random.normal(44,2,1)[0]])

        data.append(x)

    for i in range(60):
        x = {}
        x['_label'] = "G3"

        if random() >= nominal_noise:
            x['f1'] = "G3f1"
        else:
            x['f1'] = choice(["G1f1", "G2f1"])

        if random() >= nominal_noise:
            x['f2'] = choice(["G3f2a", "G3f2b"])
        else:
            x['f2'] = choice(["G1f2a", "G1f2b", "G2f2a", "G2f2b"])

        if random() >= numeric_noise:
            x['f3'] = np.random.normal(16,1,1)[0]
        else:
            x['f3'] = choice([np.random.normal(4,1,1)[0],np.random.normal(10,1,1)[0]])

        if random() >= numeric_noise:
            x['f4'] = np.random.normal(44,2,1)[0]
        else:
            x['f4'] = choice([np.random.normal(20,2,1)[0],np.random.normal(32,2,1)[0]])

        data.append(x)

    shuffle(data)
    t = Cobweb3Tree(scaling=scaling)
    clustering = cluster(t, data)
    return data, clustering[0]

def run_noise_exp(scaling=False):
    noise = np.arange(0.0, 0.8, 0.2)
    print(noise)

    miss_nominal = []
    miss_numeric = []
    #aris = []

    for n in noise:
        data, clustering = run_clust_exp(n,0,scaling)
        confusion = {}
        for i,c in enumerate(clustering):
            if c not in confusion:
                confusion[c] = {}
            if data[i]['_label'] not in confusion[c]:
                confusion[c][data[i]['_label']] = 0
            confusion[c][data[i]['_label']] += 1
        print(confusion)

        totals = sorted([(sum([confusion[c][g] for g in
                               confusion[c]]), c) for c in confusion], reverse=True)
        top = [c for t,c in totals[:3]]
        miss = 0

        for c in confusion:
            v = sorted([confusion[c][g] for g in confusion[c]], reverse=True)
            if c not in top:
                miss += v[0]

            for minorities in v[1:]:
                miss += 2 * minorities

        #labels = [d['_label'] for d in data]
        #aris.append(ari(labels, clustering))
        miss_nominal.append(miss)

    for n in noise:
        data, clustering = run_clust_exp(0,n, scaling)
        confusion = {}
        for i,c in enumerate(clustering):
            if c not in confusion:
                confusion[c] = {}
            if data[i]['_label'] not in confusion[c]:
                confusion[c][data[i]['_label']] = 0
            confusion[c][data[i]['_label']] += 1
        print(confusion)

        totals = sorted([(sum([confusion[c][g] for g in
                               confusion[c]]), c) for c in confusion], reverse=True)
        top = [c for t,c in totals[:3]]
        miss = 0

        for c in confusion:
            v = sorted([confusion[c][g] for g in confusion[c]], reverse=True)
            if c not in top:
                miss += v[0]

            for minorities in v[1:]:
                miss += 2 * minorities

        #labels = [d['_label'] for d in data]
        #aris.append(ari(labels, clustering))
        miss_numeric.append(miss)

    return noise, miss_nominal, miss_numeric

nominal = []
numeric = []

for i in range(2):
    noise, miss_nominal, miss_numeric = run_noise_exp(scaling=0.5)
    nominal.append(miss_nominal)
    numeric.append(miss_numeric)
    noise = noise

nominal = np.array(nominal)
numeric = np.array(numeric)

nominal = np.mean(nominal, axis=0)
numeric = np.mean(numeric, axis=0)

nominal_line, = plt.plot(noise, miss_nominal, linestyle="--", marker="o", color="b")
numeric_line, = plt.plot(noise, miss_numeric, linestyle="-", marker="o", color="g")
plt.legend([nominal_line, numeric_line], ["Noisy Nominal", "Noisy Numeric"],
           loc=2)

plt.xlabel("Percentage Noise")
plt.ylabel("Misclassification Count")


plt.ylim(0,200)
plt.show()

(Source code, png, hires.png, pdf)

../_images/li_biswas_experiment_00_00.png
#for i in range(len(data)):
#    c = clustering[i]
#    d = data[i]
#    print(c, d)
#
#

()