Predicting with Noisy Nominal and Numeric Features

import numpy as np
import matplotlib.pyplot as plt
from random import choice
from random import shuffle
from random import random
from random import seed

from concept_formation.cobweb3 import Cobweb3Tree
from concept_formation.cluster import cluster

# Seed both random sources used by this script: the stdlib generator drives
# the noise decisions, choice() and shuffle(), while NumPy's global generator
# supplies the numeric feature values via np.random.normal.
seed(0)
np.random.seed(0)

def run_clust_exp(nominal_noise=0, numeric_noise=0, scaling=False):
    """Build a noisy three-group dataset and cluster it with a Cobweb3Tree.

    Sixty instances are generated for each of the groups ``G1``, ``G2`` and
    ``G3``. Every instance is a dict with a ``_label`` entry naming its
    group, two nominal features (``f1``, ``f2``) and two numeric features
    (``f3``, ``f4``). Each feature is drawn independently: a clean value
    comes from the instance's own group, a noisy value comes from one of the
    two *other* groups.

    :param nominal_noise: probability (0..1) that a nominal feature is
        replaced by a value belonging to another group.
    :param numeric_noise: probability (0..1) that a numeric feature is
        sampled from another group's normal distribution.
    :param scaling: forwarded unchanged to ``Cobweb3Tree(scaling=...)``.
    :return: ``(data, labels)`` where ``data`` is the shuffled list of
        instance dicts and ``labels`` is the first element of the value
        returned by ``cluster(tree, data)``.
    """
    groups = ("G1", "G2", "G3")
    # Mean of each group's normal distribution, per numeric feature.
    f3_mean = {"G1": 4, "G2": 10, "G3": 16}   # sampled with std dev 1
    f4_mean = {"G1": 20, "G2": 32, "G3": 44}  # sampled with std dev 2
    per_group = 60

    data = []
    for label in groups:
        # Noise always draws from the groups the instance does NOT belong
        # to; otherwise part of the "noise" would silently be clean data.
        others = [g for g in groups if g != label]

        for _ in range(per_group):
            # NOTE(review): the leading underscore presumably keeps the
            # label out of the clustering itself -- confirm against the
            # concept_formation documentation.
            x = {'_label': label}

            # random() is in [0, 1), so the else-branches below fire with
            # probability equal to the respective noise level.
            if random() >= nominal_noise:
                x['f1'] = label + "f1"
            else:
                x['f1'] = choice([g + "f1" for g in others])

            if random() >= nominal_noise:
                x['f2'] = choice([label + "f2a", label + "f2b"])
            else:
                x['f2'] = choice([g + suffix for g in others
                                  for suffix in ("f2a", "f2b")])

            if random() >= numeric_noise:
                x['f3'] = np.random.normal(f3_mean[label], 1, 1)[0]
            else:
                x['f3'] = choice([np.random.normal(f3_mean[g], 1, 1)[0]
                                  for g in others])

            if random() >= numeric_noise:
                x['f4'] = np.random.normal(f4_mean[label], 2, 1)[0]
            else:
                x['f4'] = choice([np.random.normal(f4_mean[g], 2, 1)[0]
                                  for g in others])

            data.append(x)

    # Shuffle so the groups are not presented to the tree in blocks.
    shuffle(data)
    tree = Cobweb3Tree(scaling=scaling)
    clustering = cluster(tree, data)
    return data, clustering[0]

def _misclassification_count(data, clustering):
    """Score one clustering against the ``_label`` of every instance.

    A confusion table ``{cluster: {label: count}}`` is built and printed.
    The three clusters with the most members are treated as the "real"
    clusters. The score is the sum of:

    * the largest label count of every cluster outside those top three, and
    * twice every non-largest label count in any cluster.

    :param data: list of instance dicts, each with a ``_label`` key.
    :param clustering: sequence of cluster identifiers, aligned with
        ``data`` by position.
    :return: the misclassification score as an int.
    """
    confusion = {}
    for instance, c in zip(data, clustering):
        counts = confusion.setdefault(c, {})
        label = instance['_label']
        counts[label] = counts.get(label, 0) + 1
    print(confusion)

    # Rank clusters by size; ties fall back to comparing the identifiers.
    totals = sorted(((sum(counts.values()), c)
                     for c, counts in confusion.items()), reverse=True)
    top = [c for _, c in totals[:3]]

    miss = 0
    for c, counts in confusion.items():
        v = sorted(counts.values(), reverse=True)
        if c not in top:
            # A surplus cluster: even its majority label counts as missed.
            miss += v[0]
        for minority in v[1:]:
            miss += 2 * minority
    return miss


def run_noise_exp(scaling=False):
    """Measure misclassification as nominal, then numeric, noise increases.

    The noise levels are ``np.arange(0.0, 0.8, 0.2)``. For each level the
    clustering experiment is first run with only nominal noise, and then,
    in a second pass over the same levels, with only numeric noise.

    :param scaling: forwarded to ``run_clust_exp``.
    :return: ``(noise, miss_nominal, miss_numeric)`` -- the array of noise
        levels and two lists holding one misclassification score per level.
    """
    noise = np.arange(0.0, 0.8, 0.2)
    print(noise)

    miss_nominal = []
    for n in noise:
        data, clustering = run_clust_exp(n, 0, scaling)
        miss_nominal.append(_misclassification_count(data, clustering))

    miss_numeric = []
    for n in noise:
        data, clustering = run_clust_exp(0, n, scaling)
        miss_numeric.append(_misclassification_count(data, clustering))

    return noise, miss_nominal, miss_numeric

# Repeat the whole noise experiment and collect one row of scores per run.
nominal = []
numeric = []

for i in range(2):
    noise, miss_nominal, miss_numeric = run_noise_exp(scaling=0.5)
    nominal.append(miss_nominal)
    numeric.append(miss_numeric)

# Average over runs (axis 0) to get one mean score per noise level.
nominal = np.mean(np.array(nominal), axis=0)
numeric = np.mean(np.array(numeric), axis=0)

# Plot the averaged curves, not the scores of the last run only.
nominal_line, = plt.plot(noise, nominal, linestyle="--", marker="o", color="b")
numeric_line, = plt.plot(noise, numeric, linestyle="-", marker="o", color="g")
plt.legend([nominal_line, numeric_line], ["Noisy Nominal", "Noisy Numeric"],
           loc=2)

plt.xlabel("Percentage Noise")
plt.ylabel("Misclassification Count")


plt.ylim(0,200)
plt.show()

(Source code, png, hires.png, pdf)

../_images/li_biswas_experiment.png