diff options
| -rwxr-xr-x | data/class.py | 95 |
1 files changed, 32 insertions, 63 deletions
diff --git a/data/class.py b/data/class.py index e7975f9..2cdfa40 100755 --- a/data/class.py +++ b/data/class.py @@ -2,69 +2,38 @@ import copy import sys from svmutil import * +import numpy as np +import matplotlib.pyplot as plt -lower = 0.1 -upper = 10 - -def normalize_instances(instances, ranges = None) : - normalized_instances = copy.deepcopy(instances) - if ranges == None : - ranges_dict = dict() - for attribute in normalized_instances[0].keys() : # we iterate on the attributes - column = [instance[attribute] for instance in normalized_instances] - if ranges != None : - minimum = ranges[attribute][0] - maximum = ranges[attribute][1] - else : - minimum = min(column) - maximum = max(column) - ranges_dict[attribute] = [minimum, maximum] - for i in range(len(column)) : - if column[i] == minimum : - column[i] = lower - elif column[i] == maximum : - column[i] = upper - else : - column[i] = lower + (upper-lower) * (column[i] - minimum) / (maximum - minimum) - # Copying normalized values in memory - - for elem, instance in zip(column, normalized_instances): - instance[attribute] = elem - - if ranges == None : - return normalized_instances, ranges_dict - else : - return normalized_instances - - -def read_file(filename) : - y = [] - x = [] - for line in filename: - values = line.rstrip().split(',') - if values[0] != "# dir": - dict = {} - for i in range(9): - if float(values[i+5])!=-1.: - dict[i+1] = float(values[i+5]) - if len(dict)==9: - y += [int(values[1])] - x += [dict] - print line.rstrip() - #for a,b in zip(y,x): - # result = str(a) - # for i in range(9): - # result += " "+str(i+1)+":"+str(b[i+1]) - # print result - #return (y,x) +def read_normalize(filename,means=None,std=None) : + a = np.loadtxt(filename,comments="#",delimiter=",",usecols=(1,6,7,8,9,10,11,12,13,14)) + distance,variance = np.loadtxt(filename,comments="#",delimiter=",",usecols=((4,5)),unpack=True) + a = np.ma.masked_equal(a,-1) + a = np.ma.mask_rows(a) + mask = a.mask[:,0] + a = np.ma.compress_rows(a) + distance = distance[mask] + variance = variance[mask] +# plt.plot(range(len(variance)),variance,range(len(distance)),distance) +# plt.show() + a = a[np.logical_and(variance>0.005,variance<0.05)] + + rows,cols = a.shape + if means==None: + means = {} + if std==None: + std = {} + for col in xrange(1,cols): + if col not in means: + means[col] = np.mean(a[:,col]) + if col not in std: + std[col] = np.std(a[:,col]) + a[:,col] = (a[:,col]-means[col])/(std[col]) + return list(a[:,0]),[dict(zip(range(1,cols+1),r)) for r in a[:,1:]],means,std train_filename = sys.argv[1] -#test_filename = sys.argv[2] -y1,x1 = read_file(open(train_filename)) -#x1,ranges = normalize_instances(x1) -#print ranges -#exit(0) -#model = svm_train(y1,x1) -#y2,x2 = read_file(open(test_filename)) -#x2 = normalize_instances(x2,ranges) -#p_labels,p_acc,p_vals = svm_predict(y2,x2,model) +test_filename = sys.argv[2] +y1,x1,means,std = read_normalize(train_filename) +model = svm_train(y1,x1) +y2,x2,means,std = read_normalize(test_filename,means=means,std=std) +p_labels,p_acc,p_vals = svm_predict(y2,x2,model) |
