diff options
Diffstat (limited to 'data/nn-goldman.py')
| -rwxr-xr-x | data/nn-goldman.py | 71 |
1 files changed, 27 insertions, 44 deletions
diff --git a/data/nn-goldman.py b/data/nn-goldman.py index 0a04e7a..297544b 100755 --- a/data/nn-goldman.py +++ b/data/nn-goldman.py @@ -1,52 +1,35 @@ #!/usr/bin/python import sys -import random import numpy as np - -def quadratic_knn_search(data, lidx, ldata, K): - """ find K nearest neighbours of data among ldata """ - ndata = ldata.shape[1] - param = ldata.shape[0] - K = K if K < ndata else ndata - retval = [] - sqd = ((ldata - data[:,:ndata])**2).sum(axis=0) # data.reshape((param,1)).repeat(ndata, axis=1); - idx = np.argsort(sqd, kind='mergesort') - idx = idx[:K] - return zip(sqd[idx], lidx[idx]) - -def normalize(a,weights=None): - if weights == None: - weights= {} - cols = a.shape[1] - for i in range(cols): - weights[i] = None - - for i in weights.keys(): +#in place modification ! +def normalize(a): + print a + for i in range(a.shape[1]): column = a[:,i] - if weights[i] == None: - weights[i] = np.mean(column), np.std(column) - a[:,i] = (column-weights[i][0])/weights[i][1] - return a + weights = np.mean(column), np.std(column) + a[:,i] = (column-weights[0])/weights[1] + return a -def knn_search( data1, data2, K ): - """ find the K nearest neighbours for data points in data, - using O(n**2) search """ - ndata = data1.shape[1] - knn = [] - idx = np.arange(ndata) - for i in np.arange(ndata): - _knn = quadratic_knn_search(data1[:,i], idx, data2, K+1) # see above - knn.append( _knn[1:] ) - return knn +def knn_search(names,d1,d2,k): + for i,row2 in enumerate(d2): + distance = [] + for row1 in d1: + distance += [((row2-row1)**2).sum()] + indexes = np.argsort(np.array(distance))[:k] + nn = map(int,names[indexes]) + name = int(names[i]) + print str(name)+"|"+ ",".join(map(str,nn))+"|"+str(name in nn) if __name__ == "__main__": - random.seed() - sk_data = normalize(np.loadtxt(sys.argv[1],comments="#",delimiter=",",usecols=range(1,7,1)).T) - gaussify = np.vectorize(lambda x: x+random.gauss(0,float(sys.argv[2]))) - sk1 = gaussify(sk_data) - sk2 = gaussify(sk_data) - print sk1 - print sk2 - print knn_search(sk1,sk2,1) - + np.random.seed() + var = float(sys.argv[2]) + sk_data = np.loadtxt(sys.argv[1],comments="#",delimiter=",") + names = sk_data[:,0] + sk_data = sk_data[:,1:] + noise1 = np.random.normal(0,var,sk_data.shape) + noise2 = np.random.normal(0,var,sk_data.shape) + sk1 = normalize(sk_data+noise1) + sk2 = normalize(sk_data+noise2) + knn_search(names,sk1,sk2,1) + |
