summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xdata/nn-goldman.py71
1 files changed, 27 insertions, 44 deletions
diff --git a/data/nn-goldman.py b/data/nn-goldman.py
index 0a04e7a..297544b 100755
--- a/data/nn-goldman.py
+++ b/data/nn-goldman.py
@@ -1,52 +1,35 @@
#!/usr/bin/python
import sys
-import random
import numpy as np
-
-def quadratic_knn_search(data, lidx, ldata, K):
- """ find K nearest neighbours of data among ldata """
- ndata = ldata.shape[1]
- param = ldata.shape[0]
- K = K if K < ndata else ndata
- retval = []
- sqd = ((ldata - data[:,:ndata])**2).sum(axis=0) # data.reshape((param,1)).repeat(ndata, axis=1);
- idx = np.argsort(sqd, kind='mergesort')
- idx = idx[:K]
- return zip(sqd[idx], lidx[idx])
-
-def normalize(a,weights=None):
- if weights == None:
- weights= {}
- cols = a.shape[1]
- for i in range(cols):
- weights[i] = None
-
- for i in weights.keys():
+#in place modification !
+def normalize(a):
+ print a
+ for i in range(a.shape[1]):
column = a[:,i]
- if weights[i] == None:
- weights[i] = np.mean(column), np.std(column)
- a[:,i] = (column-weights[i][0])/weights[i][1]
- return a
+ weights = np.mean(column), np.std(column)
+ a[:,i] = (column-weights[0])/weights[1]
+ return a
-def knn_search( data1, data2, K ):
- """ find the K nearest neighbours for data points in data,
- using O(n**2) search """
- ndata = data1.shape[1]
- knn = []
- idx = np.arange(ndata)
- for i in np.arange(ndata):
- _knn = quadratic_knn_search(data1[:,i], idx, data2, K+1) # see above
- knn.append( _knn[1:] )
- return knn
+def knn_search(names,d1,d2,k):
+ for i,row2 in enumerate(d2):
+ distance = []
+ for row1 in d1:
+ distance += [((row2-row1)**2).sum()]
+ indexes = np.argsort(np.array(distance))[:k]
+ nn = map(int,names[indexes])
+ name = int(names[i])
+ print str(name)+"|"+ ",".join(map(str,nn))+"|"+str(name in nn)
if __name__ == "__main__":
- random.seed()
- sk_data = normalize(np.loadtxt(sys.argv[1],comments="#",delimiter=",",usecols=range(1,7,1)).T)
- gaussify = np.vectorize(lambda x: x+random.gauss(0,float(sys.argv[2])))
- sk1 = gaussify(sk_data)
- sk2 = gaussify(sk_data)
- print sk1
- print sk2
- print knn_search(sk1,sk2,1)
-
+ np.random.seed()
+ var = float(sys.argv[2])
+ sk_data = np.loadtxt(sys.argv[1],comments="#",delimiter=",")
+ names = sk_data[:,0]
+ sk_data = sk_data[:,1:]
+ noise1 = np.random.normal(0,var,sk_data.shape)
+ noise2 = np.random.normal(0,var,sk_data.shape)
+ sk1 = normalize(sk_data+noise1)
+ sk2 = normalize(sk_data+noise2)
+ knn_search(names,sk1,sk2,1)
+