summary | refs | log | tree | commit | diff | stats
path: root/data/nn-goldman.py
blob: 34072de5a61101d14cec4373fbf18e4dfc54dbb8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/python
import sys
import numpy as np

#in place modification !
def normalize(a,weights=None):
    """Shift and scale columns of ``a`` in place using per-column (mean, std).

    Parameters
    ----------
    a : numpy array indexed as ``a[:, i]`` (two-dimensional).
        Modified in place and also returned. NOTE: results are written back
        into ``a``, so an integer array would truncate them; pass floats.
    weights : dict or None
        Maps a column index to a ``(mean, std)`` tuple, or to ``None`` when
        the statistics still have to be computed from ``a``. Only columns
        present as keys are touched. When ``weights`` itself is ``None``,
        every column of ``a`` is normalised with freshly computed statistics.
        Entries that were ``None`` are filled in, so the caller's dict is
        updated too and can be reused to normalise another array identically.

    Returns
    -------
    (a, weights) : the same array object and the (possibly filled) dict.
    """
    if weights is None:
        weights = dict.fromkeys(range(a.shape[1]))

    for i in weights:
        column = a[:, i]
        if weights[i] is None:
            weights[i] = np.mean(column), np.std(column)
        mean, std = weights[i]
        # A constant column has std == 0; dividing by it would fill the
        # column with NaN/inf, so such a column is only centred.
        scale = std if std != 0 else 1.0
        a[:, i] = (column - mean) / scale
    return a, weights

def knn_search(names,d1,d2,k):
    """Print the ``k`` nearest rows of ``d1`` for every row of ``d2``.

    Distances are squared Euclidean distances between rows. ``names`` is
    indexed both by the positions of the neighbours found in ``d1`` and by
    the position ``i`` of the query row in ``d2``, so both data sets are
    expected to be ordered like ``names``.

    One line is printed per row of ``d2``::

        <name>|<nn1>,<nn2>,...|<True or False>

    where ``<name>`` is ``int(names[i])``, the middle field lists the names
    of the ``k`` nearest ``d1`` rows (closest first), and the last field
    says whether the query's own name is among them.

    Parameters
    ----------
    names : numpy array of numeric labels (converted with ``int``).
    d1 : iterable of reference rows (numpy arrays).
    d2 : iterable of query rows (numpy arrays).
    k : number of neighbours to report.
    """
    for i, row2 in enumerate(d2):
        distance = [((row2 - row1) ** 2).sum() for row1 in d1]
        indexes = np.argsort(np.array(distance))[:k]
        # Build a real list: a lazy map() would be exhausted by the join
        # below before the membership test on Python 3.
        nn = [int(v) for v in names[indexes]]
        name = int(names[i])
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(str(name) + "|" + ",".join(map(str, nn)) + "|" + str(name in nn))

if __name__ == "__main__":
    # Usage: <script> <data.csv> <noise_scale>
    #   data.csv     comma-separated numeric rows; column 0 is the row's
    #                name, the remaining columns are its features.
    #   noise_scale  spread of the Gaussian noise added to the features.
    if len(sys.argv) < 3:
        sys.exit("usage: %s <data.csv> <noise_scale>" % sys.argv[0])

    np.random.seed()
    # NOTE(review): despite its name, `var` is handed to np.random.normal as
    # the `scale` argument, i.e. a standard deviation, not a variance.
    # Confirm which one the caller means to supply.
    var = float(sys.argv[2])
    # ndmin=2 keeps a single-row file two-dimensional so the column slicing
    # below does not fail.
    sk_data = np.loadtxt(sys.argv[1], comments="#", delimiter=",", ndmin=2)
    names = sk_data[:, 0]
    sk_data = sk_data[:, 1:]
    # Two independently perturbed copies of the same data set.
    noise1 = np.random.normal(0, var, sk_data.shape)
    noise2 = np.random.normal(0, var, sk_data.shape)
    # Normalisation is currently disabled; the raw noisy copies are compared.
    #sk1,weights = normalize(sk_data+noise1)
    #sk2,weights = normalize(sk_data+noise2,weights)
    sk1 = sk_data + noise1
    sk2 = sk_data + noise2
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(sk1)
    print(sk2)
    # For each row of the second copy, report its nearest row in the first.
    knn_search(names, sk1, sk2, 1)