diff options
| author | jeanpouget-abadie <jean.pougetabadie@gmail.com> | 2014-12-07 12:36:49 -0500 |
|---|---|---|
| committer | jeanpouget-abadie <jean.pougetabadie@gmail.com> | 2014-12-07 12:36:49 -0500 |
| commit | 830e7fdc86c10d22bca2694f2a1da276cd1c8f60 (patch) | |
| tree | ae910498fb08b3862423fc745ba106609fd6dd58 /datasets/normalize_dataset.py | |
| parent | 9de35421f25bf45158187daea4ddfedd1c93f3d8 (diff) | |
| download | cascades-830e7fdc86c10d22bca2694f2a1da276cd1c8f60.tar.gz | |
normalize dataset function
Diffstat (limited to 'datasets/normalize_dataset.py')
| -rw-r--r-- | datasets/normalize_dataset.py | 35 |
1 files changed, 30 insertions, 5 deletions
"""Renumber the nodes of an edge-list dataset to 0 .. number_of_nodes - 1."""
import numpy as np

try:
    from itertools import izip  # only exists on Python 2
except ImportError:
    # Python 3 removed izip; the built-in zip is already lazy.
    izip = zip


def normalize_file(filename):
    """
    Write a copy of an edge-list file with its nodes renumbered from 0.

    Each data line of `filename` must hold exactly two whitespace-separated
    node identifiers. Lines containing "#" (comments) and blank lines are
    ignored and are not copied to the output.

    Distinct identifiers are sorted as strings (so "10" sorts before "9")
    and replaced by their rank, giving indices 0 .. number_of_nodes - 1.

    The result is always written, one "src dst" pair per line, to
    filename[:-4] + "normalize.txt", i.e. the last four characters of
    `filename` (typically ".txt") are dropped before the suffix is added.

    Raises ValueError if a data line does not contain exactly two tokens.
    """
    # Read the edges once; the same filter must apply to both the node
    # census and the rewrite, otherwise comment lines break the rewrite.
    edges = []
    with open(filename, "r") as f:
        for line in f:
            if "#" in line or not line.strip():
                continue
            node_1, node_2 = line.split()
            edges.append((node_1, node_2))

    # Hash nodes to an index between 0 and number_of_nodes - 1
    nodes = [node for edge in edges for node in edge]
    uniq_nodes = np.unique(nodes)
    hash_nodes = {node: idx for idx, node in enumerate(uniq_nodes)}

    # Write to file
    with open(filename[:-4] + "normalize.txt", "w") as g:
        for node_1, node_2 in edges:
            g.write(str(hash_nodes[node_1]) + " "
                    + str(hash_nodes[node_2]) + "\n")
