aboutsummaryrefslogtreecommitdiffstats
path: root/datasets/normalize_dataset.py
diff options
context:
space:
mode:
authorjeanpouget-abadie <jean.pougetabadie@gmail.com>2014-12-07 12:36:49 -0500
committerjeanpouget-abadie <jean.pougetabadie@gmail.com>2014-12-07 12:36:49 -0500
commit830e7fdc86c10d22bca2694f2a1da276cd1c8f60 (patch)
treeae910498fb08b3862423fc745ba106609fd6dd58 /datasets/normalize_dataset.py
parent9de35421f25bf45158187daea4ddfedd1c93f3d8 (diff)
downloadcascades-830e7fdc86c10d22bca2694f2a1da276cd1c8f60.tar.gz
normalize dataset function
Diffstat (limited to 'datasets/normalize_dataset.py')
-rw-r--r--datasets/normalize_dataset.py35
1 files changed, 30 insertions, 5 deletions
diff --git a/datasets/normalize_dataset.py b/datasets/normalize_dataset.py
index befebee..95d8537 100644
--- a/datasets/normalize_dataset.py
+++ b/datasets/normalize_dataset.py
@@ -1,6 +1,31 @@
-"""
-Run the following script on a dataset!
+import numpy as np
+from itertools import izip
-If the nodes are not numbered 0 to number_of_nodes - 1, then it will print
-out the normalized version of the dataset in the same directory
-"""
def _iter_edges(filename):
    """Yield a (node_1, node_2) pair of string ids for each edge line of filename."""
    with open(filename, "r") as f:
        for line in f:
            # Any line containing "#" is treated as a comment and skipped.
            if "#" in line:
                continue
            fields = line.split()
            # Blank lines (e.g. a trailing empty line) carry no edge.
            if not fields:
                continue
            # Still raises ValueError if a line does not hold exactly two fields.
            node_1, node_2 = fields
            yield node_1, node_2


def normalize_file(filename):
    """
    Write a copy of an edge-list file with nodes renumbered from 0.

    Each non-comment, non-blank line of ``filename`` must hold exactly two
    whitespace-separated node identifiers. The distinct identifiers are
    sorted as strings and mapped to the indices 0 .. number_of_nodes - 1.
    The renumbered edges are written, one ``"<idx_1> <idx_2>"`` per line, to
    ``filename[:-4] + "normalize.txt"`` (for ``graph.txt`` this is
    ``graphnormalize.txt``, in the same directory).

    Lines containing ``#`` are treated as comments and are not copied to the
    output. The output file is always written, even when the input is
    already numbered 0 to number_of_nodes - 1.

    Raises ValueError if an edge line does not contain exactly two fields.
    """
    # First pass: collect the distinct node identifiers.
    nodes = set()
    for node_1, node_2 in _iter_edges(filename):
        nodes.add(node_1)
        nodes.add(node_2)

    # Map each identifier to its rank in sorted (string) order.
    hash_nodes = {node: idx for idx, node in enumerate(sorted(nodes))}

    # Second pass: reuse the same line filter as the first pass so comment
    # and blank lines cannot reach the lookup below.
    with open(filename[:-4] + "normalize.txt", "w") as g:
        for node_1, node_2 in _iter_edges(filename):
            g.write(str(hash_nodes[node_1]) + " " + str(hash_nodes[node_2]) + "\n")