normalize dataset function

author: jeanpouget-abadie <jean.pougetabadie@gmail.com> 2014-12-07 12:36:49 -0500
committer: jeanpouget-abadie <jean.pougetabadie@gmail.com> 2014-12-07 12:36:49 -0500
commit: 830e7fdc86c10d22bca2694f2a1da276cd1c8f60 (patch)
tree: ae910498fb08b3862423fc745ba106609fd6dd58 /datasets/normalize_dataset.py
parent: 9de35421f25bf45158187daea4ddfedd1c93f3d8 (diff)
download: cascades-830e7fdc86c10d22bca2694f2a1da276cd1c8f60.tar.gz
1 files changed, 30 insertions, 5 deletions
diff --git a/datasets/normalize_dataset.py b/datasets/normalize_dataset.py
index befebee..95d8537 100644
--- a/datasets/normalize_dataset.py
+++ b/datasets/normalize_dataset.py
@@ -1,6 +1,31 @@
-"""
-Run the following script on a dataset!
+import numpy as np
+from itertools import izip
 
-If the nodes are not numbered 0 to number_of_nodes - 1, then it will print
-out the normalized version of the dataset in the same directory
-"""
+def normalize_file(filename):
+    """
+    Relabel the node ids of a two-column edge-list file as 0..n-1 (ids
+    ranked by np.unique order) and write the remapped edges, one pair per
+    line, to filename[:-4] + "normalize.txt". Always rewrites; returns None.
+    """
+    #First pass: collect both endpoints of every line that contains no "#"
+    nodes = []
+    with open(filename, "r") as f:
+        for line in f:
+            if "#" not in line:
+                node_1, node_2 = line.split()
+                nodes.append(node_1); nodes.append(node_2)
+        uniq_nodes = np.unique(nodes)
+
+    #Map each unique node id to its position in uniq_nodes (0 .. n-1)
+    hash_nodes = {}
+    for idx, node in enumerate(uniq_nodes):
+        hash_nodes[node] = idx
+
+    #Write remapped edges. NOTE(review): "#" lines are not skipped here
+    with open(filename[:-4]+"normalize.txt", "w") as g:
+        with open(filename, "r") as f:
+            for line_f in f:
+                f_node_1, f_node_2 = line_f.split()
+                g_node_1 = hash_nodes[f_node_1]
+                g_node_2 = hash_nodes[f_node_2]
+                g.write(str(g_node_1)+" "+str(g_node_2)+"\n")
author	jeanpouget-abadie <jean.pougetabadie@gmail.com>	2014-12-07 12:36:49 -0500
committer	jeanpouget-abadie <jean.pougetabadie@gmail.com>	2014-12-07 12:36:49 -0500
commit	830e7fdc86c10d22bca2694f2a1da276cd1c8f60 (patch)
tree	ae910498fb08b3862423fc745ba106609fd6dd58 /datasets/normalize_dataset.py
parent	9de35421f25bf45158187daea4ddfedd1c93f3d8 (diff)
download	cascades-830e7fdc86c10d22bca2694f2a1da276cd1c8f60.tar.gz