initial tf-idf construction

author: Guillaume Horel <guillaume.horel@gmail.com> 2015-11-07 18:49:40 -0500
committer: Guillaume Horel <guillaume.horel@gmail.com> 2015-11-07 18:49:40 -0500
commit: fc94b19c14cd44223b205469ed794abf08d9dfdb (patch)
tree: 34d04bbf0a36c9cf2f7698ad850769ae740555a3 /tf-idf.py
parent: 939ea00056d5cc8817f00b8c293efa04d36bf6d5 (diff)
download: slack-fc94b19c14cd44223b205469ed794abf08d9dfdb.tar.gz
1 files changed, 33 insertions, 0 deletions
diff --git a/tf-idf.py b/tf-idf.py
new file mode 100644
index 0000000..60280b8
--- /dev/null
+++ b/tf-idf.py
@@ -0,0 +1,33 @@
+import nltk
+import string
+import os
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from nltk.stem.porter import PorterStemmer
+
+def stem_tokens(tokens, stemmer):
+    """Return a new list of ``stemmer.stem(token)`` for each token, in order."""
+    # Built with a comprehension; the ``tokens`` iterable itself is not modified.
+    reduced = [stemmer.stem(token) for token in tokens]
+    return reduced
+
+def tokenize(text):
+    """Split ``text`` with NLTK's word tokenizer and return the stemmed tokens."""
+    # ``stemmer`` is the module-level global bound in the ``__main__`` block.
+    return stem_tokens(nltk.word_tokenize(text), stemmer)
+
+if __name__=="__main__":
+    token_dict = {}  # file path -> lowercased, punctuation-free text
+    stemmer = PorterStemmer()  # module-level global read by tokenize()
+    punctuation = set(string.punctuation)
+    for subdir, dirs, files in os.walk("explanations"):
+        for f in files:
+            file_path = os.path.join(subdir, f)
+            with open(file_path) as fh:
+                lowers = fh.read().lower()
+            # Character filter runs on Python 2 and 3 (translate(None, chars) is
+            # Python 2 only); keyed by path so same-named files are all kept.
+            token_dict[file_path] = "".join(c for c in lowers if c not in punctuation)
+    #this can take some time
+    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
+    tfs = tfidf.fit_transform(token_dict.values())
author	Guillaume Horel <guillaume.horel@gmail.com>	2015-11-07 18:49:40 -0500
committer	Guillaume Horel <guillaume.horel@gmail.com>	2015-11-07 18:49:40 -0500
commit	fc94b19c14cd44223b205469ed794abf08d9dfdb (patch)
tree	34d04bbf0a36c9cf2f7698ad850769ae740555a3 /tf-idf.py
parent	939ea00056d5cc8817f00b8c293efa04d36bf6d5 (diff)
download	slack-fc94b19c14cd44223b205469ed794abf08d9dfdb.tar.gz