about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- tf-idf.py 33
1 files changed, 33 insertions, 0 deletions
diff --git a/tf-idf.py b/tf-idf.py
new file mode 100644
index 0000000..60280b8
--- /dev/null
+++ b/tf-idf.py
@@ -0,0 +1,33 @@
+import nltk
+import string
+import os
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+from nltk.stem.porter import PorterStemmer
+
+def stem_tokens(tokens, stemmer):
+    """Return a list holding the stem of every token, in input order.
+
+    Args:
+        tokens: Iterable of token strings.
+        stemmer: Object exposing a ``stem(token)`` method.
+
+    Returns:
+        A new list with ``stemmer.stem(token)`` for each token.
+    """
+    return [stemmer.stem(token) for token in tokens]
+
+def tokenize(text):
+    """Split *text* into word tokens and return their stems as a list.
+
+    Tokenization is done by ``nltk.word_tokenize``; stemming is delegated to
+    ``stem_tokens``.
+
+    NOTE: this reads the module-level name ``stemmer``, which is only created
+    inside the ``__main__`` section of this file, so that name must exist
+    before the function is called.
+    """
+    return stem_tokens(nltk.word_tokenize(text), stemmer)
+
+if __name__ == "__main__":
+    # Module-level stemmer; tokenize() looks this name up when it is called.
+    stemmer = PorterStemmer()
+
+    # Build the punctuation-deletion table once, outside the loop.
+    # str.translate(None, chars) is the Python 2 signature and raises
+    # TypeError on Python 3, where a translation table is required instead.
+    strip_punctuation = str.maketrans("", "", string.punctuation)
+
+    # Maps each document's path to its lower-cased, punctuation-free text.
+    # Keyed by the full path (not the bare file name) so that equally named
+    # files in different sub-directories do not overwrite one another.
+    token_dict = {}
+    for subdir, _dirs, files in os.walk("explanations"):
+        for f in files:
+            file_path = os.path.join(subdir, f)
+            with open(file_path) as fh:
+                text = fh.read()
+            token_dict[file_path] = text.lower().translate(strip_punctuation)
+
+    # This can take some time.
+    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
+    tfs = tfidf.fit_transform(token_dict.values())