diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2015-11-07 18:49:40 -0500 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2015-11-07 18:49:40 -0500 |
| commit | fc94b19c14cd44223b205469ed794abf08d9dfdb (patch) | |
| tree | 34d04bbf0a36c9cf2f7698ad850769ae740555a3 /tf-idf.py | |
| parent | 939ea00056d5cc8817f00b8c293efa04d36bf6d5 (diff) | |
| download | slack-fc94b19c14cd44223b205469ed794abf08d9dfdb.tar.gz | |
initial tf-idf construction
Diffstat (limited to 'tf-idf.py')
| -rw-r--r-- | tf-idf.py | 33 |
1 files changed, 33 insertions, 0 deletions
"""Build a TF-IDF matrix from the text files under ``explanations/``.

Reconstructed from the single-line diff that adds tf-idf.py (new file,
mode 100644, blob 60280b8). Every file below ``explanations`` is read,
lower-cased, stripped of ASCII punctuation, and fed to a
``TfidfVectorizer`` that tokenizes with NLTK and Porter-stems each token.
"""
import os
import string

import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Stemmer used by tokenize(). Created at import time (not only under
# ``__main__``) so tokenize() also works when this module is imported.
stemmer = PorterStemmer()

# Translation table that deletes every character in string.punctuation.
# The Python 2 spelling ``text.translate(None, string.punctuation)`` raises
# TypeError on Python 3, where str.translate takes a single table argument.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def stem_tokens(tokens, stemmer):
    """Return a list with ``stemmer.stem`` applied to each item of *tokens*.

    *tokens* is any iterable of strings; *stemmer* is any object exposing a
    ``stem(word)`` method. Order is preserved.
    """
    return [stemmer.stem(item) for item in tokens]


def tokenize(text):
    """Split *text* with ``nltk.word_tokenize`` and stem every token.

    Uses the module-level ``stemmer``. Returns the list of stems.
    """
    return stem_tokens(nltk.word_tokenize(text), stemmer)


def _normalize(text):
    """Lower-case *text* and delete all ``string.punctuation`` characters."""
    return text.lower().translate(_PUNCTUATION_TABLE)


def _load_corpus(root):
    """Return ``{file path: normalized text}`` for every file below *root*.

    Keys are the joined path rather than the bare file name, so two files
    with the same name in different subdirectories do not overwrite each
    other. A missing *root* yields an empty dict (os.walk yields nothing).
    """
    corpus = {}
    for subdir, _dirs, files in os.walk(root):
        for name in files:
            file_path = os.path.join(subdir, name)
            with open(file_path) as fh:
                corpus[file_path] = _normalize(fh.read())
    return corpus


if __name__ == "__main__":
    token_dict = _load_corpus("explanations")
    if not token_dict:
        # Fail with a clear message instead of letting fit_transform raise
        # an unrelated error on an empty document collection.
        raise SystemExit("no documents found under 'explanations'")

    # this can take some time
    # NOTE(review): with a custom tokenizer the 'english' stop words are
    # matched against the stemmed tokens, so stop words whose stem differs
    # from the word itself may not be removed -- confirm this is acceptable.
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    tfs = tfidf.fit_transform(token_dict.values())
