import os
import string

import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# nltk.word_tokenize needs the "punkt" tokenizer models; run
# nltk.download("punkt") once if they are not installed.
stemmer = PorterStemmer()


def stem_tokens(tokens, stemmer):
    # Reduce every token to its Porter stem.
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed


def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems


if __name__ == "__main__":
    token_dict = {}

    # Read every file under "explanations", lowercase it, and strip punctuation.
    for subdir, dirs, files in os.walk("explanations"):
        for f in files:
            file_path = os.path.join(subdir, f)
            with open(file_path) as fh:
                text = fh.read()
            lowers = text.lower()
            no_punctuation = lowers.translate(str.maketrans("", "", string.punctuation))
            token_dict[f] = no_punctuation

    # This can take some time.
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words="english")
    tfs = tfidf.fit_transform(token_dict.values())
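
    # A quick sanity check on the fitted model: print the highest-weighted
    # terms for the first document. A minimal sketch, assuming scikit-learn
    # >= 1.0 for get_feature_names_out(); the top-5 cutoff is arbitrary.
    feature_names = tfidf.get_feature_names_out()
    weights = tfs[0].toarray().ravel()
    top_terms = sorted(zip(feature_names, weights), key=lambda t: t[1], reverse=True)[:5]
    for term, weight in top_terms:
        print(term, weight)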