# path: root/tf-idf.py
# blob: 60280b8c93c185a41cadebdfc76eedb93f873bf0
import nltk
import string
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

def stem_tokens(tokens, stemmer):
    """Return the stem of every token, in the original order.

    Args:
        tokens: Iterable of token strings.
        stemmer: Object exposing a ``stem(token)`` method (the script passes
            an NLTK ``PorterStemmer``).

    Returns:
        A new list holding ``stemmer.stem(token)`` for each token.
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Split *text* into word tokens with NLTK and return their stems.

    Relies on the module-level ``stemmer`` that the ``__main__`` section
    creates before the vectorizer calls this function.
    """
    return stem_tokens(nltk.word_tokenize(text), stemmer)

# Deletion table mapping every ASCII punctuation character to None; built once
# so each document is cleaned in a single translate() pass.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def normalize_text(text):
    """Lower-case *text* and delete every ASCII punctuation character.

    Args:
        text: Raw document contents.

    Returns:
        The lower-cased text with all ``string.punctuation`` characters
        removed.
    """
    # str.translate() takes a single mapping on Python 3; the old two-argument
    # form translate(None, deletechars) raises TypeError there.
    return text.lower().translate(_PUNCTUATION_TABLE)


if __name__ == "__main__":
    token_dict = {}
    # tokenize() looks this name up as a module global, so it must exist
    # before the vectorizer runs.
    stemmer = PorterStemmer()
    for subdir, _dirs, files in os.walk("explanations"):
        for f in files:
            file_path = os.path.join(subdir, f)
            with open(file_path) as fh:
                text = fh.read()
            # Key by full path: bare file names can repeat across
            # subdirectories and would silently overwrite each other.
            token_dict[file_path] = normalize_text(text)

    # this can take some time
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    tfs = tfidf.fit_transform(token_dict.values())