import os
import string

import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the stemmer at module level so tokenize() can see it.
stemmer = PorterStemmer()

def stem_tokens(tokens, stemmer):
    # Reduce each token to its Porter stem.
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # Requires the NLTK 'punkt' tokenizer data (nltk.download('punkt')).
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

if __name__ == "__main__":
    token_dict = {}
    for subdir, dirs, files in os.walk("explanations"):
        for f in files:
            file_path = os.path.join(subdir, f)
            with open(file_path) as fh:
                text = fh.read()
            lowers = text.lower()
            # str.translate(None, ...) is Python 2 only; in Python 3,
            # build a deletion table with str.maketrans instead.
            no_punctuation = lowers.translate(
                str.maketrans("", "", string.punctuation))
            token_dict[f] = no_punctuation

    # this can take some time
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
    tfs = tfidf.fit_transform(token_dict.values())
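    # Example usage, as a sketch: once the vectorizer is fitted, the same
    # pipeline (tokenize -> stem -> weight) can score new text against the
    # learned vocabulary. The sample string here is hypothetical; on
    # scikit-learn < 1.0, use get_feature_names() instead of
    # get_feature_names_out().
    response = tfidf.transform(["this sentence is an example query"])
    feature_names = tfidf.get_feature_names_out()
    for col in response.nonzero()[1]:
        # Print each term in the query that survived stop-word removal,
        # along with its tf-idf weight in this document.
        print(feature_names[col], response[0, col])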