"""Fit a TF-IDF vectorizer on the words extracted from a text file."""

import math
import re

import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Runs of word characters; compiled once so the per-line loop reuses it.
_WORD_RE = re.compile(r"[\w]+")


def create_dictionary(in_path):
    """Return every word found in the file at *in_path*.

    The file is read as UTF-8, line by line. A "word" is any maximal run of
    regex word characters (``\\w``). Words are returned in the order they
    appear; duplicates are kept and case is left untouched.

    Args:
        in_path: Path of the text file to read.

    Returns:
        A list of word strings (empty if the file contains no word
        characters).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(in_path, encoding="utf-8") as in_file:
        return [word for line in in_file for word in _WORD_RE.findall(line)]


def main(in_path="train/in.tsv"):
    """Fit a TF-IDF vectorizer on the words of *in_path* and transform its vocabulary.

    Each extracted word is handed to the vectorizer as its own document.
    After fitting, every term of the learned vocabulary is transformed as a
    one-term document.

    Args:
        in_path: Path of the input text file. Defaults to ``"train/in.tsv"``.

    Returns:
        The matrix produced by ``TfidfVectorizer.transform`` with one row per
        vocabulary term, in the iteration order of ``vocabulary_``.
    """
    words = create_dictionary(in_path)
    vectorizer = TfidfVectorizer(stop_words="english")
    vectorizer.fit(words)
    # Iterating the vocabulary dict yields its keys (the terms); build the
    # list explicitly so it is clear that terms, not indices, are transformed.
    return vectorizer.transform(list(vectorizer.vocabulary_))


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()