GuessRedditDateSumo/linear_regression.py
2020-04-19 19:30:57 +02:00

26 lines
679 B
Python

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import sklearn
import pandas as pd
import math
import re
def create_dictionary(in_path):
    """Collect every word token from a UTF-8 text file, in reading order.

    Despite the function's name, the result is a flat list (duplicates are
    kept), not a dict: each entry is one maximal run of ``\\w`` characters
    (letters, digits, underscore) found in the file.

    Args:
        in_path: Path of the text file to read; it is decoded as UTF-8.

    Returns:
        A list of word strings in the order they occur in the file. An
        empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Compile once instead of looking the pattern up again for every line.
    word_pattern = re.compile(r"[\w]+")
    with open(in_path, encoding='utf-8') as in_file:
        # Stream the file line by line so only the tokens are kept in memory.
        return [
            word
            for line in in_file
            for word in word_pattern.findall(line)
        ]
def main(in_path="train/in.tsv"):
    """Fit a TF-IDF vectorizer on the words of a file and vectorize its vocabulary.

    Args:
        in_path: Path of the input text file. Defaults to ``"train/in.tsv"``,
            the location this script has always read from.

    Returns:
        A ``(vectorizer, matrix)`` tuple: the fitted ``TfidfVectorizer`` and
        the result of transforming the vectorizer's own vocabulary terms.
    """
    created_dictionary = create_dictionary(in_path)
    tfidf = TfidfVectorizer(stop_words='english')
    # fit() returns the vectorizer itself, so there is no separate "fitted
    # object" to keep; every list entry (a single word) is one document here.
    tfidf.fit(created_dictionary)
    # Iterating the vocabulary_ mapping yields its keys, so each learned term
    # is transformed as a one-word document.
    # NOTE(review): this vectorizes the vocabulary terms, not the input lines;
    # presumably the training documents were the intended input -- confirm.
    vocabulary_matrix = tfidf.transform(tfidf.vocabulary_)
    # Hand the results back instead of discarding them.
    return tfidf, vocabulary_matrix
# Run the pipeline only when executed as a script, so importing this module
# does not read the training file or fit a model as a side effect.
if __name__ == "__main__":
    main()