# GuessRedditDateSumo/linear_regression.py

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import sklearn
import pandas as pd
import math
import re


def create_dictionary(in_path):
    """Return every word found in the UTF-8 text file at *in_path*.

    The file is read line by line and each run of ``\\w`` characters is
    collected. Words are returned as a flat list in file order; repeated
    words are kept, so the list may contain duplicates.
    """
    word_pattern = re.compile(r"[\w]+")
    words = []
    with open(in_path, encoding='utf-8') as source:
        for row in source:
            # findall already yields the matches in order of appearance
            words.extend(word_pattern.findall(row))
    return words


def main():
    """Fit a TF-IDF vectorizer on the training words and transform its vocabulary.

    Collects the words of ``train/in.tsv`` via ``create_dictionary``, fits a
    ``TfidfVectorizer`` configured with English stop words on that word list,
    and then calls ``transform`` on the fitted ``vocabulary_``. The
    transformed result is neither stored beyond this call nor returned.
    """
    words = create_dictionary("train/in.tsv")
    vectorizer = TfidfVectorizer(stop_words='english')
    # vocabulary_ is read from whatever object fit() hands back
    fitted = vectorizer.fit(words)
    vocabulary = fitted.vocabulary_
    vectorizer.transform(vocabulary)

# Run the pipeline only when this file is executed as a script, so that
# importing the module (e.g. to reuse create_dictionary) does not trigger
# the file read and vectorizer fitting done by main().
if __name__ == "__main__":
    main()