GuessRedditDateSumo/linear_regression.py


import pickle
import re

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
def create_dictionary(in_path):
    """Read up to max_iteration lines and return one whitespace-joined document per line."""
    documents = []
    max_iteration = 50000
    i = 0
    with open(in_path, encoding='utf-8') as in_file:
        for line in in_file:
            # One document per input line, so TF-IDF rows stay aligned with the labels.
            documents.append(" ".join(re.findall(r"\w+", line)))
            i += 1
            if i >= max_iteration:
                break
    return documents
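
# Example (hypothetical input): the line "Ask Reddit: what's new?" becomes
# the single document "Ask Reddit what s new".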
def train():
    documents = create_dictionary("train/in.tsv")
    # Assumption: train/expected.tsv holds one numeric date per line,
    # aligned line-for-line with train/in.tsv.
    with open("train/expected.tsv", encoding='utf-8') as in_file:
        expected = [float(line) for line in in_file][:len(documents)]

    # Convert the documents into a TF-IDF matrix; ngram_range is the
    # number of words per extracted sequence.
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
    x = tfidf.fit_transform(documents)

    # Linear dimensionality reduction (LSA; akin to PCA but works on sparse
    # matrices); n_components is the desired dimensionality of the output data.
    pca = TruncatedSVD(n_components=200)
    x_pca = pca.fit_transform(x)

    l_regression = LinearRegression()
    l_regression.fit(x_pca, expected)

    # Persist all three fitted steps; the SVD reducer is needed at inference
    # time to map new TF-IDF vectors into the same 200-dimensional space.
    with open('l_regression.pkl', 'wb') as f:
        pickle.dump(l_regression, f)
    with open('tfidf_model.pkl', 'wb') as f:
        pickle.dump(tfidf, f)
    with open('pca_model.pkl', 'wb') as f:
        pickle.dump(pca, f)
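
# A minimal inference sketch (not in the original file): it assumes the three
# pickles written by train() above, and a hypothetical test/in.tsv laid out
# like train/in.tsv. Note transform(), not fit_transform(): the vocabulary
# and SVD components learned during training are reused unchanged.
def predict(in_path="test/in.tsv"):
    with open('tfidf_model.pkl', 'rb') as f:
        tfidf = pickle.load(f)
    with open('pca_model.pkl', 'rb') as f:
        pca = pickle.load(f)
    with open('l_regression.pkl', 'rb') as f:
        l_regression = pickle.load(f)
    documents = create_dictionary(in_path)
    x_pca = pca.transform(tfidf.transform(documents))
    return l_regression.predict(x_pca)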

if __name__ == "__main__":
    train()