GuessRedditDateSumo/linear_regression.py


import pickle
import re

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
def create_dictionary(in_path):
    """Read up to max_iteration lines and return one whitespace-joined document per line."""
    documents = []
    max_iteration = 50000
    i = 0
    with open(in_path, encoding='utf-8') as in_file:
        for line in in_file:
            # One document per input line, so TF-IDF rows stay aligned with the labels.
            documents.append(" ".join(re.findall(r"\w+", line)))
            i += 1
            if i >= max_iteration:
                break
    return documents
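
# Example (hypothetical input): the line "Ask Reddit: what's new?" becomes
# the single document "Ask Reddit what s new".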
def train():
    documents = create_dictionary("train/in.tsv")
    # Assumption: train/expected.tsv holds one numeric date per line,
    # aligned line-for-line with train/in.tsv.
    with open("train/expected.tsv", encoding='utf-8') as in_file:
        expected = [float(line) for line in in_file][:len(documents)]

    # Convert the documents into a TF-IDF matrix; ngram_range is the
    # number of words per extracted sequence.
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
    x = tfidf.fit_transform(documents)

    # Linear dimensionality reduction (LSA; akin to PCA but works on sparse
    # matrices); n_components is the desired dimensionality of the output data.
    pca = TruncatedSVD(n_components=200)
    x_pca = pca.fit_transform(x)

    l_regression = LinearRegression()
    l_regression.fit(x_pca, expected)

    # Persist all three fitted steps; the SVD reducer is needed at inference
    # time to map new TF-IDF vectors into the same 200-dimensional space.
    with open('l_regression.pkl', 'wb') as f:
        pickle.dump(l_regression, f)
    with open('tfidf_model.pkl', 'wb') as f:
        pickle.dump(tfidf, f)
    with open('pca_model.pkl', 'wb') as f:
        pickle.dump(pca, f)
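
# A minimal inference sketch (not in the original file): it assumes the three
# pickles written by train() above, and a hypothetical test/in.tsv laid out
# like train/in.tsv. Note transform(), not fit_transform(): the vocabulary
# and SVD components learned during training are reused unchanged.
def predict(in_path="test/in.tsv"):
    with open('tfidf_model.pkl', 'rb') as f:
        tfidf = pickle.load(f)
    with open('pca_model.pkl', 'rb') as f:
        pca = pickle.load(f)
    with open('l_regression.pkl', 'rb') as f:
        l_regression = pickle.load(f)
    documents = create_dictionary(in_path)
    x_pca = pca.transform(tfidf.transform(documents))
    return l_regression.predict(x_pca)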

if __name__ == "__main__":
    train()