44 lines
1.4 KiB
Python
44 lines
1.4 KiB
Python
import pickle
|
|
|
|
from sklearn.decomposition import TruncatedSVD
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
import sklearn
|
|
import pandas as pd
|
|
import math
|
|
import re
|
|
|
|
from sklearn.linear_model import LinearRegression
|
|
|
|
|
|
def create_dictionary(in_path, limit=50054):
    """Collect word tokens from a UTF-8 text file, in file order.

    Despite the name, the result is a flat list (duplicates kept), not a
    dict. A token is a maximal run of ``\\w`` characters, so punctuation
    and whitespace act as separators (``"1985.5"`` yields ``"1985"`` and
    ``"5"``).

    Args:
        in_path: Path of the file to read; it is opened as UTF-8 text.
        limit: Maximum number of tokens to return. ``None`` disables the
            cap; a value of zero or less yields an empty list.

    Returns:
        A list with at most ``limit`` tokens.

    Raises:
        OSError: If ``in_path`` cannot be opened.
    """
    tokens = []
    if limit is not None and limit <= 0:
        return tokens

    with open(in_path, encoding='utf-8') as in_file:
        for line in in_file:
            for word in re.findall(r"\w+", line):
                tokens.append(word)
                # Return from inside the inner loop so the cap is exact; a
                # plain `break` here would only leave the per-line loop and
                # later lines would keep contributing tokens.
                if limit is not None and len(tokens) >= limit:
                    return tokens
    return tokens
|
|
##
|
|
|
|
def train():
    """Fit TF-IDF, truncated SVD and linear regression, then pickle the results.

    Tokens are read with ``create_dictionary`` from ``train/in.tsv`` (inputs)
    and ``train/expected.tsv`` (targets). The input token list is handed to
    the vectorizer as its collection of raw documents, the TF-IDF matrix is
    reduced to 100 components, and a linear regression is fitted on the
    reduced matrix against the target tokens.

    Side effects: writes ``l_regression.pkl`` (the fitted regression) and
    ``tfidf_model.pkl`` (the fitted vectorizer) to the working directory.
    """
    input_tokens = create_dictionary("train/in.tsv")
    target_tokens = create_dictionary("train/expected.tsv")

    # Converts the text into a TF-IDF matrix; ngram_range is the number of
    # words per sequence (unigrams only here).
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
    tfidf_matrix = vectorizer.fit_transform(input_tokens)

    # Linear dimensionality reduction; n_components is the desired
    # dimensionality of the output data.
    # NOTE(review): the fitted reducer is not pickled below, so a consumer
    # of l_regression.pkl must rebuild an equivalent projection — confirm
    # how the prediction side handles this.
    reducer = TruncatedSVD(n_components=100)
    reduced_features = reducer.fit_transform(tfidf_matrix)

    regressor = LinearRegression()
    regressor.fit(reduced_features, target_tokens)

    # Persist the regression first, then the vectorizer.
    artifacts = (
        ('l_regression.pkl', regressor),
        ('tfidf_model.pkl', vectorizer),
    )
    for file_name, artifact in artifacts:
        with open(file_name, 'wb') as out_file:
            pickle.dump(artifact, out_file)
|
|
|
|
# Runs the whole training pipeline as soon as this module is executed or
# imported.
train()