"""Train a TF-IDF -> TruncatedSVD -> linear-regression model and pickle it.

Reads tokens from ``train/in.tsv`` (features) and ``train/expected.tsv``
(targets), fits the pipeline, and writes ``l_regression.pkl`` and
``tfidf_model.pkl`` to the current working directory.
"""
import math
import pickle
import re

import pandas as pd
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression


def create_dictionary(in_path: str, max_words: int = 50054) -> list:
    """Return the word tokens found at the beginning of a UTF-8 text file.

    The file is read line by line and every ``[\\w]+`` match is appended to
    the result. The size limit is checked once per line, after the whole
    line has been tokenized, so the result may contain more than
    ``max_words`` tokens (the last line read is never truncated) and at
    least the first line is always read.

    Args:
        in_path: Path of the file to tokenize.
        max_words: Stop reading once at least this many tokens have been
            collected. Defaults to 50054, the limit previously hard-coded
            in this function.

    Returns:
        The tokens as a flat list of strings, in file order.
    """
    words = []
    with open(in_path, encoding='utf-8') as in_file:
        for line in in_file:
            words.extend(re.findall(r"[\w]+", line))
            # Checked per line, not per word: lines are kept whole.
            if len(words) >= max_words:
                break
    return words


def train():
    """Fit the TF-IDF, SVD and regression models and pickle the results.

    Side effects: reads ``train/in.tsv`` and ``train/expected.tsv`` and
    writes ``l_regression.pkl`` and ``tfidf_model.pkl``.
    """
    created_dictionary = create_dictionary("train/in.tsv")
    expected_dictionary = create_dictionary("train/expected.tsv")

    # Convert the input tokens into a TF-IDF matrix; ngram_range=(1, 1)
    # means only single words (unigrams) are used as features. Each token
    # returned by create_dictionary is treated as one document.
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
    x = tfidf.fit_transform(created_dictionary)

    # PCA-like linear dimensionality reduction; n_components is the
    # desired dimensionality of the output.
    pca = TruncatedSVD(n_components=200)
    x_pca = pca.fit_transform(x)

    # NOTE(review): the targets are passed as string tokens; presumably
    # they are numeric values - confirm against train/expected.tsv.
    l_regression = LinearRegression()
    l_regression.fit(x_pca, expected_dictionary)

    # NOTE(review): the fitted TruncatedSVD is not persisted here, yet the
    # regression was trained on its output - applying the saved model to
    # new data needs the same fitted transform. Confirm how the consumer of
    # these pickles obtains it.
    with open('l_regression.pkl', 'wb') as f:
        pickle.dump(l_regression, f)
    with open('tfidf_model.pkl', 'wb') as f:
        pickle.dump(tfidf, f)


if __name__ == "__main__":
    # Guarded so that importing this module (e.g. to reuse
    # create_dictionary) does not retrain and overwrite the pickles.
    train()