diff --git a/l_regression.pkl b/l_regression.pkl
index a3bb734..5bda98b 100644
Binary files a/l_regression.pkl and b/l_regression.pkl differ
diff --git a/linear_regression.py b/linear_regression.py
index 2d89ef6..b8d355d 100644
--- a/linear_regression.py
+++ b/linear_regression.py
@@ -13,14 +13,13 @@ from sklearn.linear_model import LinearRegression
 
 def create_dictionary(in_path):
     tfDict = []
-    max_iteration = 60000
     i=0;
     with open(in_path,encoding='utf-8') as in_file:
         for line in in_file:
             for word in re.findall(r"[\w]+",line):
                 tfDict.append(word)
                 i+=1
-                if(i>=60014):
+                if(i>=50054):
                     break
     return tfDict
 ##
@@ -32,7 +31,7 @@ def train():
     tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji
     x = tfidf.fit_transform(created_dictionary)
     #PCA - principal component analysis
-    pca = TruncatedSVD(n_components=300) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
+    pca = TruncatedSVD(n_components=100) # Linear dimensionality reduction; n_components - desired dimensionality of the output data
     x_pca = pca.fit_transform(x)
     l_regression = LinearRegression()
     l_regression.fit(x_pca,expected_dictionary)
@@ -42,7 +41,4 @@ def train():
     with open('tfidf_model.pkl', 'wb') as f:
         pickle.dump(tfidf,f)
 
-    #y = tfidf.transform(x)
-    #print(y);
-
 train()
\ No newline at end of file
diff --git a/predict.py b/predict.py
index 90a19f2..58c9215 100644
--- a/predict.py
+++ b/predict.py
@@ -1,10 +1,20 @@
 import pickle
+import re
+
 import numpy as np
 from sklearn.decomposition import PCA
-from linear_regression import create_dictionary
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.decomposition import TruncatedSVD
 
+def create_dictionary(in_path):
+    """Return all word tokens found in the UTF-8 text file at in_path, in file order."""
+    tfDict = []
+    with open(in_path,encoding='utf-8') as in_file:
+        for line in in_file:
+            for word in re.findall(r"[\w]+",line):
+                tfDict.append(word)
+    return tfDict
+
 def predict():
     input_file = open("l_regression.pkl",'rb')
     l_regression = pickle.load(input_file)
@@ -17,7 +27,7 @@ def predict():
     testA_vector = tfidf.fit_transform(testA)
     #print(testA_vector)
 
-    pca = TruncatedSVD(n_components=300)
+    pca = TruncatedSVD(n_components=100)
     dev0_pca = pca.fit_transform(dev0_vector)
     testA_pca = pca.fit_transform(testA_vector)
 
@@ -32,9 +42,4 @@ def predict():
     foo = np.array(y_test)
     np.savetxt(output,foo)
 
-    #print(y_test)
-
-    # dev0_vectorizer =
-
-
 predict()
\ No newline at end of file
diff --git a/tfidf_model.pkl b/tfidf_model.pkl
index ccb9c30..0ecb2b1 100644
Binary files a/tfidf_model.pkl and b/tfidf_model.pkl differ