TFIDF proj commit

This commit is contained in:
Bartusiak 2020-05-05 15:19:10 +02:00
parent 05327feaf1
commit d2b5466b05
4 changed files with 13 additions and 13 deletions

Binary file not shown.

View File

@ -13,14 +13,13 @@ from sklearn.linear_model import LinearRegression
def create_dictionary(in_path): def create_dictionary(in_path):
tfDict = [] tfDict = []
max_iteration = 60000
i=0; i=0;
with open(in_path,encoding='utf-8') as in_file: with open(in_path,encoding='utf-8') as in_file:
for line in in_file: for line in in_file:
for word in re.findall(r"[\w]+",line): for word in re.findall(r"[\w]+",line):
tfDict.append(word) tfDict.append(word)
i+=1 i+=1
if(i>=60014): if(i>=50054):
break break
return tfDict return tfDict
## ##
@ -32,7 +31,7 @@ def train():
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) #Konwertuje tekst w dokumencie do macierzy tfidf , ngram_range - lb słów w sekwencji
x = tfidf.fit_transform(created_dictionary) x = tfidf.fit_transform(created_dictionary)
#PCA - principal component analysis #PCA - principal component analysis
pca = TruncatedSVD(n_components=300) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych pca = TruncatedSVD(n_components=100) # Liniowa redukcja wymiarów , n_components - Pożądana wymiarowość danych wyjściowych
x_pca = pca.fit_transform(x) x_pca = pca.fit_transform(x)
l_regression = LinearRegression() l_regression = LinearRegression()
l_regression.fit(x_pca,expected_dictionary) l_regression.fit(x_pca,expected_dictionary)
@ -42,7 +41,4 @@ def train():
with open('tfidf_model.pkl', 'wb') as f: with open('tfidf_model.pkl', 'wb') as f:
pickle.dump(tfidf,f) pickle.dump(tfidf,f)
#y = tfidf.transform(x)
#print(y);
train() train()

View File

@ -1,10 +1,19 @@
import pickle import pickle
from typing import re
import numpy as np import numpy as np
from sklearn.decomposition import PCA from sklearn.decomposition import PCA
from linear_regression import create_dictionary
from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD from sklearn.decomposition import TruncatedSVD
def create_dictionary(in_path):
    """Return all word tokens from the UTF-8 text file at ``in_path``.

    The file is read line by line and every maximal run of ``\\w``
    characters (letters, digits, underscore) becomes one token. Tokens are
    returned in file order and duplicates are kept.

    Args:
        in_path: Path of the text file to tokenize; opened as UTF-8.

    Returns:
        A list of token strings; empty when the file contains no word
        characters.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Import the real regex module locally: this file's module-level ``re``
    # comes from ``from typing import re``, which is the typing alias
    # namespace and has no ``findall``.
    import re

    tokens = []
    with open(in_path, encoding='utf-8') as in_file:
        for line in in_file:
            tokens.extend(re.findall(r"[\w]+", line))
    return tokens
def predict(): def predict():
input_file = open("l_regression.pkl",'rb') input_file = open("l_regression.pkl",'rb')
l_regression = pickle.load(input_file) l_regression = pickle.load(input_file)
@ -17,7 +26,7 @@ def predict():
testA_vector = tfidf.fit_transform(testA) testA_vector = tfidf.fit_transform(testA)
#print(testA_vector) #print(testA_vector)
pca = TruncatedSVD(n_components=300) pca = TruncatedSVD(n_components=100)
dev0_pca = pca.fit_transform(dev0_vector) dev0_pca = pca.fit_transform(dev0_vector)
testA_pca = pca.fit_transform(testA_vector) testA_pca = pca.fit_transform(testA_vector)
@ -32,9 +41,4 @@ def predict():
foo = np.array(y_test) foo = np.array(y_test)
np.savetxt(output,foo) np.savetxt(output,foo)
#print(y_test)
# dev0_vectorizer =
predict() predict()

Binary file not shown.