TFIDF proj
commit d2b5466b05
parent 05327feaf1
BIN  l_regression.pkl
Binary file not shown.
@@ -13,14 +13,13 @@ from sklearn.linear_model import LinearRegression
 def create_dictionary(in_path):
     tfDict = []
-    max_iteration = 60000
     i=0;
     with open(in_path,encoding='utf-8') as in_file:
         for line in in_file:
             for word in re.findall(r"[\w]+",line):
                 tfDict.append(word)
                 i+=1
-                if(i>=60014):
+                if(i>=50054):
                     break
     return tfDict
 
 ##
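The hunk above removes the dead `max_iteration` variable but leaves its replacement, the magic number 50054, hard-coded in the loop. It also keeps a subtle bug: `break` only exits the inner loop, so after the cap is hit the function still appends the first word of every remaining line. A minimal sketch of an equivalent reader with the cap as a parameter and an early return (the signature and default are my own, not from the commit):

```python
import re

def create_dictionary(in_path, max_words=50054):
    """Collect up to max_words word tokens from a UTF-8 text file."""
    tfDict = []
    with open(in_path, encoding='utf-8') as in_file:
        for line in in_file:
            for word in re.findall(r"\w+", line):
                tfDict.append(word)
                if len(tfDict) >= max_words:
                    return tfDict  # returning exits both loops cleanly
    return tfDict
```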
@@ -32,7 +31,7 @@ def train():
     tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,1)) # Converts the document text into a TF-IDF matrix; ngram_range - number of words per sequence
     x = tfidf.fit_transform(created_dictionary)
     #PCA - principal component analysis
-    pca = TruncatedSVD(n_components=300) # Linear dimensionality reduction; n_components - desired dimensionality of the output data
+    pca = TruncatedSVD(n_components=100) # Linear dimensionality reduction; n_components - desired dimensionality of the output data
     x_pca = pca.fit_transform(x)
     l_regression = LinearRegression()
     l_regression.fit(x_pca,expected_dictionary)
@@ -42,7 +41,4 @@ def train():
     with open('tfidf_model.pkl', 'wb') as f:
         pickle.dump(tfidf,f)
 
-    #y = tfidf.transform(x)
-    #print(y);
-
 train()
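For context, the training flow in this file is: tokenize with `create_dictionary`, vectorize with `TfidfVectorizer`, reduce to 100 components with `TruncatedSVD` (down from 300 in this commit), then fit `LinearRegression`. One gap worth flagging: the fitted SVD is never pickled, only the vectorizer (and, judging by the binary in this commit, the regression model), which is what later forces predict.py to refit the SVD on test data. A hedged sketch of a `train()` that persists all three fitted stages (`svd_model.pkl` and the function signature are my assumptions, not part of the commit):

```python
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LinearRegression

def train(created_dictionary, expected_dictionary):
    # TF-IDF over the token list, as in the committed code.
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
    x = tfidf.fit_transform(created_dictionary)

    # Reduce the sparse TF-IDF matrix to 100 dense components.
    svd = TruncatedSVD(n_components=100)
    x_svd = svd.fit_transform(x)

    # Fit the regression on the reduced features.
    l_regression = LinearRegression()
    l_regression.fit(x_svd, expected_dictionary)

    # Persist every fitted stage; svd_model.pkl is an assumed name,
    # not a file shipped by this commit.
    for name, obj in [('tfidf_model.pkl', tfidf),
                      ('svd_model.pkl', svd),
                      ('l_regression.pkl', l_regression)]:
        with open(name, 'wb') as f:
            pickle.dump(obj, f)
```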
predict.py (18 lines changed)
@@ -1,10 +1,19 @@
 import pickle
+from typing import re
 
 import numpy as np
 from sklearn.decomposition import PCA
-from linear_regression import create_dictionary
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.decomposition import TruncatedSVD
 
+
+def create_dictionary(in_path):
+    tfDict = []
+    with open(in_path,encoding='utf-8') as in_file:
+        for line in in_file:
+            for word in re.findall(r"[\w]+",line):
+                tfDict.append(word)
+    return tfDict
+
 
 def predict():
     input_file = open("l_regression.pkl",'rb')
     l_regression = pickle.load(input_file)
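Two things in this new header are worth flagging. `from typing import re` imports the `typing.re` alias namespace (deprecated and later removed in Python 3.12), which only exposes `Pattern` and `Match`, so the `re.findall` call in the copied `create_dictionary` will fail with an `AttributeError` at runtime; the standard-library `import re` is what the code needs. Copying `create_dictionary` in place of the deleted import also forks the function (this copy silently drops the word cap). A minimal corrected import block, assuming the module name from the import this commit removed:

```python
import pickle
import re  # standard-library re: this is what provides re.findall

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
# The PCA import was unused in this file and is dropped here.

# Reuse the training-side tokenizer rather than keeping a diverging copy.
from linear_regression import create_dictionary
```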
@@ -17,7 +26,7 @@ def predict():
     testA_vector = tfidf.fit_transform(testA)
 
     #print(testA_vector)
-    pca = TruncatedSVD(n_components=300)
+    pca = TruncatedSVD(n_components=100)
 
     dev0_pca = pca.fit_transform(dev0_vector)
     testA_pca = pca.fit_transform(testA_vector)
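Note that this hunk only keeps the shapes compatible with the retrained model: `fit_transform` here refits the vectorizer and a fresh `TruncatedSVD` on the evaluation sets, so dev0 and testA end up in feature spaces different from the one `l_regression` was trained in (and from each other). The usual pattern is to load the transformers fitted at training time and call `transform` only; a sketch under the assumption that training also pickled its SVD as `svd_model.pkl` (not a file in this commit):

```python
import pickle

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

tfidf = load_pickle('tfidf_model.pkl')          # fitted during train()
svd = load_pickle('svd_model.pkl')              # assumed artifact, see the train() sketch above
l_regression = load_pickle('l_regression.pkl')

# transform (not fit_transform) keeps test features in the training space.
# testA stands for the list of evaluation documents read inside predict().
testA_vector = tfidf.transform(testA)
testA_pca = svd.transform(testA_vector)
y_test = l_regression.predict(testA_pca)
```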
@@ -32,9 +41,4 @@ def predict():
     foo = np.array(y_test)
     np.savetxt(output,foo)
 
-    #print(y_test)
-
-    # dev0_vectorizer =
-
-
 predict()
BIN  tfidf_model.pkl
Binary file not shown.