2020-05-05 14:17:49 +02:00
|
|
|
import pickle
|
2020-05-05 15:19:10 +02:00
|
|
|
from typing import re
|
|
|
|
|
2020-05-05 14:17:49 +02:00
|
|
|
import numpy as np
|
|
|
|
from sklearn.decomposition import PCA
|
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
from sklearn.decomposition import TruncatedSVD
|
|
|
|
|
2020-05-05 15:19:10 +02:00
|
|
|
def create_dictionary(in_path):
    """Tokenise a UTF-8 text file into a flat list of word tokens.

    Every line of the file is scanned for runs of word characters
    (``\\w+``: letters, digits and underscore, Unicode-aware) and the
    matches are collected in file order. Line boundaries are not
    preserved: tokens from all lines end up in one list.

    Args:
        in_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A list of token strings, in order of appearance; empty if the file
        contains no word characters.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Imported locally on purpose: the module-level ``from typing import re``
    # binds the deprecated ``typing.re`` namespace, which only exposes
    # ``Pattern`` and ``Match`` (and is removed in Python 3.13), so
    # ``re.findall`` would raise AttributeError. This shadows it with the
    # real regular-expression module for the scope of this function.
    import re

    # Compile once instead of looking the pattern up for every line.
    word_pattern = re.compile(r"\w+")

    with open(in_path, encoding="utf-8") as in_file:
        return [
            word
            for line in in_file
            for word in word_pattern.findall(line)
        ]
|
|
|
|
|
2020-05-05 14:17:49 +02:00
|
|
|
def _load_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file."""
    # NOTE: unpickling can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(path, "rb") as model_file:
        return pickle.load(model_file)


def _save_predictions(model, features, out_path):
    """Run ``model.predict`` on *features* and write the result to *out_path*.

    The predictions are written with ``np.savetxt`` and also returned.
    """
    predictions = model.predict(features)
    with open(out_path, "w", encoding="UTF-8") as output:
        np.savetxt(output, predictions)
    return predictions


def predict():
    """Score the dev-0 and test-A inputs with the pickled models.

    Loads the regression model from ``l_regression.pkl`` and the TF-IDF
    vectoriser from ``tfidf_model.pkl``, tokenises ``dev-0/in.tsv`` and
    ``test-A/in.tsv`` with ``create_dictionary``, vectorises the tokens,
    reduces them to 100 dimensions with ``TruncatedSVD`` and writes the
    predictions to ``dev-0/out.tsv`` and ``test-A/out.tsv``. The dev-0
    predictions are also printed.

    Raises:
        OSError: If a model, input or output file cannot be opened.
    """
    l_regression = _load_pickle("l_regression.pkl")
    tfidf = _load_pickle("tfidf_model.pkl")

    # NOTE(review): create_dictionary returns one flat list of tokens, so the
    # vectoriser treats every token as its own document and the output files
    # get one prediction per token, not per input line. Confirm this matches
    # how the models were trained.
    dev0 = create_dictionary("dev-0/in.tsv")
    testA = create_dictionary("test-A/in.tsv")

    # Use transform(), not fit_transform(): refitting here would throw away
    # the vocabulary and IDF weights stored in the loaded vectoriser.
    # Assumes the pickled vectoriser was already fitted by the training
    # script -- TODO confirm.
    dev0_vector = tfidf.transform(dev0)
    testA_vector = tfidf.transform(testA)

    # NOTE(review): the SVD is still fitted from scratch on each dataset
    # because no persisted SVD model is available here; its components
    # therefore need not match the projection the regression model was
    # trained on. Persist the training-time SVD and load it here to fix.
    pca = TruncatedSVD(n_components=100)
    dev0_pca = pca.fit_transform(dev0_vector)
    testA_pca = pca.fit_transform(testA_vector)

    y_dev = _save_predictions(l_regression, dev0_pca, "dev-0/out.tsv")
    print(y_dev)

    _save_predictions(l_regression, testA_pca, "test-A/out.tsv")
|
|
|
|
|
|
|
|
# Run the prediction pipeline only when this file is executed as a script,
# so importing the module does not read the model files or write outputs.
if __name__ == "__main__":
    predict()
|