2020-05-05 16:02:39 +02:00
|
|
|
import csv
|
2020-05-05 14:17:49 +02:00
|
|
|
import pickle
|
2020-05-05 15:19:10 +02:00
|
|
|
from typing import re
|
|
|
|
|
2020-05-05 17:30:04 +02:00
|
|
|
import numpy
|
2020-05-05 14:17:49 +02:00
|
|
|
import numpy as np
|
2020-05-05 16:02:39 +02:00
|
|
|
import pandas as pd
|
2020-05-05 14:17:49 +02:00
|
|
|
from sklearn.decomposition import PCA
|
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
from sklearn.decomposition import TruncatedSVD
|
|
|
|
|
|
|
|
def predict():
    """Score the dev-0 and test-A splits with the pickled models.

    Loads a regression model from ``l_regression.pkl`` and a TF-IDF
    vectorizer from ``tfidf_model.pkl``, reads the single text column of
    ``dev-0/in.tsv`` and ``test-A/in.tsv``, vectorizes the texts, reduces
    them to 120 components with ``TruncatedSVD`` and writes the predictions
    to ``dev-0/out.tsv`` and ``test-A/out.tsv``.

    Takes no arguments and returns nothing; all paths are relative to the
    current working directory.
    """
    # NOTE(security): unpickling executes code stored in the file, so these
    # model files must come from a trusted source.
    # Context managers make sure both file handles are closed after loading.
    with open("l_regression.pkl", 'rb') as model_file:
        l_regression = pickle.load(model_file)
    with open("tfidf_model.pkl", 'rb') as model_file:
        tfidf = pickle.load(model_file, encoding='UTF-8')

    # QUOTE_NONE keeps quote characters as ordinary text, and
    # skip_blank_lines=False keeps blank input lines as rows.
    # NOTE(review): error_bad_lines was deprecated in pandas 1.3 and removed
    # in 2.0 (replaced by on_bad_lines="skip"); update this once the pandas
    # version used by the project is confirmed.
    dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False, skip_blank_lines=False)
    testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE, error_bad_lines=False, skip_blank_lines=False)

    # Use the loaded vectorizer as-is: transform() keeps the vocabulary and
    # IDF weights stored in the pickle, whereas fit_transform() would discard
    # them and learn new ones from the data being scored.
    # np.str_ coerces every cell (including missing values) to a string.
    dev0_vector = tfidf.transform(dev0['txt'].apply(np.str_))
    testA_vector = tfidf.transform(testA['txt'].apply(np.str_))

    # NOTE(review): this SVD is fitted from scratch on each split, so the
    # reduced features of dev-0 and test-A do not share a basis with each
    # other, nor (presumably) with whatever the regression was trained on.
    # If a fitted SVD was saved at training time, load it and call
    # transform() here instead.
    pca = TruncatedSVD(n_components=120)
    dev0_pca = pca.fit_transform(dev0_vector)
    testA_pca = pca.fit_transform(testA_vector)

    y_dev = l_regression.predict(dev0_pca)
    y_test = l_regression.predict(testA_pca)

    numpy.savetxt('dev-0/out.tsv', y_dev)
    numpy.savetxt('test-A/out.tsv', y_test)
|
2020-05-05 14:17:49 +02:00
|
|
|
|
|
|
|
# Script entry point: runs the whole prediction pipeline. There is no
# __main__ guard, so this also executes whenever the module is imported.
predict()
|