GuessRedditDateSumo/predict.py

43 lines
1.3 KiB
Python
Raw Normal View History

2020-05-05 16:02:39 +02:00
import csv
2020-05-05 14:17:49 +02:00
import pickle
2020-05-05 15:19:10 +02:00
from typing import re
2020-05-05 14:17:49 +02:00
import numpy as np
2020-05-05 16:02:39 +02:00
import pandas as pd
2020-05-05 14:17:49 +02:00
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
def predict():
    """Write model predictions for the dev-0 and test-A inputs.

    Loads the pickled regression model (``l_regression.pkl``) and the pickled
    TF-IDF vectorizer (``tfidf_model.pkl``) from the current working
    directory, vectorizes the single text column of ``dev-0/in.tsv`` and
    ``test-A/in.tsv``, reduces each matrix to 100 components with
    ``TruncatedSVD``, and saves the regression predictions with
    ``np.savetxt`` to ``dev-0/out.tsv`` and ``test-A/out.tsv``.

    The function takes no arguments and returns ``None``; its results are the
    two output files plus some diagnostic printing to stdout.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load model files that come from a trusted training run.
    with open("l_regression.pkl", "rb") as model_file:
        l_regression = pickle.load(model_file)
    with open("tfidf_model.pkl", "rb") as vectorizer_file:
        tfidf = pickle.load(vectorizer_file)

    # QUOTE_NONE keeps quote characters in the text as literal data instead
    # of letting the CSV parser treat them as field delimiters.
    dev0 = pd.read_csv("dev-0/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
    testA = pd.read_csv("test-A/in.tsv", delimiter="\t", header=None, names=["txt"], quoting=csv.QUOTE_NONE)
    devtxt = dev0["txt"]
    testAtxt = testA["txt"]
    print(testAtxt)

    # Use transform(), not fit_transform(): re-fitting the loaded vectorizer
    # would learn a new vocabulary and new IDF weights from each evaluation
    # set, so the columns would no longer correspond to the features the
    # regression was trained on.
    # (assumes the pickled vectorizer was fitted during training -- confirm)
    dev0_vector = tfidf.transform(devtxt)
    testA_vector = tfidf.transform(testAtxt)

    # NOTE(review): the SVD is still fitted from scratch on each dataset, so
    # its components are not the ones used when the regression was trained.
    # If the training script persists its fitted TruncatedSVD, load it here
    # and call transform() instead -- no such file is visible from this code.
    pca = TruncatedSVD(n_components=100)
    dev0_pca = pca.fit_transform(dev0_vector)
    testA_pca = pca.fit_transform(testA_vector)

    y_dev = l_regression.predict(dev0_pca)
    print(y_dev)
    foo = np.array(y_dev)
    print(foo)
    # Context managers make sure the output is flushed and the handle closed.
    with open("dev-0/out.tsv", "w+", encoding="UTF-8") as output:
        np.savetxt(output, foo)

    y_test = l_regression.predict(testA_pca)
    foo = np.array(y_test)
    with open("test-A/out.tsv", "w+", encoding="UTF-8") as output:
        np.savetxt(output, foo)
# Run the prediction pipeline; this executes whenever the module is run or imported.
predict()