GuessRedditDateSumo/predict.py

import pickle
from typing import re

import numpy as np
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def create_dictionary(in_path):
    """Collect every word token from a UTF-8 text file, in reading order.

    A token is a maximal run of ``\\w`` characters (letters, digits and
    underscore, Unicode-aware). Duplicates are kept; tokens from all lines
    are concatenated into one flat list.

    Args:
        in_path: Path of the text file to tokenize.

    Returns:
        list[str]: All tokens in the order they appear; empty if the file
        contains no word characters.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Imported locally on purpose: the module-level ``from typing import re``
    # binds ``re`` to typing's Pattern/Match namespace, which has no
    # ``findall`` (and no longer exists in ``typing`` as of Python 3.13).
    import re

    # Compile once instead of looking the pattern up for every line.
    word_pattern = re.compile(r"[\w]+")

    tfDict = []
    with open(in_path, encoding='utf-8') as in_file:
        for line in in_file:
            tfDict.extend(word_pattern.findall(line))
    return tfDict

def predict():
    """Run the pickled models on dev-0 and test-A and write the predictions.

    Reads, relative to the current working directory:
      * ``l_regression.pkl`` - the pickled regression model,
      * ``tfidf_model.pkl``  - the pickled TF-IDF vectorizer,
      * ``dev-0/in.tsv`` and ``test-A/in.tsv`` - the inputs to score.

    Writes one prediction per row with ``np.savetxt`` to ``dev-0/out.tsv``
    and ``test-A/out.tsv`` (overwriting them) and prints the dev-0
    predictions to stdout.

    Returns:
        None
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # run this against model files produced by a trusted training run.
    with open("l_regression.pkl", 'rb') as model_file:
        l_regression = pickle.load(model_file)
    with open("tfidf_model.pkl", 'rb') as model_file:
        tfidf = pickle.load(model_file)

    # NOTE(review): create_dictionary returns a flat list of tokens, so the
    # vectorizer sees every token as its own document and the model emits one
    # prediction per token, not per input line - confirm this matches how the
    # models were trained.
    dev0 = create_dictionary("dev-0/in.tsv")
    testA = create_dictionary("test-A/in.tsv")

    # transform(), not fit_transform(): re-fitting here would throw away the
    # vocabulary and IDF weights stored in the pickle and learn new, mutually
    # incompatible ones from each evaluation set.
    # Assumes the pickled vectorizer was saved after fitting - TODO confirm.
    dev0_vector = tfidf.transform(dev0)
    testA_vector = tfidf.transform(testA)

    # NOTE(review): the SVD projection is fitted from scratch on each data set,
    # so its components presumably differ from whatever projection the
    # regression was trained on. No persisted projection is loaded by this
    # script; if the training step saved one, it should be loaded and applied
    # with transform() instead.
    svd = TruncatedSVD(n_components=100)
    dev0_pca = svd.fit_transform(dev0_vector)
    testA_pca = svd.fit_transform(testA_vector)

    y_dev = l_regression.predict(dev0_pca)
    print(y_dev)
    dev_predictions = np.array(y_dev)
    print(dev_predictions)
    # Open the output only once predictions exist, so a failure above does not
    # leave a truncated out.tsv behind; ``with`` guarantees flush and close.
    with open("dev-0/out.tsv", "w+", encoding="UTF-8") as output:
        np.savetxt(output, dev_predictions)

    y_test = l_regression.predict(testA_pca)
    test_predictions = np.array(y_test)
    with open("test-A/out.tsv", "w+", encoding="UTF-8") as output:
        np.savetxt(output, test_predictions)

# Entry point: there is no ``__main__`` guard, so the whole prediction
# pipeline runs whenever this module is executed or imported.
predict()
TFIDF proj commit 2020-05-05 14:17:49 +02:00			`import pickle`
TFIDF proj commit 2020-05-05 15:19:10 +02:00			`from typing import re`

TFIDF proj commit 2020-05-05 14:17:49 +02:00			`import numpy as np`
			`from sklearn.decomposition import PCA`
			`from sklearn.feature_extraction.text import TfidfVectorizer`
			`from sklearn.decomposition import TruncatedSVD`

TFIDF proj commit 2020-05-05 15:19:10 +02:00			`def create_dictionary(in_path):`
			`tfDict = []`
			`with open(in_path,encoding='utf-8') as in_file:`
			`for line in in_file:`
			`for word in re.findall(r"[\w]+",line):`
			`tfDict.append(word)`
			`return tfDict`

TFIDF proj commit 2020-05-05 14:17:49 +02:00			`def predict():`
			`input_file = open("l_regression.pkl",'rb')`
			`l_regression = pickle.load(input_file)`
			`input_file = open("tfidf_model.pkl",'rb')`
			`tfidf = pickle.load(input_file)`

			`dev0 = create_dictionary("dev-0/in.tsv")`
			`testA = create_dictionary("test-A/in.tsv")`
			`dev0_vector = tfidf.fit_transform(dev0)`
			`testA_vector = tfidf.fit_transform(testA)`

			`#print(testA_vector)`
TFIDF proj commit 2020-05-05 15:19:10 +02:00			`pca = TruncatedSVD(n_components=100)`
TFIDF proj commit 2020-05-05 14:17:49 +02:00
			`dev0_pca = pca.fit_transform(dev0_vector)`
			`testA_pca = pca.fit_transform(testA_vector)`
			`output= open("dev-0/out.tsv","w+",encoding="UTF-8")`
			`y_dev = l_regression.predict(dev0_pca)`
			`print(y_dev)`
			`foo = np.array(y_dev)`
			`print(foo)`
			`np.savetxt(output,foo)`
			`output = open("test-A/out.tsv", "w+", encoding="UTF-8")`
			`y_test = l_regression.predict(testA_pca)`
			`foo = np.array(y_test)`
			`np.savetxt(output,foo)`

			`predict()`