guess-reddit-date-sumo/reddit_date.py
2020-04-19 11:13:11 +02:00

88 lines
2.3 KiB
Python

# -*- coding: utf-8 -*-
"""reddit_date.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1sEEvGtUrrC2XUk3zSN0D7FmFdQ4TK2de
"""
#!git clone https://git.wmi.amu.edu.pl/dawjur/guess-reddit-date-sumo.git
#!xzcat "guess-reddit-date-sumo/train/in.tsv.xz" | wc -l
import sys
import numpy as np
from sklearn.linear_model import LinearRegression
import sklearn.metrics
import sklearn.decomposition
import sklearn.feature_extraction.text
from sklearn.feature_extraction.text import CountVectorizer
import csv
import datetime
import lzma
import pandas as pd
from sklearn.metrics import mean_squared_error
def read_file_to_list(path):
    """Read an xz/LZMA-compressed file and return its lines.

    The file is opened with ``lzma.open`` in its default (binary) mode, so
    each element of the result is a ``bytes`` object that still carries its
    trailing newline when the line had one.

    Args:
        path: Location of the compressed file; handed directly to
            ``lzma.open``.

    Returns:
        A list with the decompressed lines in file order. An empty file
        yields an empty list.
    """
    with lzma.open(path) as fp:
        # readlines() walks the stream line by line exactly like repeated
        # readline() calls, without the manual end-of-file check.
        return fp.readlines()
def load_set(path, isTest, base_dir="guess-reddit-date-sumo"):
    """Load one split of the dataset from ``<base_dir>/<path>``.

    Args:
        path: Name of the split sub-directory inside ``base_dir``.
        isTest: When true, only ``in.tsv.xz`` is read and no expected
            values are loaded.
        base_dir: Root directory of the dataset checkout. Defaults to the
            directory name that used to be hard-coded here.

    Returns:
        When ``isTest`` is true: a DataFrame with a single ``"text"`` column
        holding the lines returned by ``read_file_to_list`` for
        ``in.tsv.xz``.

        Otherwise: a ``(dataset, expected)`` tuple, where ``dataset`` is the
        DataFrame described above and ``expected`` is a DataFrame with a
        single ``"year"`` column read from ``expected.tsv.xz``.
    """
    split_dir = base_dir + "/" + path
    dataset = pd.DataFrame(
        read_file_to_list(split_dir + "/in.tsv.xz"),
        columns=["text"],
    )
    if isTest:
        return dataset
    # The file has no header row, so the column name is supplied explicitly.
    expected = pd.read_csv(
        split_dir + "/expected.tsv.xz",
        header=None,
        names=["year"],
    )
    return dataset, expected
# Load the three splits; only "train" and "dev-0" are loaded with expected values.
train_set, expected_train = load_set("train", False)
dev_set, expected_dev = load_set("dev-0", False)
test_set = load_set("test-A", True)
test_set.info()

# TF-IDF over unigrams with English stop words removed.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
)
train_set = train_set.fillna("No text")
train_data = vectorizer.fit_transform(train_set["text"])

# Reduce the TF-IDF matrix to 100 components.
# NOTE: the name ``pca`` is historical; the estimator is TruncatedSVD.
pca = sklearn.decomposition.TruncatedSVD(n_components=100)
data = pca.fit_transform(train_data)

regression = LinearRegression()
regression.fit(data, expected_train)

# mean_squared_error expects (y_true, y_pred). The value is printed because a
# bare expression is evaluated and thrown away when this runs as a script.
train_mse = mean_squared_error(expected_train, regression.predict(data))
print("train MSE:", train_mse)
def transform_data(raw_data):
    """Convert a frame of raw text into features for the regression model.

    Missing values are replaced with the string "No text", the "text" column
    is vectorised with the module-level ``vectorizer``, and the result is
    projected with the module-level ``pca``. Both objects are only
    transformed with here, never re-fitted.

    Args:
        raw_data: Object providing ``fillna`` and a "text" column (the
            script passes the DataFrames returned by ``load_set``).

    Returns:
        Whatever ``pca.transform`` returns for the vectorised text.
    """
    texts = raw_data.fillna("No text")["text"]
    return pca.transform(vectorizer.transform(texts))
# Run the dev and test splits through the fitted pipeline and predict.
dev_transformed = transform_data(dev_set)
predict_dev = regression.predict(dev_transformed)
test_transformed = transform_data(test_set)
predict_test = regression.predict(test_transformed)

# mean_squared_error expects (y_true, y_pred). The value is printed because a
# bare expression is evaluated and thrown away when this runs as a script.
dev_mse = mean_squared_error(expected_dev, predict_dev)
print("dev-0 MSE:", dev_mse)

# Write one prediction per line in fixed-point notation.
np.savetxt('guess-reddit-date-sumo/test-A/out.tsv', predict_test, fmt='%f')
np.savetxt('guess-reddit-date-sumo/dev-0/out.tsv', predict_dev, fmt='%f')