# -*- coding: utf-8 -*-
"""reddit_date.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1sEEvGtUrrC2XUk3zSN0D7FmFdQ4TK2de
"""

#!git clone https://git.wmi.amu.edu.pl/dawjur/guess-reddit-date-sumo.git

#!xzcat "guess-reddit-date-sumo/train/in.tsv.xz" | wc -l
import lzma

import numpy as np
import pandas as pd

import sklearn.decomposition
import sklearn.feature_extraction.text
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def read_file_to_list(path):
    """Read an xz-compressed text file into a list of lines."""
    row_list = []
    # Open in text mode so each line is a str rather than bytes.
    with lzma.open(path, mode="rt", encoding="utf-8") as fp:
        for line in fp:
            row_list.append(line.rstrip("\n"))
    return row_list

def load_set(path, is_test):
    """Load one challenge split; test splits have no expected.tsv."""
    dataset = pd.DataFrame(
        read_file_to_list("guess-reddit-date-sumo/" + path + "/in.tsv.xz"),
        columns=["text"],
    )
    if not is_test:
        expected = pd.read_csv(
            "guess-reddit-date-sumo/" + path + "/expected.tsv.xz",
            header=None,
            names=["year"],
        )
        return dataset, expected
    return dataset

train_set, expected_train = load_set("train", False)
dev_set, expected_dev = load_set("dev-0", False)
test_set = load_set("test-A", True)

test_set.info()
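# Optional sanity check (an addition, not in the original notebook): the input
# texts and the expected years should line up row for row.
assert len(train_set) == len(expected_train)
assert len(dev_set) == len(expected_dev)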
# TF-IDF features over unigrams, with English stop words removed.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
)

train_set = train_set.fillna("No text")

train_data = vectorizer.fit_transform(train_set["text"])
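# Optional check (an addition, not in the original notebook): the size of the
# fitted vocabulary gives a feel for how wide the TF-IDF matrix is before SVD.
# print(len(vectorizer.vocabulary_))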
# Reduce dimensionality with TruncatedSVD (LSA on the TF-IDF matrix); the
# variable is named `pca`, but no mean-centering is performed.
pca = sklearn.decomposition.TruncatedSVD(n_components=100)
data = pca.fit_transform(train_data)

data  # notebook cell output: inspect the reduced matrix
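# Optional (an addition, not in the original notebook): TruncatedSVD reports
# how much of the TF-IDF variance the 100 components retain, which is a quick
# way to judge whether n_components is large enough.
# print(pca.explained_variance_ratio_.sum())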
regression = LinearRegression()
regression.fit(data, expected_train)

# Training-set error; sklearn's convention is mean_squared_error(y_true, y_pred)
# (MSE is symmetric, so the value is the same either way).
mean_squared_error(expected_train, regression.predict(data))

dev_set  # notebook cell output: inspect the dev split
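# Optional (an assumption, not from the original notebook): if the challenge is
# scored with RMSE instead of MSE, it is just the square root of the value above.
# print(np.sqrt(mean_squared_error(expected_train, regression.predict(data))))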
def transform_data(raw_data):
    """Apply the TF-IDF vectorizer and SVD fitted on the training split."""
    raw_data = raw_data.fillna("No text")
    vector = vectorizer.transform(raw_data["text"])
    clean_data = pca.transform(vector)
    return clean_data

dev_transformed = transform_data(dev_set)
predict_dev = regression.predict(dev_transformed)
predict_dev  # notebook cell output: dev predictions

test_transformed = transform_data(test_set)
predict_test = regression.predict(test_transformed)
predict_test  # notebook cell output: test predictions

# Held-out (dev-0) error.
mean_squared_error(expected_dev, predict_dev)
np.savetxt('guess-reddit-date-sumo/test-A/out.tsv', predict_test, '%f')
np.savetxt('guess-reddit-date-sumo/dev-0/out.tsv', predict_dev, '%f')
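# Optional post-processing sketch (an assumption, not part of the original
# notebook): a linear model can predict dates outside the range seen in
# training, so the outputs could be clipped to the observed years first, e.g.:
# lo, hi = expected_train["year"].min(), expected_train["year"].max()
# np.savetxt('guess-reddit-date-sumo/dev-0/out.tsv', np.clip(predict_dev, lo, hi), '%f')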