Linear regression on TF-IDF features, dimensionality reduced
parent a4585a4957
commit ff6cc29cc4
200000	dev-0/out.tsv
File diff suppressed because it is too large
1	link_to_collab.txt	Normal file
@@ -0,0 +1 @@
https://colab.research.google.com/drive/1sEEvGtUrrC2XUk3zSN0D7FmFdQ4TK2de
88	reddit_date.py	Normal file
@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
"""reddit_date.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1sEEvGtUrrC2XUk3zSN0D7FmFdQ4TK2de
"""

#!git clone https://git.wmi.amu.edu.pl/dawjur/guess-reddit-date-sumo.git

#!xzcat "guess-reddit-date-sumo/train/in.tsv.xz" | wc -l

import lzma

import numpy as np
import pandas as pd
import sklearn.decomposition
import sklearn.feature_extraction.text
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def read_file_to_list(path):
    # Read an xz-compressed file line by line in text mode, so rows are str
    # rather than bytes (TfidfVectorizer expects str input).
    row_list = []
    with lzma.open(path, mode="rt", encoding="utf-8") as fp:
        for line in fp:
            row_list.append(line.rstrip("\n"))
    return row_list

def load_set(path, isTest):
    # in.tsv.xz holds one post per line; expected.tsv.xz holds the numeric
    # target (year) for each post. pandas decompresses .xz transparently.
    dataset = pd.DataFrame(
        read_file_to_list("guess-reddit-date-sumo/" + path + "/in.tsv.xz"),
        columns=["text"],
    )
    if not isTest:
        expected = pd.read_csv(
            "guess-reddit-date-sumo/" + path + "/expected.tsv.xz",
            header=None,
            names=["year"],
        )
        return dataset, expected
    return dataset

train_set, expected_train = load_set("train", False)
dev_set, expected_dev = load_set("dev-0", False)
test_set = load_set("test-A", True)

test_set.info()

# Unigram TF-IDF features over the raw post text, with English stop words removed.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
)
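
# Possible knobs to experiment with here (hypothetical settings, not what this
# run used): min_df to drop very rare terms and shrink the vocabulary, and
# sublinear_tf=True to damp frequent terms, e.g.
#   TfidfVectorizer(stop_words='english', min_df=5, sublinear_tf=True)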

# Guard against empty rows before vectorizing.
train_set = train_set.fillna("No text")

train_data = vectorizer.fit_transform(train_set["text"])

# Reduce the sparse TF-IDF matrix to 100 dense components. TruncatedSVD (LSA)
# works on sparse input directly, unlike PCA, which would require densifying.
pca = sklearn.decomposition.TruncatedSVD(n_components=100)
data = pca.fit_transform(train_data)
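
# An optional check one might add: how much variance the 100 components keep.
#   print(pca.explained_variance_ratio_.sum())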

regression = LinearRegression()
regression.fit(data, expected_train)

# In-sample (training) error; optimistic by construction.
print(mean_squared_error(expected_train, regression.predict(data)))
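
# With only 100 dense features, plain least squares is workable; if the fit
# proved noisy, one might swap in ridge regression instead (a hypothetical
# alternative, not what this notebook does):
#   from sklearn.linear_model import Ridge
#   regression = Ridge(alpha=1.0)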

def transform_data(raw_data):
    # Use transform, not fit_transform: dev/test must be projected into the
    # feature space fitted on the training set.
    raw_data = raw_data.fillna("No text")
    vector = vectorizer.transform(raw_data["text"])
    clean_data = pca.transform(vector)
    return clean_data
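
# The same flow could be expressed as one scikit-learn Pipeline (a sketch with
# fresh, unfitted estimators; this notebook fits each step by hand instead):
#   from sklearn.pipeline import make_pipeline
#   model = make_pipeline(
#       sklearn.feature_extraction.text.TfidfVectorizer(stop_words='english'),
#       sklearn.decomposition.TruncatedSVD(n_components=100),
#       LinearRegression(),
#   )
#   model.fit(train_set["text"], expected_train)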

dev_transformed = transform_data(dev_set)
predict_dev = regression.predict(dev_transformed)

test_transformed = transform_data(test_set)
predict_test = regression.predict(test_transformed)

# Held-out (dev-0) error.
print(mean_squared_error(expected_dev, predict_dev))
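
# RMSE is in the same units as the target, so it can be easier to interpret:
#   print(mean_squared_error(expected_dev, predict_dev) ** 0.5)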

# Save predictions, one value per line, as the task's out.tsv files.
np.savetxt('guess-reddit-date-sumo/test-A/out.tsv', predict_test, '%f')
np.savetxt('guess-reddit-date-sumo/dev-0/out.tsv', predict_dev, '%f')
200000	test-A/out.tsv
File diff suppressed because it is too large