Linear regression TfIdf, dim reduced
This commit is contained in:
parent
a4585a4957
commit
ff6cc29cc4
200000
dev-0/out.tsv
200000
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
1
link_to_collab.txt
Normal file
1
link_to_collab.txt
Normal file
@ -0,0 +1 @@
|
||||
https://colab.research.google.com/drive/1sEEvGtUrrC2XUk3zSN0D7FmFdQ4TK2de
|
88
reddit_date.py
Normal file
88
reddit_date.py
Normal file
@ -0,0 +1,88 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""reddit_date.ipynb
|
||||
|
||||
Automatically generated by Colaboratory.
|
||||
|
||||
Original file is located at
|
||||
https://colab.research.google.com/drive/1sEEvGtUrrC2XUk3zSN0D7FmFdQ4TK2de
|
||||
"""
|
||||
|
||||
#!git clone https://git.wmi.amu.edu.pl/dawjur/guess-reddit-date-sumo.git
|
||||
|
||||
#!xzcat "guess-reddit-date-sumo/train/in.tsv.xz" | wc -l
|
||||
|
||||
import sys
|
||||
import numpy as np
|
||||
|
||||
from sklearn.linear_model import LinearRegression
|
||||
import sklearn.metrics
|
||||
import sklearn.decomposition
|
||||
import sklearn.feature_extraction.text
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
import csv
|
||||
import datetime
|
||||
import lzma
|
||||
import pandas as pd
|
||||
from sklearn.metrics import mean_squared_error
|
||||
|
||||
def read_file_to_list(path):
    """Read an xz-compressed file and return its lines as a list of bytes.

    Each element keeps its trailing newline (``readline`` semantics); lines
    are ``bytes`` because the file is opened in binary mode.

    :param path: path to an ``.xz`` file.
    :return: list of raw (bytes) lines.
    """
    # Iterating the file object yields exactly the same lines the old
    # readline()-until-empty loop collected, at C speed and in one line.
    with lzma.open(path) as fp:
        return list(fp)
|
||||
|
||||
def load_set(path, isTest):
    """Load one data split from the guess-reddit-date-sumo checkout.

    :param path: split directory name ("train", "dev-0", "test-A").
    :param isTest: True for the unlabeled test split.
    :return: ``dataset`` DataFrame for the test split, otherwise the
        ``(dataset, expected)`` pair where ``expected`` holds the year labels.
    """
    base = "guess-reddit-date-sumo/" + path
    dataset = pd.DataFrame(read_file_to_list(base + "/in.tsv.xz"), columns=["text"])
    if isTest:
        # No labels exist for the test split.
        return dataset
    expected = pd.read_csv(base + "/expected.tsv.xz", header=None, names=["year"])
    return dataset, expected
|
||||
|
||||
# --- Load splits: train/dev come with expected years, test-A is unlabeled. ---
train_set, expected_train = load_set("train", False)

dev_set, expected_dev = load_set("dev-0", False)

test_set = load_set("test-A", True)

test_set.info()  # notebook leftover: prints a DataFrame summary to stdout

# TF-IDF bag-of-words over the raw post text: unigrams only, English stop
# words removed.
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
)

# Replace missing rows so the vectorizer never sees NaN.
train_set = train_set.fillna("No text")

train_data = vectorizer.fit_transform(train_set["text"])

# Reduce the sparse TF-IDF matrix to 100 dense components (LSA via
# truncated SVD; named "pca" but it is TruncatedSVD, which works on
# sparse input without centering).
pca = sklearn.decomposition.TruncatedSVD(n_components=100)

data = pca.fit_transform(train_data)

data  # notebook cell-output leftover; no effect when run as a script

# Ordinary least-squares regression from the 100 SVD components to the year.
regression = LinearRegression()

regression.fit(data,expected_train)

# Training-set MSE; the value is displayed in a notebook but discarded here.
mean_squared_error(regression.predict(data),expected_train)

dev_set  # notebook cell-output leftover; no effect when run as a script
|
||||
|
||||
def transform_data(raw_data):
    """Project a raw text DataFrame into the trained 100-dim SVD space.

    Uses the module-level ``vectorizer`` and ``pca`` that were fitted on the
    training split, so it must run after the training pipeline.

    :param raw_data: DataFrame with a "text" column (NaNs allowed).
    :return: dense array of shape (n_rows, 100).
    """
    filled = raw_data.fillna("No text")
    tfidf = vectorizer.transform(filled["text"])
    return pca.transform(tfidf)
|
||||
|
||||
# --- Predict the year for the dev split and inspect the raw predictions. ---
dev_transformed = transform_data(dev_set)

predict_dev = regression.predict(dev_transformed)

predict_dev  # notebook cell-output leftover; no effect when run as a script

# --- Same for the unlabeled test split. ---
test_transformed = transform_data(test_set)

predict_test = regression.predict(test_transformed)

predict_test  # notebook cell-output leftover; no effect when run as a script

# Dev-set MSE; displayed in a notebook, discarded when run as a script.
mean_squared_error(predict_dev,expected_dev)

# Write one float prediction per line in the challenge's expected out.tsv format.
np.savetxt('guess-reddit-date-sumo/test-A/out.tsv', predict_test, '%f')

np.savetxt('guess-reddit-date-sumo/dev-0/out.tsv', predict_dev, '%f')
|
200000
test-A/out.tsv
200000
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user