retroc2/run.ipynb

9.9 KiB
Raw Permalink Blame History

import pandas as pd
import numpy as np
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
# Load the training set. The file is tab-separated and the column names are
# supplied here rather than read from the file.
data = pd.read_csv(
    'train/train.tsv',
    sep='\t',
    names=['Begin', 'End', 'Title', 'Publisher', 'Text'],
)
# Regression target: the midpoint of the [Begin, End] interval. Computed
# column-wise; a row-by-row apply() yields the same values but runs a Python
# call per row.
data['Year'] = (data['Begin'] + data['End']) / 2
# Only the text and the target are used from here on.
data = data[['Text', 'Year']]
data
Text Year
0 nowią część kultury. U nas już nikt ich nie ch... 1985.494521
1 hlstorja znana w okresie piramid, jak wlaśclcl... 1926.475342
2 działek. Idąc dalej w swych hipotetycznych roz... 2013.963014
3 w Warszawie o stosunkach domowych dziatwy szko... 1925.500000
4 \\\\'iykład: "Cywilizacyjna Koncepcja dziejów ¥e... 1981.500000
... ... ...
107458 M. (2) na rzecz powoda M. S. kwotę 5003,66 zł ... 2013.058904
107459 Zintegrowanego Systemu Informatycznego (ZSI), ... 2013.023288
107460 prokurator. Wyrokowi temu powołując się na prz... 2013.921918
107461 07 lipca 2010 r. świadczą o tym, że nie wszyst... 2013.083562
107462 zatem niezdolności do pracy było schorzenie sa... 2013.100000

107463 rows × 2 columns

# Features are the raw text column; the target is the 'Year' column.
X, y = data['Text'], data['Year']
# TF-IDF vectorization feeding a linear regression, chained in one pipeline
# so the same text transformation is applied at fit and predict time.
vectorizer = TfidfVectorizer()
regressor = LinearRegression()
model = make_pipeline(vectorizer, regressor)
model.fit(X, y)
Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('linearregression', LinearRegression())])

Dev0

# Read the dev-0 inputs: one document per line, line endings kept.
with open('dev-0/in.tsv', 'r', encoding='utf8') as f:
    X_dev0 = list(f)
20000
# Load the expected years for dev-0, one value per line, and parse them to
# float right away. Leaving them as strings would make the metric below depend
# on scikit-learn implicitly coercing string targets to numbers.
with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:
    y_dev0 = pd.Series([float(line) for line in f], dtype=float)
predictions_dev0 = model.predict(X_dev0)
# Root-mean-squared error between expected and predicted years.
math.sqrt(mean_squared_error(y_dev0, predictions_dev0))
21.66807634196494
# Save the dev-0 predictions, one value per line.
with open('dev-0/out.tsv', 'wt') as f:
    f.writelines(str(value) + '\n' for value in predictions_dev0)

Dev1

# Read the dev-1 inputs: one document per line.
with open('dev-1/in.tsv', 'r', encoding='utf8') as f:
    X_dev1 = f.readlines()
# Load the expected years for dev-1, one value per line, and parse them to
# float right away. Leaving them as strings would make the metric below depend
# on scikit-learn implicitly coercing string targets to numbers.
with open('dev-1/expected.tsv', 'r', encoding='utf8') as f:
    y_dev1 = pd.Series([float(line) for line in f], dtype=float)
predictions_dev1 = model.predict(X_dev1)
# Root-mean-squared error between expected and predicted years.
math.sqrt(mean_squared_error(y_dev1, predictions_dev1))
21.943703116726265
# Save the dev-1 predictions, one value per line.
with open('dev-1/out.tsv', 'wt') as f:
    f.writelines(str(value) + '\n' for value in predictions_dev1)

Test

# Read the test-A inputs (one document per line), predict a year for each,
# and save the predictions one value per line.
with open('test-A/in.tsv', 'r', encoding='utf8') as f:
    X_test = list(f)
predictions_test = model.predict(X_test)
with open('test-A/out.tsv', 'wt') as f:
    f.writelines(str(value) + '\n' for value in predictions_test)