9.9 KiB
9.9 KiB
import csv
import math

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
# Load the training set: a headerless TSV with five columns per row.
# quoting=csv.QUOTE_NONE makes pandas treat double quotes as ordinary
# characters. The Text column is free-form text, and with the default quoting
# a field that starts with '"' would absorb everything up to the next quote,
# possibly merging several rows into one. The dev/test inputs are read as raw
# lines, so this also keeps the training data consistent with them.
data = pd.read_csv(
    'train/train.tsv',
    sep='\t',
    names=['Begin', 'End', 'Title', 'Publisher', 'Text'],
    quoting=csv.QUOTE_NONE,
)

# Regression target: the midpoint of the Begin/End interval, computed as a
# vectorized column operation instead of a Python-level row-wise apply.
# NOTE(review): Begin/End look like fractional years - confirm against the
# dataset description.
data['Year'] = (data['Begin'] + data['End']) / 2

# Only the text and the target are used from here on.
data = data[['Text', 'Year']]
data
Text | Year | |
---|---|---|
0 | nowią część kultury. U nas już nikt ich nie ch... | 1985.494521 |
1 | hlstorja znana w okresie piramid, jak wlaśclcl... | 1926.475342 |
2 | działek. Idąc dalej w swych hipotetycznych roz... | 2013.963014 |
3 | w Warszawie o stosunkach domowych dziatwy szko... | 1925.500000 |
4 | \\\\'iykład: "Cywilizacyjna Koncepcja dziejów ¥e... | 1981.500000 |
... | ... | ... |
107458 | M. (2) na rzecz powoda M. S. kwotę 5003,66 zł ... | 2013.058904 |
107459 | Zintegrowanego Systemu Informatycznego (ZSI), ... | 2013.023288 |
107460 | prokurator. Wyrokowi temu powołując się na prz... | 2013.921918 |
107461 | 07 lipca 2010 r. świadczą o tym, że nie wszyst... | 2013.083562 |
107462 | zatem niezdolności do pracy było schorzenie sa... | 2013.100000 |
107463 rows × 2 columns
# Inputs are the text fragments; the target is the Year column of `data`.
X, y = data['Text'], data['Year']

# TF-IDF features feed straight into a linear regression, both with their
# default settings.
model = make_pipeline(
    TfidfVectorizer(),
    LinearRegression(),
)
model.fit(X, y)
Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()), ('linearregression', LinearRegression())])
Dev0
# Read the dev-0 inputs, one document per line (trailing newlines are kept).
with open('dev-0/in.tsv', 'r', encoding='utf8') as f:
    X_dev0 = list(f)
20000
# Read the dev-0 gold labels, one value per line, and parse them to floats
# right away. Previously the labels stayed as strings (only the newline was
# stripped) and the metric call had to coerce them implicitly; float() also
# ignores the trailing newline, so no separate strip step is needed.
with open('dev-0/expected.tsv', 'r', encoding='utf8') as f:
    y_dev0 = pd.Series([float(line) for line in f])

predictions_dev0 = model.predict(X_dev0)

# Root-mean-squared error of the predictions on dev-0.
math.sqrt(mean_squared_error(y_dev0, predictions_dev0))
21.66807634196494
# Write the dev-0 predictions, one value per line.
with open('dev-0/out.tsv', 'wt') as f:
    f.writelines(str(pred) + '\n' for pred in predictions_dev0)
Dev1
# Read the dev-1 inputs, one document per line (trailing newlines are kept).
with open('dev-1/in.tsv', 'r', encoding='utf8') as f:
    X_dev1 = list(f)
# Read the dev-1 gold labels, one value per line, and parse them to floats
# right away. Previously the labels stayed as strings (only the newline was
# stripped) and the metric call had to coerce them implicitly; float() also
# ignores the trailing newline, so no separate strip step is needed.
with open('dev-1/expected.tsv', 'r', encoding='utf8') as f:
    y_dev1 = pd.Series([float(line) for line in f])

predictions_dev1 = model.predict(X_dev1)

# Root-mean-squared error of the predictions on dev-1.
math.sqrt(mean_squared_error(y_dev1, predictions_dev1))
21.943703116726265
# Write the dev-1 predictions, one value per line.
with open('dev-1/out.tsv', 'wt') as f:
    f.writelines(str(pred) + '\n' for pred in predictions_dev1)
Test
# Read the test-A inputs, one document per line (trailing newlines are kept),
# and predict with the fitted pipeline. No expected file is read for this
# split, so no score is computed here.
with open('test-A/in.tsv', 'r', encoding='utf8') as f:
    X_test = list(f)

predictions_test = model.predict(X_test)
# Write the test-A predictions, one value per line.
with open('test-A/out.tsv', 'wt') as f:
    f.writelines(str(pred) + '\n' for pred in predictions_test)