64 lines
1.5 KiB
Python
64 lines
1.5 KiB
Python
|
import pandas as pd
|
||
|
import numpy as np
|
||
|
import math
|
||
|
|
||
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||
|
from sklearn.linear_model import LinearRegression
|
||
|
from sklearn.pipeline import make_pipeline
|
||
|
from sklearn.metrics import mean_squared_error
|
||
|
|
||
|
|
||
|
# --- Training -------------------------------------------------------------
# Load the training set. The file has no header row, so column names are
# supplied explicitly.
data = pd.read_csv(
    'train/train.tsv',
    sep='\t',
    names=['Begin', 'End', 'Title', 'Publisher', 'Text'],
)

# The file provides no 'Year' column, so selecting it directly raises
# KeyError. Derive the regression target from the two date columns instead.
# NOTE(review): assumes 'Begin' and 'End' are numeric years bounding the
# document's date, and that their midpoint is the intended target — confirm
# against the dataset description.
data['Year'] = (data['Begin'] + data['End']) / 2

# Keep only the model input (text) and the target (year).
data = data[['Text', 'Year']]

X = data['Text']
y = data['Year']

# TF-IDF features over the raw text feed an ordinary least-squares regressor.
model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(X, y)
|
||
|
|
||
|
|
||
|
|
||
|
# --- Prediction -----------------------------------------------------------
# The three data splits go through the same read -> predict -> write steps,
# so the file handling lives in small private helpers instead of being
# copy-pasted per split.


def _read_lines(path):
    """Return all lines of the UTF-8 text file at *path*.

    Each line keeps its trailing newline, exactly as ``readlines()`` yields it.
    """
    with open(path, 'r', encoding='utf8') as f:
        return f.readlines()


def _read_expected(path):
    """Return the lines of *path* as a ``pd.Series`` of strings.

    Newline characters are removed from every entry; the values are not
    converted to numbers.
    """
    return pd.Series(_read_lines(path)).apply(
        lambda row: row.replace('\n', '')
    )


def _write_predictions(path, predictions):
    """Write one prediction per line to *path*, each formatted with ``str()``."""
    # The encoding is given explicitly, matching the input reads, so the
    # output does not depend on the platform default.
    with open(path, 'wt', encoding='utf8') as f:
        f.writelines(str(pred) + '\n' for pred in predictions)


# dev-0: predict and store the results. The expected values are loaded as
# strings and are not used further in this section.
X_dev0 = _read_lines('dev-0/in.tsv')
y_dev0 = _read_expected('dev-0/expected.tsv')
predictions_dev0 = model.predict(X_dev0)
_write_predictions('dev-0/out.tsv', predictions_dev0)

# dev-1: same procedure as dev-0.
X_dev1 = _read_lines('dev-1/in.tsv')
y_dev1 = _read_expected('dev-1/expected.tsv')
predictions_dev1 = model.predict(X_dev1)
_write_predictions('dev-1/out.tsv', predictions_dev1)

# test-A: only inputs are read here; no expected file is loaded.
X_test = _read_lines('test-A/in.tsv')
predictions_test = model.predict(X_test)
_write_predictions('test-A/out.tsv', predictions_test)
|
||
|
|