# retroc2/run.py
#
# 64 lines
# 1.5 KiB
# Python

import pandas as pd
import numpy as np
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
# Load the training set. Column names are supplied explicitly via `names`,
# so pandas treats the first line of the file as data, not as a header.
data = pd.read_csv(
    'train/train.tsv',
    sep='\t',
    names=['Begin', 'End', 'Title', 'Publisher', 'Text'],
)

# The columns read above do not include 'Year', so selecting it directly
# raised KeyError. Derive the regression target from the Begin/End pair.
# NOTE(review): the midpoint of Begin and End is assumed to be the intended
# target value -- confirm against the task definition.
data['Year'] = (data['Begin'] + data['End']) / 2

# Keep only the feature column and the target column.
data = data[['Text', 'Year']]
X = data['Text']
y = data['Year']

# TF-IDF text features feeding a linear regression on the year.
model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(X, y)
# dev-0 split: load inputs and reference values, predict, write predictions.

# Each input line (trailing newline included) is passed to the model as one
# document.
with open('dev-0/in.tsv', 'r', encoding='utf8') as source:
    X_dev0 = list(source)

# Reference values are kept as newline-stripped strings in a Series; the
# visible code does not use them further.
with open('dev-0/expected.tsv', 'r', encoding='utf8') as source:
    y_dev0 = pd.Series(list(source))
y_dev0 = y_dev0.map(lambda line: line.replace('\n', ''))

predictions_dev0 = model.predict(X_dev0)

# One prediction per output line.
with open('dev-0/out.tsv', 'wt') as sink:
    sink.writelines(str(value) + '\n' for value in predictions_dev0)
# dev-1 split: load inputs and reference values, predict, write predictions.

# Each input line (trailing newline included) is passed to the model as one
# document.
with open('dev-1/in.tsv', 'r', encoding='utf8') as source:
    X_dev1 = list(source)

# Reference values are kept as newline-stripped strings in a Series; the
# visible code does not use them further.
with open('dev-1/expected.tsv', 'r', encoding='utf8') as source:
    y_dev1 = pd.Series(list(source))
y_dev1 = y_dev1.map(lambda line: line.replace('\n', ''))

predictions_dev1 = model.predict(X_dev1)

# One prediction per output line.
with open('dev-1/out.tsv', 'wt') as sink:
    sink.writelines(str(value) + '\n' for value in predictions_dev1)
# test-A split: only inputs are read here; predict and write predictions.

# Each input line (trailing newline included) is passed to the model as one
# document.
with open('test-A/in.tsv', 'r', encoding='utf8') as source:
    X_test = list(source)

predictions_test = model.predict(X_test)

# One prediction per output line.
with open('test-A/out.tsv', 'wt') as sink:
    sink.writelines(str(value) + '\n' for value in predictions_test)