2022-05-17 23:35:15 +02:00
|
|
|
import os
|
|
|
|
import sklearn
|
|
|
|
import pandas as pd
|
|
|
|
from gzip import open as open_gz
|
|
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
from sklearn.pipeline import make_pipeline
|
|
|
|
from sklearn.linear_model import LinearRegression
|
|
|
|
from sklearn.metrics import mean_squared_error
|
|
|
|
|
|
|
|
def predict_year(x, path_out, model):
    """Predict a value for every item in ``x`` and write one per line.

    Args:
        x: Inputs passed unchanged to ``model.predict``.
        path_out: Path of the text file to create or overwrite.
        model: Object exposing ``predict(x)`` that returns an iterable of
            predictions.

    Returns:
        None. The predictions are only written to ``path_out``.
    """
    # Predict before opening the file so that a failing model does not leave
    # an empty output file behind.
    results = model.predict(x)
    # An explicit encoding keeps the output independent of the platform
    # locale and matches how read_file opens its input.
    with open(path_out, 'wt', encoding='utf-8') as file:
        file.writelines(str(r) + '\n' for r in results)
|
|
|
|
|
|
|
|
def read_file(filename):
    """Return the stripped first tab-separated field of every line in a file.

    Args:
        filename: Path of a UTF-8 encoded text file.

    Returns:
        A list with one string per line of the file, in file order. Each
        entry is the text before the first tab on that line with
        surrounding whitespace removed.
    """
    with open(filename, 'r', encoding="utf-8") as handle:
        return [row.split("\t")[0].strip() for row in handle]
|
|
|
|
|
|
|
|
# Read the training table. Column names are passed explicitly, so the first
# line of the file is parsed as data rather than as a header.
# NOTE(review): default CSV quoting is in effect; confirm that is intended if
# the Text column can contain raw double-quote characters.
with open('train/train.tsv', 'r', encoding='utf8') as train_file:
    train = pd.read_csv(
        train_file,
        sep='\t',
        names=['Start', 'End', 'Title', 'Author', 'Text'],
    )

# Only the first 12000 rows are used for fitting.
train = train[0:12000]

# Features are the raw texts; the regression target is the Start column.
# (A variant that targeted the midpoint of Start and End is not active.)
train_x, train_y = train['Text'], train['Start']

# TF-IDF features feeding a linear regression, fitted as a single pipeline.
model = make_pipeline(
    TfidfVectorizer(),
    LinearRegression(),
)
model.fit(train_x, train_y)
|
|
|
|
|
|
|
|
# Score each evaluation split in turn: read its inputs, then write the
# predictions next to them. The order (dev-0, dev-1, test-A) is kept, and each
# split is fully written before the next one is read.
for in_path, out_path in (
    ('dev-0/in.tsv', 'dev-0/out.tsv'),
    ('dev-1/in.tsv', 'dev-1/out.tsv'),
    ('test-A/in.tsv', 'test-A/out.tsv'),
):
    predict_year(read_file(in_path), out_path, model)
|
|
|
|
|