"""Fit a TF-IDF + linear-regression model and write predictions per split.

The model regresses the midpoint of ``start_date`` and ``end_date`` on TF-IDF
features of the ``data`` column of ``train/train.tsv`` (Polish stop words are
removed).  Predictions for ``dev-0``, ``dev-1`` and ``test-A`` are written to
``<split>/out.tsv``, one value per line.
"""
import csv

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from stop_words import get_stop_words


def _read_texts(path):
    """Return the first column of the headerless TSV at *path* as strings.

    Every split is read with the same settings so dev and test inputs are
    parsed identically:

    * ``quoting=csv.QUOTE_NONE`` keeps quote characters inside the text from
      merging several physical lines into one field.
    * ``on_bad_lines="skip"`` replaces ``error_bad_lines=False``, which was
      removed in pandas 2.0.  NOTE: a skipped line yields no prediction, so
      the output then has fewer rows than the input file.
    * ``dtype=str`` plus ``fillna("")`` guarantees the vectorizer only ever
      receives strings (it rejects NaN and numeric documents).
    """
    frame = pd.read_csv(
        path,
        header=None,
        sep="\t",
        quoting=csv.QUOTE_NONE,
        on_bad_lines="skip",
        dtype=str,
    )
    return frame[0].fillna("")


def _predict_and_save(texts, out_path):
    """Predict a value for each text, write them to *out_path*, return them."""
    predictions = linear_reg.predict(vectorizer.transform(texts))
    pd.DataFrame(predictions).to_csv(
        out_path, sep="\t", index=False, header=False
    )
    return predictions


train = pd.read_csv(
    "train/train.tsv",
    names=['start_date', 'end_date', 'title', 'sort_title', 'data'],
    sep="\t",
)

vectorizer = TfidfVectorizer(stop_words=get_stop_words('polish'))
linear_reg = LinearRegression()

# Regression target: midpoint of the [start_date, end_date] interval.
date = (train['start_date'] + train['end_date']) / 2
train_vec = vectorizer.fit_transform(train['data'])
linear_reg.fit(train_vec, date)

dev_0 = _read_texts("dev-0/in.tsv")
pred_dev_0 = _predict_and_save(dev_0, 'dev-0/out.tsv')

dev_1 = _read_texts("dev-1/in.tsv")
pred_dev_1 = _predict_and_save(dev_1, 'dev-1/out.tsv')

test = _read_texts("test-A/in.tsv")
pred_test = _predict_and_save(test, 'test-A/out.tsv')

# Scores noted by the original author:
# ./geval -t dev-0
# 21.8069
# ./geval -t dev-1
# 22.0247