"""Fit a TF-IDF + linear-regression pipeline and write per-line predictions.

The model is trained on ``train/train.tsv`` and then applied to the
``in.tsv`` file of the ``dev-0``, ``dev-1`` and ``test-A`` directories; one
prediction per input line is written to the matching ``out.tsv``.  For the
two dev splits the RMSE against ``expected.tsv`` is printed.
"""

import math

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

TRAIN_PATH = 'train/train.tsv'
TRAIN_COLUMNS = ['Begin', 'End', 'Title', 'Publisher', 'Text']
# Splits that ship an expected.tsv and can therefore be scored.
EVAL_SPLITS = ('dev-0', 'dev-1')
# Split for which only predictions are produced.
TEST_SPLIT = 'test-A'


def _read_lines(path):
    """Return the lines of the UTF-8 text file *path*, newlines included."""
    with open(path, 'r', encoding='utf8') as f:
        return f.readlines()


def _load_training_data(path=TRAIN_PATH):
    """Read the training TSV and return ``(texts, years)`` as pandas Series."""
    data = pd.read_csv(path, sep='\t', names=TRAIN_COLUMNS)
    # The frame is read with TRAIN_COLUMNS only, so the 'Year' column that
    # was previously selected never existed and the lookup raised KeyError.
    # NOTE(review): the midpoint of Begin and End is used as the regression
    # target; this assumes both columns are numeric dates -- confirm it
    # matches the values stored in expected.tsv.
    data['Year'] = (data['Begin'] + data['End']) / 2
    data = data[['Text', 'Year']]
    return data['Text'], data['Year']


def _predict_split(model, split):
    """Predict every line of ``<split>/in.tsv`` and write ``<split>/out.tsv``.

    Returns the array of predictions so the caller can score it.
    """
    documents = _read_lines(f'{split}/in.tsv')
    predictions = model.predict(documents)
    with open(f'{split}/out.tsv', 'wt', encoding='utf8') as f:
        f.writelines(str(pred) + '\n' for pred in predictions)
    return predictions


def _report_rmse(split, predictions):
    """Print the RMSE of *predictions* against ``<split>/expected.tsv``."""
    # Blank lines are skipped so a trailing empty line does not break float().
    expected = [
        float(line)
        for line in _read_lines(f'{split}/expected.tsv')
        if line.strip()
    ]
    rmse = math.sqrt(mean_squared_error(expected, predictions))
    print(f'{split} RMSE: {rmse:.4f}')


def main():
    """Train the model, write predictions for every split, then score dev."""
    X, y = _load_training_data()

    model = make_pipeline(TfidfVectorizer(), LinearRegression())
    model.fit(X, y)

    # Write every out.tsv first so a problem while scoring cannot prevent
    # the prediction files from being produced.
    predictions = {
        split: _predict_split(model, split)
        for split in (*EVAL_SPLITS, TEST_SPLIT)
    }

    for split in EVAL_SPLITS:
        _report_rmse(split, predictions[split])


if __name__ == '__main__':
    main()