"""Train a TF-IDF + linear-regression model on text and write predictions.

Reads ``train/train.tsv``, fits a pipeline on the ``Text`` column against the
``Start`` column, and writes one prediction per line to ``out.tsv`` in each of
``dev-0``, ``dev-1`` and ``test-A``.
"""
import os
from gzip import open as open_gz

import pandas as pd
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline

# Only the first TRAIN_ROW_LIMIT rows of the training file are used.
# NOTE(review): presumably this bounds fit time/memory - confirm.
TRAIN_ROW_LIMIT = 12000


def predict_year(x, path_out, model):
    """Predict a value for every document in *x* and write them to *path_out*.

    Args:
        x: Iterable of documents accepted by ``model.predict``.
        path_out: Path of the output file; it is overwritten.
        model: Fitted estimator exposing ``predict``.

    The output holds one ``str(prediction)`` per line, in input order.
    """
    predictions = model.predict(x)
    # Explicit encoding keeps the output independent of the platform default,
    # matching how the input files are read.
    with open(path_out, 'wt', encoding='utf-8') as out:
        out.writelines(str(value) + '\n' for value in predictions)


def read_file(filename):
    """Return the first tab-separated field of every line in *filename*.

    Each field is stripped of surrounding whitespace. Lines are returned in
    file order, and blank lines yield empty strings.
    """
    with open(filename, 'r', encoding="utf-8") as file:
        return [line.split("\t", 1)[0].strip() for line in file]


# NOTE(review): read_csv uses its default quoting here; if the Text column can
# contain raw double quotes, confirm whether quoting=csv.QUOTE_NONE is needed.
with open('train/train.tsv', 'r', encoding='utf8') as train_file:
    train = pd.read_csv(
        train_file,
        sep='\t',
        names=['Start', 'End', 'Title', 'Author', 'Text'],
    )

train = train[0:TRAIN_ROW_LIMIT]
train_x = train['Text']
# The target is the 'Start' column. A disabled alternative in the original
# used the midpoint of the two columns:
# train['Date'] = (train['Start'].astype(float) + train['End'].astype(float))/2
train_y = train['Start']

model = make_pipeline(TfidfVectorizer(), LinearRegression())
model.fit(train_x, train_y)

# Same splits, same order and same paths as the original three stanzas.
for split in ('dev-0', 'dev-1', 'test-A'):
    predict_year(read_file(f'{split}/in.tsv'), f'{split}/out.tsv', model)