"""Train a TF-IDF + linear-regression date model and write test predictions.

The script fits a ``TfidfVectorizer`` (with Polish stop words) on the
``data`` column of ``train/train.tsv``, regresses the midpoint of
``start_date`` and ``end_date`` on those features, and writes one predicted
value per line to ``test-A/out.tsv`` for the rows of ``test-A/in.tsv``.
"""

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from stop_words import get_stop_words

TRAIN_PATH = 'train/train.tsv'
TEST_IN_PATH = 'test-A/in.tsv'
TEST_OUT_PATH = 'test-A/out.tsv'

# Neither TSV is read with a header row; these names are assigned positionally.
COLUMN_NAMES = ['start_date', 'end_date', 'title', 'sort_title', 'data']

vectorizer = TfidfVectorizer(stop_words=get_stop_words('polish'))
regressor = LinearRegression()

# NOTE(review): the test file is read with the same five column names as the
# training file -- confirm that test-A/in.tsv really has that layout, since
# only its 'data' column is used below.
test_frame = pd.read_csv(TEST_IN_PATH, sep="\t", names=COLUMN_NAMES)
train_frame = pd.read_csv(TRAIN_PATH, sep="\t", names=COLUMN_NAMES)

# Regression target: the midpoint of each training row's date range.
target_dates = (train_frame['start_date'] + train_frame['end_date']) / 2

# The vocabulary and IDF weights are learned from the training text only;
# the test text is transformed with the already-fitted vectorizer.
train_features = vectorizer.fit_transform(train_frame['data'])
regressor.fit(train_features, target_dates)

test_features = vectorizer.transform(test_frame['data'])
predictions = regressor.predict(test_features)

# One prediction per line, fixed-point with six decimals (same text as "%f").
with open(TEST_OUT_PATH, 'w', encoding='utf-8') as out_file:
    out_file.writelines(f"{value:f}\n" for value in predictions)