"""Fit a TF-IDF + linear regression model on training text and predict for test-A.

The script reads ``train/train.tsv``, uses the midpoint of ``start_date`` and
``end_date`` as the regression target, fits a ``LinearRegression`` on TF-IDF
features of the ``data`` column, and writes one prediction per line to
``test-A/out.tsv``.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from stop_words import get_stop_words
import pandas as pd
import numpy as np
import csv


def _read_headerless_tsv(path):
    """Read a header-less, unquoted TSV file, skipping malformed rows with a warning.

    ``error_bad_lines=False`` was deprecated in pandas 1.3 and removed in 2.0;
    ``on_bad_lines="warn"`` is its replacement and keeps the same
    skip-and-warn behaviour. Older pandas versions reject the new keyword
    with ``TypeError``, in which case the legacy keyword is used instead.
    """
    options = {"header": None, "sep": "\t", "quoting": csv.QUOTE_NONE}
    try:
        return pd.read_csv(path, on_bad_lines="warn", **options)
    except TypeError:
        return pd.read_csv(path, error_bad_lines=False, **options)


lm_model = LinearRegression()
tfidvectorizer = TfidfVectorizer(stop_words=get_stop_words('polish'))

train_nm = ['start_date', 'end_date', 'title', 'sort_title', 'data']
# NOTE(review): the names below are never used in the visible script; they are
# kept only so that nothing else relying on them breaks.
train_nm_test = ['data']
dataset = []
processed = []
new_text = ""

train_file = pd.read_csv('train/train.tsv', sep="\t", names=train_nm)
print('DONE20!')

# Regression target: midpoint between the start and end date of each row.
date = (train_file['start_date'] + train_file['end_date']) / 2
print('DONE22!')

# Despite its name, ``vectorizer`` holds the TF-IDF feature matrix of the
# training texts; the fitted transformer itself is ``tfidvectorizer``.
vectorizer = tfidvectorizer.fit_transform(train_file['data'])
print('DONE24!')

lm_model.fit(vectorizer, date)
print('DONE26!')

# NOTE(review): the dev sets are loaded but no predictions are produced for
# them in the visible script.
dev_0 = _read_headerless_tsv("dev-0/in.tsv")
dev_1 = _read_headerless_tsv("dev-1/in.tsv")

# NOTE(review): the test file is read with the five training column names,
# while ``train_nm_test`` (a single 'data' column) is left unused. Confirm that
# test-A/in.tsv really has the same five-column layout as the training file.
test = pd.read_csv("test-A/in.tsv", names=train_nm, sep="\t")
print('DONE31!')

# Reuse the vocabulary and IDF weights learned on the training set.
test_file = tfidvectorizer.transform(test['data'])
test_file_predict = lm_model.predict(test_file)

with open('test-A/out.tsv', 'w') as file:
    # One prediction per line, fixed-point with six decimals.
    file.writelines("%f\n" % prediction for prediction in test_file_predict)
print('DONE38!')