"""Fit a TF-IDF + linear-regression model and write predictions to disk.

The script reads ``train/train.tsv`` (tab-separated, no header row), uses the
mean of columns 0 and 1 as the regression target and column 4 as the input
text, then predicts a value for every line of ``dev-0/in.tsv``,
``dev-1/in.tsv`` and ``test-A/in.tsv``.  Predictions are written, one per
line, to ``out.tsv`` in the corresponding directory.
"""
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression

TRAIN_PATH = 'train/train.tsv'

# One entry per evaluation set:
# (directory, name shown while loading, name shown while predicting).
EVAL_SETS = (
    ('dev-0', 'dev-0', 'dev0'),
    ('dev-1', 'dev-1', 'dev1'),
    ('test-A', 'test', 'test-A'),
)


def _read_lines(path):
    """Return every line of the UTF-8 text file at *path*, newlines kept."""
    with open(path, encoding='utf8') as file:
        return file.readlines()


def _load_training_data(path):
    """Return ``(text, target)`` read from the training TSV at *path*.

    ``target`` is the element-wise mean of columns 0 and 1 and ``text`` is
    column 4 (presumably the two columns bound a date range -- TODO confirm).
    """
    # NOTE(review): read_csv applies its default quote handling here, while
    # the evaluation files are read as raw lines.  If the training text can
    # contain '"' characters, check whether quoting=csv.QUOTE_NONE is needed.
    train = pd.read_csv(path, sep='\t', header=None)
    target = (train[0] + train[1]) / 2
    text = train[4]
    return text, target


def main():
    """Run the whole pipeline: load, vectorize, train, predict, save."""
    print('Loading train.tsv...')
    train_text, train_date = _load_training_data(TRAIN_PATH)

    # For a quick debug run, truncate both series here, e.g.
    # train_text[:1000] and train_date[:1000].

    # Load every evaluation set before training so that a missing input file
    # is reported before the expensive fitting step.
    eval_texts = {}
    for directory, load_name, _ in EVAL_SETS:
        print(f'Loading {load_name}...')
        eval_texts[directory] = _read_lines(f'{directory}/in.tsv')

    print('Vectorizing training text...')
    # max_df=0.90 drops terms that occur in more than 90% of the documents.
    vectorizer = TfidfVectorizer(max_df=0.90)
    vectorized_text = vectorizer.fit_transform(train_text)

    print('Training model...')
    model = LinearRegression()
    model.fit(vectorized_text, train_date)

    predictions = {}
    for directory, _, predict_name in EVAL_SETS:
        print(f'Predicting {predict_name}...')
        # Reuse the vocabulary fitted on the training text.
        features = vectorizer.transform(eval_texts[directory])
        predictions[directory] = model.predict(features)

    print('Saving to file')
    for directory, _, _ in EVAL_SETS:
        np.savetxt(f'{directory}/out.tsv', predictions[directory], fmt='%f')


if __name__ == '__main__':
    main()