# TF-IDF + LinearRegression baseline: predicts a year for each input text,
# reading expected{1,2}.tsv and writing out{1,2}.tsv (column chosen via sys.argv[1]).
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
# from xgboost import XGBRegressor
import pickle
import sys
import lzma


def tokenizer_space(text):
    # Plain whitespace tokenizer; can be plugged into TfidfVectorizer(tokenizer=tokenizer_space).
    return text.split(' ')


task = sys.argv[1]  # 1 or 2 — selects which expected/output column to use


def run():
    # LOADING DATA
    train_text = [a.rstrip('\n') for a in lzma.open('../train/in.tsv.xz', 'rt')]
    dev_text = [a.rstrip('\n') for a in lzma.open('../dev-0/in.tsv.xz', 'rt')]
    test_text = [a.rstrip('\n') for a in lzma.open('../test-A/in.tsv.xz', 'rt')]

    train_year = [float(a.rstrip('\n')) for a in open(f'../train/expected{task}.tsv')]
    dev_year = [float(a.rstrip('\n')) for a in open(f'../dev-0/expected{task}.tsv')]
    max_year = max(train_year)
    min_year = min(train_year)

    # VECTORIZATION
    tfidf = TfidfVectorizer()
    # tfidf = HashingVectorizer()
    train_text_vectorized = tfidf.fit_transform(train_text)

    # Cache the fitted vectorizer and training matrix so repeated runs can skip re-fitting.
    with open('text_train_tfidf_all.pickle', 'wb') as f:
        pickle.dump(train_text_vectorized, f)
    with open('tfidf_all.pickle', 'wb') as f:
        pickle.dump(tfidf, f)

    with open('text_train_tfidf_all.pickle', 'rb') as f:
        train_text_vectorized = pickle.load(f)
    with open('tfidf_all.pickle', 'rb') as f:
        tfidf = pickle.load(f)

    dev_text_vectorized = tfidf.transform(dev_text)
    test_text_vectorized = tfidf.transform(test_text)

    # MODELLING
    lr = LinearRegression(n_jobs=5)
    # xgb = XGBRegressor(n_jobs=8)
    # xgb_1000 = XGBRegressor(n_estimators=1000, n_jobs=8)
    # xgb_5000 = XGBRegressor(n_estimators=5000, n_jobs=8)
    lr.fit(train_text_vectorized, train_year)
    # xgb.fit(train_text_vectorized, train_year)
    # xgb_1000.fit(train_text_vectorized, train_year)
    # xgb_5000.fit(train_text_vectorized, train_year)

    ##################
    # DEV PREDICTIONS
    predictions_lr = lr.predict(dev_text_vectorized)
    # Clip predictions to the year range observed in the training data.
    predictions_lr = np.clip(predictions_lr, min_year, max_year)

    print('dev-0 RMSE')
    print(np.sqrt(mean_squared_error(dev_year, predictions_lr)))
    print('dev-0 MAE')
    print(mean_absolute_error(dev_year, predictions_lr))

    with open(f'../dev-0/out{task}.tsv', 'w') as f:
        for prediction in predictions_lr:
            f.write(str(prediction) + '\n')

    ##################
    # TEST PREDICTIONS
    predictions_lr = lr.predict(test_text_vectorized)
    predictions_lr = np.clip(predictions_lr, min_year, max_year)

    with open(f'../test-A/out{task}.tsv', 'w') as f:
        for prediction in predictions_lr:
            f.write(str(prediction) + '\n')


run()