wiki-historian/tfidf/train_predict.py
2022-07-03 18:50:46 +02:00

81 lines
2.6 KiB
Python

import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# from xgboost import XGBRegressor
import random
import pickle
def tokenizer_space(text):
return text.split(' ')
def run():
# LOADING DATA
train_text = [a.rstrip('\n') for a in open('../train/in.tsv','r')]
dev_text = [a.rstrip('\n') for a in open('../dev-0/in.tsv','r')]
test_text = [a.rstrip('\n') for a in open('../test-A/in.tsv','r')]
global lowest
train_year = [ sum([float(b) for b in a.rstrip('\n').split(',')]) / 2 for a in open('../train/expected.tsv','r')]
dev_year = [ sum([float(b) for b in a.rstrip('\n').split(',')]) / 2 for a in open('../dev-0/expected.tsv','r')]
max_year = max(train_year)
min_year = min(train_year)
tfidf = TfidfVectorizer()
#tfidf = HashingVectorizer()
train_text_vectorized = tfidf.fit_transform(train_text)
pickle.dump(train_text_vectorized, open('text_train_tfidf_all.pickle','wb'))
pickle.dump(tfidf, open('tfidf_all.pickle','wb'))
train_text_vectorized = pickle.load(open('text_train_tfidf_all.pickle','rb'))
tfidf = pickle.load(open('tfidf_all.pickle','rb'))
dev_text_vectorized = tfidf.transform(dev_text)
test_text_vectorized = tfidf.transform(test_text)
# MODELLING
lr = LinearRegression( n_jobs=10)
#xgb = XGBRegressor(n_jobs=8)
#xgb_1000 = XGBRegressor(n_estimators=1000,n_jobs=8)
#xgb_5000 = XGBRegressor(n_estimators=5000,n_jobs=8)
lr.fit(train_text_vectorized, train_year)
#xgb.fit(text, year)
#xgb_1000.fit(text, year)
#xgb_5000.fit(text, year)
##################
# DEV PREDICTIONS
predictions_lr = lr.predict(dev_text_vectorized)
predictions_lr = np.minimum(predictions_lr, max_year)
predictions_lr = np.maximum(predictions_lr, min_year)
print('dev-0 RMSE')
print(np.sqrt(sklearn.metrics.mean_squared_error(predictions_lr, dev_year)))
print('dev-0 MAE')
print(sklearn.metrics.mean_absolute_error(predictions_lr, dev_year))
f = open('../dev-0/out.tsv','w')
for i in predictions_lr:
f.write(str(i) + '\n')
f.close()
##################
# TEST PREDICTIONS
predictions_lr = lr.predict(test_text_vectorized)
predictions_lr = np.minimum(predictions_lr, max_year)
predictions_lr = np.maximum(predictions_lr, min_year)
f = open('../test-A/out.tsv','w')
for i in predictions_lr:
f.write(str(i) + '\n')
f.close()
run()