mean from train
This commit is contained in:
commit
4856c520f2
100000
dev-0/out.tsv
Normal file
100000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
21
mean_from_train/mean_from_train.py
Normal file
21
mean_from_train/mean_from_train.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# Compute the mean of the training targets once, at module load time.
# The context manager guarantees the file handle is closed even if a line
# fails to parse as a float (the original left the handle open).
with open('../train/expected.tsv') as train_exp_f:
    train_mean = np.mean([float(a.rstrip()) for a in train_exp_f])

# Pre-rendered output line reused verbatim for every prediction.
train_mean_line = str(train_mean) + '\n'
|
||||||
|
|
||||||
|
|
||||||
|
def pred(in_f_path, out_f_path):
    """Write the constant training-mean prediction once per input line.

    The content of the input lines is ignored; only their count matters.

    Args:
        in_f_path: Path of the input file; one output line is produced for
            each line it contains.
        out_f_path: Path of the output file; it is created or truncated.

    Raises:
        OSError: If either file cannot be opened.
    """
    # ``with`` closes both handles even if reading or writing fails midway;
    # the original only closed them on the success path.
    with open(in_f_path, 'r') as f_in, open(out_f_path, 'w') as f_out:
        f_out.writelines(train_mean_line for _ in f_in)
|
||||||
|
|
||||||
|
# Emit the constant prediction for each split, in the same order as before:
# first dev-0, then test-A.
for in_path, out_path in (
    ('../dev-0/in.tsv', '../dev-0/out.tsv'),
    ('../test-A/in.tsv', '../test-A/out.tsv'),
):
    pred(in_path, out_path)
|
||||||
|
|
||||||
|
|
100359
test-A/out.tsv
Normal file
100359
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
80
tfidflr/train_predict.py
Normal file
80
tfidflr/train_predict.py
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import sklearn
|
||||||
|
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
from sklearn.metrics import mean_squared_error
|
||||||
|
# from xgboost import XGBRegressor
|
||||||
|
import random
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
def tokenizer_space(text):
    """Split *text* on single space characters and return the pieces.

    Only the literal ``' '`` is a separator, so consecutive spaces produce
    empty-string tokens and tabs or newlines stay inside the tokens.
    """
    separator = ' '
    tokens = text.split(separator)
    return tokens
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _read_lines(path):
    """Return the lines of *path* with their trailing newline removed."""
    with open(path, 'r') as f:
        return [line.rstrip('\n') for line in f]


def _read_floats(path):
    """Return the lines of *path* parsed as floats, one value per line."""
    with open(path, 'r') as f:
        return [float(line.rstrip('\n')) for line in f]


def _write_predictions(path, predictions):
    """Write each value of *predictions* to *path*, one per line."""
    with open(path, 'w') as f:
        f.writelines(str(value) + '\n' for value in predictions)


def run():
    """Fit TF-IDF + linear regression on train and predict dev-0 / test-A.

    Steps:
      1. Read the texts of the train, dev-0 and test-A splits and the
         expected values of train and dev-0 (paths relative to the
         current working directory).
      2. Fit a ``TfidfVectorizer`` on the training texts and persist both
         the vectorizer and the vectorized training matrix as pickles in
         the current directory.
      3. Fit a ``LinearRegression`` on the vectorized training data.
      4. Predict dev-0, clamp predictions to the range of the training
         targets, print RMSE and MAE, and write ``../dev-0/out.tsv``.
      5. Predict test-A, clamp the same way, write ``../test-A/out.tsv``.

    Returns:
        None. Results are printed and written to disk.

    Raises:
        OSError: If any input file is missing or an output cannot be written.
        ValueError: If a line of an ``expected.tsv`` file is not a float.
    """
    # LOADING DATA
    train_text = _read_lines('../train/in.tsv')
    dev_text = _read_lines('../dev-0/in.tsv')
    test_text = _read_lines('../test-A/in.tsv')

    train_year = _read_floats('../train/expected.tsv')
    dev_year = _read_floats('../dev-0/expected.tsv')

    # Predictions are clamped to the target range observed in training.
    max_year = max(train_year)
    min_year = min(train_year)

    tfidf = TfidfVectorizer()
    train_text_vectorized = tfidf.fit_transform(train_text)

    # Persist the fitted artifacts. Explicit context managers make sure the
    # data is flushed and the handles are closed before the files are read
    # back; the original relied on garbage collection of anonymous handles.
    with open('text_train_tfidf_all.pickle', 'wb') as f:
        pickle.dump(train_text_vectorized, f)
    with open('tfidf_all.pickle', 'wb') as f:
        pickle.dump(tfidf, f)

    # Reload from disk, as the original did. These files were written just
    # above by this process; never unpickle files from an untrusted source.
    with open('text_train_tfidf_all.pickle', 'rb') as f:
        train_text_vectorized = pickle.load(f)
    with open('tfidf_all.pickle', 'rb') as f:
        tfidf = pickle.load(f)

    dev_text_vectorized = tfidf.transform(dev_text)
    test_text_vectorized = tfidf.transform(test_text)

    # MODELLING
    lr = LinearRegression(n_jobs=10)
    lr.fit(train_text_vectorized, train_year)

    ##################
    # DEV PREDICTIONS
    # np.clip(x, lo, hi) is equivalent to the original
    # np.maximum(np.minimum(x, hi), lo) because lo <= hi here.
    predictions_lr = np.clip(lr.predict(dev_text_vectorized), min_year, max_year)

    # Arguments follow sklearn's (y_true, y_pred) convention; both metrics
    # are symmetric, so the reported values are unchanged.
    print('dev-0 RMSE')
    print(np.sqrt(sklearn.metrics.mean_squared_error(dev_year, predictions_lr)))
    print('dev-0 MAE')
    print(sklearn.metrics.mean_absolute_error(dev_year, predictions_lr))

    _write_predictions('../dev-0/out.tsv', predictions_lr)

    ##################
    # TEST PREDICTIONS
    predictions_lr = np.clip(lr.predict(test_text_vectorized), min_year, max_year)
    _write_predictions('../test-A/out.tsv', predictions_lr)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: executes the whole train/predict pipeline defined in
# run(). NOTE(review): there is no ``__main__`` guard, so the pipeline also
# runs if this module is imported.
run()
|
Loading…
Reference in New Issue
Block a user