precipitation-pl/run.py
2022-05-20 11:53:37 +02:00

74 lines
2.3 KiB
Python

import numpy as np
import pandas as pd
import xgboost as xg
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import QuantileTransformer, StandardScaler
# Load the headerless, tab-separated challenge files: training features and
# targets, plus the dev-0 and test-A feature files that need predictions.
train, train_expected, dev_0, test_A = (
    pd.read_csv(tsv_path, sep="\t", header=None)
    for tsv_path in (
        "train/in.tsv",
        "train/expected.tsv",
        "dev-0/in.tsv",
        "test-A/in.tsv",
    )
)
# Raw input columns that hold continuous values and get standardised.
NUMERIC_COLUMNS = [3, 4]


def preprocess_data(df_to_process, main_df=None, scaler=None):
    """Turn a raw input frame into the feature matrix used by the model.

    Columns 1 and 2 are one-hot encoded, column 0 is dropped and the numeric
    columns 3 and 4 are standardised.

    Args:
        df_to_process: Raw frame with integer column labels, as loaded from
            one of the ``in.tsv`` files.
        main_df: Optional, already preprocessed reference frame. When given,
            the result is re-indexed to exactly its columns, in its order:
            dummy columns missing from this split are added and filled with
            0, and columns the reference does not have are dropped.
        scaler: Optional, already fitted scaler exposing ``transform``. When
            omitted, a new ``StandardScaler`` is fitted on ``df_to_process``
            itself, which is the historical behaviour of this function.

    Returns:
        A new DataFrame with the engineered features; ``df_to_process`` is
        left unmodified.
    """
    final_df = pd.get_dummies(df_to_process, columns=[1, 2]).drop(columns=[0])

    if scaler is None:
        scaler = StandardScaler().fit(final_df[NUMERIC_COLUMNS])
    final_df[NUMERIC_COLUMNS] = scaler.transform(final_df[NUMERIC_COLUMNS])

    if isinstance(main_df, pd.DataFrame):
        # The estimator must see the same columns, in the same order, at
        # predict time as at fit time; appending only the missing columns
        # would leave extra/unordered columns behind.
        final_df = final_df.reindex(columns=main_df.columns, fill_value=0)
    return final_df


# Fit the scaler on the training split only and reuse it everywhere, so that
# dev/test features are scaled with the statistics the model was trained on
# rather than with statistics of the split being predicted.
feature_scaler = StandardScaler().fit(train[NUMERIC_COLUMNS])
train_df = preprocess_data(train, scaler=feature_scaler)
dev_df = preprocess_data(dev_0, train_df, feature_scaler)
test_A_df = preprocess_data(test_A, train_df, feature_scaler)
# Regression target: the single column of train/expected.tsv.
y = train_expected[0]

# Hyper-parameter candidates explored by the grid search.
param_grid = dict(
    n_estimators=[100, 80, 60, 55, 51, 45],
    max_depth=[7, 8],
    reg_lambda=[0.26, 0.25, 0.2],
)
grid = GridSearchCV(
    estimator=xg.XGBRFRegressor(),
    param_grid=param_grid,
    n_jobs=-1,
    refit=True,
    verbose=3,
)

# Run the search on a quantile-transformed (normal output) target.
regr_trans = TransformedTargetRegressor(
    transformer=QuantileTransformer(output_distribution="normal"),
    regressor=grid,
)
grid_result = regr_trans.fit(train_df, y)
best_params = grid_result.regressor_.best_params_

# Build a fresh model from the winning parameters and fit it on the full
# training set behind the same kind of target transformation.
best_model = xg.XGBRFRegressor(**best_params)
regr_trans = TransformedTargetRegressor(
    transformer=QuantileTransformer(output_distribution="normal"),
    regressor=best_model,
)
regr_trans.fit(train_df, y)

# Predict both evaluation splits, rounded to one decimal place.
dev0_predicted, test_A_predicted = (
    np.round(regr_trans.predict(features), decimals=1)
    for features in (dev_df, test_A_df)
)

# Write one prediction per line, without header or index.
for predictions, out_path in (
    (dev0_predicted, "dev-0/out.tsv"),
    (test_A_predicted, "test-A/out.tsv"),
):
    pd.DataFrame(predictions).to_csv(out_path, header=None, index=None)