forked from kubapok/auta-public
Improve script
This commit is contained in:
parent
6402b55d3b
commit
869f3fbb4e
1000
dev-0/out.tsv
1000
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
@ -1,47 +1,78 @@
|
||||
import pandas as pd
|
||||
import plotly.express as px
|
||||
from pandas import DataFrame
|
||||
from sklearn import preprocessing
|
||||
from sklearn.linear_model import LinearRegression
|
||||
import plotly.express as px
|
||||
|
||||
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
|
||||
from sklearn.metrics import mean_squared_error
|
||||
from sklearn.preprocessing import PolynomialFeatures
|
||||
|
||||
col_names = ["price", "mileage", "year", "brand", "engine_type", "engine_cap"]
|
||||
col_names_in = ["mileage", "year", "brand", "engine_type", "engine_cap"]
|
||||
df_train = pd.read_csv("train/train.tsv", error_bad_lines=False, header=None, sep="\t", names=col_names)
|
||||
df = df_train.drop(df_train[df_train["price"] < 1000].index)
|
||||
dev0 = pd.read_csv("dev-0/in.tsv", error_bad_lines=False, header=None, sep="\t", names=col_names_in)
|
||||
testA = pd.read_csv("test-A/in.tsv", error_bad_lines=False, header=None, sep="\t", names=col_names_in)
|
||||
df_train = pd.read_csv(
|
||||
"train/train.tsv", error_bad_lines=False, header=None, sep="\t", names=col_names
|
||||
)
|
||||
df = df_train
|
||||
dev0 = pd.read_csv(
|
||||
"dev-0/in.tsv", error_bad_lines=False, header=None, sep="\t", names=col_names_in
|
||||
)
|
||||
testA = pd.read_csv(
|
||||
"test-A/in.tsv", error_bad_lines=False, header=None, sep="\t", names=col_names_in
|
||||
)
|
||||
|
||||
test = pd.read_csv("dev-0/expected.tsv", error_bad_lines=False, header=None, sep="\t")
|
||||
Y_a = test[0]
|
||||
|
||||
brands = df.brand.value_counts()[:35].index.tolist()
|
||||
|
||||
|
||||
def preprocess_data(df: DataFrame) -> DataFrame:
|
||||
def preprocess_data(df: DataFrame, brands: list) -> DataFrame:
|
||||
"""Prepare dataset to linear regression"""
|
||||
df["brand"] = df["brand"].str.lower()
|
||||
scaler = preprocessing.StandardScaler()
|
||||
df[["mileage", "year", "engine_cap"]] = scaler.fit_transform(df[["mileage", "year", "engine_cap"]])
|
||||
enc = preprocessing.LabelEncoder()
|
||||
enc.fit(df[["brand"]])
|
||||
df[["brand"]] = enc.transform(df[["brand"]])
|
||||
|
||||
enc.fit(df["engine_type"])
|
||||
df[["engine_type"]] = enc.transform(df[["engine_type"]])
|
||||
df.brand = df.brand.apply(lambda x: x if x in brands else "0")
|
||||
df["year"] = df.year / 2000
|
||||
df["mileage"] = df.mileage ** 0.3
|
||||
df["engine_cap"] = df.engine_cap * 0.3
|
||||
df["brand"] = df["brand"].str.lower()
|
||||
scaler = preprocessing.RobustScaler()
|
||||
df = pd.get_dummies(df, columns=["brand", "engine_type"])
|
||||
# adds about 1k to the RMSE ;(
|
||||
df[["mileage", "year", "engine_cap", "year"]] = scaler.fit_transform(
|
||||
df[["mileage", "year", "engine_cap", "year"]]
|
||||
)
|
||||
poly = PolynomialFeatures(2, interaction_only=True)
|
||||
df = poly.fit_transform(df)
|
||||
|
||||
return df
|
||||
|
||||
|
||||
df_train = preprocess_data(df_train)
|
||||
dev0 = preprocess_data(dev0)
|
||||
testA = preprocess_data(testA)
|
||||
|
||||
fig = px.imshow(df_train.corr())
|
||||
fig.show()
|
||||
indexes = df_train[(df_train.price < 1000) & (df_train.price > 1)].index
|
||||
df_train.drop(indexes, inplace=True)
|
||||
|
||||
index = df_train[(df_train.mileage > 900000)].index
|
||||
df_train.drop(index, inplace=True)
|
||||
Y_train = df_train["price"]
|
||||
df_train.drop("price", axis=1, inplace=True)
|
||||
# df_train = df_train[df_train.price not in range (2, 1000)]
|
||||
|
||||
df_train = preprocess_data(df_train, brands)
|
||||
dev0 = preprocess_data(dev0, brands)
|
||||
testA = preprocess_data(testA, brands)
|
||||
|
||||
# fig = px.imshow(df_train.corr())
|
||||
# fig.show()
|
||||
|
||||
|
||||
lm_model = LinearRegression()
|
||||
lm_model.fit(df_train[["mileage", "year", "brand", "engine_cap"]], Y_train)
|
||||
|
||||
dev0_predicted = lm_model.predict(dev0[["mileage", "year", "brand", "engine_cap"]])
|
||||
testA_predicted = lm_model.predict(testA[["mileage", "year", "brand", "engine_cap"]])
|
||||
# clf = RidgeCV(alphas=[0.1, 0.01, 0.001, 0.00001, 1e-1], cv=10, fit_intercept=True, normalize=True)
|
||||
# clf.fit(df_train, Y_train)
|
||||
lm_model.fit(df_train, Y_train)
|
||||
|
||||
dev0_predicted = lm_model.predict(dev0)
|
||||
testA_predicted = lm_model.predict(testA)
|
||||
|
||||
# dev0_predicted2 = clf.predict(dev0)
|
||||
|
||||
pd.Series(dev0_predicted).to_csv("dev-0/out.tsv", sep="\t", index=False, header=False)
|
||||
pd.Series(testA_predicted).to_csv("test-A/out.tsv", sep="\t", index=False, header=False)
|
||||
print(mean_squared_error(Y_a, dev0_predicted, squared=False))
|
||||
|
1000
test-A/out.tsv
1000
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user