forked from kubapok/auta-public
Linear regression solution RMSE 34k
This commit is contained in:
parent
5c4bb10ddf
commit
6402b55d3b
1000
dev-0/out.tsv
Normal file
1000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
47
linear-regression.py
Normal file
47
linear-regression.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from pandas import DataFrame
|
||||||
|
from sklearn import preprocessing
|
||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
import plotly.express as px
|
||||||
|
|
||||||
|
|
||||||
|
# Column layout of the header-less TSV files: the training file carries the
# target ("price") as its first column, the input-only files do not.
col_names = ["price", "mileage", "year", "brand", "engine_type", "engine_cap"]
col_names_in = ["mileage", "year", "brand", "engine_type", "engine_cap"]


def _read_tsv(path, names):
    """Read a header-less, tab-separated file, skipping malformed rows.

    ``error_bad_lines`` was deprecated in pandas 1.3 and removed in pandas 2.0
    in favour of ``on_bad_lines``. The current keyword is tried first and the
    old one is used as a fallback, so the script runs on either side of that
    change.

    Args:
        path: Location of the TSV file.
        names: Column names to assign, in file order.

    Returns:
        The parsed ``DataFrame``.
    """
    options = {"header": None, "sep": "\t", "names": names}
    try:
        # "warn" mirrors the old error_bad_lines=False default behaviour:
        # the malformed row is skipped and a warning is emitted.
        return pd.read_csv(path, on_bad_lines="warn", **options)
    except TypeError:
        # pandas < 1.3 does not know the on_bad_lines keyword.
        return pd.read_csv(path, error_bad_lines=False, **options)


df_train = _read_tsv("train/train.tsv", col_names)
# Exclude training rows priced below 1000 from the data the model is fitted on.
# The filtered frame has to be assigned back to df_train: it is the frame every
# later step reads, so storing the result under another name has no effect.
df_train = df_train.drop(df_train[df_train["price"] < 1000].index)
dev0 = _read_tsv("dev-0/in.tsv", col_names_in)
testA = _read_tsv("test-A/in.tsv", col_names_in)
|
||||||
|
|
||||||
|
|
||||||
|
# Column groups shared by preprocessing and model fitting.
NUMERIC_COLUMNS = ["mileage", "year", "engine_cap"]
CATEGORICAL_COLUMNS = ["brand", "engine_type"]
# "engine_type" is encoded by preprocess_data but is not part of the feature
# list the model is fitted on.
FEATURE_COLUMNS = ["mileage", "year", "brand", "engine_cap"]
# Code given to a category that the fitted encoder has never seen.
UNKNOWN_CATEGORY = -1


def preprocess_data(df: DataFrame, transformers=None) -> DataFrame:
    """Prepare a dataset for linear regression.

    Lower-cases ``brand``, standardises the numeric columns and replaces the
    categorical columns with integer codes.

    Args:
        df: Frame containing at least the columns ``mileage``, ``year``,
            ``engine_cap``, ``brand`` and ``engine_type``. It is not modified.
        transformers: Optional dict used to share fitted transformers between
            calls. Missing entries (keys ``"scaler"``, ``"brand"``,
            ``"engine_type"``) are fitted on ``df`` and stored in the dict;
            entries already present are reused without refitting. Pass the
            same dict for the training set first and then for every other set
            so that all sets are scaled and encoded identically. When omitted,
            everything is fitted on ``df`` alone.

    Returns:
        A transformed copy of ``df``.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    df["brand"] = df["brand"].str.lower()

    if transformers is None:
        transformers = {}

    scaler = transformers.get("scaler")
    if scaler is None:
        scaler = preprocessing.StandardScaler().fit(df[NUMERIC_COLUMNS])
        transformers["scaler"] = scaler
    df[NUMERIC_COLUMNS] = scaler.transform(df[NUMERIC_COLUMNS])

    for column in CATEGORICAL_COLUMNS:
        encoder = transformers.get(column)
        if encoder is None:
            # LabelEncoder expects 1-D input, so pass the Series rather than
            # a single-column DataFrame.
            encoder = preprocessing.LabelEncoder().fit(df[column])
            transformers[column] = encoder

        # A reused encoder raises on labels it was not fitted on, so only the
        # known rows are transformed and the rest keep UNKNOWN_CATEGORY.
        known = df[column].isin(encoder.classes_)
        codes = pd.Series(UNKNOWN_CATEGORY, index=df.index)
        if known.any():
            codes[known] = encoder.transform(df.loc[known, column])
        df[column] = codes

    return df


# Fit the scaler and encoders on the training data only, then apply exactly
# the same transformation to dev and test. Fitting separately per set would
# give the same raw value different scaled values and category codes in each.
transformers = {}
df_train = preprocess_data(df_train, transformers)
dev0 = preprocess_data(dev0, transformers)
testA = preprocess_data(testA, transformers)

# Show the correlation matrix of the preprocessed training data as a heatmap.
fig = px.imshow(df_train.corr())
fig.show()

Y_train = df_train["price"]

lm_model = LinearRegression()
lm_model.fit(df_train[FEATURE_COLUMNS], Y_train)

dev0_predicted = lm_model.predict(dev0[FEATURE_COLUMNS])
testA_predicted = lm_model.predict(testA[FEATURE_COLUMNS])

# One prediction per line, no header and no index column.
pd.Series(dev0_predicted).to_csv("dev-0/out.tsv", sep="\t", index=False, header=False)
pd.Series(testA_predicted).to_csv("test-A/out.tsv", sep="\t", index=False, header=False)
|
1000
test-A/out.tsv
Normal file
1000
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user