# Forked from kubapok/auta-public (file size: 4.4 KiB)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
# Column labels for the training TSV, supplied explicitly via `names=`.
col_names = [
    "Price",
    "Mileage",
    "Year",
    "Brand",
    "EngineType",
    "EngineCapacity",
]
# Load the tab-separated training data using the labels above.
df = pd.read_csv("train/train.tsv", sep="\t", names=col_names)
def prepareData(df: pd.DataFrame, reference_year: int = 2018) -> np.ndarray:
    """Build the numeric feature matrix used by the price regression.

    Adds an ``Age`` column (``reference_year - Year``) plus square-root
    versions of ``Age``, ``Mileage`` and ``EngineCapacity``, one-hot encodes
    ``EngineType``, drops the raw ``EngineType`` and ``Brand`` columns and
    finally expands the result with degree-2 interaction-only polynomial
    features.

    Args:
        df: DataFrame containing at least the columns ``Mileage``, ``Year``,
            ``Brand``, ``EngineType`` and ``EngineCapacity``. It is not
            modified; the function works on a copy.
        reference_year: Year against which the car age is computed.
            Defaults to 2018, the value that used to be hard-coded here.

    Returns:
        The array produced by ``PolynomialFeatures.fit_transform`` on the
        engineered columns.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    features = df.copy()

    features["Age"] = reference_year - features["Year"]
    # Column labels are case-sensitive: the column is "Age", not "age".
    features["SqrtAge"] = features["Age"] ** 0.5
    features["SqrtMileage"] = features["Mileage"] ** 0.5
    features["SqrtEngineCapacity"] = features["EngineCapacity"] ** 0.5

    # One indicator column per distinct EngineType value found in this frame.
    engine_dummies = features["EngineType"].str.get_dummies()
    features = pd.concat([features, engine_dummies], axis=1)
    features = features.drop(["EngineType", "Brand"], axis=1)

    # NOTE(review): the indicator columns depend on which EngineType values
    # occur in `df`, and the transformer is re-fitted on every call. If two
    # data sets contain different EngineType values, the resulting matrices
    # will have different widths -- confirm this cannot happen for the
    # train/dev/test splits.
    poly = PolynomialFeatures(2, interaction_only=True)
    return poly.fit_transform(features)
# --- Training ---------------------------------------------------------------
df_train = df
y_train = df_train.Price
x_train = df_train.drop('Price', axis=1)
x_train = prepareData(x_train)

linReg = LinearRegression()
linReg.fit(x_train, y_train)

# --- Evaluation on the dev-0 split ------------------------------------------
y_dev = pd.read_csv('dev-0/expected.tsv', sep="\t", names=["Price"])
x_dev = pd.read_csv(
    'dev-0/in.tsv',
    sep="\t",
    names=["Mileage", "Year", "Brand", "EngineType", "EngineCapacity"],
)
x_dev = prepareData(x_dev)

# LinearRegression.score reports the R^2 of the predictions on the dev set.
score = linReg.score(x_dev, y_dev)
print(score)
# Recorded output of an earlier run: 0.7535351650926749

# Wrap the predictions in a frame with the same column label as y_dev.
y_pred = linReg.predict(x_dev)
data = {'Price': y_pred}
y_pred = pd.DataFrame(data)

# Root mean squared error. np.sqrt(MSE) is used instead of
# mean_squared_error(..., squared=False) because the `squared` keyword was
# deprecated in scikit-learn 1.4 and later removed.
rmse = np.sqrt(mean_squared_error(y_dev, y_pred))
print(rmse)
# Recorded output of an earlier run: 24989.603665517054
# Further values recorded next to the note "BEZ AGE" (Polish: "without AGE");
# presumably results of runs without the Age feature -- not verifiable here:
# 24943.930732282024 26863.621497665004