auta-public/cars price prediction.ipynb
2021-05-18 23:41:45 +02:00

32 KiB
Raw Permalink Blame History

!pip install scikit-learn==0.24.2 --user
Requirement already satisfied: scikit-learn==0.24.2 in c:\users\ania\appdata\roaming\python\python38\site-packages (0.24.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn==0.24.2) (2.1.0)
Requirement already satisfied: scipy>=0.19.1 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn==0.24.2) (1.5.2)
Requirement already satisfied: joblib>=0.11 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn==0.24.2) (0.17.0)
Requirement already satisfied: numpy>=1.13.3 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn==0.24.2) (1.19.2)
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Print NumPy floats in fixed-point notation rather than scientific notation.
np.set_printoptions(formatter={'float_kind': '{:f}'.format})

# Column names are stored tab-separated in the `names` file. The encoding is
# given explicitly so the result does not depend on the platform's locale
# default (pd.read_csv below decodes as UTF-8 by default as well).
with open('names', encoding='utf-8') as f:
    header = f.read().replace('\n', '').split('\t')

# Passing names= makes pandas treat the TSV as having no header row.
cars_train = pd.read_csv('train/train.tsv', sep="\t", names=header)

# Model features; get_dummies one-hot encodes the text-valued columns and
# leaves the numeric ones as they are.
cars_train_X = pd.get_dummies(
    cars_train[["mileage", "year", "brand", "engineType", "engineCapacity"]]
)
cars_train_Y = cars_train["price"]

# Keep the encoded column layout so other datasets can be aligned to it.
input_columns = cars_train_X.columns
# Train the model and compute the error on the training set.
# positive=True restricts the fitted coefficients to be non-negative.
# NOTE(review): this also prevents a negative weight for e.g. mileage --
# confirm that the constraint is intended.
model = LinearRegression(positive=True)
model.fit(cars_train_X, cars_train_Y)
predictions = model.predict(cars_train_X)
# RMSE on the training data. Arguments follow the documented
# (y_true, y_pred) order; for MSE the value is the same either way.
sqrt(mean_squared_error(cars_train_Y, predictions))
30118.8791272898
# Prepare the input for DEV_0.
# The first header name is skipped when naming the input columns
# (presumably the target column, which in.tsv does not contain -- confirm).
cars_dev_0_X = pd.read_csv('dev-0/in.tsv', sep="\t", names=header[1:])
# Expected values flattened to a 1-D array.
cars_dev_0_Y = pd.read_csv('dev-0/expected.tsv', sep="\t", header=None).to_numpy().flatten('F')

# One-hot encode, then align to the training layout in a single step:
# reindex adds the training columns missing here (filled with 0), drops
# columns the model was not trained on, and puts them in training order.
cars_dev_0_X = pd.get_dummies(cars_dev_0_X).reindex(columns=input_columns, fill_value=0)
cars_dev_0_X
mileage year engineCapacity brand_Abarth brand_Aixam brand_Alfa brand_Aston brand_Audi brand_Austin brand_BMW ... brand_Uaz brand_Vauxhall brand_Volkswagen brand_Volvo brand_Warszawa brand_dla brand_star engineType_benzyna engineType_diesel engineType_gaz
0 77000 2015 2000 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
1 186146 2006 1498 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 192000 2007 2500 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
3 220000 2003 1997 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
4 248000 2008 1900 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 0 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 146000 2004 1686 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
996 19323 2015 1598 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
997 27561 2016 1598 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
998 155000 2012 1600 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
999 31438 2015 3000 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0

1000 rows × 96 columns

# Result for DEV_0: predict, save the predictions, and report the RMSE.
predictions_dev = model.predict(cars_dev_0_X)
np.savetxt("dev-0/out.tsv", predictions_dev, fmt='%f')
# Arguments follow the documented (y_true, y_pred) order; for MSE the value
# is the same either way.
sqrt(mean_squared_error(cars_dev_0_Y, predictions_dev))
33193.54683638966
# Adjust the input for the test set.
# The first header name is skipped when naming the input columns
# (presumably the target column, which in.tsv does not contain -- confirm).
cars_test_A_X = pd.read_csv('test-A/in.tsv', sep="\t", names=header[1:])

# One-hot encode, then align to the training layout in a single step:
# reindex adds the training columns missing here (filled with 0), drops
# columns the model was not trained on, and puts them in training order.
cars_test_A_X = pd.get_dummies(cars_test_A_X).reindex(columns=input_columns, fill_value=0)
cars_test_A_X
mileage year engineCapacity brand_Abarth brand_Aixam brand_Alfa brand_Aston brand_Audi brand_Austin brand_BMW ... brand_Uaz brand_Vauxhall brand_Volkswagen brand_Volvo brand_Warszawa brand_dla brand_star engineType_benzyna engineType_diesel engineType_gaz
0 203000 2010 1500 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
1 39000 2008 1000 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
2 190000 2005 1600 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
3 230000 2001 1598 0 0 0 0 0 0 0 ... 0 0 1 0 0 0 0 1 0 0
4 189000 2000 1600 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
995 465000 2005 2500 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
996 89074 2014 2000 0 0 0 0 0 0 1 ... 0 0 0 0 0 0 0 0 1 0
997 21711 2014 1329 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0
998 144000 2014 1500 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 1 0
999 113606 2000 4000 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 1 0 0

1000 rows × 96 columns

# Predict on the test set and save the result, formatted as fixed-point
# floats, to the test-A output file.
predictions_test = model.predict(cars_test_A_X)
np.savetxt(fname="test-A/out.tsv", X=predictions_test, fmt="%f")