auta-public/cars price prediction.ipynb at master

!pip install scikit-learn==0.24.2 --user

Requirement already satisfied: scikit-learn==0.24.2 in c:\users\ania\appdata\roaming\python\python38\site-packages (0.24.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn==0.24.2) (2.1.0)
Requirement already satisfied: scipy>=0.19.1 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn==0.24.2) (1.5.2)
Requirement already satisfied: joblib>=0.11 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn==0.24.2) (0.17.0)
Requirement already satisfied: numpy>=1.13.3 in c:\programdata\anaconda3\lib\site-packages (from scikit-learn==0.24.2) (1.19.2)

import pandas as pd
import numpy as np
from math import sqrt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

# Column names are kept in a separate tab-separated file called "names";
# newlines are stripped before splitting on tabs.
with open('names') as names_file:
    raw_names = names_file.read()
header = raw_names.replace('\n', '').split('\t')

# The training TSV is read with the column names supplied explicitly.
cars_train = pd.read_csv('train/train.tsv', sep="\t", names=header)

# Target column.
cars_train_Y = cars_train["price"]

# Feature subset used by the model; get_dummies one-hot encodes the
# non-numeric columns and leaves the numeric ones as they are.
feature_names = ["mileage", "year", "brand", "engineType", "engineCapacity"]
cars_train_X = pd.get_dummies(cars_train[feature_names])

# Remember the encoded column layout so that other datasets can be aligned
# to exactly the same columns, in the same order.
input_columns = cars_train_X.columns

# Train the model and compute the error on the training set.
# NOTE(review): positive=True forces every coefficient to be non-negative,
# which also applies to features such as mileage - confirm this is intended.
model = LinearRegression(positive=True)
model.fit(cars_train_X, cars_train_Y)
predictions = model.predict(cars_train_X)
# RMSE on the training data. scikit-learn metrics take (y_true, y_pred);
# the value is the same either way for MSE, but the documented order keeps
# the call correct if the metric is ever swapped for an asymmetric one.
sqrt(mean_squared_error(cars_train_Y, predictions))

30118.8791272898

# Load the DEV_0 features. The input file is read with every column name
# except the first one from `header`.
dev_0_feature_names = header[1:]
cars_dev_0_X = pd.read_csv('dev-0/in.tsv', sep="\t", names=dev_0_feature_names)

# Load the expected DEV_0 values and flatten them into a 1-D NumPy array.
dev_0_expected = pd.read_csv('dev-0/expected.tsv', sep="\t", header=None)
cars_dev_0_Y = dev_0_expected.to_numpy().flatten('F')

# Prepare the input for DEV_0: one-hot encode it, then align it to the
# columns the model was trained on. reindex does the whole alignment in one
# step: training columns missing here are added filled with 0, columns the
# model never saw are dropped, and the training column order is restored.
cars_dev_0_X = pd.get_dummies(cars_dev_0_X).reindex(columns=input_columns, fill_value=0)
cars_dev_0_X

	mileage	year	engineCapacity	brand_Abarth	brand_Aixam	brand_Alfa	brand_Aston	brand_Audi	brand_Austin	brand_BMW	...	brand_Uaz	brand_Vauxhall	brand_Volkswagen	brand_Volvo	brand_Warszawa	brand_dla	brand_star	engineType_benzyna	engineType_diesel	engineType_gaz
0	77000	2015	2000	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	1	0
1	186146	2006	1498	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
2	192000	2007	2500	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	1	0
3	220000	2003	1997	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	1	0
4	248000	2008	1900	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	1	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
995	146000	2004	1686	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	1	0
996	19323	2015	1598	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
997	27561	2016	1598	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	1	0
998	155000	2012	1600	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
999	31438	2015	3000	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	1	0

1000 rows × 96 columns

# Result for DEV_0: predict, save the predictions, and report the RMSE.
predictions_dev = model.predict(cars_dev_0_X)
np.savetxt("dev-0/out.tsv", predictions_dev, fmt='%f')
# scikit-learn metrics take (y_true, y_pred); the value is the same either
# way for MSE, but the documented order avoids surprises with other metrics.
sqrt(mean_squared_error(cars_dev_0_Y, predictions_dev))

33193.54683638966

# Load the test-A features. The input file is read with every column name
# except the first one from `header`.
test_a_feature_names = header[1:]
cars_test_A_X = pd.read_csv('test-A/in.tsv', sep="\t", names=test_a_feature_names)

# Adapt the test input: one-hot encode it, then align it to the columns the
# model was trained on. reindex does the whole alignment in one step:
# training columns missing here are added filled with 0, columns the model
# never saw are dropped, and the training column order is restored.
cars_test_A_X = pd.get_dummies(cars_test_A_X).reindex(columns=input_columns, fill_value=0)
cars_test_A_X

	mileage	year	engineCapacity	brand_Abarth	brand_Aixam	brand_Alfa	brand_Aston	brand_Audi	brand_Austin	brand_BMW	...	brand_Uaz	brand_Vauxhall	brand_Volkswagen	brand_Volvo	brand_Warszawa	brand_dla	brand_star	engineType_benzyna	engineType_diesel	engineType_gaz
0	203000	2010	1500	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	1	0
1	39000	2008	1000	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
2	190000	2005	1600	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	1	0
3	230000	2001	1598	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	1	0	0
4	189000	2000	1600	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	1	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
995	465000	2005	2500	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	1	0
996	89074	2014	2000	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	1	0
997	21711	2014	1329	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0
998	144000	2014	1500	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	1	0
999	113606	2000	4000	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	1	0	0

1000 rows × 96 columns

# Predict on the test set and save the result.
test_a_output_path = "test-A/out.tsv"
predictions_test = model.predict(cars_test_A_X)
np.savetxt(fname=test_a_output_path, X=predictions_test, fmt='%f')

32 KiB Raw Permalink Blame History Unescape Escape

32 KiB

Raw Permalink Blame History