LinearRegression

This commit is contained in:
Maciej Sobkowiak 2021-05-19 00:29:44 +02:00
parent fc12d3b07a
commit 811aceb045
3 changed files with 2040 additions and 19 deletions

1000
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

59
main.py
View File

@ -1,5 +1,9 @@
import pandas as pd import pandas as pd
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
# Read column names # Read column names
col_names = [] col_names = []
@ -13,47 +17,64 @@ test = pd.read_table('test-A/in.tsv', error_bad_lines=False,
header=None, names=col_names[1:]) header=None, names=col_names[1:])
# Load the training table and the expected dev-0 targets. Both files are
# headerless TSVs read with the same parser options.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas releases in
# favour of `on_bad_lines` -- confirm the pinned pandas version before upgrading.
_read_opts = dict(error_bad_lines=False, header=None)
train = pd.read_table('train/train.tsv', names=col_names, **_read_opts)
dev_expected = pd.read_table('dev-0/expected.tsv', **_read_opts)
# Encode text (object-dtype) columns as integer category codes.
#
# The codes must mean the same thing in every frame: if each frame is
# converted on its own, the same string can receive a different integer in
# train, dev and test, and the coefficients fitted on train no longer apply.
# So for every text column we build ONE category set from all frames that
# hold that column as text, and encode each of those frames against it.
# Missing values are encoded as -1 (pandas' code for "not a category").
_text_columns = {}
for _frame in (train, dev, test):
    for c in _frame.select_dtypes(include=object).columns.values:
        _text_columns.setdefault(c, []).append(_frame)

for c, _owners in _text_columns.items():
    _all_values = pd.concat([_frame[c] for _frame in _owners],
                            ignore_index=True)
    # Let pandas derive the category set (sorted when the values allow it).
    _shared_dtype = _all_values.astype("category").dtype
    for _frame in _owners:
        _frame[c] = _frame[c].astype(_shared_dtype).cat.codes
# Sprawdzanie ile jest odstających wartości dla price # Sprawdzanie ile jest odstających wartości dla price
fig, ax = plt.subplots(1, 2) # fig, ax = plt.subplots(1, 2)
fig.set_figheight(15) # fig.set_figheight(15)
fig.set_figwidth(20) # fig.set_figwidth(20)
ax[0].boxplot(train['price']) # ax[0].boxplot(train['price'])
ax[0].set_title('price') # ax[0].set_title('price')
ax[1].boxplot(train['mileage']) # ax[1].boxplot(train['mileage'])
ax[1].set_title('mileage') # ax[1].set_title('mileage')
plt.show() # plt.show()
# Outlier handling.
#
# First tally how many training rows fall outside the accepted ranges
# (diagnostic counters only), then drop the low-price and low-mileage rows.
priceMin = int((train['price'] < 1000).sum())

# Fixed: this tally was previously added to priceMin, which left priceMax
# permanently at 0 and inflated priceMin.
priceMax = int((train['price'] > 1000000).sum())

mileageMin = int((train['mileage'] < 100).sum())

# NOTE(review): rows priced above 1,000,000 are counted above but are not
# removed here -- confirm whether an upper price cut was intended.
train = train.loc[(train['price'] > 1000) & (train['mileage'] > 100)]
# Split the training frame into features (every column except 'price')
# and the regression target ('price').
X_train = train.loc[:, train.columns != 'price']
Y_train = train['price']

# Fit the linear regression model on the training data
clf = LinearRegression().fit(X_train, Y_train)

# Predict prices for the dev and test feature tables
dev_p = clf.predict(dev)
test_p = clf.predict(test)

# Evaluate on the dev set with RMSE. Arguments follow the documented
# (y_true, y_pred) order, and the square root is taken explicitly instead of
# passing `squared=False`, which newer scikit-learn releases no longer accept.
score = np.sqrt(mean_squared_error(dev_expected, dev_p))
print("RMSE: " + str(score))

# Save predictions as text, one value per line
dev_p.tofile('./dev-0/out.tsv', sep='\n')
test_p.tofile('./test-A/out.tsv', sep='\n')

1000
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff