forked from kubapok/auta-public
LinearRegression
This commit is contained in:
parent
fc12d3b07a
commit
811aceb045
1000
dev-0/out.tsv
Normal file
1000
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
59
main.py
59
main.py
@ -1,5 +1,9 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
from sklearn.linear_model import LinearRegression
|
||||||
|
from sklearn.metrics import mean_squared_error
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
# Read column names
|
# Read column names
|
||||||
col_names = []
|
col_names = []
|
||||||
@ -13,47 +17,64 @@ test = pd.read_table('test-A/in.tsv', error_bad_lines=False,
|
|||||||
header=None, names=col_names[1:])
|
header=None, names=col_names[1:])
|
||||||
train = pd.read_table('train/train.tsv', error_bad_lines=False,
|
train = pd.read_table('train/train.tsv', error_bad_lines=False,
|
||||||
header=None, names=col_names)
|
header=None, names=col_names)
|
||||||
test_expected = pd.read_table('dev-0/expected.tsv', error_bad_lines=False,
|
dev_expected = pd.read_table('dev-0/expected.tsv', error_bad_lines=False,
|
||||||
header=None)
|
header=None)
|
||||||
|
|
||||||
# Create dummies for brand
|
# Encode object (string) columns as integer category codes
|
||||||
train = pd.get_dummies(train, columns=['engineType'])
|
for c in train.select_dtypes(include=object).columns.values:
|
||||||
|
train[c] = train[c].astype("category").cat.codes
|
||||||
|
for c in dev.select_dtypes(include=object).columns.values:
|
||||||
|
dev[c] = dev[c].astype("category").cat.codes
|
||||||
|
for c in test.select_dtypes(include=object).columns.values:
|
||||||
|
test[c] = test[c].astype("category").cat.codes
|
||||||
|
|
||||||
# Check how many outlier values there are for price
|
# Check how many outlier values there are for price
|
||||||
fig, ax = plt.subplots(1, 2)
|
# fig, ax = plt.subplots(1, 2)
|
||||||
fig.set_figheight(15)
|
# fig.set_figheight(15)
|
||||||
fig.set_figwidth(20)
|
# fig.set_figwidth(20)
|
||||||
ax[0].boxplot(train['price'])
|
# ax[0].boxplot(train['price'])
|
||||||
ax[0].set_title('price')
|
# ax[0].set_title('price')
|
||||||
ax[1].boxplot(train['mileage'])
|
# ax[1].boxplot(train['mileage'])
|
||||||
ax[1].set_title('mileage')
|
# ax[1].set_title('mileage')
|
||||||
plt.show()
|
# plt.show()
|
||||||
|
|
||||||
# Remove outlier values
|
# Remove outlier values
|
||||||
priceMin = 0
|
priceMin = 0
|
||||||
for price in train['price']:
|
for price in train['price']:
|
||||||
if price < 1000:
|
if price < 1000:
|
||||||
priceMin += 1
|
priceMin += 1
|
||||||
print("Price min cut: " + str(priceMin))
|
# print("Price min cut: " + str(priceMin))
|
||||||
|
|
||||||
priceMax = 0
|
priceMax = 0
|
||||||
for price in train['price']:
|
for price in train['price']:
|
||||||
if price > 1000000:
|
if price > 1000000:
|
||||||
priceMin += 1
|
priceMin += 1
|
||||||
print("Price max cut: " + str(priceMax))
|
# print("Price max cut: " + str(priceMax))
|
||||||
|
|
||||||
mileageMin = 0
|
mileageMin = 0
|
||||||
for m in train['mileage']:
|
for m in train['mileage']:
|
||||||
if m < 100:
|
if m < 100:
|
||||||
mileageMin += 1
|
mileageMin += 1
|
||||||
print("Mileage min cut: " + str(mileageMin))
|
# print("Mileage min cut: " + str(mileageMin))
|
||||||
|
|
||||||
train = train.loc[(train['price'] > 1000)]
|
train = train.loc[(train['price'] > 1000)]
|
||||||
train = train.loc[(train['mileage'] > 100)]
|
train = train.loc[(train['mileage'] > 100)]
|
||||||
|
|
||||||
# Split train set to X and Y
|
# Split train set to X and Y
|
||||||
X = train.loc[:, train.columns != 'price']
|
X_train = train.loc[:, train.columns != 'price']
|
||||||
Y = train['price']
|
Y_train = train['price']
|
||||||
|
|
||||||
# print(train)
|
# Create linear regression model
|
||||||
# print(col_names)
|
clf = LinearRegression().fit(X_train, Y_train)
|
||||||
|
|
||||||
|
# # Predict
|
||||||
|
dev_p = clf.predict(dev)
|
||||||
|
test_p = clf.predict(test)
|
||||||
|
|
||||||
|
# # Evaluate (RMSE on the dev set)
|
||||||
|
score = mean_squared_error(dev_p, dev_expected, squared=False)
|
||||||
|
print("RMSE: " + str(score))
|
||||||
|
|
||||||
|
# # Save to files
|
||||||
|
dev_p.tofile('./dev-0/out.tsv', sep='\n')
|
||||||
|
test_p.tofile('./test-A/out.tsv', sep='\n')
|
||||||
|
1000
test-A/out.tsv
Normal file
1000
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user