Projekt_Uczenie_Maszynowe/main.ipynb
2021-06-16 17:11:47 +02:00

6.6 KiB

import pandas
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score
import torch
from torch import nn
from sklearn import preprocessing
import numpy as np
from sklearn.naive_bayes import GaussianNB

Przygotowanie danych

# Input locations: training set, expected dev-set values, dev-set features.
r_in = './train/train.tsv'
dev_expected = './dev-0/expected.tsv'
r_ind_ev = './dev-0/in.tsv'

# Expected dev-set values: a headerless, tab-separated file whose first column
# is used as the evaluation target.
# The library is imported as `pandas` (there is no `pd` alias in this file),
# so it must be called by that name.
# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in 2.0: malformed rows are skipped with a warning.
expected = pandas.read_csv(dev_expected, on_bad_lines='warn', header=None, sep="\t")
Y_test = expected[0]

# Training column names: read the whole './names' file, drop trailing
# newlines and split on tabs.
with open('./names') as f_names:
    names = f_names.read().rstrip('\n').split('\t')

# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in 2.0: malformed rows are skipped with a warning.
tsv_read = pandas.read_table(r_in, on_bad_lines='warn', sep='\t', names=names)
# The dev file has no target column, so its header is spelled out explicitly.
tsv_read_dev = pandas.read_table(r_ind_ev, on_bad_lines='warn', sep='\t',
                             names=['mileage', 'year', 'brand', 'engineType', 'engineCapacity'])


# One-hot encode the engine type; every other column is carried over as-is.
train = pandas.get_dummies(tsv_read, columns=['engineType'])

# Swap each remaining object-typed column for its integer category codes.
categorical_cols = train.select_dtypes(include=object).columns.values
for col in categorical_cols:
    as_category = train[col].astype('category')
    train[col] = as_category.cat.codes

# Keep only rows whose price is strictly greater than 1000.
above_min_price = train['price'] > 1000
train = train[above_min_price]

# Feature matrix: everything except the target column.
X = train.drop(columns='price')


# One-hot encode the engine type of the dev features.
dev = pandas.get_dummies(tsv_read_dev, columns=['engineType'])

# Encode the remaining text columns as integer codes. A code must denote the
# same value here as in the training frame, otherwise the fitted models read
# e.g. a different brand behind the same number. Encoding the dev set from its
# own sorted unique values does not guarantee that, so the category list is
# taken from the training data whenever the column was text-encoded there too.
categorical_cols1 = dev.select_dtypes(include=object).columns.values
for col in categorical_cols1:
    if col in tsv_read.columns and tsv_read[col].dtype == object:
        train_categories = tsv_read[col].astype('category').cat.categories
        # Values that never occur in the training data receive the code -1.
        dev[col] = pandas.Categorical(dev[col], categories=train_categories).codes
    else:
        # No matching text column in the training data: encode locally.
        dev[col] = dev[col].astype('category').cat.codes

# Quick sanity check of element counts and the resulting dev columns.
print(X.size)
print(dev.size)
print(Y_test.size)
print(dev.columns)
335531
7000
1000
Index(['mileage', 'year', 'brand', 'engineCapacity', 'engineType_benzyna',
       'engineType_diesel', 'engineType_gaz'],
      dtype='object')

Regresja Liniowa

# Fit ordinary least squares on the training features and predict on dev.
clf = LinearRegression().fit(X, train['price'])
predictions = clf.predict(dev)

# The former `test = pandas.get_dummies(tsv_read_test_A, ...)` statement was
# dropped: `tsv_read_test_A` is never defined in this file (NameError) and the
# resulting `test` frame was not used anywhere.
print("MSE: ", mean_squared_error(Y_test, predictions))
MSE:  1163801682.3714898

PyTorch regresja logistyczna

# Single source of truth for the feature columns and their order, so the
# training and dev matrices are guaranteed to line up column by column.
feature_columns = ['mileage', 'year', 'brand', 'engineCapacity',
                   'engineType_benzyna', 'engineType_diesel', 'engineType_gaz']

# torch.from_numpy keeps the NumPy dtype; cast everything to float32 so the
# tensors match the default dtype of nn.Linear parameters.
dev = dev[feature_columns].astype(np.float32)
X = X[feature_columns].astype(np.float32)
ytrain = train['price'].astype(np.float32)
Y_test = Y_test.astype(np.float32)


torch_tensor_X = torch.from_numpy(X.values)
# Targets as a column vector (n_samples, 1) to match the model output shape.
# -1 lets NumPy infer the row count instead of hard-coding the size of one
# particular training set.
torch_tensor_Y = torch.from_numpy(ytrain.values.reshape(-1, 1))
torch_tensor_dev = torch.from_numpy(dev.values)
torch_tensor_Y_test = torch.from_numpy(Y_test.values)
class LogisticRegressionModel(nn.Module):
    """A single linear layer followed by an element-wise sigmoid.

    Args:
        input_dim: number of input features per sample.
        output_dim: number of output units.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply the linear layer to ``x`` and squash the result with sigmoid."""
        logits = self.linear(x)
        activated = self.sigmoid(logits)
        return activated


# Hyperparameters of the single-layer logistic model.
learning_rate = 0.0002
# Derive the input width from the data instead of hard-coding the column count.
input_dim = torch_tensor_X.shape[1]
output_dim = 1

model = LogisticRegressionModel(input_dim, output_dim)
criterion = torch.nn.BCELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# BCELoss is only defined for targets in [0, 1], while the targets here are
# raw prices. Divide by the largest training price so the targets fit that
# range, and multiply predictions by the same factor to get back to prices.
# NOTE(review): assumes the largest training price is positive; the training
# frame is filtered to price > 1000 earlier in this file.
price_scale = torch_tensor_Y.max()
scaled_targets = torch_tensor_Y / price_scale

# Full-batch gradient descent: one optimizer step per epoch.
model.train()
for epoch in range(10):
    optimizer.zero_grad()
    # Forward pass
    y_pred = model(torch_tensor_X)
    # Compute loss against the rescaled targets
    loss = criterion(y_pred, scaled_targets)
    # Backward pass
    loss.backward()
    optimizer.step()

# Inference needs no autograd graph.
model.eval()
with torch.no_grad():
    predictions = model(torch_tensor_dev) * price_scale

# The model emits one column per sample, so argmax over axis 1 would always be
# 0 and the metric would compare the targets with an all-zero vector. Use the
# predicted values themselves, flattened to match the 1-D target array.
print("MSE: ", mean_squared_error(torch_tensor_Y_test.numpy(), predictions.numpy().ravel()))
MSE:  4107035476.14

Naiwny Bayes

# Gaussian naive Bayes fitted with the price as the label, then scored with
# the same MSE metric as the other models.
gnb = GaussianNB()
gnb.fit(X, train['price'])
predictions = gnb.predict(dev)
nb_mse = mean_squared_error(Y_test, predictions)
print("MSE: ", nb_mse)
MSE:  1648858588.032