{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44a708aa",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "import pandas\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.metrics import precision_score\n",
    "import torch\n",
    "from torch import nn\n",
    "from sklearn import preprocessing\n",
    "import numpy as np\n",
    "from sklearn.naive_bayes import GaussianNB"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7a4557aa",
   "metadata": {},
   "source": [
    "Przygotowanie danych"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de736649",
   "metadata": {},
   "outputs": [],
   "source": [
    "r_in = './train/train.tsv'\n",
    "dev_expected = './dev-0/expected.tsv'\n",
    "r_ind_ev = './dev-0/in.tsv'\n",
    "\n",
    "# `on_bad_lines='skip'` is the replacement for `error_bad_lines=False`, which was\n",
    "# deprecated in pandas 1.3 and removed in pandas 2.0.\n",
    "expected = pandas.read_csv(dev_expected, on_bad_lines='skip', header=None, sep='\\t')\n",
    "Y_test = expected[0]\n",
    "\n",
    "# The training file has no header row; its column names live in ./names (tab separated).\n",
    "with open('./names') as f_names:\n",
    "    names = f_names.read().rstrip('\\n').split('\\t')\n",
    "\n",
    "tsv_read = pandas.read_table(r_in, on_bad_lines='skip', sep='\\t', names=names)\n",
    "tsv_read_dev = pandas.read_table(r_ind_ev, on_bad_lines='skip', sep='\\t',\n",
    "                                 names=['mileage', 'year', 'brand', 'engineType', 'engineCapacity'])\n",
    "\n",
    "\n",
    "train = pandas.get_dummies(tsv_read, columns=['engineType'])\n",
    "\n",
    "# Integer-encode the remaining text columns and remember the categories seen in the\n",
    "# training data, so that the dev set can be encoded with exactly the same codes.\n",
    "train_categories = {}\n",
    "categorical_cols = train.select_dtypes(include=object).columns.values\n",
    "for col in categorical_cols:\n",
    "    as_category = train[col].astype('category')\n",
    "    train_categories[col] = as_category.cat.categories\n",
    "    train[col] = as_category.cat.codes\n",
    "\n",
    "train = train.loc[(train['price'] > 1000)]\n",
    "\n",
    "X = train.loc[:, train.columns != 'price']\n",
    "\n",
    "\n",
    "dev = pandas.get_dummies(tsv_read_dev, columns=['engineType'])\n",
    "\n",
    "categorical_cols1 = dev.select_dtypes(include=object).columns.values\n",
    "for col in categorical_cols1:\n",
    "    if col in train_categories:\n",
    "        # Reuse the training categories; values never seen in training get code -1.\n",
    "        dev[col] = pandas.Categorical(dev[col], categories=train_categories[col]).codes\n",
    "    else:\n",
    "        dev[col] = dev[col].astype('category').cat.codes\n",
    "\n",
    "# Give dev the same columns, in the same order, as the training features. A dummy\n",
    "# column that is absent from dev is filled with 0; one unknown to training is dropped.\n",
    "dev = dev.reindex(columns=X.columns, fill_value=0)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8e71b16",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(X.size)\n",
    "print(dev.size)\n",
    "print(Y_test.size)\n",
    "print(dev.columns)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "add6af4d",
   "metadata": {},
   "source": [
    "Regresja Liniowa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac09c69c",
   "metadata": {},
   "outputs": [],
   "source": [
    "clf = LinearRegression().fit(X, train['price'])\n",
    "predictions = clf.predict(dev)\n",
    "\n",
    "print(\"MSE: \", mean_squared_error(Y_test, predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6e19cd2f",
   "metadata": {},
   "source": [
    "PyTorch regresja logistyczna"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fb9d136a",
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_cols = ['mileage', 'year', 'brand', 'engineCapacity',\n",
    "                'engineType_benzyna', 'engineType_diesel', 'engineType_gaz']\n",
    "\n",
    "dev = dev[feature_cols].astype(np.float32)\n",
    "X = X[feature_cols].astype(np.float32)\n",
    "ytrain = train['price'].astype(np.float32)\n",
    "Y_test = Y_test.astype(np.float32)\n",
    "\n",
    "# Standardize the features with statistics taken from the training set only;\n",
    "# raw values such as mileage would otherwise saturate the sigmoid.\n",
    "scaler = preprocessing.StandardScaler().fit(X.values)\n",
    "\n",
    "# BCELoss expects targets in [0, 1], so the price is min-max scaled for training.\n",
    "# The same two numbers are used later to map predictions back to prices.\n",
    "price_min = float(ytrain.min())\n",
    "price_range = (float(ytrain.max()) - price_min) or 1.0  # avoid division by zero\n",
    "ytrain_scaled = ((ytrain.values - price_min) / price_range).astype(np.float32)\n",
    "\n",
    "torch_tensor_X = torch.from_numpy(scaler.transform(X.values).astype(np.float32))\n",
    "# reshape(-1, 1) instead of a hard-coded row count, so the cell survives data changes.\n",
    "torch_tensor_Y = torch.from_numpy(ytrain_scaled.reshape(-1, 1))\n",
    "torch_tensor_dev = torch.from_numpy(scaler.transform(dev.values).astype(np.float32))\n",
    "torch_tensor_Y_test = torch.from_numpy(Y_test.values)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9d50f6f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "class LogisticRegressionModel(nn.Module):\n",
    "    \"\"\"A single linear layer followed by a sigmoid.\n",
    "\n",
    "    Args:\n",
    "        input_dim: number of input features.\n",
    "        output_dim: number of outputs per sample.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, input_dim, output_dim):\n",
    "        super().__init__()\n",
    "        self.linear = nn.Linear(input_dim, output_dim)\n",
    "        self.sigmoid = nn.Sigmoid()\n",
    "\n",
    "    def forward(self, x):\n",
    "        \"\"\"Return sigmoid(linear(x)); every output lies in (0, 1).\"\"\"\n",
    "        out = self.linear(x)\n",
    "        return self.sigmoid(out)\n",
    "\n",
    "\n",
    "torch.manual_seed(0)  # reproducible weight initialisation\n",
    "\n",
    "# Hyper-parameters kept from the original notebook.\n",
    "# NOTE(review): 10 full-batch SGD steps at this learning rate are probably far too\n",
    "# few to fit the data - consider raising n_epochs and/or learning_rate.\n",
    "learning_rate = 0.0002\n",
    "n_epochs = 10\n",
    "input_dim = torch_tensor_X.shape[1]\n",
    "output_dim = 1\n",
    "\n",
    "model = LogisticRegressionModel(input_dim, output_dim)\n",
    "criterion = torch.nn.BCELoss(reduction='mean')\n",
    "optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)\n",
    "\n",
    "model.train()\n",
    "for epoch in range(n_epochs):\n",
    "    optimizer.zero_grad()\n",
    "    # Forward pass on the whole training set (full-batch gradient descent)\n",
    "    y_pred = model(torch_tensor_X)\n",
    "    # Targets are the min-max scaled prices, i.e. values in [0, 1]\n",
    "    loss = criterion(y_pred, torch_tensor_Y)\n",
    "    # Backward pass\n",
    "    loss.backward()\n",
    "    optimizer.step()\n",
    "\n",
    "model.eval()\n",
    "with torch.no_grad():\n",
    "    scaled_predictions = model(torch_tensor_dev)\n",
    "\n",
    "# Map the (0, 1) outputs back to the price scale. The previous np.argmax(..., axis=1)\n",
    "# over an (N, 1) array always produced 0, so the reported MSE was meaningless.\n",
    "predictions = scaled_predictions.numpy().ravel() * price_range + price_min\n",
    "print(\"MSE: \", mean_squared_error(torch_tensor_Y_test.numpy(), predictions))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "995ea3a5",
   "metadata": {},
   "source": [
    "Naiwny Bayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0aa24c4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): GaussianNB is a classifier, so every distinct price is treated as\n",
    "# its own class label here - confirm this is the intended baseline.\n",
    "gnb = GaussianNB()\n",
    "predictions = gnb.fit(X, train['price']).predict(dev)\n",
    "print(\"MSE: \", mean_squared_error(Y_test, predictions))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}