Projekt_Uczenie_Maszynowe/main.ipynb
2021-06-16 17:11:47 +02:00

256 lines
6.6 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 144,
"id": "44a708aa",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"import pandas\n",
"from sklearn.linear_model import LinearRegression\n",
"from sklearn.metrics import mean_squared_error\n",
"from sklearn.metrics import precision_score\n",
"import torch\n",
"from torch import nn\n",
"from sklearn import preprocessing\n",
"import numpy as np\n",
"from sklearn.naive_bayes import GaussianNB"
]
},
{
"cell_type": "markdown",
"id": "7a4557aa",
"metadata": {},
"source": [
"Przygotowanie danych"
]
},
{
"cell_type": "code",
"execution_count": 146,
"id": "de736649",
"metadata": {},
"outputs": [],
"source": [
"r_in = './train/train.tsv'\n",
"dev_expected= './dev-0/expected.tsv'\n",
"r_ind_ev = './dev-0/in.tsv'\n",
"\n",
"expected = pd.read_csv(dev_expected, error_bad_lines=False, header=None, sep=\"\\t\")\n",
"Y_test = expected[0]\n",
"\n",
"with open('./names') as f_names:\n",
" names = f_names.read().rstrip('\\n').split('\\t')\n",
"\n",
"tsv_read = pandas.read_table(r_in, error_bad_lines=False, sep='\\t', names=names)\n",
"tsv_read_dev = pandas.read_table(r_ind_ev, error_bad_lines=False, sep='\\t',\n",
" names=['mileage', 'year', 'brand', 'engineType', 'engineCapacity'])\n",
"\n",
"\n",
"train = pandas.get_dummies(tsv_read, columns=['engineType'])\n",
"\n",
"categorical_cols = train.select_dtypes(include=object).columns.values\n",
"for col in categorical_cols:\n",
" train[col] = train[col].astype('category').cat.codes\n",
"\n",
"train = train.loc[(train['price'] > 1000)]\n",
"\n",
"X = train.loc[:, train.columns != 'price']\n",
"\n",
"\n",
"dev = pandas.get_dummies(tsv_read_dev, columns=['engineType'])\n",
"\n",
"categorical_cols1 = dev.select_dtypes(include=object).columns.values\n",
"for col in categorical_cols1:\n",
" dev[col] = dev[col].astype('category').cat.codes\n"
]
},
{
"cell_type": "code",
"execution_count": 147,
"id": "b8e71b16",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"335531\n",
"7000\n",
"1000\n",
"Index(['mileage', 'year', 'brand', 'engineCapacity', 'engineType_benzyna',\n",
" 'engineType_diesel', 'engineType_gaz'],\n",
" dtype='object')\n"
]
}
],
"source": [
"print(X.size)\n",
"print(dev.size)\n",
"print(Y_test.size)\n",
"print(dev.columns)\n"
]
},
{
"cell_type": "markdown",
"id": "add6af4d",
"metadata": {},
"source": [
"Regresja Liniowa"
]
},
{
"cell_type": "code",
"execution_count": 148,
"id": "ac09c69c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MSE: 1163801682.3714898\n"
]
}
],
"source": [
"clf = LinearRegression().fit(X, train['price'])\n",
"predictions = clf.predict(dev)\n",
"\n",
"test = pandas.get_dummies(tsv_read_test_A, columns=['engineType'])\n",
"print(\"MSE: \", mean_squared_error(Y_test, predictions))"
]
},
{
"cell_type": "markdown",
"id": "6e19cd2f",
"metadata": {},
"source": [
"Pytroch regresja logistyczna"
]
},
{
"cell_type": "code",
"execution_count": 149,
"id": "fb9d136a",
"metadata": {},
"outputs": [],
"source": [
"dev = dev[['mileage','year','brand','engineCapacity', 'engineType_benzyna', 'engineType_diesel', 'engineType_gaz']].astype(np.float32)\n",
"X = X[['mileage','year','brand','engineCapacity', 'engineType_benzyna', 'engineType_diesel', 'engineType_gaz']].astype(np.float32)\n",
"ytrain = train['price'].astype(np.float32)\n",
"Y_test = Y_test.astype(np.float32)\n",
"\n",
"\n",
"torch_tensor_X = torch.from_numpy(X.values)\n",
"torch_tensor_Y = torch.from_numpy(ytrain.values.reshape(47933,1))\n",
"torch_tensor_dev = torch.from_numpy(dev.values)\n",
"torch_tensor_Y_test = torch.from_numpy(Y_test.values)\n"
]
},
{
"cell_type": "code",
"execution_count": 151,
"id": "9d50f6f4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MSE: 4107035476.14\n"
]
}
],
"source": [
"class LogisticRegressionModel(nn.Module):\n",
" def __init__(self, input_dim, output_dim):\n",
" super(LogisticRegressionModel, self).__init__()\n",
" self.linear = nn.Linear(input_dim, output_dim)\n",
" self.sigmoid = nn.Sigmoid()\n",
" def forward(self, x):\n",
" out = self.linear(x)\n",
" return self.sigmoid(out)\n",
"\n",
"\n",
"learning_rate = 0.0002\n",
"input_dim = 7\n",
"output_dim = 1\n",
"\n",
"model = LogisticRegressionModel(input_dim, output_dim)\n",
"criterion = torch.nn.BCELoss(reduction='mean')\n",
"optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)\n",
"\n",
"for epoch in range(10):\n",
" # print (\"Epoch #\",epoch)\n",
" model.train()\n",
" optimizer.zero_grad()\n",
" # Forward pass\n",
" y_pred = model(torch_tensor_X)\n",
" # Compute Loss\n",
" loss = criterion(y_pred, torch_tensor_Y)\n",
" # print(loss.item())\n",
" # Backward pass\n",
" loss.backward()\n",
" optimizer.step()\n",
"predictions = model(torch_tensor_dev)\n",
"print(\"MSE: \", mean_squared_error(torch_tensor_Y_test, np.argmax(predictions.detach().numpy(), axis=1)))"
]
},
{
"cell_type": "markdown",
"id": "995ea3a5",
"metadata": {},
"source": [
"Naiwny Bayes"
]
},
{
"cell_type": "code",
"execution_count": 152,
"id": "0aa24c4c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MSE: 1648858588.032\n"
]
}
],
"source": [
"gnb = GaussianNB()\n",
"predictions = gnb.fit(X, train['price']).predict(dev)\n",
"print(\"MSE: \", mean_squared_error(Y_test, predictions))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}