{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "exposed-browse", "metadata": {}, "outputs": [], "source": [ "import pandas as pd \n", "import numpy as np\n", "import matplotlib.pyplot as plt \n", "import seaborn as sns\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import confusion_matrix, classification_report\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.svm import SVC\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.neighbors import KNeighborsClassifier\n", "import xgboost as xgb\n", "import torch\n", "\n", "def eval(model,x_test,y_test, nn = None):\n", " if nn == None:\n", " print(classification_report(y_test,model.predict(x_test)))\n", " else:\n", " y_pred = model(torch.tensor(x_test.astype(np.float32)))\n", " y_pred = y_pred.cpu().detach().numpy() \n", " y_pred = (y_pred > 0.5)\n", " y_pred = np.asarray(y_pred, dtype=np.int32)\n", " print(classification_report(y_test,y_pred))\n", " " ] }, { "cell_type": "code", "execution_count": 2, "id": "unlimited-modern", "metadata": {}, "outputs": [], "source": [ "data = pd.read_csv('water_potability.csv')" ] }, { "cell_type": "code", "execution_count": 3, "id": "simplified-ranch", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
phHardnessSolidsChloraminesSulfateConductivityOrganic_carbonTrihalomethanesTurbidityPotability
count2785.0000003276.0000003276.0000003276.0000002495.0000003276.0000003276.0000003114.0000003276.0000003276.000000
mean7.080795196.36949622014.0925267.122277333.775777426.20511114.28497066.3962933.9667860.390110
std1.59432032.8797618768.5708281.58308541.41684080.8240643.30816216.1750080.7803820.487849
min0.00000047.432000320.9426110.352000129.000000181.4837542.2000000.7380001.4500000.000000
25%6.093092176.85053815666.6902976.127421307.699498365.73441412.06580155.8445363.4397110.000000
50%7.036752196.96762720927.8336077.130299333.073546421.88496814.21833866.6224853.9550280.000000
75%8.062066216.66745627332.7621278.114887359.950170481.79230416.55765277.3374734.5003201.000000
max14.000000323.12400061227.19600813.127000481.030642753.34262028.300000124.0000006.7390001.000000
\n", "
" ], "text/plain": [ " ph Hardness Solids Chloramines Sulfate \\\n", "count 2785.000000 3276.000000 3276.000000 3276.000000 2495.000000 \n", "mean 7.080795 196.369496 22014.092526 7.122277 333.775777 \n", "std 1.594320 32.879761 8768.570828 1.583085 41.416840 \n", "min 0.000000 47.432000 320.942611 0.352000 129.000000 \n", "25% 6.093092 176.850538 15666.690297 6.127421 307.699498 \n", "50% 7.036752 196.967627 20927.833607 7.130299 333.073546 \n", "75% 8.062066 216.667456 27332.762127 8.114887 359.950170 \n", "max 14.000000 323.124000 61227.196008 13.127000 481.030642 \n", "\n", " Conductivity Organic_carbon Trihalomethanes Turbidity Potability \n", "count 3276.000000 3276.000000 3114.000000 3276.000000 3276.000000 \n", "mean 426.205111 14.284970 66.396293 3.966786 0.390110 \n", "std 80.824064 3.308162 16.175008 0.780382 0.487849 \n", "min 181.483754 2.200000 0.738000 1.450000 0.000000 \n", "25% 365.734414 12.065801 55.844536 3.439711 0.000000 \n", "50% 421.884968 14.218338 66.622485 3.955028 0.000000 \n", "75% 481.792304 16.557652 77.337473 4.500320 1.000000 \n", "max 753.342620 28.300000 124.000000 6.739000 1.000000 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.describe()" ] }, { "cell_type": "code", "execution_count": 4, "id": "present-wedding", "metadata": {}, "outputs": [], "source": [ "data = data.dropna()" ] }, { "cell_type": "code", "execution_count": 5, "id": "informal-allergy", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(20,17))\n", "matrix = np.triu(data.corr())\n", "sns.heatmap(data.corr(), annot=True, linewidth=0, mask=matrix)" ] }, { "cell_type": "code", "execution_count": 6, "id": "respiratory-assumption", "metadata": {}, "outputs": [], "source": [ "data_feat = data.iloc[:, :-1]\n", "data_target = data.iloc[:, -1]\n", "st = StandardScaler()\n", "data_feat = st.fit_transform(data_feat)" ] }, { "cell_type": "code", "execution_count": 7, "id": "fallen-buffer", "metadata": {}, "outputs": [], "source": [ "X_train, X_test, Y_train, Y_test = train_test_split(data_feat, data_target, random_state = 42, test_size = 0.1)" ] }, { "cell_type": "code", "execution_count": 22, "id": "accomplished-afternoon", "metadata": {}, "outputs": [], "source": [ "DT = DecisionTreeClassifier(criterion = 'gini').fit(X_train, Y_train)" ] }, { "cell_type": "code", "execution_count": 23, "id": "laughing-revelation", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.72 0.67 0.69 123\n", " 1 0.53 0.59 0.56 79\n", "\n", " accuracy 0.64 202\n", " macro avg 0.63 0.63 0.63 202\n", "weighted avg 0.65 0.64 0.64 202\n", "\n" ] } ], "source": [ "eval(DT,X_test,Y_test)" ] }, { "cell_type": "code", "execution_count": 10, "id": "practical-perspective", "metadata": {}, "outputs": [], "source": [ "SV = SVC(gamma='scale').fit(X_train, Y_train)" ] }, { "cell_type": "code", "execution_count": 11, "id": "alive-trick", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.70 0.93 0.80 123\n", " 1 0.78 0.39 0.52 79\n", "\n", " accuracy 0.72 202\n", " macro avg 0.74 0.66 0.66 202\n", "weighted avg 0.73 0.72 0.69 202\n", "\n" ] } ], "source": [ "eval(SV,X_test,Y_test)" ] }, { "cell_type": "code", "execution_count": 12, "id": "brave-islam", "metadata": {}, "outputs": [], "source": [ "import xgboost as xgb\n", "xgb_model = xgb.XGBClassifier(objective=\"binary:logistic\",\n", " eval_metric='logloss',\n", " use_label_encoder = False,\n", " booster='gbtree',reg_lambda=3).fit(X_train, Y_train)" ] }, { "cell_type": "code", "execution_count": 13, "id": "controlling-ordinary", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.67 0.72 0.69 123\n", " 1 0.50 0.44 0.47 79\n", "\n", " accuracy 0.61 202\n", " macro avg 0.58 0.58 0.58 202\n", "weighted avg 0.60 0.61 0.60 202\n", "\n" ] } ], "source": [ "eval(xgb_model,X_test,Y_test)" ] }, { "cell_type": "code", "execution_count": 24, "id": "fluid-responsibility", "metadata": {}, "outputs": [], "source": [ "kn = KNeighborsClassifier(n_neighbors=1,\n", " algorithm='kd_tree').fit(X_train, Y_train)" ] }, { "cell_type": "code", "execution_count": 25, "id": "coordinate-steal", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.67 0.71 0.69 123\n", " 1 0.50 0.46 0.48 79\n", "\n", " accuracy 0.61 202\n", " macro avg 0.58 0.58 0.58 202\n", "weighted avg 0.60 0.61 0.61 202\n", "\n" ] } ], "source": [ "eval(kn,X_test,Y_test)" ] }, { "cell_type": "code", "execution_count": 40, "id": "addressed-nothing", "metadata": {}, "outputs": [], "source": [ "class NeuralNetworkModel(torch.nn.Module):\n", "\n", " def __init__(self):\n", " super(NeuralNetworkModel, self).__init__()\n", " self.fc1 = torch.nn.Linear(9,2000)\n", " self.fc2 = torch.nn.Linear(2000,1000)\n", " self.fc3 = torch.nn.Linear(1000,1)\n", "\n", " def forward(self, x):\n", " x = self.fc1(x)\n", " x = torch.relu(x)\n", " x = self.fc2(x)\n", " x = torch.relu(x)\n", " x = self.fc3(x)\n", " x = torch.sigmoid(x)\n", " return x\n", " \n", "model_nn = NeuralNetworkModel()\n", "criterion = torch.nn.BCELoss()\n", "optimizer = torch.optim.SGD(model_nn.parameters(), lr=0.01)\n", " \n", "batch_size = 3\n", "\n", " \n", "for epoch in range(6):\n", " loss_score = 0\n", " acc_score = 0\n", " items_total = 0\n", " model_nn.train()\n", " for i in range(0, Y_train.to_numpy().shape[0], batch_size):\n", " X = X_train[i:i+batch_size]\n", " X = torch.tensor(X.astype(np.float32))\n", " Y = Y_train[i:i+batch_size].to_numpy()\n", " Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n", " \n", " Y_predictions = model_nn(X)\n", " acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n", " items_total += Y.shape[0] \n", "\n", " optimizer.zero_grad()\n", " loss = criterion(Y_predictions, Y)\n", " loss.backward()\n", " optimizer.step()\n", "\n", "\n", " loss_score += loss.item() * Y.shape[0]" ] }, { "cell_type": "code", "execution_count": 41, "id": "brilliant-surge", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.70 0.89 0.78 123\n", " 1 0.70 0.41 0.51 79\n", "\n", " accuracy 0.70 202\n", " macro avg 0.70 0.65 0.65 202\n", "weighted avg 0.70 0.70 0.68 202\n", "\n" ] } ], "source": [ "eval(model_nn,X_test,Y_test, 1)" ] }, { "cell_type": "code", "execution_count": 18, "id": "controlled-measure", "metadata": {}, "outputs": [], "source": [ "class LogisticRegressionModel(torch.nn.Module):\n", "\n", " def __init__(self):\n", " super(LogisticRegressionModel, self).__init__()\n", " self.fc = torch.nn.Linear(9,1)\n", "\n", " def forward(self, x):\n", " x = self.fc(x)\n", " x = torch.sigmoid(x)\n", " return x\n", " \n", "lr_model = LogisticRegressionModel()\n", "BATCH_SIZE = 2\n", "criterion = torch.nn.BCELoss()\n", "optimizer = torch.optim.Adam(lr_model.parameters(), lr = 0.1)\n", "loss_score = 0\n", "acc_score = 0\n", "items_total = 0\n", "lr_model.train()\n", "for i in range(0, Y_train.shape[0], BATCH_SIZE):\n", " X = X_train[i:i+BATCH_SIZE]\n", " X = torch.tensor(X.astype(np.float32))\n", " Y = Y_train[i:i+BATCH_SIZE].to_numpy()\n", " Y = torch.tensor(Y.astype(np.float32)).reshape(-1,1)\n", " Y_predictions = lr_model(X)\n", " acc_score += torch.sum((Y_predictions > 0.5) == Y).item()\n", " items_total += Y.shape[0] \n", " \n", " optimizer.zero_grad()\n", " loss = criterion(Y_predictions, Y)\n", " loss.backward()\n", " optimizer.step()\n", " \n", "\n", " loss_score += loss.item() * Y.shape[0] " ] }, { "cell_type": "code", "execution_count": 19, "id": "environmental-virginia", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " 0 0.63 0.80 0.71 123\n", " 1 0.47 0.28 0.35 79\n", "\n", " accuracy 0.59 202\n", " macro avg 0.55 0.54 0.53 202\n", "weighted avg 0.57 0.59 0.57 202\n", "\n" ] } ], "source": [ "eval(lr_model,X_test,Y_test, 1)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 5 }