projekt_rozpoznywanie_grzybow/mushrooms.ipynb
2023-02-03 15:47:49 +01:00

1586 lines
47 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('mushrooms/train/train.tsv',sep='\\t', header=None)\n",
"X_ver = pd.read_csv('mushrooms/dev-0/in.tsv',sep='\\t', header=None)\n",
"y_ver = pd.read_csv('mushrooms/dev-0/expected.tsv',sep='\\t', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 0 1 2 3 4 5 6 7 8 9 ... 13 14 15 16 17 18 19 20 21 22\n",
"0 p x s n t p f c n k ... s w w p w o p k s u\n",
"1 e x s y t a f c b k ... s w w p w o p n n g\n",
"2 p x y w t p f c n n ... s w w p w o p k s u\n",
"3 e x s g f n f w b k ... s w w p w o e n a g\n",
"4 e x y y t a f c b n ... s w w p w o p k n g\n",
"\n",
"[5 rows x 23 columns]\n",
"0 0\n",
"1 0\n",
"2 0\n",
"3 0\n",
"4 0\n",
"5 0\n",
"6 0\n",
"7 0\n",
"8 0\n",
"9 0\n",
"10 0\n",
"11 0\n",
"12 0\n",
"13 0\n",
"14 0\n",
"15 0\n",
"16 0\n",
"17 0\n",
"18 0\n",
"19 0\n",
"20 0\n",
"21 0\n",
"22 0\n",
"dtype: int64\n",
"(6465, 23)\n"
]
}
],
"source": [
"print(df.head())\n",
"print(df.isna().sum())\n",
"print(df.shape)"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 2\n",
"1 6\n",
"2 4\n",
"3 10\n",
"4 2\n",
"5 9\n",
"6 2\n",
"7 2\n",
"8 2\n",
"9 12\n",
"10 2\n",
"11 5\n",
"12 4\n",
"13 4\n",
"14 9\n",
"15 9\n",
"16 1\n",
"17 4\n",
"18 3\n",
"19 5\n",
"20 9\n",
"21 6\n",
"22 7\n",
"dtype: int64"
]
},
"execution_count": 168,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.nunique()"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 6\n",
"1 3\n",
"2 10\n",
"3 2\n",
"4 9\n",
"5 2\n",
"6 2\n",
"7 2\n",
"8 12\n",
"9 2\n",
"10 5\n",
"11 4\n",
"12 4\n",
"13 9\n",
"14 9\n",
"15 1\n",
"16 4\n",
"17 3\n",
"18 5\n",
"19 9\n",
"20 6\n",
"21 7\n",
"dtype: int64"
]
},
"execution_count": 169,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_ver.nunique()"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0_b</th>\n",
" <th>0_c</th>\n",
" <th>0_f</th>\n",
" <th>0_k</th>\n",
" <th>0_s</th>\n",
" <th>0_x</th>\n",
" <th>1_s</th>\n",
" <th>1_y</th>\n",
" <th>1_f</th>\n",
" <th>1_g</th>\n",
" <th>...</th>\n",
" <th>20_s</th>\n",
" <th>20_v</th>\n",
" <th>20_y</th>\n",
" <th>21_d</th>\n",
" <th>21_g</th>\n",
" <th>21_l</th>\n",
" <th>21_m</th>\n",
" <th>21_p</th>\n",
" <th>21_u</th>\n",
" <th>21_w</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>787</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>788</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>789</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>790</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>791</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>792 rows × 117 columns</p>\n",
"</div>"
],
"text/plain": [
" 0_b 0_c 0_f 0_k 0_s 0_x 1_s 1_y 1_f 1_g ... 20_s 20_v 20_y \\\n",
"0 1 0 0 0 0 0 1 0 0 0 ... 0 0 0 \n",
"1 0 0 0 0 0 1 0 1 0 0 ... 0 1 0 \n",
"2 1 0 0 0 0 0 0 1 0 0 ... 0 0 0 \n",
"3 1 0 0 0 0 0 1 0 0 0 ... 1 0 0 \n",
"4 0 0 0 0 0 1 0 1 0 0 ... 0 0 0 \n",
".. ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n",
"787 0 0 0 0 0 1 1 0 0 0 ... 0 1 0 \n",
"788 0 0 1 0 0 0 1 0 0 0 ... 0 1 0 \n",
"789 0 0 0 1 0 0 1 0 0 0 ... 0 1 0 \n",
"790 0 0 0 1 0 0 0 1 0 0 ... 0 1 0 \n",
"791 0 0 0 0 0 1 1 0 0 0 ... 0 1 0 \n",
"\n",
" 21_d 21_g 21_l 21_m 21_p 21_u 21_w \n",
"0 0 0 0 1 0 0 0 \n",
"1 0 1 0 0 0 0 0 \n",
"2 0 0 0 1 0 0 0 \n",
"3 0 0 0 1 0 0 0 \n",
"4 0 0 0 1 0 0 0 \n",
".. ... ... ... ... ... ... ... \n",
"787 1 0 0 0 0 0 0 \n",
"788 0 0 1 0 0 0 0 \n",
"789 0 0 1 0 0 0 0 \n",
"790 0 0 1 0 0 0 0 \n",
"791 0 0 1 0 0 0 0 \n",
"\n",
"[792 rows x 117 columns]"
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_ver = pd.get_dummies(y_ver,columns=[0],drop_first=True)\n",
"df2_unique = df[2].unique()\n",
"X_ver[1] = pd.Categorical(X_ver[1],categories=df2_unique)\n",
"X_ver = pd.get_dummies(X_ver)\n",
"X_ver"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0_p</th>\n",
" <th>1_b</th>\n",
" <th>1_c</th>\n",
" <th>1_f</th>\n",
" <th>1_k</th>\n",
" <th>1_s</th>\n",
" <th>1_x</th>\n",
" <th>2_f</th>\n",
" <th>2_g</th>\n",
" <th>2_s</th>\n",
" <th>...</th>\n",
" <th>21_s</th>\n",
" <th>21_v</th>\n",
" <th>21_y</th>\n",
" <th>22_d</th>\n",
" <th>22_g</th>\n",
" <th>22_l</th>\n",
" <th>22_m</th>\n",
" <th>22_p</th>\n",
" <th>22_u</th>\n",
" <th>22_w</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6460</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6461</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6462</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6463</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6464</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6465 rows × 118 columns</p>\n",
"</div>"
],
"text/plain": [
" 0_p 1_b 1_c 1_f 1_k 1_s 1_x 2_f 2_g 2_s ... 21_s 21_v 21_y \\\n",
"0 1 0 0 0 0 0 1 0 0 1 ... 1 0 0 \n",
"1 0 0 0 0 0 0 1 0 0 1 ... 0 0 0 \n",
"2 1 0 0 0 0 0 1 0 0 0 ... 1 0 0 \n",
"3 0 0 0 0 0 0 1 0 0 1 ... 0 0 0 \n",
"4 0 0 0 0 0 0 1 0 0 0 ... 0 0 0 \n",
"... ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n",
"6460 1 0 0 0 1 0 0 0 0 0 ... 0 1 0 \n",
"6461 0 0 0 0 1 0 0 0 0 1 ... 0 0 0 \n",
"6462 0 0 0 1 0 0 0 0 0 1 ... 0 0 0 \n",
"6463 1 0 0 0 1 0 0 0 0 0 ... 0 1 0 \n",
"6464 0 0 0 0 0 0 1 0 0 1 ... 0 0 0 \n",
"\n",
" 22_d 22_g 22_l 22_m 22_p 22_u 22_w \n",
"0 0 0 0 0 0 1 0 \n",
"1 0 1 0 0 0 0 0 \n",
"2 0 0 0 0 0 1 0 \n",
"3 0 1 0 0 0 0 0 \n",
"4 0 1 0 0 0 0 0 \n",
"... ... ... ... ... ... ... ... \n",
"6460 1 0 0 0 0 0 0 \n",
"6461 0 0 1 0 0 0 0 \n",
"6462 0 0 1 0 0 0 0 \n",
"6463 0 0 1 0 0 0 0 \n",
"6464 0 0 1 0 0 0 0 \n",
"\n",
"[6465 rows x 118 columns]"
]
},
"execution_count": 171,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.get_dummies(df,columns=[0],drop_first=True)\n",
"df = pd.get_dummies(df)\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(4848, 117)\n",
"(1617, 117)\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X = df.loc[:, df.columns != '0_p']\n",
"y = df['0_p']\n",
"X_train,X_test,y_train,y_test = train_test_split(X,y)\n",
"print(X_train.shape)\n",
"print(X_test.shape)"
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>1_b</th>\n",
" <th>1_c</th>\n",
" <th>1_f</th>\n",
" <th>1_k</th>\n",
" <th>1_s</th>\n",
" <th>1_x</th>\n",
" <th>2_f</th>\n",
" <th>2_g</th>\n",
" <th>2_s</th>\n",
" <th>2_y</th>\n",
" <th>...</th>\n",
" <th>21_s</th>\n",
" <th>21_v</th>\n",
" <th>21_y</th>\n",
" <th>22_d</th>\n",
" <th>22_g</th>\n",
" <th>22_l</th>\n",
" <th>22_m</th>\n",
" <th>22_p</th>\n",
" <th>22_u</th>\n",
" <th>22_w</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>787</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>788</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>789</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>790</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>791</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>792 rows × 117 columns</p>\n",
"</div>"
],
"text/plain": [
" 1_b 1_c 1_f 1_k 1_s 1_x 2_f 2_g 2_s 2_y ... 21_s 21_v 21_y \\\n",
"0 1 0 0 0 0 0 1 0 0 0 ... 0 0 0 \n",
"1 0 0 0 0 0 1 0 1 0 0 ... 0 1 0 \n",
"2 1 0 0 0 0 0 0 1 0 0 ... 0 0 0 \n",
"3 1 0 0 0 0 0 1 0 0 0 ... 1 0 0 \n",
"4 0 0 0 0 0 1 0 1 0 0 ... 0 0 0 \n",
".. ... ... ... ... ... ... ... ... ... ... ... ... ... ... \n",
"787 0 0 0 0 0 1 1 0 0 0 ... 0 1 0 \n",
"788 0 0 1 0 0 0 1 0 0 0 ... 0 1 0 \n",
"789 0 0 0 1 0 0 1 0 0 0 ... 0 1 0 \n",
"790 0 0 0 1 0 0 0 1 0 0 ... 0 1 0 \n",
"791 0 0 0 0 0 1 1 0 0 0 ... 0 1 0 \n",
"\n",
" 22_d 22_g 22_l 22_m 22_p 22_u 22_w \n",
"0 0 0 0 1 0 0 0 \n",
"1 0 1 0 0 0 0 0 \n",
"2 0 0 0 1 0 0 0 \n",
"3 0 0 0 1 0 0 0 \n",
"4 0 0 0 1 0 0 0 \n",
".. ... ... ... ... ... ... ... \n",
"787 1 0 0 0 0 0 0 \n",
"788 0 0 1 0 0 0 0 \n",
"789 0 0 1 0 0 0 0 \n",
"790 0 0 1 0 0 0 0 \n",
"791 0 0 1 0 0 0 0 \n",
"\n",
"[792 rows x 117 columns]"
]
},
"execution_count": 173,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_ver.columns = X_test.columns\n",
"X_ver"
]
},
{
"cell_type": "code",
"execution_count": 174,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.00\n",
"1.00\n"
]
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"\n",
"poly = PolynomialFeatures(degree=2,include_bias=False)\n",
"X_poly = poly.fit_transform(X_train)\n",
"lr = LogisticRegression(C=10).fit(X_poly,y_train)\n",
"print('{:.2f}'.format(lr.score(X_poly,y_train)))\n",
"print('{:.2f}'.format(lr.score(poly.fit_transform(X_test),y_test)))"
]
},
{
"cell_type": "code",
"execution_count": 175,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.95\n",
"0.97\n"
]
}
],
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"\n",
"gnb = GaussianNB()\n",
"gnb.fit(X_train,y_train)\n",
"\n",
"print('{:.2f}'.format(gnb.score(X_train,y_train)))\n",
"print('{:.2f}'.format(gnb.score(X_test,y_test)))"
]
},
{
"cell_type": "code",
"execution_count": 176,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.00\n",
"1.00\n"
]
}
],
"source": [
"from sklearn.svm import SVC\n",
"\n",
"svc = SVC(kernel='rbf',C=10, gamma=0.1).fit(X_train,y_train)\n",
"\n",
"print('{:.2f}'.format(svc.score(X_train,y_train)))\n",
"print('{:.2f}'.format(svc.score(X_test,y_test)))"
]
},
{
"cell_type": "code",
"execution_count": 177,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/tonywesoly/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n",
" mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.00\n",
"1.00\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/tonywesoly/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n",
" mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n"
]
}
],
"source": [
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"knn = KNeighborsClassifier(n_neighbors=3).fit(X_train,y_train)\n",
"print('{:.2f}'.format(knn.score(X_train,y_train)))\n",
"print('{:.2f}'.format(knn.score(X_test,y_test)))"
]
},
{
"cell_type": "code",
"execution_count": 178,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.00\n",
"1.00\n"
]
}
],
"source": [
"from sklearn.neural_network import MLPClassifier\n",
"\n",
"mlp = MLPClassifier(activation='relu', hidden_layer_sizes=[10],solver='lbfgs').fit(X_train,y_train)\n",
"\n",
"print('{:.2f}'.format(mlp.score(X_train,y_train)))\n",
"print('{:.2f}'.format(mlp.score(X_test,y_test)))"
]
},
{
"cell_type": "code",
"execution_count": 179,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Bayes raport:\n",
"Accuracy score 0.91\n",
" precision recall f1-score support\n",
"\n",
" jadalne 1.00 0.82 0.90 406\n",
" trujące 0.84 1.00 0.91 386\n",
"\n",
" accuracy 0.91 792\n",
" macro avg 0.92 0.91 0.91 792\n",
"weighted avg 0.92 0.91 0.91 792\n",
"\n"
]
}
],
"source": [
"from sklearn.metrics import classification_report,accuracy_score\n",
"\n",
"pred_bayes = gnb.predict(X_ver)\n",
"print('Bayes raport:')\n",
"print('Accuracy score {:.2f}'.format(accuracy_score(y_ver,pred_bayes)))\n",
"print(classification_report(y_ver,pred_bayes,target_names=['jadalne','trujące']))"
]
},
{
"cell_type": "code",
"execution_count": 180,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Logistic Regression raport:\n",
"Accuracy score 1.00\n",
" precision recall f1-score support\n",
"\n",
" jadalne 1.00 1.00 1.00 406\n",
" trujące 1.00 1.00 1.00 386\n",
"\n",
" accuracy 1.00 792\n",
" macro avg 1.00 1.00 1.00 792\n",
"weighted avg 1.00 1.00 1.00 792\n",
"\n"
]
}
],
"source": [
"pred_log = lr.predict(poly.fit_transform(X_ver))\n",
"print('Logistic Regression raport:')\n",
"print('Accuracy score {:.2f}'.format(accuracy_score(y_ver,pred_log)))\n",
"print(classification_report(y_ver,pred_log,target_names=['jadalne','trujące']))"
]
},
{
"cell_type": "code",
"execution_count": 181,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Support Vector Machines raport:\n",
"Accuracy score 1.00\n",
" precision recall f1-score support\n",
"\n",
" jadalne 1.00 1.00 1.00 406\n",
" trujące 1.00 1.00 1.00 386\n",
"\n",
" accuracy 1.00 792\n",
" macro avg 1.00 1.00 1.00 792\n",
"weighted avg 1.00 1.00 1.00 792\n",
"\n"
]
}
],
"source": [
"pred_svc = svc.predict(X_ver)\n",
"print('Support Vector Machines raport:')\n",
"print('Accuracy score {:.2f}'.format(accuracy_score(y_ver,pred_svc)))\n",
"print(classification_report(y_ver,pred_svc,target_names=['jadalne','trujące']))"
]
},
{
"cell_type": "code",
"execution_count": 182,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"K-nearest neighbors raport:\n",
"Accuracy score 1.00\n",
" precision recall f1-score support\n",
"\n",
" jadalne 1.00 1.00 1.00 406\n",
" trujące 1.00 1.00 1.00 386\n",
"\n",
" accuracy 1.00 792\n",
" macro avg 1.00 1.00 1.00 792\n",
"weighted avg 1.00 1.00 1.00 792\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/tonywesoly/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.\n",
" mode, _ = stats.mode(_y[neigh_ind, k], axis=1)\n"
]
}
],
"source": [
"pred_knn = knn.predict(X_ver)\n",
"print('K-nearest neighbors raport:')\n",
"print('Accuracy score {:.2f}'.format(accuracy_score(y_ver,pred_knn)))\n",
"print(classification_report(y_ver,pred_knn,target_names=['jadalne','trujące']))"
]
},
{
"cell_type": "code",
"execution_count": 183,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Neural network raport:\n",
"Accuracy score 1.00\n",
" precision recall f1-score support\n",
"\n",
" jadalne 1.00 1.00 1.00 406\n",
" trujące 1.00 1.00 1.00 386\n",
"\n",
" accuracy 1.00 792\n",
" macro avg 1.00 1.00 1.00 792\n",
"weighted avg 1.00 1.00 1.00 792\n",
"\n"
]
}
],
"source": [
"pred_mlp = mlp.predict(X_ver)\n",
"print('Neural network raport:')\n",
"print('Accuracy score {:.2f}'.format(accuracy_score(y_ver,pred_mlp)))\n",
"print(classification_report(y_ver,pred_mlp,target_names=['jadalne','trujące']))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "fbbbb91f3443f337fad6219902aa19c75c8f48b69079f7de3a01210f85667a20"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}