PhishGuardian/backend/ML.ipynb

1887 lines
887 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b313cab7d5cc49c0",
"metadata": {
"ExecuteTime": {
"start_time": "2024-06-05T20:03:23.481431Z"
},
"jupyter": {
"is_executing": true
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (2.2.2)Note: you may need to restart the kernel to use updated packages.\n",
"\n",
"Requirement already satisfied: numpy>=1.26.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (1.26.4)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from pandas) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
"Requirement already satisfied: matplotlib in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (3.9.0)\n",
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (1.2.1)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (4.53.0)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (1.4.5)\n",
"Requirement already satisfied: numpy>=1.23 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (1.26.4)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from matplotlib) (24.0)\n",
"Requirement already satisfied: pillow>=8 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (10.3.0)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (3.1.2)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from matplotlib) (2.9.0.post0)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: nltk in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (3.8.1)\n",
"Requirement already satisfied: click in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from nltk) (8.1.7)\n",
"Requirement already satisfied: joblib in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from nltk) (1.4.2)\n",
"Requirement already satisfied: regex>=2021.8.3 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from nltk) (2024.5.15)\n",
"Requirement already satisfied: tqdm in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from nltk) (4.66.4)\n",
"Requirement already satisfied: colorama in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from click->nltk) (0.4.6)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: wordcloud in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (1.9.3)\n",
"Requirement already satisfied: numpy>=1.6.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from wordcloud) (1.26.4)\n",
"Requirement already satisfied: pillow in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from wordcloud) (10.3.0)\n",
"Requirement already satisfied: matplotlib in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from wordcloud) (3.9.0)\n",
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib->wordcloud) (1.2.1)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib->wordcloud) (0.12.1)\n",
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib->wordcloud) (4.53.0)\n",
"Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib->wordcloud) (1.4.5)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from matplotlib->wordcloud) (24.0)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib->wordcloud) (3.1.2)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from matplotlib->wordcloud) (2.9.0.post0)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: scikit-learn==1.3.2 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (1.3.2)\n",
"Requirement already satisfied: numpy<2.0,>=1.17.3 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn==1.3.2) (1.26.4)\n",
"Requirement already satisfied: scipy>=1.5.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn==1.3.2) (1.13.1)\n",
"Requirement already satisfied: joblib>=1.1.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn==1.3.2) (1.4.2)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn==1.3.2) (3.5.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: scikit-fuzzy==0.4.2 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (0.4.2)\n",
"Requirement already satisfied: numpy>=1.6.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-fuzzy==0.4.2) (1.26.4)\n",
"Requirement already satisfied: scipy>=0.9.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-fuzzy==0.4.2) (1.13.1)\n",
"Requirement already satisfied: networkx>=1.9.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-fuzzy==0.4.2) (3.3)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to\n",
"[nltk_data] C:\\Users\\alicj\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to\n",
"[nltk_data] C:\\Users\\alicj\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
}
],
"source": [
"%pip install pandas\n",
"%pip install matplotlib\n",
"%pip install nltk\n",
"%pip install wordcloud\n",
"%pip install scikit-learn==1.3.2\n",
"%pip install scikit-fuzzy==0.4.2\n",
"# Import pakietów\n",
"import nltk\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import re\n",
"import string\n",
"from wordcloud import WordCloud\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import PorterStemmer\n",
"from nltk.tokenize import word_tokenize\n",
"import joblib\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "768266dbb79c5e9d",
"metadata": {},
"outputs": [],
"source": [
"# Load the e-mail dataset; the relative path is resolved against the kernel's working directory.\n",
"# NOTE(review): the printed frame shows an 'Unnamed: 0' column, which looks like a saved index -\n",
"# consider pd.read_csv(..., index_col=0) once confirmed that nothing downstream uses that column.\n",
"data_path = \"joined_data.csv\"\n",
"data = pd.read_csv(data_path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ee08266d5c30627b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Unnamed: 0 Body Label\n",
"0 0 Subject: congratulations\\n vince ,\\n congratul... 0\n",
"1 1 \\nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0\n",
"2 2 Big and big\\nMAIN PAGE\\nHuge big titties @ big... 1\n",
"3 3 Subject: re : enron visit - - thanks\\n larry ,... 0\n",
"4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0\n"
]
}
],
"source": [
"# Preview the first five rows to sanity-check the columns that were loaded.\n",
"print(data.head())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "1798f605e33fe5e5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 18651 entries, 0 to 18650\n",
"Data columns (total 3 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Unnamed: 0 18651 non-null int64 \n",
" 1 Body 18650 non-null object\n",
" 2 Label 18651 non-null int64 \n",
"dtypes: int64(2), object(1)\n",
"memory usage: 437.3+ KB\n",
"None\n"
]
}
],
"source": [
"# DataFrame.info() writes its summary to stdout itself and returns None, so it is called\n",
"# directly; wrapping it in print() only appended a stray 'None' line to the output.\n",
"data.info()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b4f43d913b92485b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Body</th>\n",
" <th>Label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Subject: congratulations\\n vince ,\\n congratul...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>\\nhttp://news.bbc.co.uk/1/hi/scotland/2515231....</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Big and big\\nMAIN PAGE\\nHuge big titties @ big...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>Subject: re : enron visit - - thanks\\n larry ,...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18646</th>\n",
" <td>18646</td>\n",
" <td>Subject: fluid analysis\\n our customer speak v...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18647</th>\n",
" <td>18647</td>\n",
" <td>Subject: guadalupe\\n i rolled 740208 , 740209 ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18648</th>\n",
" <td>18648</td>\n",
" <td>100% Free Porn!\\nWhat more can you ask for?\\nC...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18649</th>\n",
" <td>18649</td>\n",
" <td>Subject: revised nominations\\n daren ,\\n we ha...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18650</th>\n",
" <td>18650</td>\n",
" <td>Hello,\\nI've got a small problem but still ann...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>18651 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Body Label\n",
"0 0 Subject: congratulations\\n vince ,\\n congratul... 0\n",
"1 1 \\nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0\n",
"2 2 Big and big\\nMAIN PAGE\\nHuge big titties @ big... 1\n",
"3 3 Subject: re : enron visit - - thanks\\n larry ,... 0\n",
"4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0\n",
"... ... ... ...\n",
"18646 18646 Subject: fluid analysis\\n our customer speak v... 1\n",
"18647 18647 Subject: guadalupe\\n i rolled 740208 , 740209 ... 0\n",
"18648 18648 100% Free Porn!\\nWhat more can you ask for?\\nC... 1\n",
"18649 18649 Subject: revised nominations\\n daren ,\\n we ha... 0\n",
"18650 18650 Hello,\\nI've got a small problem but still ann... 0\n",
"\n",
"[18651 rows x 3 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Render the whole frame; pandas abbreviates it to the first and last five rows.\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e3bf0f04a2be4e1a",
"metadata": {},
"outputs": [],
"source": [
"# Usuwamy NaN"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "71a6bbebdb0dccd4",
"metadata": {},
"outputs": [],
"source": [
"# Discard every row that has a missing value in any column.\n",
"data = data.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "b7fca25d67381cdd",
"metadata": {},
"outputs": [],
"source": [
"# Usuwamy puste wiadomości i wiadomości zawierające jedynie \"\\n\""
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "72d84bf6c1e7023a",
"metadata": {},
"outputs": [],
"source": [
"# Keep only the rows whose body is something other than a lone newline character.\n",
"data = data.loc[data['Body'].ne('\\n')]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "7c94c4dca6c4cdae",
"metadata": {},
"outputs": [],
"source": [
"# Keep only the rows whose body is not the literal string 'empty'.\n",
"data = data.loc[data['Body'].ne('empty')]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "7e6fd3f8014498f3",
"metadata": {},
"outputs": [],
"source": [
"# Renumber the rows from 0 now that filtering has left gaps in the index; the old index is discarded.\n",
"data = data.reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a0c33f82a936c59",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Body</th>\n",
" <th>Label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Subject: congratulations\\n vince ,\\n congratul...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>\\nhttp://news.bbc.co.uk/1/hi/scotland/2515231....</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Big and big\\nMAIN PAGE\\nHuge big titties @ big...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>Subject: re : enron visit - - thanks\\n larry ,...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18109</th>\n",
" <td>18646</td>\n",
" <td>Subject: fluid analysis\\n our customer speak v...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18110</th>\n",
" <td>18647</td>\n",
" <td>Subject: guadalupe\\n i rolled 740208 , 740209 ...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18111</th>\n",
" <td>18648</td>\n",
" <td>100% Free Porn!\\nWhat more can you ask for?\\nC...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18112</th>\n",
" <td>18649</td>\n",
" <td>Subject: revised nominations\\n daren ,\\n we ha...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18113</th>\n",
" <td>18650</td>\n",
" <td>Hello,\\nI've got a small problem but still ann...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>18114 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Body Label\n",
"0 0 Subject: congratulations\\n vince ,\\n congratul... 0\n",
"1 1 \\nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0\n",
"2 2 Big and big\\nMAIN PAGE\\nHuge big titties @ big... 1\n",
"3 3 Subject: re : enron visit - - thanks\\n larry ,... 0\n",
"4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0\n",
"... ... ... ...\n",
"18109 18646 Subject: fluid analysis\\n our customer speak v... 1\n",
"18110 18647 Subject: guadalupe\\n i rolled 740208 , 740209 ... 0\n",
"18111 18648 100% Free Porn!\\nWhat more can you ask for?\\nC... 1\n",
"18112 18649 Subject: revised nominations\\n daren ,\\n we ha... 0\n",
"18113 18650 Hello,\\nI've got a small problem but still ann... 0\n",
"\n",
"[18114 rows x 3 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inspect the frame again after the cleaning steps and the index reset.\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "19af5936d0cfeba2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Label\n",
"0 11124\n",
"1 6990\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"# Check the class balance: number of rows per value of the 'Label' target column.\n",
"print(data['Label'].value_counts())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "96c861e2655312cb",
"metadata": {},
"outputs": [],
"source": [
"# Analiza długości wiadomości"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "e1ec1ed8aa7c856d",
"metadata": {},
"outputs": [],
"source": [
"def get_len(row):\n",
"    \"\"\"Return len(row), or row itself when the value has no length.\n",
"\n",
"    Values that do not support len() (for example a float NaN) raise TypeError;\n",
"    they are passed through unchanged so the caller can still spot them.\n",
"    \"\"\"\n",
"    try:\n",
"        return len(row)\n",
"    except TypeError:\n",
"        # Only the 'object has no len()' case is tolerated; a bare except would also\n",
"        # swallow KeyboardInterrupt/SystemExit and hide genuine bugs.\n",
"        return row"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "63c023f34d234f3e",
"metadata": {},
"outputs": [],
"source": [
"# Store the length of every message body (as computed by get_len) in a new 'message_length' column.\n",
"data['message_length'] = data['Body'].map(get_len)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "d4fd0e2dcc2bfee9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Body</th>\n",
" <th>Label</th>\n",
" <th>message_length</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>16293</th>\n",
" <td>16774</td>\n",
" <td>\\n4623\\n</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6071</th>\n",
" <td>6254</td>\n",
" <td>Subject: \\n</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3683</th>\n",
" <td>3792</td>\n",
" <td>Subject: \\n</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12843</th>\n",
" <td>13228</td>\n",
" <td>Subject: \\n</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17867</th>\n",
" <td>18399</td>\n",
" <td>Subject: \\n</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6927</th>\n",
" <td>7128</td>\n",
" <td>------------------------ Yahoo! Groups Sponsor...</td>\n",
" <td>0</td>\n",
" <td>107989</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6887</th>\n",
" <td>7088</td>\n",
" <td>Subject: enron mentions\\n enron discusses cred...</td>\n",
" <td>0</td>\n",
" <td>121502</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2422</th>\n",
" <td>2488</td>\n",
" <td>=?GB2312?B?yNW12squ0ru97NbQufq5+rzKtefX08nosb...</td>\n",
" <td>1</td>\n",
" <td>129635</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1522</th>\n",
" <td>1569</td>\n",
" <td>change your settings: http://blo.gs/settings.p...</td>\n",
" <td>0</td>\n",
" <td>194978</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4844</th>\n",
" <td>4987</td>\n",
" <td>,Body,Label\\n 0,\"Subject: great part-time or s...</td>\n",
" <td>0</td>\n",
" <td>17085626</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>18114 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Body Label \\\n",
"16293 16774 \\n4623\\n 1 \n",
"6071 6254 Subject: \\n 1 \n",
"3683 3792 Subject: \\n 1 \n",
"12843 13228 Subject: \\n 1 \n",
"17867 18399 Subject: \\n 1 \n",
"... ... ... ... \n",
"6927 7128 ------------------------ Yahoo! Groups Sponsor... 0 \n",
"6887 7088 Subject: enron mentions\\n enron discusses cred... 0 \n",
"2422 2488 =?GB2312?B?yNW12squ0ru97NbQufq5+rzKtefX08nosb... 1 \n",
"1522 1569 change your settings: http://blo.gs/settings.p... 0 \n",
"4844 4987 ,Body,Label\\n 0,\"Subject: great part-time or s... 0 \n",
"\n",
" message_length \n",
"16293 6 \n",
"6071 10 \n",
"3683 10 \n",
"12843 10 \n",
"17867 10 \n",
"... ... \n",
"6927 107989 \n",
"6887 121502 \n",
"2422 129635 \n",
"1522 194978 \n",
"4844 17085626 \n",
"\n",
"[18114 rows x 4 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the rows ordered from shortest to longest message; the sorted copy is only displayed, not stored.\n",
"data.sort_values(by='message_length')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "e62112260ebc17f0",
"metadata": {},
"outputs": [],
"source": [
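"# Jedna wiadomość jest skrajnie długa: 17 085 626 znaków. Jej treść zaczyna się od ',Body,Label', więc najpewniej jest to cały plik CSV wklejony w jedno pole."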
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "7c369131e3c91ce3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"message_length\n",
"293 68\n",
"295 53\n",
"291 52\n",
"539 44\n",
"446 40\n",
" ..\n",
"2394 1\n",
"4856 1\n",
"6192 1\n",
"2597 1\n",
"4004 1\n",
"Name: count, Length: 4903, dtype: int64"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count how many messages share each exact length, most frequent lengths first.\n",
"data['message_length'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "b6b509692fd7c541",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIkCAYAAABxx+gQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAB4WElEQVR4nO3dfXzO9f////ux2Sm2OdtmmZlzc05hyVnGaElaheQsEpFQQiEnlaIIOflUznonRVFCmPOzOUnOSRFR2VTMlOz0+fvDb8fX0YaN7XhpbtfLxeWy4/l6vl7H4/U4Dtq91+t4HjZjjBEAAAAAwOlcrC4AAAAAAO5UBDIAAAAAsAiBDAAAAAAsQiADAAAAAIsQyAAAAADAIgQyAAAAALAIgQwAAAAALEIgAwAAAACLEMgAAMihKVOm6H//+5/VZSAL8+bN03vvvWd1GQCQbQQyAMgnmjZtqmrVqt1wXpkyZdStW7dce96TJ0/KZrNp7ty5N7X/3LlzZbPZdPLkSftY06ZN1bRp01ypL7e99957GjNmjOrXr5/jfW02m0aNGpX7RUkaNWqUbDZbrh4zt98reW3ZsmXq06ePateuneN9/2vnCiD/IJABQC7LCBgZfwoUKKC77rpL3bp106+//mp1ebgF+/bt04gRI7R06VJVrFjR6nJwlV9//VU9evTQ//73PzVs2NDqcgAg2wpYXQAA5FdjxoxRaGioLl++rO3bt2vu3LnasmWLDh48KE9PT6vLw004dOiQFi1apHvvvfem9v/nn39UoEDe/Kd3+PDhGjp0aJ4c+79g3759mjZtmqKjo29q/6NHj8rFhf9PDcD5CGQAkEdat26tu+++W5LUs2dPFS9eXG+99ZaWLl2qxx9/3OLqcDOeeOKJW9o/L4N4gQIF8izs/Rc88MADt7S/h4dHLlUCADnD/woCACdp1KiRJOn48eMO4+vWrVOjRo1UsGBB+fn5qW3btjpy5Ih9e8ZntK7153pWr14tb29vdezYUampqQ7bjh07JklKTk7WyJEjVbduXfn6+qpgwYJq1KiR1q9fn+l4CQkJ6tatm3x9feXn56euXbsqISEh2z04dOiQ7r//fnl5ealUqVJ67bXXlJ6efs35GTVu2LBBNptNGzZscNh+rc+vLVq0SGFhYfL09FS1atW0ZMkSdevWTWXKlHGY9/fff+uFF15QcHCwPDw8VKlSJb399tsyxjjMi4mJ0X333Sc/Pz8VKlRIlSpV0ssvv+ww5/Llyxo1apQqVqwoT09PlSxZUo888ojD632jz5AZY1S8eHENGjTIPpaeni4/Pz+5uro69Pqtt95SgQIF9Ndff0nK+jNkc+bM0f333y9/f395eHgoLCxMM2bMyPJ5X3vtNZUqVUre3t5q1qyZDh06lGWNP/30kx577DEVLVpU3t7eatCggZYvX+4wJ+P1WrhwoUaPHq277rpLhQsX1qOPPqoLFy4oKSlJAwYMkL+/vwoVKqTu3bsrKSnJ4RipqakaO3asypUrJw8PD5UpU0Yvv/xypnnffvutIiMjVbx4cXl5eSk0NFRPPfWUw5z09HRNnjxZ1atXl6enp0qUKKFWrVrp22+/tc/hM2QArHLn/q80AHCyjEUrihQpYh9bs2aNWrdurbJly2rUqFH6559/NHXqVDVs2FDfffedypQpoxIlSmRa0S8lJUUDBw6Uu7v7NZ9v2bJlevTRR9W+fXvNnj1brq6u9m0xMTFasmSJTpw4ofT0dH344Yfq2LGjnn76aV28eFGzZs1SZGSkdu7cqVq1akm68kt727ZttWXLFvXu3VtVqlTRkiVL1LVr12ydf1xcnJo1a6bU1FQNHTpUBQsW1Pvvvy8vL68s5x86dEi1atXSrl27snX8DMuXL1f79u1VvXp1jRs3TufPn1ePHj101113Ocwzxuihhx7S+vXr1aNHD9WqVUurVq3S4MGD9euvv2rSpEn2Oh588EHVqFFDY8aMkY
eHh44dO6atW7faj5WWlqYHH3xQa9euVYcOHfT888/r4sWLiomJ0cGDB1WuXLls1W6z2dSwYUNt2rTJPrZ//35duHBBLi4u2rp1q6KioiRJmzdvVu3atVWoUKFrHm/GjBmqWrWqHnroIRUoUEBff/21nn32WaWnp6tv3772eSNHjtRrr72mBx54QA888IC+++47tWzZUsnJyQ7Hi4+P17333qtLly6pf//+KlasmObNm6eHHnpIn3/+udq1a+cwf9y4cfLy8tLQoUN17NgxTZ06VW5ubnJxcdH58+c1atQo++28oaGhGjlypH3fnj17at68eXr00Uf1wgsvaMeOHRo3bpyOHDmiJUuWSJLOnj2rli1bqkSJEho6dKj8/Px08uRJLV682KGOHj16aO7cuWrdurV69uyp1NRUbd68Wdu3b7dfxQYAyxgAQK6aM2eOkWTWrFljfv/9d3P69Gnz+eefmxIlShgPDw9z+vRp+9xatWoZf39/8+eff9rH9u3bZ1xcXEyXLl2u+RzPPvuscXV1NevWrbOPNWnSxFStWtUYY8wXX3xh3NzczNNPP23S0tLsc86fP2+KFy9uSpcubfbu3WuMMSY1NdUkJSU5HP/8+fMmICDAPPXUU/axL7/80kgy48ePt4+lpqaaRo0aGUlmzpw51+3LgAEDjCSzY8cO+9jZs2eNr6+vkWROnDhhjDHmn3/+MVWrVjVFihQxa9euNcYYs379eiPJrF+/3uGYJ06cyPTc1atXN6VKlTIXL160j23YsMFIMiEhIZnO57XXXnM45qOPPmpsNps5duyYMcaYSZMmGUnm999/v+a5zZ4920gyEydOzLQtPT3d/rMk8+qrr17zOMYYM2HCBOPq6moSExONMcZMmTLFhISEmHr16pkhQ4YYY4xJS0szfn5+ZuDAgfb9Xn31VfPv/6xfunQp0/EjIyNN2bJl7Y/Pnj1r3N3dTVRUlEOtL7/8spFkunbtah/LeA03b95sH7t48aIJDQ01ZcqUsb/XMl6vatWqmeTkZPvcjh07GpvNZlq3bu1QU3h4uMNrs3fvXiPJ9OzZ02Heiy++aCTZ3/dLliwxksyuXbuy6OQV69atM5JM//79M227+nxDQkIczhUAnIVbFgEgj0RERKhEiRIKDg7Wo48+qoIFC2rp0qUqVaqUJOnMmTPau3evunXrpqJFi9r3q1Gjhlq0aKEVK1ZkedyPPvpI06dP1/jx49WsWbNM2xcsWKD27dvrmWee0f/93//ZFyo4evSo7r77bv3xxx9q1qyZatasKUlydXW1X2lLT0/XuXPnlJqaqrvvvlvfffed/bgrVqxQgQIF1KdPH/uYq6urnnvuuWz1Y8WKFWrQoIHq1atnHytRooQ6depkfxwfH69GjRrp0KFDqlGjhu6///5sHTvDb7/9pgMHDqhLly4OV46aNGmi6tWrZ6rH1dVV/fv3dxh/4YUXZIzRN998I0ny8/OTJH311VfXvL3yiy++UPHixbPsRU6Xom/UqJHS0tK0bds2SVeuhDVq1EiNGjXS5s2bJUkHDx5UQkKC/TbYa7n66uOFCxf0xx9/qEmTJvrpp5904cIFSVeu0iYnJ+u5555zqHXAgAGZjrdixQrVq1dP9913n32sUKFC6tWrl06ePKnDhw87zO/SpYvc3Nzsj+vXry9jTKZbCuvXr6/Tp0/bb6vNeO9ffeumdOW1kWS/RTLjtVm2bJlSUlKy7MEXX3whm82mV199NdO23P6aAAC4GQQyAMgj06ZNU0xMjD7//HM98MAD+uOPPxwWDvj5558lSZUqVcq0b5UqVfTHH3/o77//dhjfu3evevfurY4dO2b6ZVWSTpw4oSeffFLR0dGaOnWqwy+cBQsW1FNPPaXSpUtn2m/evHmqUaOGPD09VaxYMZUoUULLly+3/9KeUW/JkiUz3SKXVf1Z+fnnn1WhQoVM41fvX7BgQbVs2VLh4eHZOmZWzyFJ5cuXz7Tt32M///yzgo
KCVLhwYYfxKlWqOByrffv2atiwoXr27KmAgAB16NBBCxcudAhnx48fV6VKlXJlUY06derI29vbHr4yAlnjxo317bf
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Histogram of message length per class. Only messages shorter than 200,000 characters are\n",
"# plotted so that extreme outliers (e.g. the 17,085,626-character row) do not flatten the chart.\n",
"length_cap = 200000\n",
"hist_data = data[data['message_length'] < length_cap]\n",
"plt.figure(figsize=(10, 6))\n",
"# Both classes are binned over the same fixed range; otherwise each hist() call derives its own\n",
"# bin edges from its own data and the bar heights of the two classes are not comparable.\n",
"shared_bins = dict(bins=100, range=(0, length_cap))\n",
"hist_data[hist_data['Label'] == 0]['message_length'].hist(alpha=0.6, label='Not Spam', **shared_bins)\n",
"hist_data[hist_data['Label'] == 1]['message_length'].hist(alpha=0.6, label='Spam', **shared_bins)\n",
"plt.legend()\n",
"plt.xlabel('Długość wiadomości')\n",
"plt.ylabel('Liczba wiadomości')\n",
"plt.title('Rozkład długości wiadomości')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "7182d6a1d6600c2",
"metadata": {},
"outputs": [],
"source": [
"# Ograniczamy zakres jeszcze bardziej - do wiadomości krótszych niż 10 000 znaków"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "962efe0bd652ecdb",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIkCAYAAAAUKhpvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABuoUlEQVR4nO3de3zP9f//8ft75w3bDNschxzHRI5LTplzzhWSUwclEkpRkUM1qRCRbyVUUhQSkjlLTgnNIZWIyqZijpkdnr8/+u398W4He82293vcrpfLLpe9n8/n6/V6vN6vB94Pz9fr+bYZY4wAAAAAANnm5uwAAAAAAKCgoZACAAAAAIsopAAAAADAIgopAAAAALCIQgoAAAAALKKQAgAAAACLKKQAAAAAwCIKKQAAAACwiEIKAICrTJ8+XR988IGzw0AG5s+frzfffNPZYQCAJAopACgQmjdvrpo1a15zXPny5dW/f/9cO+6xY8dks9k0b968HG0/b9482Ww2HTt2zN7WvHlzNW/ePFfiy21vvvmmJkyYoIYNG1re1mazady4cbkflKRx48bJZrPl6j5zO1fy2ooVKzRo0CDVqVPH8rYF7VwBFAwUUgBgQVphkPbj4eGh0qVLq3///vr999+dHR6uw759+zRmzBgtX75cVapUcXY4uMrvv/+uBx98UB988IEaN27s7HAAQJLk4ewAAKAgmjBhgipUqKDLly9r+/btmjdvnr7++mvt379fPj4+zg4POXDgwAEtXrxYt99+e462/+eff+ThkTf/rD7//PMaNWpUnuy7INi3b59mzpyp7t2752j7w4cPy82N/zsGkLsopAAgB9q1a6d69epJkh566CEVL15cr7zyipYvX657773XydEhJ+67777r2j4vC2gPD488K9IKgvbt21/X9t7e3rkUCQD8D/89AwC5oEmTJpKkI0eOOLSvX79eTZo0UaFChRQYGKjOnTvr0KFD9v60Z5Ay+8nKmjVr5Ofnp169eik5Odmh7+eff5YkXblyRWPHjlXdunUVEBCgQoUKqUmTJtqwYUO6/SUkJKh///4KCAhQYGCg+vXrp4SEhGy/BwcOHNCdd94pX19flSlTRi+++KJSU1MzHZ8W48aNG2Wz2bRx40aH/syez1q8eLHCw8Pl4+OjmjVraunSperfv7/Kly/vMO7ixYt68sknVbZsWXl7e6tq1ap67bXXZIxxGBcTE6M77rhDgYGBKly4sKpWrapnn33WYczly5c1btw4ValSRT4+PipZsqS6devmcL2v9YyUMUbFixfXiBEj7G2pqakKDAyUu7u7w3v9yiuvyMPDQxcuXJCU8TNSc+fO1Z133qng4GB5e3srPDxcb731VobHffHFF1WmTBn5+fmpRYsWOnDgQIYx/vLLL7rnnnsUFBQkPz8/NWrUSCtXrnQYk3a9Fi1apPHjx6t06dIqUqSI7r77bp09e1aJiYkaNmyYgoODVbhwYQ0YMECJiYkO+0hOTtbEiRN1yy23yNvbW+XLl9ezzz6bbty3336rNm3aqHjx4vL19VWFChX0wAMPOIxJTU3VG2+8oYiICPn4+KhEiRJq27atvv32W/sYnpECkBdu3v/eAoBclLaYQtGiRe1ta9euVbt27VSxYkWNGzdO//zzj2bMmKHGjRvru+++U/ny5VWiRIl0K8QlJSVp+PDh8vLyyvR4K1as0N13360ePXrovffek7u7u70vJiZGS5cu1dGjR5Wamqp3331XvXr10sMPP6zz589rzpw5atOmjXbu3KnatWtL+vfDdufOnfX111/r0UcfVfXq1bV06VL169cvW+cfFxenFi1aKDk5WaNGjVKhQoX09ttvy9fXN8PxBw4cUO3atbVr165s7T/NypUr1aNHD0VERCg6OlpnzpzRgw8+qNKlSzuMM8aoU6dO2rBhgx588EHVrl1bX331lUaOHKnff/9dU6dOtcdx1113qVatWpowYYK8vb
31888/a+vWrfZ9paSk6K677tK6devUs2dPPfHEEzp//rxiYmK0f/9+3XLLLdmK3WazqXHjxtq8ebO97fvvv9fZs2fl5uamrVu3qkOHDpKkLVu2qE6dOipcuHCm+3vrrbdUo0YNderUSR4eHvriiy/02GOPKTU1VYMHD7aPGzt2rF588UW1b99e7du313fffafWrVvrypUrDvuLj4/X7bffrkuXLmno0KEqVqyY5s+fr06dOunTTz9V165dHcZHR0fL19dXo0aN0s8//6wZM2bI09NTbm5uOnPmjMaNG2e/7bVChQoaO3asfduHHnpI8+fP1913360nn3xSO3bsUHR0tA4dOqSlS5dKkk6dOqXWrVurRIkSGjVqlAIDA3Xs2DEtWbLEIY4HH3xQ8+bNU7t27fTQQw8pOTlZW7Zs0fbt2+2zxgCQJwwAINvmzp1rJJm1a9eaP//805w4ccJ8+umnpkSJEsbb29ucOHHCPrZ27domODjY/P333/a2ffv2GTc3N9O3b99Mj/HYY48Zd3d3s379entbs2bNTI0aNYwxxnz22WfG09PTPPzwwyYlJcU+5syZM6Z48eKmXLlyZu/evcYYY5KTk01iYqLD/s+cOWNCQkLMAw88YG9btmyZkWQmT55sb0tOTjZNmjQxkszcuXOzfF+GDRtmJJkdO3bY206dOmUCAgKMJHP06FFjjDH//POPqVGjhilatKhZt26dMcaYDRs2GElmw4YNDvs8evRoumNHRESYMmXKmPPnz9vbNm7caCSZsLCwdOfz4osvOuzz7rvvNjabzfz888/GGGOmTp1qJJk///wz03N77733jCQzZcqUdH2pqan23yWZF154IdP9GGPMq6++atzd3c25c+eMMcZMnz7dhIWFmQYNGphnnnnGGGNMSkqKCQwMNMOHD7dv98ILL5j//pN96dKldPtv06aNqVixov31qVOnjJeXl+nQoYNDrM8++6yRZPr162dvS7uGW7ZssbedP3/eVKhQwZQvX96ea2nXq2bNmubKlSv2sb169TI2m820a9fOIabIyEiHa7N3714jyTz00EMO45566ikjyZ73S5cuNZLMrl27Mngn/7V+/XojyQwdOjRd39XnGxYW5nCuAJAbuLUPAHIgKipKJUqUUNmyZXX33XerUKFCWr58ucqUKSNJOnnypPbu3av+/fsrKCjIvl2tWrXUqlUrrVq1KsP9vv/++5o1a5YmT56sFi1apOtfuHChevTooUceeUT/93//Z3+A/vDhw6pXr57++usvtWjRQrfeeqskyd3d3T6zlZqaqtOnTys5OVn16tXTd999Z9/vqlWr5OHhoUGDBtnb3N3d9fjjj2fr/Vi1apUaNWqkBg0a2NtKlCih3r1721/Hx8erSZMmOnDggGrVqqU777wzW/tO88cffyg2NlZ9+/Z1mKlp1qyZIiIi0sXj7u6uoUOHOrQ/+eSTMsboyy+/lCQFBgZKkj7//PNMb0P87LPPVLx48QzfC6tLkjdp0kQpKSn65ptvJP0789SkSRM1adJEW7ZskSTt379fCQkJ9ttFM3P1bN/Zs2f1119/qVmzZvrll1909uxZSf/Oil65ckWPP/64Q6zDhg1Lt79Vq1apQYMGuuOOO+xthQsX1sCBA3Xs2DEdPHjQYXzfvn3l6elpf92wYUMZY9LdetewYUOdOHHCfvtpWu5ffYuj9O+1kWS/lTDt2qxYsUJJSUkZvgefffaZbDabXnjhhXR9ub1cPAD8F4UUAOTAzJkzFRMTo08//VTt27fXX3/95fBA+6+//ipJqlq1arptq1evrr/++ksXL150aN+7d68effRR9erVK92HTEk6evSo7r//fnXv3l0zZsxw+KBYqFAhPfDAAypXrly67ebPn69atWrJx8dHxYoVU4kSJbRy5Ur7h+20eEuWLJnuVrKM4s/Ir7/+qsqVK6drv3r7QoUKqXXr1oqMjMzWPjM6hiRVqlQpXd9/23799VeVKl
VKRYoUcWivXr26w7569Oihxo0b66GHHlJISIh69uypRYsWORRVR44cUdWqVXNlsYfbbrtNfn5+9qIprZBq2rSpvv3
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Histogram of message length per class, zoomed in on messages shorter than 10,000 characters.\n",
"length_cap = 10000\n",
"hist_data = data[data['message_length'] < length_cap]\n",
"plt.figure(figsize=(10, 6))\n",
"# Both classes are binned over the same fixed range; otherwise each hist() call derives its own\n",
"# bin edges from its own data and the bar heights of the two classes are not comparable.\n",
"shared_bins = dict(bins=100, range=(0, length_cap))\n",
"hist_data[hist_data['Label'] == 0]['message_length'].hist(alpha=0.6, label='Not Spam', **shared_bins)\n",
"hist_data[hist_data['Label'] == 1]['message_length'].hist(alpha=0.6, label='Spam', **shared_bins)\n",
"plt.legend()\n",
"plt.xlabel('Długość wiadomości')\n",
"plt.ylabel('Liczba wiadomości')\n",
"plt.title('Rozkład długości wiadomości')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "eaa483deb9c81942",
"metadata": {},
"outputs": [],
"source": [
"# Można zauważyć, że trudno odróżnić wiadomości po samej długości. W tym celu należy skorzystać z bardziej zaawansowanych metod."
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "6e0ee5fccf308cd1",
"metadata": {},
"outputs": [],
"source": [
"# Przetwarzanie tekstu"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "50c0131db25859cb",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Body</th>\n",
" <th>Label</th>\n",
" <th>message_length</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Subject: congratulations\\n vince ,\\n congratul...</td>\n",
" <td>0</td>\n",
" <td>129</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>\\nhttp://news.bbc.co.uk/1/hi/scotland/2515231....</td>\n",
" <td>0</td>\n",
" <td>435</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Big and big\\nMAIN PAGE\\nHuge big titties @ big...</td>\n",
" <td>1</td>\n",
" <td>231</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>Subject: re : enron visit - - thanks\\n larry ,...</td>\n",
" <td>0</td>\n",
" <td>1180</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan...</td>\n",
" <td>0</td>\n",
" <td>574</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18109</th>\n",
" <td>18646</td>\n",
" <td>Subject: fluid analysis\\n our customer speak v...</td>\n",
" <td>1</td>\n",
" <td>927</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18110</th>\n",
" <td>18647</td>\n",
" <td>Subject: guadalupe\\n i rolled 740208 , 740209 ...</td>\n",
" <td>0</td>\n",
" <td>337</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18111</th>\n",
" <td>18648</td>\n",
" <td>100% Free Porn!\\nWhat more can you ask for?\\nC...</td>\n",
" <td>1</td>\n",
" <td>345</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18112</th>\n",
" <td>18649</td>\n",
" <td>Subject: revised nominations\\n daren ,\\n we ha...</td>\n",
" <td>0</td>\n",
" <td>346</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18113</th>\n",
" <td>18650</td>\n",
" <td>Hello,\\nI've got a small problem but still ann...</td>\n",
" <td>0</td>\n",
" <td>744</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>18114 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Body Label \\\n",
"0 0 Subject: congratulations\\n vince ,\\n congratul... 0 \n",
"1 1 \\nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0 \n",
"2 2 Big and big\\nMAIN PAGE\\nHuge big titties @ big... 1 \n",
"3 3 Subject: re : enron visit - - thanks\\n larry ,... 0 \n",
"4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0 \n",
"... ... ... ... \n",
"18109 18646 Subject: fluid analysis\\n our customer speak v... 1 \n",
"18110 18647 Subject: guadalupe\\n i rolled 740208 , 740209 ... 0 \n",
"18111 18648 100% Free Porn!\\nWhat more can you ask for?\\nC... 1 \n",
"18112 18649 Subject: revised nominations\\n daren ,\\n we ha... 0 \n",
"18113 18650 Hello,\\nI've got a small problem but still ann... 0 \n",
"\n",
" message_length \n",
"0 129 \n",
"1 435 \n",
"2 231 \n",
"3 1180 \n",
"4 574 \n",
"... ... \n",
"18109 927 \n",
"18110 337 \n",
"18111 345 \n",
"18112 346 \n",
"18113 744 \n",
"\n",
"[18114 rows x 4 columns]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "c32c52a7b2575a3b",
"metadata": {},
"outputs": [],
"source": [
"stop_words = set(stopwords.words('english'))\n",
"ps = PorterStemmer()\n",
"# Built once: preprocess_text is applied to every message, so the punctuation\n",
"# translation table should not be rebuilt on each call.\n",
"_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)\n",
"\n",
"def preprocess_text(text):\n",
"    \"\"\"Normalize one message into a space-joined string of stemmed tokens.\n",
"\n",
"    Steps, in order: remove digit runs, remove the characters listed in\n",
"    ``string.punctuation``, tokenize with ``word_tokenize``, drop tokens whose\n",
"    lowercase form is in ``stop_words``, and Porter-stem the remaining tokens.\n",
"\n",
"    Args:\n",
"        text: Raw message body (str).\n",
"\n",
"    Returns:\n",
"        str: The processed tokens joined by single spaces.\n",
"    \"\"\"\n",
"    # Strip digits and punctuation before tokenizing.\n",
"    text = re.sub(r'\\d+', '', text)\n",
"    text = text.translate(_PUNCTUATION_TABLE)\n",
"    words = word_tokenize(text)\n",
"    # Stop words are matched case-insensitively; surviving tokens are stemmed.\n",
"    # NOTE(review): punctuation is removed first, so contractions lose their\n",
"    # apostrophe (e.g. \"I've\" -> \"ive\") before the stop-word check.\n",
"    words = [ps.stem(word) for word in words if word.lower() not in stop_words]\n",
"    return \" \".join(words)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "5953cb974349cb33",
"metadata": {},
"outputs": [],
"source": [
"# Ten proces jest czasochłonny"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "89b8cdeaa9da5c2d",
"metadata": {},
"outputs": [],
"source": [
"data['processed_message'] = data['Body'].apply(preprocess_text)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "ccce395ac94c39a1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>Body</th>\n",
" <th>Label</th>\n",
" <th>message_length</th>\n",
" <th>processed_message</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>Subject: congratulations\\n vince ,\\n congratul...</td>\n",
" <td>0</td>\n",
" <td>129</td>\n",
" <td>subject congratul vinc congratul wish best luc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>\\nhttp://news.bbc.co.uk/1/hi/scotland/2515231....</td>\n",
" <td>0</td>\n",
" <td>435</td>\n",
" <td>httpnewsbbccoukhiscotlandstm yahoo group spons...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>Big and big\\nMAIN PAGE\\nHuge big titties @ big...</td>\n",
" <td>1</td>\n",
" <td>231</td>\n",
" <td>big big main page huge big titti bigbigscom sa...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>Subject: re : enron visit - - thanks\\n larry ,...</td>\n",
" <td>0</td>\n",
" <td>1180</td>\n",
" <td>subject enron visit thank larri think potenti ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan...</td>\n",
" <td>0</td>\n",
" <td>574</td>\n",
" <td>fri aug ryan shane mention imho stop spammer g...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 Body Label \\\n",
"0 0 Subject: congratulations\\n vince ,\\n congratul... 0 \n",
"1 1 \\nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0 \n",
"2 2 Big and big\\nMAIN PAGE\\nHuge big titties @ big... 1 \n",
"3 3 Subject: re : enron visit - - thanks\\n larry ,... 0 \n",
"4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0 \n",
"\n",
" message_length processed_message \n",
"0 129 subject congratul vinc congratul wish best luc... \n",
"1 435 httpnewsbbccoukhiscotlandstm yahoo group spons... \n",
"2 231 big big main page huge big titti bigbigscom sa... \n",
"3 1180 subject enron visit thank larri think potenti ... \n",
"4 574 fri aug ryan shane mention imho stop spammer g... "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "7ce382be7bcdff2c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 subject congratul vinc congratul wish best luc...\n",
"1 httpnewsbbccoukhiscotlandstm yahoo group spons...\n",
"2 big big main page huge big titti bigbigscom sa...\n",
"3 subject enron visit thank larri think potenti ...\n",
"4 fri aug ryan shane mention imho stop spammer g...\n",
" ... \n",
"18109 subject fluid analysi custom speak volum spur ...\n",
"18110 subject guadalup roll june ena deal guadalup d...\n",
"18111 free porn ask click â â â remov instruct striv...\n",
"18112 subject revis nomin daren receiv revis nomin p...\n",
"18113 hello ive got small problem still annoy upgrad...\n",
"Name: processed_message, Length: 18114, dtype: object"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data['processed_message']"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "dc456d793b576f7",
"metadata": {},
"outputs": [],
"source": [
"# Concatenate the processed messages of each class into one string for WordCloud\n",
"spam_words = ' '.join(data.loc[data['Label'] == 1, 'processed_message'])\n",
"not_spam_words = ' '.join(data.loc[data['Label'] == 0, 'processed_message'])"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "c9d7d9c9f4ae91ed",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAGtCAYAAACREAK2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOy9d5gd532Y+06f08+e7X2x6IXoBEmQ6oVWFyMqKkkkF8ly7Jix48T3Kn/YcRzbiX3jx45sOdG9jiXLdqxiybIqRbGIDexE78Bie9/Tz5n+3T9mscBiK4AFSVHn1fMI3GnfN3Om/PpPEkIIatSoUaNGjRo1atSoUWMNkV/tCdSoUaNGjRo1atSoUeP1R03RqFGjRo0aNWrUqFGjxppTUzRq1KhRo0aNGjVq1Kix5tQUjRo1atSoUaNGjRo1aqw5NUWjRo0aNWrUqFGjRo0aa05N0ahRo0aNGjVq1KhRo8aaU1M0atSoUaNGjRo1atSosebUFI0aNWrUqFGjRo0aNWqsOTVFo0aNGjVq1KhRo0aNGmtOTdGoUaNGjVeAxx57DEmSeOyxx9b0uD09Pfzsz/7smh5ztbz5zW/mzW9+8w3vL0kS/+k//ae5v7/4xS8iSRKXLl266bnVqFGjRo1Xn5qiUaNGjdcNX/3qV5EkiW9+85sL1u3atQtJknj00UcXrOvq6uLgwYOvxBRXzYULF/jMZz5Db28vpmmSTCa5++67+dM//VOq1eqrPb2fGI4dO8b9999Pd3c3pmnS3t7OO97xDj73uc+92lOrUaNGjdc9NUWjRo0arxvuueceAJ588sl5ywuFAsePH0dVVZ566ql56wYHBxkcHJzb97XAd7/7XW677Ta++tWv8r73vY/Pfe5z/MEf/AFdXV38h//wH/i3//bfvtpT/Ing6aefZv/+/Rw5coRPf/rT/Nmf/Rmf+tSnkGWZP/3TP321p1ejRo0ar3vUV3sCNWrUqLFWtLW1sW7dugWKxqFDhxBC8OEPf3jBust/36yiIYTAsiwikchNHaevr4+PfvSjdHd388gjj9Da2jq37ld+5Vc4f/483/3ud29qjJ8Wfu/3fo9UKsXzzz9POp2et25iYuLVmVSNGjVq/BRR82jUqFHjdcU999zDyy+/PC+86KmnnmL79u28613v4plnniEIgnnrJEni7rvvBsDzPH73d3+X9evXYxgGPT09/Mf/+B+xbXveOD09Pbz3ve/lwQcfZP/+/UQiEf7X//pfAAwNDfHBD36QWCxGU1MTv/7rv75g/6X4wz/8Q0qlEn/5l385T8m4zIYNG1b0aFy8eJEPf/jDZDIZotEod9555wLlZKl8iKVySb7whS+wfv16IpEIBw4c4IknnljV+QDYts2v//qv09jYSCKR4P3vfz9DQ0Or2vdb3/oW73nPe2hra8MwDNavX8/v/u7v4vv+ivteuHCB7du3L1AyAJqamub9LUkS/+bf/Bv+9m//ls2bN2OaJvv27ePxxx+ft11/fz+//Mu/zObNm4lEItTX1/PhD394wXW8fH2ffPJJHnjgARobG0mn03zmM5/BcRxyuRyf+MQnqKuro66ujt/8zd9ECLGqa1KjRo0aPynUPBo1atR4XXHPPffw5S9/mWeffXYuUfmpp57i4MGDHDx4kHw+z/Hjx9m5c+fcui1btlBfXw/Apz71Kb70pS9x//338xu/8Rs8++yz/MEf/AGnTp1akPtx5swZPvaxj/GZz3yGT3/602zevJlqtcrb3vY2BgYGeOCBB2hra+PLX/4yjzzyyKrm/+1vf5ve3t4bzhkZHx/n4MGDVCoVHnjgAerr6/nSl77E+9//fr7+9a9z3333Xfcx//Iv/5LPfOYzHDx4kF/7tV/j4sWLvP/97yeTydDZ2bni/p/61Kf4m7/5Gz7+8Y9z8OBBHnnkEd7znvesauwvfvGLxONx/t2/+3fE43EeeeQRfuu3fotCocAf/dEfLbtvd3
c3hw4d4vjx4+zYsWPFsX784x/zla98hQceeADDMPj85z/Pz/zMz/Dcc8/N7f/888/z9NNP89GPfpSOjg4uXbrEX/zFX/DmN7+ZkydPEo1G5x3zV3/1V2lpaeF3fud3eOaZZ/jCF75AOp3m6aefpquri9///d/ne9/7Hn/0R3/Ejh07+MQnPrGq61KjRo0aPxGIGjVq1HgdceLECQGI3/3d3xVCCOG6rojFYuJLX/qSEEKI5uZm8ed//udCCCEKhYJQFEV8+tOfFkIIcfjwYQGIT33qU/OO+e///b8XgHjkkUfmlnV3dwtA/OAHP5i37Z/8yZ8IQHz1q1+dW1Yul8WGDRsEIB599NEl557P5wUgPvCBD6z6fLu7u8UnP/nJub9/7dd+TQDiiSeemFtWLBbFunXrRE9Pj/B9XwghxF/91V8JQPT19c073qOPPjpvno7jiKamJrF7925h2/bcdl/4whcEIN70pjctO7/L1/SXf/mX5y3/+Mc/LgDx27/923PLFptTpVJZcMzPfOYzIhqNCsuylh37hz/8oVAURSiKIu666y7xm7/5m+LBBx8UjuMs2BYQgHjhhRfmlvX39wvTNMV999237HwOHTokAPHXf/3XC87l3nvvFUEQzC2/6667hCRJ4pd+6ZfmlnmeJzo6Ola8ljVq1Kjxk0YtdKpGjRqvK7Zu3Up9ff1c7sWRI0col8tzHoKDBw/OJYQfOnQI3/fn8jO+973vAfDv/t2/m3fM3/iN3wBYEH60bt067r333nnLvve979Ha2sr9998/tywajfKLv/iLK869UCgAkEgkVneyi/C9732PAwcOzMs5icfj/OIv/iKXLl3i5MmT13W8F154gYmJCX7pl34JXdfnlv/sz/4sqVRqVfMBeOCBB+Yt/7Vf+7VVjX91zkuxWGRqaoo3vOENVCoVTp8+vey+73jHOzh06BDvf//7OXLkCH/4h3/IvffeS3t7O//0T/+0YPu77rqLffv2zf3d1dXFBz7wAR588MG5UK2r5+O6LtPT02zYsIF0Os1LL7204Ji/8Au/gCRJc3/fcccdCCH4hV/4hblliqKwf/9+Ll68uIorUqNGjRo/OdQUjRo1aryukCSJgwcPzuViPPXUUzQ1NbFhwwZgvqJx+d/LQnl/fz+yLM9te5mWlhbS6TT9/f3zlq9bt27B+P39/WzYsGGecAmwefPmFeeeTCaBUKC+Ufr7+xcda+vWrXPrr/d4ABs3bpy3XNM0ent7V7W/LMusX79+3vLVXA+AEydOcN9995FKpUgmkzQ2NvIv/+W/BCCfz6+4/+233843vvENstkszz33HJ/97GcpFovcf//9C5Sua88RYNOmTVQqFSYnJwGoVqv81m/9Fp2dnRiGQUNDA42NjeRyuUXn09XVNe/vy8rZtSFnqVSKbDa74vnUqFGjxk8SNUWjRo0arzvuuece8vk8x44dm8vPuMzBgwfp7+9neHiYJ598kra2tgUC87VKwlLcbIWpa0kmk7S1tXH8+PE1Pe5iLHWOq0myfqXI5XK86U1v4siRI/zn//yf+fa3v81DDz3Ef/tv/w1gXlL/Sui6zu23387v//7v8xd/8Re4rsvXvva1657Tr/7qr/J7v/d7/PN//s/56le/yg9/+EMeeugh6uvrF52PoiiLHmex5aKWDF6jRo3XGbVk8Bo1arzuuLqfxlNPPTUvTGffvn0YhsFjjz3Gs88+y7vf/e65dd3d3QRBwLlz5+Y8ABAmWOdyObq7u1ccu7u7m+PHjyOEmCfMnzlzZlVzf+9738sXvvAFDh06xF133bWqfa4df7GxLocZXT6Huro6IBTmr+Zaj8fl7c+dO8db3/rWueWu69LX18euXbtWnE8QBFy4cGGeF2M11+Oxxx5jenqab3zjG7zxjW+cW97X17fivsuxf/9+AEZHR+ctP3fu3IJtz549SzQapbGxEYCvf/3rfPKTn+S///f/PreNZVkLrmONGjVq1Kh5NG
rUqPE6ZP/+/Zimyd/+7d8yPDw8z6NhGAZ79+7lz//8zymXy/NyGS4rHX/yJ38y73h//Md/DLCqSknvfve7GRkZ4et
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(10, 6))\n",
"wordcloud_spam = WordCloud(width=800, height=400).generate(spam_words)\n",
"plt.imshow(wordcloud_spam, interpolation='bilinear')\n",
"plt.axis('off')\n",
"plt.title('Word Cloud dla Spam')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "d954e01a1d0b3a97",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAGtCAYAAACREAK2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOy9d5xdZ33n/z793H6n96JebUtyk+UKmGrHlEAghABLSMgSAimbbIBsyIbsJkvKhg2QAD8SCAESSgIBG4zBTZZl2ZKs3kea0fRy5/Z7T39+f9zRSFczI42ksTHhvv3yS9I59zznOf35PN8mCSEENWrUqFGjRo0aNWrUqLGEyD/pDtSoUaNGjRo1atSoUeM/HzWhUaNGjRo1atSoUaNGjSWnJjRq1KhRo0aNGjVq1Kix5NSERo0aNWrUqFGjRo0aNZacmtCoUaNGjRo1atSoUaPGklMTGjVq1KhRo0aNGjVq1FhyakKjRo0aNWrUqFGjRo0aS05NaNSoUaNGjRo1atSoUWPJqQmNGjVq1KhRo0aNGjVqLDk1oVGjRo0al+Hxxx9HkiQef/zxJW23t7eXd7/73Uva5mK55557uOeee656e0mS+OM//uPZf3/xi19EkiT6+/uvuW81atSoUeM/BzWhUaNGjZcEX//615EkiX//93+fs+6GG25AkiQee+yxOeu6u7vZtm3bi9HFRdPX18f73vc+li9fjmmaxONxbr/9dj75yU9SLpd/0t37qeCccDFNk+Hh4Tnr77nnHjZu3HhVbX/mM5/hi1/84qJ/XygU+NjHPsbGjRuJRCI0NDSwadMmPvShDzEyMnJVfahRo0aNnwVqQqNGjRovCe644w4AnnrqqarluVyOQ4cOoaoqO3bsqFo3ODjI4ODg7LYvBR588EGuu+46vv71r/NzP/dz/O3f/i1/9md/Rnd3N7/3e7/Hhz70oZ90F3+qsG2bP//zP1/SNq9EaLiuy1133cVf/MVfcOedd/LXf/3XfOQjH2HLli189atf5cSJE0vatxo1atT4z4T6k+5AjRo1agC0t7ezbNmyOUJj586dCCF4y1veMmfduX9fq9AQQmBZFqFQ6JraOXPmDG9729vo6enh0Ucfpa2tbXbdb/zGb3Dq1CkefPDBa9rHzxqbNm3i85//PB/+8Idpb29/0ff/7W9/m+eff56vfOUrvP3tb69aZ1kWjuO86H2qUaNGjZ8WahaNGjVqvGS44447eP7556vci3bs2MGGDRt47WtfyzPPPEMQBFXrJEni9ttvB8DzPD7+8Y+zYsUKDMOgt7eXj3zkI9i2XbWf3t5e7r//fh5++GFuuukmQqEQn/3sZwEYGhriDW94A5FIhObmZn77t397zvYL8YlPfIJCocAXvvCFKpFxjpUrV17WonH69Gne8pa3UF9fTzgcZuvWrXPEyULxEAvFknzuc59jxYoVhEIhbrnlFrZv376o44GKReG3f/u3aWpqIhaL8cADDzA0NLSobb/zne9w33330d7ejmEYrFixgo9//OP4vr/o/X/kIx/B9/1FWTUWc/17e3s5fPgwTzzxBJIkIUnSJWNV+vr6AGbvsQs55xZ3jne/+91Eo1FOnz7Nq1/9aiKRCO3t7fzJn/wJQoiqbf/yL/+Sbdu20dDQQCgU4sYbb+Sb3/zmnH1IksQHPvABvvGNb7B+/XpCoRC33XYbBw8eBOCzn/0sK1euxDRN7rnnnlqMTI0aNV5S1IRGjRo1XjLccccduK7Lrl27Zpft2LGDbdu2sW3bNrLZLIcOHapat3btWhoaGgB473vfyx/90R+xZcsW/u///b/cfffd/Nmf/Rlve9vb5uzr+PHj/OIv/iKvfOUr+eQnP8mmTZsol8u84hWv4OGHH+YDH/gAH/3oR9m+fTu///u/v6j+f/e732X58uVXHTMyPj7Otm3bePjhh3n/+9/P//pf/wvLsnjggQfmjV1ZDF/4whd43/veR2trK5
/4xCe4/fbbeeCBBxgcHFzU9u9973v5m7/5G171qlfx53/+52iaxn333beobb/4xS8SjUb5nd/5HT75yU9y44038kd/9Ef8wR/8waL7v2zZMt75znfy+c9//rLxEIu5/n/zN39DZ2cna9eu5ctf/jJf/vKX+ehHP7pgmz09PQD80z/90xyxMB++7/Oa17yGlpYWPvGJT3DjjTfysY99jI997GNVv/vkJz/J5s2b+ZM/+RP+9//+36iqylve8pZ5LV7bt2/nd3/3d3nXu97FH//xH3P06FHuv/9+Pv3pT/P//t//4/3vfz+/93u/x86dO3nPe95z2T7WqFGjxouGqFGjRo2XCIcPHxaA+PjHPy6EEMJ1XRGJRMSXvvQlIYQQLS0t4tOf/rQQQohcLicURRG/+qu/KoQQYt++fQIQ733ve6va/G//7b8JQDz66KOzy3p6egQgfvCDH1T99m/+5m8EIL7+9a/PLisWi2LlypUCEI899tiCfc9mswIQr3/96xd9vD09PeJd73rX7L9/67d+SwBi+/bts8vy+bxYtmyZ6O3tFb7vCyGE+Md//EcBiDNnzlS199hjj1X103Ec0dzcLDZt2iRs25793ec+9zkBiLvvvvuS/Tt3Tt///vdXLX/7298uAPGxj31sdtl8fSqVSnPafN/73ifC4bCwLOuS+z7X3nPPPSf6+vqEqqrigx/84Oz6u+++W2zYsGFOXxdz/Tds2HDZY7/wGNasWSMA0dPTI9797neLL3zhC2J8fHzOb9/1rncJQPzmb/7m7LIgCMR9990ndF0Xk5OTVe1eiOM4YuPGjeLlL3951XJAGIZRdV4/+9nPCkC0traKXC43u/zDH/7wvPdFjRo1avykqFk0atSo8ZJh3bp1NDQ0zMZe7N+/n2KxOGsh2LZt22xA+M6dO/F9fzY+46GHHgLgd37nd6ra/N3f/V2AOTPFy5Yt49WvfnXVsoceeoi2tjbe/OY3zy4Lh8P82q/92mX7nsvlAIjFYos72Hl46KGHuOWWW6piTqLRKL/2a79Gf38/R44cuaL2du/ezcTEBL/+67+Oruuzy9/97neTSCQW1R+AD37wg1XLf+u3fmtR+78w5iWfzzM1NcWdd95JqVTi2LFji2oDYPny5fzyL/8yn/vc5xgdHb1kXxd7/RdLKBRi165d/N7v/R5QsdL8yq/8Cm1tbfzmb/7mvG51H/jAB2b/fs71yXEcfvSjH1W1e450Ok02m+XOO+9k7969c9p7xSteQW9v7+y/b731VgB+/ud/vup+O7f89OnTV3WsNWrUqLHU1IRGjRo1XjJIksS2bdtmYzF27NhBc3MzK1euBKqFxrk/zw3KBwYGkGV59rfnaG1tJZlMMjAwULV82bJlc/Y/MDDAypUrkSSpavmaNWsu2/dzvvr5fH4xhzovAwMD8+5r3bp1s+uvtD2AVatWVS3XNI3ly5cvantZllmxYkXV8sWcD4DDhw/zxje+kUQiQTwep6mpiXe84x0AZLPZRbVxjj/8wz/E87wFYzWu9PpfCYlEgk984hP09/fT39/PF77wBdasWcOnPvUpPv7xj1f9VpblOed29erVAFXxE9/73vfYunUrpmlSX19PU1MTf/d3fzfveenu7p7TH4Curq55l6fT6as70Bo1atRYYmpCo0aNGi8p7rjjDrLZLAcPHpyNzzjHtm3bGBgYYHh4mKeeeor29vY5g7qLRcJCXGuGqYuJx+O0t7dXxZC8UCx0jFcSZP1Ck8lkuPvuu9m/fz9/8id/wne/+10eeeQR/s//+T8AVUH9i2H58uW84x3vuKRVAxZ//a+Wnp4e3vOe97Bjxw6SySRf+cpXrriN7du388ADD2CaJp/5zGd46KGHeOSRR3j7298+bxyIoijztrPQ8vnaqFGjRo2fBDWhUaNGjZcUF9bT2LFjR1W2nxtvvBHDMHj88cfZtWtX1bqenh6CIODkyZNV7Y2Pj5PJZGaDei9FT08PfX19cw
Zqx48fX1Tf77//fvr6+ti5c+eifj/f/ufb1zk3o3PHUFdXB1QG8xdy8az9ud9ffE5c1+XMmTOL6k8QBLOZl86xmPP
"text/plain": [
"<Figure size 1000x600 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"plt.figure(figsize=(10, 6))\n",
"wordcloud_not_spam = WordCloud(width=800, height=400).generate(not_spam_words)\n",
"plt.imshow(wordcloud_not_spam, interpolation='bilinear')\n",
"plt.axis('off')\n",
"plt.title('Word Cloud dla Not Spam')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "743000c7d99b8a85",
"metadata": {},
"outputs": [],
"source": [
"# Budowa modelu klasyfikacyjnego"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "7b3ba8e5b035cdc0",
"metadata": {},
"outputs": [],
"source": [
"# Zamiana tekstu na wektory\n",
"vectorizer = CountVectorizer()\n",
"X = vectorizer.fit_transform(data['processed_message'])\n",
"y = data['Label']"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "5d66dcf506f4f399",
"metadata": {},
"outputs": [],
"source": [
"# Podział na zbiór treningowy i testowy\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "b3c2a6673c718301",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label 
{background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-r
],
"text/plain": [
"MultinomialNB()"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Trenowanie modelu Naiwnego Bayesa\n",
"model_NB = MultinomialNB()\n",
"model_NB.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "82f18edc9161422a",
"metadata": {},
"outputs": [],
"source": [
"# Predykcja i ocena Naiwny Bayes\n",
"y_pred_NB = model_NB.predict(X_test)\n",
"accuracy_NB = accuracy_score(y_test, y_pred_NB)\n",
"classification_rep_NB = classification_report(y_test, y_pred_NB)\n",
"confusion_matrix_NB = confusion_matrix(y_test, y_pred_NB)"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "a629b6b89d5cdf34",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9536295887386144"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"accuracy_NB"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "53c0cf3dc8aa02bc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.95 0.96 2229\n",
" 1 0.92 0.96 0.94 1394\n",
"\n",
" accuracy 0.95 3623\n",
" macro avg 0.95 0.96 0.95 3623\n",
"weighted avg 0.95 0.95 0.95 3623\n",
"\n"
]
}
],
"source": [
"print(classification_rep_NB)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "9b915d02828de60",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[2110 119]\n",
" [ 49 1345]]\n"
]
}
],
"source": [
"print(confusion_matrix_NB)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "160da18f95c142a0",
"metadata": {},
"outputs": [],
"source": [
"# Trening Drzewa Decyzyjnego (DT)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "8720ed4fd0ed5c72",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-2 {color: black;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label 
{background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-r
],
"text/plain": [
"DecisionTreeClassifier()"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Parametry domyślne\n",
"model_DT = DecisionTreeClassifier(criterion= 'gini',\n",
" max_depth= None,\n",
" min_samples_leaf= 1,\n",
" min_samples_split= 2,\n",
" splitter= 'best')\n",
"model_DT.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "7aee079d59bdd4eb",
"metadata": {},
"outputs": [],
"source": [
"# Predykcja i ocena DT\n",
"y_pred_DT = model_DT.predict(X_test)\n",
"accuracy_DT = accuracy_score(y_test, y_pred_DT)\n",
"classification_rep_DT = classification_report(y_test, y_pred_DT)\n",
"confusion_matrix_DT = confusion_matrix(y_test, y_pred_DT)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "57ac5a3ffe724fd5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9354126414573558"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"accuracy_DT"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "ed8955dc5d5cdeaf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.95 0.94 0.95 2229\n",
" 1 0.91 0.93 0.92 1394\n",
"\n",
" accuracy 0.94 3623\n",
" macro avg 0.93 0.93 0.93 3623\n",
"weighted avg 0.94 0.94 0.94 3623\n",
"\n"
]
}
],
"source": [
"print(classification_rep_DT)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "3ebfee20eb06e8cc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[2098 131]\n",
" [ 103 1291]]\n"
]
}
],
"source": [
"print(confusion_matrix_DT)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "85d3dc4e44a2a4b3",
"metadata": {},
"outputs": [],
"source": [
"# Las losowy"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "6f454235f54aa9cc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-3 {color: black;}#sk-container-id-3 pre{padding: 0;}#sk-container-id-3 div.sk-toggleable {background-color: white;}#sk-container-id-3 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-3 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-3 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-3 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-3 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-3 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-3 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-3 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-3 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-3 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-3 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-3 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-3 div.sk-label:hover label.sk-toggleable__label 
{background-color: #d4ebff;}#sk-container-id-3 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-3 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-3 div.sk-item {position: relative;z-index: 1;}#sk-container-id-3 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-3 div.sk-item::before, #sk-container-id-3 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-3 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-3 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-3 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-3 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-3 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-3 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-3 div.sk-label-container {text-align: center;}#sk-container-id-3 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-3 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-3\" class=\"sk-top-container\"><div class=\"sk-text-r
],
"text/plain": [
"RandomForestClassifier(random_state=123)"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Random forest fitted on the training split (X_train, y_train).\n",
"# The cell output renders this estimator as RandomForestClassifier(random_state=123);\n",
"# the other keyword arguments are spelled out to make the configuration explicit.\n",
"model_RF = RandomForestClassifier(\n",
"    random_state=123,\n",
"    n_estimators=100,\n",
"    criterion='gini',\n",
"    max_depth=None,\n",
"    min_samples_split=2,\n",
"    min_samples_leaf=1,\n",
"    bootstrap=True,\n",
"    ccp_alpha=0.0,\n",
")\n",
"# fit() is the last expression, so the fitted estimator is shown as the cell output.\n",
"model_RF.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "23d68d066dc47f9",
"metadata": {},
"outputs": [],
"source": [
"# Predict with the random forest on X_test and score the predictions against y_test.\n",
"y_pred_RF = model_RF.predict(X_test)\n",
"confusion_matrix_RF = confusion_matrix(y_test, y_pred_RF)\n",
"classification_rep_RF = classification_report(y_test, y_pred_RF)\n",
"accuracy_RF = accuracy_score(y_test, y_pred_RF)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "55789560bb43f9b8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9770908087220536"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"accuracy_RF"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "d15d57c467b94bad",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 0.98 0.99 0.98 2229\n",
" 1 0.98 0.96 0.97 1394\n",
"\n",
" accuracy 0.98 3623\n",
" macro avg 0.98 0.97 0.98 3623\n",
"weighted avg 0.98 0.98 0.98 3623\n",
"\n"
]
}
],
"source": [
"print(classification_rep_RF)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "477ea9a19dbe7389",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[2201 28]\n",
" [ 55 1339]]\n"
]
}
],
"source": [
"print(confusion_matrix_RF)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "9c3308c811b9d014",
"metadata": {},
"outputs": [],
"source": [
"# The random forest turned out to be the best model: it is better to classify spam as a non-spam message than the other way round.\n",
"# That is why we choose RF rather than NB."
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "81f08fa14ba4daf5",
"metadata": {},
"outputs": [],
"source": [
"# Now we train on the full data set and save the model so that it can later be used on real data in the\n",
"# application."
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "7f580653f470d7af",
"metadata": {},
"outputs": [],
"source": [
"# Unfitted random forest with the same hyperparameters as model_RF;\n",
"# it is fitted on the complete X, y in the next cell.\n",
"model_RF_full = RandomForestClassifier(\n",
"    random_state=123,\n",
"    n_estimators=100,\n",
"    criterion='gini',\n",
"    max_depth=None,\n",
"    min_samples_split=2,\n",
"    min_samples_leaf=1,\n",
"    bootstrap=True,\n",
"    ccp_alpha=0.0,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "f75fc9a4d4746e5a",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-4 {color: black;}#sk-container-id-4 pre{padding: 0;}#sk-container-id-4 div.sk-toggleable {background-color: white;}#sk-container-id-4 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-4 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-4 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-4 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-4 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-4 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-4 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-4 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-4 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-4 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-4 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-4 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-4 div.sk-label:hover label.sk-toggleable__label 
{background-color: #d4ebff;}#sk-container-id-4 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-4 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-4 div.sk-item {position: relative;z-index: 1;}#sk-container-id-4 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-4 div.sk-item::before, #sk-container-id-4 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-4 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-4 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-4 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-4 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-4 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-4 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-4 div.sk-label-container {text-align: center;}#sk-container-id-4 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-4 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-4\" class=\"sk-top-container\"><div class=\"sk-text-r
],
"text/plain": [
"RandomForestClassifier(random_state=123)"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model_RF_full.fit(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "3d77bed327ac2fa1",
"metadata": {},
"outputs": [],
"source": [
"# Predykcja i ocena RF\n",
"y_pred_RF_full = model_RF_full.predict(X)\n",
"accuracy_RF_full = accuracy_score(y, y_pred_RF_full)\n",
"classification_rep_RF_full = classification_report(y, y_pred_RF_full)\n",
"confusion_matrix_RF_full = confusion_matrix(y, y_pred_RF_full)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "a76a53da77128562",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1.0"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"accuracy_RF_full"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "9a66104fd13572f8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0 1.00 1.00 1.00 11124\n",
" 1 1.00 1.00 1.00 6990\n",
"\n",
" accuracy 1.00 18114\n",
" macro avg 1.00 1.00 1.00 18114\n",
"weighted avg 1.00 1.00 1.00 18114\n",
"\n"
]
}
],
"source": [
"print(classification_rep_RF_full)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "823635f2315ecf05",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[11124 0]\n",
" [ 0 6990]]\n"
]
}
],
"source": [
"print(confusion_matrix_RF_full)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "d0136f7b9f6344c4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-5 {color: black;}#sk-container-id-5 pre{padding: 0;}#sk-container-id-5 div.sk-toggleable {background-color: white;}#sk-container-id-5 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-5 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-5 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-5 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-5 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-5 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-5 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-5 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-5 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-5 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-5 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-5 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-5 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-5 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-5 div.sk-label:hover label.sk-toggleable__label 
{background-color: #d4ebff;}#sk-container-id-5 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-5 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-5 div.sk-item {position: relative;z-index: 1;}#sk-container-id-5 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-5 div.sk-item::before, #sk-container-id-5 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-5 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-5 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-5 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-5 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-5 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-5 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-5 div.sk-label-container {text-align: center;}#sk-container-id-5 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-5 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-5\" class=\"sk-top-container\"><div class=\"sk-text-r
],
"text/plain": [
"RandomForestClassifier(random_state=123)"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model_RF_full"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "e02e9031d10617f6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['vectorizer.pkl']"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Zapisz model i vectorizer\n",
"joblib.dump(model_RF_full, 'spam_classifier_model.pkl')\n",
"joblib.dump(vectorizer, 'vectorizer.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "2ac5943e18571301",
"metadata": {},
"outputs": [],
"source": [
"# Note: the scikit-learn and joblib versions used here must match those in the application environment"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "a238743e07978f4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"scikit-fuzzy==0.4.2\n",
"scikit-learn==1.3.2\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# Report the installed versions that the saved .pkl files depend on.\n",
"# importlib.metadata replaces piping pip freeze through findstr: findstr exists only\n",
"# on Windows, and filtering on the word scikit never listed the joblib version.\n",
"import importlib.metadata\n",
"\n",
"for package in ('scikit-learn', 'joblib'):\n",
"    print(f'{package}=={importlib.metadata.version(package)}')"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "a64099b8c61a884",
"metadata": {},
"outputs": [],
"source": [
"# How to install a matching version?"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "d99c1dbe",
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-05T16:57:22.800834Z",
"start_time": "2024-06-05T16:57:22.798725Z"
}
},
"outputs": [],
"source": [
"# For example:\n",
"# pip install scikit-learn==1.3.2"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}