1887 lines
887 KiB
Plaintext
1887 lines
887 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 1,
|
|||
|
"id": "b313cab7d5cc49c0",
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"start_time": "2024-06-05T20:03:23.481431Z"
|
|||
|
},
|
|||
|
"jupyter": {
|
|||
|
"is_executing": true
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Requirement already satisfied: pandas in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (2.2.2)Note: you may need to restart the kernel to use updated packages.\n",
|
|||
|
"\n",
|
|||
|
"Requirement already satisfied: numpy>=1.26.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (1.26.4)\n",
|
|||
|
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from pandas) (2.9.0.post0)\n",
|
|||
|
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n",
|
|||
|
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n",
|
|||
|
"Requirement already satisfied: six>=1.5 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
|
|||
|
"Requirement already satisfied: matplotlib in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (3.9.0)\n",
|
|||
|
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (1.2.1)\n",
|
|||
|
"Requirement already satisfied: cycler>=0.10 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (0.12.1)\n",
|
|||
|
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (4.53.0)\n",
|
|||
|
"Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (1.4.5)\n",
|
|||
|
"Requirement already satisfied: numpy>=1.23 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (1.26.4)\n",
|
|||
|
"Requirement already satisfied: packaging>=20.0 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from matplotlib) (24.0)\n",
|
|||
|
"Requirement already satisfied: pillow>=8 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (10.3.0)\n",
|
|||
|
"Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib) (3.1.2)\n",
|
|||
|
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from matplotlib) (2.9.0.post0)\n",
|
|||
|
"Requirement already satisfied: six>=1.5 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n",
|
|||
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
|||
|
"Requirement already satisfied: nltk in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (3.8.1)\n",
|
|||
|
"Requirement already satisfied: click in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from nltk) (8.1.7)\n",
|
|||
|
"Requirement already satisfied: joblib in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from nltk) (1.4.2)\n",
|
|||
|
"Requirement already satisfied: regex>=2021.8.3 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from nltk) (2024.5.15)\n",
|
|||
|
"Requirement already satisfied: tqdm in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from nltk) (4.66.4)\n",
|
|||
|
"Requirement already satisfied: colorama in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from click->nltk) (0.4.6)\n",
|
|||
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
|||
|
"Requirement already satisfied: wordcloud in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (1.9.3)\n",
|
|||
|
"Requirement already satisfied: numpy>=1.6.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from wordcloud) (1.26.4)\n",
|
|||
|
"Requirement already satisfied: pillow in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from wordcloud) (10.3.0)\n",
|
|||
|
"Requirement already satisfied: matplotlib in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from wordcloud) (3.9.0)\n",
|
|||
|
"Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib->wordcloud) (1.2.1)\n",
|
|||
|
"Requirement already satisfied: cycler>=0.10 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib->wordcloud) (0.12.1)\n",
|
|||
|
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib->wordcloud) (4.53.0)\n",
|
|||
|
"Requirement already satisfied: kiwisolver>=1.3.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib->wordcloud) (1.4.5)\n",
|
|||
|
"Requirement already satisfied: packaging>=20.0 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from matplotlib->wordcloud) (24.0)\n",
|
|||
|
"Requirement already satisfied: pyparsing>=2.3.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from matplotlib->wordcloud) (3.1.2)\n",
|
|||
|
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from matplotlib->wordcloud) (2.9.0.post0)\n",
|
|||
|
"Requirement already satisfied: six>=1.5 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)\n",
|
|||
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
|||
|
"Requirement already satisfied: scikit-learn==1.3.2 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (1.3.2)\n",
|
|||
|
"Requirement already satisfied: numpy<2.0,>=1.17.3 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn==1.3.2) (1.26.4)\n",
|
|||
|
"Requirement already satisfied: scipy>=1.5.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn==1.3.2) (1.13.1)\n",
|
|||
|
"Requirement already satisfied: joblib>=1.1.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn==1.3.2) (1.4.2)\n",
|
|||
|
"Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-learn==1.3.2) (3.5.0)\n",
|
|||
|
"Note: you may need to restart the kernel to use updated packages.\n",
|
|||
|
"Requirement already satisfied: scikit-fuzzy==0.4.2 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (0.4.2)\n",
|
|||
|
"Requirement already satisfied: numpy>=1.6.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-fuzzy==0.4.2) (1.26.4)\n",
|
|||
|
"Requirement already satisfied: scipy>=0.9.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-fuzzy==0.4.2) (1.13.1)\n",
|
|||
|
"Requirement already satisfied: networkx>=1.9.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from scikit-fuzzy==0.4.2) (3.3)\n",
|
|||
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"[nltk_data] Downloading package punkt to\n",
|
|||
|
"[nltk_data] C:\\Users\\alicj\\AppData\\Roaming\\nltk_data...\n",
|
|||
|
"[nltk_data] Package punkt is already up-to-date!\n",
|
|||
|
"[nltk_data] Downloading package stopwords to\n",
|
|||
|
"[nltk_data] C:\\Users\\alicj\\AppData\\Roaming\\nltk_data...\n",
|
|||
|
"[nltk_data] Package stopwords is already up-to-date!\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Versions are pinned to the ones recorded in this notebook's saved install log, so a fresh environment reproduces the run.\n",
|
|||
|
"%pip install pandas==2.2.2\n",
|
|||
|
"%pip install matplotlib==3.9.0\n",
|
|||
|
"%pip install nltk==3.8.1\n",
|
|||
|
"%pip install wordcloud==1.9.3\n",
|
|||
|
"%pip install scikit-learn==1.3.2\n",
|
|||
|
"%pip install scikit-fuzzy==0.4.2\n",
|
|||
|
"# Package imports\n",
|
|||
|
"import nltk\n",
|
|||
|
"# Fetch the NLTK 'punkt' and 'stopwords' packages (a no-op when already up to date)\n",
|
|||
|
"nltk.download('punkt')\n",
|
|||
|
"nltk.download('stopwords')\n",
|
|||
|
"import pandas as pd\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"import re\n",
|
|||
|
"import string\n",
|
|||
|
"from wordcloud import WordCloud\n",
|
|||
|
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.naive_bayes import MultinomialNB\n",
|
|||
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|||
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
|||
|
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
|
|||
|
"from nltk.corpus import stopwords\n",
|
|||
|
"from nltk.stem import PorterStemmer\n",
|
|||
|
"from nltk.tokenize import word_tokenize\n",
|
|||
|
"import joblib\n",
|
|||
|
"import pickle"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 2,
|
|||
|
"id": "768266dbb79c5e9d",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Załaduj dane\n",
|
|||
|
"data_path = \"joined_data.csv\"\n",
|
|||
|
"data = pd.read_csv(data_path)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 3,
|
|||
|
"id": "ee08266d5c30627b",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" Unnamed: 0 Body Label\n",
|
|||
|
"0 0 Subject: congratulations\\n vince ,\\n congratul... 0\n",
|
|||
|
"1 1 \\nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0\n",
|
|||
|
"2 2 Big and big\\nMAIN PAGE\\nHuge big titties @ big... 1\n",
|
|||
|
"3 3 Subject: re : enron visit - - thanks\\n larry ,... 0\n",
|
|||
|
"4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(data.head())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 4,
|
|||
|
"id": "1798f605e33fe5e5",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|||
|
"RangeIndex: 18651 entries, 0 to 18650\n",
|
|||
|
"Data columns (total 3 columns):\n",
|
|||
|
" # Column Non-Null Count Dtype \n",
|
|||
|
"--- ------ -------------- ----- \n",
|
|||
|
" 0 Unnamed: 0 18651 non-null int64 \n",
|
|||
|
" 1 Body 18650 non-null object\n",
|
|||
|
" 2 Label 18651 non-null int64 \n",
|
|||
|
"dtypes: int64(2), object(1)\n",
|
|||
|
"memory usage: 437.3+ KB\n",
|
|||
|
"None\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# DataFrame.info() prints its report itself and returns None, so wrapping it in print() only appends a stray \"None\" line.\n",
|
|||
|
"data.info()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 5,
|
|||
|
"id": "b4f43d913b92485b",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Unnamed: 0</th>\n",
|
|||
|
" <th>Body</th>\n",
|
|||
|
" <th>Label</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>Subject: congratulations\\n vince ,\\n congratul...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>\\nhttp://news.bbc.co.uk/1/hi/scotland/2515231....</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>Big and big\\nMAIN PAGE\\nHuge big titties @ big...</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>Subject: re : enron visit - - thanks\\n larry ,...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18646</th>\n",
|
|||
|
" <td>18646</td>\n",
|
|||
|
" <td>Subject: fluid analysis\\n our customer speak v...</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18647</th>\n",
|
|||
|
" <td>18647</td>\n",
|
|||
|
" <td>Subject: guadalupe\\n i rolled 740208 , 740209 ...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18648</th>\n",
|
|||
|
" <td>18648</td>\n",
|
|||
|
" <td>100% Free Porn!\\nWhat more can you ask for?\\nC...</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18649</th>\n",
|
|||
|
" <td>18649</td>\n",
|
|||
|
" <td>Subject: revised nominations\\n daren ,\\n we ha...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18650</th>\n",
|
|||
|
" <td>18650</td>\n",
|
|||
|
" <td>Hello,\\nI've got a small problem but still ann...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>18651 rows × 3 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Unnamed: 0 Body Label\n",
|
|||
|
"0 0 Subject: congratulations\\n vince ,\\n congratul... 0\n",
|
|||
|
"1 1 \\nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0\n",
|
|||
|
"2 2 Big and big\\nMAIN PAGE\\nHuge big titties @ big... 1\n",
|
|||
|
"3 3 Subject: re : enron visit - - thanks\\n larry ,... 0\n",
|
|||
|
"4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0\n",
|
|||
|
"... ... ... ...\n",
|
|||
|
"18646 18646 Subject: fluid analysis\\n our customer speak v... 1\n",
|
|||
|
"18647 18647 Subject: guadalupe\\n i rolled 740208 , 740209 ... 0\n",
|
|||
|
"18648 18648 100% Free Porn!\\nWhat more can you ask for?\\nC... 1\n",
|
|||
|
"18649 18649 Subject: revised nominations\\n daren ,\\n we ha... 0\n",
|
|||
|
"18650 18650 Hello,\\nI've got a small problem but still ann... 0\n",
|
|||
|
"\n",
|
|||
|
"[18651 rows x 3 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 5,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"data"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 6,
|
|||
|
"id": "e3bf0f04a2be4e1a",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Usuwamy NaN"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 7,
|
|||
|
"id": "71a6bbebdb0dccd4",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"data.dropna(inplace=True)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 8,
|
|||
|
"id": "b7fca25d67381cdd",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Usuwamy puste wiadomości i wiadomości zawierające jedynie \"\\n\""
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 9,
|
|||
|
"id": "72d84bf6c1e7023a",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"data = data[data['Body'] != '\\n']"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 10,
|
|||
|
"id": "7c94c4dca6c4cdae",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"data = data[data['Body'] != 'empty']"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 11,
|
|||
|
"id": "7e6fd3f8014498f3",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"data.reset_index(drop=True, inplace=True)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 12,
|
|||
|
"id": "a0c33f82a936c59",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Unnamed: 0</th>\n",
|
|||
|
" <th>Body</th>\n",
|
|||
|
" <th>Label</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>Subject: congratulations\\n vince ,\\n congratul...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>\\nhttp://news.bbc.co.uk/1/hi/scotland/2515231....</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>Big and big\\nMAIN PAGE\\nHuge big titties @ big...</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>Subject: re : enron visit - - thanks\\n larry ,...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18109</th>\n",
|
|||
|
" <td>18646</td>\n",
|
|||
|
" <td>Subject: fluid analysis\\n our customer speak v...</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18110</th>\n",
|
|||
|
" <td>18647</td>\n",
|
|||
|
" <td>Subject: guadalupe\\n i rolled 740208 , 740209 ...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18111</th>\n",
|
|||
|
" <td>18648</td>\n",
|
|||
|
" <td>100% Free Porn!\\nWhat more can you ask for?\\nC...</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18112</th>\n",
|
|||
|
" <td>18649</td>\n",
|
|||
|
" <td>Subject: revised nominations\\n daren ,\\n we ha...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18113</th>\n",
|
|||
|
" <td>18650</td>\n",
|
|||
|
" <td>Hello,\\nI've got a small problem but still ann...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>18114 rows × 3 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Unnamed: 0 Body Label\n",
|
|||
|
"0 0 Subject: congratulations\\n vince ,\\n congratul... 0\n",
|
|||
|
"1 1 \\nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0\n",
|
|||
|
"2 2 Big and big\\nMAIN PAGE\\nHuge big titties @ big... 1\n",
|
|||
|
"3 3 Subject: re : enron visit - - thanks\\n larry ,... 0\n",
|
|||
|
"4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0\n",
|
|||
|
"... ... ... ...\n",
|
|||
|
"18109 18646 Subject: fluid analysis\\n our customer speak v... 1\n",
|
|||
|
"18110 18647 Subject: guadalupe\\n i rolled 740208 , 740209 ... 0\n",
|
|||
|
"18111 18648 100% Free Porn!\\nWhat more can you ask for?\\nC... 1\n",
|
|||
|
"18112 18649 Subject: revised nominations\\n daren ,\\n we ha... 0\n",
|
|||
|
"18113 18650 Hello,\\nI've got a small problem but still ann... 0\n",
|
|||
|
"\n",
|
|||
|
"[18114 rows x 3 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 12,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"data"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 13,
|
|||
|
"id": "19af5936d0cfeba2",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Label\n",
|
|||
|
"0 11124\n",
|
|||
|
"1 6990\n",
|
|||
|
"Name: count, dtype: int64\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Sprawdźmy rozkład targetów\n",
|
|||
|
"print(data['Label'].value_counts())"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 14,
|
|||
|
"id": "96c861e2655312cb",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Analiza długości wiadomości"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 15,
|
|||
|
"id": "e1ec1ed8aa7c856d",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def get_len(row):\n",
|
|||
|
"    \"\"\"Return len(row); a value that has no length is returned unchanged.\n",
|
|||
|
"\n",
|
|||
|
"    Args:\n",
|
|||
|
"        row: any object, typically one cell of a DataFrame column.\n",
|
|||
|
"\n",
|
|||
|
"    Returns:\n",
|
|||
|
"        The integer length of row, or row itself when len() does not apply.\n",
|
|||
|
"    \"\"\"\n",
|
|||
|
"    try:\n",
|
|||
|
"        return len(row)\n",
|
|||
|
"    except TypeError:\n",
|
|||
|
"        # len() raises TypeError for unsized values (e.g. a float NaN); pass those\n",
|
|||
|
"        # through untouched. Any other exception is a real error and propagates.\n",
|
|||
|
"        return row"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 16,
|
|||
|
"id": "63c023f34d234f3e",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"data['message_length'] = data['Body'].apply(get_len)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 17,
|
|||
|
"id": "d4fd0e2dcc2bfee9",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Unnamed: 0</th>\n",
|
|||
|
" <th>Body</th>\n",
|
|||
|
" <th>Label</th>\n",
|
|||
|
" <th>message_length</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>16293</th>\n",
|
|||
|
" <td>16774</td>\n",
|
|||
|
" <td>\\n4623\\n</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>6</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6071</th>\n",
|
|||
|
" <td>6254</td>\n",
|
|||
|
" <td>Subject: \\n</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3683</th>\n",
|
|||
|
" <td>3792</td>\n",
|
|||
|
" <td>Subject: \\n</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>12843</th>\n",
|
|||
|
" <td>13228</td>\n",
|
|||
|
" <td>Subject: \\n</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>17867</th>\n",
|
|||
|
" <td>18399</td>\n",
|
|||
|
" <td>Subject: \\n</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>10</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6927</th>\n",
|
|||
|
" <td>7128</td>\n",
|
|||
|
" <td>------------------------ Yahoo! Groups Sponsor...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>107989</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>6887</th>\n",
|
|||
|
" <td>7088</td>\n",
|
|||
|
" <td>Subject: enron mentions\\n enron discusses cred...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>121502</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2422</th>\n",
|
|||
|
" <td>2488</td>\n",
|
|||
|
" <td>=?GB2312?B?yNW12squ0ru97NbQufq5+rzKtefX08nosb...</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>129635</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1522</th>\n",
|
|||
|
" <td>1569</td>\n",
|
|||
|
" <td>change your settings: http://blo.gs/settings.p...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>194978</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4844</th>\n",
|
|||
|
" <td>4987</td>\n",
|
|||
|
" <td>,Body,Label\\n 0,\"Subject: great part-time or s...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>17085626</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>18114 rows × 4 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Unnamed: 0 Body Label \\\n",
|
|||
|
"16293 16774 \\n4623\\n 1 \n",
|
|||
|
"6071 6254 Subject: \\n 1 \n",
|
|||
|
"3683 3792 Subject: \\n 1 \n",
|
|||
|
"12843 13228 Subject: \\n 1 \n",
|
|||
|
"17867 18399 Subject: \\n 1 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"6927 7128 ------------------------ Yahoo! Groups Sponsor... 0 \n",
|
|||
|
"6887 7088 Subject: enron mentions\\n enron discusses cred... 0 \n",
|
|||
|
"2422 2488 =?GB2312?B?yNW12squ0ru97NbQufq5+rzKtefX08nosb... 1 \n",
|
|||
|
"1522 1569 change your settings: http://blo.gs/settings.p... 0 \n",
|
|||
|
"4844 4987 ,Body,Label\\n 0,\"Subject: great part-time or s... 0 \n",
|
|||
|
"\n",
|
|||
|
" message_length \n",
|
|||
|
"16293 6 \n",
|
|||
|
"6071 10 \n",
|
|||
|
"3683 10 \n",
|
|||
|
"12843 10 \n",
|
|||
|
"17867 10 \n",
|
|||
|
"... ... \n",
|
|||
|
"6927 107989 \n",
|
|||
|
"6887 121502 \n",
|
|||
|
"2422 129635 \n",
|
|||
|
"1522 194978 \n",
|
|||
|
"4844 17085626 \n",
|
|||
|
"\n",
|
|||
|
"[18114 rows x 4 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 17,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"data.sort_values(by='message_length')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 18,
|
|||
|
"id": "e62112260ebc17f0",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Jedna wiadomość jest bardzo długa 17085626"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 19,
|
|||
|
"id": "7c369131e3c91ce3",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"message_length\n",
|
|||
|
"293 68\n",
|
|||
|
"295 53\n",
|
|||
|
"291 52\n",
|
|||
|
"539 44\n",
|
|||
|
"446 40\n",
|
|||
|
" ..\n",
|
|||
|
"2394 1\n",
|
|||
|
"4856 1\n",
|
|||
|
"6192 1\n",
|
|||
|
"2597 1\n",
|
|||
|
"4004 1\n",
|
|||
|
"Name: count, Length: 4903, dtype: int64"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 19,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"data['message_length'].value_counts()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 20,
|
|||
|
"id": "b6b509692fd7c541",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2QAAAIkCAYAAABxx+gQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAB4WElEQVR4nO3dfXzO9f////ux2Sm2OdtmmZlzc05hyVnGaElaheQsEpFQQiEnlaIIOflUznonRVFCmPOzOUnOSRFR2VTMlOz0+fvDb8fX0YaN7XhpbtfLxeWy4/l6vl7H4/U4Dtq91+t4HjZjjBEAAAAAwOlcrC4AAAAAAO5UBDIAAAAAsAiBDAAAAAAsQiADAAAAAIsQyAAAAADAIgQyAAAAALAIgQwAAAAALEIgAwAAAACLEMgAAMihKVOm6H//+5/VZSAL8+bN03vvvWd1GQCQbQQyAMgnmjZtqmrVqt1wXpkyZdStW7dce96TJ0/KZrNp7ty5N7X/3LlzZbPZdPLkSftY06ZN1bRp01ypL7e99957GjNmjOrXr5/jfW02m0aNGpX7RUkaNWqUbDZbrh4zt98reW3ZsmXq06ePateuneN9/2vnCiD/IJABQC7LCBgZfwoUKKC77rpL3bp106+//mp1ebgF+/bt04gRI7R06VJVrFjR6nJwlV9//VU9evTQ//73PzVs2NDqcgAg2wpYXQAA5FdjxoxRaGioLl++rO3bt2vu3LnasmWLDh48KE9PT6vLw004dOiQFi1apHvvvfem9v/nn39UoEDe/Kd3+PDhGjp0aJ4c+79g3759mjZtmqKjo29q/6NHj8rFhf9PDcD5CGQAkEdat26tu+++W5LUs2dPFS9eXG+99ZaWLl2qxx9/3OLqcDOeeOKJW9o/L4N4gQIF8izs/Rc88MADt7S/h4dHLlUCADnD/woCACdp1KiRJOn48eMO4+vWrVOjRo1UsGBB+fn5qW3btjpy5Ih9e8ZntK7153pWr14tb29vdezYUampqQ7bjh07JklKTk7WyJEjVbduXfn6+qpgwYJq1KiR1q9fn+l4CQkJ6tatm3x9feXn56euXbsqISEh2z04dOiQ7r//fnl5ealUqVJ67bXXlJ6efs35GTVu2LBBNptNGzZscNh+rc+vLVq0SGFhYfL09FS1atW0ZMkSdevWTWXKlHGY9/fff+uFF15QcHCwPDw8VKlSJb399tsyxjjMi4mJ0X333Sc/Pz8VKlRIlSpV0ssvv+ww5/Llyxo1apQqVqwoT09PlSxZUo888ojD632jz5AZY1S8eHENGjTIPpaeni4/Pz+5uro69Pqtt95SgQIF9Ndff0nK+jNkc+bM0f333y9/f395eHgoLCxMM2bMyPJ5X3vtNZUqVUre3t5q1qyZDh06lGWNP/30kx577DEVLVpU3t7eatCggZYvX+4wJ+P1WrhwoUaPHq277rpLhQsX1qOPPqoLFy4oKSlJAwYMkL+/vwoVKqTu3bsrKSnJ4RipqakaO3asypUrJw8PD5UpU0Yvv/xypnnffvutIiMjVbx4cXl5eSk0NFRPPfWUw5z09HRNnjxZ1atXl6enp0qUKKFWrVrp22+/tc/hM2QArHLn/q80AHCyjEUrihQpYh9bs2aNWrdurbJly2rUqFH6559/NHXqVDVs2FDfffedypQpoxIlSmRa0S8lJUUDBw6Uu7v7NZ9v2bJlevTRR9W+fXvNnj1brq6u9m0xMTFasmSJTpw4ofT0dH344Yfq2LGjnn76aV28eFGzZs1SZGSkdu7cqVq1akm68kt727ZttWXLFvXu3VtVqlTRkiVL1LVr12ydf1xcnJo1a6bU1FQNHTpUBQsW1Pvvvy8vL68s5x86dEi1atXSrl27snX8DMuXL1f79u1VvXp1jRs3TufPn1ePHj101113Ocwzxuihhx7S+vXr1aNHD9WqVUurVq3S4MGD9euvv2rSpEn2Oh588EHVqFFDY8aMkY
eHh44dO6atW7faj5WWlqYHH3xQa9euVYcOHfT888/r4sWLiomJ0cGDB1WuXLls1W6z2dSwYUNt2rTJPrZ//35duHBBLi4u2rp1q6KioiRJmzdvVu3atVWoUKFrHm/GjBmqWrWqHnroIRUoUEBff/21nn32WaWnp6tv3772eSNHjtRrr72mBx54QA888IC+++47tWzZUsnJyQ7Hi4+P17333qtLly6pf//+KlasmObNm6eHHnpIn3/+udq1a+cwf9y4cfLy8tLQoUN17NgxTZ06VW5ubnJxcdH58+c1atQo++28oaGhGjlypH3fnj17at68eXr00Uf1wgsvaMeOHRo3bpyOHDmiJUuWSJLOnj2rli1bqkSJEho6dKj8/Px08uRJLV682KGOHj16aO7cuWrdurV69uyp1NRUbd68Wdu3b7dfxQYAyxgAQK6aM2eOkWTWrFljfv/9d3P69Gnz+eefmxIlShgPDw9z+vRp+9xatWoZf39/8+eff9rH9u3bZ1xcXEyXLl2u+RzPPvuscXV1NevWrbOPNWnSxFStWtUYY8wXX3xh3NzczNNPP23S0tLsc86fP2+KFy9uSpcubfbu3WuMMSY1NdUkJSU5HP/8+fMmICDAPPXUU/axL7/80kgy48ePt4+lpqaaRo0aGUlmzpw51+3LgAEDjCSzY8cO+9jZs2eNr6+vkWROnDhhjDHmn3/+MVWrVjVFihQxa9euNcYYs379eiPJrF+/3uGYJ06cyPTc1atXN6VKlTIXL160j23YsMFIMiEhIZnO57XXXnM45qOPPmpsNps5duyYMcaYSZMmGUnm999/v+a5zZ4920gyEydOzLQtPT3d/rMk8+qrr17zOMYYM2HCBOPq6moSExONMcZMmTLFhISEmHr16pkhQ4YYY4xJS0szfn5+ZuDAgfb9Xn31VfPv/6xfunQp0/EjIyNN2bJl7Y/Pnj1r3N3dTVRUlEOtL7/8spFkunbtah/LeA03b95sH7t48aIJDQ01ZcqUsb/XMl6vatWqmeTkZPvcjh07GpvNZlq3bu1QU3h4uMNrs3fvXiPJ9OzZ02Heiy++aCTZ3/dLliwxksyuXbuy6OQV69atM5JM//79M227+nxDQkIczhUAnIVbFgEgj0RERKhEiRIKDg7Wo48+qoIFC2rp0qUqVaqUJOnMmTPau3evunXrpqJFi9r3q1Gjhlq0aKEVK1ZkedyPPvpI06dP1/jx49WsWbNM2xcsWKD27dvrmWee0f/93//ZFyo4evSo7r77bv3xxx9q1qyZatasKUlydXW1X2lLT0/XuXPnlJqaqrvvvlvfffed/bgrVqxQgQIF1KdPH/uYq6urnnvuuWz1Y8WKFWrQoIHq1atnHytRooQ6depkfxwfH69GjRrp0KFDqlGjhu6///5sHTvDb7/9pgMHDqhLly4OV46aNGmi6tWrZ6rH1dVV/fv3dxh/4YUXZIzRN998I0ny8/OTJH311VfXvL3yiy++UPHixbPsRU6Xom/UqJHS0tK0bds2SVeuhDVq1EiNGjXS5s2bJUkHDx5UQkKC/TbYa7n66uOFCxf0xx9/qEmTJvrpp5904cIFSVeu0iYnJ+u5555zqHXAgAGZjrdixQrVq1dP9913n32sUKFC6tWrl06ePKnDhw87zO/SpYvc3Nzsj+vXry9jTKZbCuvXr6/Tp0/bb6vNeO9ffeumdOW1kWS/RTLjtVm2bJlSUlKy7MEXX3whm82mV199NdO23P6aAAC4GQQyAMgj06ZNU0xMjD7//HM98MAD+uOPPxwWDvj5558lSZUqVcq0b5UqVfTHH3/o77//dhjfu3evevfurY4dO2b6ZVWSTpw4oSeffFLR0dGaOnWqwy+cBQsW1FNPPaXSpUtn2m/evHmqUaOGPD09VaxYMZUoUULLly+3/9KeUW/JkiUz3SKXVf1Z+fnnn1WhQoVM41fvX7BgQbVs2VLh4eHZOmZWzyFJ5cuXz7Tt32M///yzgo
KCVLhwYYfxKlWqOByrffv2atiwoXr27KmAgAB16NBBCxcudAhnx48fV6VKlXJlUY06derI29vbHr4yAlnjxo317bf
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Histogram długości wiadomości dla każdej kategorii - ograniczamy do 200.000 znaków celem wyświetlenia histogramów\n",
|
|||
|
"hist_data = data[data['message_length'] < 200000]\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"hist_data[hist_data['Label'] == 0]['message_length'].hist(bins=100, alpha=0.6, label='Not Spam')\n",
|
|||
|
"hist_data[hist_data['Label'] == 1]['message_length'].hist(bins=100, alpha=0.6, label='Spam')\n",
|
|||
|
"plt.legend()\n",
|
|||
|
"plt.xlabel('Długość wiadomości')\n",
|
|||
|
"plt.ylabel('Liczba wiadomości')\n",
|
|||
|
"plt.title('Rozkład długości wiadomości')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 21,
|
|||
|
"id": "7182d6a1d6600c2",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Ograniczamy jeszcze bardziej - do 10000 znaków"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 22,
|
|||
|
"id": "962efe0bd652ecdb",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA1IAAAIkCAYAAAAUKhpvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAABuoUlEQVR4nO3de3zP9f//8ft75w3bDNschxzHRI5LTplzzhWSUwclEkpRkUM1qRCRbyVUUhQSkjlLTgnNIZWIyqZijpkdnr8/+u398W4He82293vcrpfLLpe9n8/n6/V6vN6vB94Pz9fr+bYZY4wAAAAAANnm5uwAAAAAAKCgoZACAAAAAIsopAAAAADAIgopAAAAALCIQgoAAAAALKKQAgAAAACLKKQAAAAAwCIKKQAAAACwiEIKAICrTJ8+XR988IGzw0AG5s+frzfffNPZYQCAJAopACgQmjdvrpo1a15zXPny5dW/f/9cO+6xY8dks9k0b968HG0/b9482Ww2HTt2zN7WvHlzNW/ePFfiy21vvvmmJkyYoIYNG1re1mazady4cbkflKRx48bJZrPl6j5zO1fy2ooVKzRo0CDVqVPH8rYF7VwBFAwUUgBgQVphkPbj4eGh0qVLq3///vr999+dHR6uw759+zRmzBgtX75cVapUcXY4uMrvv/+uBx98UB988IEaN27s7HAAQJLk4ewAAKAgmjBhgipUqKDLly9r+/btmjdvnr7++mvt379fPj4+zg4POXDgwAEtXrxYt99+e462/+eff+ThkTf/rD7//PMaNWpUnuy7INi3b59mzpyp7t2752j7w4cPy82N/zsGkLsopAAgB9q1a6d69epJkh566CEVL15cr7zyipYvX657773XydEhJ+67777r2j4vC2gPD488K9IKgvbt21/X9t7e3rkUCQD8D/89AwC5oEmTJpKkI0eOOLSvX79eTZo0UaFChRQYGKjOnTvr0KFD9v60Z5Ay+8nKmjVr5Ofnp169eik5Odmh7+eff5YkXblyRWPHjlXdunUVEBCgQoUKqUmTJtqwYUO6/SUkJKh///4KCAhQYGCg+vXrp4SEhGy/BwcOHNCdd94pX19flSlTRi+++KJSU1MzHZ8W48aNG2Wz2bRx40aH/syez1q8eLHCw8Pl4+OjmjVraunSperfv7/Kly/vMO7ixYt68sknVbZsWXl7e6tq1ap67bXXZIxxGBcTE6M77rhDgYGBKly4sKpWrapnn33WYczly5c1btw4ValSRT4+PipZsqS6devmcL2v9YyUMUbFixfXiBEj7G2pqakKDAyUu7u7w3v9yiuvyMPDQxcuXJCU8TNSc+fO1Z133qng4GB5e3srPDxcb731VobHffHFF1WmTBn5+fmpRYsWOnDgQIYx/vLLL7rnnnsUFBQkPz8/NWrUSCtXrnQYk3a9Fi1apPHjx6t06dIqUqSI7r77bp09e1aJiYkaNmyYgoODVbhwYQ0YMECJiYkO+0hOTtbEiRN1yy23yNvbW+XLl9ezzz6bbty3336rNm3aqHjx4vL19VWFChX0wAMPOIxJTU3VG2+8oYiICPn4+KhEiRJq27atvv32W/sYnpECkBdu3v/eAoBclLaYQtGiRe1ta9euVbt27VSxYkWNGzdO//zzj2bMmKHGjRvru+++U/ny5VWiRIl0K8QlJSVp+PDh8vLyyvR4K1as0N13360ePXrovffek7u7u70vJiZGS5cu1dGjR5Wamqp3331XvXr10sMPP6zz589rzpw5atOmjXbu3KnatWtL+vfDdufOnfX111/r0UcfVfXq1bV06VL169cvW+cfFxenFi1aKDk5WaNGjVKhQoX09ttvy9fXN8PxBw4cUO3atbVr165s7T/NypUr1aNHD0VERCg6OlpnzpzRgw8+qNKlSzuMM8aoU6dO2rBhgx588EHVrl1bX331lUaOHKnff/9dU6dOtcdx1113qVatWpowYYK8vb
31888/a+vWrfZ9paSk6K677tK6devUs2dPPfHEEzp//rxiYmK0f/9+3XLLLdmK3WazqXHjxtq8ebO97fvvv9fZs2fl5uamrVu3qkOHDpKkLVu2qE6dOipcuHCm+3vrrbdUo0YNderUSR4eHvriiy/02GOPKTU1VYMHD7aPGzt2rF588UW1b99e7du313fffafWrVvrypUrDvuLj4/X7bffrkuXLmno0KEqVqyY5s+fr06dOunTTz9V165dHcZHR0fL19dXo0aN0s8//6wZM2bI09NTbm5uOnPmjMaNG2e/7bVChQoaO3asfduHHnpI8+fP1913360nn3xSO3bsUHR0tA4dOqSlS5dKkk6dOqXWrVurRIkSGjVqlAIDA3Xs2DEtWbLEIY4HH3xQ8+bNU7t27fTQQw8pOTlZW7Zs0fbt2+2zxgCQJwwAINvmzp1rJJm1a9eaP//805w4ccJ8+umnpkSJEsbb29ucOHHCPrZ27domODjY/P333/a2ffv2GTc3N9O3b99Mj/HYY48Zd3d3s379entbs2bNTI0aNYwxxnz22WfG09PTPPzwwyYlJcU+5syZM6Z48eKmXLlyZu/evcYYY5KTk01iYqLD/s+cOWNCQkLMAw88YG9btmyZkWQmT55sb0tOTjZNmjQxkszcuXOzfF+GDRtmJJkdO3bY206dOmUCAgKMJHP06FFjjDH//POPqVGjhilatKhZt26dMcaYDRs2GElmw4YNDvs8evRoumNHRESYMmXKmPPnz9vbNm7caCSZsLCwdOfz4osvOuzz7rvvNjabzfz888/GGGOmTp1qJJk///wz03N77733jCQzZcqUdH2pqan23yWZF154IdP9GGPMq6++atzd3c25c+eMMcZMnz7dhIWFmQYNGphnnnnGGGNMSkqKCQwMNMOHD7dv98ILL5j//pN96dKldPtv06aNqVixov31qVOnjJeXl+nQoYNDrM8++6yRZPr162dvS7uGW7ZssbedP3/eVKhQwZQvX96ea2nXq2bNmubKlSv2sb169TI2m820a9fOIabIyEiHa7N3714jyTz00EMO45566ikjyZ73S5cuNZLMrl27Mngn/7V+/XojyQwdOjRd39XnGxYW5nCuAJAbuLUPAHIgKipKJUqUUNmyZXX33XerUKFCWr58ucqUKSNJOnnypPbu3av+/fsrKCjIvl2tWrXUqlUrrVq1KsP9vv/++5o1a5YmT56sFi1apOtfuHChevTooUceeUT/93//Z3+A/vDhw6pXr57++usvtWjRQrfeeqskyd3d3T6zlZqaqtOnTys5OVn16tXTd999Z9/vqlWr5OHhoUGDBtnb3N3d9fjjj2fr/Vi1apUaNWqkBg0a2NtKlCih3r1721/Hx8erSZMmOnDggGrVqqU777wzW/tO88cffyg2NlZ9+/Z1mKlp1qyZIiIi0sXj7u6uoUOHOrQ/+eSTMsboyy+/lCQFBgZKkj7//PNMb0P87LPPVLx48QzfC6tLkjdp0kQpKSn65ptvJP0789SkSRM1adJEW7ZskSTt379fCQkJ9ttFM3P1bN/Zs2f1119/qVmzZvrll1909uxZSf/Oil65ckWPP/64Q6zDhg1Lt79Vq1apQYMGuuOOO+xthQsX1sCBA3Xs2DEdPHjQYXzfvn3l6elpf92wYUMZY9LdetewYUOdOHHCfvtpWu5ffYuj9O+1kWS/lTDt2qxYsUJJSUkZvgefffaZbDabXnjhhXR9ub1cPAD8F4UUAOTAzJkzFRMTo08//VTt27fXX3/95fBA+6+//ipJqlq1arptq1evrr/++ksXL150aN+7d68effRR9erVK92HTEk6evSo7r//fnXv3l0zZsxw+KBYqFAhPfDAAypXrly67ebPn69atWrJx8dHxYoVU4kSJbRy5Ur7h+20eEuWLJnuVrKM4s/Ir7/+qsqVK6drv3r7QoUKqXXr1oqMjMzWPjM6hiRVqlQpXd9/23799VeVKl
VKRYoUcWivXr26w7569Oihxo0b66GHHlJISIh69uypRYsWORRVR44cUdWqVXNlsYfbbrtNfn5+9qIprZBq2rSpvv3
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Histogram długości wiadomości dla każdej kategorii - ograniczamy do 10000 znaków celem wyświetlenia histogramów\n",
|
|||
|
"hist_data = data[data['message_length'] < 10000]\n",
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"hist_data[hist_data['Label'] == 0]['message_length'].hist(bins=100, alpha=0.6, label='Not Spam')\n",
|
|||
|
"hist_data[hist_data['Label'] == 1]['message_length'].hist(bins=100, alpha=0.6, label='Spam')\n",
|
|||
|
"plt.legend()\n",
|
|||
|
"plt.xlabel('Długość wiadomości')\n",
|
|||
|
"plt.ylabel('Liczba wiadomości')\n",
|
|||
|
"plt.title('Rozkład długości wiadomości')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 23,
|
|||
|
"id": "eaa483deb9c81942",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Można zauważyć, że trudno odróżnić wiadomości po samej długości. W tym celu należy skorzystać z bardziej zaawansowanych metod."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 24,
|
|||
|
"id": "6e0ee5fccf308cd1",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Przetwarzanie tekstu"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 25,
|
|||
|
"id": "50c0131db25859cb",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Unnamed: 0</th>\n",
|
|||
|
" <th>Body</th>\n",
|
|||
|
" <th>Label</th>\n",
|
|||
|
" <th>message_length</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>Subject: congratulations\\n vince ,\\n congratul...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>129</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>\\nhttp://news.bbc.co.uk/1/hi/scotland/2515231....</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>435</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>Big and big\\nMAIN PAGE\\nHuge big titties @ big...</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>231</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>Subject: re : enron visit - - thanks\\n larry ,...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1180</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>574</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18109</th>\n",
|
|||
|
" <td>18646</td>\n",
|
|||
|
" <td>Subject: fluid analysis\\n our customer speak v...</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>927</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18110</th>\n",
|
|||
|
" <td>18647</td>\n",
|
|||
|
" <td>Subject: guadalupe\\n i rolled 740208 , 740209 ...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>337</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18111</th>\n",
|
|||
|
" <td>18648</td>\n",
|
|||
|
" <td>100% Free Porn!\\nWhat more can you ask for?\\nC...</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>345</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18112</th>\n",
|
|||
|
" <td>18649</td>\n",
|
|||
|
" <td>Subject: revised nominations\\n daren ,\\n we ha...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>346</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>18113</th>\n",
|
|||
|
" <td>18650</td>\n",
|
|||
|
" <td>Hello,\\nI've got a small problem but still ann...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>744</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>18114 rows × 4 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Unnamed: 0 Body Label \\\n",
|
|||
|
"0 0 Subject: congratulations\\n vince ,\\n congratul... 0 \n",
|
|||
|
"1 1 \\nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0 \n",
|
|||
|
"2 2 Big and big\\nMAIN PAGE\\nHuge big titties @ big... 1 \n",
|
|||
|
"3 3 Subject: re : enron visit - - thanks\\n larry ,... 0 \n",
|
|||
|
"4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0 \n",
|
|||
|
"... ... ... ... \n",
|
|||
|
"18109 18646 Subject: fluid analysis\\n our customer speak v... 1 \n",
|
|||
|
"18110 18647 Subject: guadalupe\\n i rolled 740208 , 740209 ... 0 \n",
|
|||
|
"18111 18648 100% Free Porn!\\nWhat more can you ask for?\\nC... 1 \n",
|
|||
|
"18112 18649 Subject: revised nominations\\n daren ,\\n we ha... 0 \n",
|
|||
|
"18113 18650 Hello,\\nI've got a small problem but still ann... 0 \n",
|
|||
|
"\n",
|
|||
|
" message_length \n",
|
|||
|
"0 129 \n",
|
|||
|
"1 435 \n",
|
|||
|
"2 231 \n",
|
|||
|
"3 1180 \n",
|
|||
|
"4 574 \n",
|
|||
|
"... ... \n",
|
|||
|
"18109 927 \n",
|
|||
|
"18110 337 \n",
|
|||
|
"18111 345 \n",
|
|||
|
"18112 346 \n",
|
|||
|
"18113 744 \n",
|
|||
|
"\n",
|
|||
|
"[18114 rows x 4 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"data"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 26,
|
|||
|
"id": "c32c52a7b2575a3b",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"stop_words = set(stopwords.words('english'))\n",
|
|||
|
"ps = PorterStemmer()\n",
|
|||
|
"\n",
|
|||
|
"def preprocess_text(text):\n",
|
|||
|
"    # Usuwanie cyfr i znaków interpunkcyjnych oraz tokenizacja\n",
|
|||
|
" text = re.sub(r'\\d+', '', text)\n",
|
|||
|
" text = text.translate(str.maketrans('', '', string.punctuation))\n",
|
|||
|
" words = word_tokenize(text)\n",
|
|||
|
" # Usuwanie stopwords i stemming\n",
|
|||
|
" words = [ps.stem(word) for word in words if word.lower() not in stop_words]\n",
|
|||
|
" return \" \".join(words)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 27,
|
|||
|
"id": "5953cb974349cb33",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Ten proces jest czasochłonny"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 28,
|
|||
|
"id": "89b8cdeaa9da5c2d",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"data['processed_message'] = data['Body'].apply(preprocess_text)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 29,
|
|||
|
"id": "ccce395ac94c39a1",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>Unnamed: 0</th>\n",
|
|||
|
" <th>Body</th>\n",
|
|||
|
" <th>Label</th>\n",
|
|||
|
" <th>message_length</th>\n",
|
|||
|
" <th>processed_message</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>Subject: congratulations\\n vince ,\\n congratul...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>129</td>\n",
|
|||
|
" <td>subject congratul vinc congratul wish best luc...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>\\nhttp://news.bbc.co.uk/1/hi/scotland/2515231....</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>435</td>\n",
|
|||
|
" <td>httpnewsbbccoukhiscotlandstm yahoo group spons...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>2</td>\n",
|
|||
|
" <td>Big and big\\nMAIN PAGE\\nHuge big titties @ big...</td>\n",
|
|||
|
" <td>1</td>\n",
|
|||
|
" <td>231</td>\n",
|
|||
|
" <td>big big main page huge big titti bigbigscom sa...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>3</td>\n",
|
|||
|
" <td>Subject: re : enron visit - - thanks\\n larry ,...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>1180</td>\n",
|
|||
|
" <td>subject enron visit thank larri think potenti ...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>4</td>\n",
|
|||
|
" <td>On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan...</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td>574</td>\n",
|
|||
|
" <td>fri aug ryan shane mention imho stop spammer g...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" Unnamed: 0 Body Label \\\n",
|
|||
|
"0 0 Subject: congratulations\\n vince ,\\n congratul... 0 \n",
|
|||
|
"1 1 \\nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0 \n",
|
|||
|
"2 2 Big and big\\nMAIN PAGE\\nHuge big titties @ big... 1 \n",
|
|||
|
"3 3 Subject: re : enron visit - - thanks\\n larry ,... 0 \n",
|
|||
|
"4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0 \n",
|
|||
|
"\n",
|
|||
|
" message_length processed_message \n",
|
|||
|
"0 129 subject congratul vinc congratul wish best luc... \n",
|
|||
|
"1 435 httpnewsbbccoukhiscotlandstm yahoo group spons... \n",
|
|||
|
"2 231 big big main page huge big titti bigbigscom sa... \n",
|
|||
|
"3 1180 subject enron visit thank larri think potenti ... \n",
|
|||
|
"4 574 fri aug ryan shane mention imho stop spammer g... "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 29,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"data.head()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"id": "7ce382be7bcdff2c",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"0 subject congratul vinc congratul wish best luc...\n",
|
|||
|
"1 httpnewsbbccoukhiscotlandstm yahoo group spons...\n",
|
|||
|
"2 big big main page huge big titti bigbigscom sa...\n",
|
|||
|
"3 subject enron visit thank larri think potenti ...\n",
|
|||
|
"4 fri aug ryan shane mention imho stop spammer g...\n",
|
|||
|
" ... \n",
|
|||
|
"18109 subject fluid analysi custom speak volum spur ...\n",
|
|||
|
"18110 subject guadalup roll june ena deal guadalup d...\n",
|
|||
|
"18111 free porn ask click â â â remov instruct striv...\n",
|
|||
|
"18112 subject revis nomin daren receiv revis nomin p...\n",
|
|||
|
"18113 hello ive got small problem still annoy upgrad...\n",
|
|||
|
"Name: processed_message, Length: 18114, dtype: object"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"data['processed_message']"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 31,
|
|||
|
"id": "dc456d793b576f7",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Analiza słów za pomocą WordCloud\n",
|
|||
|
"spam_words = ' '.join(list(data[data['Label'] == 1]['processed_message']))\n",
|
|||
|
"not_spam_words = ' '.join(list(data[data['Label'] == 0]['processed_message']))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 32,
|
|||
|
"id": "c9d7d9c9f4ae91ed",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAGtCAYAAACREAK2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOy9d5gd532Y+06f08+e7X2x6IXoBEmQ6oVWFyMqKkkkF8ly7Jix48T3Kn/YcRzbiX3jx45sOdG9jiXLdqxiybIqRbGIDexE78Bie9/Tz5n+3T9mscBiK4AFSVHn1fMI3GnfN3Om/PpPEkIIatSoUaNGjRo1atSoUWMNkV/tCdSoUaNGjRo1atSoUeP1R03RqFGjRo0aNWrUqFGjxppTUzRq1KhRo0aNGjVq1Kix5tQUjRo1atSoUaNGjRo1aqw5NUWjRo0aNWrUqFGjRo0aa05N0ahRo0aNGjVq1KhRo8aaU1M0atSoUaNGjRo1atSosebUFI0aNWrUqFGjRo0aNWqsOTVFo0aNGjVq1KhRo0aNGmtOTdGoUaNGjVeAxx57DEmSeOyxx9b0uD09Pfzsz/7smh5ztbz5zW/mzW9+8w3vL0kS/+k//ae5v7/4xS8iSRKXLl266bnVqFGjRo1Xn5qiUaNGjdcNX/3qV5EkiW9+85sL1u3atQtJknj00UcXrOvq6uLgwYOvxBRXzYULF/jMZz5Db28vpmmSTCa5++67+dM//VOq1eqrPb2fGI4dO8b9999Pd3c3pmnS3t7OO97xDj73uc+92lOrUaNGjdc9NUWjRo0arxvuueceAJ588sl5ywuFAsePH0dVVZ566ql56wYHBxkcHJzb97XAd7/7XW677Ta++tWv8r73vY/Pfe5z/MEf/AFdXV38h//wH/i3//bfvtpT/Ing6aefZv/+/Rw5coRPf/rT/Nmf/Rmf+tSnkGWZP/3TP321p1ejRo0ar3vUV3sCNWrUqLFWtLW1sW7dugWKxqFDhxBC8OEPf3jBust/36yiIYTAsiwikchNHaevr4+PfvSjdHd388gjj9Da2jq37ld+5Vc4f/483/3ud29qjJ8Wfu/3fo9UKsXzzz9POp2et25iYuLVmVSNGjVq/BRR82jUqFHjdcU999zDyy+/PC+86KmnnmL79u28613v4plnniEIgnnrJEni7rvvBsDzPH73d3+X9evXYxgGPT09/Mf/+B+xbXveOD09Pbz3ve/lwQcfZP/+/UQiEf7X//pfAAwNDfHBD36QWCxGU1MTv/7rv75g/6X4wz/8Q0qlEn/5l385T8m4zIYNG1b0aFy8eJEPf/jDZDIZotEod9555wLlZKl8iKVySb7whS+wfv16IpEIBw4c4IknnljV+QDYts2v//qv09jYSCKR4P3vfz9DQ0Or2vdb3/oW73nPe2hra8MwDNavX8/v/u7v4vv+ivteuHCB7du3L1AyAJqamub9LUkS/+bf/Bv+9m//ls2bN2OaJvv27ePxxx+ft11/fz+//Mu/zObNm4lEItTX1/PhD394wXW8fH2ffPJJHnjgARobG0mn03zmM5/BcRxyuRyf+MQnqKuro66ujt/8zd9ECLGqa1KjRo0aPynUPBo1atR4XXHPPffw5S9/mWeffXYuUfmpp57i4MGDHDx4kHw+z/Hjx9m5c+fcui1btlBfXw/Apz71Kb70pS9x//338xu/8Rs8++yz/MEf/AGnTp1akPtx5swZPvaxj/GZz3yGT3/602zevJlqtcrb3vY2BgYGeOCBB2hra+PLX/4yjzzyyKrm/+1vf5ve3t4bzhkZHx/n4MGDVCoVHnjgAerr6/nSl77E+9//fr7+9a9z3333Xfcx//Iv/5LPfOYzHDx4kF/7tV/j4sWLvP/97yeTydDZ2bni/p/61Kf4m7/5Gz7+8Y9z8OBBHnnkEd7znvesauwvfvGLxONx/t2/+3fE43EeeeQRfuu3fotCocAf/dEfLbtvd3
c3hw4d4vjx4+zYsWPFsX784x/zla98hQceeADDMPj85z/Pz/zMz/Dcc8/N7f/888/z9NNP89GPfpSOjg4uXbrEX/zFX/DmN7+ZkydPEo1G5x3zV3/1V2lpaeF3fud3eOaZZ/jCF75AOp3m6aefpquri9///d/ne9/7Hn/0R3/Ejh07+MQnPrGq61KjRo0aPxGIGjVq1HgdceLECQGI3/3d3xVCCOG6rojFYuJLX/qSEEKI5uZm8ed//udCCCEKhYJQFEV8+tOfFkIIcfjwYQGIT33qU/OO+e///b8XgHjkkUfmlnV3dwtA/OAHP5i37Z/8yZ8IQHz1q1+dW1Yul8WGDRsEIB599NEl557P5wUgPvCBD6z6fLu7u8UnP/nJub9/7dd+TQDiiSeemFtWLBbFunXrRE9Pj/B9XwghxF/91V8JQPT19c073qOPPjpvno7jiKamJrF7925h2/bcdl/4whcEIN70pjctO7/L1/SXf/mX5y3/+Mc/LgDx27/923PLFptTpVJZcMzPfOYzIhqNCsuylh37hz/8oVAURSiKIu666y7xm7/5m+LBBx8UjuMs2BYQgHjhhRfmlvX39wvTNMV999237HwOHTokAPHXf/3XC87l3nvvFUEQzC2/6667hCRJ4pd+6ZfmlnmeJzo6Ola8ljVq1Kjxk0YtdKpGjRqvK7Zu3Up9ff1c7sWRI0col8tzHoKDBw/OJYQfOnQI3/fn8jO+973vAfDv/t2/m3fM3/iN3wBYEH60bt067r333nnLvve979Ha2sr9998/tywajfKLv/iLK869UCgAkEgkVneyi/C9732PAwcOzMs5icfj/OIv/iKXLl3i5MmT13W8F154gYmJCX7pl34JXdfnlv/sz/4sqVRqVfMBeOCBB+Yt/7Vf+7VVjX91zkuxWGRqaoo3vOENVCoVTp8+vey+73jHOzh06BDvf//7OXLkCH/4h3/IvffeS3t7O//0T/+0YPu77rqLffv2zf3d1dXFBz7wAR588MG5UK2r5+O6LtPT02zYsIF0Os1LL7204Ji/8Au/gCRJc3/fcccdCCH4hV/4hblliqKwf/9+Ll68uIorUqNGjRo/OdQUjRo1aryukCSJgwcPzuViPPXUUzQ1NbFhwwZgvqJx+d/LQnl/fz+yLM9te5mWlhbS6TT9/f3zlq9bt27B+P39/WzYsGGecAmwefPmFeeeTCaBUKC+Ufr7+xcda+vWrXPrr/d4ABs3bpy3XNM0ent7V7W/LMusX79+3vLVXA+AEydOcN9995FKpUgmkzQ2NvIv/+W/BCCfz6+4/+233843vvENstkszz33HJ/97GcpFovcf//9C5Sua88RYNOmTVQqFSYnJwGoVqv81m/9Fp2dnRiGQUNDA42NjeRyuUXn09XVNe/vy8rZtSFnqVSKbDa74vnUqFGjxk8SNUWjRo0arzvuuece8vk8x44dm8vPuMzBgwfp7+9neHiYJ598kra2tgUC87VKwlLcbIWpa0kmk7S1tXH8+PE1Pe5iLHWOq0myfqXI5XK86U1v4siRI/zn//yf+fa3v81DDz3Ef/tv/w1gXlL/Sui6zu23387v//7v8xd/8Re4rsvXvva1657Tr/7qr/J7v/d7/PN//s/56le/yg9/+EMeeugh6uvrF52PoiiLHmex5aKWDF6jRo3XGbVk8Bo1arzuuLqfxlNPPTUvTGffvn0YhsFjjz3Gs88+y7vf/e65dd3d3QRBwLlz5+Y8ABAmWOdyObq7u1ccu7u7m+PHjyOEmCfMnzlzZlVzf+9738sXvvAFDh06xF133bWqfa4df7GxLocZXT6Huro6IBTmr+Zaj8fl7c+dO8db3/rWueWu69LX18euXbtWnE8QBFy4cGGeF2M11+Oxxx5jenqab3zjG7zxjW+cW97X17fivsuxf/9+AEZHR+ctP3fu3IJtz549SzQapbGxEYCvf/3rfPKTn+S///f/PreNZVkLrmONGjVq1Kh5NG
rUqPE6ZP/+/Zimyd/+7d8yPDw8z6NhGAZ79+7lz//8zymXy/NyGS4rHX/yJ38y73h//Md/DLCqSknvfve7GRkZ4et
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"wordcloud_spam = WordCloud(width=800, height=400).generate(spam_words)\n",
|
|||
|
"plt.imshow(wordcloud_spam, interpolation='bilinear')\n",
|
|||
|
"plt.axis('off')\n",
|
|||
|
"plt.title('Word Cloud dla Spam')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 33,
|
|||
|
"id": "d954e01a1d0b3a97",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAxoAAAGtCAYAAACREAK2AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOy9d5xdZ33n/z793H6n96JebUtyk+UKmGrHlEAghABLSMgSAimbbIBsyIbsJkvKhg2QAD8SCAESSgIBG4zBTZZl2ZKs3kea0fRy5/Z7T39+f9zRSFczI42ksTHhvv3yS9I59zznOf35PN8mCSEENWrUqFGjRo0aNWrUqLGEyD/pDtSoUaNGjRo1atSoUeM/HzWhUaNGjRo1atSoUaNGjSWnJjRq1KhRo0aNGjVq1Kix5NSERo0aNWrUqFGjRo0aNZacmtCoUaNGjRo1atSoUaPGklMTGjVq1KhRo0aNGjVq1FhyakKjRo0aNWrUqFGjRo0aS05NaNSoUaNGjRo1atSoUWPJqQmNGjVq1KhRo0aNGjVqLDk1oVGjRo0al+Hxxx9HkiQef/zxJW23t7eXd7/73Uva5mK55557uOeee656e0mS+OM//uPZf3/xi19EkiT6+/uvuW81atSoUeM/BzWhUaNGjZcEX//615EkiX//93+fs+6GG25AkiQee+yxOeu6u7vZtm3bi9HFRdPX18f73vc+li9fjmmaxONxbr/9dj75yU9SLpd/0t37qeCccDFNk+Hh4Tnr77nnHjZu3HhVbX/mM5/hi1/84qJ/XygU+NjHPsbGjRuJRCI0NDSwadMmPvShDzEyMnJVfahRo0aNnwVqQqNGjRovCe644w4AnnrqqarluVyOQ4cOoaoqO3bsqFo3ODjI4ODg7LYvBR588EGuu+46vv71r/NzP/dz/O3f/i1/9md/Rnd3N7/3e7/Hhz70oZ90F3+qsG2bP//zP1/SNq9EaLiuy1133cVf/MVfcOedd/LXf/3XfOQjH2HLli189atf5cSJE0vatxo1atT4z4T6k+5AjRo1agC0t7ezbNmyOUJj586dCCF4y1veMmfduX9fq9AQQmBZFqFQ6JraOXPmDG9729vo6enh0Ucfpa2tbXbdb/zGb3Dq1CkefPDBa9rHzxqbNm3i85//PB/+8Idpb29/0ff/7W9/m+eff56vfOUrvP3tb69aZ1kWjuO86H2qUaNGjZ8WahaNGjVqvGS44447eP7556vci3bs2MGGDRt47WtfyzPPPEMQBFXrJEni9ttvB8DzPD7+8Y+zYsUKDMOgt7eXj3zkI9i2XbWf3t5e7r//fh5++GFuuukmQqEQn/3sZwEYGhriDW94A5FIhObmZn77t397zvYL8YlPfIJCocAXvvCFKpFxjpUrV17WonH69Gne8pa3UF9fTzgcZuvWrXPEyULxEAvFknzuc59jxYoVhEIhbrnlFrZv376o44GKReG3f/u3aWpqIhaL8cADDzA0NLSobb/zne9w33330d7ejmEYrFixgo9//OP4vr/o/X/kIx/B9/1FWTUWc/17e3s5fPgwTzzxBJIkIUnSJWNV+vr6AGbvsQs55xZ3jne/+91Eo1FOnz7Nq1/9aiKRCO3t7fzJn/wJQoiqbf/yL/+Sbdu20dDQQCgU4sYbb+Sb3/zmnH1IksQHPvABvvGNb7B+/XpCoRC33XYbBw8eBOCzn/0sK1euxDRN7rnnnlqMTI0aNV5S1IRGjRo1XjLccccduK7Lrl27Zpft2LGDbdu2sW3bNrLZLIcOHapat3btWhoaGgB473vfyx/90R+xZcsW/u///b/cfffd/Nmf/Rlve9vb5uzr+PHj/OIv/iKvfOUr+eQnP8mmTZsol8u84hWv4OGHH+YDH/gAH/3oR9m+fTu///u/v6j+f/e732X58uVXHTMyPj7Otm3bePjhh3n/+9/P//pf/wvLsnjggQfmjV1ZDF/4whd43/veR2trK5
/4xCe4/fbbeeCBBxgcHFzU9u9973v5m7/5G171qlfx53/+52iaxn333beobb/4xS8SjUb5nd/5HT75yU9y44038kd/9Ef8wR/8waL7v2zZMt75znfy+c9//rLxEIu5/n/zN39DZ2cna9eu5ctf/jJf/vKX+ehHP7pgmz09PQD80z/90xyxMB++7/Oa17yGlpYWPvGJT3DjjTfysY99jI997GNVv/vkJz/J5s2b+ZM/+RP+9//+36iqylve8pZ5LV7bt2/nd3/3d3nXu97FH//xH3P06FHuv/9+Pv3pT/P//t//4/3vfz+/93u/x86dO3nPe95z2T7WqFGjxouGqFGjRo2XCIcPHxaA+PjHPy6EEMJ1XRGJRMSXvvQlIYQQLS0t4tOf/rQQQohcLicURRG/+qu/KoQQYt++fQIQ733ve6va/G//7b8JQDz66KOzy3p6egQgfvCDH1T99m/+5m8EIL7+9a/PLisWi2LlypUCEI899tiCfc9mswIQr3/96xd9vD09PeJd73rX7L9/67d+SwBi+/bts8vy+bxYtmyZ6O3tFb7vCyGE+Md//EcBiDNnzlS199hjj1X103Ec0dzcLDZt2iRs25793ec+9zkBiLvvvvuS/Tt3Tt///vdXLX/7298uAPGxj31sdtl8fSqVSnPafN/73ifC4bCwLOuS+z7X3nPPPSf6+vqEqqrigx/84Oz6u+++W2zYsGFOXxdz/Tds2HDZY7/wGNasWSMA0dPTI9797neLL3zhC2J8fHzOb9/1rncJQPzmb/7m7LIgCMR9990ndF0Xk5OTVe1eiOM4YuPGjeLlL3951XJAGIZRdV4/+9nPCkC0traKXC43u/zDH/7wvPdFjRo1avykqFk0atSo8ZJh3bp1NDQ0zMZe7N+/n2KxOGsh2LZt22xA+M6dO/F9fzY+46GHHgLgd37nd6ra/N3f/V2AOTPFy5Yt49WvfnXVsoceeoi2tjbe/OY3zy4Lh8P82q/92mX7nsvlAIjFYos72Hl46KGHuOWWW6piTqLRKL/2a79Gf38/R44cuaL2du/ezcTEBL/+67+Oruuzy9/97neTSCQW1R+AD37wg1XLf+u3fmtR+78w5iWfzzM1NcWdd95JqVTi2LFji2oDYPny5fzyL/8yn/vc5xgdHb1kXxd7/RdLKBRi165d/N7v/R5QsdL8yq/8Cm1tbfzmb/7mvG51H/jAB2b/fs71yXEcfvSjH1W1e450Ok02m+XOO+9k7969c9p7xSteQW9v7+y/b731VgB+/ud/vup+O7f89OnTV3WsNWrUqLHU1IRGjRo1XjJIksS2bdtmYzF27NhBc3MzK1euBKqFxrk/zw3KBwYGkGV59rfnaG1tJZlMMjAwULV82bJlc/Y/MDDAypUrkSSpavmaNWsu2/dzvvr5fH4xhzovAwMD8+5r3bp1s+uvtD2AVatWVS3XNI3ly5cvantZllmxYkXV8sWcD4DDhw/zxje+kUQiQTwep6mpiXe84x0AZLPZRbVxjj/8wz/E87wFYzWu9PpfCYlEgk984hP09/fT39/PF77wBdasWcOnPvUpPv7xj1f9VpblOed29erVAFXxE9/73vfYunUrpmlSX19PU1MTf/d3fzfveenu7p7TH4Curq55l6fT6as70Bo1atRYYmpCo0aNGi8p7rjjDrLZLAcPHpyNzzjHtm3bGBgYYHh4mKeeeor29vY5g7qLRcJCXGuGqYuJx+O0t7dXxZC8UCx0jFcSZP1Ck8lkuPvuu9m/fz9/8id/wne/+10eeeQR/s//+T8AVUH9i2H58uW84x3vuKRVAxZ//a+Wnp4e3vOe97Bjxw6SySRf+cpXrriN7du388ADD2CaJp/5zGd46KGHeOSRR3j7298+bxyIoijztrPQ8vnaqFGjRo2fBDWhUaNGjZcUF9bT2LFjR1W2nxtvvBHDMHj88cfZtWtX1bqenh6CIODkyZNV7Y2Pj5PJZGaDei9FT08PfX19cw
Zqx48fX1Tf77//fvr6+ti5c+eifj/f/ufb1zk3o3PHUFdXB1QG8xdy8az9ud9ffE5c1+XMmTOL6k8QBLOZl86xmPP
|
|||
|
"text/plain": [
|
|||
|
"<Figure size 1000x600 with 1 Axes>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"plt.figure(figsize=(10, 6))\n",
|
|||
|
"wordcloud_not_spam = WordCloud(width=800, height=400).generate(not_spam_words)\n",
|
|||
|
"plt.imshow(wordcloud_not_spam, interpolation='bilinear')\n",
|
|||
|
"plt.axis('off')\n",
|
|||
|
"plt.title('Word Cloud dla Not Spam')\n",
|
|||
|
"plt.show()"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 34,
|
|||
|
"id": "743000c7d99b8a85",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Budowa modelu klasyfikacyjnego"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 35,
|
|||
|
"id": "7b3ba8e5b035cdc0",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Zamiana tekstu na wektory\n",
|
|||
|
"vectorizer = CountVectorizer()\n",
|
|||
|
"X = vectorizer.fit_transform(data['processed_message'])\n",
|
|||
|
"y = data['Label']"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 36,
|
|||
|
"id": "5d66dcf506f4f399",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Podział na zbiór treningowy i testowy\n",
|
|||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 37,
|
|||
|
"id": "b3c2a6673c718301",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style>#sk-container-id-1 {color: black;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label 
{background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-r
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"MultinomialNB()"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 37,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Trenowanie modelu Naiwnego Bayesa\n",
|
|||
|
"model_NB = MultinomialNB()\n",
|
|||
|
"model_NB.fit(X_train, y_train)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 38,
|
|||
|
"id": "82f18edc9161422a",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Predykcja i ocena Naiwny Bayes\n",
|
|||
|
"y_pred_NB = model_NB.predict(X_test)\n",
|
|||
|
"accuracy_NB = accuracy_score(y_test, y_pred_NB)\n",
|
|||
|
"classification_rep_NB = classification_report(y_test, y_pred_NB)\n",
|
|||
|
"confusion_matrix_NB = confusion_matrix(y_test, y_pred_NB)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 39,
|
|||
|
"id": "a629b6b89d5cdf34",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"0.9536295887386144"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 39,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"accuracy_NB"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 40,
|
|||
|
"id": "53c0cf3dc8aa02bc",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" precision recall f1-score support\n",
|
|||
|
"\n",
|
|||
|
" 0 0.98 0.95 0.96 2229\n",
|
|||
|
" 1 0.92 0.96 0.94 1394\n",
|
|||
|
"\n",
|
|||
|
" accuracy 0.95 3623\n",
|
|||
|
" macro avg 0.95 0.96 0.95 3623\n",
|
|||
|
"weighted avg 0.95 0.95 0.95 3623\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(classification_rep_NB)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 41,
|
|||
|
"id": "9b915d02828de60",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"[[2110 119]\n",
|
|||
|
" [ 49 1345]]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(confusion_matrix_NB)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 42,
|
|||
|
"id": "160da18f95c142a0",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Trening Drzewa Decyzyjnego (DT)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 43,
|
|||
|
"id": "8720ed4fd0ed5c72",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style>#sk-container-id-2 {color: black;}#sk-container-id-2 pre{padding: 0;}#sk-container-id-2 div.sk-toggleable {background-color: white;}#sk-container-id-2 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-2 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-2 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-2 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-2 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-2 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-2 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-2 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-2 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-2 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-2 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-2 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-2 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-2 div.sk-label:hover label.sk-toggleable__label 
{background-color: #d4ebff;}#sk-container-id-2 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-2 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-2 div.sk-item {position: relative;z-index: 1;}#sk-container-id-2 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-2 div.sk-item::before, #sk-container-id-2 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-2 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-2 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-2 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-2 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-2 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-2 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-2 div.sk-label-container {text-align: center;}#sk-container-id-2 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-2 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-2\" class=\"sk-top-container\"><div class=\"sk-text-r
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"DecisionTreeClassifier()"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 43,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Parametry domyślne\n",
|
|||
|
"model_DT = DecisionTreeClassifier(criterion= 'gini',\n",
|
|||
|
" max_depth= None,\n",
|
|||
|
" min_samples_leaf= 1,\n",
|
|||
|
" min_samples_split= 2,\n",
|
|||
|
" splitter= 'best')\n",
|
|||
|
"model_DT.fit(X_train, y_train)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 44,
|
|||
|
"id": "7aee079d59bdd4eb",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Predykcja i ocena DT\n",
|
|||
|
"y_pred_DT = model_DT.predict(X_test)\n",
|
|||
|
"accuracy_DT = accuracy_score(y_test, y_pred_DT)\n",
|
|||
|
"classification_rep_DT = classification_report(y_test, y_pred_DT)\n",
|
|||
|
"confusion_matrix_DT = confusion_matrix(y_test, y_pred_DT)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 45,
|
|||
|
"id": "57ac5a3ffe724fd5",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"0.9354126414573558"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 45,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"accuracy_DT"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 46,
|
|||
|
"id": "ed8955dc5d5cdeaf",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" precision recall f1-score support\n",
|
|||
|
"\n",
|
|||
|
" 0 0.95 0.94 0.95 2229\n",
|
|||
|
" 1 0.91 0.93 0.92 1394\n",
|
|||
|
"\n",
|
|||
|
" accuracy 0.94 3623\n",
|
|||
|
" macro avg 0.93 0.93 0.93 3623\n",
|
|||
|
"weighted avg 0.94 0.94 0.94 3623\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(classification_rep_DT)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 47,
|
|||
|
"id": "3ebfee20eb06e8cc",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"[[2098 131]\n",
|
|||
|
" [ 103 1291]]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(confusion_matrix_DT)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 48,
|
|||
|
"id": "85d3dc4e44a2a4b3",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Las losowy"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 49,
|
|||
|
"id": "6f454235f54aa9cc",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style>#sk-container-id-3 {color: black;}#sk-container-id-3 pre{padding: 0;}#sk-container-id-3 div.sk-toggleable {background-color: white;}#sk-container-id-3 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-3 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-3 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-3 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-3 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-3 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-3 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-3 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-3 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-3 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-3 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-3 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-3 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-3 div.sk-label:hover label.sk-toggleable__label 
{background-color: #d4ebff;}#sk-container-id-3 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-3 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-3 div.sk-item {position: relative;z-index: 1;}#sk-container-id-3 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-3 div.sk-item::before, #sk-container-id-3 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-3 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-3 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-3 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-3 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-3 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-3 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-3 div.sk-label-container {text-align: center;}#sk-container-id-3 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-3 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-3\" class=\"sk-top-container\"><div class=\"sk-text-r
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"RandomForestClassifier(random_state=123)"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 49,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"model_RF = RandomForestClassifier(n_estimators= 100,\n",
|
|||
|
" bootstrap= True,\n",
|
|||
|
" ccp_alpha= 0.0,\n",
|
|||
|
" criterion= 'gini',\n",
|
|||
|
" max_depth= None,\n",
|
|||
|
" min_samples_leaf= 1,\n",
|
|||
|
" min_samples_split= 2,\n",
|
|||
|
" random_state=123)\n",
|
|||
|
"model_RF.fit(X_train, y_train)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 50,
|
|||
|
"id": "23d68d066dc47f9",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Predykcja i ocena RF\n",
|
|||
|
"y_pred_RF = model_RF.predict(X_test)\n",
|
|||
|
"accuracy_RF = accuracy_score(y_test, y_pred_RF)\n",
|
|||
|
"classification_rep_RF = classification_report(y_test, y_pred_RF)\n",
|
|||
|
"confusion_matrix_RF = confusion_matrix(y_test, y_pred_RF)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 51,
|
|||
|
"id": "55789560bb43f9b8",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"0.9770908087220536"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 51,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"accuracy_RF"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 52,
|
|||
|
"id": "d15d57c467b94bad",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" precision recall f1-score support\n",
|
|||
|
"\n",
|
|||
|
" 0 0.98 0.99 0.98 2229\n",
|
|||
|
" 1 0.98 0.96 0.97 1394\n",
|
|||
|
"\n",
|
|||
|
" accuracy 0.98 3623\n",
|
|||
|
" macro avg 0.98 0.97 0.98 3623\n",
|
|||
|
"weighted avg 0.98 0.98 0.98 3623\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(classification_rep_RF)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 53,
|
|||
|
"id": "477ea9a19dbe7389",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"[[2201 28]\n",
|
|||
|
" [ 55 1339]]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(confusion_matrix_RF)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 54,
|
|||
|
"id": "9c3308c811b9d014",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
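|
"# Najlepszym modelem okazał się las losowy: ma najwyższą dokładność (0.977; NB 0.954, DT 0.935) i najrzadziej oznacza klasę 0 (zakładamy: nie-spam) jako spam (28 razy; NB 119 razy), a lepiej sklasyfikować spam jako wiadomość nie będącą spamem niż odwrotnie.\n",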
|
|||
|
"# Dlatego wybieramy RF, a nie NB."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 55,
|
|||
|
"id": "81f08fa14ba4daf5",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Teraz dokonamy treningu na pełnych danych i zapiszemy model celem wykorzystania na danych rzeczywistych w późniejszej \n",
|
|||
|
"# aplikacji."
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 56,
|
|||
|
"id": "7f580653f470d7af",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"model_RF_full = RandomForestClassifier(n_estimators= 100,\n",
|
|||
|
" bootstrap= True,\n",
|
|||
|
" ccp_alpha= 0.0,\n",
|
|||
|
" criterion= 'gini',\n",
|
|||
|
" max_depth= None,\n",
|
|||
|
" min_samples_leaf= 1,\n",
|
|||
|
" min_samples_split= 2,\n",
|
|||
|
" random_state=123)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 57,
|
|||
|
"id": "f75fc9a4d4746e5a",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style>#sk-container-id-4 {color: black;}#sk-container-id-4 pre{padding: 0;}#sk-container-id-4 div.sk-toggleable {background-color: white;}#sk-container-id-4 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-4 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-4 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-4 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-4 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-4 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-4 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-4 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-4 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-4 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-4 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-4 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-4 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-4 div.sk-label:hover label.sk-toggleable__label 
{background-color: #d4ebff;}#sk-container-id-4 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-4 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-4 div.sk-item {position: relative;z-index: 1;}#sk-container-id-4 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-4 div.sk-item::before, #sk-container-id-4 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-4 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-4 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-4 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-4 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-4 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-4 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-4 div.sk-label-container {text-align: center;}#sk-container-id-4 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-4 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-4\" class=\"sk-top-container\"><div class=\"sk-text-r
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"RandomForestClassifier(random_state=123)"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 57,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"model_RF_full.fit(X, y)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 58,
|
|||
|
"id": "3d77bed327ac2fa1",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
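|
"# Predykcja i ocena RF na pełnych danych (tych samych, na których model był trenowany, więc wynik nie mierzy generalizacji)\n",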
|
|||
|
"y_pred_RF_full = model_RF_full.predict(X)\n",
|
|||
|
"accuracy_RF_full = accuracy_score(y, y_pred_RF_full)\n",
|
|||
|
"classification_rep_RF_full = classification_report(y, y_pred_RF_full)\n",
|
|||
|
"confusion_matrix_RF_full = confusion_matrix(y, y_pred_RF_full)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 59,
|
|||
|
"id": "a76a53da77128562",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"1.0"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 59,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"accuracy_RF_full"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 60,
|
|||
|
"id": "9a66104fd13572f8",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
" precision recall f1-score support\n",
|
|||
|
"\n",
|
|||
|
" 0 1.00 1.00 1.00 11124\n",
|
|||
|
" 1 1.00 1.00 1.00 6990\n",
|
|||
|
"\n",
|
|||
|
" accuracy 1.00 18114\n",
|
|||
|
" macro avg 1.00 1.00 1.00 18114\n",
|
|||
|
"weighted avg 1.00 1.00 1.00 18114\n",
|
|||
|
"\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(classification_rep_RF_full)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 61,
|
|||
|
"id": "823635f2315ecf05",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"[[11124 0]\n",
|
|||
|
" [ 0 6990]]\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(confusion_matrix_RF_full)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 62,
|
|||
|
"id": "d0136f7b9f6344c4",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<style>#sk-container-id-5 {color: black;}#sk-container-id-5 pre{padding: 0;}#sk-container-id-5 div.sk-toggleable {background-color: white;}#sk-container-id-5 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-5 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-5 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-5 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-5 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-5 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-5 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-5 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-5 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-5 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-5 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-5 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-5 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-5 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-5 div.sk-label:hover label.sk-toggleable__label 
{background-color: #d4ebff;}#sk-container-id-5 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-5 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-5 div.sk-item {position: relative;z-index: 1;}#sk-container-id-5 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-5 div.sk-item::before, #sk-container-id-5 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-5 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-5 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-5 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-5 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-5 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-5 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-5 div.sk-label-container {text-align: center;}#sk-container-id-5 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-5 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-5\" class=\"sk-top-container\"><div class=\"sk-text-r
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"RandomForestClassifier(random_state=123)"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 62,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"model_RF_full"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 63,
|
|||
|
"id": "e02e9031d10617f6",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"['vectorizer.pkl']"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 63,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"# Zapisz model i vectorizer\n",
|
|||
|
"joblib.dump(model_RF_full, 'spam_classifier_model.pkl')\n",
|
|||
|
"joblib.dump(vectorizer, 'vectorizer.pkl')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 64,
|
|||
|
"id": "2ac5943e18571301",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Uwaga: wersje scikit-learn i joblib w środowisku aplikacji muszą być zgodne z użytymi tutaj, inaczej wczytanie zapisanych plików .pkl może się nie powieść"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 65,
|
|||
|
"id": "a238743e07978f4",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"scikit-fuzzy==0.4.2\n",
|
|||
|
"scikit-learn==1.3.2\n",
|
|||
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"pip freeze | findstr scikit"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 66,
|
|||
|
"id": "a64099b8c61a884",
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Jak instalować?"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 67,
|
|||
|
"id": "d99c1dbe",
|
|||
|
"metadata": {
|
|||
|
"ExecuteTime": {
|
|||
|
"end_time": "2024-06-05T16:57:22.800834Z",
|
|||
|
"start_time": "2024-06-05T16:57:22.798725Z"
|
|||
|
}
|
|||
|
},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Np. tak\n",
|
|||
|
"# pip install scikit-learn==1.3.2"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3 (ipykernel)",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.12.3"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 5
|
|||
|
}
|