756 lines
20 KiB
Plaintext
756 lines
20 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"metadata": {
|
||
|
"jupyter": {
|
||
|
"is_executing": true
|
||
|
},
|
||
|
"ExecuteTime": {
|
||
|
"start_time": "2024-06-05T20:03:23.481431Z"
|
||
|
}
|
||
|
},
|
||
|
"cell_type": "code",
|
||
|
"source": [
|
||
|
"%pip install pandas\n",
|
||
|
"%pip install matplotlib\n",
|
||
|
"%pip install nltk\n",
|
||
|
"%pip install wordcloud\n",
|
||
|
"%pip install scikit-learn==1.3.2\n",
|
||
|
"%pip install scikit-fuzzy==0.4.2\n",
|
||
|
"# Import pakietów\n",
|
||
|
"import nltk\n",
|
||
"nltk.download('punkt')\n",
"# Newer NLTK releases load the Punkt tokenizer models from the 'punkt_tab' resource,\n",
"# so fetch it as well; otherwise word_tokenize raises LookupError on a fresh install.\n",
"nltk.download('punkt_tab')\n",
"nltk.download('stopwords')\n",
|
||
|
"import pandas as pd\n",
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"import re\n",
|
||
|
"import string\n",
|
||
|
"from wordcloud import WordCloud\n",
|
||
|
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||
|
"from sklearn.model_selection import train_test_split\n",
|
||
|
"from sklearn.naive_bayes import MultinomialNB\n",
|
||
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
||
|
"from sklearn.tree import DecisionTreeClassifier\n",
|
||
|
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
|
||
|
"from nltk.corpus import stopwords\n",
|
||
|
"from nltk.stem import PorterStemmer\n",
|
||
|
"from nltk.tokenize import word_tokenize\n",
|
||
|
"import joblib\n",
|
||
|
"import pickle"
|
||
|
],
|
||
|
"id": "b313cab7d5cc49c0",
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Requirement already satisfied: pandas in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (2.2.2)\n",
|
||
|
"Requirement already satisfied: numpy>=1.26.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (1.26.4)\n",
|
||
|
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from pandas) (2.9.0.post0)\n",
|
||
|
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n",
|
||
|
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n",
|
||
|
"Requirement already satisfied: six>=1.5 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
|
||
|
"Note: you may need to restart the kernel to use updated packages.\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"execution_count": null
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Załaduj dane\n",
|
||
|
"data_path = \"joined_data.csv\"\n",
|
||
|
"data = pd.read_csv(data_path)"
|
||
|
],
|
||
|
"id": "768266dbb79c5e9d"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "print(data.head())",
|
||
|
"id": "ee08266d5c30627b"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "print(data.info())",
|
||
|
"id": "1798f605e33fe5e5"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "data",
|
||
|
"id": "b4f43d913b92485b"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "# Usuwamy NaN",
|
||
|
"id": "e3bf0f04a2be4e1a"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "data.dropna(inplace=True)",
|
||
|
"id": "71a6bbebdb0dccd4"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "# Usuwamy puste wiadomości i wiadomości zawierające jedynie \"\\n\"",
|
||
|
"id": "b7fca25d67381cdd"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "data = data[data['Body'] != '\\n']",
|
||
|
"id": "72d84bf6c1e7023a"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "data = data[data['Body'] != 'empty']",
|
||
|
"id": "7c94c4dca6c4cdae"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "data.reset_index(drop=True, inplace=True)",
|
||
|
"id": "7e6fd3f8014498f3"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "data",
|
||
|
"id": "a0c33f82a936c59"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Sprawdźmy rozkład targetów\n",
|
||
|
"print(data['Label'].value_counts())"
|
||
|
],
|
||
|
"id": "19af5936d0cfeba2"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "# Analiza długości wiadomości",
|
||
|
"id": "96c861e2655312cb"
|
||
|
},
|
||
|
{
 "metadata": {},
 "cell_type": "code",
 "outputs": [],
 "execution_count": null,
 "source": [
  "def get_len(row):\n",
  "    \"\"\"Return the character count of a message body.\n",
  "\n",
  "    Sized values (strings) are measured with len(). Values without a length,\n",
  "    such as numbers, are measured by their str() form so the result is always\n",
  "    an int rather than the raw cell value.\n",
  "    \"\"\"\n",
  "    try:\n",
  "        return len(row)\n",
  "    except TypeError:\n",
  "        # Only 'object has no len()' is expected here; a bare except would also\n",
  "        # hide KeyboardInterrupt and real bugs.\n",
  "        return len(str(row))"
 ],
 "id": "e1ec1ed8aa7c856d"
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "data['message_length'] = data['Body'].apply(get_len)",
|
||
|
"id": "63c023f34d234f3e"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "data.sort_values(by='message_length')",
|
||
|
"id": "d4fd0e2dcc2bfee9"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "# Jedna wiadomość jest bardzo długa 17085626",
|
||
|
"id": "e62112260ebc17f0"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "data['message_length'].value_counts()",
|
||
|
"id": "7c369131e3c91ce3"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Histogram długości wiadomości dla każdej kategorii - ograniczamy do 200.000 znaków celem wyświetlenia histogramów\n",
|
||
|
"hist_data = data[data['message_length'] < 200000]\n",
|
||
|
"plt.figure(figsize=(10, 6))\n",
|
||
|
"hist_data[hist_data['Label'] == 0]['message_length'].hist(bins=100, alpha=0.6, label='Not Spam')\n",
|
||
|
"hist_data[hist_data['Label'] == 1]['message_length'].hist(bins=100, alpha=0.6, label='Spam')\n",
|
||
|
"plt.legend()\n",
|
||
|
"plt.xlabel('Długość wiadomości')\n",
|
||
|
"plt.ylabel('Liczba wiadomości')\n",
|
||
|
"plt.title('Rozkład długości wiadomości')\n",
|
||
|
"plt.show()"
|
||
|
],
|
||
|
"id": "b6b509692fd7c541"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "# Ograniczamy jeszcze bardziej ",
|
||
|
"id": "7182d6a1d6600c2"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Histogram długości wiadomości dla każdej kategorii - ograniczamy do 10000 znaków celem wyświetlenia histogramów\n",
|
||
|
"hist_data = data[data['message_length'] < 10000]\n",
|
||
|
"plt.figure(figsize=(10, 6))\n",
|
||
|
"hist_data[hist_data['Label'] == 0]['message_length'].hist(bins=100, alpha=0.6, label='Not Spam')\n",
|
||
|
"hist_data[hist_data['Label'] == 1]['message_length'].hist(bins=100, alpha=0.6, label='Spam')\n",
|
||
|
"plt.legend()\n",
|
||
|
"plt.xlabel('Długość wiadomości')\n",
|
||
|
"plt.ylabel('Liczba wiadomości')\n",
|
||
|
"plt.title('Rozkład długości wiadomości')\n",
|
||
|
"plt.show()"
|
||
|
],
|
||
|
"id": "962efe0bd652ecdb"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
"source": "# Można zauważyć, że trudno odróżnić wiadomości po samej długości. W tym celu należy skorzystać z bardziej zaawansowanych metod.",
|
||
|
"id": "eaa483deb9c81942"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "# Przetwarzanie tekstu",
|
||
|
"id": "6e0ee5fccf308cd1"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "data",
|
||
|
"id": "50c0131db25859cb"
|
||
|
},
|
||
|
{
 "metadata": {},
 "cell_type": "code",
 "outputs": [],
 "execution_count": null,
 "source": [
  "stop_words = set(stopwords.words('english'))\n",
  "ps = PorterStemmer()\n",
  "# Deletion table for ASCII punctuation, built once instead of on every call.\n",
  "_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)\n",
  "\n",
  "\n",
  "def preprocess_text(text):\n",
  "    \"\"\"Turn one message into a space-separated string of stemmed tokens.\n",
  "\n",
  "    Steps: remove digit runs, remove ASCII punctuation, tokenize with NLTK,\n",
  "    drop English stopwords (compared case-insensitively) and stem the rest.\n",
  "\n",
  "    Args:\n",
  "        text: Message body. Non-string values are converted with str() so a\n",
  "            stray non-text cell does not abort a whole DataFrame.apply run.\n",
  "\n",
  "    Returns:\n",
  "        The remaining stemmed tokens joined by single spaces.\n",
  "    \"\"\"\n",
  "    if not isinstance(text, str):\n",
  "        text = str(text)\n",
  "    # Strip digit runs, then ASCII punctuation, before tokenizing.\n",
  "    text = re.sub(r'\\d+', '', text)\n",
  "    text = text.translate(_PUNCTUATION_TABLE)\n",
  "    words = word_tokenize(text)\n",
  "    # Filter on the unstemmed word: the stopword list holds unstemmed forms.\n",
  "    words = [ps.stem(word) for word in words if word.lower() not in stop_words]\n",
  "    return \" \".join(words)"
 ],
 "id": "c32c52a7b2575a3b"
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "# Ten proces jest czasochłonny",
|
||
|
"id": "5953cb974349cb33"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "data['processed_message'] = data['Body'].apply(preprocess_text)",
|
||
|
"id": "89b8cdeaa9da5c2d"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "data.head()",
|
||
|
"id": "ccce395ac94c39a1"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "data['processed_message']",
|
||
|
"id": "7ce382be7bcdff2c"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Analiza słów za pomocą WordCloud\n",
|
||
|
"spam_words = ' '.join(list(data[data['Label'] == 1]['processed_message']))\n",
|
||
|
"not_spam_words = ' '.join(list(data[data['Label'] == 0]['processed_message']))"
|
||
|
],
|
||
|
"id": "dc456d793b576f7"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"plt.figure(figsize=(10, 6))\n",
|
||
|
"wordcloud_spam = WordCloud(width=800, height=400).generate(spam_words)\n",
|
||
|
"plt.imshow(wordcloud_spam, interpolation='bilinear')\n",
|
||
|
"plt.axis('off')\n",
|
||
|
"plt.title('Word Cloud dla Spam')\n",
|
||
|
"plt.show()"
|
||
|
],
|
||
|
"id": "c9d7d9c9f4ae91ed"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"plt.figure(figsize=(10, 6))\n",
|
||
|
"wordcloud_not_spam = WordCloud(width=800, height=400).generate(not_spam_words)\n",
|
||
|
"plt.imshow(wordcloud_not_spam, interpolation='bilinear')\n",
|
||
|
"plt.axis('off')\n",
|
||
|
"plt.title('Word Cloud dla Not Spam')\n",
|
||
|
"plt.show()"
|
||
|
],
|
||
|
"id": "d954e01a1d0b3a97"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "# Budowa modelu klasyfikacyjnego",
|
||
|
"id": "743000c7d99b8a85"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Zamiana tekstu na wektory\n",
|
||
|
"vectorizer = CountVectorizer()\n",
|
||
|
"X = vectorizer.fit_transform(data['processed_message'])\n",
|
||
|
"y = data['Label']"
|
||
|
],
|
||
|
"id": "7b3ba8e5b035cdc0"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Podział na zbiór treningowy i testowy\n",
|
||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
|
||
|
],
|
||
|
"id": "5d66dcf506f4f399"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Trenowanie modelu Naiwnego Bayesa\n",
|
||
|
"model_NB = MultinomialNB()\n",
|
||
|
"model_NB.fit(X_train, y_train)"
|
||
|
],
|
||
|
"id": "b3c2a6673c718301"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Predykcja i ocena Naiwny Bayes\n",
|
||
|
"y_pred_NB = model_NB.predict(X_test)\n",
|
||
|
"accuracy_NB = accuracy_score(y_test, y_pred_NB)\n",
|
||
|
"classification_rep_NB = classification_report(y_test, y_pred_NB)\n",
|
||
|
"confusion_matrix_NB = confusion_matrix(y_test, y_pred_NB)"
|
||
|
],
|
||
|
"id": "82f18edc9161422a"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "accuracy_NB",
|
||
|
"id": "a629b6b89d5cdf34"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "print(classification_rep_NB)",
|
||
|
"id": "53c0cf3dc8aa02bc"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "print(confusion_matrix_NB)",
|
||
|
"id": "9b915d02828de60"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "# Trening Drzewa Decyzyjnego (DT)",
|
||
|
"id": "160da18f95c142a0"
|
||
|
},
|
||
|
{
 "metadata": {},
 "cell_type": "code",
 "outputs": [],
 "execution_count": null,
 "source": [
  "# Default hyper-parameters, spelled out explicitly.\n",
  "# random_state is fixed (same seed as the random forest) because scikit-learn\n",
  "# permutes features at every split even with splitter='best', so an unseeded\n",
  "# tree can differ between runs.\n",
  "model_DT = DecisionTreeClassifier(criterion='gini',\n",
  "                                  max_depth=None,\n",
  "                                  min_samples_leaf=1,\n",
  "                                  min_samples_split=2,\n",
  "                                  splitter='best',\n",
  "                                  random_state=123)\n",
  "model_DT.fit(X_train, y_train)"
 ],
 "id": "8720ed4fd0ed5c72"
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Predykcja i ocena DT\n",
|
||
|
"y_pred_DT = model_DT.predict(X_test)\n",
|
||
|
"accuracy_DT = accuracy_score(y_test, y_pred_DT)\n",
|
||
|
"classification_rep_DT = classification_report(y_test, y_pred_DT)\n",
|
||
|
"confusion_matrix_DT = confusion_matrix(y_test, y_pred_DT)"
|
||
|
],
|
||
|
"id": "7aee079d59bdd4eb"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "accuracy_DT",
|
||
|
"id": "57ac5a3ffe724fd5"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "print(classification_rep_DT)",
|
||
|
"id": "ed8955dc5d5cdeaf"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "print(confusion_matrix_DT)",
|
||
|
"id": "3ebfee20eb06e8cc"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "# Las losowy",
|
||
|
"id": "85d3dc4e44a2a4b3"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"model_RF = RandomForestClassifier(n_estimators= 100,\n",
|
||
|
" bootstrap= True,\n",
|
||
|
" ccp_alpha= 0.0,\n",
|
||
|
" criterion= 'gini',\n",
|
||
|
" max_depth= None,\n",
|
||
|
" min_samples_leaf= 1,\n",
|
||
|
" min_samples_split= 2,\n",
|
||
|
" random_state=123)\n",
|
||
|
"model_RF.fit(X_train, y_train)"
|
||
|
],
|
||
|
"id": "6f454235f54aa9cc"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Predykcja i ocena RF\n",
|
||
|
"y_pred_RF = model_RF.predict(X_test)\n",
|
||
|
"accuracy_RF = accuracy_score(y_test, y_pred_RF)\n",
|
||
|
"classification_rep_RF = classification_report(y_test, y_pred_RF)\n",
|
||
|
"confusion_matrix_RF = confusion_matrix(y_test, y_pred_RF)"
|
||
|
],
|
||
|
"id": "23d68d066dc47f9"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "accuracy_RF",
|
||
|
"id": "55789560bb43f9b8"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "print(classification_rep_RF)",
|
||
|
"id": "d15d57c467b94bad"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "print(confusion_matrix_RF)",
|
||
|
"id": "477ea9a19dbe7389"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Najlepszym modelem okazał się Las losowy - lepiej sklasyfikować spam jako wiadomość nie będącą spamem niż odwrotnie. \n",
|
||
|
"# Dlatego wybieramy RF, a nie NB."
|
||
|
],
|
||
|
"id": "9c3308c811b9d014"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Teraz dokonamy treningu na pełnych danych i zapiszemy model celem wykorzystania na danych rzeczywistych w późniejszej \n",
|
||
|
"# aplikacji."
|
||
|
],
|
||
|
"id": "81f08fa14ba4daf5"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"model_RF_full = RandomForestClassifier(n_estimators= 100,\n",
|
||
|
" bootstrap= True,\n",
|
||
|
" ccp_alpha= 0.0,\n",
|
||
|
" criterion= 'gini',\n",
|
||
|
" max_depth= None,\n",
|
||
|
" min_samples_leaf= 1,\n",
|
||
|
" min_samples_split= 2,\n",
|
||
|
" random_state=123)"
|
||
|
],
|
||
|
"id": "7f580653f470d7af"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "model_RF_full.fit(X, y)",
|
||
|
"id": "f75fc9a4d4746e5a"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
"# Predykcja i ocena RF na danych treningowych - kontrola poprawności; wynik jest zawyżony i nie mierzy generalizacji\n",
|
||
|
"y_pred_RF_full = model_RF_full.predict(X)\n",
|
||
|
"accuracy_RF_full = accuracy_score(y, y_pred_RF_full)\n",
|
||
|
"classification_rep_RF_full = classification_report(y, y_pred_RF_full)\n",
|
||
|
"confusion_matrix_RF_full = confusion_matrix(y, y_pred_RF_full)"
|
||
|
],
|
||
|
"id": "3d77bed327ac2fa1"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "accuracy_RF_full",
|
||
|
"id": "a76a53da77128562"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "print(classification_rep_RF_full)",
|
||
|
"id": "9a66104fd13572f8"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "print(confusion_matrix_RF_full)",
|
||
|
"id": "823635f2315ecf05"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "model_RF_full",
|
||
|
"id": "d0136f7b9f6344c4"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": [
|
||
|
"# Zapisz model i vectorizer\n",
|
||
|
"joblib.dump(model_RF_full, 'spam_classifier_model.pkl')\n",
|
||
|
"joblib.dump(vectorizer, 'vectorizer.pkl')"
|
||
|
],
|
||
|
"id": "e02e9031d10617f6"
|
||
|
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "# Uwaga, ważna jest zgodność wersji scikita i joblib tutaj i w środowisku aplikacji",
|
||
|
"id": "2ac5943e18571301"
|
||
|
},
|
||
|
{
 "metadata": {},
 "cell_type": "code",
 "outputs": [],
 "execution_count": null,
 "source": [
  "# Report the installed versions that the saved model and vectorizer depend on.\n",
  "# Reads package metadata instead of the Windows-only `pip freeze | findstr scikit`\n",
  "# shell pipeline, and includes joblib, which must match in the application too.\n",
  "from importlib.metadata import version\n",
  "\n",
  "{package: version(package) for package in ('scikit-learn', 'scikit-fuzzy', 'joblib')}"
 ],
 "id": "a238743e07978f4"
},
|
||
|
{
|
||
|
"metadata": {},
|
||
|
"cell_type": "code",
|
||
|
"outputs": [],
|
||
|
"execution_count": null,
|
||
|
"source": "# Jak instalować?",
|
||
|
"id": "a64099b8c61a884"
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 140,
|
||
|
"id": "d99c1dbe",
|
||
|
"metadata": {
|
||
|
"ExecuteTime": {
|
||
|
"end_time": "2024-06-05T16:57:22.800834Z",
|
||
|
"start_time": "2024-06-05T16:57:22.798725Z"
|
||
|
}
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Np. tak\n",
|
||
|
"# pip install scikit-learn==1.3.2"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3 (ipykernel)",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": "3.12.3"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 5
|
||
|
}
|