{ "cells": [ { "metadata": { "jupyter": { "is_executing": true }, "ExecuteTime": { "start_time": "2024-06-05T20:03:23.481431Z" } }, "cell_type": "code", "source": [ "%pip install pandas\n", "%pip install matplotlib\n", "%pip install nltk\n", "%pip install wordcloud\n", "%pip install scikit-learn==1.3.2\n", "%pip install scikit-fuzzy==0.4.2\n", "# Import pakietów\n", "import nltk\n", "nltk.download('punkt')\n", "nltk.download('stopwords')\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import re\n", "import string\n", "from wordcloud import WordCloud\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", "from nltk.corpus import stopwords\n", "from nltk.stem import PorterStemmer\n", "from nltk.tokenize import word_tokenize\n", "import joblib\n", "import pickle" ], "id": "b313cab7d5cc49c0", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: pandas in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (2.2.2)\n", "Requirement already satisfied: numpy>=1.26.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (1.26.4)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from pandas) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n", "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n", "Requirement already 
satisfied: six>=1.5 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "execution_count": null },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Load the data\n",
    "data_path = \"joined_data.csv\"\n",
    "data = pd.read_csv(data_path)"
  ], "id": "768266dbb79c5e9d" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "data.head()", "id": "ee08266d5c30627b" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# info() prints its report itself and returns None, so it must not be wrapped in print()\n",
    "data.info()"
  ], "id": "1798f605e33fe5e5" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "data", "id": "b4f43d913b92485b" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# Drop rows that contain NaN", "id": "e3bf0f04a2be4e1a" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "data = data.dropna()", "id": "71a6bbebdb0dccd4" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# Remove messages whose Body is exactly \"\\n\" or the literal string 'empty'", "id": "b7fca25d67381cdd" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "data = data[data['Body'] != '\\n']", "id": "72d84bf6c1e7023a" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "data = data[data['Body'] != 'empty']", "id": "7c94c4dca6c4cdae" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Renumber the rows 0..n-1 after the filtering above\n",
    "data = data.reset_index(drop=True)"
  ], "id": "7e6fd3f8014498f3" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "data", "id": "a0c33f82a936c59" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count":
null, "source": [
    "# Check the distribution of the target labels\n",
    "print(data['Label'].value_counts())"
  ], "id": "19af5936d0cfeba2" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# Message length analysis", "id": "96c861e2655312cb" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "def get_len(row):\n",
    "    \"\"\"Return the length of one message body as an int.\n",
    "\n",
    "    Values that support len() are measured directly. Any other value is\n",
    "    measured through its string form, so a raw non-string value is never\n",
    "    passed through as if it were a length.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return len(row)\n",
    "    except TypeError:\n",
    "        # len() is undefined for this value (e.g. a number): measure its text form instead\n",
    "        return len(str(row))"
  ], "id": "e1ec1ed8aa7c856d" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "data['message_length'] = data['Body'].apply(get_len)", "id": "63c023f34d234f3e" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "data.sort_values(by='message_length')", "id": "d4fd0e2dcc2bfee9" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# One message is very long: message_length 17085626", "id": "e62112260ebc17f0" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "data['message_length'].value_counts()", "id": "7c369131e3c91ce3" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Histogram of message lengths per label. Only rows with message_length below a cut-off\n",
    "# are plotted so that the histograms stay readable.\n",
    "def plot_length_hist(max_length):\n",
    "    \"\"\"Overlay the message_length histograms of both labels for rows with message_length < max_length.\"\"\"\n",
    "    hist_data = data[data['message_length'] < max_length]\n",
    "    plt.figure(figsize=(10, 6))\n",
    "    hist_data[hist_data['Label'] == 0]['message_length'].hist(bins=100, alpha=0.6, label='Not Spam')\n",
    "    hist_data[hist_data['Label'] == 1]['message_length'].hist(bins=100, alpha=0.6, label='Spam')\n",
    "    plt.legend()\n",
    "    plt.xlabel('Długość wiadomości')\n",
    "    plt.ylabel('Liczba wiadomości')\n",
    "    plt.title('Rozkład długości wiadomości')\n",
    "    plt.show()\n",
    "\n",
    "# First pass: message_length below 200000\n",
    "plot_length_hist(200000)"
  ], "id": "b6b509692fd7c541" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# Narrow the range even further", "id": "7182d6a1d6600c2" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Same histogram, limited to message_length below 10000\n",
    "plot_length_hist(10000)"
  ], "id": "962efe0bd652ecdb" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# Length alone does not separate the two classes well, so more advanced methods are needed.", "id": "eaa483deb9c81942" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# Text preprocessing", "id": "6e0ee5fccf308cd1" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "data", "id": "50c0131db25859cb" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "stop_words = set(stopwords.words('english'))\n",
    "ps = PorterStemmer()\n",
    "# Built once: the punctuation-deleting table is identical for every message\n",
    "punctuation_table = str.maketrans('', '', string.punctuation)\n",
    "\n",
    "def preprocess_text(text):\n",
    "    \"\"\"Normalise one message before vectorisation.\n",
    "\n",
    "    Steps, in order: delete runs of digits, delete the characters in\n",
    "    string.punctuation, tokenize with word_tokenize, drop tokens whose\n",
    "    lowercase form is an English stop word, stem the rest with PorterStemmer.\n",
    "\n",
    "    Returns the remaining stems joined by single spaces.\n",
    "    \"\"\"\n",
    "    # Remove digits and punctuation, then tokenize\n",
    "    text = re.sub(r'\\d+', '', text)\n",
    "    text = text.translate(punctuation_table)\n",
    "    words = word_tokenize(text)\n",
    "    # Stop-word removal and stemming; the check uses the lowercase token\n",
    "    words = [ps.stem(word) for word in words if word.lower() not in stop_words]\n",
    "    return \" \".join(words)"
  ], "id": "c32c52a7b2575a3b" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# This step is time-consuming", "id": "5953cb974349cb33" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "data['processed_message'] = data['Body'].apply(preprocess_text)", "id": "89b8cdeaa9da5c2d" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "data.head()", "id": "ccce395ac94c39a1" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "data['processed_message']", "id": "7ce382be7bcdff2c" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Word analysis with WordCloud: one long string per label\n",
    "spam_words = ' '.join(data[data['Label'] == 1]['processed_message'])\n",
    "not_spam_words = ' '.join(data[data['Label'] == 0]['processed_message'])"
  ], "id": "dc456d793b576f7" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "plt.figure(figsize=(10, 6))\n",
    "wordcloud_spam = WordCloud(width=800, height=400).generate(spam_words)\n",
    "plt.imshow(wordcloud_spam, interpolation='bilinear')\n",
    "plt.axis('off')\n",
    "plt.title('Word Cloud dla Spam')\n",
    "plt.show()"
  ], "id": "c9d7d9c9f4ae91ed" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "plt.figure(figsize=(10, 6))\n",
    "wordcloud_not_spam = WordCloud(width=800, height=400).generate(not_spam_words)\n",
    "plt.imshow(wordcloud_not_spam, interpolation='bilinear')\n",
    "plt.axis('off')\n",
    "plt.title('Word Cloud dla Not Spam')\n",
    "plt.show()"
  ], "id": "d954e01a1d0b3a97" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# Building the classification model", "id": "743000c7d99b8a85" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Convert text to count vectors.\n",
    "# This vectorizer is fitted on ALL messages: X is what the final full-data model is trained on,\n",
    "# and this vectorizer object is the one saved to disk at the end of the notebook.\n",
    "vectorizer = CountVectorizer()\n",
    "X = vectorizer.fit_transform(data['processed_message'])\n",
    "y = data['Label']"
  ], "id": "7b3ba8e5b035cdc0" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Train/test split.\n",
    "# The raw text is split first and a separate vectorizer is fitted on the training part only,\n",
    "# so the vocabulary used during evaluation is not learned from the test messages.\n",
    "text_train, text_test, y_train, y_test = train_test_split(\n",
    "    data['processed_message'], y, test_size=0.2, random_state=42)\n",
    "eval_vectorizer = CountVectorizer()\n",
    "X_train = eval_vectorizer.fit_transform(text_train)\n",
    "X_test = eval_vectorizer.transform(text_test)"
  ], "id": "5d66dcf506f4f399" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Train the Naive Bayes model\n",
    "model_NB = MultinomialNB()\n",
    "model_NB.fit(X_train, y_train)"
  ], "id": "b3c2a6673c718301" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Prediction and evaluation: Naive Bayes\n",
    "y_pred_NB = model_NB.predict(X_test)\n",
    "accuracy_NB = accuracy_score(y_test, y_pred_NB)\n",
    "classification_rep_NB = classification_report(y_test, y_pred_NB)\n",
    "confusion_matrix_NB = confusion_matrix(y_test, y_pred_NB)"
  ], "id": "82f18edc9161422a" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "accuracy_NB", "id": "a629b6b89d5cdf34" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "print(classification_rep_NB)", "id": "53c0cf3dc8aa02bc" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "print(confusion_matrix_NB)", "id": "9b915d02828de60" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# Training the decision tree (DT)", "id": "160da18f95c142a0" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Default parameters, written out explicitly; random_state is fixed so the fitted tree is reproducible\n",
    "model_DT = DecisionTreeClassifier(criterion='gini',\n",
    "                                  max_depth=None,\n",
    "                                  min_samples_leaf=1,\n",
    "                                  min_samples_split=2,\n",
    "                                  splitter='best',\n",
    "                                  random_state=123)\n",
    "model_DT.fit(X_train, y_train)"
  ], "id": "8720ed4fd0ed5c72" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Prediction and evaluation: decision tree\n",
    "y_pred_DT = model_DT.predict(X_test)\n",
    "accuracy_DT = accuracy_score(y_test, y_pred_DT)\n",
    "classification_rep_DT = classification_report(y_test, y_pred_DT)\n",
    "confusion_matrix_DT = confusion_matrix(y_test, 
y_pred_DT)" ], "id": "7aee079d59bdd4eb" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "accuracy_DT", "id": "57ac5a3ffe724fd5" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "print(classification_rep_DT)", "id": "ed8955dc5d5cdeaf" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "print(confusion_matrix_DT)", "id": "3ebfee20eb06e8cc" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# Random forest", "id": "85d3dc4e44a2a4b3" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# One shared parameter set, so the forest evaluated here and the final forest trained on the\n",
    "# full data further down cannot drift apart.\n",
    "RF_PARAMS = dict(n_estimators=100,\n",
    "                 bootstrap=True,\n",
    "                 ccp_alpha=0.0,\n",
    "                 criterion='gini',\n",
    "                 max_depth=None,\n",
    "                 min_samples_leaf=1,\n",
    "                 min_samples_split=2,\n",
    "                 random_state=123)\n",
    "model_RF = RandomForestClassifier(**RF_PARAMS)\n",
    "model_RF.fit(X_train, y_train)"
  ], "id": "6f454235f54aa9cc" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Prediction and evaluation: random forest\n",
    "y_pred_RF = model_RF.predict(X_test)\n",
    "accuracy_RF = accuracy_score(y_test, y_pred_RF)\n",
    "classification_rep_RF = classification_report(y_test, y_pred_RF)\n",
    "confusion_matrix_RF = confusion_matrix(y_test, y_pred_RF)"
  ], "id": "23d68d066dc47f9" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "accuracy_RF", "id": "55789560bb43f9b8" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "print(classification_rep_RF)", "id": "d15d57c467b94bad" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "print(confusion_matrix_RF)", "id": "477ea9a19dbe7389" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# The random forest turned out to be the best model - it is better to classify spam as a non-spam message than the other way round.\n",
    "# That is why we choose RF rather than NB."
  ], "id": "9c3308c811b9d014" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Now train on the full data and save the model so that it can be used on real data in the later\n",
    "# application."
  ], "id": "81f08fa14ba4daf5" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Same hyperparameters as the forest evaluated above\n",
    "model_RF_full = RandomForestClassifier(**RF_PARAMS)"
  ], "id": "7f580653f470d7af" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "model_RF_full.fit(X, y)", "id": "f75fc9a4d4746e5a" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Prediction and evaluation of the full-data forest.\n",
    "# NOTE: X and y are exactly the data this model was just fitted on, so these are training-set\n",
    "# metrics (a sanity check), not an estimate of performance on unseen messages.\n",
    "y_pred_RF_full = model_RF_full.predict(X)\n",
    "accuracy_RF_full = accuracy_score(y, y_pred_RF_full)\n",
    "classification_rep_RF_full = classification_report(y, y_pred_RF_full)\n",
    "confusion_matrix_RF_full = confusion_matrix(y, y_pred_RF_full)"
  ], "id": "3d77bed327ac2fa1" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "accuracy_RF_full", "id": "a76a53da77128562" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "print(classification_rep_RF_full)", "id": "9a66104fd13572f8" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "print(confusion_matrix_RF_full)", "id": "823635f2315ecf05" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "model_RF_full", "id": "d0136f7b9f6344c4" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Save the model and the vectorizer\n",
    "joblib.dump(model_RF_full, 
'spam_classifier_model.pkl')\n",
    "joblib.dump(vectorizer, 'vectorizer.pkl')"
  ], "id": "e02e9031d10617f6" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# Note: the scikit-learn and joblib versions used here must match those in the application environment", "id": "2ac5943e18571301" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": [
    "# Print the installed versions in pure Python. This replaces `pip freeze | findstr scikit`,\n",
    "# which relied on the Windows-only findstr command.\n",
    "from importlib.metadata import version\n",
    "\n",
    "for pkg in ('scikit-learn', 'joblib'):\n",
    "    print(pkg, version(pkg))"
  ], "id": "a238743e07978f4" },
{ "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "# How to install a matching version?", "id": "a64099b8c61a884" },
{ "cell_type": "code", "execution_count": null, "id": "d99c1dbe", "metadata": { "ExecuteTime": { "end_time": "2024-06-05T16:57:22.800834Z", "start_time": "2024-06-05T16:57:22.798725Z" } }, "outputs": [], "source": [
    "# For example:\n",
    "# pip install scikit-learn==1.3.2"
  ] }
], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }