PhishGuardian/backend/.ipynb_checkpoints/ML-checkpoint.ipynb

{
 "cells": [
  {
   "metadata": {
    "jupyter": {
     "is_executing": true
    },
    "ExecuteTime": {
     "start_time": "2024-06-05T20:03:23.481431Z"
    }
   },
   "cell_type": "code",
   "source": [
    "%pip install pandas\n",
    "%pip install matplotlib\n",
    "%pip install nltk\n",
    "%pip install wordcloud\n",
    "%pip install scikit-learn==1.3.2\n",
    "%pip install scikit-fuzzy==0.4.2\n",
    "# Import pakietów\n",
    "import nltk\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import re\n",
    "import string\n",
    "from wordcloud import WordCloud\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.stem import PorterStemmer\n",
    "from nltk.tokenize import word_tokenize\n",
    "import joblib\n",
    "import pickle"
   ],
   "id": "b313cab7d5cc49c0",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: pandas in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (2.2.2)\n",
      "Requirement already satisfied: numpy>=1.26.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (1.26.4)\n",
      "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from pandas) (2.9.0.post0)\n",
      "Requirement already satisfied: pytz>=2020.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n",
      "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n",
      "Requirement already satisfied: six>=1.5 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Load the dataset from a CSV file (the path is relative)\n",
    "data_path = \"joined_data.csv\"\n",
    "data = pd.read_csv(data_path)"
   ],
   "id": "768266dbb79c5e9d"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Bare expression instead of print(): the notebook renders the frame as a rich table\n",
    "data.head()"
   ],
   "id": "ee08266d5c30627b"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# DataFrame.info() prints its report itself and returns None,\n",
    "# so wrapping it in print() only appended a stray 'None' line.\n",
    "data.info()"
   ],
   "id": "1798f605e33fe5e5"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "data",
   "id": "b4f43d913b92485b"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "# Usuwamy NaN",
   "id": "e3bf0f04a2be4e1a"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "data.dropna(inplace=True)",
   "id": "71a6bbebdb0dccd4"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "# Usuwamy puste wiadomości i wiadomości zawierające jedynie \"\\n\"",
   "id": "b7fca25d67381cdd"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "data = data[data['Body'] != '\\n']",
   "id": "72d84bf6c1e7023a"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "data = data[data['Body'] != 'empty']",
   "id": "7c94c4dca6c4cdae"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "data.reset_index(drop=True, inplace=True)",
   "id": "7e6fd3f8014498f3"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "data",
   "id": "a0c33f82a936c59"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Check the distribution of the target labels\n",
    "print(data['Label'].value_counts())"
   ],
   "id": "19af5936d0cfeba2"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "# Analiza długości wiadomości",
   "id": "96c861e2655312cb"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "def get_len(row):\n",
    "    \"\"\"Return the length of ``row``, or ``row`` unchanged when it has no length.\n",
    "\n",
    "    Values that do not support ``len()`` (for example a number) make ``len()``\n",
    "    raise ``TypeError``; such values are passed through as-is.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return len(row)\n",
    "    except TypeError:\n",
    "        # Only the 'object has no len()' case is a deliberate fallback; a bare\n",
    "        # ``except`` would also swallow KeyboardInterrupt and unrelated errors.\n",
    "        return row"
   ],
   "id": "e1ec1ed8aa7c856d"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "data['message_length'] = data['Body'].apply(get_len)",
   "id": "63c023f34d234f3e"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "data.sort_values(by='message_length')",
   "id": "d4fd0e2dcc2bfee9"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "# Jedna wiadomość jest bardzo długa 17085626",
   "id": "e62112260ebc17f0"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "data['message_length'].value_counts()",
   "id": "7c369131e3c91ce3"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Per-class histogram of message lengths; only messages shorter than 200,000\n",
    "# characters are plotted so that the histograms stay readable\n",
    "hist_data = data[data['message_length'] < 200000]\n",
    "plt.figure(figsize=(10, 6))\n",
    "for label_value, legend_name in ((0, 'Not Spam'), (1, 'Spam')):\n",
    "    lengths = hist_data.loc[hist_data['Label'] == label_value, 'message_length']\n",
    "    lengths.hist(bins=100, alpha=0.6, label=legend_name)\n",
    "plt.xlabel('Długość wiadomości')\n",
    "plt.ylabel('Liczba wiadomości')\n",
    "plt.title('Rozkład długości wiadomości')\n",
    "plt.legend()\n",
    "plt.show()"
   ],
   "id": "b6b509692fd7c541"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "# Ograniczamy jeszcze bardziej ",
   "id": "7182d6a1d6600c2"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Per-class histogram of message lengths; only messages shorter than 10,000\n",
    "# characters are plotted so that the histograms stay readable\n",
    "hist_data = data[data['message_length'] < 10000]\n",
    "plt.figure(figsize=(10, 6))\n",
    "for label_value, legend_name in ((0, 'Not Spam'), (1, 'Spam')):\n",
    "    lengths = hist_data.loc[hist_data['Label'] == label_value, 'message_length']\n",
    "    lengths.hist(bins=100, alpha=0.6, label=legend_name)\n",
    "plt.xlabel('Długość wiadomości')\n",
    "plt.ylabel('Liczba wiadomości')\n",
    "plt.title('Rozkład długości wiadomości')\n",
    "plt.legend()\n",
    "plt.show()"
   ],
   "id": "962efe0bd652ecdb"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "# Można zauważyć, że trudno odróżnić wiadomości po samej długości. W tym celu należy skorzystać z bardziej zaawansowanych metod.",
   "id": "eaa483deb9c81942"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "# Przetwarzanie tekstu",
   "id": "6e0ee5fccf308cd1"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "data",
   "id": "50c0131db25859cb"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "stop_words = set(stopwords.words('english'))\n",
    "ps = PorterStemmer()\n",
    "\n",
    "def preprocess_text(text):\n",
    "    \"\"\"Turn one message body into a space-separated string of stemmed tokens.\n",
    "\n",
    "    Steps: remove digit runs, remove the characters in ``string.punctuation``,\n",
    "    tokenize with NLTK, drop English stop words (compared in lower case) and\n",
    "    stem the remaining tokens with the Porter stemmer.\n",
    "    \"\"\"\n",
    "    without_digits = re.sub(r'\\d+', '', text)\n",
    "    punctuation_table = str.maketrans('', '', string.punctuation)\n",
    "    cleaned = without_digits.translate(punctuation_table)\n",
    "\n",
    "    stems = []\n",
    "    for token in word_tokenize(cleaned):\n",
    "        if token.lower() in stop_words:\n",
    "            continue\n",
    "        stems.append(ps.stem(token))\n",
    "    return \" \".join(stems)"
   ],
   "id": "c32c52a7b2575a3b"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "# Ten proces jest czasochłonny",
   "id": "5953cb974349cb33"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "data['processed_message'] = data['Body'].apply(preprocess_text)",
   "id": "89b8cdeaa9da5c2d"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "data.head()",
   "id": "ccce395ac94c39a1"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "data['processed_message']",
   "id": "7ce382be7bcdff2c"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Word analysis with WordCloud: concatenate the processed messages of each class\n",
    "spam_words = ' '.join(data.loc[data['Label'] == 1, 'processed_message'])\n",
    "not_spam_words = ' '.join(data.loc[data['Label'] == 0, 'processed_message'])"
   ],
   "id": "dc456d793b576f7"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Render a word cloud generated from spam_words, with the axes hidden\n",
    "plt.figure(figsize=(10, 6))\n",
    "wordcloud_spam = WordCloud(width=800, height=400).generate(spam_words)\n",
    "plt.imshow(wordcloud_spam, interpolation='bilinear')\n",
    "plt.axis('off')\n",
    "plt.title('Word Cloud dla Spam')\n",
    "plt.show()"
   ],
   "id": "c9d7d9c9f4ae91ed"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Render a word cloud generated from not_spam_words, with the axes hidden\n",
    "plt.figure(figsize=(10, 6))\n",
    "wordcloud_not_spam = WordCloud(width=800, height=400).generate(not_spam_words)\n",
    "plt.imshow(wordcloud_not_spam, interpolation='bilinear')\n",
    "plt.axis('off')\n",
    "plt.title('Word Cloud dla Not Spam')\n",
    "plt.show()"
   ],
   "id": "d954e01a1d0b3a97"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "# Budowa modelu klasyfikacyjnego",
   "id": "743000c7d99b8a85"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Convert the processed text into count vectors\n",
    "# NOTE(review): the vocabulary is fitted on all rows, i.e. before the train/test\n",
    "# split, so the test rows contribute to the feature space - confirm this is intended\n",
    "vectorizer = CountVectorizer()\n",
    "X = vectorizer.fit_transform(data['processed_message'])\n",
    "y = data['Label']"
   ],
   "id": "7b3ba8e5b035cdc0"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Split into a training set and a test set (20% held out, fixed seed)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
   ],
   "id": "5d66dcf506f4f399"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Train the Naive Bayes model on the training split\n",
    "model_NB = MultinomialNB()\n",
    "model_NB.fit(X_train, y_train)"
   ],
   "id": "b3c2a6673c718301"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Prediction and evaluation of Naive Bayes on the test split\n",
    "y_pred_NB = model_NB.predict(X_test)\n",
    "accuracy_NB = accuracy_score(y_test, y_pred_NB)\n",
    "classification_rep_NB = classification_report(y_test, y_pred_NB)\n",
    "confusion_matrix_NB = confusion_matrix(y_test, y_pred_NB)"
   ],
   "id": "82f18edc9161422a"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "accuracy_NB",
   "id": "a629b6b89d5cdf34"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "print(classification_rep_NB)",
   "id": "53c0cf3dc8aa02bc"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "print(confusion_matrix_NB)",
   "id": "9b915d02828de60"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "# Trening Drzewa Decyzyjnego (DT)",
   "id": "160da18f95c142a0"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Library-default hyperparameters, spelled out explicitly.\n",
    "# random_state is fixed because scikit-learn permutes the candidate features at\n",
    "# every split (even with splitter='best'), so an unseeded tree - and the metrics\n",
    "# computed from it - can differ between runs.\n",
    "model_DT = DecisionTreeClassifier(criterion= 'gini',\n",
    "                                  max_depth= None,\n",
    "                                  min_samples_leaf= 1,\n",
    "                                  min_samples_split= 2,\n",
    "                                  splitter= 'best',\n",
    "                                  random_state=123)\n",
    "model_DT.fit(X_train, y_train)"
   ],
   "id": "8720ed4fd0ed5c72"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Prediction and evaluation of the decision tree on the test split\n",
    "y_pred_DT = model_DT.predict(X_test)\n",
    "accuracy_DT = accuracy_score(y_test, y_pred_DT)\n",
    "classification_rep_DT = classification_report(y_test, y_pred_DT)\n",
    "confusion_matrix_DT = confusion_matrix(y_test, y_pred_DT)"
   ],
   "id": "7aee079d59bdd4eb"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "accuracy_DT",
   "id": "57ac5a3ffe724fd5"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "print(classification_rep_DT)",
   "id": "ed8955dc5d5cdeaf"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "print(confusion_matrix_DT)",
   "id": "3ebfee20eb06e8cc"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "# Las losowy",
   "id": "85d3dc4e44a2a4b3"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Random forest with its hyperparameters spelled out and a fixed seed,\n",
    "# fitted on the training split\n",
    "model_RF = RandomForestClassifier(\n",
    "    n_estimators=100,\n",
    "    criterion='gini',\n",
    "    max_depth=None,\n",
    "    min_samples_split=2,\n",
    "    min_samples_leaf=1,\n",
    "    bootstrap=True,\n",
    "    ccp_alpha=0.0,\n",
    "    random_state=123,\n",
    ")\n",
    "model_RF.fit(X_train, y_train)"
   ],
   "id": "6f454235f54aa9cc"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Prediction and evaluation of the random forest on the test split\n",
    "y_pred_RF = model_RF.predict(X_test)\n",
    "accuracy_RF = accuracy_score(y_test, y_pred_RF)\n",
    "classification_rep_RF = classification_report(y_test, y_pred_RF)\n",
    "confusion_matrix_RF = confusion_matrix(y_test, y_pred_RF)"
   ],
   "id": "23d68d066dc47f9"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "accuracy_RF",
   "id": "55789560bb43f9b8"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "print(classification_rep_RF)",
   "id": "d15d57c467b94bad"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "print(confusion_matrix_RF)",
   "id": "477ea9a19dbe7389"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Najlepszym modelem okazał się Las losowy - lepiej sklasyfikować spam jako wiadomość nie będącą spamem niż odwrotnie. \n",
    "# Dlatego wybieramy RF, a nie NB."
   ],
   "id": "9c3308c811b9d014"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Teraz dokonamy treningu na pełnych danych i zapiszemy model celem wykorzystania na danych rzeczywistych w późniejszej \n",
    "# aplikacji."
   ],
   "id": "81f08fa14ba4daf5"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Unfitted random forest with the same hyperparameters and seed as model_RF;\n",
    "# this instance is the one that gets fitted on the full dataset\n",
    "model_RF_full = RandomForestClassifier(\n",
    "    n_estimators=100,\n",
    "    criterion='gini',\n",
    "    max_depth=None,\n",
    "    min_samples_split=2,\n",
    "    min_samples_leaf=1,\n",
    "    bootstrap=True,\n",
    "    ccp_alpha=0.0,\n",
    "    random_state=123,\n",
    ")"
   ],
   "id": "7f580653f470d7af"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "model_RF_full.fit(X, y)",
   "id": "f75fc9a4d4746e5a"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Prediction and evaluation of the random forest fitted on the full dataset\n",
    "# NOTE: X and y are the same data model_RF_full was fitted on, so these are\n",
    "# training-set metrics, not an estimate of performance on unseen messages\n",
    "y_pred_RF_full = model_RF_full.predict(X)\n",
    "accuracy_RF_full = accuracy_score(y, y_pred_RF_full)\n",
    "classification_rep_RF_full = classification_report(y, y_pred_RF_full)\n",
    "confusion_matrix_RF_full = confusion_matrix(y, y_pred_RF_full)"
   ],
   "id": "3d77bed327ac2fa1"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "accuracy_RF_full",
   "id": "a76a53da77128562"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "print(classification_rep_RF_full)",
   "id": "9a66104fd13572f8"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "print(confusion_matrix_RF_full)",
   "id": "823635f2315ecf05"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "model_RF_full",
   "id": "d0136f7b9f6344c4"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Save the model and the vectorizer (written to the current working directory)\n",
    "joblib.dump(model_RF_full, 'spam_classifier_model.pkl')\n",
    "joblib.dump(vectorizer, 'vectorizer.pkl')"
   ],
   "id": "e02e9031d10617f6"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "# Uwaga, ważna jest zgodność wersji scikita i joblib tutaj i w środowisku aplikacji",
   "id": "2ac5943e18571301"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": [
    "# Report the installed versions in pip-freeze format using importlib.metadata.\n",
    "# The previous `pip freeze | findstr scikit` relied on IPython automagic and on\n",
    "# the Windows-only findstr command, and it did not show the joblib version.\n",
    "from importlib.metadata import PackageNotFoundError, version\n",
    "\n",
    "for package in ('scikit-learn', 'scikit-fuzzy', 'joblib'):\n",
    "    try:\n",
    "        print(f'{package}=={version(package)}')\n",
    "    except PackageNotFoundError:\n",
    "        print(f'{package} is not installed')"
   ],
   "id": "a238743e07978f4"
  },
  {
   "metadata": {},
   "cell_type": "code",
   "outputs": [],
   "execution_count": null,
   "source": "# Jak instalować?",
   "id": "a64099b8c61a884"
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "id": "d99c1dbe",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-06-05T16:57:22.800834Z",
     "start_time": "2024-06-05T16:57:22.798725Z"
    }
   },
   "outputs": [],
   "source": [
    "# Np. tak\n",
    "# pip install scikit-learn==1.3.2"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}