PhishGuardian/backend/.ipynb_checkpoints/ML-checkpoint.ipynb

756 lines
20 KiB
Plaintext

{
"cells": [
{
"metadata": {
"jupyter": {
"is_executing": true
},
"ExecuteTime": {
"start_time": "2024-06-05T20:03:23.481431Z"
}
},
"cell_type": "code",
"source": [
"%pip install pandas\n",
"%pip install matplotlib\n",
"%pip install nltk\n",
"%pip install wordcloud\n",
"%pip install scikit-learn==1.3.2\n",
"%pip install scikit-fuzzy==0.4.2\n",
"# Import pakietów\n",
"import nltk\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import re\n",
"import string\n",
"from wordcloud import WordCloud\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
"from nltk.corpus import stopwords\n",
"from nltk.stem import PorterStemmer\n",
"from nltk.tokenize import word_tokenize\n",
"import joblib\n",
"import pickle"
],
"id": "b313cab7d5cc49c0",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: pandas in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (2.2.2)\n",
"Requirement already satisfied: numpy>=1.26.0 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (1.26.4)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from pandas) (2.9.0.post0)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n",
"Requirement already satisfied: tzdata>=2022.7 in c:\\users\\alicj\\appdata\\local\\programs\\python\\python312\\lib\\site-packages (from pandas) (2024.1)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\alicj\\appdata\\roaming\\python\\python312\\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Załaduj dane\n",
"data_path = \"joined_data.csv\"\n",
"data = pd.read_csv(data_path)"
],
"id": "768266dbb79c5e9d"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "print(data.head())",
"id": "ee08266d5c30627b"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "print(data.info())",
"id": "1798f605e33fe5e5"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "data",
"id": "b4f43d913b92485b"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# Usuwamy NaN",
"id": "e3bf0f04a2be4e1a"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "data.dropna(inplace=True)",
"id": "71a6bbebdb0dccd4"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# Usuwamy puste wiadomości i wiadomości zawierające jedynie \"\\n\"",
"id": "b7fca25d67381cdd"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "data = data[data['Body'] != '\\n']",
"id": "72d84bf6c1e7023a"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "data = data[data['Body'] != 'empty']",
"id": "7c94c4dca6c4cdae"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "data.reset_index(drop=True, inplace=True)",
"id": "7e6fd3f8014498f3"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "data",
"id": "a0c33f82a936c59"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Sprawdźmy rozkład targetów\n",
"print(data['Label'].value_counts())"
],
"id": "19af5936d0cfeba2"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# Analiza długości wiadomości",
"id": "96c861e2655312cb"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"def get_len(row):\n",
"    \"\"\"Return ``len(row)``, or ``row`` unchanged when it has no length.\n",
"\n",
"    ``len()`` raises ``TypeError`` for objects that do not define a length;\n",
"    such values are passed through as-is instead of raising.\n",
"    \"\"\"\n",
"    try:\n",
"        return len(row)\n",
"    except TypeError:\n",
"        # Catch only the \"no len()\" failure; a bare ``except`` would also\n",
"        # swallow KeyboardInterrupt and hide unrelated errors.\n",
"        return row"
],
"id": "e1ec1ed8aa7c856d"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "data['message_length'] = data['Body'].apply(get_len)",
"id": "63c023f34d234f3e"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "data.sort_values(by='message_length')",
"id": "d4fd0e2dcc2bfee9"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# Jedna wiadomość jest bardzo długa 17085626",
"id": "e62112260ebc17f0"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "data['message_length'].value_counts()",
"id": "7c369131e3c91ce3"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Message-length histogram per class; only messages shorter than 200,000\n",
"# characters are kept so the histograms remain readable.\n",
"hist_data = data.loc[data['message_length'] < 200000]\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"for label_value, legend_name in ((0, 'Not Spam'), (1, 'Spam')):\n",
"    lengths = hist_data.loc[hist_data['Label'] == label_value, 'message_length']\n",
"    lengths.hist(bins=100, alpha=0.6, label=legend_name, ax=ax)\n",
"ax.legend()\n",
"ax.set_xlabel('Długość wiadomości')\n",
"ax.set_ylabel('Liczba wiadomości')\n",
"ax.set_title('Rozkład długości wiadomości')\n",
"plt.show()"
],
"id": "b6b509692fd7c541"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# Ograniczamy jeszcze bardziej ",
"id": "7182d6a1d6600c2"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Message-length histogram per class, narrowed further to messages shorter\n",
"# than 10,000 characters so the histograms remain readable.\n",
"hist_data = data.loc[data['message_length'] < 10000]\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"for label_value, legend_name in ((0, 'Not Spam'), (1, 'Spam')):\n",
"    lengths = hist_data.loc[hist_data['Label'] == label_value, 'message_length']\n",
"    lengths.hist(bins=100, alpha=0.6, label=legend_name, ax=ax)\n",
"ax.legend()\n",
"ax.set_xlabel('Długość wiadomości')\n",
"ax.set_ylabel('Liczba wiadomości')\n",
"ax.set_title('Rozkład długości wiadomości')\n",
"plt.show()"
],
"id": "962efe0bd652ecdb"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# Można zauważyć, że trudno odróżnić wiadomości po samej długości. W tym celu należy skorzystać z bardziej zaawansowanych metod.",
"id": "eaa483deb9c81942"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# Przetwarzanie tekstu",
"id": "6e0ee5fccf308cd1"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "data",
"id": "50c0131db25859cb"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"stop_words = set(stopwords.words('english'))\n",
"ps = PorterStemmer()\n",
"# Built once: the punctuation-stripping table is identical for every message.\n",
"_PUNCT_TABLE = str.maketrans('', '', string.punctuation)\n",
"\n",
"\n",
"def preprocess_text(text):\n",
"    \"\"\"Turn one message body into a space-joined string of stemmed tokens.\n",
"\n",
"    Steps: drop every run of digits, strip all ``string.punctuation``\n",
"    characters, tokenize with ``word_tokenize``, discard tokens whose\n",
"    lower-cased form is in ``stop_words``, and stem the rest with ``ps``.\n",
"    \"\"\"\n",
"    # Remove digits and punctuation, then tokenize.\n",
"    text = re.sub(r'\\d+', '', text)\n",
"    text = text.translate(_PUNCT_TABLE)\n",
"    words = word_tokenize(text)\n",
"    # Drop stop words and stem what is left.\n",
"    words = [ps.stem(word) for word in words if word.lower() not in stop_words]\n",
"    return \" \".join(words)"
],
"id": "c32c52a7b2575a3b"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# Ten proces jest czasochłonny",
"id": "5953cb974349cb33"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "data['processed_message'] = data['Body'].apply(preprocess_text)",
"id": "89b8cdeaa9da5c2d"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "data.head()",
"id": "ccce395ac94c39a1"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "data['processed_message']",
"id": "7ce382be7bcdff2c"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Analiza słów za pomocą WordCloud\n",
"spam_words = ' '.join(list(data[data['Label'] == 1]['processed_message']))\n",
"not_spam_words = ' '.join(list(data[data['Label'] == 0]['processed_message']))"
],
"id": "dc456d793b576f7"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"plt.figure(figsize=(10, 6))\n",
"wordcloud_spam = WordCloud(width=800, height=400).generate(spam_words)\n",
"plt.imshow(wordcloud_spam, interpolation='bilinear')\n",
"plt.axis('off')\n",
"plt.title('Word Cloud dla Spam')\n",
"plt.show()"
],
"id": "c9d7d9c9f4ae91ed"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"plt.figure(figsize=(10, 6))\n",
"wordcloud_not_spam = WordCloud(width=800, height=400).generate(not_spam_words)\n",
"plt.imshow(wordcloud_not_spam, interpolation='bilinear')\n",
"plt.axis('off')\n",
"plt.title('Word Cloud dla Not Spam')\n",
"plt.show()"
],
"id": "d954e01a1d0b3a97"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# Budowa modelu klasyfikacyjnego",
"id": "743000c7d99b8a85"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Convert the processed text into token-count vectors.\n",
"# NOTE(review): the vocabulary is fitted on every row of `data`, i.e. before\n",
"# the train/test split performed further down, so held-out messages also\n",
"# contribute to it - confirm this is intended.\n",
"vectorizer = CountVectorizer()\n",
"X = vectorizer.fit_transform(data['processed_message'])\n",
"y = data['Label']"
],
"id": "7b3ba8e5b035cdc0"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Podział na zbiór treningowy i testowy\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
],
"id": "5d66dcf506f4f399"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Trenowanie modelu Naiwnego Bayesa\n",
"model_NB = MultinomialNB()\n",
"model_NB.fit(X_train, y_train)"
],
"id": "b3c2a6673c718301"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Predykcja i ocena Naiwny Bayes\n",
"y_pred_NB = model_NB.predict(X_test)\n",
"accuracy_NB = accuracy_score(y_test, y_pred_NB)\n",
"classification_rep_NB = classification_report(y_test, y_pred_NB)\n",
"confusion_matrix_NB = confusion_matrix(y_test, y_pred_NB)"
],
"id": "82f18edc9161422a"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "accuracy_NB",
"id": "a629b6b89d5cdf34"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "print(classification_rep_NB)",
"id": "53c0cf3dc8aa02bc"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "print(confusion_matrix_NB)",
"id": "9b915d02828de60"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# Trening Drzewa Decyzyjnego (DT)",
"id": "160da18f95c142a0"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Default hyperparameters, written out explicitly. random_state is fixed\n",
"# (same seed as the random forest cells) so re-running gives the same tree.\n",
"model_DT = DecisionTreeClassifier(criterion='gini',\n",
"                                  max_depth=None,\n",
"                                  min_samples_leaf=1,\n",
"                                  min_samples_split=2,\n",
"                                  splitter='best',\n",
"                                  random_state=123)\n",
"model_DT.fit(X_train, y_train)"
],
"id": "8720ed4fd0ed5c72"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Predykcja i ocena DT\n",
"y_pred_DT = model_DT.predict(X_test)\n",
"accuracy_DT = accuracy_score(y_test, y_pred_DT)\n",
"classification_rep_DT = classification_report(y_test, y_pred_DT)\n",
"confusion_matrix_DT = confusion_matrix(y_test, y_pred_DT)"
],
"id": "7aee079d59bdd4eb"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "accuracy_DT",
"id": "57ac5a3ffe724fd5"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "print(classification_rep_DT)",
"id": "ed8955dc5d5cdeaf"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "print(confusion_matrix_DT)",
"id": "3ebfee20eb06e8cc"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# Las losowy",
"id": "85d3dc4e44a2a4b3"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"model_RF = RandomForestClassifier(n_estimators= 100,\n",
" bootstrap= True,\n",
" ccp_alpha= 0.0,\n",
" criterion= 'gini',\n",
" max_depth= None,\n",
" min_samples_leaf= 1,\n",
" min_samples_split= 2,\n",
" random_state=123)\n",
"model_RF.fit(X_train, y_train)"
],
"id": "6f454235f54aa9cc"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Predykcja i ocena RF\n",
"y_pred_RF = model_RF.predict(X_test)\n",
"accuracy_RF = accuracy_score(y_test, y_pred_RF)\n",
"classification_rep_RF = classification_report(y_test, y_pred_RF)\n",
"confusion_matrix_RF = confusion_matrix(y_test, y_pred_RF)"
],
"id": "23d68d066dc47f9"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "accuracy_RF",
"id": "55789560bb43f9b8"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "print(classification_rep_RF)",
"id": "d15d57c467b94bad"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "print(confusion_matrix_RF)",
"id": "477ea9a19dbe7389"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Najlepszym modelem okazał się Las losowy - lepiej sklasyfikować spam jako wiadomość nie będącą spamem niż odwrotnie. \n",
"# Dlatego wybieramy RF, a nie NB."
],
"id": "9c3308c811b9d014"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Teraz dokonamy treningu na pełnych danych i zapiszemy model celem wykorzystania na danych rzeczywistych w późniejszej \n",
"# aplikacji."
],
"id": "81f08fa14ba4daf5"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"model_RF_full = RandomForestClassifier(n_estimators= 100,\n",
" bootstrap= True,\n",
" ccp_alpha= 0.0,\n",
" criterion= 'gini',\n",
" max_depth= None,\n",
" min_samples_leaf= 1,\n",
" min_samples_split= 2,\n",
" random_state=123)"
],
"id": "7f580653f470d7af"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "model_RF_full.fit(X, y)",
"id": "f75fc9a4d4746e5a"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Prediction and evaluation of the random forest fitted on the full data set.\n",
"# NOTE: predictions are made on X, the same data passed to model_RF_full.fit,\n",
"# so these are in-sample metrics rather than a held-out estimate.\n",
"y_pred_RF_full = model_RF_full.predict(X)\n",
"accuracy_RF_full = accuracy_score(y, y_pred_RF_full)\n",
"classification_rep_RF_full = classification_report(y, y_pred_RF_full)\n",
"confusion_matrix_RF_full = confusion_matrix(y, y_pred_RF_full)"
],
"id": "3d77bed327ac2fa1"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "accuracy_RF_full",
"id": "a76a53da77128562"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "print(classification_rep_RF_full)",
"id": "9a66104fd13572f8"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "print(confusion_matrix_RF_full)",
"id": "823635f2315ecf05"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "model_RF_full",
"id": "d0136f7b9f6344c4"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Zapisz model i vectorizer\n",
"joblib.dump(model_RF_full, 'spam_classifier_model.pkl')\n",
"joblib.dump(vectorizer, 'vectorizer.pkl')"
],
"id": "e02e9031d10617f6"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# Uwaga, ważna jest zgodność wersji scikita i joblib tutaj i w środowisku aplikacji",
"id": "2ac5943e18571301"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": [
"# Report the library versions that have to match the application environment.\n",
"# Done in Python so the cell does not depend on a platform-specific shell tool.\n",
"import sklearn\n",
"\n",
"print('scikit-learn', sklearn.__version__)\n",
"print('joblib', joblib.__version__)"
],
"id": "a238743e07978f4"
},
{
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "# Jak instalować?",
"id": "a64099b8c61a884"
},
{
"cell_type": "code",
"execution_count": 140,
"id": "d99c1dbe",
"metadata": {
"ExecuteTime": {
"end_time": "2024-06-05T16:57:22.800834Z",
"start_time": "2024-06-05T16:57:22.798725Z"
}
},
"outputs": [],
"source": [
"# Np. tak\n",
"# pip install scikit-learn==1.3.2"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}