PhishGuardian/backend/ML.ipynb

887 KiB
Raw Permalink Blame History

%pip install pandas
%pip install matplotlib
%pip install nltk
%pip install wordcloud
%pip install scikit-learn==1.3.2
%pip install scikit-fuzzy==0.4.2
# Import pakietów
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import joblib
import pickle
Requirement already satisfied: pandas in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (2.2.2)Note: you may need to restart the kernel to use updated packages.

Requirement already satisfied: numpy>=1.26.0 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from pandas) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\alicj\appdata\roaming\python\python312\site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from pandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from pandas) (2024.1)
Requirement already satisfied: six>=1.5 in c:\users\alicj\appdata\roaming\python\python312\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Requirement already satisfied: matplotlib in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (3.9.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (1.2.1)
Requirement already satisfied: cycler>=0.10 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (4.53.0)
Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (1.4.5)
Requirement already satisfied: numpy>=1.23 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (1.26.4)
Requirement already satisfied: packaging>=20.0 in c:\users\alicj\appdata\roaming\python\python312\site-packages (from matplotlib) (24.0)
Requirement already satisfied: pillow>=8 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (10.3.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from matplotlib) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\alicj\appdata\roaming\python\python312\site-packages (from matplotlib) (2.9.0.post0)
Requirement already satisfied: six>=1.5 in c:\users\alicj\appdata\roaming\python\python312\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: nltk in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (3.8.1)
Requirement already satisfied: click in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from nltk) (8.1.7)
Requirement already satisfied: joblib in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from nltk) (1.4.2)
Requirement already satisfied: regex>=2021.8.3 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from nltk) (2024.5.15)
Requirement already satisfied: tqdm in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from nltk) (4.66.4)
Requirement already satisfied: colorama in c:\users\alicj\appdata\roaming\python\python312\site-packages (from click->nltk) (0.4.6)
Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: wordcloud in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (1.9.3)
Requirement already satisfied: numpy>=1.6.1 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from wordcloud) (1.26.4)
Requirement already satisfied: pillow in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from wordcloud) (10.3.0)
Requirement already satisfied: matplotlib in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from wordcloud) (3.9.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (1.2.1)
Requirement already satisfied: cycler>=0.10 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (4.53.0)
Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (1.4.5)
Requirement already satisfied: packaging>=20.0 in c:\users\alicj\appdata\roaming\python\python312\site-packages (from matplotlib->wordcloud) (24.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from matplotlib->wordcloud) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\alicj\appdata\roaming\python\python312\site-packages (from matplotlib->wordcloud) (2.9.0.post0)
Requirement already satisfied: six>=1.5 in c:\users\alicj\appdata\roaming\python\python312\site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: scikit-learn==1.3.2 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (1.3.2)
Requirement already satisfied: numpy<2.0,>=1.17.3 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn==1.3.2) (1.26.4)
Requirement already satisfied: scipy>=1.5.0 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn==1.3.2) (1.13.1)
Requirement already satisfied: joblib>=1.1.1 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn==1.3.2) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from scikit-learn==1.3.2) (3.5.0)
Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: scikit-fuzzy==0.4.2 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (0.4.2)
Requirement already satisfied: numpy>=1.6.0 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from scikit-fuzzy==0.4.2) (1.26.4)
Requirement already satisfied: scipy>=0.9.0 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from scikit-fuzzy==0.4.2) (1.13.1)
Requirement already satisfied: networkx>=1.9.0 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from scikit-fuzzy==0.4.2) (3.3)
Note: you may need to restart the kernel to use updated packages.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alicj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alicj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
# Load the labelled e-mail dataset from a CSV file in the working directory.
data_path = "joined_data.csv"
data = pd.read_csv(data_path)
# Preview the first rows to check which columns were read.
print(data.head())
   Unnamed: 0                                               Body  Label
0           0  Subject: congratulations\n vince ,\n congratul...      0
1           1  \nhttp://news.bbc.co.uk/1/hi/scotland/2515231....      0
2           2  Big and big\nMAIN PAGE\nHuge big titties @ big...      1
3           3  Subject: re : enron visit - - thanks\n larry ,...      0
4           4  On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan...      0
print(data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18651 entries, 0 to 18650
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  18651 non-null  int64 
 1   Body        18650 non-null  object
 2   Label       18651 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 437.3+ KB
None
data
Unnamed: 0 Body Label
0 0 Subject: congratulations\n vince ,\n congratul... 0
1 1 \nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0
2 2 Big and big\nMAIN PAGE\nHuge big titties @ big... 1
3 3 Subject: re : enron visit - - thanks\n larry ,... 0
4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0
... ... ... ...
18646 18646 Subject: fluid analysis\n our customer speak v... 1
18647 18647 Subject: guadalupe\n i rolled 740208 , 740209 ... 0
18648 18648 100% Free Porn!\nWhat more can you ask for?\nC... 1
18649 18649 Subject: revised nominations\n daren ,\n we ha... 0
18650 18650 Hello,\nI've got a small problem but still ann... 0

18651 rows × 3 columns

# Drop rows that contain NaN in any column.
data.dropna(inplace=True)
# Drop placeholder messages: bodies that are exactly "\n" or exactly the
# literal string 'empty'. Both are exact-match comparisons, so bodies made of
# other whitespace-only content are kept.
data = data[data['Body'] != '\n']
data = data[data['Body'] != 'empty']
# Renumber the index 0..n-1 after filtering and discard the old index.
data.reset_index(drop=True, inplace=True)
data
Unnamed: 0 Body Label
0 0 Subject: congratulations\n vince ,\n congratul... 0
1 1 \nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0
2 2 Big and big\nMAIN PAGE\nHuge big titties @ big... 1
3 3 Subject: re : enron visit - - thanks\n larry ,... 0
4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0
... ... ... ...
18109 18646 Subject: fluid analysis\n our customer speak v... 1
18110 18647 Subject: guadalupe\n i rolled 740208 , 740209 ... 0
18111 18648 100% Free Porn!\nWhat more can you ask for?\nC... 1
18112 18649 Subject: revised nominations\n daren ,\n we ha... 0
18113 18650 Hello,\nI've got a small problem but still ann... 0

18114 rows × 3 columns

# Check the distribution of the target classes (count of rows per Label value).
print(data['Label'].value_counts())
Label
0    11124
1     6990
Name: count, dtype: int64
# Analiza długości wiadomości
def get_len(row):
    """Return ``len(row)``, or ``row`` unchanged when it has no length.

    Used with ``Series.apply`` to compute message lengths while passing
    through values such as NaN or numbers that do not support ``len()``.

    Args:
        row: Any value; typically a message body string.

    Returns:
        The length of ``row`` when it is sized, otherwise ``row`` itself.
    """
    try:
        return len(row)
    except TypeError:
        # len() raises TypeError for unsized objects (floats, ints, None).
        # Catching only that keeps KeyboardInterrupt and genuine errors
        # from being silently swallowed.
        return row
# Store the length of every message body in a new column.
data['message_length'] = data['Body'].apply(get_len)
# Display the rows ordered from shortest to longest; the result is not
# assigned, so `data` itself keeps its original order.
data.sort_values(by='message_length')
Unnamed: 0 Body Label message_length
16293 16774 \n4623\n 1 6
6071 6254 Subject: \n 1 10
3683 3792 Subject: \n 1 10
12843 13228 Subject: \n 1 10
17867 18399 Subject: \n 1 10
... ... ... ... ...
6927 7128 ------------------------ Yahoo! Groups Sponsor... 0 107989
6887 7088 Subject: enron mentions\n enron discusses cred... 0 121502
2422 2488 =?GB2312?B?yNW12squ0ru97NbQufq5+rzKtefX08nosb... 1 129635
1522 1569 change your settings: http://blo.gs/settings.p... 0 194978
4844 4987 ,Body,Label\n 0,"Subject: great part-time or s... 0 17085626

18114 rows × 4 columns

# One message is extremely long (17085626 characters).
# NOTE(review): its Body starts with ",Body,Label\n 0,...", which looks like a
# whole CSV file embedded in a single field — confirm, and consider dropping
# this row before training.
data['message_length'].value_counts()
message_length
293     68
295     53
291     52
539     44
446     40
        ..
2394     1
4856     1
6192     1
2597     1
4004     1
Name: count, Length: 4903, dtype: int64
# Histogram of message length for each class. Only messages shorter than
# 200,000 characters are plotted so that the histograms remain readable.
hist_data = data[data['message_length'] < 200000]
plt.figure(figsize=(10, 6))
# Both classes are drawn semi-transparently on the same axes for comparison.
hist_data[hist_data['Label'] == 0]['message_length'].hist(bins=100, alpha=0.6, label='Not Spam')
hist_data[hist_data['Label'] == 1]['message_length'].hist(bins=100, alpha=0.6, label='Spam')
plt.legend()
plt.xlabel('Długość wiadomości')
plt.ylabel('Liczba wiadomości')
plt.title('Rozkład długości wiadomości')
plt.show()
# Narrow the range even further.
# Histogram of message length for each class, restricted to messages shorter
# than 10,000 characters so the bulk of the distribution is visible.
hist_data = data[data['message_length'] < 10000]
plt.figure(figsize=(10, 6))
# Both classes are drawn semi-transparently on the same axes for comparison.
hist_data[hist_data['Label'] == 0]['message_length'].hist(bins=100, alpha=0.6, label='Not Spam')
hist_data[hist_data['Label'] == 1]['message_length'].hist(bins=100, alpha=0.6, label='Spam')
plt.legend()
plt.xlabel('Długość wiadomości')
plt.ylabel('Liczba wiadomości')
plt.title('Rozkład długości wiadomości')
plt.show()
# The histograms show that messages are hard to tell apart by length alone, so more advanced methods are needed.
# Przetwarzanie tekstu
data
Unnamed: 0 Body Label message_length
0 0 Subject: congratulations\n vince ,\n congratul... 0 129
1 1 \nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0 435
2 2 Big and big\nMAIN PAGE\nHuge big titties @ big... 1 231
3 3 Subject: re : enron visit - - thanks\n larry ,... 0 1180
4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0 574
... ... ... ... ...
18109 18646 Subject: fluid analysis\n our customer speak v... 1 927
18110 18647 Subject: guadalupe\n i rolled 740208 , 740209 ... 0 337
18111 18648 100% Free Porn!\nWhat more can you ask for?\nC... 1 345
18112 18649 Subject: revised nominations\n daren ,\n we ha... 0 346
18113 18650 Hello,\nI've got a small problem but still ann... 0 744

18114 rows × 4 columns

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance reused by every preprocess_text call.
ps = PorterStemmer()


def preprocess_text(text):
    """Normalise one message body into a space-separated string of stems.

    Steps, in order: remove every run of digits, remove the characters
    listed in ``string.punctuation``, tokenise with ``word_tokenize``, skip
    tokens whose lowercase form is an English stop word, and stem the
    remaining tokens with the shared Porter stemmer.

    Args:
        text: The raw message body as a string.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    digits_removed = re.sub(r'\d+', '', text)
    punctuation_free = digits_removed.translate(
        str.maketrans('', '', string.punctuation)
    )

    stems = []
    for token in word_tokenize(punctuation_free):
        # Stop-word lookup is case-insensitive; the original token is stemmed.
        if token.lower() in stop_words:
            continue
        stems.append(ps.stem(token))
    return " ".join(stems)
# This step is time-consuming: preprocess_text is applied to every message body.
data['processed_message'] = data['Body'].apply(preprocess_text)
data.head()
Unnamed: 0 Body Label message_length processed_message
0 0 Subject: congratulations\n vince ,\n congratul... 0 129 subject congratul vinc congratul wish best luc...
1 1 \nhttp://news.bbc.co.uk/1/hi/scotland/2515231.... 0 435 httpnewsbbccoukhiscotlandstm yahoo group spons...
2 2 Big and big\nMAIN PAGE\nHuge big titties @ big... 1 231 big big main page huge big titti bigbigscom sa...
3 3 Subject: re : enron visit - - thanks\n larry ,... 0 1180 subject enron visit thank larri think potenti ...
4 4 On Fri, Aug 09, 2002 at 09:30:29AM +0100, Ryan... 0 574 fri aug ryan shane mention imho stop spammer g...
data['processed_message']
0        subject congratul vinc congratul wish best luc...
1        httpnewsbbccoukhiscotlandstm yahoo group spons...
2        big big main page huge big titti bigbigscom sa...
3        subject enron visit thank larri think potenti ...
4        fri aug ryan shane mention imho stop spammer g...
                               ...                        
18109    subject fluid analysi custom speak volum spur ...
18110    subject guadalup roll june ena deal guadalup d...
18111    free porn ask click â â â remov instruct striv...
18112    subject revis nomin daren receiv revis nomin p...
18113    hello ive got small problem still annoy upgrad...
Name: processed_message, Length: 18114, dtype: object
# Word analysis with WordCloud: concatenate the processed messages of each
# class into one long string and render one word cloud per class.
spam_words = ' '.join(list(data[data['Label'] == 1]['processed_message']))
not_spam_words = ' '.join(list(data[data['Label'] == 0]['processed_message']))
# Word cloud for messages with Label == 1.
plt.figure(figsize=(10, 6))
wordcloud_spam = WordCloud(width=800, height=400).generate(spam_words)
plt.imshow(wordcloud_spam, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud dla Spam')
plt.show()
# Word cloud for messages with Label == 0.
plt.figure(figsize=(10, 6))
wordcloud_not_spam = WordCloud(width=800, height=400).generate(not_spam_words)
plt.imshow(wordcloud_not_spam, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud dla Not Spam')
plt.show()
# Build the classification model
# Convert the preprocessed text into token-count vectors
vectorizer = CountVectorizer()
# NOTE(review): the vocabulary is fitted on the full dataset before the
# train/test split below, so test messages contribute to the vocabulary —
# confirm this is acceptable for the reported test metrics.
X = vectorizer.fit_transform(data['processed_message'])
y = data['Label']
# Split into training (80%) and test (20%) sets; the fixed seed keeps the
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train the Naive Bayes model
model_NB = MultinomialNB()
model_NB.fit(X_train, y_train)
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MultinomialNB()
# Predict on the held-out test set and evaluate Naive Bayes: overall
# accuracy, per-class precision/recall/F1 report and confusion matrix.
y_pred_NB = model_NB.predict(X_test)
accuracy_NB = accuracy_score(y_test, y_pred_NB)
classification_rep_NB = classification_report(y_test, y_pred_NB)
confusion_matrix_NB = confusion_matrix(y_test, y_pred_NB)
accuracy_NB
0.9536295887386144
print(classification_rep_NB)
              precision    recall  f1-score   support

           0       0.98      0.95      0.96      2229
           1       0.92      0.96      0.94      1394

    accuracy                           0.95      3623
   macro avg       0.95      0.96      0.95      3623
weighted avg       0.95      0.95      0.95      3623

print(confusion_matrix_NB)
[[2110  119]
 [  49 1345]]
# Train the Decision Tree (DT)
# Hyperparameters are the library defaults, spelled out explicitly.
# random_state is fixed as well, like the train/test split and the random
# forest in this notebook: scikit-learn permutes candidate features at every
# split even with splitter='best', so an unseeded tree can differ between runs.
model_DT = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    splitter='best',
    random_state=123,
)
model_DT.fit(X_train, y_train)
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
# Predict on the held-out test set and evaluate the Decision Tree: overall
# accuracy, per-class precision/recall/F1 report and confusion matrix.
y_pred_DT = model_DT.predict(X_test)
accuracy_DT = accuracy_score(y_test, y_pred_DT)
classification_rep_DT = classification_report(y_test, y_pred_DT)
confusion_matrix_DT = confusion_matrix(y_test, y_pred_DT)
accuracy_DT
0.9354126414573558
print(classification_rep_DT)
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      2229
           1       0.91      0.93      0.92      1394

    accuracy                           0.94      3623
   macro avg       0.93      0.93      0.93      3623
weighted avg       0.94      0.94      0.94      3623

print(confusion_matrix_DT)
[[2098  131]
 [ 103 1291]]
# Random forest
# Hyperparameters are spelled out explicitly; random_state is fixed so the
# trained forest is reproducible between runs.
model_RF = RandomForestClassifier(n_estimators= 100,
                                  bootstrap= True,
                                  ccp_alpha= 0.0,
                                  criterion= 'gini',
                                  max_depth= None,
                                  min_samples_leaf= 1,
                                  min_samples_split= 2,
                                  random_state=123)
model_RF.fit(X_train, y_train)
RandomForestClassifier(random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=123)
# Predict on the held-out test set and evaluate the Random Forest: overall
# accuracy, per-class precision/recall/F1 report and confusion matrix.
y_pred_RF = model_RF.predict(X_test)
accuracy_RF = accuracy_score(y_test, y_pred_RF)
classification_rep_RF = classification_report(y_test, y_pred_RF)
confusion_matrix_RF = confusion_matrix(y_test, y_pred_RF)
accuracy_RF
0.9770908087220536
print(classification_rep_RF)
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      2229
           1       0.98      0.96      0.97      1394

    accuracy                           0.98      3623
   macro avg       0.98      0.97      0.98      3623
weighted avg       0.98      0.98      0.98      3623

print(confusion_matrix_RF)
[[2201   28]
 [  55 1339]]
# The random forest turned out to be the best model: it is better to classify
# spam as a non-spam message than the other way round.
# That is why we choose RF rather than NB.
# Now we train on the full data and save the model so that it can be used on
# real data in the later application.
model_RF_full = RandomForestClassifier(n_estimators= 100,
                                  bootstrap= True,
                                  ccp_alpha= 0.0,
                                  criterion= 'gini',
                                  max_depth= None,
                                  min_samples_leaf= 1,
                                  min_samples_split= 2,
                                  random_state=123)
# Fit on every available row (training and test portions together).
model_RF_full.fit(X, y)
RandomForestClassifier(random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=123)
# Predict and score the full-data RF model.
# NOTE: X and y are the same data the model was just trained on, so these
# numbers measure fit to the training data, not generalisation; the held-out
# metrics of model_RF are the meaningful estimate.
y_pred_RF_full = model_RF_full.predict(X)
accuracy_RF_full = accuracy_score(y, y_pred_RF_full)
classification_rep_RF_full = classification_report(y, y_pred_RF_full)
confusion_matrix_RF_full = confusion_matrix(y, y_pred_RF_full)
accuracy_RF_full
1.0
print(classification_rep_RF_full)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11124
           1       1.00      1.00      1.00      6990

    accuracy                           1.00     18114
   macro avg       1.00      1.00      1.00     18114
weighted avg       1.00      1.00      1.00     18114

print(confusion_matrix_RF_full)
[[11124     0]
 [    0  6990]]
model_RF_full
RandomForestClassifier(random_state=123)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=123)
# Save the model and the vectorizer to the working directory.
# The vectorizer was fitted on the output of preprocess_text, so new messages
# should go through the same preprocessing before vectorizer.transform.
joblib.dump(model_RF_full, 'spam_classifier_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')
['vectorizer.pkl']
# Note: the scikit-learn and joblib versions used here must match those in the application environment that loads these files
pip freeze | findstr scikit
scikit-fuzzy==0.4.2
scikit-learn==1.3.2
Note: you may need to restart the kernel to use updated packages.
# How to install the matching version?
# For example:
# pip install scikit-learn==1.3.2