PhishGuardian/ML-checkpoint.ipynb at a84edc687cdc2410436cdb72cdaed696ab4833e9

s452649 f084188680 Adding machine learning to analyze email content. Updating documentation

2024-06-08 11:04:41 +02:00

20 KiB

Raw Blame History

%pip install pandas
%pip install matplotlib
%pip install nltk
%pip install wordcloud
%pip install scikit-learn==1.3.2
%pip install scikit-fuzzy==0.4.2
# Package imports
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import joblib
import pickle

Requirement already satisfied: pandas in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (2.2.2)
Requirement already satisfied: numpy>=1.26.0 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from pandas) (1.26.4)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\alicj\appdata\roaming\python\python312\site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from pandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\users\alicj\appdata\local\programs\python\python312\lib\site-packages (from pandas) (2024.1)
Requirement already satisfied: six>=1.5 in c:\users\alicj\appdata\roaming\python\python312\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Note: you may need to restart the kernel to use updated packages.

# Load the data
data_path = "joined_data.csv"
data = pd.read_csv(data_path)

print(data.head())

# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would only append a stray "None" line.
data.info()

data

# Drop every row that has a missing value in any column.
data.dropna(inplace=True)

# Drop messages whose body is exactly "\n" or exactly the string 'empty',
# then renumber the remaining rows from zero.
is_blank_body = data['Body'].isin(['\n', 'empty'])
data = data[~is_blank_body].reset_index(drop=True)

data

# Check the distribution of the target labels.
print(data['Label'].value_counts())

# Message length analysis

def get_len(row):
    """Return ``len(row)``, or ``row`` itself when it has no length.

    Args:
        row: Any value; in this notebook it is applied to each entry of
            the ``Body`` column.

    Returns:
        The length of ``row`` if it supports ``len()``; otherwise ``row``
        unchanged.
    """
    try:
        return len(row)
    except TypeError:
        # len() raises TypeError for unsized values (numbers, None, ...).
        # Only that case is passed through; any other error is a real
        # problem and must not be silently swallowed.
        return row

data['message_length'] = data['Body'].apply(get_len)

data.sort_values(by='message_length')

# One message is very long: 17085626

data['message_length'].value_counts()


def _show_length_histograms(frame, max_length):
    """Draw overlaid message-length histograms for Label 0 and Label 1.

    Only rows whose ``message_length`` is strictly below ``max_length`` are
    plotted. The filtered frame is returned so the caller can keep it.
    """
    subset = frame[frame['message_length'] < max_length]
    plt.figure(figsize=(10, 6))
    for label_value, legend_name in ((0, 'Not Spam'), (1, 'Spam')):
        lengths = subset[subset['Label'] == label_value]['message_length']
        lengths.hist(bins=100, alpha=0.6, label=legend_name)
    plt.legend()
    plt.xlabel('Długość wiadomości')
    plt.ylabel('Liczba wiadomości')
    plt.title('Rozkład długości wiadomości')
    plt.show()
    return subset


# Histogram of message lengths per category - limited to lengths below
# 200000 so that the histograms are readable
hist_data = _show_length_histograms(data, 200000)

# Limit even further

# Histogram of message lengths per category - limited to lengths below 10000
hist_data = _show_length_histograms(data, 10000)

# It is hard to tell the messages apart by length alone, so more advanced methods are needed.

# Text processing

data

# English stop-word list from NLTK, stored as a set.
stop_words = set(stopwords.words('english'))
# Porter stemmer instance.
ps = PorterStemmer()

def preprocess_text(text):
    """Normalise one message body into a space-separated string of stems.

    Steps, in order: remove every run of digits, remove all characters in
    ``string.punctuation``, tokenize with ``word_tokenize``, skip tokens
    whose lowercase form is in ``stop_words``, stem the rest with ``ps``.

    Args:
        text: The message body as a string.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    without_digits = re.sub(r'\d+', '', text)
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = without_digits.translate(punctuation_remover)

    stems = []
    for token in word_tokenize(cleaned):
        # Stop-word lookup is done on the lowercased token.
        if token.lower() in stop_words:
            continue
        stems.append(ps.stem(token))
    return " ".join(stems)

# This step is time-consuming

data['processed_message'] = data['Body'].apply(preprocess_text)

data.head()

data['processed_message']

# Word analysis with WordCloud: one long string of processed text per label
spam_words = ' '.join(data.loc[data['Label'] == 1, 'processed_message'])
not_spam_words = ' '.join(data.loc[data['Label'] == 0, 'processed_message'])

wordcloud_spam = WordCloud(width=800, height=400).generate(spam_words)
wordcloud_not_spam = WordCloud(width=800, height=400).generate(not_spam_words)

# Render each cloud in its own figure, spam first.
for cloud_image, figure_title in (
    (wordcloud_spam, 'Word Cloud dla Spam'),
    (wordcloud_not_spam, 'Word Cloud dla Not Spam'),
):
    plt.figure(figsize=(10, 6))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis('off')
    plt.title(figure_title)
    plt.show()

# Build the classification model

y = data['Label']

# Split the processed texts into training and test sets BEFORE vectorizing,
# using the same test_size and random_state as before.
train_text, test_text, y_train, y_test = train_test_split(
    data['processed_message'], y, test_size=0.2, random_state=42)

# Vectorizer used for evaluation only: its vocabulary is learned from the
# training texts alone, so nothing about the test messages leaks into the
# features the models are trained and scored on.
eval_vectorizer = CountVectorizer()
X_train = eval_vectorizer.fit_transform(train_text)
X_test = eval_vectorizer.transform(test_text)

# Vectorizer fitted on the complete corpus. `vectorizer` and `X` are what the
# final full-data model is trained with and what gets saved for the application.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['processed_message'])

# Train the Naive Bayes model (fit() returns the fitted estimator itself)
model_NB = MultinomialNB().fit(X_train, y_train)

# Predict on the test set and score Naive Bayes
y_pred_NB = model_NB.predict(X_test)
nb_truth_and_prediction = (y_test, y_pred_NB)
accuracy_NB = accuracy_score(*nb_truth_and_prediction)
classification_rep_NB = classification_report(*nb_truth_and_prediction)
confusion_matrix_NB = confusion_matrix(*nb_truth_and_prediction)

accuracy_NB

print(classification_rep_NB)

print(confusion_matrix_NB)

# Train the Decision Tree (DT)

# Default hyperparameters, plus a fixed random_state: without a seed the tree
# can differ from run to run, so the reported metrics would not be
# reproducible. 123 matches the seed used for the random forest in this notebook.
model_DT = DecisionTreeClassifier(criterion='gini',
                                  max_depth=None,
                                  min_samples_leaf=1,
                                  min_samples_split=2,
                                  splitter='best',
                                  random_state=123)
model_DT.fit(X_train, y_train)

# Predict on the test set and score DT
y_pred_DT = model_DT.predict(X_test)
accuracy_DT = accuracy_score(y_test, y_pred_DT)
classification_rep_DT = classification_report(y_test, y_pred_DT)
confusion_matrix_DT = confusion_matrix(y_test, y_pred_DT)

accuracy_DT

print(classification_rep_DT)

print(confusion_matrix_DT)

# Random forest

# Hyperparameters collected in one mapping and unpacked into the constructor.
rf_settings = {
    'n_estimators': 100,
    'bootstrap': True,
    'ccp_alpha': 0.0,
    'criterion': 'gini',
    'max_depth': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'random_state': 123,
}
model_RF = RandomForestClassifier(**rf_settings)
model_RF.fit(X_train, y_train)

# Predict on the test set and score RF
y_pred_RF = model_RF.predict(X_test)
rf_truth_and_prediction = (y_test, y_pred_RF)
accuracy_RF = accuracy_score(*rf_truth_and_prediction)
classification_rep_RF = classification_report(*rf_truth_and_prediction)
confusion_matrix_RF = confusion_matrix(*rf_truth_and_prediction)

accuracy_RF

print(classification_rep_RF)

print(confusion_matrix_RF)

# The random forest turned out to be the best model - it is better to classify
# spam as a non-spam message than the other way round.
# That is why we choose RF rather than NB.

# Now we train on the full data and save the model so it can be used on real
# data in the later application.

full_forest_settings = dict(
    n_estimators=100,
    bootstrap=True,
    ccp_alpha=0.0,
    criterion='gini',
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=123,
)
model_RF_full = RandomForestClassifier(**full_forest_settings)

model_RF_full.fit(X, y)

# Predict and score RF on the same rows (X, y) the model was just fitted on
y_pred_RF_full = model_RF_full.predict(X)
full_truth_and_prediction = (y, y_pred_RF_full)
accuracy_RF_full = accuracy_score(*full_truth_and_prediction)
classification_rep_RF_full = classification_report(*full_truth_and_prediction)
confusion_matrix_RF_full = confusion_matrix(*full_truth_and_prediction)

accuracy_RF_full

print(classification_rep_RF_full)

print(confusion_matrix_RF_full)

model_RF_full

# Save the model and the vectorizer, in that order
for artifact, file_name in (
    (model_RF_full, 'spam_classifier_model.pkl'),
    (vectorizer, 'vectorizer.pkl'),
):
    joblib.dump(artifact, file_name)

# Note: the scikit-learn and joblib versions used here must match those in the application environment

pip freeze | findstr scikit

# How to install?

# For example:
# pip install scikit-learn==1.3.2

20 KiB Raw Blame History

20 KiB

Raw Blame History