{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SMS spam classification: TF-IDF + Naive Bayes vs. LSTM\n",
    "\n",
    "Trains two spam/ham classifiers on `spam.csv` (columns `Category` and `Message`) and prints accuracy, precision, recall and F1-score on a held-out test split.\n",
    "\n",
    "The notebook is meant to be executed top to bottom on a fresh kernel (Restart Kernel, then Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from tensorflow.keras.layers import Embedding, LSTM, Dense\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.utils import set_random_seed\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: every value a reader might want to tune lives here.\n",
    "DATA_PATH = 'spam.csv'\n",
    "SEED = 0                  # used for both train/test splits and for the Keras RNGs\n",
    "TEST_SIZE = 0.2\n",
    "\n",
    "TFIDF_MAX_FEATURES = 3000\n",
    "\n",
    "VOCAB_SIZE = 5000         # Tokenizer num_words and Embedding input_dim must agree\n",
    "MAX_LEN = 100             # token sequences are padded/truncated to this length\n",
    "EMBEDDING_DIM = 128\n",
    "LSTM_UNITS = 128\n",
    "EPOCHS = 5\n",
    "BATCH_SIZE = 32\n",
    "VALIDATION_SPLIT = 0.1\n",
    "\n",
    "# Numeric encoding of the labels for the LSTM (binary cross-entropy needs 0/1 targets)\n",
    "LABEL_MAPPING = {'ham': 0, 'spam': 1}\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the raw data once. This frame is never modified afterwards, so any later\n",
    "# cell can be re-run without changing what the other cells see.\n",
    "data = pd.read_csv(DATA_PATH)\n",
    "\n",
    "# Fail loudly on labels the LSTM label mapping would silently turn into NaN.\n",
    "unknown_labels = set(data['Category'].unique()) - set(LABEL_MAPPING)\n",
    "assert not unknown_labels, f'unexpected labels in Category: {unknown_labels}'\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Text preprocessing and evaluation helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_text(text):\n",
    "    \"\"\"Normalise one message for the TF-IDF model.\n",
    "\n",
    "    Replaces every non-word character with a space, lower-cases the text and\n",
    "    collapses runs of whitespace into single spaces.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    text : str\n",
    "        Raw message. Missing values (None / NaN) are treated as an empty\n",
    "        message; any other non-string value is converted with ``str``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The cleaned message.\n",
    "    \"\"\"\n",
    "    # pandas represents a missing CSV field as float NaN, and NaN != NaN\n",
    "    if text is None or (isinstance(text, float) and text != text):\n",
    "        return ''\n",
    "    text = re.sub(r'\\W', ' ', str(text))\n",
    "    return ' '.join(text.lower().split())\n",
    "\n",
    "\n",
    "def print_metrics(title, y_true, y_pred, pos_label=1):\n",
    "    \"\"\"Print accuracy, precision, recall and F1-score of binary predictions.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    title : str\n",
    "        Heading printed before the metrics (a colon is appended).\n",
    "    y_true, y_pred : array-like\n",
    "        True and predicted labels, passed straight to the scikit-learn metrics.\n",
    "    pos_label : int or str, default 1\n",
    "        Label treated as the positive class for precision, recall and F1.\n",
    "    \"\"\"\n",
    "    print(f'{title}:')\n",
    "    print(f'Accuracy: {accuracy_score(y_true, y_pred)}')\n",
    "    print(f'Precision: {precision_score(y_true, y_pred, pos_label=pos_label)}')\n",
    "    print(f'Recall: {recall_score(y_true, y_pred, pos_label=pos_label)}')\n",
    "    print(f'F1-score: {f1_score(y_true, y_pred, pos_label=pos_label)}')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the cleaned text in its own Series instead of overwriting data['Message'],\n",
    "# so the raw messages stay available for the LSTM section.\n",
    "messages_clean = data['Message'].apply(preprocess_text)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TF-IDF + Naive Bayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the cleaned messages into training and test sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    messages_clean, data['Category'], test_size=TEST_SIZE, random_state=SEED\n",
    ")\n",
    "\n",
    "# TF-IDF vectorisation: the vocabulary is learned from the training messages only\n",
    "tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)\n",
    "X_train_tfidf = tfidf.fit_transform(X_train)\n",
    "X_test_tfidf = tfidf.transform(X_test)\n",
    "\n",
    "# Naive Bayes model\n",
    "nb_model = MultinomialNB()\n",
    "nb_model.fit(X_train_tfidf, y_train)\n",
    "y_pred = nb_model.predict(X_test_tfidf)\n",
    "\n",
    "# Evaluation\n",
    "print_metrics('Naive Bayes TF-IDF', y_test, y_pred, pos_label='spam')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RNN (LSTM)\n",
    "\n",
    "The LSTM is fed the raw messages; tokenisation is left to the Keras `Tokenizer`. The tokenizer is fitted on the training messages only, so no vocabulary information from the test set reaches the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the 'ham' / 'spam' labels to 0 / 1\n",
    "labels = data['Category'].map(LABEL_MAPPING)\n",
    "\n",
    "# The Tokenizer needs strings; treat a missing message as an empty one\n",
    "raw_messages = data['Message'].fillna('').astype(str)\n",
    "\n",
    "# Split BEFORE fitting the tokenizer so the vocabulary comes from training data only\n",
    "msg_train, msg_test, y_train_bin, y_test_bin = train_test_split(\n",
    "    raw_messages, labels, test_size=TEST_SIZE, random_state=SEED\n",
    ")\n",
    "\n",
    "# Tokenisation and padding\n",
    "tokenizer = Tokenizer(num_words=VOCAB_SIZE)\n",
    "tokenizer.fit_on_texts(msg_train)\n",
    "X_train_seq = pad_sequences(tokenizer.texts_to_sequences(msg_train), maxlen=MAX_LEN)\n",
    "X_test_seq = pad_sequences(tokenizer.texts_to_sequences(msg_test), maxlen=MAX_LEN)\n",
    "\n",
    "assert len(X_train_seq) + len(X_test_seq) == len(data)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed the Python, NumPy and backend RNGs so weight initialisation and batch\n",
    "# shuffling are repeatable (some GPU kernels may still be non-deterministic).\n",
    "set_random_seed(SEED)\n",
    "\n",
    "# LSTM model. The Embedding layer no longer takes input_length (deprecated in\n",
    "# current Keras); the sequence length is inferred from the input data.\n",
    "model = Sequential([\n",
    "    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM),\n",
    "    LSTM(LSTM_UNITS),\n",
    "    Dense(1, activation='sigmoid'),\n",
    "])\n",
    "\n",
    "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "history = model.fit(\n",
    "    X_train_seq,\n",
    "    y_train_bin,\n",
    "    epochs=EPOCHS,\n",
    "    batch_size=BATCH_SIZE,\n",
    "    validation_split=VALIDATION_SPLIT,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# model.predict returns an (n_samples, 1) array of probabilities; threshold at 0.5\n",
    "# and flatten to the 1-D label vector the scikit-learn metrics expect.\n",
    "y_pred_lstm = (model.predict(X_test_seq) > 0.5).astype('int32').ravel()\n",
    "\n",
    "# Evaluation\n",
    "print_metrics('LSTM', y_test_bin, y_pred_lstm)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Previously recorded results\n",
    "\n",
    "Metrics stored in the notebook before this revision. The LSTM figures come from an unseeded run in which the tokenizer was fitted on the full dataset, so they are expected to change when the notebook is re-run.\n",
    "\n",
    "| Model | Accuracy | Precision | Recall | F1-score |\n",
    "|---|---|---|---|---|\n",
    "| Naive Bayes TF-IDF | 0.97847533632287 | 1.0 | 0.85 | 0.918918918918919 |\n",
    "| LSTM | 0.9856502242152466 | 0.9615384615384616 | 0.9375 | 0.9493670886075949 |"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}