DL_Project
This commit is contained in: commit 1c3acadbfa

BIN  Raport.docx  (Normal file, binary file not shown)
196  project.ipynb  (Normal file)
@@ -0,0 +1,196 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import re\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
    "\n",
    "# Load the data\n",
    "data = pd.read_csv('spam.csv')\n",
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess_text(text):\n",
    "    # Replace every non-word character with a space\n",
    "    text = re.sub(r'\\W', ' ', text)\n",
    "    text = text.lower()\n",
    "    # Split and re-join to collapse repeated whitespace\n",
    "    text = ' '.join(text.split())\n",
    "    return text\n",
    "\n",
    "data['Message'] = data['Message'].apply(preprocess_text)\n",
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TF-IDF + Naive Bayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Naive Bayes TF-IDF:\n",
      "Accuracy: 0.97847533632287\n",
      "Precision: 1.0\n",
      "Recall: 0.85\n",
      "F1-score: 0.918918918918919\n"
     ]
    }
   ],
   "source": [
    "# Split the data into training and test sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(data['Message'], data['Category'], test_size=0.2, random_state=0)\n",
    "\n",
    "# TF-IDF vectorization\n",
    "tfidf = TfidfVectorizer(max_features=3000)\n",
    "X_train_tfidf = tfidf.fit_transform(X_train)\n",
    "X_test_tfidf = tfidf.transform(X_test)\n",
    "\n",
    "# Naive Bayes model\n",
    "nb_model = MultinomialNB()\n",
    "nb_model.fit(X_train_tfidf, y_train)\n",
    "y_pred = nb_model.predict(X_test_tfidf)\n",
    "\n",
    "# Evaluation\n",
    "print('Naive Bayes TF-IDF:')\n",
    "print(f'Accuracy: {accuracy_score(y_test, y_pred)}')\n",
    "print(f'Precision: {precision_score(y_test, y_pred, pos_label=\"spam\")}')\n",
    "print(f'Recall: {recall_score(y_test, y_pred, pos_label=\"spam\")}')\n",
    "print(f'F1-score: {f1_score(y_test, y_pred, pos_label=\"spam\")}')\n",
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "RNN (LSTM)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/5\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\walcz\\Desktop\\studia\\uczenie\\projekt\\myenv\\lib\\site-packages\\keras\\src\\layers\\core\\embedding.py:90: UserWarning: Argument `input_length` is deprecated. Just remove it.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m126/126\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 54ms/step - accuracy: 0.9047 - loss: 0.3002 - val_accuracy: 0.9843 - val_loss: 0.0670\n",
      "Epoch 2/5\n",
      "\u001b[1m126/126\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 53ms/step - accuracy: 0.9902 - loss: 0.0401 - val_accuracy: 0.9865 - val_loss: 0.0522\n",
      "Epoch 3/5\n",
      "\u001b[1m126/126\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 52ms/step - accuracy: 0.9972 - loss: 0.0149 - val_accuracy: 0.9843 - val_loss: 0.0582\n",
      "Epoch 4/5\n",
      "\u001b[1m126/126\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 47ms/step - accuracy: 0.9983 - loss: 0.0078 - val_accuracy: 0.9865 - val_loss: 0.0601\n",
      "Epoch 5/5\n",
      "\u001b[1m126/126\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m6s\u001b[0m 49ms/step - accuracy: 0.9974 - loss: 0.0071 - val_accuracy: 0.9865 - val_loss: 0.0628\n",
      "\u001b[1m35/35\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 19ms/step\n",
      "LSTM:\n",
      "Accuracy: 0.9856502242152466\n",
      "Precision: 0.9615384615384616\n",
      "Recall: 0.9375\n",
      "F1-score: 0.9493670886075949\n"
     ]
    }
   ],
   "source": [
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Embedding, LSTM, Dense\n",
    "\n",
    "# Reload the data\n",
    "data = pd.read_csv('spam.csv')\n",
    "\n",
    "# Tokenization and padding\n",
    "tokenizer = Tokenizer(num_words=5000)\n",
    "tokenizer.fit_on_texts(data['Message'])\n",
    "X = tokenizer.texts_to_sequences(data['Message'])\n",
    "X = pad_sequences(X, maxlen=100)\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, data['Category'], test_size=0.2, random_state=0)\n",
    "\n",
    "# Map the 'ham' and 'spam' labels to numeric values\n",
    "label_mapping = {'ham': 0, 'spam': 1}\n",
    "y_train = y_train.map(label_mapping)\n",
    "y_test = y_test.map(label_mapping)\n",
    "\n",
    "# LSTM model\n",
    "model = Sequential()\n",
    "model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))\n",
    "model.add(LSTM(128))\n",
    "model.add(Dense(1, activation='sigmoid'))\n",
    "\n",
    "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
    "model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)\n",
    "\n",
    "# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions\n",
    "y_pred = (model.predict(X_test) > 0.5).astype(\"int32\")\n",
    "\n",
    "# Evaluation\n",
    "print('LSTM:')\n",
    "print(f'Accuracy: {accuracy_score(y_test, y_pred)}')\n",
    "print(f'Precision: {precision_score(y_test, y_pred)}')\n",
    "print(f'Recall: {recall_score(y_test, y_pred)}')\n",
    "print(f'F1-score: {f1_score(y_test, y_pred)}')\n",
    "\n",
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}