{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "LHtKZx0myNWa" }, "source": [ "### Import bibliotek" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "id": "ZTlYCCtCyNWc" }, "outputs": [], "source": [ "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.svm import SVC\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.pipeline import Pipeline\n", "from gensim.models import Word2Vec\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "import numpy as np\n", "import re" ] }, { "cell_type": "markdown", "metadata": { "id": "v16vUmROyNWc" }, "source": [ "### Przygotowanie danych" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def get_str_cleaned(str_dirty):\n", " punctuation = '!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'\n", " new_str = str_dirty.lower()\n", " new_str = re.sub(' +', ' ', new_str)\n", " for char in punctuation:\n", " new_str = new_str.replace(char, '')\n", " return new_str" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "d4Kuyx7JyNWd", "outputId": "0c9de8ef-4e90-44fd-9af4-d5e5833994aa" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " review sentiment\n", "0 One of the other reviewers has mentioned that ... positive\n", "1 A wonderful little production.

The... positive\n", "2 I thought this was a wonderful way to spend ti... positive\n", "3 Basically there's a family where a little boy ... negative\n", "4 Petter Mattei's \"Love in the Time of Money\" is... positive\n", " review sentiment \\\n", "0 One of the other reviewers has mentioned that ... 1 \n", "1 A wonderful little production.

The... 1 \n", "2 I thought this was a wonderful way to spend ti... 1 \n", "3 Basically there's a family where a little boy ... 0 \n", "4 Petter Mattei's \"Love in the Time of Money\" is... 1 \n", "\n", " cleaned_review \n", "0 one of the other reviewers has mentioned that ... \n", "1 a wonderful little production br br the filmin... \n", "2 i thought this was a wonderful way to spend ti... \n", "3 basically theres a family where a little boy j... \n", "4 petter matteis love in the time of money is a ... \n" ] } ], "source": [ "# Source: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews\n", "data = pd.read_csv('IMDB_reviews.csv')\n", "print(data.head())\n", "\n", "# Czyszczenie danych\n", "data['cleaned_review'] = data['review'].apply(get_str_cleaned)\n", "\n", "# Przekształcenie etykiet na format numeryczny\n", "label_encoder = LabelEncoder()\n", "data['sentiment'] = label_encoder.fit_transform(data['sentiment'])\n", "\n", "print(data.head())\n", "\n", "# Podział danych na zbiór treningowy i testowy\n", "X = data['cleaned_review']\n", "y = data['sentiment']\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n" ] }, { "cell_type": "markdown", "metadata": { "id": "8Lz-Y4ZCyNWd" }, "source": [ "### TF-IDF + SVM" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "id": "ES_5Q4BEyNWd" }, "outputs": [], "source": [ "tfidf_svm_pipeline = Pipeline([\n", " ('tfidf', TfidfVectorizer(max_features=200)),\n", " ('svm', SVC(kernel='linear'))\n", "])\n", "tfidf_svm_pipeline.fit(X_train, y_train)\n", "y_pred_tfidf_svm = tfidf_svm_pipeline.predict(X_test)" ] }, { "cell_type": "markdown", "metadata": { "id": "fadLd3cEyNWd" }, "source": [ "### TF-IDF + RandomForest" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "id": "xUq30-FryNWe" }, "outputs": [], "source": [ "tfidf_rf_pipeline = Pipeline([\n", " ('tfidf', TfidfVectorizer(max_features=200)),\n", " ('rf', RandomForestClassifier(n_estimators=100))\n", "])\n", "tfidf_rf_pipeline.fit(X_train, y_train)\n", "y_pred_tfidf_rf = tfidf_rf_pipeline.predict(X_test)" ] }, { "cell_type": "markdown", "metadata": { "id": "d08OJrCnyNWe" }, "source": [ "### Model Word2Vec i transformator dokumentów do postaci wektorowej" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "id": "J5agaWJFyNWe" }, "outputs": [], "source": [ "w2v_model = Word2Vec(sentences=[doc.split() for doc in X_train], vector_size=200, window=5, min_count=5, workers=4)\n", "class Word2VecTransformer(BaseEstimator, TransformerMixin):\n", " def __init__(self, w2v_model):\n", " self.w2v_model = w2v_model\n", "\n", " def fit(self, X, y=None):\n", " return self\n", "\n", " def transform(self, X):\n", " return np.array([\n", " np.mean([self.w2v_model.wv[word] for word in doc.split() if word in self.w2v_model.wv]\n", " or [np.zeros(self.w2v_model.vector_size)], axis=0)\n", " for doc in X\n", " ])" ] }, { "cell_type": "markdown", "metadata": { "id": "KbKeeZBdyNWe" }, "source": [ "### Word2Vec + SVM" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "id": "FPBL7g75yNWe" }, "outputs": [], "source": [ "w2v_svm_pipeline = Pipeline([\n", " ('w2v_transform', Word2VecTransformer(w2v_model)),\n", " ('svm', SVC(kernel='linear'))\n", "])\n", "w2v_svm_pipeline.fit(X_train, y_train)\n", "y_pred_w2v_svm = w2v_svm_pipeline.predict(X_test)" ] }, { "cell_type": "markdown", "metadata": { "id": "KT-Cnwx7yNWe" }, "source": [ "### Word2Vec + RandomForest" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "id": "t9mCasDmyNWe" }, "outputs": [], "source": [ "w2v_rf_pipeline = Pipeline([\n", " ('w2v_transform', Word2VecTransformer(w2v_model)),\n", " ('rf', RandomForestClassifier(n_estimators=100))\n", "])\n", "w2v_rf_pipeline.fit(X_train, y_train)\n", "y_pred_w2v_rf = w2v_rf_pipeline.predict(X_test)" ] }, { "cell_type": "markdown", "metadata": { "id": "lkFzZ1MjyNWf" }, "source": [ "### Wyświetlanie metryk" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "def get_scores(y_true, y_pred):\n", " # Funkcja zwraca trafność, precyzję, pokrycie i F1\n", " acc_score = 0\n", " acc_total = 0\n", " tp = 0\n", " fp = 0\n", " selected_items = 0\n", " relevant_items = 0\n", "\n", " for p, t in zip(y_pred, y_true):\n", " acc_total += 1\n", "\n", " if p == t:\n", " acc_score += 1\n", "\n", " if p > 0 and p == t:\n", " tp += 1\n", "\n", " if p > 0:\n", " selected_items += 1\n", "\n", " if t > 0:\n", " relevant_items += 1\n", "\n", " accuracy = acc_score / acc_total\n", "\n", " if selected_items == 0:\n", " precision = 1.0\n", " else:\n", " precision = tp / selected_items\n", "\n", " if relevant_items == 0:\n", " recall = 1.0\n", " else:\n", " recall = tp / relevant_items\n", "\n", " if precision + recall == 0.0:\n", " f1 = 0.0\n", " else:\n", " f1 = 2 * precision * recall / (precision + recall)\n", "\n", " return accuracy, precision, recall, f1" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "id": "W8RJkm0CyNWf" }, "outputs": [], "source": [ "def print_metrics(y_true, y_pred, model_name):\n", " accuracy, precision, recall, f1 = get_scores(y_true, y_pred)\n", " print(f'{model_name} Accuracy: {accuracy:.4f}')\n", " print(f'{model_name} Precision: {precision:.4f}')\n", " print(f'{model_name} Recall: {recall:.4f}')\n", " print(f'{model_name} F1-Score: {f1:.4f}')\n", " print('-' * 30)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tPPkR8MOyNWf", "outputId": "ceae2217-10b0-4533-9f43-3c7add2d19b4" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TF-IDF + SVM Accuracy: 0.7764\n", "TF-IDF + SVM Precision: 0.7719\n", "TF-IDF + SVM Recall: 0.7896\n", "TF-IDF + SVM F1-Score: 0.7807\n", "------------------------------\n", "TF-IDF + Random Forest Accuracy: 0.7500\n", "TF-IDF + Random Forest Precision: 0.7626\n", "TF-IDF + Random Forest Recall: 0.7317\n", "TF-IDF + Random Forest F1-Score: 0.7468\n", "------------------------------\n", "Word2Vec + SVM Accuracy: 0.8584\n", "Word2Vec + SVM Precision: 0.8522\n", "Word2Vec + SVM Recall: 0.8698\n", "Word2Vec + SVM F1-Score: 0.8609\n", "------------------------------\n", "Word2Vec + Random Forest Accuracy: 0.8137\n", "Word2Vec + Random Forest Precision: 0.8106\n", "Word2Vec + Random Forest Recall: 0.8224\n", "Word2Vec + Random Forest F1-Score: 0.8165\n", "------------------------------\n" ] } ], "source": [ "# Ocena modelu TF-IDF + SVM\n", "print_metrics(y_test, y_pred_tfidf_svm, 'TF-IDF + SVM')\n", "\n", "# Ocena modelu TF-IDF + Random Forest\n", "print_metrics(y_test, y_pred_tfidf_rf, 'TF-IDF + Random Forest')\n", "\n", "# Ocena modelu Word2Vec + SVM\n", "print_metrics(y_test, y_pred_w2v_svm, 'Word2Vec + SVM')\n", "\n", "# Ocena modelu Word2Vec + Random Forest\n", "print_metrics(y_test, y_pred_w2v_rf, 'Word2Vec + Random Forest')" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.2" } }, "nbformat": 4, "nbformat_minor": 0 }