{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Biblioteki"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from gensim.models import Word2Vec\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import word_tokenize\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.naive_bayes import GaussianNB, MultinomialNB\n",
    "from sklearn.pipeline import make_pipeline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Wczytanie danych"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each split is a separate CSV file with a 'text' and a 'label' column\n",
    "train_data = pd.read_csv('Train.csv')\n",
    "valid_data = pd.read_csv('Valid.csv')\n",
    "test_data = pd.read_csv('Test.csv')\n",
    "\n",
    "X_train = train_data['text']\n",
    "y_train = train_data['label']\n",
    "X_valid = valid_data['text']\n",
    "y_valid = valid_data['label']\n",
    "X_test = test_data['text']\n",
    "y_test = test_data['label']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Naiwny klasyfikator bayesowski z wektoryzacją TF-IDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dokładność na zbiorze walidacyjnym: 0.8616\n",
      "Dokładność na zbiorze testowym: 0.8670\n"
     ]
    }
   ],
   "source": [
    "# Pipeline that vectorises the raw text with TF-IDF and fits a multinomial Naive Bayes model\n",
    "model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
    "\n",
    "# Train the model\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "# Evaluate the model on the validation and test splits\n",
    "y_valid_pred = model.predict(X_valid)\n",
    "valid_accuracy = accuracy_score(y_valid, y_valid_pred)\n",
    "\n",
    "y_test_pred = model.predict(X_test)\n",
    "test_accuracy = accuracy_score(y_test, y_test_pred)\n",
    "\n",
    "print(f'Dokładność na zbiorze walidacyjnym: {valid_accuracy:.4f}')\n",
    "print(f'Dokładność na zbiorze testowym: {test_accuracy:.4f}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Naiwny klasyfikator bayesowski z osadzeniami słów (Word2Vec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dokładność na zbiorze walidacyjnym (Word2Vec): 0.7584\n",
      "Dokładność na zbiorze testowym (Word2Vec): 0.7644\n"
     ]
    }
   ],
   "source": [
    "# Fixed seed for Word2Vec so that re-running the notebook trains the same embeddings\n",
    "SEED = 42\n",
    "\n",
    "stop_words = set(stopwords.words('english'))\n",
    "\n",
    "\n",
    "def tokenize(text):\n",
    "    \"\"\"Lower-case `text`, split it with NLTK and drop noise tokens.\n",
    "\n",
    "    Keeps only alphanumeric tokens (per `str.isalnum`) that are not in the\n",
    "    English stop-word set.\n",
    "\n",
    "    Args:\n",
    "        text: Raw review text.\n",
    "\n",
    "    Returns:\n",
    "        List of the remaining tokens, in their original order.\n",
    "    \"\"\"\n",
    "    return [\n",
    "        word\n",
    "        for word in word_tokenize(text.lower())\n",
    "        if word.isalnum() and word not in stop_words\n",
    "    ]\n",
    "\n",
    "\n",
    "X_train_tokens = [tokenize(review) for review in X_train]\n",
    "X_valid_tokens = [tokenize(review) for review in X_valid]\n",
    "X_test_tokens = [tokenize(review) for review in X_test]\n",
    "\n",
    "# The embeddings are trained on the training split only.\n",
    "# A fixed seed combined with workers=1 removes thread-scheduling jitter (see the gensim\n",
    "# Word2Vec documentation); results that are identical across kernel restarts additionally\n",
    "# require PYTHONHASHSEED to be set before the kernel starts.\n",
    "w2v_model = Word2Vec(\n",
    "    sentences=X_train_tokens,\n",
    "    vector_size=100,\n",
    "    window=5,\n",
    "    min_count=1,\n",
    "    workers=1,\n",
    "    seed=SEED,\n",
    ")\n",
    "\n",
    "\n",
    "def document_vector(tokens, model):\n",
    "    \"\"\"Represent a tokenised review as the mean of its word embeddings.\n",
    "\n",
    "    Args:\n",
    "        tokens: List of tokens of a single review.\n",
    "        model: Trained Word2Vec model that supplies the embeddings.\n",
    "\n",
    "    Returns:\n",
    "        1-D array of length `model.vector_size`; a zero vector when none of\n",
    "        the tokens is in the model vocabulary.\n",
    "    \"\"\"\n",
    "    vectors = [model.wv[word] for word in tokens if word in model.wv]\n",
    "    if not vectors:\n",
    "        # Same dtype as the embeddings so that stacking the documents does not upcast the matrix\n",
    "        return np.zeros(model.vector_size, dtype=model.wv.vectors.dtype)\n",
    "    return np.mean(vectors, axis=0)\n",
    "\n",
    "\n",
    "X_train_vectors = np.array([document_vector(tokens, w2v_model) for tokens in X_train_tokens])\n",
    "X_valid_vectors = np.array([document_vector(tokens, w2v_model) for tokens in X_valid_tokens])\n",
    "X_test_vectors = np.array([document_vector(tokens, w2v_model) for tokens in X_test_tokens])\n",
    "\n",
    "# Gaussian Naive Bayes, because the averaged embeddings are real-valued\n",
    "model_w2v = GaussianNB()\n",
    "\n",
    "# Train the model\n",
    "model_w2v.fit(X_train_vectors, y_train)\n",
    "\n",
    "# Evaluate the model on the validation and test splits\n",
    "y_valid_pred_w2v = model_w2v.predict(X_valid_vectors)\n",
    "valid_accuracy_w2v = accuracy_score(y_valid, y_valid_pred_w2v)\n",
    "\n",
    "y_test_pred_w2v = model_w2v.predict(X_test_vectors)\n",
    "test_accuracy_w2v = accuracy_score(y_test, y_test_pred_w2v)\n",
    "\n",
    "print(f'Dokładność na zbiorze walidacyjnym (Word2Vec): {valid_accuracy_w2v:.4f}')\n",
    "print(f'Dokładność na zbiorze testowym (Word2Vec): {test_accuracy_w2v:.4f}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}