Project implementation

This commit is contained in:
Mateusz Grzegorzewski 2024-06-07 21:14:40 +02:00
parent 9469fc9409
commit e9f10de4f0
5 changed files with 50178 additions and 0 deletions

175
Movies_reviews.ipynb Normal file
View File

@ -0,0 +1,175 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Biblioteki"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.naive_bayes import MultinomialNB\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.metrics import accuracy_score\n",
"import numpy as np\n",
"from gensim.models import Word2Vec\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import stopwords"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Wczytanie danych"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"train_data = pd.read_csv('Train.csv')\n",
"valid_data = pd.read_csv('Valid.csv')\n",
"test_data = pd.read_csv('Test.csv')\n",
"\n",
"X_train = train_data['text']\n",
"y_train = train_data['label']\n",
"X_valid = valid_data['text']\n",
"y_valid = valid_data['label']\n",
"X_test = test_data['text']\n",
"y_test = test_data['label']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Naiwny klasyfikator bayesowski z wektoryzacją TF-IDF"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dokładność na zbiorze walidacyjnym: 0.8616\n",
"Dokładność na zbiorze testowym: 0.8670\n"
]
}
],
"source": [
"# Stworzenie pipeline do przetwarzania tekstu i uczenia modelu\n",
"model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
"\n",
"# Trenowanie modelu\n",
"model.fit(X_train, y_train)\n",
"\n",
"# Ewaluacja modelu\n",
"y_valid_pred = model.predict(X_valid)\n",
"valid_accuracy = accuracy_score(y_valid, y_valid_pred)\n",
"\n",
"y_test_pred = model.predict(X_test)\n",
"test_accuracy = accuracy_score(y_test, y_test_pred)\n",
"\n",
"print(f'Dokładność na zbiorze walidacyjnym: {valid_accuracy:.4f}')\n",
"print(f'Dokładność na zbiorze testowym: {test_accuracy:.4f}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Naiwny klasyfikator bayesowski z osadzeniami słów (Word2Vec)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dokładność na zbiorze walidacyjnym (Word2Vec): 0.7584\n",
"Dokładność na zbiorze testowym (Word2Vec): 0.7644\n"
]
}
],
"source": [
"stop_words = set(stopwords.words('english'))\n",
"def tokenize(text):\n",
" tokens = word_tokenize(text.lower())\n",
" tokens = [word for word in tokens if word.isalnum()]\n",
" tokens = [word for word in tokens if word not in stop_words]\n",
" return tokens\n",
"\n",
"X_train_tokens = [tokenize(review) for review in X_train]\n",
"X_valid_tokens = [tokenize(review) for review in X_valid]\n",
"X_test_tokens = [tokenize(review) for review in X_test]\n",
"\n",
"w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)\n",
"\n",
"# Funkcja do konwersji recenzji na osadzenia słów\n",
"def document_vector(tokens, model):\n",
" vec = [model.wv[word] for word in tokens if word in model.wv]\n",
" return np.mean(vec, axis=0) if len(vec) > 0 else np.zeros(model.vector_size)\n",
"\n",
"X_train_vectors = np.array([document_vector(tokens, w2v_model) for tokens in X_train_tokens])\n",
"X_valid_vectors = np.array([document_vector(tokens, w2v_model) for tokens in X_valid_tokens])\n",
"X_test_vectors = np.array([document_vector(tokens, w2v_model) for tokens in X_test_tokens])\n",
"\n",
"# Klasyfikator Naive Bayes\n",
"model_w2v = GaussianNB()\n",
"\n",
"# Trenowanie modelu\n",
"model_w2v.fit(X_train_vectors, y_train)\n",
"\n",
"# Ewaluacja modelu\n",
"y_valid_pred_w2v = model_w2v.predict(X_valid_vectors)\n",
"valid_accuracy_w2v = accuracy_score(y_valid, y_valid_pred_w2v)\n",
"\n",
"y_test_pred_w2v = model_w2v.predict(X_test_vectors)\n",
"test_accuracy_w2v = accuracy_score(y_test, y_test_pred_w2v)\n",
"\n",
"print(f'Dokładność na zbiorze walidacyjnym (Word2Vec): {valid_accuracy_w2v:.4f}')\n",
"print(f'Dokładność na zbiorze testowym (Word2Vec): {test_accuracy_w2v:.4f}')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

BIN
Raport-projekt.docx Normal file

Binary file not shown.

5001
Test.csv Normal file

File diff suppressed because one or more lines are too long

40001
Train.csv Normal file

File diff suppressed because one or more lines are too long

5001
Valid.csv Normal file

File diff suppressed because one or more lines are too long