Project implementation
This commit is contained in:
parent 9469fc9409
commit e9f10de4f0
175
Movies_reviews.ipynb
Normal file
@@ -0,0 +1,175 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.metrics import accuracy_score\n",
    "import numpy as np\n",
    "from gensim.models import Word2Vec\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.corpus import stopwords"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = pd.read_csv('Train.csv')\n",
    "valid_data = pd.read_csv('Valid.csv')\n",
    "test_data = pd.read_csv('Test.csv')\n",
    "\n",
    "X_train = train_data['text']\n",
    "y_train = train_data['label']\n",
    "X_valid = valid_data['text']\n",
    "y_valid = valid_data['label']\n",
    "X_test = test_data['text']\n",
    "y_test = test_data['label']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Naive Bayes classifier with TF-IDF vectorization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Validation set accuracy: 0.8616\n",
      "Test set accuracy: 0.8670\n"
     ]
    }
   ],
   "source": [
    "# Build a pipeline that vectorizes the text with TF-IDF and fits the classifier\n",
    "model = make_pipeline(TfidfVectorizer(), MultinomialNB())\n",
    "\n",
    "# Train the model\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "# Evaluate the model\n",
    "y_valid_pred = model.predict(X_valid)\n",
    "valid_accuracy = accuracy_score(y_valid, y_valid_pred)\n",
    "\n",
    "y_test_pred = model.predict(X_test)\n",
    "test_accuracy = accuracy_score(y_test, y_test_pred)\n",
    "\n",
    "print(f'Validation set accuracy: {valid_accuracy:.4f}')\n",
    "print(f'Test set accuracy: {test_accuracy:.4f}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Naive Bayes classifier with word embeddings (Word2Vec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Validation set accuracy (Word2Vec): 0.7584\n",
      "Test set accuracy (Word2Vec): 0.7644\n"
     ]
    }
   ],
   "source": [
    "stop_words = set(stopwords.words('english'))\n",
    "def tokenize(text):\n",
    "    tokens = word_tokenize(text.lower())\n",
    "    tokens = [word for word in tokens if word.isalnum()]\n",
    "    tokens = [word for word in tokens if word not in stop_words]\n",
    "    return tokens\n",
    "\n",
    "X_train_tokens = [tokenize(review) for review in X_train]\n",
    "X_valid_tokens = [tokenize(review) for review in X_valid]\n",
    "X_test_tokens = [tokenize(review) for review in X_test]\n",
    "\n",
    "w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)\n",
    "\n",
    "# Convert a review to a single vector by averaging the embeddings of its words\n",
    "def document_vector(tokens, model):\n",
    "    vec = [model.wv[word] for word in tokens if word in model.wv]\n",
    "    return np.mean(vec, axis=0) if len(vec) > 0 else np.zeros(model.vector_size)\n",
    "\n",
    "X_train_vectors = np.array([document_vector(tokens, w2v_model) for tokens in X_train_tokens])\n",
    "X_valid_vectors = np.array([document_vector(tokens, w2v_model) for tokens in X_valid_tokens])\n",
    "X_test_vectors = np.array([document_vector(tokens, w2v_model) for tokens in X_test_tokens])\n",
    "\n",
    "# Gaussian Naive Bayes classifier on the averaged embeddings\n",
    "model_w2v = GaussianNB()\n",
    "\n",
    "# Train the model\n",
    "model_w2v.fit(X_train_vectors, y_train)\n",
    "\n",
    "# Evaluate the model\n",
    "y_valid_pred_w2v = model_w2v.predict(X_valid_vectors)\n",
    "valid_accuracy_w2v = accuracy_score(y_valid, y_valid_pred_w2v)\n",
    "\n",
    "y_test_pred_w2v = model_w2v.predict(X_test_vectors)\n",
    "test_accuracy_w2v = accuracy_score(y_test, y_test_pred_w2v)\n",
    "\n",
    "print(f'Validation set accuracy (Word2Vec): {valid_accuracy_w2v:.4f}')\n",
    "print(f'Test set accuracy (Word2Vec): {test_accuracy_w2v:.4f}')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
BIN
Raport-projekt.docx
Normal file
Binary file not shown.