Project initialization
commit 4fbe2cfe2c

.gitignore (vendored): Normal file, +4 lines
@@ -0,0 +1,4 @@
*
!Projekt*
!Raport*
!IMDB*
IMDB_reviews.csv: Normal file, +50001 lines
File diff suppressed because one or more lines are too long
Projekt.ipynb: Normal file, +401 lines
@@ -0,0 +1,401 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "LHtKZx0myNWa"
   },
   "source": [
    "### Importing libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "id": "ZTlYCCtCyNWc"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.pipeline import Pipeline\n",
    "from gensim.models import Word2Vec\n",
    "from sklearn.base import BaseEstimator, TransformerMixin\n",
    "import numpy as np\n",
    "import re"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "v16vUmROyNWc"
   },
   "source": [
    "### Data preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_str_cleaned(str_dirty):\n",
    "    punctuation = '!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'\n",
    "    new_str = str_dirty.lower()\n",
    "    new_str = re.sub(' +', ' ', new_str)\n",
    "    for char in punctuation:\n",
    "        new_str = new_str.replace(char, '')\n",
    "    return new_str"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "d4Kuyx7JyNWd",
    "outputId": "0c9de8ef-4e90-44fd-9af4-d5e5833994aa"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                              review sentiment\n",
      "0  One of the other reviewers has mentioned that ...  positive\n",
      "1  A wonderful little production. <br /><br />The...  positive\n",
      "2  I thought this was a wonderful way to spend ti...  positive\n",
      "3  Basically there's a family where a little boy ...  negative\n",
      "4  Petter Mattei's \"Love in the Time of Money\" is...  positive\n",
      "                                              review  sentiment  \\\n",
      "0  One of the other reviewers has mentioned that ...          1   \n",
      "1  A wonderful little production. <br /><br />The...          1   \n",
      "2  I thought this was a wonderful way to spend ti...          1   \n",
      "3  Basically there's a family where a little boy ...          0   \n",
      "4  Petter Mattei's \"Love in the Time of Money\" is...          1   \n",
      "\n",
      "                                      cleaned_review  \n",
      "0  one of the other reviewers has mentioned that ...  \n",
      "1  a wonderful little production br br the filmin...  \n",
      "2  i thought this was a wonderful way to spend ti...  \n",
      "3  basically theres a family where a little boy j...  \n",
      "4  petter matteis love in the time of money is a ...  \n"
     ]
    }
   ],
   "source": [
    "# Source: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews\n",
    "data = pd.read_csv('IMDB_reviews.csv')\n",
    "print(data.head())\n",
    "\n",
    "# Data cleaning\n",
    "data['cleaned_review'] = data['review'].apply(get_str_cleaned)\n",
    "\n",
    "# Convert the labels to numeric form\n",
    "label_encoder = LabelEncoder()\n",
    "data['sentiment'] = label_encoder.fit_transform(data['sentiment'])\n",
    "\n",
    "print(data.head())\n",
    "\n",
    "# Split the data into training and test sets\n",
    "X = data['cleaned_review']\n",
    "y = data['sentiment']\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "8Lz-Y4ZCyNWd"
   },
   "source": [
    "### TF-IDF + SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "id": "ES_5Q4BEyNWd"
   },
   "outputs": [],
   "source": [
    "tfidf_svm_pipeline = Pipeline([\n",
    "    ('tfidf', TfidfVectorizer(max_features=200)),\n",
    "    ('svm', SVC(kernel='linear'))\n",
    "])\n",
    "tfidf_svm_pipeline.fit(X_train, y_train)\n",
    "y_pred_tfidf_svm = tfidf_svm_pipeline.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "fadLd3cEyNWd"
   },
   "source": [
    "### TF-IDF + RandomForest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "id": "xUq30-FryNWe"
   },
   "outputs": [],
   "source": [
    "tfidf_rf_pipeline = Pipeline([\n",
    "    ('tfidf', TfidfVectorizer(max_features=200)),\n",
    "    ('rf', RandomForestClassifier(n_estimators=100))\n",
    "])\n",
    "tfidf_rf_pipeline.fit(X_train, y_train)\n",
    "y_pred_tfidf_rf = tfidf_rf_pipeline.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "d08OJrCnyNWe"
   },
   "source": [
    "### Word2Vec model and a document-to-vector transformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "id": "J5agaWJFyNWe"
   },
   "outputs": [],
   "source": [
    "w2v_model = Word2Vec(sentences=[doc.split() for doc in X_train], vector_size=200, window=5, min_count=5, workers=4)\n",
    "class Word2VecTransformer(BaseEstimator, TransformerMixin):\n",
    "    def __init__(self, w2v_model):\n",
    "        self.w2v_model = w2v_model\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        return np.array([\n",
    "            np.mean([self.w2v_model.wv[word] for word in doc.split() if word in self.w2v_model.wv]\n",
    "                    or [np.zeros(self.w2v_model.vector_size)], axis=0)\n",
    "            for doc in X\n",
    "        ])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "KbKeeZBdyNWe"
   },
   "source": [
    "### Word2Vec + SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "id": "FPBL7g75yNWe"
   },
   "outputs": [],
   "source": [
    "w2v_svm_pipeline = Pipeline([\n",
    "    ('w2v_transform', Word2VecTransformer(w2v_model)),\n",
    "    ('svm', SVC(kernel='linear'))\n",
    "])\n",
    "w2v_svm_pipeline.fit(X_train, y_train)\n",
    "y_pred_w2v_svm = w2v_svm_pipeline.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "KT-Cnwx7yNWe"
   },
   "source": [
    "### Word2Vec + RandomForest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "id": "t9mCasDmyNWe"
   },
   "outputs": [],
   "source": [
    "w2v_rf_pipeline = Pipeline([\n",
    "    ('w2v_transform', Word2VecTransformer(w2v_model)),\n",
    "    ('rf', RandomForestClassifier(n_estimators=100))\n",
    "])\n",
    "w2v_rf_pipeline.fit(X_train, y_train)\n",
    "y_pred_w2v_rf = w2v_rf_pipeline.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "lkFzZ1MjyNWf"
   },
   "source": [
    "### Displaying metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_scores(y_true, y_pred):\n",
    "    # Returns accuracy, precision, recall, and F1\n",
    "    acc_score = 0\n",
    "    acc_total = 0\n",
    "    tp = 0\n",
    "    fp = 0\n",
    "    selected_items = 0\n",
    "    relevant_items = 0\n",
    "\n",
    "    for p, t in zip(y_pred, y_true):\n",
    "        acc_total += 1\n",
    "\n",
    "        if p == t:\n",
    "            acc_score += 1\n",
    "\n",
    "        if p > 0 and p == t:\n",
    "            tp += 1\n",
    "\n",
    "        if p > 0:\n",
    "            selected_items += 1\n",
    "\n",
    "        if t > 0:\n",
    "            relevant_items += 1\n",
    "\n",
    "    accuracy = acc_score / acc_total\n",
    "\n",
    "    if selected_items == 0:\n",
    "        precision = 1.0\n",
    "    else:\n",
    "        precision = tp / selected_items\n",
    "\n",
    "    if relevant_items == 0:\n",
    "        recall = 1.0\n",
    "    else:\n",
    "        recall = tp / relevant_items\n",
    "\n",
    "    if precision + recall == 0.0:\n",
    "        f1 = 0.0\n",
    "    else:\n",
    "        f1 = 2 * precision * recall / (precision + recall)\n",
    "\n",
    "    return accuracy, precision, recall, f1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "id": "W8RJkm0CyNWf"
   },
   "outputs": [],
   "source": [
    "def print_metrics(y_true, y_pred, model_name):\n",
    "    accuracy, precision, recall, f1 = get_scores(y_true, y_pred)\n",
    "    print(f'{model_name} Accuracy: {accuracy:.4f}')\n",
    "    print(f'{model_name} Precision: {precision:.4f}')\n",
    "    print(f'{model_name} Recall: {recall:.4f}')\n",
    "    print(f'{model_name} F1-Score: {f1:.4f}')\n",
    "    print('-' * 30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "tPPkR8MOyNWf",
    "outputId": "ceae2217-10b0-4533-9f43-3c7add2d19b4"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TF-IDF + SVM Accuracy: 0.7764\n",
      "TF-IDF + SVM Precision: 0.7719\n",
      "TF-IDF + SVM Recall: 0.7896\n",
      "TF-IDF + SVM F1-Score: 0.7807\n",
      "------------------------------\n",
      "TF-IDF + Random Forest Accuracy: 0.7500\n",
      "TF-IDF + Random Forest Precision: 0.7626\n",
      "TF-IDF + Random Forest Recall: 0.7317\n",
      "TF-IDF + Random Forest F1-Score: 0.7468\n",
      "------------------------------\n",
      "Word2Vec + SVM Accuracy: 0.8584\n",
      "Word2Vec + SVM Precision: 0.8522\n",
      "Word2Vec + SVM Recall: 0.8698\n",
      "Word2Vec + SVM F1-Score: 0.8609\n",
      "------------------------------\n",
      "Word2Vec + Random Forest Accuracy: 0.8137\n",
      "Word2Vec + Random Forest Precision: 0.8106\n",
      "Word2Vec + Random Forest Recall: 0.8224\n",
      "Word2Vec + Random Forest F1-Score: 0.8165\n",
      "------------------------------\n"
     ]
    }
   ],
   "source": [
    "# Evaluation of the TF-IDF + SVM model\n",
    "print_metrics(y_test, y_pred_tfidf_svm, 'TF-IDF + SVM')\n",
    "\n",
    "# Evaluation of the TF-IDF + Random Forest model\n",
    "print_metrics(y_test, y_pred_tfidf_rf, 'TF-IDF + Random Forest')\n",
    "\n",
    "# Evaluation of the Word2Vec + SVM model\n",
    "print_metrics(y_test, y_pred_w2v_svm, 'Word2Vec + SVM')\n",
    "\n",
    "# Evaluation of the Word2Vec + Random Forest model\n",
    "print_metrics(y_test, y_pred_w2v_rf, 'Word2Vec + Random Forest')"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
Raport.docx: Normal file, BIN
Binary file not shown.