uczenie-glebokie-projekt/Projekt.ipynb

12 KiB

Import bibliotek

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import re

Przygotowanie danych

# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
# HTML line-break tags (<br>, <br/>, <br />) embedded in the raw review text.
_BR_TAG_RE = re.compile(r'<br\s*/?>')
_SPACE_RUN_RE = re.compile(r' +')


def get_str_cleaned(str_dirty):
    """Return a lower-cased copy of ``str_dirty`` without markup or punctuation.

    Processing order matters:

    1. lower-case the text;
    2. replace HTML ``<br>`` tags with a space, so the words on either side
       stay separate and no stray ``br`` token is left behind;
    3. delete ASCII punctuation;
    4. collapse runs of spaces *after* the deletions, because removing a
       free-standing punctuation mark (``"a - b"``) would otherwise leave
       a double space.

    Args:
        str_dirty: Raw text to normalise.

    Returns:
        The cleaned string. Leading/trailing spaces are collapsed to a
        single space but not stripped.
    """
    text = str_dirty.lower()
    text = _BR_TAG_RE.sub(' ', text)
    text = text.translate(_PUNCTUATION_TABLE)
    return _SPACE_RUN_RE.sub(' ', text)
# Dataset: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
data = pd.read_csv('IMDB_reviews.csv')
print(data.head())

# Normalise the raw review text into a separate column
data['cleaned_review'] = data['review'].map(get_str_cleaned)

# Replace the string sentiment labels with their numeric encoding
label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])

print(data.head())

# Hold out 20% of the rows as the test set (fixed seed for a repeatable split)
X, y = data['cleaned_review'], data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
                                              review  sentiment  \
0  One of the other reviewers has mentioned that ...          1   
1  A wonderful little production. <br /><br />The...          1   
2  I thought this was a wonderful way to spend ti...          1   
3  Basically there's a family where a little boy ...          0   
4  Petter Mattei's "Love in the Time of Money" is...          1   

                                      cleaned_review  
0  one of the other reviewers has mentioned that ...  
1  a wonderful little production br br the filmin...  
2  i thought this was a wonderful way to spend ti...  
3  basically theres a family where a little boy j...  
4  petter matteis love in the time of money is a ...  

TF-IDF + SVM

# TF-IDF features (capped at 200) feeding a linear-kernel SVM.
# Pipeline.fit returns the fitted pipeline itself, so it can be chained.
tfidf_svm_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=200)),
    ('svm', SVC(kernel='linear')),
]).fit(X_train, y_train)
y_pred_tfidf_svm = tfidf_svm_pipeline.predict(X_test)

TF-IDF + RandomForest

# TF-IDF features (capped at 200) feeding a 100-tree random forest.
tfidf_rf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=200)),
    # Seeded with the same value as the train/test split so that the forest
    # (bootstrap samples, feature sub-sampling) is reproducible between runs.
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])
tfidf_rf_pipeline.fit(X_train, y_train)
y_pred_tfidf_rf = tfidf_rf_pipeline.predict(X_test)

Model Word2Vec i transformator dokumentów do postaci wektorowej

# Train the word embeddings on the whitespace-tokenised training documents only.
w2v_model = Word2Vec(
    sentences=list(map(str.split, X_train)),
    vector_size=200,
    window=5,
    min_count=5,
    workers=4,
)
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Represent each document as the mean of its known word vectors.

    A document is split on whitespace; tokens missing from the model's
    vocabulary are ignored. A document with no known tokens is mapped to
    an all-zero vector of length ``w2v_model.vector_size``.
    """

    def __init__(self, w2v_model):
        self.w2v_model = w2v_model

    def fit(self, X, y=None):
        """Nothing is learned here; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Return an array with one averaged embedding row per document in ``X``."""
        keyed_vectors = self.w2v_model.wv
        doc_vectors = []
        for doc in X:
            known = [keyed_vectors[token] for token in doc.split() if token in keyed_vectors]
            if not known:
                # No in-vocabulary token: fall back to a single zero vector.
                known = [np.zeros(self.w2v_model.vector_size)]
            doc_vectors.append(np.mean(known, axis=0))
        return np.array(doc_vectors)

Word2Vec + SVM

# Averaged Word2Vec document vectors feeding a linear-kernel SVM.
# Pipeline.fit returns the fitted pipeline itself, so it can be chained.
w2v_svm_pipeline = Pipeline(steps=[
    ('w2v_transform', Word2VecTransformer(w2v_model)),
    ('svm', SVC(kernel='linear')),
]).fit(X_train, y_train)
y_pred_w2v_svm = w2v_svm_pipeline.predict(X_test)

Word2Vec + RandomForest

# Averaged Word2Vec document vectors feeding a 100-tree random forest.
w2v_rf_pipeline = Pipeline([
    ('w2v_transform', Word2VecTransformer(w2v_model)),
    # Seeded with the same value as the train/test split so the forest itself
    # is reproducible; the Word2Vec embeddings remain a separate source of
    # run-to-run variation.
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))
])
w2v_rf_pipeline.fit(X_train, y_train)
y_pred_w2v_rf = w2v_rf_pipeline.predict(X_test)

Wyświetlanie metryk

def get_scores(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for binary predictions.

    Any label greater than 0 counts as the positive class. The two inputs
    are paired element by element.

    Args:
        y_true: Iterable of ground-truth labels.
        y_pred: Iterable of predicted labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` of floats. Precision is
        1.0 when nothing was predicted positive, recall is 1.0 when there
        are no positive ground-truth labels, and F1 is 0.0 when precision
        and recall are both 0.

    Raises:
        ValueError: If the inputs yield no label pairs at all.
    """
    total = 0
    correct = 0
    true_positives = 0
    predicted_positives = 0
    actual_positives = 0

    for predicted, actual in zip(y_pred, y_true):
        total += 1

        if predicted == actual:
            correct += 1
            if predicted > 0:
                true_positives += 1

        if predicted > 0:
            predicted_positives += 1

        if actual > 0:
            actual_positives += 1

    if total == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError('get_scores() requires at least one label pair')

    accuracy = correct / total
    precision = true_positives / predicted_positives if predicted_positives else 1.0
    recall = true_positives / actual_positives if actual_positives else 1.0

    if precision + recall == 0.0:
        f1 = 0.0
    else:
        f1 = 2 * precision * recall / (precision + recall)

    return accuracy, precision, recall, f1
def print_metrics(y_true, y_pred, model_name):
    """Print the four scores from ``get_scores`` for one model.

    Each line is prefixed with ``model_name`` and shows the value with four
    decimal places; a 30-character dashed rule closes the report.
    """
    metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1-Score')
    metric_values = get_scores(y_true, y_pred)
    for label, value in zip(metric_labels, metric_values):
        print(f'{model_name} {label}: {value:.4f}')
    print('-' * 30)
# Evaluate every trained model on the held-out test set, in training order
for model_label, predictions in (
    ('TF-IDF + SVM', y_pred_tfidf_svm),
    ('TF-IDF + Random Forest', y_pred_tfidf_rf),
    ('Word2Vec + SVM', y_pred_w2v_svm),
    ('Word2Vec + Random Forest', y_pred_w2v_rf),
):
    print_metrics(y_test, predictions, model_label)
TF-IDF + SVM Accuracy: 0.7764
TF-IDF + SVM Precision: 0.7719
TF-IDF + SVM Recall: 0.7896
TF-IDF + SVM F1-Score: 0.7807
------------------------------
TF-IDF + Random Forest Accuracy: 0.7500
TF-IDF + Random Forest Precision: 0.7626
TF-IDF + Random Forest Recall: 0.7317
TF-IDF + Random Forest F1-Score: 0.7468
------------------------------
Word2Vec + SVM Accuracy: 0.8584
Word2Vec + SVM Precision: 0.8522
Word2Vec + SVM Recall: 0.8698
Word2Vec + SVM F1-Score: 0.8609
------------------------------
Word2Vec + Random Forest Accuracy: 0.8137
Word2Vec + Random Forest Precision: 0.8106
Word2Vec + Random Forest Recall: 0.8224
Word2Vec + Random Forest F1-Score: 0.8165
------------------------------