12 KiB
12 KiB
Import bibliotek
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import re
Przygotowanie danych
# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')


def get_str_cleaned(str_dirty):
    """Return a lower-cased copy of *str_dirty* without ASCII punctuation.

    Punctuation characters are deleted (not replaced by a space), and every
    run of consecutive spaces is then collapsed into a single space. Leading
    and trailing spaces are not stripped, and other whitespace such as tabs
    or newlines is left untouched.

    Args:
        str_dirty: The raw text to normalise.

    Returns:
        The normalised string.
    """
    # NOTE(review): markup such as "<br />" only loses its punctuation, so the
    # tag name ("br") stays in the text as an ordinary token.
    without_punctuation = str_dirty.lower().translate(_PUNCTUATION_TABLE)
    # Collapse space runs only after punctuation is gone; doing it earlier
    # leaves a double space wherever a symbol stood between two spaces.
    return re.sub(r' +', ' ', without_punctuation)
# Source: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
data = pd.read_csv('IMDB_reviews.csv')
print(data.head())

# Store a normalised copy of every review next to the raw text.
data['cleaned_review'] = data['review'].apply(get_str_cleaned)

# Replace the textual sentiment labels with their integer codes.
label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])
print(data.head())

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X, y = data['cleaned_review'], data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
review sentiment 0 One of the other reviewers has mentioned that ... positive 1 A wonderful little production. <br /><br />The... positive 2 I thought this was a wonderful way to spend ti... positive 3 Basically there's a family where a little boy ... negative 4 Petter Mattei's "Love in the Time of Money" is... positive review sentiment \ 0 One of the other reviewers has mentioned that ... 1 1 A wonderful little production. <br /><br />The... 1 2 I thought this was a wonderful way to spend ti... 1 3 Basically there's a family where a little boy ... 0 4 Petter Mattei's "Love in the Time of Money" is... 1 cleaned_review 0 one of the other reviewers has mentioned that ... 1 a wonderful little production br br the filmin... 2 i thought this was a wonderful way to spend ti... 3 basically theres a family where a little boy j... 4 petter matteis love in the time of money is a ...
TF-IDF + SVM
# TF-IDF features (vocabulary capped at 200 terms) classified by a
# linear-kernel SVM.
tfidf_svm_pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(max_features=200)),
        ('svm', SVC(kernel='linear')),
    ]
)
# Train on the training split, then predict labels for the held-out reviews.
tfidf_svm_pipeline.fit(X_train, y_train)
y_pred_tfidf_svm = tfidf_svm_pipeline.predict(X_test)
TF-IDF + RandomForest
# TF-IDF features (vocabulary capped at 200 terms) classified by a random
# forest of 100 trees. random_state is pinned, like the seeded train/test
# split, so repeated runs grow the same forest and report the same metrics.
tfidf_rf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=200)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
])
# Train on the training split, then predict labels for the held-out reviews.
tfidf_rf_pipeline.fit(X_train, y_train)
y_pred_tfidf_rf = tfidf_rf_pipeline.predict(X_test)
Model Word2Vec i transformator dokumentów do postaci wektorowej
# Learn 200-dimensional word embeddings from the training reviews only, each
# review being tokenised on whitespace.
w2v_model = Word2Vec(
    sentences=[review.split() for review in X_train],
    vector_size=200,
    window=5,
    min_count=5,
    workers=4,
)
class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Turn documents into fixed-size vectors by averaging word embeddings.

    Each document is split on whitespace; the embeddings of the tokens known
    to the wrapped Word2Vec model are averaged. A document without any known
    token is represented by a zero vector of the model's vector size.
    """

    def __init__(self, w2v_model):
        # Already-trained model; this transformer never trains it.
        self.w2v_model = w2v_model

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def _document_vector(self, doc):
        """Return the mean embedding of the known tokens of *doc*."""
        keyed_vectors = self.w2v_model.wv
        known_vectors = [
            keyed_vectors[token]
            for token in doc.split()
            if token in keyed_vectors
        ]
        if not known_vectors:
            # No token is in the vocabulary: fall back to a single zero vector.
            known_vectors = [np.zeros(self.w2v_model.vector_size)]
        return np.mean(known_vectors, axis=0)

    def transform(self, X):
        """Return an array holding one averaged embedding per document in X."""
        return np.array([self._document_vector(doc) for doc in X])
Word2Vec + SVM
# Averaged Word2Vec document vectors classified by a linear-kernel SVM.
w2v_svm_pipeline = Pipeline(
    steps=[
        ('w2v_transform', Word2VecTransformer(w2v_model)),
        ('svm', SVC(kernel='linear')),
    ]
)
# Train on the training split, then predict labels for the held-out reviews.
w2v_svm_pipeline.fit(X_train, y_train)
y_pred_w2v_svm = w2v_svm_pipeline.predict(X_test)
Word2Vec + RandomForest
# Averaged Word2Vec document vectors classified by a random forest of 100
# trees. random_state is pinned, like the seeded train/test split, so the
# forest is grown the same way on every run for a given set of features.
# NOTE(review): the Word2Vec model is built without an explicit seed, so the
# features themselves may still differ between runs - confirm if exact
# reproducibility is required.
w2v_rf_pipeline = Pipeline([
    ('w2v_transform', Word2VecTransformer(w2v_model)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
])
# Train on the training split, then predict labels for the held-out reviews.
w2v_rf_pipeline.fit(X_train, y_train)
y_pred_w2v_rf = w2v_rf_pipeline.predict(X_test)
Wyświetlanie metryk
def get_scores(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for the positive class.

    A label counts as positive when it is greater than zero. The two inputs
    are paired element by element; pairing stops at the end of the shorter
    one.

    Args:
        y_true: Iterable of ground-truth labels.
        y_pred: Iterable of predicted labels.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``. Precision is 1.0 when
        nothing was predicted positive, recall is 1.0 when no true label is
        positive, and F1 is 0.0 when precision and recall are both zero.

    Raises:
        ZeroDivisionError: If there are no label pairs at all.
    """
    pairs = list(zip(y_pred, y_true))
    total = len(pairs)

    correct = sum(1 for pred, truth in pairs if pred == truth)
    true_positives = sum(
        1 for pred, truth in pairs if pred > 0 and pred == truth
    )
    predicted_positives = sum(1 for pred, _ in pairs if pred > 0)
    actual_positives = sum(1 for _, truth in pairs if truth > 0)

    accuracy = correct / total
    precision = (
        true_positives / predicted_positives if predicted_positives != 0 else 1.0
    )
    recall = true_positives / actual_positives if actual_positives != 0 else 1.0

    denominator = precision + recall
    f1 = 0.0 if denominator == 0.0 else 2 * precision * recall / denominator
    return accuracy, precision, recall, f1
def print_metrics(y_true, y_pred, model_name):
    """Print accuracy, precision, recall and F1 for one model.

    Each metric is printed on its own line, prefixed with *model_name* and
    formatted to four decimal places, followed by a 30-dash separator line.

    Args:
        y_true: Iterable of ground-truth labels.
        y_pred: Iterable of predicted labels.
        model_name: Label printed in front of every metric.
    """
    metric_labels = ('Accuracy', 'Precision', 'Recall', 'F1-Score')
    # get_scores returns the metrics in the same order as the labels above.
    for label, value in zip(metric_labels, get_scores(y_true, y_pred)):
        print(f'{model_name} {label}: {value:.4f}')
    print('-' * 30)
# Evaluate every trained pipeline on the held-out test labels, in the order
# TF-IDF + SVM, TF-IDF + Random Forest, Word2Vec + SVM, Word2Vec + Random Forest.
for predictions, model_name in (
    (y_pred_tfidf_svm, 'TF-IDF + SVM'),
    (y_pred_tfidf_rf, 'TF-IDF + Random Forest'),
    (y_pred_w2v_svm, 'Word2Vec + SVM'),
    (y_pred_w2v_rf, 'Word2Vec + Random Forest'),
):
    print_metrics(y_test, predictions, model_name)
TF-IDF + SVM Accuracy: 0.7764 TF-IDF + SVM Precision: 0.7719 TF-IDF + SVM Recall: 0.7896 TF-IDF + SVM F1-Score: 0.7807 ------------------------------ TF-IDF + Random Forest Accuracy: 0.7500 TF-IDF + Random Forest Precision: 0.7626 TF-IDF + Random Forest Recall: 0.7317 TF-IDF + Random Forest F1-Score: 0.7468 ------------------------------ Word2Vec + SVM Accuracy: 0.8584 Word2Vec + SVM Precision: 0.8522 Word2Vec + SVM Recall: 0.8698 Word2Vec + SVM F1-Score: 0.8609 ------------------------------ Word2Vec + Random Forest Accuracy: 0.8137 Word2Vec + Random Forest Precision: 0.8106 Word2Vec + Random Forest Recall: 0.8224 Word2Vec + Random Forest F1-Score: 0.8165 ------------------------------