DL_projekt/projekt.ipynb

13 KiB
Raw Permalink Blame History

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
# Load the review CSVs, subsample them, and normalize labels/text.
def _load_reviews(path, sample_size):
    """Read a headerless polarity/title/text CSV, draw a fixed random
    subsample, merge the title into the text column, and shift labels
    from {1, 2} down to {0, 1}."""
    df = pd.read_csv(path, header=None, names=['polarity', 'title', 'text'])
    df = df.sample(n=sample_size, random_state=1)
    df['text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')
    df = df.drop(columns=['title'])
    df['polarity'] = df['polarity'] - 1
    return df

train_df = _load_reviews('train.csv', 40000)
test_df = _load_reviews('test.csv', 10000)
train_df
polarity text
3281328 1 Excellent home help for parents Volume 1 of Do...
2662721 0 Stay far, far away. I made it through about 6,...
1600544 0 Lost Woods Lost WoodsI didn't really understan...
815246 0 Renaissance -12, Women's brown suead shoes Rec...
1254178 1 Best Novel I've Read This Year Intrigued by th...
... ... ...
1132008 1 Pleasant, eclectic mix of coffee-shop favorite...
1712954 1 A Valuable Text This is not light reading. It ...
3191827 0 NOT GOOD TO USE A WORKOUT TO MAKE ADVERTICING ...
1692342 1 Good Read David Wellington brings a new twist ...
1944752 1 Edge of Paradise: America in Micronesia B.C. h...

40000 rows × 2 columns

# TF-IDF features: the vocabulary (max 10k terms) is fit on the training
# texts only; the test split is projected with transform() so both share
# the same feature space and no test data leaks into the vocabulary/IDF.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['text'])
# Word2Vec
def tokenize(text):
    """Return the whitespace-delimited tokens of *text*.

    Uses ``str.split`` with no separator, so runs of whitespace collapse
    and leading/trailing whitespace yields no empty tokens.
    """
    tokens = text.split()
    return tokens

# Tokenize both splits, then train Word2Vec on the training tokens only.
for _split in (train_df, test_df):
    _split['tokens'] = _split['text'].apply(tokenize)

# NOTE(review): workers=4 makes gensim training order-dependent, so the
# learned vectors are not bit-reproducible across runs.
w2v_model = Word2Vec(
    sentences=train_df['tokens'],
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)

def get_avg_w2v(tokens, model):
    """Average the Word2Vec vectors of the in-vocabulary *tokens*.

    Tokens absent from ``model.wv`` are skipped; when no token is known,
    a zero vector of length ``model.vector_size`` is returned so every
    document still maps to a fixed-length embedding.
    """
    known = [tok for tok in tokens if tok in model.wv]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean([model.wv[tok] for tok in known], axis=0)

# Stack one averaged embedding per document into (n_docs, vector_size) matrices.
X_train_w2v = np.vstack([get_avg_w2v(toks, w2v_model) for toks in train_df['tokens']])
X_test_w2v = np.vstack([get_avg_w2v(toks, w2v_model) for toks in test_df['tokens']])
# Classifiers — the same two estimator instances are refit per feature set.
log_reg = LogisticRegression(max_iter=1000)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=1)

# TF-IDF features: fit, predict on the held-out split, and store the
# predictions *before* the estimators are refit below.
log_reg.fit(X_train_tfidf, train_df['polarity'])
rf_clf.fit(X_train_tfidf, train_df['polarity'])
y_pred_log_reg_tfidf = log_reg.predict(X_test_tfidf)
y_pred_rf_clf_tfidf = rf_clf.predict(X_test_tfidf)

# Word2Vec features: refitting overwrites the TF-IDF models, so after
# this point log_reg / rf_clf hold the Word2Vec-trained estimators.
log_reg.fit(X_train_w2v, train_df['polarity'])
rf_clf.fit(X_train_w2v, train_df['polarity'])
y_pred_log_reg_w2v = log_reg.predict(X_test_w2v)
y_pred_rf_clf_w2v = rf_clf.predict(X_test_w2v)
def display_metrics(y_true, y_pred, name):
    """Print accuracy/precision/recall/F1 for binary predictions and
    return them.

    Parameters
    ----------
    y_true : array-like of 0/1 ground-truth labels.
    y_pred : array-like of 0/1 predicted labels.
    name : str
        Heading printed above the metric block.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``
        mapped to floats. (Backward-compatible addition: the original
        returned None; existing call sites ignore the return value.)
    """
    # precision/recall/f1 use sklearn's default average='binary',
    # which treats label 1 as the positive class.
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
    }
    print(f"{name} Metrics:")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")
    print(f"F1-score: {metrics['f1']:.4f}")
    print()
    return metrics
# Evaluate every feature-set / classifier combination on the test labels.
for _preds, _label in (
    (y_pred_log_reg_tfidf, "TF-IDF + Logistic Regression"),
    (y_pred_rf_clf_tfidf, "TF-IDF + Random Forest"),
    (y_pred_log_reg_w2v, "Word2Vec + Logistic Regression"),
    (y_pred_rf_clf_w2v, "Word2Vec + Random Forest"),
):
    display_metrics(test_df['polarity'], _preds, _label)
TF-IDF + Logistic Regression Metrics:
Accuracy: 0.8865
Precision: 0.8851
Recall: 0.8898
F1-score: 0.8874

TF-IDF + Random Forest Metrics:
Accuracy: 0.8504
Precision: 0.8668
Recall: 0.8300
F1-score: 0.8480

Word2Vec + Logistic Regression Metrics:
Accuracy: 0.7906
Precision: 0.7964
Recall: 0.7840
F1-score: 0.7901

Word2Vec + Random Forest Metrics:
Accuracy: 0.7546
Precision: 0.7643
Recall: 0.7403
F1-score: 0.7521