import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
train_df = pd.read_csv('train.csv', header=None, names=['polarity', 'title', 'text'])
test_df = pd.read_csv('test.csv', header=None, names=['polarity', 'title', 'text'])
train_df = train_df.sample(n=40000, random_state=1)
test_df = test_df.sample(n=10000, random_state=1)
train_df['text'] = train_df['title'].fillna('') + ' ' + train_df['text'].fillna('')
test_df['text'] = test_df['title'].fillna('') + ' ' + test_df['text'].fillna('')
train_df.drop(columns=['title'], inplace=True)
test_df.drop(columns=['title'], inplace=True)
train_df['polarity'] = train_df['polarity'] - 1
test_df['polarity'] = test_df['polarity'] - 1
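# Quick sanity check (a minimal sketch, not in the original cell): after subtracting 1,
# the 1/2 polarity labels should be binary 0/1 and roughly balanced in the random sample.
print(train_df['polarity'].value_counts())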
train_df
| | polarity | text |
|---|---|---|
| 3281328 | 1 | Excellent home help for parents Volume 1 of Do... |
| 2662721 | 0 | Stay far, far away. I made it through about 6,... |
| 1600544 | 0 | Lost Woods Lost WoodsI didn't really understan... |
| 815246 | 0 | Renaissance -12, Women's brown suead shoes Rec... |
| 1254178 | 1 | Best Novel I've Read This Year Intrigued by th... |
| ... | ... | ... |
| 1132008 | 1 | Pleasant, eclectic mix of coffee-shop favorite... |
| 1712954 | 1 | A Valuable Text This is not light reading. It ... |
| 3191827 | 0 | NOT GOOD TO USE A WORKOUT TO MAKE ADVERTICING ... |
| 1692342 | 1 | Good Read David Wellington brings a new twist ... |
| 1944752 | 1 | Edge of Paradise: America in Micronesia B.C. h... |

40000 rows × 2 columns
# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['text'])
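# Optional inspection sketch: confirm the document-term matrix shape and peek at the
# learned vocabulary (get_feature_names_out() assumes scikit-learn >= 1.0).
print(X_train_tfidf.shape)                             # expected: (40000, 10000)
print(tfidf_vectorizer.get_feature_names_out()[:10])   # a few vocabulary terms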
# Word2Vec
def tokenize(text):
    return text.split()
train_df['tokens'] = train_df['text'].apply(tokenize)
test_df['tokens'] = test_df['text'].apply(tokenize)
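# The whitespace tokenizer above keeps punctuation and case, so "Great!" and "great"
# end up as different Word2Vec tokens. A hedged alternative sketch using gensim's
# simple_preprocess (lowercases and strips punctuation); swapping it in would change
# the vocabulary and therefore the results reported below.
from gensim.utils import simple_preprocess
# train_df['tokens'] = train_df['text'].apply(simple_preprocess)
# test_df['tokens'] = test_df['text'].apply(simple_preprocess)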
w2v_model = Word2Vec(sentences=train_df['tokens'], vector_size=100, window=5, min_count=5, workers=4)
def get_avg_w2v(tokens, model):
    # Average the Word2Vec vectors of all in-vocabulary tokens;
    # fall back to a zero vector when no token is in the vocabulary.
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
X_train_w2v = np.array([get_avg_w2v(tokens, w2v_model) for tokens in train_df['tokens']])
X_test_w2v = np.array([get_avg_w2v(tokens, w2v_model) for tokens in test_df['tokens']])
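# Sanity-check sketch for the embeddings: the averaged matrix should be (40000, 100),
# and most_similar() gives nearest neighbours of an arbitrary probe word (this assumes
# the probe word survived min_count=5; gensim raises KeyError otherwise).
print(X_train_w2v.shape)
print(w2v_model.wv.most_similar('excellent', topn=5))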
# Classifiers
log_reg = LogisticRegression(max_iter=1000)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=1)
# TF-IDF
log_reg.fit(X_train_tfidf, train_df['polarity'])
rf_clf.fit(X_train_tfidf, train_df['polarity'])
y_pred_log_reg_tfidf = log_reg.predict(X_test_tfidf)
y_pred_rf_clf_tfidf = rf_clf.predict(X_test_tfidf)
# Word2Vec
log_reg.fit(X_train_w2v, train_df['polarity'])
rf_clf.fit(X_train_w2v, train_df['polarity'])
y_pred_log_reg_w2v = log_reg.predict(X_test_w2v)
y_pred_rf_clf_w2v = rf_clf.predict(X_test_w2v)
def display_metrics(y_true, y_pred, name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f"{name} Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    print()
# TF-IDF + Logistic Regression
display_metrics(test_df['polarity'], y_pred_log_reg_tfidf, "TF-IDF + Logistic Regression")
# TF-IDF + Random Forest
display_metrics(test_df['polarity'], y_pred_rf_clf_tfidf, "TF-IDF + Random Forest")
# Word2Vec + Logistic Regression
display_metrics(test_df['polarity'], y_pred_log_reg_w2v, "Word2Vec + Logistic Regression")
# Word2Vec + Random Forest
display_metrics(test_df['polarity'], y_pred_rf_clf_w2v, "Word2Vec + Random Forest")
TF-IDF + Logistic Regression Metrics:
Accuracy: 0.8865
Precision: 0.8851
Recall: 0.8898
F1-score: 0.8874

TF-IDF + Random Forest Metrics:
Accuracy: 0.8504
Precision: 0.8668
Recall: 0.8300
F1-score: 0.8480

Word2Vec + Logistic Regression Metrics:
Accuracy: 0.7906
Precision: 0.7964
Recall: 0.7840
F1-score: 0.7901

Word2Vec + Random Forest Metrics:
Accuracy: 0.7546
Precision: 0.7643
Recall: 0.7403
F1-score: 0.7521
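# A compact comparison sketch: recompute the same four metrics into one DataFrame
# instead of printing separate blocks. Uses only the predictions produced above.
results = {
    'TF-IDF + Logistic Regression': y_pred_log_reg_tfidf,
    'TF-IDF + Random Forest': y_pred_rf_clf_tfidf,
    'Word2Vec + Logistic Regression': y_pred_log_reg_w2v,
    'Word2Vec + Random Forest': y_pred_rf_clf_w2v,
}
summary = pd.DataFrame({
    name: {
        'accuracy': accuracy_score(test_df['polarity'], pred),
        'precision': precision_score(test_df['polarity'], pred),
        'recall': recall_score(test_df['polarity'], pred),
        'f1': f1_score(test_df['polarity'], pred),
    }
    for name, pred in results.items()
}).T
print(summary.round(4))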