import pandas as pd
import numpy as np

# Load the three dataset splits.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
valid = pd.read_csv("valid.csv")

# Recode the negative label from -1 to 0 in every split, in place.
for frame in (train, test, valid):
    frame.loc[frame["review_score"] == -1, "review_score"] = 0
import torch
from transformers import pipeline
# Build the sentiment classifier. The checkpoint is pinned explicitly (it is
# the library default for this task) so results stay reproducible, and the
# device falls back to CPU (-1) when no CUDA GPU is available instead of
# crashing on device=0.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    device=0 if torch.cuda.is_available() else -1,
)
# The pipeline returns one {'label': ..., 'score': ...} dict per review;
# truncation=True is passed so over-long reviews are cut rather than rejected.
test["predicted_score"] = sentiment_pipeline(test["review_text"].tolist(), truncation=True)
{'label': 'POSITIVE', 'score': 0.9997923970222473}
# Translate the pipeline's string labels into the dataset's 0/1 targets.
str_to_int_score = {
    "POSITIVE": 1,
    "NEGATIVE": 0,
}

# Each entry of "predicted_score" is a dict; only its "label" is used here.
test["model_predictions"] = [
    str_to_int_score[result["label"]] for result in test["predicted_score"]
]
Unnamed: 0 review_text review_score predicted_score model_predictions
0 1265039 I love the Fact you can do what EVER you want ... 1 {'label': 'POSITIVE', 'score': 0.9997923970222... 1
1 3132003 Tony Hawk's without the Pro Skater. Finding ou... 1 {'label': 'POSITIVE', 'score': 0.9989967942237... 1
2 880195 It's pretty good. 1 {'label': 'POSITIVE', 'score': 0.9998482465744... 1
3 717128 This the best dungeon game I have played since... 1 {'label': 'POSITIVE', 'score': 0.9998807907104... 1
4 5221356 Totally awesome game alone or with a friend. I... 1 {'label': 'POSITIVE', 'score': 0.9998763799667... 1
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def get_metrics(df=None):
    """Print accuracy, precision, recall and F1 score for binary predictions.

    Args:
        df: DataFrame with a ``review_score`` column (true labels, 0 or 1)
            and a ``model_predictions`` column (predicted labels). When
            omitted, the module-level ``test`` frame is used, so the
            original zero-argument call keeps working.

    Predictions are rounded to the nearest integer once, so the accuracy and
    the confusion-matrix counts are derived from the same values. Any metric
    whose denominator is zero (for example precision when nothing was
    predicted positive) is reported as 0.00 instead of raising
    ZeroDivisionError.
    """
    if df is None:
        df = test

    predictions = np.rint(df["model_predictions"].to_numpy())
    true_values = df["review_score"].to_numpy()

    predicted_pos = predictions == 1
    predicted_neg = predictions == 0
    actual_pos = true_values == 1
    actual_neg = true_values == 0

    TP_count = int(np.sum(actual_pos & predicted_pos))
    FP_count = int(np.sum(actual_neg & predicted_pos))
    FN_count = int(np.sum(actual_pos & predicted_neg))

    correct = int(np.sum(predictions == true_values))
    accuracy = _safe_ratio(correct, len(true_values))
    precision = _safe_ratio(TP_count, TP_count + FP_count)
    recall = _safe_ratio(TP_count, TP_count + FN_count)
    F1_score = _safe_ratio(2 * precision * recall, precision + recall)

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {F1_score:.2f}")
Accuracy: 0.77
Precision: 0.97
Recall: 0.75
F1 Score: 0.84

Użyty domyślnie model (distilbert/distilbert-base-uncased-finetuned-sst-2-english) jest (wg karty modelu) modelem do klasyfikacji tematów. Spróbujmy modelu przeznaczonego do analizy sentymentu recenzji.

# Swap in a review-oriented checkpoint that labels text "1 star" .. "5 stars".
# The device falls back to CPU (-1) when no CUDA GPU is available instead of
# crashing on device=0.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=0 if torch.cuda.is_available() else -1,
)
[{'label': '5 stars', 'score': 0.8000338673591614}]
# Classify every test review, then keep only the predicted label string.
raw_outputs = sentiment_pipeline(test["review_text"].tolist(), truncation=True)
test["predicted_score"] = [output["label"] for output in raw_outputs]
5 stars    6183
4 stars    3952
1 star     2399
3 stars    1883
2 stars    1299
Name: count, dtype: int64
# Arbitrarily chosen thresholds: three stars and above count as positive.
str_to_int_score = {
    "5 stars": 1,
    "4 stars": 1,
    "3 stars": 1,
    "2 stars": 0,
    "1 star": 0,
}

# Convert the star labels in "predicted_score" to binary predictions.
test["model_predictions"] = [
    str_to_int_score[star_label] for star_label in test["predicted_score"]
]
Accuracy: 0.86
Precision: 0.95
Recall: 0.88
F1 Score: 0.91

Wyniki są teraz lepsze. W porównaniu z LSTM model ten ma odrobinę niższą precyzję i wyższy recall, czyli więcej recenzji (również błędnie) uznaje za pozytywne.

def test_review_text(sentence):
    """Classify a single review and print whether it is negative or positive.

    Args:
        sentence: Review text to classify.

    Uses the module-level ``sentiment_pipeline`` to label the text and the
    module-level ``str_to_int_score`` mapping to turn that label into 0
    (negative) or 1 (positive). Exactly one verdict line is printed.
    """
    # truncation=True matches the batch calls on the test set, so an
    # over-long review is cut rather than rejected.
    model_output = sentiment_pipeline([sentence], truncation=True)
    score = str_to_int_score[model_output[0]["label"]]
    if score == 0:
        print("Negative review")
    else:
        # Without this branch a negative review printed both verdicts and a
        # positive review printed nothing.
        print("Positive review")
# Manual spot checks: each call prints the classifier's verdict for one
# hand-written review. The line following each call is its recorded output.
test_review_text("A buggy, uninspired mess")
Negative review
test_review_text("This game is bad")
Negative review
test_review_text("This game destroyed my life")
Negative review
test_review_text("Best game I've ever played")
Positive review
test_review_text("Fun cooperative play with scalable difficulty. Rapid path to get into a game with friends or open public games. ")
Positive review
test_review_text("Deliriously buggy. Fun if/when it works properly. Wait and see if they actually QA the next few patches before you play.")
Negative review