## Analiza sentymentu w opiniach z Twitter'a


### Download dataset and prepare data


#### Installation of packages


In [98]:
%pip install pandas
%pip install scikit-learn
%pip install emoji
%pip install gensim



#### Importing libraries


In [99]:
import pandas as pd
from sklearn.model_selection import train_test_split
import emoji
from gensim.utils import simple_preprocess

#### Download the dataset


In [100]:
!kaggle datasets download -d jp797498e/twitter-entity-sentiment-analysis

Dataset URL: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis
License(s): CC0-1.0
twitter-entity-sentiment-analysis.zip: Skipping, found more recently modified local copy (use --force to force download)


#### Unzip the dataset


In [101]:
!unzip -o twitter-entity-sentiment-analysis.zip

Archive:  twitter-entity-sentiment-analysis.zip
  inflating: twitter_training.csv    
  inflating: twitter_validation.csv  


#### Load the dataset


In [102]:
cols = ["tweetid", "entity", "sentiment", "content"]
twitter_training = pd.read_csv("twitter_training.csv", names=cols)
twitter_validation = pd.read_csv("twitter_validation.csv", names=cols)
dataset = pd.concat([twitter_training, twitter_validation])

#### Info about the dataset


In [103]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 75682 entries, 0 to 999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweetid    75682 non-null  int64 
 1   entity     75682 non-null  object
 2   sentiment  75682 non-null  object
 3   content    74996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.9+ MB


In [104]:
dataset.shape

(75682, 4)

In [105]:
dataset["sentiment"].value_counts()

sentiment
Negative      22808
Positive      21109
Neutral       18603
Irrelevant    13162
Name: count, dtype: int64

In [106]:
dataset.isna().sum()

tweetid        0
entity         0
sentiment      0
content      686
dtype: int64

In [107]:
dataset.duplicated().sum()

3217

#### Prepare the dataset


##### Drop tweetid and entity columns


In [108]:
dataset = dataset.drop(columns=["tweetid", "entity"], axis=1)

##### Drop null values


In [109]:
dataset.dropna(inplace=True)

##### Remove emojis


In [110]:
dataset["content"] = dataset["content"].apply(
    lambda x: emoji.replace_emoji(x, replace="")
)

##### Simple Preprocess


In [111]:
dataset["content"] = dataset["content"].apply(lambda x: " ".join(simple_preprocess(x)))

##### Drop null values


In [112]:
dataset.dropna(inplace=True)

##### Drop duplicates


In [113]:
dataset.drop_duplicates(inplace=True)

#### Info about the dataset after cleaning


In [114]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 65839 entries, 0 to 991
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  65839 non-null  object
 1   content    65839 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [115]:
dataset.shape

(65839, 2)

In [116]:
dataset["sentiment"].value_counts()

sentiment
Negative      20147
Positive      17868
Neutral       16193
Irrelevant    11631
Name: count, dtype: int64

In [117]:
dataset.isna().sum()

sentiment    0
content      0
dtype: int64

In [118]:
dataset.duplicated().sum()

0

#### Split the dataset into training and testing sets


In [119]:
X_train, X_test, y_train, y_test = train_test_split(
    dataset["content"], dataset["sentiment"], test_size=0.2, random_state=0
)

In [120]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((52671,), (13168,), (52671,), (13168,))

### TD-IDF - Logistic Regression


#### Importing libraries


In [121]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

#### Text Vectorization Using TF-IDF


In [122]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

#### Training a Logistic Regression model


In [123]:
model = LogisticRegression(solver="lbfgs", penalty="l2", max_iter=1000)
model.fit(X_train_tfidf, y_train)

#### Predicting


In [124]:
y_pred = model.predict(X_test_tfidf)

#### Classification report


In [125]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  Irrelevant       0.82      0.70      0.75      2304
    Negative       0.80      0.86      0.83      4024
     Neutral       0.79      0.74      0.77      3169
    Positive       0.78      0.82      0.80      3671

    accuracy                           0.79     13168
   macro avg       0.80      0.78      0.79     13168
weighted avg       0.79      0.79      0.79     13168



### TD-IDF - Random Forest Classifier


#### Importing libraries


In [126]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

#### Text Vectorization Using TF-IDF


In [127]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

#### Training a Random Forest Classifier model


In [128]:
model = RandomForestClassifier(criterion="gini")
model.fit(X_train_tfidf, y_train)

#### Predicting


In [129]:
y_pred = model.predict(X_test_tfidf)

#### Classification report


In [130]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

  Irrelevant       0.95      0.87      0.91      2304
    Negative       0.92      0.95      0.93      4024
     Neutral       0.94      0.91      0.93      3169
    Positive       0.90      0.94      0.92      3671

    accuracy                           0.93     13168
   macro avg       0.93      0.92      0.92     13168
weighted avg       0.93      0.93      0.92     13168



### Word2Vec - LSTM


#### Installation of packages


In [131]:
%pip install tensorflow
%pip install numpy



#### Importing libraries


In [132]:
from gensim.models import Word2Vec
import numpy as np
import tensorflow as tf
from sklearn.calibration import LabelEncoder

#### Function to convert text to Word2Vec vectors


In [133]:
def text_to_vector(text, word2vec, vector_size):
    words = simple_preprocess(text)
    text_vector = np.zeros(vector_size)
    word_count = 0
    for word in words:
        if word in word2vec.wv:
            text_vector += word2vec.wv[word]
            word_count += 1
    if word_count > 0:
        text_vector /= word_count
    return text_vector

#### Tokenize texts


In [134]:
tokenized_text = dataset["content"].apply(lambda x: x.split())

#### Vector size parameter


In [147]:
vector_size = 100

#### Train Word2Vec model


In [148]:
model_word2vec = Word2Vec(
    tokenized_text, window=5, min_count=2, workers=4, vector_size=vector_size, epochs=20
)

#### Convert texts to Word2Vec vectors


In [149]:
train_vectors = np.array(
    [text_to_vector(text, model_word2vec, vector_size) for text in X_train]
)

test_vectors = np.array(
    [text_to_vector(text, model_word2vec, vector_size) for text in X_test]
)

#### Find the maximum sequence length in the training set


In [150]:
max_len = max(len(seq) for seq in train_vectors)

#### Pad sequences to the same length


In [151]:
X_train_emb = tf.keras.preprocessing.sequence.pad_sequences(
    train_vectors, maxlen=max_len, dtype="float32", padding="post"
)
X_test_emb = tf.keras.preprocessing.sequence.pad_sequences(
    test_vectors, maxlen=max_len, dtype="float32", padding="post"
)

#### Encode labels


In [152]:
label_encoder = LabelEncoder()
y_train_enc = label_encoder.fit_transform(y_train)
y_test_enc = label_encoder.transform(y_test)

#### Define LSTM model


In [153]:
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(input_dim=X_train_emb.shape[1], output_dim=100),
        tf.keras.layers.LSTM(128),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(4, activation="softmax"),
    ]
)

#### Compile the model


In [154]:
model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

#### Train the model


In [155]:
model.fit(X_train_emb, y_train_enc, epochs=50, batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x790a27fa2b60>

#### Predicting


In [156]:
y_pred = model.predict(X_test_emb)

y_preds_argmax = []
for i in range(len(y_pred)):
    y_preds_argmax.append(y_pred[i].argmax())



#### Classification report


In [157]:
print(classification_report(y_test_enc, y_preds_argmax))

              precision    recall  f1-score   support

           0       0.32      0.20      0.25      2304
           1       0.46      0.62      0.53      4024
           2       0.44      0.43      0.44      3169
           3       0.45      0.39      0.42      3671

    accuracy                           0.44     13168
   macro avg       0.42      0.41      0.41     13168
weighted avg       0.43      0.44      0.42     13168

