## Analiza sentymentu

In [1]:
import pandas as pd
import matplotlib
import tensorflow

Użyty dataset: https://www.kaggle.com/datasets/abhi8923shriv/sentiment-analysis-dataset

In [56]:
df_train = pd.read_csv("train.csv", encoding='unicode_escape')
df_test = pd.read_csv("test.csv", encoding='unicode_escape')

In [12]:
review_df = df_train[['text', 'sentiment']]

print(review_df.shape)
review_df.head(5)

(27481, 2)


Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [17]:
review_df = review_df[review_df['sentiment'] != 'neutral']

print(review_df.shape)
review_df.head(5)

(16363, 2)


Unnamed: 0,text,sentiment
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
6,2am feedings for the baby are fun when he is a...,positive


In [18]:
sentiment_label = review_df.sentiment.factorize()
sentiment_label

(array([0, 0, 0, ..., 0, 1, 1], dtype=int64),
 Index(['negative', 'positive'], dtype='object'))

In [19]:
tweet = review_df.text.values
tweet

array([' Sooo SAD I will miss you here in San Diego!!!',
       'my boss is bullying me...', ' what interview! leave me alone',
       ...,
       ' I`ve wondered about rake to.  The client has made it clear .NET only, don`t force devs to learn a new lang  #agile #ccnet',
       ' Yay good for both of you. Enjoy the break - you probably need it after such hectic weekend  Take care hun xxxx',
       ' But it was worth it  ****.'], dtype=object)

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweet)

#tutaj potrzebne jest pozbycie się neutralnych opinii

In [22]:
encoded_docs = tokenizer.texts_to_sequences(tweet)

In [23]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_sequence = pad_sequences(encoded_docs, maxlen=200)

## Klasyfikator z użyciem LSTM

In [28]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding

embedding_vector_length = 32
vocab_size = 200

model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

print(model.summary())

None


In [29]:
history = model.fit(padded_sequence,sentiment_label[0],validation_split=0.2, epochs=5, batch_size=32)

Epoch 1/5
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 96ms/step - accuracy: 0.5876 - loss: 0.6557 - val_accuracy: 0.7666 - val_loss: 0.4750
Epoch 2/5
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 94ms/step - accuracy: 0.7560 - loss: 0.4941 - val_accuracy: 0.7904 - val_loss: 0.4362
Epoch 3/5
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 97ms/step - accuracy: 0.7674 - loss: 0.4661 - val_accuracy: 0.7941 - val_loss: 0.4266
Epoch 4/5
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 96ms/step - accuracy: 0.7792 - loss: 0.4545 - val_accuracy: 0.7932 - val_loss: 0.4305
Epoch 5/5
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 98ms/step - accuracy: 0.7876 - loss: 0.4464 - val_accuracy: 0.7962 - val_loss: 0.4187


In [30]:
def predict_sentiment(text):
    tw = tokenizer.texts_to_sequences([text])
    tw = pad_sequences(tw,maxlen=200)
    prediction = int(model.predict(tw).round().item())
    predicted_label = sentiment_label[1][prediction]
    return predicted_label

In [74]:
from sklearn.metrics import classification_report

X_test = df_test[df_test['sentiment'] != 'neutral']
X_test = X_test['text'].astype(str)

In [78]:
y_pred = []

for txt in X_test:
    y_pred.append(predict_sentiment(txt))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42

In [90]:
y_test = df_test[df_test['sentiment'] != 'neutral']
y_test = y_test[['sentiment']]
print(y_test.head())
print(len(y_test))

  sentiment
1  positive
2  negative
3  positive
4  positive
5  positive
2104


In [87]:
y_labels = y_test.sentiment.factorize(list(sentiment_label[1])) 
print(y_labels)

(array([1, 0, 1, ..., 0, 1, 1], dtype=int64), Index(['negative', 'positive'], dtype='object'))


In [88]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.75      0.90      0.82      1001
    positive       0.89      0.72      0.80      1103

    accuracy                           0.81      2104
   macro avg       0.82      0.81      0.81      2104
weighted avg       0.82      0.81      0.81      2104



## TD-IDF - Logistic Regression

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [92]:
X_train = df_train[df_train['sentiment'] != 'neutral']
X_train = X_train['text'].astype(str)

X_test = df_test[df_test['sentiment'] != 'neutral']
X_test = X_test['text'].astype(str)

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [93]:
y_train = df_train[df_train['sentiment'] != 'neutral']
y_train = y_train[['sentiment']]

model = LogisticRegression(solver="lbfgs", penalty="l2", max_iter=1000)
model.fit(X_train_tfidf, y_train)

  y = column_or_1d(y, warn=True)


In [94]:
y_pred = model.predict(X_test_tfidf)

In [95]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.86      0.90      0.88      1001
    positive       0.90      0.87      0.88      1103

    accuracy                           0.88      2104
   macro avg       0.88      0.88      0.88      2104
weighted avg       0.88      0.88      0.88      2104



## TD-IDF - Linear SVM

In [96]:
from sklearn.neural_network import MLPClassifier

In [97]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [102]:
y_train = df_train[df_train['sentiment'] != 'neutral']
y_train = y_train[['sentiment']]

model = MLPClassifier(solver="lbfgs", max_iter=1000)
model.fit(X_train_tfidf, y_train)

  y = column_or_1d(y, warn=True)


In [103]:
y_pred = model.predict(X_test_tfidf)

In [104]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.83      0.84      0.83      1001
    positive       0.85      0.84      0.85      1103

    accuracy                           0.84      2104
   macro avg       0.84      0.84      0.84      2104
weighted avg       0.84      0.84      0.84      2104



## TD-IDF - Random Forest

In [105]:
from sklearn.ensemble import RandomForestClassifier

In [106]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [108]:
y_train = df_train[df_train['sentiment'] != 'neutral']
y_train = y_train[['sentiment']]

model = RandomForestClassifier()
model.fit(X_train_tfidf, y_train)

  model.fit(X_train_tfidf, y_train)


In [109]:
y_pred = model.predict(X_test_tfidf)

In [110]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    negative       0.83      0.89      0.86      1001
    positive       0.89      0.84      0.86      1103

    accuracy                           0.86      2104
   macro avg       0.86      0.86      0.86      2104
weighted avg       0.86      0.86      0.86      2104

