213 KiB
213 KiB
Importy
import pandas as pd
import os
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.optimizers import Adam
from transformers import pipeline
from tqdm import tqdm
from keras_preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
Pobieranie danych
# IPython shell escape: download the BBC full-text dataset with the Kaggle CLI
# and unzip it (the `--unzip` flag) into the current working directory.
!kaggle datasets download -d shivamkushwaha/bbc-full-text-document-classification --unzip
Warning: Looks like you're using an outdated API Version, please consider updating (server 1.6.14 / client 1.6.11) Dataset URL: https://www.kaggle.com/datasets/shivamkushwaha/bbc-full-text-document-classification License(s): DbCL-1.0 Downloading bbc-full-text-document-classification.zip to C:\Users\adamw\PycharmProjects\pythonProject\dl_projekt
0%| | 0.00/5.59M [00:00<?, ?B/s] 18%|#7 | 1.00M/5.59M [00:00<00:03, 1.58MB/s] 36%|###5 | 2.00M/5.59M [00:00<00:01, 3.16MB/s] 72%|#######1 | 4.00M/5.59M [00:00<00:00, 6.34MB/s] 100%|##########| 5.59M/5.59M [00:00<00:00, 6.05MB/s]
Sprawdzenie dostępności GPU
# Check GPU availability
# Restrict CUDA to device "0" and set TensorFlow's native log level to "3"
# (presumably to quieten its C++ logging -- confirm against the TF docs).
os.environ.update({"CUDA_VISIBLE_DEVICES": "0", 'TF_CPP_MIN_LOG_LEVEL': '3'})

# Query the GPU list once for the count, then again for the printed listing.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
gpu_count = len(physical_devices)
print("Num GPUs Available: ", gpu_count)
print(tf.config.list_physical_devices('GPU'))
Num GPUs Available: 1 [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Ładowanie danych
datapath = 'bbc/'


def read_article(path):
    """Read one article file and return ``(title, body)``.

    The first line of the file is the title (newline stripped). Every
    following line is appended to the body, each preceded by a single space
    and with its newline removed. An empty file yields ``('', '')`` so that
    callers always get exactly one title and one body per file.
    """
    with open(path, 'r', encoding="utf8", errors='ignore') as infile:
        lines = [line.replace('\n', '') for line in infile]
    if not lines:
        return '', ''
    # Build the body in one pass instead of repeated string concatenation.
    body = ''.join(' ' + line for line in lines[1:])
    return lines[0], body


def load_corpus(root):
    """Walk *root* and collect every article into five parallel lists.

    Returns ``(directory, file, title, text, label)`` where the label is the
    name of the folder that directly contains the file. Files named
    ``README.TXT`` are skipped. Directories and files are visited in sorted
    order so that the resulting row order (and therefore any seeded
    train/test split) does not depend on the filesystem.
    """
    directory, file, title, text, label = [], [], [], [], []
    for dirname, dirnames, filenames in os.walk(root):
        dirnames.sort()  # in-place sort makes os.walk descend deterministically
        for filename in sorted(filenames):
            if filename == 'README.TXT':
                continue
            article_title, article_text = read_article(os.path.join(dirname, filename))
            directory.append(dirname)
            file.append(filename)
            # basename/normpath handles both '/' and '\\' separators, unlike split('/').
            label.append(os.path.basename(os.path.normpath(dirname)))
            title.append(article_title)
            text.append(article_text)
    return directory, file, title, text, label


directory, file, title, text, label = load_corpus(datapath)
Konwersja na DataFrame
# Combine the parallel lists into one row per article.
all_columns = ['directory', 'file', 'title', 'text', 'label']
article_rows = list(zip(directory, file, title, text, label))
df = pd.DataFrame(article_rows, columns=all_columns)

# Only the title, body text and class label are used from here on.
df = df[['title', 'text', 'label']]
df.head()
title | text | label | |
---|---|---|---|
0 | Ad sales boost Time Warner profit | Quarterly profits at US media giant TimeWarn... | business |
1 | Dollar gains on Greenspan speech | The dollar has hit its highest level against... | business |
2 | Yukos unit buyer faces loan claim | The owners of embattled Russian oil giant Yu... | business |
3 | High fuel prices hit BA's profits | British Airways has blamed high fuel prices ... | business |
4 | Pernod takeover talk lifts Domecq | Shares in UK drinks and food firm Allied Dom... | business |
# Show the (rows, columns) dimensions of the article table.
df.shape
(2225, 3)
# List the distinct class labels present in the data.
df["label"].unique()
array(['business', 'entertainment', 'politics', 'sport', 'tech'], dtype=object)
df.isnull().sum() # Check for missing values in each column
title 0 text 0 label 0 dtype: int64
# Bar chart of how many articles belong to each class.
label_counts = df['label'].value_counts()
class_names_axis = label_counts.index
articles_per_class = label_counts.values

plt.figure(figsize=(6, 6))
sns.barplot(x=class_names_axis, y=articles_per_class)
plt.title("Ilość elementów w danej klasie")
plt.xlabel("Type")
plt.ylabel("Number of Articles")
plt.show()
Podział danych na zbiory treningowy, walidacyjny i testowy
# First hold out 20% of all articles as the test set ...
X_train_full, X_test, y_train_full, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)
# ... then hold out 20% of the remainder as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=42,
)
X_train.shape, X_val.shape, X_test.shape
((1424,), (356,), (445,))
Tf-idf z wykorzystaniem Naive Bayes
# Fit the TF-IDF vocabulary on the training texts only, then reuse it for the test texts.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a multinomial Naive Bayes classifier on the TF-IDF features.
nb_classifier = MultinomialNB()
nb_classifier.fit(X=X_train_tfidf, y=y_train)
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MultinomialNB()
# Score the Naive Bayes classifier on the held-out test split.
y_pred = nb_classifier.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Naive Bayes Accuracy: {nb_accuracy}')
Naive Bayes Accuracy: 0.9707865168539326
# Pin the class order explicitly so matrix rows/columns and the axis tick
# labels are guaranteed to refer to the same classes.
class_names = list(nb_classifier.classes_)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)

plt.figure(figsize=(10, 7))
# Show class names on both axes instead of the default 0..n-1 indices.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Naive Bayes Confusion Matrix')
plt.show()

# Per-class precision / recall / F1 for the Naive Bayes predictions.
print(classification_report(y_test, y_pred))
precision recall f1-score support business 0.97 0.97 0.97 115 entertainment 0.99 0.93 0.96 72 politics 0.93 0.97 0.95 76 sport 1.00 0.99 1.00 102 tech 0.96 0.99 0.98 80 accuracy 0.97 445 macro avg 0.97 0.97 0.97 445 weighted avg 0.97 0.97 0.97 445
LSTM
Przygotowanie danych
# Map the string class labels to integer ids, using the classes seen in training.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(y_train).reshape(-1, 1)

# The `sparse=False` keyword was deprecated in scikit-learn 1.2 and removed in
# 1.4. Using the default (sparse) output and densifying with .toarray() yields
# the same dense one-hot matrix on every scikit-learn version.
onehot_encoder = OneHotEncoder()
y_train = onehot_encoder.fit_transform(integer_encoded).toarray()

# Validation and test labels reuse the encoders fitted on the training labels.
integer_encoded = label_encoder.transform(y_val).reshape(-1, 1)
y_val = onehot_encoder.transform(integer_encoded).toarray()

integer_encoded = label_encoder.transform(y_test).reshape(-1, 1)
y_test = onehot_encoder.transform(integer_encoded).toarray()
Tokenizacja
# Build the vocabulary from the training texts only (capped via num_words=5000).
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)


def _to_padded(texts):
    """Convert raw texts to integer sequences padded/truncated to length 1000."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=1000)


# The right-hand tuple captures the raw-text splits before any name is rebound.
X_train, X_val, X_test = [_to_padded(split) for split in (X_train, X_val, X_test)]
Model
# Two stacked LSTM layers on top of a learned embedding, each followed by
# dropout, ending in a softmax over the one-hot label columns.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=1000))
model.add(LSTM(128, return_sequences=True))  # emit the full sequence for the next LSTM
model.add(Dropout(0.2))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1], activation='softmax'))

optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Train for 15 epochs, reporting loss/accuracy on the validation split after each one.
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=15, batch_size=64)
Epoch 1/15 23/23 [==============================] - 8s 147ms/step - loss: 1.5697 - accuracy: 0.2725 - val_loss: 1.2654 - val_accuracy: 0.4410 Epoch 2/15 23/23 [==============================] - 2s 108ms/step - loss: 1.1485 - accuracy: 0.4698 - val_loss: 1.1248 - val_accuracy: 0.5197 Epoch 3/15 23/23 [==============================] - 2s 109ms/step - loss: 0.8194 - accuracy: 0.6678 - val_loss: 0.6958 - val_accuracy: 0.8090 Epoch 4/15 23/23 [==============================] - 2s 107ms/step - loss: 0.3153 - accuracy: 0.9178 - val_loss: 0.4955 - val_accuracy: 0.8624 Epoch 5/15 23/23 [==============================] - 2s 104ms/step - loss: 0.1949 - accuracy: 0.9396 - val_loss: 0.4209 - val_accuracy: 0.8567 Epoch 6/15 23/23 [==============================] - 2s 106ms/step - loss: 0.0780 - accuracy: 0.9860 - val_loss: 0.5346 - val_accuracy: 0.8567 Epoch 7/15 23/23 [==============================] - 2s 106ms/step - loss: 0.0673 - accuracy: 0.9874 - val_loss: 0.4814 - val_accuracy: 0.8904 Epoch 8/15 23/23 [==============================] - 2s 109ms/step - loss: 0.0527 - accuracy: 0.9888 - val_loss: 0.4456 - val_accuracy: 0.8792 Epoch 9/15 23/23 [==============================] - 2s 108ms/step - loss: 0.0259 - accuracy: 0.9944 - val_loss: 0.4536 - val_accuracy: 0.8736 Epoch 10/15 23/23 [==============================] - 2s 106ms/step - loss: 0.0147 - accuracy: 0.9993 - val_loss: 0.4479 - val_accuracy: 0.8792 Epoch 11/15 23/23 [==============================] - 2s 107ms/step - loss: 0.0179 - accuracy: 0.9958 - val_loss: 0.5509 - val_accuracy: 0.8764 Epoch 12/15 23/23 [==============================] - 2s 106ms/step - loss: 0.0269 - accuracy: 0.9951 - val_loss: 0.4670 - val_accuracy: 0.8764 Epoch 13/15 23/23 [==============================] - 2s 106ms/step - loss: 0.0150 - accuracy: 0.9972 - val_loss: 0.5061 - val_accuracy: 0.8652 Epoch 14/15 23/23 [==============================] - 2s 106ms/step - loss: 0.0094 - accuracy: 0.9993 - val_loss: 0.4863 - val_accuracy: 0.8708 Epoch 
15/15 23/23 [==============================] - 2s 106ms/step - loss: 0.0050 - accuracy: 1.0000 - val_loss: 0.4513 - val_accuracy: 0.8904
<keras.callbacks.History at 0x1d303c4ba00>
Ocena modelu
# model.evaluate returns [loss, accuracy] because 'accuracy' is the only compiled metric.
val_metrics = model.evaluate(X_val, y_val)
test_metrics = model.evaluate(X_test, y_test)
val_loss, val_accuracy = val_metrics
test_loss, test_accuracy = test_metrics

print(f'LSTM Validation Accuracy: {val_accuracy}, loss: {val_loss}')
print(f'LSTM Test Accuracy: {test_accuracy}, loss: {test_loss}')
12/12 [==============================] - 0s 38ms/step - loss: 0.4513 - accuracy: 0.8904 14/14 [==============================] - 1s 34ms/step - loss: 0.5677 - accuracy: 0.8629 LSTM Validation Accuracy: 0.8904494643211365, loss: 0.4512944519519806 LSTM Test Accuracy: 0.8629213571548462, loss: 0.5676819086074829
# Softmax probabilities, one row per test article and one column per class.
y_pred = model.predict(X_test)

# Take the most probable class column explicitly. The one-hot columns follow
# the LabelEncoder's integer ids, so the argmax index is the integer label.
# Passing these 1-D arrays (rather than an (n, 1) column vector) to
# LabelEncoder.inverse_transform avoids scikit-learn's DataConversionWarning.
y_pred = label_encoder.inverse_transform(y_pred.argmax(axis=1))

# Turn the one-hot test labels back into their original string labels.
y_test = label_encoder.inverse_transform(y_test.argmax(axis=1))
14/14 [==============================] - 1s 35ms/step
C:\Users\adamw\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\preprocessing\_label.py:154: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel(). y = column_or_1d(y, warn=True)
# Pin the class order explicitly so matrix rows/columns and the axis tick
# labels are guaranteed to refer to the same classes.
class_names = list(label_encoder.classes_)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_names)

plt.figure(figsize=(10, 7))
# Show class names on both axes instead of the default 0..n-1 indices.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('LSTM Confusion Matrix')
plt.show()

# Per-class precision / recall / F1 for the LSTM predictions.
print(classification_report(y_test, y_pred))
precision recall f1-score support business 0.86 0.86 0.86 115 entertainment 0.92 0.79 0.85 72 politics 0.72 0.93 0.81 76 sport 0.96 0.88 0.92 102 tech 0.89 0.84 0.86 80 accuracy 0.86 445 macro avg 0.87 0.86 0.86 445 weighted avg 0.87 0.86 0.86 445
Transformers pipeline na pre-trenowanym modelu
# Encode labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Split data (same sizes and seed as the earlier split, now with integer labels)
X_train_full, X_test, y_train_full, y_test = train_test_split(df['text'], df['label_encoded'], test_size=0.2,
                                                              random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# NOTE: 'distilbert-base-uncased' has no trained classification head. Loading
# it into a 'text-classification' pipeline creates a randomly initialised
# 2-way classifier, which cannot predict these five classes and scores at
# chance level. A zero-shot pipeline uses a model that is already trained
# (on NLI) and ranks the dataset's own class names for each article, so no
# fine-tuning is needed.
candidate_labels = list(label_encoder.classes_)
label_to_id = {name: idx for idx, name in enumerate(candidate_labels)}
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')


def get_predictions(texts):
    """Return the predicted integer class id for every text in *texts*.

    Each text is scored against ``candidate_labels`` by the zero-shot
    pipeline; the top-ranked label name is mapped back to the integer id
    assigned by ``label_encoder``. Progress is shown with a tqdm bar.
    """
    predictions = []
    for text in tqdm(texts, desc="Processing"):
        result = classifier(text, candidate_labels=candidate_labels)
        # result['labels'] is ordered from most to least likely.
        predictions.append(label_to_id[result['labels'][0]])
    return predictions
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
# Run the transformer classifier over the validation and test texts.
val_predictions = get_predictions(X_val)
test_predictions = get_predictions(X_test)

# Accuracy = fraction of predictions that equal the encoded true label.
trans_val_accuracy = accuracy_score(y_val, val_predictions)
trans_test_accuracy = accuracy_score(y_test, test_predictions)

print(f'Pre-trained Model Validation Accuracy: {trans_val_accuracy}')
print(f'Pre-trained Model Test Accuracy: {trans_test_accuracy}')
Processing: 100%|██████████| 356/356 [02:00<00:00, 2.96it/s] Processing: 100%|██████████| 445/445 [02:47<00:00, 2.66it/s]
Pre-trained Model Validation Accuracy: 0.20224719101123595 Pre-trained Model Test Accuracy: 0.25842696629213485
# y_test and test_predictions hold integer class ids here; recover the class
# names from the encoder so the plot and the report are human-readable.
class_names = list(label_encoder.classes_)
class_ids = list(range(len(class_names)))
conf_matrix = confusion_matrix(y_test, test_predictions, labels=class_ids)

plt.figure(figsize=(10, 7))
# Show class names on both axes instead of the raw integer ids.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Pre-trained Transformer Confusion Matrix')
plt.show()

# zero_division=0 keeps the reported value (0.0) for classes that were never
# predicted, without emitting an UndefinedMetricWarning for each of them.
print(classification_report(y_test, test_predictions, labels=class_ids,
                            target_names=class_names, zero_division=0))
precision recall f1-score support 0 0.26 0.99 0.41 115 1 0.14 0.01 0.03 72 2 0.00 0.00 0.00 76 3 0.00 0.00 0.00 102 4 0.00 0.00 0.00 80 accuracy 0.26 445 macro avg 0.08 0.20 0.09 445 weighted avg 0.09 0.26 0.11 445
C:\Users\adamw\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\metrics\_classification.py:1327: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\adamw\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\metrics\_classification.py:1327: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\adamw\PycharmProjects\pythonProject\venv\lib\site-packages\sklearn\metrics\_classification.py:1327: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
Podsumowanie
# Naive Bayes was only ever scored on the test split, so `nb_accuracy` is a
# test accuracy and must not be reported as validation accuracy. Rebuild the
# raw-text validation split (same sizes and seed as the original split, so the
# same rows) and score the fitted Naive Bayes pipeline on it.
nb_fit_text, nb_test_text, nb_fit_labels, nb_test_labels = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42)
nb_train_text, nb_val_text, nb_train_labels, nb_val_labels = train_test_split(
    nb_fit_text, nb_fit_labels, test_size=0.2, random_state=42)
nb_val_accuracy = accuracy_score(
    nb_val_labels, nb_classifier.predict(tfidf_vectorizer.transform(nb_val_text)))

# One row per model with its validation and test accuracy.
results = pd.DataFrame({
    'Model': ['Naive Bayes', 'LSTM', 'Pre-trained Transformer'],
    'Validation Accuracy': [nb_val_accuracy, val_accuracy, trans_val_accuracy],
    'Test Accuracy': [nb_accuracy, test_accuracy, trans_test_accuracy]
})

# Bar chart comparing validation accuracy across models.
plt.figure(figsize=(10, 6))
sns.barplot(x='Model', y='Validation Accuracy', data=results)
plt.title('Porównanie dokładności walidacyjnej modeli')
plt.ylabel('Validation Accuracy')
plt.show()

# Bar chart comparing test accuracy across models.
plt.figure(figsize=(10, 6))
sns.barplot(x='Model', y='Test Accuracy', data=results)
plt.title('Porównanie dokładności testowej modeli')
plt.ylabel('Test Accuracy')
plt.show()