19 KiB
19 KiB
Importy
import gzip
import math
import re

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.layers import Dense, Dropout, Input
from keras.models import Sequential
from keras.regularizers import l2
from sklearn.model_selection import train_test_split
Wczytywanie oraz czyszczenie danych
def load_and_filter_data(file_path):
    """Read a gzip-compressed, tab-separated file of ``label<TAB>text`` rows.

    Every line is stripped of surrounding whitespace and split on tabs.
    Only lines that produce exactly two fields are kept; all other lines
    are skipped without any message.

    Args:
        file_path: Path to the gzip-compressed file, decoded as UTF-8.

    Returns:
        A DataFrame with a ``label`` column (first field parsed with
        ``int``) and a ``text`` column (second field, unchanged).

    Raises:
        ValueError: If the first field of a kept row is not an integer
            literal.
    """
    rows = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) != 2:
                # Malformed row (missing or extra columns): ignore it.
                continue
            rows.append((int(fields[0]), fields[1]))
    return pd.DataFrame({
        'label': [label for label, _ in rows],
        'text': [text for _, text in rows],
    })
def load_and_filter_tsv(file_path):
    """Read a plain-text file holding one text per line.

    Each line is stripped of surrounding whitespace. Lines that still
    contain a tab afterwards (i.e. more than one column) are skipped;
    lines that become empty are kept as empty strings.

    NOTE(review): because rows can be skipped, the result may have fewer
    rows than the file has lines. Callers that pair it with a separately
    loaded label file should verify that the lengths match.

    Args:
        file_path: Path to the UTF-8 encoded input file.

    Returns:
        A DataFrame with a single ``text`` column.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        # A stripped line splits into exactly one field iff it has no tab.
        kept = [entry for entry in stripped_lines if '\t' not in entry]
    return pd.DataFrame({'text': kept})
def load_labels(file_path):
    """Read one integer label per line into a NumPy array.

    Args:
        file_path: Path to the UTF-8 encoded label file.

    Returns:
        A NumPy array with one entry per line, in file order.

    Raises:
        ValueError: If a line, after stripping whitespace, is not an
            integer literal (this includes blank lines).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return np.array([int(raw_line.strip()) for raw_line in handle])
def clean_text(text):
    """Normalize raw text so it can be tokenized by splitting on spaces.

    Steps, in order: lower-case the text, delete runs of digits, delete
    every character that is neither a word character nor whitespace,
    collapse each whitespace run into one space, and trim the ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string: lower-case, without digits or punctuation,
        with words separated by single spaces and no surrounding padding.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Whitespace is collapsed last: removing digits or punctuation that sat
    # between two spaces (e.g. "a , b") would otherwise leave a double space.
    return re.sub(r'\s+', ' ', text).strip()
Wczytywanie danych treningowych oraz testowych
# Load the three data splits and the gold labels for the dev split.
train_data = load_and_filter_data('train/train.tsv.gz')
dev_data = load_and_filter_tsv('dev-0/in.tsv')
test_data = load_and_filter_tsv('test-A/in.tsv')
dev_labels = load_labels('dev-0/expected.tsv')

# Apply the same text normalization to every split, in place.
for frame in (train_data, dev_data, test_data):
    frame['text'] = frame['text'].apply(clean_text)
Wczytywanie modelu word2vec
# Load the pre-trained word vectors used to embed every text.
# NOTE(review): KeyedVectors.load() is gensim's loader for files written by
# its own save(); the ".bin" suffix usually denotes the original word2vec
# binary format, which is read with load_word2vec_format(..., binary=True).
# Confirm how this file was produced.
word2vec_model = KeyedVectors.load("word2vec_100_3_polish.bin")
Przekształcenie danych na wektory
def text_to_vector(text, model):
    """Embed a text as the mean of the vectors of its known words.

    The text is split on whitespace; words that are not present in
    ``model`` are ignored.

    Args:
        text: The (already cleaned) input string.
        model: Word-vector lookup supporting ``word in model``,
            ``model[word]`` and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the found word vectors, or a zero vector
        of length ``model.vector_size`` when no word is known.
    """
    known_vectors = []
    for token in text.split():
        if token in model:
            known_vectors.append(model[token])
    if not known_vectors:
        # Nothing to average: fall back to an all-zero embedding.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)
def _embed_texts(texts):
    """Return an array holding the averaged word vector of each text, in order."""
    return np.array([text_to_vector(text, word2vec_model) for text in texts])


# Feature matrices for every split, plus the training targets.
X_train = _embed_texts(train_data['text'])
y_train = np.array(train_data['label'])
X_dev = _embed_texts(dev_data['text'])
X_test = _embed_texts(test_data['text'])
Model
# Feed-forward binary classifier over the averaged word vectors:
# two L2-regularized ReLU layers, each followed by 50% dropout, and a
# single sigmoid output unit.
# The input width is declared with an explicit Input layer instead of the
# deprecated `input_dim` argument on the first Dense layer, which makes
# Keras emit a UserWarning.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
C:\Users\adamw\PycharmProjects\pythonProject\venv\lib\site-packages\keras\src\layers\core\dense.py:86: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. super().__init__(activity_regularizer=activity_regularizer, **kwargs)
# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# tracked as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Train for a fixed number of epochs, evaluating on the dev split after
# each one.
model.fit(
    X_train,
    y_train,
    validation_data=(X_dev, dev_labels),
    batch_size=32,
    epochs=35,
)
Epoch 1/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8769 - loss: 0.4540 - val_accuracy: 0.9310 - val_loss: 0.2222 Epoch 2/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9270 - loss: 0.2362 - val_accuracy: 0.9303 - val_loss: 0.2106 Epoch 3/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9320 - loss: 0.2191 - val_accuracy: 0.9415 - val_loss: 0.1890 Epoch 4/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9306 - loss: 0.2139 - val_accuracy: 0.9406 - val_loss: 0.1850 Epoch 5/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9322 - loss: 0.2098 - val_accuracy: 0.9395 - val_loss: 0.1883 Epoch 6/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9325 - loss: 0.2074 - val_accuracy: 0.9404 - val_loss: 0.1814 Epoch 7/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9320 - loss: 0.2093 - val_accuracy: 0.9441 - val_loss: 0.1810 Epoch 8/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9326 - loss: 0.2094 - val_accuracy: 0.9441 - val_loss: 0.1804 Epoch 9/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9327 - loss: 0.2064 - val_accuracy: 0.9400 - val_loss: 0.1807 Epoch 10/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9319 - loss: 0.2073 - val_accuracy: 0.9408 - val_loss: 0.1799 Epoch 11/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9324 - loss: 0.2061 - val_accuracy: 0.9391 - val_loss: 0.1826 Epoch 12/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9320 - loss: 0.2066 - val_accuracy: 0.9433 - val_loss: 0.1814 Epoch 13/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9325 - loss: 0.2066 - 
val_accuracy: 0.9382 - val_loss: 0.1882 Epoch 14/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.9330 - loss: 0.2045 - val_accuracy: 0.9406 - val_loss: 0.1813 Epoch 15/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9316 - loss: 0.2106 - val_accuracy: 0.9408 - val_loss: 0.1831 Epoch 16/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9338 - loss: 0.2036 - val_accuracy: 0.9384 - val_loss: 0.1862 Epoch 17/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9330 - loss: 0.2063 - val_accuracy: 0.9398 - val_loss: 0.1862 Epoch 18/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9320 - loss: 0.2102 - val_accuracy: 0.9408 - val_loss: 0.1802 Epoch 19/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9323 - loss: 0.2059 - val_accuracy: 0.9397 - val_loss: 0.1794 Epoch 20/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9338 - loss: 0.2039 - val_accuracy: 0.9431 - val_loss: 0.1728 Epoch 21/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9319 - loss: 0.2102 - val_accuracy: 0.9415 - val_loss: 0.1787 Epoch 22/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9351 - loss: 0.2034 - val_accuracy: 0.9433 - val_loss: 0.1780 Epoch 23/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9330 - loss: 0.2059 - val_accuracy: 0.9404 - val_loss: 0.1759 Epoch 24/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9335 - loss: 0.2042 - val_accuracy: 0.9409 - val_loss: 0.1789 Epoch 25/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.9341 - loss: 0.2052 - val_accuracy: 0.9389 - val_loss: 0.1813 Epoch 26/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 
1ms/step - accuracy: 0.9322 - loss: 0.2078 - val_accuracy: 0.9406 - val_loss: 0.1813 Epoch 27/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.9319 - loss: 0.2069 - val_accuracy: 0.9283 - val_loss: 0.2017 Epoch 28/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.9324 - loss: 0.2083 - val_accuracy: 0.9409 - val_loss: 0.1883 Epoch 29/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.9326 - loss: 0.2054 - val_accuracy: 0.9411 - val_loss: 0.1791 Epoch 30/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.9333 - loss: 0.2041 - val_accuracy: 0.9419 - val_loss: 0.1769 Epoch 31/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.9343 - loss: 0.2029 - val_accuracy: 0.9439 - val_loss: 0.1756 Epoch 32/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9330 - loss: 0.2060 - val_accuracy: 0.9384 - val_loss: 0.1805 Epoch 33/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.9333 - loss: 0.2023 - val_accuracy: 0.9395 - val_loss: 0.1780 Epoch 34/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.9347 - loss: 0.2025 - val_accuracy: 0.9408 - val_loss: 0.1806 Epoch 35/35 [1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.9315 - loss: 0.2038 - val_accuracy: 0.9419 - val_loss: 0.1762
<keras.src.callbacks.history.History at 0x280fa5dcb80>
Ewaluacja modelu na zbiorze walidacyjnym
# evaluate() yields the loss followed by the compiled metrics (accuracy).
evaluation = model.evaluate(X_dev, dev_labels)
loss, accuracy = evaluation
print(f'Accuracy on validation set: {accuracy}')
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 814us/step - accuracy: 0.9413 - loss: 0.1863 Accuracy on validation set: 0.9418562054634094
Predykcja na danych walidacyjnych oraz testowych
# Raw sigmoid outputs for the dev split first, then the test split.
dev_predictions, test_predictions = (
    model.predict(features) for features in (X_dev, X_test)
)
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 900us/step [1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 765us/step
Zapis wyników
# Dev split: threshold the sigmoid outputs at 0.5 and write one label per line.
dev_predictions = (dev_predictions > 0.5).astype(int)
pd.DataFrame(dev_predictions).to_csv(
    'dev-0/out.tsv', sep='\t', header=False, index=False
)

# Test split: same thresholding and output format.
test_predictions = (test_predictions > 0.5).astype(int)
pd.DataFrame(test_predictions).to_csv(
    'test-A/out.tsv', sep='\t', header=False, index=False
)