word2vec/word2vec.ipynb
2024-05-19 18:25:39 +02:00

19 KiB
Raw Permalink Blame History

Importy

import gzip
import math
import re

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.layers import Dense, Dropout, Input
from keras.models import Sequential
from keras.regularizers import l2
from sklearn.model_selection import train_test_split

Wczytywanie oraz czyszczenie danych

def load_and_filter_data(file_path):
    """Read a gzip-compressed, tab-separated ``label<TAB>text`` file.

    Each line is stripped of surrounding whitespace and split on tabs.
    Lines that do not yield exactly two fields are skipped.

    Args:
        file_path: Path to the ``.gz`` file, decoded as UTF-8.

    Returns:
        A DataFrame with an integer ``label`` column and a ``text`` column,
        in file order.

    Raises:
        ValueError: If the first field of a kept line is not an integer.
    """
    with gzip.open(file_path, 'rt', encoding='utf-8') as handle:
        candidates = (raw.strip().split('\t') for raw in handle)
        rows = [fields for fields in candidates if len(fields) == 2]
    return pd.DataFrame({
        'label': [int(label) for label, _ in rows],
        'text': [body for _, body in rows],
    })

def load_and_filter_tsv(file_path):
    """Read an unlabeled input file with one text per line.

    Every line of the file produces exactly one row, so the result stays
    aligned line-for-line with a separate labels file and with the
    predictions written back out. Dropping lines that contain a tab would
    silently shift every following row against its expected label.

    Args:
        file_path: Path to the plain-text file, decoded as UTF-8.

    Returns:
        A DataFrame with a single ``text`` column holding each line with
        surrounding whitespace stripped. Empty lines become empty strings.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Keep the whole line even if it contains tabs: row count must equal
        # line count.
        texts = [line.strip() for line in f]
    return pd.DataFrame({'text': texts})

def load_labels(file_path):
    """Read a file holding one integer label per line.

    Args:
        file_path: Path to the labels file, decoded as UTF-8.

    Returns:
        A NumPy array with the labels in file order.

    Raises:
        ValueError: If a line (after stripping whitespace) is not an integer.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return np.array([int(raw.strip()) for raw in handle])

def clean_text(text):
    """Normalise raw text before tokenisation.

    Steps, in order: lowercase, drop digit runs, drop every character that
    is neither a word character nor whitespace, then collapse whitespace
    runs into a single space.

    Whitespace is collapsed last on purpose: removing digits or punctuation
    that sat between two spaces (e.g. ``"a - b"``) would otherwise leave a
    double space behind.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string. Leading/trailing whitespace is reduced to at
        most one space but is not stripped.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Must run after the removals above so the gaps they leave are merged.
    text = re.sub(r'\s+', ' ', text)
    return text

Wczytywanie danych treningowych oraz testowych

# Load each split and normalise its text column in one step.
train_data = load_and_filter_data('train/train.tsv.gz').assign(
    text=lambda frame: frame['text'].apply(clean_text)
)
dev_data = load_and_filter_tsv('dev-0/in.tsv').assign(
    text=lambda frame: frame['text'].apply(clean_text)
)
# Gold labels for the dev split live in a separate file.
dev_labels = load_labels('dev-0/expected.tsv')
test_data = load_and_filter_tsv('test-A/in.tsv').assign(
    text=lambda frame: frame['text'].apply(clean_text)
)

Wczytywanie modelu word2vec

# Load the pre-trained word vectors used to embed every text
# (presumably 100-dimensional Polish embeddings, judging by the file name).
# NOTE(review): per the gensim docs, KeyedVectors.load reads gensim's native
# save() format, which is pickle-based — only load files from a trusted source.
word2vec_model = KeyedVectors.load("word2vec_100_3_polish.bin")

Przekształcenie danych na wektory

def text_to_vector(text, model):
    """Embed a text as the mean of its known word vectors.

    Args:
        text: Text to embed; it is split on whitespace.
        model: Word-vector lookup supporting ``word in model``,
            ``model[word]`` and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model``, or a zero vector of length ``model.vector_size`` when no
        token is found.
    """
    known_vectors = [model[token] for token in text.split() if token in model]
    if not known_vectors:
        # Nothing to average: fall back to an all-zero embedding.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)
# Build one feature matrix per split: each row is the averaged word vector
# of the corresponding text.
X_train, X_dev, X_test = (
    np.array([text_to_vector(text, word2vec_model) for text in split['text']])
    for split in (train_data, dev_data, test_data)
)
# Training targets, in the same row order as X_train.
y_train = np.array(train_data['label'])

Model

# Feed-forward binary classifier over the averaged word vectors:
# two L2-regularised ReLU layers, each followed by 50% dropout, and a single
# sigmoid output unit.
# The input width is declared with an explicit Input layer; passing
# `input_dim` to the first Dense layer is deprecated and makes Keras emit a
# UserWarning.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
C:\Users\adamw\PycharmProjects\pythonProject\venv\lib\site-packages\keras\src\layers\core\dense.py:86: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
# Configure training: binary cross-entropy loss, Adam optimiser, and accuracy
# reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# Train for 35 epochs in mini-batches of 32, scoring the dev split after
# every epoch.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=35,
    validation_data=(X_dev, dev_labels),
)
Epoch 1/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 6s 2ms/step - accuracy: 0.8769 - loss: 0.4540 - val_accuracy: 0.9310 - val_loss: 0.2222
Epoch 2/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - accuracy: 0.9270 - loss: 0.2362 - val_accuracy: 0.9303 - val_loss: 0.2106
Epoch 3/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - accuracy: 0.9320 - loss: 0.2191 - val_accuracy: 0.9415 - val_loss: 0.1890
Epoch 4/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 1ms/step - accuracy: 0.9306 - loss: 0.2139 - val_accuracy: 0.9406 - val_loss: 0.1850
Epoch 5/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 1ms/step - accuracy: 0.9322 - loss: 0.2098 - val_accuracy: 0.9395 - val_loss: 0.1883
Epoch 6/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - accuracy: 0.9325 - loss: 0.2074 - val_accuracy: 0.9404 - val_loss: 0.1814
Epoch 7/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 1ms/step - accuracy: 0.9320 - loss: 0.2093 - val_accuracy: 0.9441 - val_loss: 0.1810
Epoch 8/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - accuracy: 0.9326 - loss: 0.2094 - val_accuracy: 0.9441 - val_loss: 0.1804
Epoch 9/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 1ms/step - accuracy: 0.9327 - loss: 0.2064 - val_accuracy: 0.9400 - val_loss: 0.1807
Epoch 10/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 1ms/step - accuracy: 0.9319 - loss: 0.2073 - val_accuracy: 0.9408 - val_loss: 0.1799
Epoch 11/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - accuracy: 0.9324 - loss: 0.2061 - val_accuracy: 0.9391 - val_loss: 0.1826
Epoch 12/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 1ms/step - accuracy: 0.9320 - loss: 0.2066 - val_accuracy: 0.9433 - val_loss: 0.1814
Epoch 13/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 1ms/step - accuracy: 0.9325 - loss: 0.2066 - val_accuracy: 0.9382 - val_loss: 0.1882
Epoch 14/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - accuracy: 0.9330 - loss: 0.2045 - val_accuracy: 0.9406 - val_loss: 0.1813
Epoch 15/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - accuracy: 0.9316 - loss: 0.2106 - val_accuracy: 0.9408 - val_loss: 0.1831
Epoch 16/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 1ms/step - accuracy: 0.9338 - loss: 0.2036 - val_accuracy: 0.9384 - val_loss: 0.1862
Epoch 17/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - accuracy: 0.9330 - loss: 0.2063 - val_accuracy: 0.9398 - val_loss: 0.1862
Epoch 18/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 1ms/step - accuracy: 0.9320 - loss: 0.2102 - val_accuracy: 0.9408 - val_loss: 0.1802
Epoch 19/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - accuracy: 0.9323 - loss: 0.2059 - val_accuracy: 0.9397 - val_loss: 0.1794
Epoch 20/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - accuracy: 0.9338 - loss: 0.2039 - val_accuracy: 0.9431 - val_loss: 0.1728
Epoch 21/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 1ms/step - accuracy: 0.9319 - loss: 0.2102 - val_accuracy: 0.9415 - val_loss: 0.1787
Epoch 22/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 1ms/step - accuracy: 0.9351 - loss: 0.2034 - val_accuracy: 0.9433 - val_loss: 0.1780
Epoch 23/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - accuracy: 0.9330 - loss: 0.2059 - val_accuracy: 0.9404 - val_loss: 0.1759
Epoch 24/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - accuracy: 0.9335 - loss: 0.2042 - val_accuracy: 0.9409 - val_loss: 0.1789
Epoch 25/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - accuracy: 0.9341 - loss: 0.2052 - val_accuracy: 0.9389 - val_loss: 0.1813
Epoch 26/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - accuracy: 0.9322 - loss: 0.2078 - val_accuracy: 0.9406 - val_loss: 0.1813
Epoch 27/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - accuracy: 0.9319 - loss: 0.2069 - val_accuracy: 0.9283 - val_loss: 0.2017
Epoch 28/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - accuracy: 0.9324 - loss: 0.2083 - val_accuracy: 0.9409 - val_loss: 0.1883
Epoch 29/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - accuracy: 0.9326 - loss: 0.2054 - val_accuracy: 0.9411 - val_loss: 0.1791
Epoch 30/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - accuracy: 0.9333 - loss: 0.2041 - val_accuracy: 0.9419 - val_loss: 0.1769
Epoch 31/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - accuracy: 0.9343 - loss: 0.2029 - val_accuracy: 0.9439 - val_loss: 0.1756
Epoch 32/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 5s 1ms/step - accuracy: 0.9330 - loss: 0.2060 - val_accuracy: 0.9384 - val_loss: 0.1805
Epoch 33/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - accuracy: 0.9333 - loss: 0.2023 - val_accuracy: 0.9395 - val_loss: 0.1780
Epoch 34/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - accuracy: 0.9347 - loss: 0.2025 - val_accuracy: 0.9408 - val_loss: 0.1806
Epoch 35/35
3067/3067 ━━━━━━━━━━━━━━━━━━━━ 4s 1ms/step - accuracy: 0.9315 - loss: 0.2038 - val_accuracy: 0.9419 - val_loss: 0.1762
<keras.src.callbacks.history.History at 0x280fa5dcb80>

Ewaluacja modelu na zbiorze walidacyjnym

# Score the trained model on the dev split; the result is unpacked as the
# loss followed by the accuracy metric.
loss, accuracy = model.evaluate(X_dev, dev_labels)
# Report the dev-set accuracy.
print(f'Accuracy on validation set: {accuracy}')
171/171 ━━━━━━━━━━━━━━━━━━━━ 0s 814us/step - accuracy: 0.9413 - loss: 0.1863
Accuracy on validation set: 0.9418562054634094

Predykcja na danych walidacyjnych oraz testowych

# Run the trained model over the dev split first, then the test split.
dev_predictions, test_predictions = (
    model.predict(features) for features in (X_dev, X_test)
)
171/171 ━━━━━━━━━━━━━━━━━━━━ 0s 900us/step
171/171 ━━━━━━━━━━━━━━━━━━━━ 0s 765us/step

Zapis wyników

# Turn the model outputs into hard 0/1 labels using a 0.5 cut-off.
dev_predictions, test_predictions = (
    (scores > 0.5).astype(int) for scores in (dev_predictions, test_predictions)
)

# Write one label per line, without header or index, next to each input file.
for _out_path, _labels in (
    ('dev-0/out.tsv', dev_predictions),
    ('test-A/out.tsv', test_predictions),
):
    pd.DataFrame(_labels).to_csv(_out_path, sep='\t', header=False, index=False)