### Importy

In [18]:
import gzip
import math
import re

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.layers import Dense, Dropout, Input
from keras.models import Sequential
from keras.regularizers import l2
from keras.utils import set_random_seed
from sklearn.model_selection import train_test_split

### Wczytywanie oraz czyszczenie danych

In [19]:
def load_and_filter_data(file_path):
    """Read a gzip-compressed, tab-separated file of labelled texts.

    Every line is stripped of surrounding whitespace and split on tabs. A line
    is accepted only when that yields exactly two fields: the first is parsed
    as an integer label and the second is kept as the text. Lines with any
    other number of fields are skipped without an error.

    Args:
        file_path: Path to the gzip file; its content is decoded as UTF-8.

    Returns:
        pandas.DataFrame with the columns ``label`` and ``text``, one row per
        accepted line, in file order.

    Raises:
        ValueError: If the first field of a two-field line is not an integer.
    """
    rows = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split('\t')
            if len(fields) != 2:
                # Malformed row (missing text or extra columns): skip it.
                continue
            rows.append((int(fields[0]), fields[1]))
    return pd.DataFrame({
        'label': [label for label, _ in rows],
        'text': [text for _, text in rows],
    })

def load_and_filter_tsv(file_path):
    """Read an unlabelled text file into a DataFrame, one row per input line.

    The result always has exactly as many rows as the file has lines, so it
    stays aligned with a parallel labels file and with any per-line output
    written later. Lines are never dropped: an empty line becomes an empty
    string, and a line that contains tab characters is kept with each tab
    replaced by a single space (so the text remains a single field).

    Args:
        file_path: Path to the text file; its content is decoded as UTF-8.

    Returns:
        pandas.DataFrame with a single ``text`` column, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # strip() removes the newline and surrounding whitespace; embedded tabs
        # are flattened instead of causing the whole row to be discarded.
        texts = [line.strip().replace('\t', ' ') for line in f]
    return pd.DataFrame({'text': texts})

def load_labels(file_path):
    """Read one integer label per line from a UTF-8 text file.

    Args:
        file_path: Path to the labels file.

    Returns:
        numpy.ndarray holding the parsed labels in file order.

    Raises:
        ValueError: If any line (after stripping whitespace) is not an
            integer; this includes blank lines.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return np.array([int(raw_line.strip()) for raw_line in handle])

def clean_text(text):
    """Normalise a text for whitespace tokenisation.

    Steps, in order: lowercase, delete digit runs, delete every character that
    is neither a word character nor whitespace, then collapse whitespace runs
    to a single space and trim both ends. Whitespace is normalised last so
    that gaps left behind by removed digits or punctuation (e.g. ``"a - b"``)
    do not survive as double spaces.

    Note that ``\\w`` is Unicode-aware for ``str`` patterns, so accented
    letters are kept; underscores also count as word characters and are kept.

    Args:
        text: Input string.

    Returns:
        The cleaned string with single spaces between tokens and no leading
        or trailing whitespace.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Must run after the removals above, which can create adjacent spaces.
    return re.sub(r'\s+', ' ', text).strip()

### Wczytywanie danych treningowych, walidacyjnych oraz testowych

In [20]:
# Load the three splits: labelled training rows, validation (dev-0) texts with
# their expected labels, and unlabelled test (test-A) texts.
train_data = load_and_filter_data('train/train.tsv.gz')
dev_data = load_and_filter_tsv('dev-0/in.tsv')
dev_labels = load_labels('dev-0/expected.tsv')
test_data = load_and_filter_tsv('test-A/in.tsv')

# Apply the same text normalisation to every split so that training and
# inference see identically preprocessed input.
train_data['text'] = train_data['text'].map(clean_text)
dev_data['text'] = dev_data['text'].map(clean_text)
test_data['text'] = test_data['text'].map(clean_text)

### Wczytywanie modelu word2vec

In [21]:
# Load the pre-trained word vectors used to embed every text below.
# NOTE(review): KeyedVectors.load reads gensim's own save() format; a raw
# word2vec ".bin" file would need load_word2vec_format(..., binary=True)
# instead — confirm how this file was produced.
# NOTE(review): gensim's native format is pickle-based, so only load files
# from a trusted source.
word2vec_model = KeyedVectors.load("word2vec_100_3_polish.bin")

### Przekształcenie danych na wektory

In [22]:
def text_to_vector(text, model):
    """Embed a text as the mean of the vectors of its known words.

    The text is split on whitespace; tokens for which ``token in model`` is
    false are ignored.

    Args:
        text: Input string.
        model: Object supporting ``token in model``, ``model[token]`` and a
            ``vector_size`` attribute (e.g. gensim ``KeyedVectors``).

    Returns:
        The element-wise mean of the vectors of all known tokens, or a zero
        vector of length ``model.vector_size`` when no token is known.
    """
    known_vectors = []
    for token in text.split():
        if token in model:
            known_vectors.append(model[token])
    if not known_vectors:
        # Empty text or only out-of-vocabulary tokens.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [23]:
def _embed_texts(texts, vectors):
    """Return an array with one averaged word-vector row per text in ``texts``."""
    return np.array([text_to_vector(text, vectors) for text in texts])


# Feature matrices for each split, plus the training targets.
X_train = _embed_texts(train_data['text'], word2vec_model)
X_dev = _embed_texts(dev_data['text'], word2vec_model)
X_test = _embed_texts(test_data['text'], word2vec_model)
y_train = np.array(train_data['label'])

### Model

In [24]:
# Seed the Python, NumPy and backend RNGs so that weight initialisation,
# dropout masks and batch shuffling are repeatable on a fresh kernel run.
set_random_seed(42)

# Feed-forward binary classifier over the averaged word vectors.
# The input width is declared with an explicit Input layer: passing
# `input_dim=` to the first Dense layer is deprecated in Keras 3 and emits a
# UserWarning when the layer is constructed.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.5),
    # Single sigmoid unit: the output is a score in (0, 1) for the positive class.
    Dense(1, activation='sigmoid'),
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [25]:
# Binary cross-entropy pairs with the single sigmoid output unit of the model.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# The dev-0 split is passed as validation data, so Keras reports val_loss and
# val_accuracy after every epoch; it is not used for weight updates.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=35,
    validation_data=(X_dev, dev_labels),
)

Epoch 1/35
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.8769 - loss: 0.4540 - val_accuracy: 0.9310 - val_loss: 0.2222
Epoch 2/35
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9270 - loss: 0.2362 - val_accuracy: 0.9303 - val_loss: 0.2106
Epoch 3/35
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9320 - loss: 0.2191 - val_accuracy: 0.9415 - val_loss: 0.1890
Epoch 4/35
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9306 - loss: 0.2139 - val_accuracy: 0.9406 - val_loss: 0.1850
Epoch 5/35
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.9322 - loss: 0.2098 - val_accuracy: 0.9395 - val_loss: 0.1883
Epoch 6/35
[1m3067/3067[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9325 - loss: 0.2074 - val_accuracy: 0.9404 - val_loss: 0.1814
Epoch 7/35
[1m3

<keras.src.callbacks.history.History at 0x280fa5dcb80>

### Ewaluacja modelu na zbiorze walidacyjnym

In [26]:
# Score the trained model on the validation split. The result is unpacked as
# the loss followed by the single metric configured at compile time (accuracy).
loss, accuracy = model.evaluate(X_dev, dev_labels)
print(f'Accuracy on validation set: {accuracy}')

[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 814us/step - accuracy: 0.9413 - loss: 0.1863
Accuracy on validation set: 0.9418562054634094


### Predykcja na danych walidacyjnych oraz testowych

In [27]:
# Raw model outputs for the validation and test feature matrices. These are the
# sigmoid scores of the final layer; they are converted to 0/1 labels with a
# 0.5 threshold in the next cell before being written to disk.
dev_predictions = model.predict(X_dev)
test_predictions = model.predict(X_test)

[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 900us/step
[1m171/171[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 765us/step


### Zapis wyników

In [28]:
# Convert scores to hard labels: strictly above 0.5 -> 1, otherwise 0.
dev_predictions = (dev_predictions > 0.5).astype(int)
test_predictions = (test_predictions > 0.5).astype(int)

# Write each split's labels as a tab-separated file without header or index.
for out_path, binary_labels in (
    ('dev-0/out.tsv', dev_predictions),
    ('test-A/out.tsv', test_predictions),
):
    pd.DataFrame(binary_labels).to_csv(out_path, sep='\t', header=False, index=False)