Added word2vec solution

This commit is contained in:
AWieczarek 2024-05-19 18:14:03 +02:00
commit d905b5fde9
6 changed files with 27716 additions and 0 deletions

5452
dev-0/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
dev-0/in.tsv Normal file

File diff suppressed because it is too large Load Diff

5452
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

5447
test-A/in.tsv Normal file

File diff suppressed because it is too large Load Diff

5445
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

468
word2vec.ipynb Normal file
View File

@ -0,0 +1,468 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"### Importy"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": true,
"ExecuteTime": {
"start_time": "2024-05-19T18:08:45.407869Z",
"end_time": "2024-05-19T18:08:45.510869Z"
}
},
"outputs": [],
"source": [
"import gzip\n",
"import math\n",
"import re\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from gensim.models import KeyedVectors\n",
"from keras.layers import Dense, Dropout\n",
"from keras.models import Sequential\n",
"from keras.regularizers import l2\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "markdown",
"source": [
"### Wczytywanie oraz czyszczenie danych"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [],
"source": [
"def load_and_filter_data(file_path):\n",
" texts = []\n",
" labels = []\n",
" with gzip.open(file_path, 'rt', encoding='utf-8') as f:\n",
" for line in f:\n",
" parts = line.strip().split('\\t')\n",
" if len(parts) == 2:\n",
" labels.append(int(parts[0]))\n",
" texts.append(parts[1])\n",
" data = pd.DataFrame({'label': labels, 'text': texts})\n",
" return data\n",
"\n",
"def load_and_filter_tsv(file_path):\n",
" texts = []\n",
" with open(file_path, 'r', encoding='utf-8') as f:\n",
" for line in f:\n",
" parts = line.strip().split('\\t')\n",
" if len(parts) == 1:\n",
" texts.append(parts[0])\n",
" data = pd.DataFrame({'text': texts})\n",
" return data\n",
"\n",
"def clean_text(text):\n",
" text = text.lower()\n",
" text = re.sub(r'\\d+', '', text)\n",
" text = re.sub(r'\\s+', ' ', text)\n",
" text = re.sub(r'[^\\w\\s]', '', text)\n",
" return text"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2024-05-19T18:08:45.425869Z",
"end_time": "2024-05-19T18:08:45.579869Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"### Wczytywanie danych treningowych oraz testowych"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [],
"source": [
"train_data = load_and_filter_data('train/train.tsv.gz')\n",
"train_data['text'] = train_data['text'].apply(clean_text)\n",
"dev_data = load_and_filter_tsv('dev-0/in.tsv')\n",
"dev_data['text'] = dev_data['text'].apply(clean_text)\n",
"test_data = load_and_filter_tsv('test-A/in.tsv')\n",
"test_data['text'] = test_data['text'].apply(clean_text)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2024-05-19T18:08:45.435869Z",
"end_time": "2024-05-19T18:08:48.741093Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"### Wczytywanie modelu word2vec"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"word2vec_model = KeyedVectors.load(\"word2vec_100_3_polish.bin\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2024-05-19T18:08:48.743093Z",
"end_time": "2024-05-19T18:09:04.607384Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"### Przekształcenie danych na wektory"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [],
"source": [
"def text_to_vector(text, model):\n",
" words = text.split()\n",
" word_vecs = [model[word] for word in words if word in model]\n",
" return np.mean(word_vecs, axis=0) if len(word_vecs) > 0 else np.zeros(model.vector_size)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2024-05-19T18:09:04.609383Z",
"end_time": "2024-05-19T18:09:04.621383Z"
}
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"X = np.array([text_to_vector(text, word2vec_model) for text in train_data['text']])\n",
"y = np.array(train_data['label'])"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2024-05-19T18:09:04.623384Z",
"end_time": "2024-05-19T18:09:12.703303Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"### Dodatkowy podział danych na zbiór treningowy oraz walidacyjny"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [
"X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2024-05-19T18:09:12.705305Z",
"end_time": "2024-05-19T18:09:12.749303Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"### Model"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\adamw\\PycharmProjects\\pythonProject\\venv\\lib\\site-packages\\keras\\src\\layers\\core\\dense.py:86: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.\n",
" super().__init__(activity_regularizer=activity_regularizer, **kwargs)\n"
]
}
],
"source": [
"model = Sequential()\n",
"model.add(Dense(256, input_dim=X_train.shape[1], activation='relu', kernel_regularizer=l2(0.001)))\n",
"model.add(Dropout(0.5))\n",
"model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))\n",
"model.add(Dropout(0.5))\n",
"model.add(Dense(1, activation='sigmoid'))"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2024-05-19T18:09:12.750302Z",
"end_time": "2024-05-19T18:09:12.954821Z"
}
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 2ms/step - accuracy: 0.8631 - loss: 0.4892 - val_accuracy: 0.9238 - val_loss: 0.2468\n",
"Epoch 2/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9240 - loss: 0.2481 - val_accuracy: 0.9367 - val_loss: 0.2040\n",
"Epoch 3/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9289 - loss: 0.2213 - val_accuracy: 0.9377 - val_loss: 0.1938\n",
"Epoch 4/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9293 - loss: 0.2195 - val_accuracy: 0.9417 - val_loss: 0.1869\n",
"Epoch 5/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9328 - loss: 0.2120 - val_accuracy: 0.9364 - val_loss: 0.1930\n",
"Epoch 6/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9302 - loss: 0.2114 - val_accuracy: 0.9384 - val_loss: 0.1898\n",
"Epoch 7/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9312 - loss: 0.2134 - val_accuracy: 0.9438 - val_loss: 0.1803\n",
"Epoch 8/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9316 - loss: 0.2091 - val_accuracy: 0.9413 - val_loss: 0.1822\n",
"Epoch 9/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9330 - loss: 0.2104 - val_accuracy: 0.9228 - val_loss: 0.2174\n",
"Epoch 10/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9325 - loss: 0.2093 - val_accuracy: 0.9402 - val_loss: 0.1839\n",
"Epoch 11/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9298 - loss: 0.2123 - val_accuracy: 0.9411 - val_loss: 0.1834\n",
"Epoch 12/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9323 - loss: 0.2071 - val_accuracy: 0.9445 - val_loss: 0.1774\n",
"Epoch 13/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9326 - loss: 0.2089 - val_accuracy: 0.9439 - val_loss: 0.1786\n",
"Epoch 14/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9329 - loss: 0.2050 - val_accuracy: 0.9387 - val_loss: 0.1866\n",
"Epoch 15/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9331 - loss: 0.2035 - val_accuracy: 0.9447 - val_loss: 0.1815\n",
"Epoch 16/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9326 - loss: 0.2078 - val_accuracy: 0.9352 - val_loss: 0.1954\n",
"Epoch 17/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9331 - loss: 0.2059 - val_accuracy: 0.9436 - val_loss: 0.1762\n",
"Epoch 18/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9332 - loss: 0.2050 - val_accuracy: 0.9437 - val_loss: 0.1765\n",
"Epoch 19/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9343 - loss: 0.2038 - val_accuracy: 0.9452 - val_loss: 0.1788\n",
"Epoch 20/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9343 - loss: 0.2037 - val_accuracy: 0.9368 - val_loss: 0.1887\n",
"Epoch 21/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9326 - loss: 0.2054 - val_accuracy: 0.9435 - val_loss: 0.1773\n",
"Epoch 22/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9327 - loss: 0.2059 - val_accuracy: 0.9417 - val_loss: 0.1813\n",
"Epoch 23/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9333 - loss: 0.2041 - val_accuracy: 0.9405 - val_loss: 0.1809\n",
"Epoch 24/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9340 - loss: 0.2045 - val_accuracy: 0.9393 - val_loss: 0.1840\n",
"Epoch 25/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9324 - loss: 0.2046 - val_accuracy: 0.9405 - val_loss: 0.1833\n",
"Epoch 26/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9338 - loss: 0.2030 - val_accuracy: 0.9404 - val_loss: 0.1825\n",
"Epoch 27/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9346 - loss: 0.2051 - val_accuracy: 0.9385 - val_loss: 0.1875\n",
"Epoch 28/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9306 - loss: 0.2091 - val_accuracy: 0.9431 - val_loss: 0.1784\n",
"Epoch 29/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9352 - loss: 0.2033 - val_accuracy: 0.9396 - val_loss: 0.1877\n",
"Epoch 30/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9333 - loss: 0.2037 - val_accuracy: 0.9403 - val_loss: 0.1808\n",
"Epoch 31/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9313 - loss: 0.2090 - val_accuracy: 0.9413 - val_loss: 0.1783\n",
"Epoch 32/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9340 - loss: 0.2063 - val_accuracy: 0.9428 - val_loss: 0.1815\n",
"Epoch 33/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9324 - loss: 0.2029 - val_accuracy: 0.9405 - val_loss: 0.1822\n",
"Epoch 34/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9328 - loss: 0.2046 - val_accuracy: 0.9411 - val_loss: 0.1824\n",
"Epoch 35/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9322 - loss: 0.2063 - val_accuracy: 0.9414 - val_loss: 0.1820\n"
]
},
{
"data": {
"text/plain": "<keras.src.callbacks.history.History at 0x2809ceeab60>"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
"model.fit(X_train, y_train, epochs=35, batch_size=32, validation_data=(X_val, y_val))"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2024-05-19T18:09:12.957822Z",
"end_time": "2024-05-19T18:11:23.248486Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"### Ewaluacja modelu na zbiorze walidacyjnym"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[1m614/614\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 766us/step - accuracy: 0.9409 - loss: 0.1851\n",
"Accuracy on validation set: 0.9413533210754395\n"
]
}
],
"source": [
"loss, accuracy = model.evaluate(X_val, y_val)\n",
"print(f'Accuracy on validation set: {accuracy}')"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2024-05-19T18:11:23.208454Z",
"end_time": "2024-05-19T18:11:23.753363Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"### Predykcja na danych walidacyjnych oraz testowych"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[1m171/171\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 882us/step\n",
"\u001B[1m171/171\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 700us/step\n"
]
}
],
"source": [
"X_dev = np.array([text_to_vector(text, word2vec_model) for text in dev_data['text']])\n",
"X_test = np.array([text_to_vector(text, word2vec_model) for text in test_data['text']])\n",
"\n",
"dev_predictions = model.predict(X_dev)\n",
"test_predictions = model.predict(X_test)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2024-05-19T18:11:23.754367Z",
"end_time": "2024-05-19T18:11:25.114539Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"### Zapis wyników"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 17,
"outputs": [],
"source": [
"dev_predictions = (dev_predictions > 0.5).astype(int)\n",
"test_predictions = (test_predictions > 0.5).astype(int)\n",
"\n",
"pd.DataFrame(dev_predictions).to_csv('dev-0/out.tsv', sep='\\t', header=False, index=False)\n",
"pd.DataFrame(test_predictions).to_csv('test-A/out.tsv', sep='\\t', header=False, index=False)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2024-05-19T18:11:25.117540Z",
"end_time": "2024-05-19T18:11:25.149572Z"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}