Added word2vec solution

This commit is contained in:
AWieczarek 2024-05-19 18:25:39 +02:00
parent d905b5fde9
commit 1a5ff2253f

View File

@ -11,12 +11,12 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 18,
"metadata": { "metadata": {
"collapsed": true, "collapsed": true,
"ExecuteTime": { "ExecuteTime": {
"start_time": "2024-05-19T18:08:45.407869Z", "start_time": "2024-05-19T18:21:27.211216Z",
"end_time": "2024-05-19T18:08:45.510869Z" "end_time": "2024-05-19T18:21:27.318205Z"
} }
}, },
"outputs": [], "outputs": [],
@ -45,7 +45,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 19,
"outputs": [], "outputs": [],
"source": [ "source": [
"def load_and_filter_data(file_path):\n", "def load_and_filter_data(file_path):\n",
@ -70,6 +70,13 @@
" data = pd.DataFrame({'text': texts})\n", " data = pd.DataFrame({'text': texts})\n",
" return data\n", " return data\n",
"\n", "\n",
"def load_labels(file_path):\n",
" labels = []\n",
" with open(file_path, 'r', encoding='utf-8') as f:\n",
" for line in f:\n",
" labels.append(int(line.strip()))\n",
" return np.array(labels)\n",
"\n",
"def clean_text(text):\n", "def clean_text(text):\n",
" text = text.lower()\n", " text = text.lower()\n",
" text = re.sub(r'\\d+', '', text)\n", " text = re.sub(r'\\d+', '', text)\n",
@ -80,8 +87,8 @@
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"start_time": "2024-05-19T18:08:45.425869Z", "start_time": "2024-05-19T18:21:27.231204Z",
"end_time": "2024-05-19T18:08:45.579869Z" "end_time": "2024-05-19T18:21:27.377342Z"
} }
} }
}, },
@ -96,21 +103,22 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 20,
"outputs": [], "outputs": [],
"source": [ "source": [
"train_data = load_and_filter_data('train/train.tsv.gz')\n", "train_data = load_and_filter_data('train/train.tsv.gz')\n",
"train_data['text'] = train_data['text'].apply(clean_text)\n", "train_data['text'] = train_data['text'].apply(clean_text)\n",
"dev_data = load_and_filter_tsv('dev-0/in.tsv')\n", "dev_data = load_and_filter_tsv('dev-0/in.tsv')\n",
"dev_data['text'] = dev_data['text'].apply(clean_text)\n", "dev_data['text'] = dev_data['text'].apply(clean_text)\n",
"dev_labels = load_labels('dev-0/expected.tsv')\n",
"test_data = load_and_filter_tsv('test-A/in.tsv')\n", "test_data = load_and_filter_tsv('test-A/in.tsv')\n",
"test_data['text'] = test_data['text'].apply(clean_text)" "test_data['text'] = test_data['text'].apply(clean_text)"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"start_time": "2024-05-19T18:08:45.435869Z", "start_time": "2024-05-19T18:21:27.241222Z",
"end_time": "2024-05-19T18:08:48.741093Z" "end_time": "2024-05-19T18:21:31.160229Z"
} }
} }
}, },
@ -125,7 +133,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 21,
"outputs": [], "outputs": [],
"source": [ "source": [
"word2vec_model = KeyedVectors.load(\"word2vec_100_3_polish.bin\")" "word2vec_model = KeyedVectors.load(\"word2vec_100_3_polish.bin\")"
@ -133,8 +141,8 @@
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"start_time": "2024-05-19T18:08:48.743093Z", "start_time": "2024-05-19T18:21:31.161230Z",
"end_time": "2024-05-19T18:09:04.607384Z" "end_time": "2024-05-19T18:21:54.895038Z"
} }
} }
}, },
@ -149,7 +157,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 22,
"outputs": [], "outputs": [],
"source": [ "source": [
"def text_to_vector(text, model):\n", "def text_to_vector(text, model):\n",
@ -160,48 +168,26 @@
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"start_time": "2024-05-19T18:09:04.609383Z", "start_time": "2024-05-19T18:21:54.900047Z",
"end_time": "2024-05-19T18:09:04.621383Z" "end_time": "2024-05-19T18:21:54.909040Z"
} }
} }
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 23,
"outputs": [], "outputs": [],
"source": [ "source": [
"X = np.array([text_to_vector(text, word2vec_model) for text in train_data['text']])\n", "X_train = np.array([text_to_vector(text, word2vec_model) for text in train_data['text']])\n",
"y = np.array(train_data['label'])" "y_train = np.array(train_data['label'])\n",
"X_dev = np.array([text_to_vector(text, word2vec_model) for text in dev_data['text']])\n",
"X_test = np.array([text_to_vector(text, word2vec_model) for text in test_data['text']])"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"start_time": "2024-05-19T18:09:04.623384Z", "start_time": "2024-05-19T18:21:54.913039Z",
"end_time": "2024-05-19T18:09:12.703303Z" "end_time": "2024-05-19T18:22:03.870813Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"### Dodatkowy podział danych na zbiór treningowy oraz walidacyjny"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [],
"source": [
"X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2024-05-19T18:09:12.705305Z",
"end_time": "2024-05-19T18:09:12.749303Z"
} }
} }
}, },
@ -216,7 +202,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 13, "execution_count": 24,
"outputs": [ "outputs": [
{ {
"name": "stderr", "name": "stderr",
@ -238,109 +224,109 @@
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"start_time": "2024-05-19T18:09:12.750302Z", "start_time": "2024-05-19T18:22:03.872859Z",
"end_time": "2024-05-19T18:09:12.954821Z" "end_time": "2024-05-19T18:22:04.122687Z"
} }
} }
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 25,
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"Epoch 1/35\n", "Epoch 1/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 2ms/step - accuracy: 0.8631 - loss: 0.4892 - val_accuracy: 0.9238 - val_loss: 0.2468\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m6s\u001B[0m 2ms/step - accuracy: 0.8769 - loss: 0.4540 - val_accuracy: 0.9310 - val_loss: 0.2222\n",
"Epoch 2/35\n", "Epoch 2/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9240 - loss: 0.2481 - val_accuracy: 0.9367 - val_loss: 0.2040\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 2ms/step - accuracy: 0.9270 - loss: 0.2362 - val_accuracy: 0.9303 - val_loss: 0.2106\n",
"Epoch 3/35\n", "Epoch 3/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9289 - loss: 0.2213 - val_accuracy: 0.9377 - val_loss: 0.1938\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 2ms/step - accuracy: 0.9320 - loss: 0.2191 - val_accuracy: 0.9415 - val_loss: 0.1890\n",
"Epoch 4/35\n", "Epoch 4/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9293 - loss: 0.2195 - val_accuracy: 0.9417 - val_loss: 0.1869\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 1ms/step - accuracy: 0.9306 - loss: 0.2139 - val_accuracy: 0.9406 - val_loss: 0.1850\n",
"Epoch 5/35\n", "Epoch 5/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9328 - loss: 0.2120 - val_accuracy: 0.9364 - val_loss: 0.1930\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 1ms/step - accuracy: 0.9322 - loss: 0.2098 - val_accuracy: 0.9395 - val_loss: 0.1883\n",
"Epoch 6/35\n", "Epoch 6/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9302 - loss: 0.2114 - val_accuracy: 0.9384 - val_loss: 0.1898\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 2ms/step - accuracy: 0.9325 - loss: 0.2074 - val_accuracy: 0.9404 - val_loss: 0.1814\n",
"Epoch 7/35\n", "Epoch 7/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9312 - loss: 0.2134 - val_accuracy: 0.9438 - val_loss: 0.1803\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 1ms/step - accuracy: 0.9320 - loss: 0.2093 - val_accuracy: 0.9441 - val_loss: 0.1810\n",
"Epoch 8/35\n", "Epoch 8/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9316 - loss: 0.2091 - val_accuracy: 0.9413 - val_loss: 0.1822\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 2ms/step - accuracy: 0.9326 - loss: 0.2094 - val_accuracy: 0.9441 - val_loss: 0.1804\n",
"Epoch 9/35\n", "Epoch 9/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9330 - loss: 0.2104 - val_accuracy: 0.9228 - val_loss: 0.2174\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 1ms/step - accuracy: 0.9327 - loss: 0.2064 - val_accuracy: 0.9400 - val_loss: 0.1807\n",
"Epoch 10/35\n", "Epoch 10/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9325 - loss: 0.2093 - val_accuracy: 0.9402 - val_loss: 0.1839\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 1ms/step - accuracy: 0.9319 - loss: 0.2073 - val_accuracy: 0.9408 - val_loss: 0.1799\n",
"Epoch 11/35\n", "Epoch 11/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9298 - loss: 0.2123 - val_accuracy: 0.9411 - val_loss: 0.1834\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 2ms/step - accuracy: 0.9324 - loss: 0.2061 - val_accuracy: 0.9391 - val_loss: 0.1826\n",
"Epoch 12/35\n", "Epoch 12/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9323 - loss: 0.2071 - val_accuracy: 0.9445 - val_loss: 0.1774\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 1ms/step - accuracy: 0.9320 - loss: 0.2066 - val_accuracy: 0.9433 - val_loss: 0.1814\n",
"Epoch 13/35\n", "Epoch 13/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9326 - loss: 0.2089 - val_accuracy: 0.9439 - val_loss: 0.1786\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 1ms/step - accuracy: 0.9325 - loss: 0.2066 - val_accuracy: 0.9382 - val_loss: 0.1882\n",
"Epoch 14/35\n", "Epoch 14/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9329 - loss: 0.2050 - val_accuracy: 0.9387 - val_loss: 0.1866\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9330 - loss: 0.2045 - val_accuracy: 0.9406 - val_loss: 0.1813\n",
"Epoch 15/35\n", "Epoch 15/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9331 - loss: 0.2035 - val_accuracy: 0.9447 - val_loss: 0.1815\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 2ms/step - accuracy: 0.9316 - loss: 0.2106 - val_accuracy: 0.9408 - val_loss: 0.1831\n",
"Epoch 16/35\n", "Epoch 16/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9326 - loss: 0.2078 - val_accuracy: 0.9352 - val_loss: 0.1954\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 1ms/step - accuracy: 0.9338 - loss: 0.2036 - val_accuracy: 0.9384 - val_loss: 0.1862\n",
"Epoch 17/35\n", "Epoch 17/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9331 - loss: 0.2059 - val_accuracy: 0.9436 - val_loss: 0.1762\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 2ms/step - accuracy: 0.9330 - loss: 0.2063 - val_accuracy: 0.9398 - val_loss: 0.1862\n",
"Epoch 18/35\n", "Epoch 18/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9332 - loss: 0.2050 - val_accuracy: 0.9437 - val_loss: 0.1765\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 1ms/step - accuracy: 0.9320 - loss: 0.2102 - val_accuracy: 0.9408 - val_loss: 0.1802\n",
"Epoch 19/35\n", "Epoch 19/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9343 - loss: 0.2038 - val_accuracy: 0.9452 - val_loss: 0.1788\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 2ms/step - accuracy: 0.9323 - loss: 0.2059 - val_accuracy: 0.9397 - val_loss: 0.1794\n",
"Epoch 20/35\n", "Epoch 20/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9343 - loss: 0.2037 - val_accuracy: 0.9368 - val_loss: 0.1887\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 2ms/step - accuracy: 0.9338 - loss: 0.2039 - val_accuracy: 0.9431 - val_loss: 0.1728\n",
"Epoch 21/35\n", "Epoch 21/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9326 - loss: 0.2054 - val_accuracy: 0.9435 - val_loss: 0.1773\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 1ms/step - accuracy: 0.9319 - loss: 0.2102 - val_accuracy: 0.9415 - val_loss: 0.1787\n",
"Epoch 22/35\n", "Epoch 22/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9327 - loss: 0.2059 - val_accuracy: 0.9417 - val_loss: 0.1813\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 1ms/step - accuracy: 0.9351 - loss: 0.2034 - val_accuracy: 0.9433 - val_loss: 0.1780\n",
"Epoch 23/35\n", "Epoch 23/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9333 - loss: 0.2041 - val_accuracy: 0.9405 - val_loss: 0.1809\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 2ms/step - accuracy: 0.9330 - loss: 0.2059 - val_accuracy: 0.9404 - val_loss: 0.1759\n",
"Epoch 24/35\n", "Epoch 24/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9340 - loss: 0.2045 - val_accuracy: 0.9393 - val_loss: 0.1840\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 2ms/step - accuracy: 0.9335 - loss: 0.2042 - val_accuracy: 0.9409 - val_loss: 0.1789\n",
"Epoch 25/35\n", "Epoch 25/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9324 - loss: 0.2046 - val_accuracy: 0.9405 - val_loss: 0.1833\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9341 - loss: 0.2052 - val_accuracy: 0.9389 - val_loss: 0.1813\n",
"Epoch 26/35\n", "Epoch 26/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9338 - loss: 0.2030 - val_accuracy: 0.9404 - val_loss: 0.1825\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9322 - loss: 0.2078 - val_accuracy: 0.9406 - val_loss: 0.1813\n",
"Epoch 27/35\n", "Epoch 27/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9346 - loss: 0.2051 - val_accuracy: 0.9385 - val_loss: 0.1875\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9319 - loss: 0.2069 - val_accuracy: 0.9283 - val_loss: 0.2017\n",
"Epoch 28/35\n", "Epoch 28/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9306 - loss: 0.2091 - val_accuracy: 0.9431 - val_loss: 0.1784\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9324 - loss: 0.2083 - val_accuracy: 0.9409 - val_loss: 0.1883\n",
"Epoch 29/35\n", "Epoch 29/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9352 - loss: 0.2033 - val_accuracy: 0.9396 - val_loss: 0.1877\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9326 - loss: 0.2054 - val_accuracy: 0.9411 - val_loss: 0.1791\n",
"Epoch 30/35\n", "Epoch 30/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 2ms/step - accuracy: 0.9333 - loss: 0.2037 - val_accuracy: 0.9403 - val_loss: 0.1808\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9333 - loss: 0.2041 - val_accuracy: 0.9419 - val_loss: 0.1769\n",
"Epoch 31/35\n", "Epoch 31/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9313 - loss: 0.2090 - val_accuracy: 0.9413 - val_loss: 0.1783\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9343 - loss: 0.2029 - val_accuracy: 0.9439 - val_loss: 0.1756\n",
"Epoch 32/35\n", "Epoch 32/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9340 - loss: 0.2063 - val_accuracy: 0.9428 - val_loss: 0.1815\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m5s\u001B[0m 1ms/step - accuracy: 0.9330 - loss: 0.2060 - val_accuracy: 0.9384 - val_loss: 0.1805\n",
"Epoch 33/35\n", "Epoch 33/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9324 - loss: 0.2029 - val_accuracy: 0.9405 - val_loss: 0.1822\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9333 - loss: 0.2023 - val_accuracy: 0.9395 - val_loss: 0.1780\n",
"Epoch 34/35\n", "Epoch 34/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9328 - loss: 0.2046 - val_accuracy: 0.9411 - val_loss: 0.1824\n", "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9347 - loss: 0.2025 - val_accuracy: 0.9408 - val_loss: 0.1806\n",
"Epoch 35/35\n", "Epoch 35/35\n",
"\u001B[1m2454/2454\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 1ms/step - accuracy: 0.9322 - loss: 0.2063 - val_accuracy: 0.9414 - val_loss: 0.1820\n" "\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 1ms/step - accuracy: 0.9315 - loss: 0.2038 - val_accuracy: 0.9419 - val_loss: 0.1762\n"
] ]
}, },
{ {
"data": { "data": {
"text/plain": "<keras.src.callbacks.history.History at 0x2809ceeab60>" "text/plain": "<keras.src.callbacks.history.History at 0x280fa5dcb80>"
}, },
"execution_count": 14, "execution_count": 25,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
], ],
"source": [ "source": [
"model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
"model.fit(X_train, y_train, epochs=35, batch_size=32, validation_data=(X_val, y_val))" "model.fit(X_train, y_train, epochs=35, batch_size=32, validation_data=(X_dev, dev_labels))"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"start_time": "2024-05-19T18:09:12.957822Z", "start_time": "2024-05-19T18:22:04.124694Z",
"end_time": "2024-05-19T18:11:23.248486Z" "end_time": "2024-05-19T18:24:44.659379Z"
} }
} }
}, },
@ -355,26 +341,26 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 26,
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"\u001B[1m614/614\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 766us/step - accuracy: 0.9409 - loss: 0.1851\n", "\u001B[1m171/171\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 814us/step - accuracy: 0.9413 - loss: 0.1863\n",
"Accuracy on validation set: 0.9413533210754395\n" "Accuracy on validation set: 0.9418562054634094\n"
] ]
} }
], ],
"source": [ "source": [
"loss, accuracy = model.evaluate(X_val, y_val)\n", "loss, accuracy = model.evaluate(X_dev, dev_labels)\n",
"print(f'Accuracy on validation set: {accuracy}')" "print(f'Accuracy on validation set: {accuracy}')"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"start_time": "2024-05-19T18:11:23.208454Z", "start_time": "2024-05-19T18:24:44.661382Z",
"end_time": "2024-05-19T18:11:23.753363Z" "end_time": "2024-05-19T18:24:44.864668Z"
} }
} }
}, },
@ -389,29 +375,26 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 27,
"outputs": [ "outputs": [
{ {
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"\u001B[1m171/171\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 882us/step\n", "\u001B[1m171/171\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 900us/step\n",
"\u001B[1m171/171\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 700us/step\n" "\u001B[1m171/171\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 765us/step\n"
] ]
} }
], ],
"source": [ "source": [
"X_dev = np.array([text_to_vector(text, word2vec_model) for text in dev_data['text']])\n",
"X_test = np.array([text_to_vector(text, word2vec_model) for text in test_data['text']])\n",
"\n",
"dev_predictions = model.predict(X_dev)\n", "dev_predictions = model.predict(X_dev)\n",
"test_predictions = model.predict(X_test)" "test_predictions = model.predict(X_test)"
], ],
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"start_time": "2024-05-19T18:11:23.754367Z", "start_time": "2024-05-19T18:24:44.863671Z",
"end_time": "2024-05-19T18:11:25.114539Z" "end_time": "2024-05-19T18:24:45.395043Z"
} }
} }
}, },
@ -426,7 +409,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 28,
"outputs": [], "outputs": [],
"source": [ "source": [
"dev_predictions = (dev_predictions > 0.5).astype(int)\n", "dev_predictions = (dev_predictions > 0.5).astype(int)\n",
@ -438,8 +421,8 @@
"metadata": { "metadata": {
"collapsed": false, "collapsed": false,
"ExecuteTime": { "ExecuteTime": {
"start_time": "2024-05-19T18:11:25.117540Z", "start_time": "2024-05-19T18:24:45.398007Z",
"end_time": "2024-05-19T18:11:25.149572Z" "end_time": "2024-05-19T18:24:45.438575Z"
} }
} }
} }