word2vec/word2vec.ipynb

532 lines
35 KiB
Plaintext
Raw Permalink Normal View History

2024-05-20 00:12:27 +02:00
{
"cells": [
{
"cell_type": "code",
"id": "initial_id",
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2024-05-19T22:00:19.457327Z",
"start_time": "2024-05-19T22:00:16.498540Z"
}
},
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from gensim.models import Word2Vec\n",
"from tensorflow.keras.callbacks import EarlyStopping\n",
"from tensorflow.keras.layers import Dense\n",
"from tensorflow.keras.models import Sequential"
],
"outputs": [],
"execution_count": 1
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-19T22:00:19.463374Z",
"start_time": "2024-05-19T22:00:19.458329Z"
}
},
"cell_type": "code",
"source": [
"def load_train_data(file_path):\n",
"    \"\"\"Read a labelled TSV file into a DataFrame.\n",
"\n",
"    Every line is split on its first tab into a label and a text; any\n",
"    further tabs stay inside the text. Lines without a tab are skipped.\n",
"\n",
"    Args:\n",
"        file_path: Path of the UTF-8 encoded input file.\n",
"\n",
"    Returns:\n",
"        pandas.DataFrame with a 'label' column (labels parsed with int)\n",
"        and a 'text' column (surrounding whitespace stripped).\n",
"\n",
"    Raises:\n",
"        ValueError: If the part before the first tab is not an integer.\n",
"    \"\"\"\n",
"    labels = []\n",
"    texts = []\n",
"    with open(file_path, 'r', encoding='utf-8') as file:\n",
"        for line in file:\n",
"            parts = line.split('\\t', 1)\n",
"            # A line without a tab has no label/text pair, so it is skipped.\n",
"            if len(parts) == 2:\n",
"                label, text = parts\n",
"                labels.append(int(label))\n",
"                texts.append(text.strip())\n",
"    return pd.DataFrame({'label': labels, 'text': texts})"
],
"id": "3224192629baf09c",
"outputs": [],
"execution_count": 2
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-19T22:00:19.472523Z",
"start_time": "2024-05-19T22:00:19.464342Z"
}
},
"cell_type": "code",
"source": [
"def load_data(file_path):\n",
"    \"\"\"Read a text file into a one-column DataFrame, one row per line.\n",
"\n",
"    Args:\n",
"        file_path: Path of the UTF-8 encoded input file.\n",
"\n",
"    Returns:\n",
"        pandas.DataFrame with a single 'text' column holding every line\n",
"        with its surrounding whitespace stripped.\n",
"    \"\"\"\n",
"    with open(file_path, 'r', encoding='utf-8') as file:\n",
"        # Blank lines are kept as empty strings, so row i always maps to line i.\n",
"        texts = [line.strip() for line in file]\n",
"    return pd.DataFrame({'text': texts})"
],
"id": "1a8254758b45c975",
"outputs": [],
"execution_count": 3
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-19T22:00:19.477364Z",
"start_time": "2024-05-19T22:00:19.473525Z"
}
},
"cell_type": "code",
"source": [
"def load_labels(file_path):\n",
"    \"\"\"Read a file with one integer label per line into a DataFrame.\n",
"\n",
"    Args:\n",
"        file_path: Path of the UTF-8 encoded label file.\n",
"\n",
"    Returns:\n",
"        pandas.DataFrame with a single 'label' column, in file order.\n",
"\n",
"    Raises:\n",
"        ValueError: If a line (after stripping whitespace) is not an\n",
"            integer; this includes blank lines.\n",
"    \"\"\"\n",
"    with open(file_path, 'r', encoding='utf-8') as file:\n",
"        labels = [int(line.strip()) for line in file]\n",
"    return pd.DataFrame({'label': labels})"
],
"id": "f2aaa6dba99b1ff6",
"outputs": [],
"execution_count": 4
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-19T22:00:19.483099Z",
"start_time": "2024-05-19T22:00:19.479367Z"
}
},
"cell_type": "code",
"source": [
"def get_average_word2vec(tokens_list, model, k=100):\n",
"    \"\"\"Average the embeddings of the tokens that ``model`` knows.\n",
"\n",
"    Args:\n",
"        tokens_list: Iterable of tokens to look up.\n",
"        model: Object exposing ``wv``, which must support membership\n",
"            tests (``token in model.wv``) and lookup (``model.wv[token]``).\n",
"        k: Length of the returned vector; expected to match the length\n",
"            of the vectors stored in ``model.wv``.\n",
"\n",
"    Returns:\n",
"        numpy array of length ``k``: the mean of the vectors of all tokens\n",
"        found in ``model.wv``, or all zeros when none is found.\n",
"    \"\"\"\n",
"    vec = np.zeros(k)\n",
"    count = 0\n",
"    for word in tokens_list:\n",
"        # Tokens missing from the vocabulary are ignored entirely.\n",
"        if word in model.wv:\n",
"            vec += model.wv[word]\n",
"            count += 1\n",
"    # Skip the division when nothing matched to avoid dividing by zero.\n",
"    if count != 0:\n",
"        vec /= count\n",
"    return vec"
],
"id": "139f942641e876f",
"outputs": [],
"execution_count": 5
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-19T22:00:19.487397Z",
"start_time": "2024-05-19T22:00:19.484101Z"
}
},
"cell_type": "code",
"source": [
"def preprocess_data(file_path, word2vec_model):\n",
"    \"\"\"Load a text file and turn every line into an averaged word vector.\n",
"\n",
"    Args:\n",
"        file_path: Path passed on to ``load_data``.\n",
"        word2vec_model: Embedding model passed on to\n",
"            ``get_average_word2vec``.\n",
"\n",
"    Returns:\n",
"        numpy array with one row per line of the file; each row is the\n",
"        averaged vector of that line's whitespace-separated tokens.\n",
"    \"\"\"\n",
"    data = load_data(file_path)\n",
"    X = np.array([\n",
"        get_average_word2vec(text.split(), word2vec_model)\n",
"        for text in data['text']\n",
"    ])\n",
"    return X"
],
"id": "98797563dcad610b",
"outputs": [],
"execution_count": 6
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-19T22:00:33.384817Z",
"start_time": "2024-05-19T22:00:19.488400Z"
}
},
"cell_type": "code",
"source": [
"# Fit the embedding model on the whitespace-tokenised training texts.\n",
"train_data = load_train_data('train/train.tsv')\n",
"sentences = list(map(str.split, train_data['text']))\n",
"word2vec_model = Word2Vec(\n",
"    sentences=sentences,\n",
"    vector_size=100,\n",
"    window=5,\n",
"    min_count=1,\n",
"    workers=4,\n",
")"
],
"id": "a821ea109f28215b",
"outputs": [],
"execution_count": 7
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-19T22:00:42.574909Z",
"start_time": "2024-05-19T22:00:33.385840Z"
}
},
"cell_type": "code",
"source": [
"# Represent every text by the mean of its word vectors.\n",
"X_train = np.array([\n",
"    get_average_word2vec(tokens, word2vec_model)\n",
"    for tokens in map(str.split, train_data['text'])\n",
"])\n",
"y_train = np.array(train_data['label'])\n",
"\n",
"# The dev split is stored as separate input and expected-label files.\n",
"X_dev = preprocess_data('dev-0/in.tsv', word2vec_model)\n",
"dev_labels = load_labels('dev-0/expected.tsv')"
],
"id": "3cabe99193dd82ad",
"outputs": [],
"execution_count": 8
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-19T22:05:09.554039Z",
"start_time": "2024-05-19T22:00:42.575910Z"
}
},
"cell_type": "code",
"source": [
"# Small feed-forward binary classifier on top of the averaged word vectors.\n",
"model = Sequential()\n",
"model.add(Dense(64, activation='relu'))\n",
"model.add(Dense(32, activation='relu'))\n",
"model.add(Dense(1, activation='sigmoid'))\n",
"\n",
"model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
"\n",
"# In the recorded 100-epoch run the validation loss bottomed out after a few\n",
"# epochs and then kept rising while the training loss kept falling, so\n",
"# training now stops once val_loss stalls and the best weights are restored.\n",
"early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)\n",
"\n",
"# Binding the History object to a name keeps its repr out of the cell output.\n",
"history = model.fit(\n",
"    X_train,\n",
"    y_train,\n",
"    epochs=100,\n",
"    batch_size=32,\n",
"    validation_data=(X_dev, dev_labels),\n",
"    callbacks=[early_stopping],\n",
")"
],
"id": "31e13e484c35a433",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m4s\u001B[0m 880us/step - accuracy: 0.9494 - loss: 0.1326 - val_accuracy: 0.9718 - val_loss: 0.0791\n",
"Epoch 2/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 879us/step - accuracy: 0.9693 - loss: 0.0806 - val_accuracy: 0.9714 - val_loss: 0.0764\n",
"Epoch 3/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 874us/step - accuracy: 0.9710 - loss: 0.0749 - val_accuracy: 0.9727 - val_loss: 0.0743\n",
"Epoch 4/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 872us/step - accuracy: 0.9720 - loss: 0.0740 - val_accuracy: 0.9725 - val_loss: 0.0725\n",
"Epoch 5/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 854us/step - accuracy: 0.9723 - loss: 0.0718 - val_accuracy: 0.9732 - val_loss: 0.0709\n",
"Epoch 6/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 855us/step - accuracy: 0.9737 - loss: 0.0687 - val_accuracy: 0.9685 - val_loss: 0.0921\n",
"Epoch 7/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 859us/step - accuracy: 0.9734 - loss: 0.0670 - val_accuracy: 0.9723 - val_loss: 0.0737\n",
"Epoch 8/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 867us/step - accuracy: 0.9755 - loss: 0.0636 - val_accuracy: 0.9730 - val_loss: 0.0725\n",
"Epoch 9/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 854us/step - accuracy: 0.9757 - loss: 0.0625 - val_accuracy: 0.9719 - val_loss: 0.0731\n",
"Epoch 10/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 872us/step - accuracy: 0.9766 - loss: 0.0604 - val_accuracy: 0.9718 - val_loss: 0.0751\n",
"Epoch 11/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 832us/step - accuracy: 0.9769 - loss: 0.0595 - val_accuracy: 0.9729 - val_loss: 0.0736\n",
"Epoch 12/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 909us/step - accuracy: 0.9785 - loss: 0.0571 - val_accuracy: 0.9723 - val_loss: 0.0735\n",
"Epoch 13/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 825us/step - accuracy: 0.9787 - loss: 0.0560 - val_accuracy: 0.9723 - val_loss: 0.0735\n",
"Epoch 14/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 819us/step - accuracy: 0.9787 - loss: 0.0543 - val_accuracy: 0.9727 - val_loss: 0.0741\n",
"Epoch 15/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 817us/step - accuracy: 0.9790 - loss: 0.0544 - val_accuracy: 0.9719 - val_loss: 0.0740\n",
"Epoch 16/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 829us/step - accuracy: 0.9788 - loss: 0.0539 - val_accuracy: 0.9729 - val_loss: 0.0748\n",
"Epoch 17/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 817us/step - accuracy: 0.9798 - loss: 0.0524 - val_accuracy: 0.9729 - val_loss: 0.0727\n",
"Epoch 18/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 818us/step - accuracy: 0.9810 - loss: 0.0503 - val_accuracy: 0.9710 - val_loss: 0.0782\n",
"Epoch 19/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 829us/step - accuracy: 0.9788 - loss: 0.0530 - val_accuracy: 0.9699 - val_loss: 0.0773\n",
"Epoch 20/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 838us/step - accuracy: 0.9803 - loss: 0.0512 - val_accuracy: 0.9714 - val_loss: 0.0747\n",
"Epoch 21/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 824us/step - accuracy: 0.9802 - loss: 0.0513 - val_accuracy: 0.9723 - val_loss: 0.0795\n",
"Epoch 22/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 824us/step - accuracy: 0.9810 - loss: 0.0483 - val_accuracy: 0.9727 - val_loss: 0.0775\n",
"Epoch 23/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 819us/step - accuracy: 0.9814 - loss: 0.0473 - val_accuracy: 0.9716 - val_loss: 0.0835\n",
"Epoch 24/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 825us/step - accuracy: 0.9810 - loss: 0.0480 - val_accuracy: 0.9710 - val_loss: 0.0767\n",
"Epoch 25/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 824us/step - accuracy: 0.9815 - loss: 0.0471 - val_accuracy: 0.9712 - val_loss: 0.0803\n",
"Epoch 26/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 826us/step - accuracy: 0.9829 - loss: 0.0449 - val_accuracy: 0.9707 - val_loss: 0.0811\n",
"Epoch 27/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 821us/step - accuracy: 0.9823 - loss: 0.0449 - val_accuracy: 0.9697 - val_loss: 0.0813\n",
"Epoch 28/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 821us/step - accuracy: 0.9829 - loss: 0.0432 - val_accuracy: 0.9719 - val_loss: 0.0803\n",
"Epoch 29/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 826us/step - accuracy: 0.9828 - loss: 0.0433 - val_accuracy: 0.9705 - val_loss: 0.0884\n",
"Epoch 30/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 873us/step - accuracy: 0.9832 - loss: 0.0425 - val_accuracy: 0.9707 - val_loss: 0.0855\n",
"Epoch 31/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 866us/step - accuracy: 0.9829 - loss: 0.0433 - val_accuracy: 0.9707 - val_loss: 0.0845\n",
"Epoch 32/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 857us/step - accuracy: 0.9830 - loss: 0.0430 - val_accuracy: 0.9727 - val_loss: 0.0840\n",
"Epoch 33/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 870us/step - accuracy: 0.9835 - loss: 0.0406 - val_accuracy: 0.9661 - val_loss: 0.0911\n",
"Epoch 34/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 874us/step - accuracy: 0.9840 - loss: 0.0407 - val_accuracy: 0.9707 - val_loss: 0.0866\n",
"Epoch 35/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 897us/step - accuracy: 0.9841 - loss: 0.0400 - val_accuracy: 0.9718 - val_loss: 0.0807\n",
"Epoch 36/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 859us/step - accuracy: 0.9841 - loss: 0.0399 - val_accuracy: 0.9696 - val_loss: 0.0841\n",
"Epoch 37/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 873us/step - accuracy: 0.9850 - loss: 0.0390 - val_accuracy: 0.9734 - val_loss: 0.0892\n",
"Epoch 38/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 866us/step - accuracy: 0.9847 - loss: 0.0378 - val_accuracy: 0.9690 - val_loss: 0.0956\n",
"Epoch 39/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 864us/step - accuracy: 0.9851 - loss: 0.0377 - val_accuracy: 0.9708 - val_loss: 0.0889\n",
"Epoch 40/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 863us/step - accuracy: 0.9852 - loss: 0.0377 - val_accuracy: 0.9725 - val_loss: 0.0888\n",
"Epoch 41/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 857us/step - accuracy: 0.9858 - loss: 0.0360 - val_accuracy: 0.9718 - val_loss: 0.0914\n",
"Epoch 42/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 859us/step - accuracy: 0.9844 - loss: 0.0376 - val_accuracy: 0.9699 - val_loss: 0.0980\n",
"Epoch 43/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 866us/step - accuracy: 0.9857 - loss: 0.0362 - val_accuracy: 0.9699 - val_loss: 0.0922\n",
"Epoch 44/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 860us/step - accuracy: 0.9858 - loss: 0.0368 - val_accuracy: 0.9701 - val_loss: 0.0956\n",
"Epoch 45/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 852us/step - accuracy: 0.9862 - loss: 0.0354 - val_accuracy: 0.9690 - val_loss: 0.0942\n",
"Epoch 46/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 862us/step - accuracy: 0.9869 - loss: 0.0331 - val_accuracy: 0.9690 - val_loss: 0.0977\n",
"Epoch 47/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 885us/step - accuracy: 0.9865 - loss: 0.0334 - val_accuracy: 0.9712 - val_loss: 0.0947\n",
"Epoch 48/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 870us/step - accuracy: 0.9871 - loss: 0.0338 - val_accuracy: 0.9699 - val_loss: 0.0983\n",
"Epoch 49/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 855us/step - accuracy: 0.9865 - loss: 0.0335 - val_accuracy: 0.9708 - val_loss: 0.1039\n",
"Epoch 50/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 864us/step - accuracy: 0.9865 - loss: 0.0338 - val_accuracy: 0.9705 - val_loss: 0.1021\n",
"Epoch 51/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 859us/step - accuracy: 0.9867 - loss: 0.0336 - val_accuracy: 0.9705 - val_loss: 0.1011\n",
"Epoch 52/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 854us/step - accuracy: 0.9871 - loss: 0.0321 - val_accuracy: 0.9692 - val_loss: 0.1045\n",
"Epoch 53/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 885us/step - accuracy: 0.9878 - loss: 0.0310 - val_accuracy: 0.9686 - val_loss: 0.1098\n",
"Epoch 54/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 877us/step - accuracy: 0.9870 - loss: 0.0318 - val_accuracy: 0.9701 - val_loss: 0.1042\n",
"Epoch 55/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 864us/step - accuracy: 0.9883 - loss: 0.0290 - val_accuracy: 0.9690 - val_loss: 0.1131\n",
"Epoch 56/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 866us/step - accuracy: 0.9884 - loss: 0.0298 - val_accuracy: 0.9697 - val_loss: 0.1078\n",
"Epoch 57/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 859us/step - accuracy: 0.9879 - loss: 0.0296 - val_accuracy: 0.9683 - val_loss: 0.1089\n",
"Epoch 58/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 891us/step - accuracy: 0.9881 - loss: 0.0302 - val_accuracy: 0.9707 - val_loss: 0.1103\n",
"Epoch 59/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 869us/step - accuracy: 0.9878 - loss: 0.0307 - val_accuracy: 0.9690 - val_loss: 0.1105\n",
"Epoch 60/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 859us/step - accuracy: 0.9873 - loss: 0.0317 - val_accuracy: 0.9685 - val_loss: 0.1166\n",
"Epoch 61/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 867us/step - accuracy: 0.9879 - loss: 0.0291 - val_accuracy: 0.9710 - val_loss: 0.1139\n",
"Epoch 62/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 853us/step - accuracy: 0.9878 - loss: 0.0287 - val_accuracy: 0.9705 - val_loss: 0.1148\n",
"Epoch 63/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 859us/step - accuracy: 0.9886 - loss: 0.0283 - val_accuracy: 0.9679 - val_loss: 0.1263\n",
"Epoch 64/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 861us/step - accuracy: 0.9884 - loss: 0.0283 - val_accuracy: 0.9701 - val_loss: 0.1200\n",
"Epoch 65/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 869us/step - accuracy: 0.9886 - loss: 0.0283 - val_accuracy: 0.9692 - val_loss: 0.1217\n",
"Epoch 66/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 858us/step - accuracy: 0.9895 - loss: 0.0262 - val_accuracy: 0.9701 - val_loss: 0.1157\n",
"Epoch 67/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 851us/step - accuracy: 0.9890 - loss: 0.0259 - val_accuracy: 0.9683 - val_loss: 0.1164\n",
"Epoch 68/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 852us/step - accuracy: 0.9891 - loss: 0.0265 - val_accuracy: 0.9685 - val_loss: 0.1275\n",
"Epoch 69/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 870us/step - accuracy: 0.9888 - loss: 0.0268 - val_accuracy: 0.9679 - val_loss: 0.1218\n",
"Epoch 70/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 862us/step - accuracy: 0.9892 - loss: 0.0268 - val_accuracy: 0.9694 - val_loss: 0.1320\n",
"Epoch 71/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 855us/step - accuracy: 0.9895 - loss: 0.0254 - val_accuracy: 0.9694 - val_loss: 0.1236\n",
"Epoch 72/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 856us/step - accuracy: 0.9895 - loss: 0.0251 - val_accuracy: 0.9708 - val_loss: 0.1271\n",
"Epoch 73/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 860us/step - accuracy: 0.9897 - loss: 0.0254 - val_accuracy: 0.9703 - val_loss: 0.1363\n",
"Epoch 74/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 861us/step - accuracy: 0.9895 - loss: 0.0257 - val_accuracy: 0.9705 - val_loss: 0.1315\n",
"Epoch 75/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 869us/step - accuracy: 0.9891 - loss: 0.0254 - val_accuracy: 0.9683 - val_loss: 0.1385\n",
"Epoch 76/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 886us/step - accuracy: 0.9899 - loss: 0.0247 - val_accuracy: 0.9681 - val_loss: 0.1324\n",
"Epoch 77/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 851us/step - accuracy: 0.9896 - loss: 0.0247 - val_accuracy: 0.9697 - val_loss: 0.1502\n",
"Epoch 78/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 860us/step - accuracy: 0.9901 - loss: 0.0252 - val_accuracy: 0.9701 - val_loss: 0.1304\n",
"Epoch 79/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 853us/step - accuracy: 0.9896 - loss: 0.0246 - val_accuracy: 0.9688 - val_loss: 0.1537\n",
"Epoch 80/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 882us/step - accuracy: 0.9901 - loss: 0.0239 - val_accuracy: 0.9705 - val_loss: 0.1453\n",
"Epoch 81/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 851us/step - accuracy: 0.9902 - loss: 0.0236 - val_accuracy: 0.9696 - val_loss: 0.1482\n",
"Epoch 82/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 862us/step - accuracy: 0.9901 - loss: 0.0237 - val_accuracy: 0.9672 - val_loss: 0.1538\n",
"Epoch 83/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 853us/step - accuracy: 0.9904 - loss: 0.0234 - val_accuracy: 0.9679 - val_loss: 0.1426\n",
"Epoch 84/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 858us/step - accuracy: 0.9903 - loss: 0.0234 - val_accuracy: 0.9692 - val_loss: 0.1495\n",
"Epoch 85/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 868us/step - accuracy: 0.9905 - loss: 0.0232 - val_accuracy: 0.9690 - val_loss: 0.1374\n",
"Epoch 86/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 861us/step - accuracy: 0.9903 - loss: 0.0232 - val_accuracy: 0.9644 - val_loss: 0.1494\n",
"Epoch 87/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 860us/step - accuracy: 0.9907 - loss: 0.0232 - val_accuracy: 0.9675 - val_loss: 0.1575\n",
"Epoch 88/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 860us/step - accuracy: 0.9908 - loss: 0.0215 - val_accuracy: 0.9685 - val_loss: 0.1655\n",
"Epoch 89/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 853us/step - accuracy: 0.9910 - loss: 0.0213 - val_accuracy: 0.9668 - val_loss: 0.1522\n",
"Epoch 90/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 890us/step - accuracy: 0.9909 - loss: 0.0213 - val_accuracy: 0.9670 - val_loss: 0.1697\n",
"Epoch 91/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 854us/step - accuracy: 0.9910 - loss: 0.0208 - val_accuracy: 0.9679 - val_loss: 0.1665\n",
"Epoch 92/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 861us/step - accuracy: 0.9904 - loss: 0.0224 - val_accuracy: 0.9685 - val_loss: 0.1520\n",
"Epoch 93/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 878us/step - accuracy: 0.9906 - loss: 0.0217 - val_accuracy: 0.9674 - val_loss: 0.1594\n",
"Epoch 94/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 848us/step - accuracy: 0.9917 - loss: 0.0205 - val_accuracy: 0.9655 - val_loss: 0.1640\n",
"Epoch 95/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 866us/step - accuracy: 0.9912 - loss: 0.0214 - val_accuracy: 0.9677 - val_loss: 0.1560\n",
"Epoch 96/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 851us/step - accuracy: 0.9912 - loss: 0.0213 - val_accuracy: 0.9679 - val_loss: 0.1666\n",
"Epoch 97/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 869us/step - accuracy: 0.9917 - loss: 0.0209 - val_accuracy: 0.9675 - val_loss: 0.1539\n",
"Epoch 98/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 886us/step - accuracy: 0.9915 - loss: 0.0206 - val_accuracy: 0.9683 - val_loss: 0.1764\n",
"Epoch 99/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 899us/step - accuracy: 0.9918 - loss: 0.0199 - val_accuracy: 0.9677 - val_loss: 0.1634\n",
"Epoch 100/100\n",
"\u001B[1m3067/3067\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m3s\u001B[0m 854us/step - accuracy: 0.9914 - loss: 0.0197 - val_accuracy: 0.9659 - val_loss: 0.1713\n"
]
},
{
"data": {
"text/plain": [
"<keras.src.callbacks.history.History at 0x2a5e3e7da50>"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"execution_count": 9
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-19T22:05:09.728675Z",
"start_time": "2024-05-19T22:05:09.554039Z"
}
},
"cell_type": "code",
"source": [
"# evaluate() yields the loss followed by the compiled metric (accuracy).\n",
"dev_scores = model.evaluate(X_dev, dev_labels)\n",
"loss, accuracy = dev_scores\n",
"print(f\"Accuracy: {accuracy}\")\n"
],
"id": "cf8b0f801768560b",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[1m171/171\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 660us/step - accuracy: 0.9641 - loss: 0.1876\n",
"Accuracy: 0.9658840894699097\n"
]
}
],
"execution_count": 10
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-19T22:05:10.010233Z",
"start_time": "2024-05-19T22:05:09.728675Z"
}
},
"cell_type": "code",
"source": [
"# Sigmoid outputs above 0.5 become class 1, everything else class 0.\n",
"dev0_pred = (model.predict(X_dev) > 0.5).astype(int)"
],
"id": "bf1843efc9fbcc41",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[1m171/171\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 794us/step\n"
]
}
],
"execution_count": 11
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-19T22:05:10.018255Z",
"start_time": "2024-05-19T22:05:10.010233Z"
}
},
"cell_type": "code",
"source": [
"# Write one prediction per line, without an index column or header row.\n",
"dev0_pred = pd.DataFrame(data=dev0_pred)\n",
"dev0_pred.to_csv(\"dev-0/out.tsv\", header=False, index=False)"
],
"id": "5c5a511249859def",
"outputs": [],
"execution_count": 12
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-19T22:05:10.724357Z",
"start_time": "2024-05-19T22:05:10.019265Z"
}
},
"cell_type": "code",
"source": [
"# Same pipeline as for the dev split: embed, predict, threshold at 0.5.\n",
"X_testA = preprocess_data('test-A/in.tsv', word2vec_model)\n",
"testA_pred = (model.predict(X_testA) > 0.5).astype(int)"
],
"id": "3a736eea3080da17",
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001B[1m171/171\u001B[0m \u001B[32m━━━━━━━━━━━━━━━━━━━━\u001B[0m\u001B[37m\u001B[0m \u001B[1m0s\u001B[0m 630us/step\n"
]
}
],
"execution_count": 13
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-05-19T22:05:10.732380Z",
"start_time": "2024-05-19T22:05:10.726373Z"
}
},
"cell_type": "code",
"source": [
"# Write one prediction per line, without an index column or header row.\n",
"testA_pred = pd.DataFrame(data=testA_pred)\n",
"testA_pred.to_csv(\"test-A/out.tsv\", header=False, index=False)"
],
"id": "87af989ea8614b3c",
"outputs": [],
"execution_count": 14
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}