Word2Vec implementation

2024-05-20 04:30:46 +02:00 · 2024-05-20 04:30:46 +02:00 · fe140d27be
commit fe140d27be
parent 0cf206db5c
10 changed files with 125962 additions and 16377 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,4 @@
+fasttext_100_3_polish.bin*
+dev-0/out.tsv
+test-A/out.tsv
+test-A/expected.tsv
--- a/README.md
+++ b/README.md
@ -1,25 +1,25 @@
-
-Sport Texts Classification Challenge - Ball
-======================
-
-Guess whether the sport is connected to the ball for a Polish article. Evaluation metrics: Accuracy, Likelihood.
-
-Classes
-------
-
-* `1` — ball
-* `0` — no-ball
-
-Directory structure
-------------------
-
-* `README.md` — this file
-* `config.txt` — configuration file
-* `train/` — directory with training data
-* `train/train.tsv` — sample train set
-* `dev-0/` — directory with dev (test) data
-* `dev-0/in.tsv` — input data for the dev set
-* `dev-0/expected.tsv` — expected (reference) data for the dev set
-* `test-A` — directory with test data
-* `test-A/in.tsv` — input data for the test set
-* `test-A/expected.tsv` — expected (reference) data for the test set
+
+Sport Texts Classification Challenge - Ball
+======================
+
+Guess whether the sport is connected to the ball for a Polish article. Evaluation metrics: Accuracy, Likelihood.
+
+Classes
+-------
+
+* `1` — ball
+* `0` — no-ball
+
+Directory structure
+-------------------
+
+* `README.md` — this file
+* `config.txt` — configuration file
+* `train/` — directory with training data
+* `train/train.tsv` — sample train set
+* `dev-0/` — directory with dev (test) data
+* `dev-0/in.tsv` — input data for the dev set
+* `dev-0/expected.tsv` — expected (reference) data for the dev set
+* `test-A` — directory with test data
+* `test-A/in.tsv` — input data for the test set
+* `test-A/expected.tsv` — expected (reference) data for the test set
--- a/Word2Vec.ipynb
+++ b/Word2Vec.ipynb
@ -0,0 +1,550 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Word2Vec"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Import bibliotek"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from gensim.models import KeyedVectors\n",
+    "from gensim.utils import simple_preprocess\n",
+    "from keras.layers import Dense\n",
+    "from keras.models import Sequential\n",
+    "from keras.utils import set_random_seed\n",
+    "from sklearn.preprocessing import LabelEncoder"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Wczytanie danych"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Mindaugas Budzinauskas wierzy w odbudowę formy...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Przyjmujący reprezentacji Polski wrócił do PGE...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Aleksander Filipiak: Czuję się dobrze w nowym ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Victoria Carl i Aleksiej Czerwotkin mistrzami ...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                Text\n",
+       "0  Mindaugas Budzinauskas wierzy w odbudowę formy...\n",
+       "1  Przyjmujący reprezentacji Polski wrócił do PGE...\n",
+       "2  FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...\n",
+       "3  Aleksander Filipiak: Czuję się dobrze w nowym ...\n",
+       "4  Victoria Carl i Aleksiej Czerwotkin mistrzami ..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ATP Sztokholm: Juergen Zopp wykorzystał szansę...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Krowicki z reprezentacją kobiet aż do igrzysk ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Wielki powrót Łukasza Kubota Odradza się zawsz...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Marcel Hirscher wygrał ostatni slalom gigant m...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Polki do Czarnogóry z pełnią zaangażowania. Sy...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                Text\n",
+       "0  ATP Sztokholm: Juergen Zopp wykorzystał szansę...\n",
+       "1  Krowicki z reprezentacją kobiet aż do igrzysk ...\n",
+       "2  Wielki powrót Łukasza Kubota Odradza się zawsz...\n",
+       "3  Marcel Hirscher wygrał ostatni slalom gigant m...\n",
+       "4  Polki do Czarnogóry z pełnią zaangażowania. Sy..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Mundial 2018. Były reprezentant Anglii trenere...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Liga Mistrzyń: Podopieczne Kima Rasmussena bli...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Wyczerpujące treningi biegowe Justyny Kowalczy...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Mundial 2018. Zagraniczne media zareagowały na...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>BCL. Artur Gronek: Musimy grać twardziej. Pope...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                Text\n",
+       "0  Mundial 2018. Były reprezentant Anglii trenere...\n",
+       "1  Liga Mistrzyń: Podopieczne Kima Rasmussena bli...\n",
+       "2  Wyczerpujące treningi biegowe Justyny Kowalczy...\n",
+       "3  Mundial 2018. Zagraniczne media zareagowały na...\n",
+       "4  BCL. Artur Gronek: Musimy grać twardziej. Pope..."
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Label\n",
+       "0      1\n",
+       "1      1\n",
+       "2      0\n",
+       "3      1\n",
+       "4      0"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Label\n",
+       "0      1\n",
+       "1      1\n",
+       "2      0\n",
+       "3      1\n",
+       "4      1"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Parse train.tsv only once (column 0 = label, column 1 = text) and split it,\n",
+    "# so texts and labels always come from the very same parsed rows.\n",
+    "train_raw = pd.read_csv('train/train.tsv', sep='\\t', header=None, names=['Label', 'Text'], usecols=[0, 1])\n",
+    "data_train = train_raw[['Text']]\n",
+    "labels_train = train_raw[['Label']]\n",
+    "\n",
+    "# None of the files has a header row, so say so explicitly on every read.\n",
+    "data_test = pd.read_csv('test-A/in.tsv', sep='\\t', header=None, names=['Text'])\n",
+    "data_dev = pd.read_csv('dev-0/in.tsv', sep='\\t', header=None, names=['Text'])\n",
+    "labels_dev = pd.read_csv('dev-0/expected.tsv', sep='\\t', header=None, names=['Label'])\n",
+    "\n",
+    "# Fail early if the dev texts and their reference labels are misaligned.\n",
+    "assert len(data_dev) == len(labels_dev), 'dev-0 texts and labels differ in length'\n",
+    "\n",
+    "display(data_train.head())\n",
+    "display(data_test.head())\n",
+    "display(data_dev.head())\n",
+    "display(labels_train.head())\n",
+    "display(labels_dev.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Załadowanie wektorów Word2Vec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "W2V_model = KeyedVectors.load('fasttext_100_3_polish.bin')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Funkcja przekształcania tekstu na wektory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def text_to_vector(text, word2vec, vector_size):\n",
+    "    \"\"\"Return the average embedding of the in-vocabulary words of `text`.\n",
+    "\n",
+    "    Args:\n",
+    "        text: Raw document, tokenised with gensim's `simple_preprocess`.\n",
+    "            Anything that is not text (e.g. a NaN that pandas produced for\n",
+    "            a missing cell) is treated as an empty document.\n",
+    "        word2vec: Either a model that exposes its vectors as `.wv`, or a\n",
+    "            KeyedVectors-like object that is indexed by word directly.\n",
+    "        vector_size: Dimensionality of the embeddings and of the result.\n",
+    "\n",
+    "    Returns:\n",
+    "        A float `np.ndarray` of shape `(vector_size,)`; all zeros when no\n",
+    "        word of `text` has a vector.\n",
+    "    \"\"\"\n",
+    "    text_vector = np.zeros(vector_size)\n",
+    "    if not isinstance(text, (str, bytes)):\n",
+    "        # simple_preprocess cannot decode non-text input; nothing to average.\n",
+    "        return text_vector\n",
+    "    # KeyedVectors.load may return bare vectors without a `.wv` attribute,\n",
+    "    # so fall back to the object itself when `.wv` is missing.\n",
+    "    vectors = getattr(word2vec, 'wv', word2vec)\n",
+    "    word_count = 0\n",
+    "    for word in simple_preprocess(text):\n",
+    "        if word in vectors:\n",
+    "            text_vector += vectors[word]\n",
+    "            word_count += 1\n",
+    "    # Guard against division by zero when no token had a vector.\n",
+    "    if word_count > 0:\n",
+    "        text_vector /= word_count\n",
+    "    return text_vector"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Dostosowanie formatu danych do modelu"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Turn every text into its averaged embedding (train, dev, test - in that order)\n",
+    "train_vectors, dev_vectors, test_vectors = (\n",
+    "    np.array([text_to_vector(text, W2V_model, 100) for text in frame['Text']])\n",
+    "    for frame in (data_train, data_dev, data_test)\n",
+    ")\n",
+    "\n",
+    "# Encode the labels as integers; the encoder is fitted on the training labels only\n",
+    "label_encoder = LabelEncoder().fit(labels_train['Label'])\n",
+    "train_labels_enc = label_encoder.transform(labels_train['Label'])\n",
+    "dev_labels_enc = label_encoder.transform(labels_dev['Label'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Stworzenie modelu"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1/10\n",
+      "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m3s\u001b[0m 783us/step - accuracy: 0.9121 - loss: 0.2125 - val_accuracy: 0.9514 - val_loss: 0.1274\n",
+      "Epoch 2/10\n",
+      "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 752us/step - accuracy: 0.9528 - loss: 0.1238 - val_accuracy: 0.9565 - val_loss: 0.1127\n",
+      "Epoch 3/10\n",
+      "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 752us/step - accuracy: 0.9578 - loss: 0.1101 - val_accuracy: 0.9529 - val_loss: 0.1167\n",
+      "Epoch 4/10\n",
+      "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 754us/step - accuracy: 0.9605 - loss: 0.1020 - val_accuracy: 0.9622 - val_loss: 0.1060\n",
+      "Epoch 5/10\n",
+      "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 746us/step - accuracy: 0.9624 - loss: 0.0951 - val_accuracy: 0.9580 - val_loss: 0.1058\n",
+      "Epoch 6/10\n",
+      "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 756us/step - accuracy: 0.9632 - loss: 0.0935 - val_accuracy: 0.9631 - val_loss: 0.0924\n",
+      "Epoch 7/10\n",
+      "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 757us/step - accuracy: 0.9661 - loss: 0.0885 - val_accuracy: 0.9602 - val_loss: 0.1000\n",
+      "Epoch 8/10\n",
+      "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 754us/step - accuracy: 0.9662 - loss: 0.0869 - val_accuracy: 0.9642 - val_loss: 0.0927\n",
+      "Epoch 9/10\n",
+      "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 758us/step - accuracy: 0.9667 - loss: 0.0840 - val_accuracy: 0.9617 - val_loss: 0.0921\n",
+      "Epoch 10/10\n",
+      "\u001b[1m3067/3067\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m2s\u001b[0m 766us/step - accuracy: 0.9678 - loss: 0.0831 - val_accuracy: 0.9652 - val_loss: 0.0898\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<keras.src.callbacks.history.History at 0x1d117e0b450>"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Seed the Python, NumPy and backend RNGs so that weight initialisation and\n",
+    "# batch shuffling are repeatable on Restart & Run All\n",
+    "SEED = 42\n",
+    "set_random_seed(SEED)\n",
+    "\n",
+    "# Feed-forward binary classifier on top of the 100-dimensional averaged text embeddings\n",
+    "model = Sequential()\n",
+    "model.add(Dense(128, input_dim=100, activation='relu'))\n",
+    "model.add(Dense(64, activation='relu'))\n",
+    "model.add(Dense(1, activation='sigmoid'))\n",
+    "\n",
+    "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
+    "\n",
+    "# Train; keep the History object instead of echoing its repr (a memory address) as cell output\n",
+    "history = model.fit(train_vectors, train_labels_enc, epochs=10, batch_size=32, validation_data=(dev_vectors, dev_labels_enc))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Predykcja i zapis danych wyjściowych"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 718us/step\n",
+      "\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 591us/step\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Dev set: threshold the model outputs at 0.5 to obtain 0/1 classes\n",
+    "dev_predictions = (model.predict(dev_vectors) > 0.5).astype(int)\n",
+    "\n",
+    "# Test set: same thresholding\n",
+    "test_predictions = (model.predict(test_vectors) > 0.5).astype(int)\n",
+    "\n",
+    "# Write one prediction per line, without index or header\n",
+    "pd.DataFrame(dev_predictions).to_csv('dev-0/out.tsv', header=False, index=False, sep='\\t')\n",
+    "pd.DataFrame(test_predictions).to_csv('test-A/out.tsv', header=False, index=False, sep='\\t')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/config.txt
+++ b/config.txt
@ -1 +1 @@
--metric Likelihood --metric Accuracy --precision 5
+--metric Likelihood --metric Accuracy --precision 5
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/train/train.tsv
+++ b/train/train.tsv