{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "e3b4fa50-eb34-4e53-b938-65cbd4c21f43", "metadata": {}, "outputs": [], "source": [ "import re\n", "from nltk.tokenize import word_tokenize\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import LabelEncoder\n", "import gensim\n", "from gensim.models import Word2Vec\n", "import numpy as np\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score, classification_report\n", "import pandas as pd\n", "from keras.src.utils import pad_sequences\n", "from keras.src.legacy.preprocessing.text import Tokenizer\n", "from keras.src.layers import Dropout, Dense, Activation, Embedding, MaxPooling1D, GlobalMaxPooling1D\n", "from keras.src.layers import Conv1D\n", "from keras import Sequential\n", "from keras.optimizers import Adam\n", "from sklearn.metrics import accuracy_score" ] }, { "cell_type": "markdown", "id": "4363302f-a5e2-450b-bcee-5a0045101986", "metadata": {}, "source": [ "# Data preprocessing" ] }, { "cell_type": "markdown", "id": "40c5fe05-fe48-42ed-8d1a-9c128e8a7550", "metadata": {}, "source": [ "## Train data preprocessing" ] }, { "cell_type": "code", "execution_count": 2, "id": "80cdc748-8645-4724-afaf-f5af64be9052", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelsentence
01Mindaugas Budzinauskas wierzy w odbudowę formy...
11Przyjmujący reprezentacji Polski wrócił do PGE...
20FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...
31Aleksander Filipiak: Czuję się dobrze w nowym ...
40Victoria Carl i Aleksiej Czerwotkin mistrzami ...
.........
981271Kamil Syprzak zaczyna kolekcjonować trofea. FC...
981281Holandia: dwa gole Piotra Parzyszka Piotr Parz...
981291Sparingowo: Korona gorsza od Stali. Lettieri s...
981301Vive - Wisła. Ośmiu debiutantów w tegorocznej ...
981311WTA Miami: Timea Bacsinszky pokonana, Swietłan...
\n", "

98132 rows × 2 columns

\n", "
" ], "text/plain": [ " label sentence\n", "0 1 Mindaugas Budzinauskas wierzy w odbudowę formy...\n", "1 1 Przyjmujący reprezentacji Polski wrócił do PGE...\n", "2 0 FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...\n", "3 1 Aleksander Filipiak: Czuję się dobrze w nowym ...\n", "4 0 Victoria Carl i Aleksiej Czerwotkin mistrzami ...\n", "... ... ...\n", "98127 1 Kamil Syprzak zaczyna kolekcjonować trofea. FC...\n", "98128 1 Holandia: dwa gole Piotra Parzyszka Piotr Parz...\n", "98129 1 Sparingowo: Korona gorsza od Stali. Lettieri s...\n", "98130 1 Vive - Wisła. Ośmiu debiutantów w tegorocznej ...\n", "98131 1 WTA Miami: Timea Bacsinszky pokonana, Swietłan...\n", "\n", "[98132 rows x 2 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Loading data\n", "train_data = pd.read_csv('./train/train.tsv/train.tsv', sep='\\t', header=None, on_bad_lines='skip')\n", "\n", "# Removing unnecessary column\n", "train_data = train_data.drop(train_data.columns[2], axis=1)\n", "\n", "# Renaming columns\n", "train_data.columns = [\"label\", \"sentence\"]\n", "\n", "display(train_data)" ] }, { "cell_type": "code", "execution_count": 3, "id": "93ca5e9a-56f7-493f-b088-81d272cfe4aa", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelsentencesentence_split
01Mindaugas Budzinauskas wierzy w odbudowę formy...[mindaugas, budzinauskas, wierzy, w, odbudowę,...
11Przyjmujący reprezentacji Polski wrócił do PGE...[przyjmujący, reprezentacji, polski, wrócił, d...
20FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...[fen, 9, zapowiedź, walki, róża, gumienna, vs,...
31Aleksander Filipiak: Czuję się dobrze w nowym ...[aleksander, filipiak, czuję, się, dobrze, w, ...
40Victoria Carl i Aleksiej Czerwotkin mistrzami ...[victoria, carl, i, aleksiej, czerwotkin, mist...
............
981271Kamil Syprzak zaczyna kolekcjonować trofea. FC...[kamil, syprzak, zaczyna, kolekcjonować, trofe...
981281Holandia: dwa gole Piotra Parzyszka Piotr Parz...[holandia, dwa, gole, piotra, parzyszka, piotr...
981291Sparingowo: Korona gorsza od Stali. Lettieri s...[sparingowo, korona, gorsza, od, stali, lettie...
981301Vive - Wisła. Ośmiu debiutantów w tegorocznej ...[vive, wisła, ośmiu, debiutantów, w, tegoroczn...
981311WTA Miami: Timea Bacsinszky pokonana, Swietłan...[wta, miami, timea, bacsinszky, pokonana, swie...
\n", "

98132 rows × 3 columns

\n", "
" ], "text/plain": [ " label sentence \\\n", "0 1 Mindaugas Budzinauskas wierzy w odbudowę formy... \n", "1 1 Przyjmujący reprezentacji Polski wrócił do PGE... \n", "2 0 FEN 9: Zapowiedź walki Róża Gumienna vs Katarz... \n", "3 1 Aleksander Filipiak: Czuję się dobrze w nowym ... \n", "4 0 Victoria Carl i Aleksiej Czerwotkin mistrzami ... \n", "... ... ... \n", "98127 1 Kamil Syprzak zaczyna kolekcjonować trofea. FC... \n", "98128 1 Holandia: dwa gole Piotra Parzyszka Piotr Parz... \n", "98129 1 Sparingowo: Korona gorsza od Stali. Lettieri s... \n", "98130 1 Vive - Wisła. Ośmiu debiutantów w tegorocznej ... \n", "98131 1 WTA Miami: Timea Bacsinszky pokonana, Swietłan... \n", "\n", " sentence_split \n", "0 [mindaugas, budzinauskas, wierzy, w, odbudowę,... \n", "1 [przyjmujący, reprezentacji, polski, wrócił, d... \n", "2 [fen, 9, zapowiedź, walki, róża, gumienna, vs,... \n", "3 [aleksander, filipiak, czuję, się, dobrze, w, ... \n", "4 [victoria, carl, i, aleksiej, czerwotkin, mist... \n", "... ... \n", "98127 [kamil, syprzak, zaczyna, kolekcjonować, trofe... \n", "98128 [holandia, dwa, gole, piotra, parzyszka, piotr... \n", "98129 [sparingowo, korona, gorsza, od, stali, lettie... \n", "98130 [vive, wisła, ośmiu, debiutantów, w, tegoroczn... \n", "98131 [wta, miami, timea, bacsinszky, pokonana, swie... \n", "\n", "[98132 rows x 3 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def preprocess(sentence):\n", " sentence = sentence.lower()\n", " sentence = re.sub(r'\\W', ' ', sentence)\n", " tokens = word_tokenize(sentence)\n", " return tokens\n", "\n", "train_data[\"sentence_split\"] = train_data[\"sentence\"].apply(preprocess)\n", "display(train_data)" ] }, { "cell_type": "code", "execution_count": 4, "id": "e6611779-e621-47f7-a908-1d8123d4426b", "metadata": {}, "outputs": [], "source": [ "w2v = gensim.models.Word2Vec(train_data[\"sentence_split\"], vector_size=500, window=5, min_count=2, workers=4)" ] }, { "cell_type": "code", "execution_count": 5, "id": "85ffca71-529f-463f-a7da-25aa0aa9f8a1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "65861\n" ] } ], "source": [ "# number of words in w2v\n", "print(len(w2v.wv))" ] }, { "cell_type": "code", "execution_count": 6, "id": "c5366e81-5883-4fb7-a721-dd53b5016369", "metadata": {}, "outputs": [], "source": [ "token = Tokenizer(len(w2v.wv))\n", "token.fit_on_texts(train_data[\"sentence_split\"])\n", "text = token.texts_to_sequences(train_data[\"sentence_split\"])\n", "text = pad_sequences(text)" ] }, { "cell_type": "code", "execution_count": 7, "id": "68346de7-774f-4b50-8256-1203be8dbec0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "X_train:\n", "\t[[ 0 0 0 ... 1630 724 11557]\n", " [ 0 0 0 ... 3 129 594]\n", " [ 0 0 0 ... 4 781 28351]\n", " ...\n", " [ 0 0 0 ... 390 35 55]\n", " [ 0 0 0 ... 44454 12175 329]\n", " [ 0 0 0 ... 159 455 1172]]\n", "y_train:\n", "\t[1 1 0 ... 1 1 1]\n", "Vocabulary size:\n", "\t104277\n" ] } ], "source": [ "X_train = np.array(text)\n", "y_train = train_data[\"label\"].to_numpy()\n", "vocabulary_size = len(token.word_index)\n", "print(f\"X_train:\\n\\t{X_train}\\ny_train:\\n\\t{y_train}\\nVocabulary size:\\n\\t{vocabulary_size}\")" ] }, { "cell_type": "code", "execution_count": 8, "id": "1c76644e-91a2-4189-84ca-5e4c5ee57e4a", "metadata": {}, "outputs": [], "source": [ "embedding_matrix = np.zeros((vocabulary_size, 500))\n", "for word, i in token.word_index.items():\n", " if word in w2v.wv:\n", " embedding_matrix[i] = w2v.wv[word]" ] }, { "cell_type": "markdown", "id": "a24fd19f-6aa3-4b93-bf73-8b7c939f08bf", "metadata": {}, "source": [ "## Test data preprocessing" ] }, { "cell_type": "code", "execution_count": 34, "id": "908ab909-c05e-473b-82cf-9f7e044eb300", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
0ATP Sztokholm: Juergen Zopp wykorzystał szansę...
1Krowicki z reprezentacją kobiet aż do igrzysk ...
2Wielki powrót Łukasza Kubota Odradza się zawsz...
3Marcel Hirscher wygrał ostatni slalom gigant m...
4Polki do Czarnogóry z pełnią zaangażowania. Sy...
......
5443Biało-czerwona siła w Falun. Oni będą reprezen...
5444Finał WTA Tokio na żywo: Woźniacka - Osaka LIV...
5445Oni zapisali się w annałach. Hubert Hurkacz 15...
5446Poprawia się stan Nikiego Laudy. Austriak może...
5447Liga Mistrzów. Zabójcza końcówka Interu Mediol...
\n", "

5448 rows × 1 columns

\n", "
" ], "text/plain": [ " 0\n", "0 ATP Sztokholm: Juergen Zopp wykorzystał szansę...\n", "1 Krowicki z reprezentacją kobiet aż do igrzysk ...\n", "2 Wielki powrót Łukasza Kubota Odradza się zawsz...\n", "3 Marcel Hirscher wygrał ostatni slalom gigant m...\n", "4 Polki do Czarnogóry z pełnią zaangażowania. Sy...\n", "... ...\n", "5443 Biało-czerwona siła w Falun. Oni będą reprezen...\n", "5444 Finał WTA Tokio na żywo: Woźniacka - Osaka LIV...\n", "5445 Oni zapisali się w annałach. Hubert Hurkacz 15...\n", "5446 Poprawia się stan Nikiego Laudy. Austriak może...\n", "5447 Liga Mistrzów. Zabójcza końcówka Interu Mediol...\n", "\n", "[5448 rows x 1 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sentencesentence_split
0ATP Sztokholm: Juergen Zopp wykorzystał szansę...[atp, sztokholm, juergen, zopp, wykorzystał, s...
1Krowicki z reprezentacją kobiet aż do igrzysk ...[krowicki, z, reprezentacją, kobiet, aż, do, i...
2Wielki powrót Łukasza Kubota Odradza się zawsz...[wielki, powrót, łukasza, kubota, odradza, się...
3Marcel Hirscher wygrał ostatni slalom gigant m...[marcel, hirscher, wygrał, ostatni, slalom, gi...
4Polki do Czarnogóry z pełnią zaangażowania. Sy...[polki, do, czarnogóry, z, pełnią, zaangażowan...
.........
5443Biało-czerwona siła w Falun. Oni będą reprezen...[biało, czerwona, siła, w, falun, oni, będą, r...
5444Finał WTA Tokio na żywo: Woźniacka - Osaka LIV...[finał, wta, tokio, na, żywo, woźniacka, osaka...
5445Oni zapisali się w annałach. Hubert Hurkacz 15...[oni, zapisali, się, w, annałach, hubert, hurk...
5446Poprawia się stan Nikiego Laudy. Austriak może...[poprawia, się, stan, nikiego, laudy, austriak...
5447Liga Mistrzów. Zabójcza końcówka Interu Mediol...[liga, mistrzów, zabójcza, końcówka, interu, m...
\n", "

5448 rows × 2 columns

\n", "
" ], "text/plain": [ " sentence \\\n", "0 ATP Sztokholm: Juergen Zopp wykorzystał szansę... \n", "1 Krowicki z reprezentacją kobiet aż do igrzysk ... \n", "2 Wielki powrót Łukasza Kubota Odradza się zawsz... \n", "3 Marcel Hirscher wygrał ostatni slalom gigant m... \n", "4 Polki do Czarnogóry z pełnią zaangażowania. Sy... \n", "... ... \n", "5443 Biało-czerwona siła w Falun. Oni będą reprezen... \n", "5444 Finał WTA Tokio na żywo: Woźniacka - Osaka LIV... \n", "5445 Oni zapisali się w annałach. Hubert Hurkacz 15... \n", "5446 Poprawia się stan Nikiego Laudy. Austriak może... \n", "5447 Liga Mistrzów. Zabójcza końcówka Interu Mediol... \n", "\n", " sentence_split \n", "0 [atp, sztokholm, juergen, zopp, wykorzystał, s... \n", "1 [krowicki, z, reprezentacją, kobiet, aż, do, i... \n", "2 [wielki, powrót, łukasza, kubota, odradza, się... \n", "3 [marcel, hirscher, wygrał, ostatni, slalom, gi... \n", "4 [polki, do, czarnogóry, z, pełnią, zaangażowan... \n", "... ... \n", "5443 [biało, czerwona, siła, w, falun, oni, będą, r... \n", "5444 [finał, wta, tokio, na, żywo, woźniacka, osaka... \n", "5445 [oni, zapisali, się, w, annałach, hubert, hurk... \n", "5446 [poprawia, się, stan, nikiego, laudy, austriak... \n", "5447 [liga, mistrzów, zabójcza, końcówka, interu, m... \n", "\n", "[5448 rows x 2 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "X_test_pd = pd.read_csv(\"test-A/in.tsv\", sep=\"\\t\", header=None)\n", "X_test_pd = X_test_pd.drop(X_test_pd.columns[1], axis=1)\n", "display(X_test_pd)\n", "X_test_pd.columns = [\"sentence\"]\n", "X_test_pd['sentence_split'] = X_test_pd['sentence'].apply(preprocess)\n", "X_test = token.texts_to_sequences(X_test_pd['sentence_split'])\n", "X_test = pad_sequences(X_test)\n", "display(X_test_pd)" ] }, { "cell_type": "markdown", "id": "62b38223-cda6-443a-a05f-a1661233e94c", "metadata": {}, "source": [ "## Dev data preprocessing" ] }, { "cell_type": "code", "execution_count": 37, "id": "6a5fbb7d-2a8e-4633-8f4c-bec6acf624b2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
0Mundial 2018. Były reprezentant Anglii trenere...
1Liga Mistrzyń: Podopieczne Kima Rasmussena bli...
2Wyczerpujące treningi biegowe Justyny Kowalczy...
3Mundial 2018. Zagraniczne media zareagowały na...
4BCL. Artur Gronek: Musimy grać twardziej. Pope...
......
5447Michał Probierz szuka powodów do optymizmu. \"C...
5448ME 2017 w siatkówce. Znakomita frekwencja. Kib...
5449Zobacz oficjalny trailer KSW 42 (wideo) Organi...
5450Rummenigge nie wyklucza, że ktoś odejdzie z Ba...
5451Sympatyczny gest argentyńskich tenisistów. Obd...
\n", "

5452 rows × 1 columns

\n", "
" ], "text/plain": [ " 0\n", "0 Mundial 2018. Były reprezentant Anglii trenere...\n", "1 Liga Mistrzyń: Podopieczne Kima Rasmussena bli...\n", "2 Wyczerpujące treningi biegowe Justyny Kowalczy...\n", "3 Mundial 2018. Zagraniczne media zareagowały na...\n", "4 BCL. Artur Gronek: Musimy grać twardziej. Pope...\n", "... ...\n", "5447 Michał Probierz szuka powodów do optymizmu. \"C...\n", "5448 ME 2017 w siatkówce. Znakomita frekwencja. Kib...\n", "5449 Zobacz oficjalny trailer KSW 42 (wideo) Organi...\n", "5450 Rummenigge nie wyklucza, że ktoś odejdzie z Ba...\n", "5451 Sympatyczny gest argentyńskich tenisistów. Obd...\n", "\n", "[5452 rows x 1 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "array([[ 0, 0, 0, ..., 149, 3657, 28408],\n", " [ 0, 0, 0, ..., 2378, 59831, 31454],\n", " [ 0, 0, 0, ..., 311, 991, 15435],\n", " ...,\n", " [ 0, 0, 0, ..., 2, 2999, 11543],\n", " [ 0, 0, 0, ..., 4, 1077, 38402],\n", " [ 0, 0, 0, ..., 1001, 39, 18089]])" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "array([[1],\n", " [1],\n", " [0],\n", " ...,\n", " [0],\n", " [1],\n", " [1]], dtype=int64)" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dev_in = pd.read_csv(\"dev-0/in.tsv\", sep=\"\\t\", header=None, on_bad_lines='skip')\n", "display(dev_in)\n", "dev_in.columns = [\"sentence\"]\n", "dev_in['sentence_split'] = dev_in['sentence'].apply(preprocess)\n", "dev_text = token.texts_to_sequences(dev_in['sentence_split'])\n", "dev_text = pad_sequences(dev_text)\n", "\n", "dev_expected = pd.read_csv(\"dev-0/expected.tsv\", sep=\"\\t\", header=None, on_bad_lines='skip').to_numpy()\n", "display(dev_text)\n", "display(dev_expected)" ] }, { "cell_type": "markdown", "id": "d6dac3f7-d148-4629-bece-27d91fb7169b", "metadata": {}, "source": [ "# Training the model" ] }, { "cell_type": "code", "execution_count": 11, "id": "11c1dc1d-18af-44e8-b026-7d5f637a2081", "metadata": {}, "outputs": [], "source": [ "opt = Adam(learning_rate=0.0001)" ] }, { "cell_type": "code", "execution_count": 12, "id": "9bb42d4d-c33f-4f74-b111-c4366c3026d4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m31s\u001b[0m 18ms/step - acc: 0.8497 - loss: 0.2841\n", "Epoch 2/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 18ms/step - acc: 0.9729 - loss: 0.0774\n", "Epoch 3/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 18ms/step - acc: 0.9763 - loss: 0.0695\n", "Epoch 4/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 18ms/step - acc: 0.9790 - loss: 0.0608\n", "Epoch 5/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 20ms/step - acc: 0.9796 - loss: 0.0586\n", "Epoch 6/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9823 - loss: 0.0494\n", "Epoch 7/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9831 - loss: 0.0490\n", "Epoch 8/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9832 - loss: 0.0477\n", "Epoch 9/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9842 - loss: 0.0437\n", "Epoch 10/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9848 - loss: 0.0444\n", "Epoch 11/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9855 - loss: 0.0418\n", "Epoch 12/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m32s\u001b[0m 21ms/step - acc: 0.9862 - loss: 0.0420\n", "Epoch 13/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m36s\u001b[0m 24ms/step - acc: 0.9860 - loss: 0.0400\n", "Epoch 14/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m32s\u001b[0m 21ms/step - acc: 0.9864 - loss: 0.0392\n", "Epoch 15/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9865 - loss: 0.0387\n", "Epoch 16/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9871 - loss: 0.0369\n", "Epoch 17/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9875 - loss: 0.0349\n", "Epoch 18/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9877 - loss: 0.0357\n", "Epoch 19/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9888 - loss: 0.0312\n", "Epoch 20/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 19ms/step - acc: 0.9879 - loss: 0.0347\n", "Epoch 21/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 20ms/step - acc: 0.9879 - loss: 0.0326\n", "Epoch 22/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 20ms/step - acc: 0.9881 - loss: 0.0329\n", "Epoch 23/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 19ms/step - acc: 0.9874 - loss: 0.0337\n", "Epoch 24/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9889 - loss: 0.0307\n", "Epoch 25/25\n", "\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m37s\u001b[0m 24ms/step - acc: 0.9893 - loss: 0.0290\n" ] } ], "source": [ "keras_model = Sequential()\n", "keras_model.add(Embedding(vocabulary_size, 500, weights=[embedding_matrix], trainable=False))\n", "keras_model.add(Dropout(0.4))\n", "keras_model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))\n", "keras_model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))\n", "keras_model.add(MaxPooling1D())\n", "keras_model.add(Dropout(0.4))\n", "keras_model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))\n", "keras_model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))\n", "keras_model.add(MaxPooling1D())\n", "keras_model.add(Dropout(0.2))\n", "keras_model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))\n", "keras_model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))\n", "keras_model.add(GlobalMaxPooling1D())\n", "keras_model.add(Dropout(0.4))\n", "keras_model.add(Dense(200))\n", "keras_model.add(Activation('relu'))\n", "keras_model.add(Dropout(0.4))\n", "keras_model.add(Dense(1))\n", "keras_model.add(Activation('sigmoid'))\n", "keras_model.compile(loss='binary_crossentropy', metrics=['acc'], optimizer=opt)\n", "keras_model.fit(X_train, y_train, batch_size=64, epochs=25)\n", "\n", "model = keras_model" ] }, { "cell_type": "code", "execution_count": 39, "id": "586e83f2-ff5a-498d-a06a-42476df2d972", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step\n" ] } ], "source": [ "test_result = model.predict(X_test)\n", "test_predictions = np.where(test_result>=0.50, 1, 0)\n", "pd.DataFrame(test_predictions).to_csv('test-A/out.tsv', sep=\"\\t\", index=False, encoding='utf-8') " ] }, { "cell_type": "markdown", "id": "f4727d5e-e15d-425d-abd3-7f01427e8385", "metadata": {}, "source": [ "# Evaluation\n", "I had problems installing GEval on Windows so I did the evaluation the old fashioned way " ] }, { "cell_type": "code", "execution_count": 38, "id": "6c646808-088d-41f0-8f9c-1d143aec769f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
01
11
20
31
41
......
54471
54481
54490
54501
54511
\n", "

5452 rows × 1 columns

\n", "
" ], "text/plain": [ " 0\n", "0 1\n", "1 1\n", "2 0\n", "3 1\n", "4 1\n", "... ..\n", "5447 1\n", "5448 1\n", "5449 0\n", "5450 1\n", "5451 1\n", "\n", "[5452 rows x 1 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dev_result = model.predict(dev_text)\n", "dev_predictions = np.where(dev_result>=0.50, 1, 0)\n", "predictions_df = pd.DataFrame(dev_predictions)\n", "display(predictions_df)" ] }, { "cell_type": "code", "execution_count": 40, "id": "b5c1c0a1-8dae-4f6d-a0a6-badee37d32a3", "metadata": {}, "outputs": [], "source": [ "dev_data = pd.read_csv(\"dev-0/in.tsv\", sep=\"\\t\", header=None, on_bad_lines='skip')\n", "predictions_df.to_csv('dev-0/out.tsv', sep=\"\\t\", index=False, header=None)\n", "dev_data.to_csv('in.tsv', sep=\"\\t\", index=False, header=None, encoding='utf-8')" ] }, { "cell_type": "code", "execution_count": 15, "id": "ed802c5c-f63f-4618-b30d-5d8c0fe5e16d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.9888114453411592\n" ] } ], "source": [ "score = accuracy_score(y_true=dev_expected, y_pred=dev_predictions)\n", "print(score)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.14" } }, "nbformat": 4, "nbformat_minor": 5 }