sport-text-classification-ball/Word2Vec.ipynb
jenkins-promoscan f268de4aa1 Upload project
2024-05-19 23:58:06 +02:00

1104 lines
40 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e3b4fa50-eb34-4e53-b938-65cbd4c21f43",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from nltk.tokenize import word_tokenize\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import LabelEncoder\n",
"import gensim\n",
"from gensim.models import Word2Vec\n",
"import numpy as np\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"import pandas as pd\n",
"from keras.src.utils import pad_sequences\n",
"from keras.src.legacy.preprocessing.text import Tokenizer\n",
"from keras.src.layers import Dropout, Dense, Activation, Embedding, MaxPooling1D, GlobalMaxPooling1D\n",
"from keras.src.layers import Conv1D\n",
"from keras import Sequential\n",
"from keras.optimizers import Adam\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "markdown",
"id": "4363302f-a5e2-450b-bcee-5a0045101986",
"metadata": {},
"source": [
"# Data preprocessing"
]
},
{
"cell_type": "markdown",
"id": "40c5fe05-fe48-42ed-8d1a-9c128e8a7550",
"metadata": {},
"source": [
"## Train data preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "80cdc748-8645-4724-afaf-f5af64be9052",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>sentence</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Mindaugas Budzinauskas wierzy w odbudowę formy...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Przyjmujący reprezentacji Polski wrócił do PGE...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>Aleksander Filipiak: Czuję się dobrze w nowym ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>Victoria Carl i Aleksiej Czerwotkin mistrzami ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98127</th>\n",
" <td>1</td>\n",
" <td>Kamil Syprzak zaczyna kolekcjonować trofea. FC...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98128</th>\n",
" <td>1</td>\n",
" <td>Holandia: dwa gole Piotra Parzyszka Piotr Parz...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98129</th>\n",
" <td>1</td>\n",
" <td>Sparingowo: Korona gorsza od Stali. Lettieri s...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98130</th>\n",
" <td>1</td>\n",
" <td>Vive - Wisła. Ośmiu debiutantów w tegorocznej ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98131</th>\n",
" <td>1</td>\n",
" <td>WTA Miami: Timea Bacsinszky pokonana, Swietłan...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>98132 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" label sentence\n",
"0 1 Mindaugas Budzinauskas wierzy w odbudowę formy...\n",
"1 1 Przyjmujący reprezentacji Polski wrócił do PGE...\n",
"2 0 FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...\n",
"3 1 Aleksander Filipiak: Czuję się dobrze w nowym ...\n",
"4 0 Victoria Carl i Aleksiej Czerwotkin mistrzami ...\n",
"... ... ...\n",
"98127 1 Kamil Syprzak zaczyna kolekcjonować trofea. FC...\n",
"98128 1 Holandia: dwa gole Piotra Parzyszka Piotr Parz...\n",
"98129 1 Sparingowo: Korona gorsza od Stali. Lettieri s...\n",
"98130 1 Vive - Wisła. Ośmiu debiutantów w tegorocznej ...\n",
"98131 1 WTA Miami: Timea Bacsinszky pokonana, Swietłan...\n",
"\n",
"[98132 rows x 2 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Read the raw training TSV (no header row, malformed lines skipped),\n",
"# drop the third column and name the two remaining columns.\n",
"train_data = (\n",
"    pd.read_csv('./train/train.tsv/train.tsv', sep='\\t', header=None, on_bad_lines='skip')\n",
"    .pipe(lambda frame: frame.drop(columns=frame.columns[2]))\n",
"    .set_axis([\"label\", \"sentence\"], axis=1)\n",
")\n",
"\n",
"display(train_data)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "93ca5e9a-56f7-493f-b088-81d272cfe4aa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>sentence</th>\n",
" <th>sentence_split</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Mindaugas Budzinauskas wierzy w odbudowę formy...</td>\n",
" <td>[mindaugas, budzinauskas, wierzy, w, odbudowę,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Przyjmujący reprezentacji Polski wrócił do PGE...</td>\n",
" <td>[przyjmujący, reprezentacji, polski, wrócił, d...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...</td>\n",
" <td>[fen, 9, zapowiedź, walki, róża, gumienna, vs,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>Aleksander Filipiak: Czuję się dobrze w nowym ...</td>\n",
" <td>[aleksander, filipiak, czuję, się, dobrze, w, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>Victoria Carl i Aleksiej Czerwotkin mistrzami ...</td>\n",
" <td>[victoria, carl, i, aleksiej, czerwotkin, mist...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98127</th>\n",
" <td>1</td>\n",
" <td>Kamil Syprzak zaczyna kolekcjonować trofea. FC...</td>\n",
" <td>[kamil, syprzak, zaczyna, kolekcjonować, trofe...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98128</th>\n",
" <td>1</td>\n",
" <td>Holandia: dwa gole Piotra Parzyszka Piotr Parz...</td>\n",
" <td>[holandia, dwa, gole, piotra, parzyszka, piotr...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98129</th>\n",
" <td>1</td>\n",
" <td>Sparingowo: Korona gorsza od Stali. Lettieri s...</td>\n",
" <td>[sparingowo, korona, gorsza, od, stali, lettie...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98130</th>\n",
" <td>1</td>\n",
" <td>Vive - Wisła. Ośmiu debiutantów w tegorocznej ...</td>\n",
" <td>[vive, wisła, ośmiu, debiutantów, w, tegoroczn...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98131</th>\n",
" <td>1</td>\n",
" <td>WTA Miami: Timea Bacsinszky pokonana, Swietłan...</td>\n",
" <td>[wta, miami, timea, bacsinszky, pokonana, swie...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>98132 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" label sentence \\\n",
"0 1 Mindaugas Budzinauskas wierzy w odbudowę formy... \n",
"1 1 Przyjmujący reprezentacji Polski wrócił do PGE... \n",
"2 0 FEN 9: Zapowiedź walki Róża Gumienna vs Katarz... \n",
"3 1 Aleksander Filipiak: Czuję się dobrze w nowym ... \n",
"4 0 Victoria Carl i Aleksiej Czerwotkin mistrzami ... \n",
"... ... ... \n",
"98127 1 Kamil Syprzak zaczyna kolekcjonować trofea. FC... \n",
"98128 1 Holandia: dwa gole Piotra Parzyszka Piotr Parz... \n",
"98129 1 Sparingowo: Korona gorsza od Stali. Lettieri s... \n",
"98130 1 Vive - Wisła. Ośmiu debiutantów w tegorocznej ... \n",
"98131 1 WTA Miami: Timea Bacsinszky pokonana, Swietłan... \n",
"\n",
" sentence_split \n",
"0 [mindaugas, budzinauskas, wierzy, w, odbudowę,... \n",
"1 [przyjmujący, reprezentacji, polski, wrócił, d... \n",
"2 [fen, 9, zapowiedź, walki, róża, gumienna, vs,... \n",
"3 [aleksander, filipiak, czuję, się, dobrze, w, ... \n",
"4 [victoria, carl, i, aleksiej, czerwotkin, mist... \n",
"... ... \n",
"98127 [kamil, syprzak, zaczyna, kolekcjonować, trofe... \n",
"98128 [holandia, dwa, gole, piotra, parzyszka, piotr... \n",
"98129 [sparingowo, korona, gorsza, od, stali, lettie... \n",
"98130 [vive, wisła, ośmiu, debiutantów, w, tegoroczn... \n",
"98131 [wta, miami, timea, bacsinszky, pokonana, swie... \n",
"\n",
"[98132 rows x 3 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def preprocess(sentence):\n",
"    \"\"\"Lower-case a sentence, blank out non-word characters and tokenize it.\n",
"\n",
"    Returns the list of tokens produced by word_tokenize.\n",
"    \"\"\"\n",
"    cleaned = re.sub(r'\\W', ' ', sentence.lower())\n",
"    return word_tokenize(cleaned)\n",
"\n",
"\n",
"# Tokenize every training sentence into a new column\n",
"train_data[\"sentence_split\"] = train_data[\"sentence\"].apply(preprocess)\n",
"display(train_data)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e6611779-e621-47f7-a908-1d8123d4426b",
"metadata": {},
"outputs": [],
"source": [
"# Train word embeddings on the tokenized training sentences\n",
"w2v = Word2Vec(\n",
"    sentences=train_data[\"sentence_split\"],\n",
"    vector_size=500,\n",
"    window=5,\n",
"    min_count=2,\n",
"    workers=4,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "85ffca71-529f-463f-a7da-25aa0aa9f8a1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"65861\n"
]
}
],
"source": [
"# Number of words in the trained Word2Vec vocabulary\n",
"print(len(w2v.wv.index_to_key))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c5366e81-5883-4fb7-a721-dd53b5016369",
"metadata": {},
"outputs": [],
"source": [
"# Fit the Keras tokenizer (num_words = size of the Word2Vec vocabulary)\n",
"# and turn the training sentences into padded id sequences.\n",
"token = Tokenizer(num_words=len(w2v.wv))\n",
"token.fit_on_texts(train_data[\"sentence_split\"])\n",
"text = pad_sequences(token.texts_to_sequences(train_data[\"sentence_split\"]))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "68346de7-774f-4b50-8256-1203be8dbec0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X_train:\n",
"\t[[ 0 0 0 ... 1630 724 11557]\n",
" [ 0 0 0 ... 3 129 594]\n",
" [ 0 0 0 ... 4 781 28351]\n",
" ...\n",
" [ 0 0 0 ... 390 35 55]\n",
" [ 0 0 0 ... 44454 12175 329]\n",
" [ 0 0 0 ... 159 455 1172]]\n",
"y_train:\n",
"\t[1 1 0 ... 1 1 1]\n",
"Vocabulary size:\n",
"\t104277\n"
]
}
],
"source": [
"X_train = np.array(text)\n",
"y_train = train_data[\"label\"].to_numpy()\n",
"# Keras word ids start at 1 (0 is the padding id), so the embedding table\n",
"# needs len(word_index) + 1 rows to be able to hold the highest id.\n",
"vocabulary_size = len(token.word_index) + 1\n",
"print(f\"X_train:\\n\\t{X_train}\\ny_train:\\n\\t{y_train}\\nVocabulary size:\\n\\t{vocabulary_size}\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1c76644e-91a2-4189-84ca-5e4c5ee57e4a",
"metadata": {},
"outputs": [],
"source": [
"# Row i holds the Word2Vec vector of the word with tokenizer id i;\n",
"# rows of words missing from the Word2Vec vocabulary stay all-zero.\n",
"embedding_matrix = np.zeros((vocabulary_size, 500))\n",
"for word, i in token.word_index.items():\n",
"    if word not in w2v.wv:\n",
"        continue\n",
"    embedding_matrix[i] = w2v.wv[word]"
]
},
{
"cell_type": "markdown",
"id": "a24fd19f-6aa3-4b93-bf73-8b7c939f08bf",
"metadata": {},
"source": [
"## Test data preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "908ab909-c05e-473b-82cf-9f7e044eb300",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ATP Sztokholm: Juergen Zopp wykorzystał szansę...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Krowicki z reprezentacją kobiet aż do igrzysk ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Wielki powrót Łukasza Kubota Odradza się zawsz...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Marcel Hirscher wygrał ostatni slalom gigant m...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Polki do Czarnogóry z pełnią zaangażowania. Sy...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5443</th>\n",
" <td>Biało-czerwona siła w Falun. Oni będą reprezen...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5444</th>\n",
" <td>Finał WTA Tokio na żywo: Woźniacka - Osaka LIV...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5445</th>\n",
" <td>Oni zapisali się w annałach. Hubert Hurkacz 15...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5446</th>\n",
" <td>Poprawia się stan Nikiego Laudy. Austriak może...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5447</th>\n",
" <td>Liga Mistrzów. Zabójcza końcówka Interu Mediol...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5448 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 ATP Sztokholm: Juergen Zopp wykorzystał szansę...\n",
"1 Krowicki z reprezentacją kobiet aż do igrzysk ...\n",
"2 Wielki powrót Łukasza Kubota Odradza się zawsz...\n",
"3 Marcel Hirscher wygrał ostatni slalom gigant m...\n",
"4 Polki do Czarnogóry z pełnią zaangażowania. Sy...\n",
"... ...\n",
"5443 Biało-czerwona siła w Falun. Oni będą reprezen...\n",
"5444 Finał WTA Tokio na żywo: Woźniacka - Osaka LIV...\n",
"5445 Oni zapisali się w annałach. Hubert Hurkacz 15...\n",
"5446 Poprawia się stan Nikiego Laudy. Austriak może...\n",
"5447 Liga Mistrzów. Zabójcza końcówka Interu Mediol...\n",
"\n",
"[5448 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentence</th>\n",
" <th>sentence_split</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ATP Sztokholm: Juergen Zopp wykorzystał szansę...</td>\n",
" <td>[atp, sztokholm, juergen, zopp, wykorzystał, s...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Krowicki z reprezentacją kobiet aż do igrzysk ...</td>\n",
" <td>[krowicki, z, reprezentacją, kobiet, aż, do, i...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Wielki powrót Łukasza Kubota Odradza się zawsz...</td>\n",
" <td>[wielki, powrót, łukasza, kubota, odradza, się...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Marcel Hirscher wygrał ostatni slalom gigant m...</td>\n",
" <td>[marcel, hirscher, wygrał, ostatni, slalom, gi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Polki do Czarnogóry z pełnią zaangażowania. Sy...</td>\n",
" <td>[polki, do, czarnogóry, z, pełnią, zaangażowan...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5443</th>\n",
" <td>Biało-czerwona siła w Falun. Oni będą reprezen...</td>\n",
" <td>[biało, czerwona, siła, w, falun, oni, będą, r...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5444</th>\n",
" <td>Finał WTA Tokio na żywo: Woźniacka - Osaka LIV...</td>\n",
" <td>[finał, wta, tokio, na, żywo, woźniacka, osaka...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5445</th>\n",
" <td>Oni zapisali się w annałach. Hubert Hurkacz 15...</td>\n",
" <td>[oni, zapisali, się, w, annałach, hubert, hurk...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5446</th>\n",
" <td>Poprawia się stan Nikiego Laudy. Austriak może...</td>\n",
" <td>[poprawia, się, stan, nikiego, laudy, austriak...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5447</th>\n",
" <td>Liga Mistrzów. Zabójcza końcówka Interu Mediol...</td>\n",
" <td>[liga, mistrzów, zabójcza, końcówka, interu, m...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5448 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" sentence \\\n",
"0 ATP Sztokholm: Juergen Zopp wykorzystał szansę... \n",
"1 Krowicki z reprezentacją kobiet aż do igrzysk ... \n",
"2 Wielki powrót Łukasza Kubota Odradza się zawsz... \n",
"3 Marcel Hirscher wygrał ostatni slalom gigant m... \n",
"4 Polki do Czarnogóry z pełnią zaangażowania. Sy... \n",
"... ... \n",
"5443 Biało-czerwona siła w Falun. Oni będą reprezen... \n",
"5444 Finał WTA Tokio na żywo: Woźniacka - Osaka LIV... \n",
"5445 Oni zapisali się w annałach. Hubert Hurkacz 15... \n",
"5446 Poprawia się stan Nikiego Laudy. Austriak może... \n",
"5447 Liga Mistrzów. Zabójcza końcówka Interu Mediol... \n",
"\n",
" sentence_split \n",
"0 [atp, sztokholm, juergen, zopp, wykorzystał, s... \n",
"1 [krowicki, z, reprezentacją, kobiet, aż, do, i... \n",
"2 [wielki, powrót, łukasza, kubota, odradza, się... \n",
"3 [marcel, hirscher, wygrał, ostatni, slalom, gi... \n",
"4 [polki, do, czarnogóry, z, pełnią, zaangażowan... \n",
"... ... \n",
"5443 [biało, czerwona, siła, w, falun, oni, będą, r... \n",
"5444 [finał, wta, tokio, na, żywo, woźniacka, osaka... \n",
"5445 [oni, zapisali, się, w, annałach, hubert, hurk... \n",
"5446 [poprawia, się, stan, nikiego, laudy, austriak... \n",
"5447 [liga, mistrzów, zabójcza, końcówka, interu, m... \n",
"\n",
"[5448 rows x 2 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Load the test sentences and drop the second column of the TSV\n",
"X_test_pd = pd.read_csv(\"test-A/in.tsv\", sep=\"\\t\", header=None).pipe(\n",
"    lambda frame: frame.drop(columns=frame.columns[1])\n",
")\n",
"display(X_test_pd)\n",
"\n",
"# Tokenize with the same preprocessing and tokenizer as the training data\n",
"X_test_pd = X_test_pd.set_axis([\"sentence\"], axis=1)\n",
"X_test_pd['sentence_split'] = X_test_pd['sentence'].apply(preprocess)\n",
"X_test = pad_sequences(token.texts_to_sequences(X_test_pd['sentence_split']))\n",
"display(X_test_pd)"
]
},
{
"cell_type": "markdown",
"id": "62b38223-cda6-443a-a05f-a1661233e94c",
"metadata": {},
"source": [
"## Dev data preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "6a5fbb7d-2a8e-4633-8f4c-bec6acf624b2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Mundial 2018. Były reprezentant Anglii trenere...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Liga Mistrzyń: Podopieczne Kima Rasmussena bli...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Wyczerpujące treningi biegowe Justyny Kowalczy...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Mundial 2018. Zagraniczne media zareagowały na...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>BCL. Artur Gronek: Musimy grać twardziej. Pope...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5447</th>\n",
" <td>Michał Probierz szuka powodów do optymizmu. \"C...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5448</th>\n",
" <td>ME 2017 w siatkówce. Znakomita frekwencja. Kib...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5449</th>\n",
" <td>Zobacz oficjalny trailer KSW 42 (wideo) Organi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5450</th>\n",
" <td>Rummenigge nie wyklucza, że ktoś odejdzie z Ba...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5451</th>\n",
" <td>Sympatyczny gest argentyńskich tenisistów. Obd...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5452 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 Mundial 2018. Były reprezentant Anglii trenere...\n",
"1 Liga Mistrzyń: Podopieczne Kima Rasmussena bli...\n",
"2 Wyczerpujące treningi biegowe Justyny Kowalczy...\n",
"3 Mundial 2018. Zagraniczne media zareagowały na...\n",
"4 BCL. Artur Gronek: Musimy grać twardziej. Pope...\n",
"... ...\n",
"5447 Michał Probierz szuka powodów do optymizmu. \"C...\n",
"5448 ME 2017 w siatkówce. Znakomita frekwencja. Kib...\n",
"5449 Zobacz oficjalny trailer KSW 42 (wideo) Organi...\n",
"5450 Rummenigge nie wyklucza, że ktoś odejdzie z Ba...\n",
"5451 Sympatyczny gest argentyńskich tenisistów. Obd...\n",
"\n",
"[5452 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"array([[ 0, 0, 0, ..., 149, 3657, 28408],\n",
" [ 0, 0, 0, ..., 2378, 59831, 31454],\n",
" [ 0, 0, 0, ..., 311, 991, 15435],\n",
" ...,\n",
" [ 0, 0, 0, ..., 2, 2999, 11543],\n",
" [ 0, 0, 0, ..., 4, 1077, 38402],\n",
" [ 0, 0, 0, ..., 1001, 39, 18089]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"array([[1],\n",
" [1],\n",
" [0],\n",
" ...,\n",
" [0],\n",
" [1],\n",
" [1]], dtype=int64)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Dev sentences: same preprocessing and tokenizer as the training data\n",
"dev_in = pd.read_csv(\"dev-0/in.tsv\", sep=\"\\t\", header=None, on_bad_lines='skip')\n",
"display(dev_in)\n",
"dev_in = dev_in.set_axis([\"sentence\"], axis=1)\n",
"dev_in['sentence_split'] = dev_in['sentence'].apply(preprocess)\n",
"dev_text = pad_sequences(token.texts_to_sequences(dev_in['sentence_split']))\n",
"\n",
"# Expected dev labels as a NumPy array\n",
"dev_expected = pd.read_csv(\n",
"    \"dev-0/expected.tsv\", sep=\"\\t\", header=None, on_bad_lines='skip'\n",
").to_numpy()\n",
"display(dev_text, dev_expected)"
]
},
{
"cell_type": "markdown",
"id": "d6dac3f7-d148-4629-bece-27d91fb7169b",
"metadata": {},
"source": [
"# Training the model"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "11c1dc1d-18af-44e8-b026-7d5f637a2081",
"metadata": {},
"outputs": [],
"source": [
"# Adam optimizer used to compile the classifier below\n",
"opt = Adam(learning_rate=1e-4)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "9bb42d4d-c33f-4f74-b111-c4366c3026d4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m31s\u001b[0m 18ms/step - acc: 0.8497 - loss: 0.2841\n",
"Epoch 2/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 18ms/step - acc: 0.9729 - loss: 0.0774\n",
"Epoch 3/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 18ms/step - acc: 0.9763 - loss: 0.0695\n",
"Epoch 4/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 18ms/step - acc: 0.9790 - loss: 0.0608\n",
"Epoch 5/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 20ms/step - acc: 0.9796 - loss: 0.0586\n",
"Epoch 6/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9823 - loss: 0.0494\n",
"Epoch 7/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9831 - loss: 0.0490\n",
"Epoch 8/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9832 - loss: 0.0477\n",
"Epoch 9/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9842 - loss: 0.0437\n",
"Epoch 10/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9848 - loss: 0.0444\n",
"Epoch 11/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9855 - loss: 0.0418\n",
"Epoch 12/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m32s\u001b[0m 21ms/step - acc: 0.9862 - loss: 0.0420\n",
"Epoch 13/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m36s\u001b[0m 24ms/step - acc: 0.9860 - loss: 0.0400\n",
"Epoch 14/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m32s\u001b[0m 21ms/step - acc: 0.9864 - loss: 0.0392\n",
"Epoch 15/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9865 - loss: 0.0387\n",
"Epoch 16/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9871 - loss: 0.0369\n",
"Epoch 17/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9875 - loss: 0.0349\n",
"Epoch 18/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9877 - loss: 0.0357\n",
"Epoch 19/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9888 - loss: 0.0312\n",
"Epoch 20/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 19ms/step - acc: 0.9879 - loss: 0.0347\n",
"Epoch 21/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 20ms/step - acc: 0.9879 - loss: 0.0326\n",
"Epoch 22/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 20ms/step - acc: 0.9881 - loss: 0.0329\n",
"Epoch 23/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 19ms/step - acc: 0.9874 - loss: 0.0337\n",
"Epoch 24/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9889 - loss: 0.0307\n",
"Epoch 25/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m37s\u001b[0m 24ms/step - acc: 0.9893 - loss: 0.0290\n"
]
}
],
"source": [
"# Settings shared by every convolution layer\n",
"conv_options = dict(activation='relu', padding='same', strides=1)\n",
"\n",
"keras_model = Sequential([\n",
"    # Frozen embedding layer initialised from the Word2Vec matrix\n",
"    Embedding(vocabulary_size, 500, weights=[embedding_matrix], trainable=False),\n",
"    Dropout(0.4),\n",
"    Conv1D(50, 3, **conv_options),\n",
"    Conv1D(50, 3, **conv_options),\n",
"    MaxPooling1D(),\n",
"    Dropout(0.4),\n",
"    Conv1D(100, 3, **conv_options),\n",
"    Conv1D(100, 3, **conv_options),\n",
"    MaxPooling1D(),\n",
"    Dropout(0.2),\n",
"    Conv1D(200, 3, **conv_options),\n",
"    Conv1D(200, 3, **conv_options),\n",
"    GlobalMaxPooling1D(),\n",
"    Dropout(0.4),\n",
"    Dense(200),\n",
"    Activation('relu'),\n",
"    Dropout(0.4),\n",
"    # Single sigmoid unit for the binary label\n",
"    Dense(1),\n",
"    Activation('sigmoid'),\n",
"])\n",
"keras_model.compile(loss='binary_crossentropy', metrics=['acc'], optimizer=opt)\n",
"keras_model.fit(X_train, y_train, batch_size=64, epochs=25)\n",
"\n",
"model = keras_model"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "586e83f2-ff5a-498d-a06a-42476df2d972",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step\n"
]
}
],
"source": [
"test_result = model.predict(X_test)\n",
"# Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels\n",
"test_predictions = np.where(test_result>=0.50, 1, 0)\n",
"# header=False: without it to_csv writes the column name \"0\" as an extra first line,\n",
"# unlike dev-0/out.tsv, which is written without a header.\n",
"pd.DataFrame(test_predictions).to_csv('test-A/out.tsv', sep=\"\\t\", index=False, header=False, encoding='utf-8')"
]
},
{
"cell_type": "markdown",
"id": "f4727d5e-e15d-425d-abd3-7f01427e8385",
"metadata": {},
"source": [
"# Evaluation\n",
"I had problems installing GEval on Windows, so I did the evaluation the old-fashioned way."
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "6c646808-088d-41f0-8f9c-1d143aec769f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5447</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5448</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5449</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5450</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5451</th>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5452 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 1\n",
"1 1\n",
"2 0\n",
"3 1\n",
"4 1\n",
"... ..\n",
"5447 1\n",
"5448 1\n",
"5449 0\n",
"5450 1\n",
"5451 1\n",
"\n",
"[5452 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Hard 0/1 predictions for the dev set (sigmoid output thresholded at 0.5)\n",
"dev_result = model.predict(dev_text)\n",
"dev_predictions = (dev_result >= 0.50).astype(int)\n",
"predictions_df = pd.DataFrame(dev_predictions)\n",
"display(predictions_df)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "b5c1c0a1-8dae-4f6d-a0a6-badee37d32a3",
"metadata": {},
"outputs": [],
"source": [
"# Re-read the dev input so a copy can be written to the working directory\n",
"dev_data = pd.read_csv(\"dev-0/in.tsv\", sep=\"\\t\", header=None, on_bad_lines='skip')\n",
"\n",
"# Save the dev predictions without header or index\n",
"predictions_df.to_csv('dev-0/out.tsv', sep=\"\\t\", index=False, header=False)\n",
"\n",
"# Write the dev input to in.tsv in the working directory\n",
"dev_data.to_csv('in.tsv', sep=\"\\t\", index=False, header=False, encoding='utf-8')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ed802c5c-f63f-4618-b30d-5d8c0fe5e16d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9888114453411592\n"
]
}
],
"source": [
"# Accuracy of the dev predictions against the expected labels\n",
"score = accuracy_score(dev_expected, dev_predictions)\n",
"print(score)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}