sport-text-classification-ball/Word2Vec.ipynb

1104 lines
40 KiB
Plaintext
Raw Normal View History

2024-05-19 23:58:06 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "e3b4fa50-eb34-4e53-b938-65cbd4c21f43",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from nltk.tokenize import word_tokenize\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import LabelEncoder\n",
"import gensim\n",
"from gensim.models import Word2Vec\n",
"import numpy as np\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score, classification_report\n",
"import pandas as pd\n",
"from keras.src.utils import pad_sequences\n",
"from keras.src.legacy.preprocessing.text import Tokenizer\n",
"from keras.src.layers import Dropout, Dense, Activation, Embedding, MaxPooling1D, GlobalMaxPooling1D\n",
"from keras.src.layers import Conv1D\n",
"from keras import Sequential\n",
"from keras.optimizers import Adam\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "markdown",
"id": "4363302f-a5e2-450b-bcee-5a0045101986",
"metadata": {},
"source": [
"# Data preprocessing"
]
},
{
"cell_type": "markdown",
"id": "40c5fe05-fe48-42ed-8d1a-9c128e8a7550",
"metadata": {},
"source": [
"## Train data preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "80cdc748-8645-4724-afaf-f5af64be9052",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>sentence</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Mindaugas Budzinauskas wierzy w odbudowę formy...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Przyjmujący reprezentacji Polski wrócił do PGE...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>Aleksander Filipiak: Czuję się dobrze w nowym ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>Victoria Carl i Aleksiej Czerwotkin mistrzami ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98127</th>\n",
" <td>1</td>\n",
" <td>Kamil Syprzak zaczyna kolekcjonować trofea. FC...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98128</th>\n",
" <td>1</td>\n",
" <td>Holandia: dwa gole Piotra Parzyszka Piotr Parz...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98129</th>\n",
" <td>1</td>\n",
" <td>Sparingowo: Korona gorsza od Stali. Lettieri s...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98130</th>\n",
" <td>1</td>\n",
" <td>Vive - Wisła. Ośmiu debiutantów w tegorocznej ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98131</th>\n",
" <td>1</td>\n",
" <td>WTA Miami: Timea Bacsinszky pokonana, Swietłan...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>98132 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" label sentence\n",
"0 1 Mindaugas Budzinauskas wierzy w odbudowę formy...\n",
"1 1 Przyjmujący reprezentacji Polski wrócił do PGE...\n",
"2 0 FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...\n",
"3 1 Aleksander Filipiak: Czuję się dobrze w nowym ...\n",
"4 0 Victoria Carl i Aleksiej Czerwotkin mistrzami ...\n",
"... ... ...\n",
"98127 1 Kamil Syprzak zaczyna kolekcjonować trofea. FC...\n",
"98128 1 Holandia: dwa gole Piotra Parzyszka Piotr Parz...\n",
"98129 1 Sparingowo: Korona gorsza od Stali. Lettieri s...\n",
"98130 1 Vive - Wisła. Ośmiu debiutantów w tegorocznej ...\n",
"98131 1 WTA Miami: Timea Bacsinszky pokonana, Swietłan...\n",
"\n",
"[98132 rows x 2 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Read the header-less training TSV (malformed lines are skipped), discard the\n",
"# third column and name the two remaining columns.\n",
"train_data = (\n",
"    pd.read_csv('./train/train.tsv/train.tsv', sep='\\t', header=None, on_bad_lines='skip')\n",
"    .pipe(lambda frame: frame.drop(columns=frame.columns[2]))\n",
"    .set_axis([\"label\", \"sentence\"], axis=1)\n",
")\n",
"\n",
"display(train_data)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "93ca5e9a-56f7-493f-b088-81d272cfe4aa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>sentence</th>\n",
" <th>sentence_split</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>Mindaugas Budzinauskas wierzy w odbudowę formy...</td>\n",
" <td>[mindaugas, budzinauskas, wierzy, w, odbudowę,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>Przyjmujący reprezentacji Polski wrócił do PGE...</td>\n",
" <td>[przyjmujący, reprezentacji, polski, wrócił, d...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...</td>\n",
" <td>[fen, 9, zapowiedź, walki, róża, gumienna, vs,...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>Aleksander Filipiak: Czuję się dobrze w nowym ...</td>\n",
" <td>[aleksander, filipiak, czuję, się, dobrze, w, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>Victoria Carl i Aleksiej Czerwotkin mistrzami ...</td>\n",
" <td>[victoria, carl, i, aleksiej, czerwotkin, mist...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98127</th>\n",
" <td>1</td>\n",
" <td>Kamil Syprzak zaczyna kolekcjonować trofea. FC...</td>\n",
" <td>[kamil, syprzak, zaczyna, kolekcjonować, trofe...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98128</th>\n",
" <td>1</td>\n",
" <td>Holandia: dwa gole Piotra Parzyszka Piotr Parz...</td>\n",
" <td>[holandia, dwa, gole, piotra, parzyszka, piotr...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98129</th>\n",
" <td>1</td>\n",
" <td>Sparingowo: Korona gorsza od Stali. Lettieri s...</td>\n",
" <td>[sparingowo, korona, gorsza, od, stali, lettie...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98130</th>\n",
" <td>1</td>\n",
" <td>Vive - Wisła. Ośmiu debiutantów w tegorocznej ...</td>\n",
" <td>[vive, wisła, ośmiu, debiutantów, w, tegoroczn...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98131</th>\n",
" <td>1</td>\n",
" <td>WTA Miami: Timea Bacsinszky pokonana, Swietłan...</td>\n",
" <td>[wta, miami, timea, bacsinszky, pokonana, swie...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>98132 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" label sentence \\\n",
"0 1 Mindaugas Budzinauskas wierzy w odbudowę formy... \n",
"1 1 Przyjmujący reprezentacji Polski wrócił do PGE... \n",
"2 0 FEN 9: Zapowiedź walki Róża Gumienna vs Katarz... \n",
"3 1 Aleksander Filipiak: Czuję się dobrze w nowym ... \n",
"4 0 Victoria Carl i Aleksiej Czerwotkin mistrzami ... \n",
"... ... ... \n",
"98127 1 Kamil Syprzak zaczyna kolekcjonować trofea. FC... \n",
"98128 1 Holandia: dwa gole Piotra Parzyszka Piotr Parz... \n",
"98129 1 Sparingowo: Korona gorsza od Stali. Lettieri s... \n",
"98130 1 Vive - Wisła. Ośmiu debiutantów w tegorocznej ... \n",
"98131 1 WTA Miami: Timea Bacsinszky pokonana, Swietłan... \n",
"\n",
" sentence_split \n",
"0 [mindaugas, budzinauskas, wierzy, w, odbudowę,... \n",
"1 [przyjmujący, reprezentacji, polski, wrócił, d... \n",
"2 [fen, 9, zapowiedź, walki, róża, gumienna, vs,... \n",
"3 [aleksander, filipiak, czuję, się, dobrze, w, ... \n",
"4 [victoria, carl, i, aleksiej, czerwotkin, mist... \n",
"... ... \n",
"98127 [kamil, syprzak, zaczyna, kolekcjonować, trofe... \n",
"98128 [holandia, dwa, gole, piotra, parzyszka, piotr... \n",
"98129 [sparingowo, korona, gorsza, od, stali, lettie... \n",
"98130 [vive, wisła, ośmiu, debiutantów, w, tegoroczn... \n",
"98131 [wta, miami, timea, bacsinszky, pokonana, swie... \n",
"\n",
"[98132 rows x 3 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def preprocess(sentence):\n",
"    \"\"\"Lower-case `sentence`, replace every non-word character with a space and tokenize it.\n",
"\n",
"    Returns the list of tokens produced by NLTK's `word_tokenize`.\n",
"    \"\"\"\n",
"    normalized = re.sub(r'\\W', ' ', sentence.lower())\n",
"    return word_tokenize(normalized)\n",
"\n",
"\n",
"train_data[\"sentence_split\"] = train_data[\"sentence\"].apply(preprocess)\n",
"display(train_data)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e6611779-e621-47f7-a908-1d8123d4426b",
"metadata": {},
"outputs": [],
"source": [
"# Train 500-dimensional Word2Vec embeddings on the tokenized training sentences;\n",
"# tokens occurring fewer than `min_count` times get no vector.\n",
"w2v = Word2Vec(\n",
"    sentences=train_data[\"sentence_split\"],\n",
"    vector_size=500,\n",
"    window=5,\n",
"    min_count=2,\n",
"    workers=4,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "85ffca71-529f-463f-a7da-25aa0aa9f8a1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"65861\n"
]
}
],
"source": [
"# How many distinct tokens received a Word2Vec vector\n",
"print(f\"{len(w2v.wv)}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "c5366e81-5883-4fb7-a721-dd53b5016369",
"metadata": {},
"outputs": [],
"source": [
"# Keras only emits tokens whose index is strictly below `num_words`, i.e. the\n",
"# `num_words - 1` most frequent ones. The `+ 1` keeps as many tokens as the\n",
"# Word2Vec vocabulary holds instead of silently dropping one of them.\n",
"token = Tokenizer(num_words=len(w2v.wv) + 1)\n",
"token.fit_on_texts(train_data[\"sentence_split\"])\n",
"\n",
"# Encode each tokenized sentence as integer ids and pad all of them to the\n",
"# length of the longest sentence.\n",
"text = pad_sequences(token.texts_to_sequences(train_data[\"sentence_split\"]))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "68346de7-774f-4b50-8256-1203be8dbec0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"X_train:\n",
"\t[[ 0 0 0 ... 1630 724 11557]\n",
" [ 0 0 0 ... 3 129 594]\n",
" [ 0 0 0 ... 4 781 28351]\n",
" ...\n",
" [ 0 0 0 ... 390 35 55]\n",
" [ 0 0 0 ... 44454 12175 329]\n",
" [ 0 0 0 ... 159 455 1172]]\n",
"y_train:\n",
"\t[1 1 0 ... 1 1 1]\n",
"Vocabulary size:\n",
"\t104277\n"
]
}
],
"source": [
"X_train = np.array(text)\n",
"y_train = train_data[\"label\"].to_numpy()\n",
"# Tokenizer indices start at 1 (0 is the padding value), so a table indexed by\n",
"# them needs `len(word_index) + 1` rows; without the `+ 1` the highest index\n",
"# would fall outside the embedding matrix.\n",
"vocabulary_size = len(token.word_index) + 1\n",
"print(f\"X_train:\\n\\t{X_train}\\ny_train:\\n\\t{y_train}\\nVocabulary size:\\n\\t{vocabulary_size}\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1c76644e-91a2-4189-84ca-5e4c5ee57e4a",
"metadata": {},
"outputs": [],
"source": [
"# Row `idx` holds the Word2Vec vector of the token with tokenizer index `idx`;\n",
"# tokens without a Word2Vec vector keep an all-zero row.\n",
"embedding_matrix = np.zeros((vocabulary_size, 500))\n",
"known_tokens = ((idx, word) for word, idx in token.word_index.items() if word in w2v.wv)\n",
"for idx, word in known_tokens:\n",
"    embedding_matrix[idx] = w2v.wv[word]"
]
},
{
"cell_type": "markdown",
"id": "a24fd19f-6aa3-4b93-bf73-8b7c939f08bf",
"metadata": {},
"source": [
"## Test data preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "908ab909-c05e-473b-82cf-9f7e044eb300",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ATP Sztokholm: Juergen Zopp wykorzystał szansę...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Krowicki z reprezentacją kobiet aż do igrzysk ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Wielki powrót Łukasza Kubota Odradza się zawsz...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Marcel Hirscher wygrał ostatni slalom gigant m...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Polki do Czarnogóry z pełnią zaangażowania. Sy...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5443</th>\n",
" <td>Biało-czerwona siła w Falun. Oni będą reprezen...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5444</th>\n",
" <td>Finał WTA Tokio na żywo: Woźniacka - Osaka LIV...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5445</th>\n",
" <td>Oni zapisali się w annałach. Hubert Hurkacz 15...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5446</th>\n",
" <td>Poprawia się stan Nikiego Laudy. Austriak może...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5447</th>\n",
" <td>Liga Mistrzów. Zabójcza końcówka Interu Mediol...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5448 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 ATP Sztokholm: Juergen Zopp wykorzystał szansę...\n",
"1 Krowicki z reprezentacją kobiet aż do igrzysk ...\n",
"2 Wielki powrót Łukasza Kubota Odradza się zawsz...\n",
"3 Marcel Hirscher wygrał ostatni slalom gigant m...\n",
"4 Polki do Czarnogóry z pełnią zaangażowania. Sy...\n",
"... ...\n",
"5443 Biało-czerwona siła w Falun. Oni będą reprezen...\n",
"5444 Finał WTA Tokio na żywo: Woźniacka - Osaka LIV...\n",
"5445 Oni zapisali się w annałach. Hubert Hurkacz 15...\n",
"5446 Poprawia się stan Nikiego Laudy. Austriak może...\n",
"5447 Liga Mistrzów. Zabójcza końcówka Interu Mediol...\n",
"\n",
"[5448 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sentence</th>\n",
" <th>sentence_split</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ATP Sztokholm: Juergen Zopp wykorzystał szansę...</td>\n",
" <td>[atp, sztokholm, juergen, zopp, wykorzystał, s...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Krowicki z reprezentacją kobiet aż do igrzysk ...</td>\n",
" <td>[krowicki, z, reprezentacją, kobiet, aż, do, i...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Wielki powrót Łukasza Kubota Odradza się zawsz...</td>\n",
" <td>[wielki, powrót, łukasza, kubota, odradza, się...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Marcel Hirscher wygrał ostatni slalom gigant m...</td>\n",
" <td>[marcel, hirscher, wygrał, ostatni, slalom, gi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Polki do Czarnogóry z pełnią zaangażowania. Sy...</td>\n",
" <td>[polki, do, czarnogóry, z, pełnią, zaangażowan...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5443</th>\n",
" <td>Biało-czerwona siła w Falun. Oni będą reprezen...</td>\n",
" <td>[biało, czerwona, siła, w, falun, oni, będą, r...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5444</th>\n",
" <td>Finał WTA Tokio na żywo: Woźniacka - Osaka LIV...</td>\n",
" <td>[finał, wta, tokio, na, żywo, woźniacka, osaka...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5445</th>\n",
" <td>Oni zapisali się w annałach. Hubert Hurkacz 15...</td>\n",
" <td>[oni, zapisali, się, w, annałach, hubert, hurk...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5446</th>\n",
" <td>Poprawia się stan Nikiego Laudy. Austriak może...</td>\n",
" <td>[poprawia, się, stan, nikiego, laudy, austriak...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5447</th>\n",
" <td>Liga Mistrzów. Zabójcza końcówka Interu Mediol...</td>\n",
" <td>[liga, mistrzów, zabójcza, końcówka, interu, m...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5448 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" sentence \\\n",
"0 ATP Sztokholm: Juergen Zopp wykorzystał szansę... \n",
"1 Krowicki z reprezentacją kobiet aż do igrzysk ... \n",
"2 Wielki powrót Łukasza Kubota Odradza się zawsz... \n",
"3 Marcel Hirscher wygrał ostatni slalom gigant m... \n",
"4 Polki do Czarnogóry z pełnią zaangażowania. Sy... \n",
"... ... \n",
"5443 Biało-czerwona siła w Falun. Oni będą reprezen... \n",
"5444 Finał WTA Tokio na żywo: Woźniacka - Osaka LIV... \n",
"5445 Oni zapisali się w annałach. Hubert Hurkacz 15... \n",
"5446 Poprawia się stan Nikiego Laudy. Austriak może... \n",
"5447 Liga Mistrzów. Zabójcza końcówka Interu Mediol... \n",
"\n",
" sentence_split \n",
"0 [atp, sztokholm, juergen, zopp, wykorzystał, s... \n",
"1 [krowicki, z, reprezentacją, kobiet, aż, do, i... \n",
"2 [wielki, powrót, łukasza, kubota, odradza, się... \n",
"3 [marcel, hirscher, wygrał, ostatni, slalom, gi... \n",
"4 [polki, do, czarnogóry, z, pełnią, zaangażowan... \n",
"... ... \n",
"5443 [biało, czerwona, siła, w, falun, oni, będą, r... \n",
"5444 [finał, wta, tokio, na, żywo, woźniacka, osaka... \n",
"5445 [oni, zapisali, się, w, annałach, hubert, hurk... \n",
"5446 [poprawia, się, stan, nikiego, laudy, austriak... \n",
"5447 [liga, mistrzów, zabójcza, końcówka, interu, m... \n",
"\n",
"[5448 rows x 2 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Read the test input and drop its second column, leaving only the text\n",
"X_test_pd = pd.read_csv(\"test-A/in.tsv\", sep=\"\\t\", header=None)\n",
"X_test_pd = X_test_pd.drop(columns=X_test_pd.columns[1])\n",
"display(X_test_pd)\n",
"\n",
"X_test_pd.columns = [\"sentence\"]\n",
"X_test_pd[\"sentence_split\"] = X_test_pd[\"sentence\"].apply(preprocess)\n",
"\n",
"# Encode with the tokenizer fitted on the training sentences, then pad\n",
"X_test = pad_sequences(token.texts_to_sequences(X_test_pd[\"sentence_split\"]))\n",
"display(X_test_pd)"
]
},
{
"cell_type": "markdown",
"id": "62b38223-cda6-443a-a05f-a1661233e94c",
"metadata": {},
"source": [
"## Dev data preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "6a5fbb7d-2a8e-4633-8f4c-bec6acf624b2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Mundial 2018. Były reprezentant Anglii trenere...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Liga Mistrzyń: Podopieczne Kima Rasmussena bli...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Wyczerpujące treningi biegowe Justyny Kowalczy...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Mundial 2018. Zagraniczne media zareagowały na...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>BCL. Artur Gronek: Musimy grać twardziej. Pope...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5447</th>\n",
" <td>Michał Probierz szuka powodów do optymizmu. \"C...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5448</th>\n",
" <td>ME 2017 w siatkówce. Znakomita frekwencja. Kib...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5449</th>\n",
" <td>Zobacz oficjalny trailer KSW 42 (wideo) Organi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5450</th>\n",
" <td>Rummenigge nie wyklucza, że ktoś odejdzie z Ba...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5451</th>\n",
" <td>Sympatyczny gest argentyńskich tenisistów. Obd...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5452 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 Mundial 2018. Były reprezentant Anglii trenere...\n",
"1 Liga Mistrzyń: Podopieczne Kima Rasmussena bli...\n",
"2 Wyczerpujące treningi biegowe Justyny Kowalczy...\n",
"3 Mundial 2018. Zagraniczne media zareagowały na...\n",
"4 BCL. Artur Gronek: Musimy grać twardziej. Pope...\n",
"... ...\n",
"5447 Michał Probierz szuka powodów do optymizmu. \"C...\n",
"5448 ME 2017 w siatkówce. Znakomita frekwencja. Kib...\n",
"5449 Zobacz oficjalny trailer KSW 42 (wideo) Organi...\n",
"5450 Rummenigge nie wyklucza, że ktoś odejdzie z Ba...\n",
"5451 Sympatyczny gest argentyńskich tenisistów. Obd...\n",
"\n",
"[5452 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"array([[ 0, 0, 0, ..., 149, 3657, 28408],\n",
" [ 0, 0, 0, ..., 2378, 59831, 31454],\n",
" [ 0, 0, 0, ..., 311, 991, 15435],\n",
" ...,\n",
" [ 0, 0, 0, ..., 2, 2999, 11543],\n",
" [ 0, 0, 0, ..., 4, 1077, 38402],\n",
" [ 0, 0, 0, ..., 1001, 39, 18089]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"array([[1],\n",
" [1],\n",
" [0],\n",
" ...,\n",
" [0],\n",
" [1],\n",
" [1]], dtype=int64)"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Dev inputs: same preprocessing and tokenizer as the training sentences\n",
"dev_in = pd.read_csv(\"dev-0/in.tsv\", sep=\"\\t\", header=None, on_bad_lines='skip')\n",
"display(dev_in)\n",
"dev_in.columns = [\"sentence\"]\n",
"dev_in[\"sentence_split\"] = dev_in[\"sentence\"].apply(preprocess)\n",
"dev_text = pad_sequences(token.texts_to_sequences(dev_in[\"sentence_split\"]))\n",
"\n",
"# Gold labels for the dev set, as a NumPy array\n",
"dev_expected = pd.read_csv(\"dev-0/expected.tsv\", sep=\"\\t\", header=None, on_bad_lines='skip').to_numpy()\n",
"display(dev_text, dev_expected)"
]
},
{
"cell_type": "markdown",
"id": "d6dac3f7-d148-4629-bece-27d91fb7169b",
"metadata": {},
"source": [
"# Training the model"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "11c1dc1d-18af-44e8-b026-7d5f637a2081",
"metadata": {},
"outputs": [],
"source": [
"# Adam optimizer with an explicit learning rate for the CNN classifier below\n",
"opt = Adam(learning_rate=1e-4)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "9bb42d4d-c33f-4f74-b111-c4366c3026d4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m31s\u001b[0m 18ms/step - acc: 0.8497 - loss: 0.2841\n",
"Epoch 2/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 18ms/step - acc: 0.9729 - loss: 0.0774\n",
"Epoch 3/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 18ms/step - acc: 0.9763 - loss: 0.0695\n",
"Epoch 4/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 18ms/step - acc: 0.9790 - loss: 0.0608\n",
"Epoch 5/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 20ms/step - acc: 0.9796 - loss: 0.0586\n",
"Epoch 6/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9823 - loss: 0.0494\n",
"Epoch 7/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9831 - loss: 0.0490\n",
"Epoch 8/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9832 - loss: 0.0477\n",
"Epoch 9/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9842 - loss: 0.0437\n",
"Epoch 10/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9848 - loss: 0.0444\n",
"Epoch 11/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9855 - loss: 0.0418\n",
"Epoch 12/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m32s\u001b[0m 21ms/step - acc: 0.9862 - loss: 0.0420\n",
"Epoch 13/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m36s\u001b[0m 24ms/step - acc: 0.9860 - loss: 0.0400\n",
"Epoch 14/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m32s\u001b[0m 21ms/step - acc: 0.9864 - loss: 0.0392\n",
"Epoch 15/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9865 - loss: 0.0387\n",
"Epoch 16/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9871 - loss: 0.0369\n",
"Epoch 17/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9875 - loss: 0.0349\n",
"Epoch 18/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9877 - loss: 0.0357\n",
"Epoch 19/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9888 - loss: 0.0312\n",
"Epoch 20/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m28s\u001b[0m 19ms/step - acc: 0.9879 - loss: 0.0347\n",
"Epoch 21/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 20ms/step - acc: 0.9879 - loss: 0.0326\n",
"Epoch 22/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 20ms/step - acc: 0.9881 - loss: 0.0329\n",
"Epoch 23/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m30s\u001b[0m 19ms/step - acc: 0.9874 - loss: 0.0337\n",
"Epoch 24/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m29s\u001b[0m 19ms/step - acc: 0.9889 - loss: 0.0307\n",
"Epoch 25/25\n",
"\u001b[1m1534/1534\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m37s\u001b[0m 24ms/step - acc: 0.9893 - loss: 0.0290\n"
]
}
],
"source": [
"# Frozen Word2Vec embeddings feed three pairs of 1-D convolutions (50, 100 and\n",
"# 200 filters), followed by a dense head with a single sigmoid output.\n",
"keras_model = Sequential([\n",
"    Embedding(vocabulary_size, 500, weights=[embedding_matrix], trainable=False),\n",
"    Dropout(0.4),\n",
"    Conv1D(50, 3, activation='relu', padding='same', strides=1),\n",
"    Conv1D(50, 3, activation='relu', padding='same', strides=1),\n",
"    MaxPooling1D(),\n",
"    Dropout(0.4),\n",
"    Conv1D(100, 3, activation='relu', padding='same', strides=1),\n",
"    Conv1D(100, 3, activation='relu', padding='same', strides=1),\n",
"    MaxPooling1D(),\n",
"    Dropout(0.2),\n",
"    Conv1D(200, 3, activation='relu', padding='same', strides=1),\n",
"    Conv1D(200, 3, activation='relu', padding='same', strides=1),\n",
"    GlobalMaxPooling1D(),\n",
"    Dropout(0.4),\n",
"    Dense(200),\n",
"    Activation('relu'),\n",
"    Dropout(0.4),\n",
"    Dense(1),\n",
"    Activation('sigmoid'),\n",
"])\n",
"keras_model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['acc'])\n",
"keras_model.fit(X_train, y_train, batch_size=64, epochs=25)\n",
"\n",
"model = keras_model"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "586e83f2-ff5a-498d-a06a-42476df2d972",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step\n"
]
}
],
"source": [
"# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels\n",
"test_result = model.predict(X_test)\n",
"test_predictions = np.where(test_result>=0.50, 1, 0)\n",
"# header=False: write one prediction per input line. Without it pandas emits a\n",
"# header row (`0`) first, shifting every prediction down by one line.\n",
"pd.DataFrame(test_predictions).to_csv('test-A/out.tsv', sep=\"\\t\", index=False, header=False, encoding='utf-8')"
]
},
{
"cell_type": "markdown",
"id": "f4727d5e-e15d-425d-abd3-7f01427e8385",
"metadata": {},
"source": [
"# Evaluation\n",
"I had problems installing GEval on Windows, so the dev-set accuracy is computed directly with scikit-learn's `accuracy_score` instead."
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "6c646808-088d-41f0-8f9c-1d143aec769f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[1m171/171\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m1s\u001b[0m 4ms/step\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5447</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5448</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5449</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5450</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5451</th>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5452 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 1\n",
"1 1\n",
"2 0\n",
"3 1\n",
"4 1\n",
"... ..\n",
"5447 1\n",
"5448 1\n",
"5449 0\n",
"5450 1\n",
"5451 1\n",
"\n",
"[5452 rows x 1 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Score the dev set and threshold the sigmoid outputs at 0.5 to get 0/1 labels\n",
"dev_result = model.predict(dev_text)\n",
"dev_predictions = np.where(\n",
"    dev_result >= 0.50,\n",
"    1,\n",
"    0,\n",
")\n",
"predictions_df = pd.DataFrame(data=dev_predictions)\n",
"display(predictions_df)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "b5c1c0a1-8dae-4f6d-a0a6-badee37d32a3",
"metadata": {},
"outputs": [],
"source": [
"# Write the dev predictions without index or header row\n",
"predictions_df.to_csv('dev-0/out.tsv', sep=\"\\t\", index=False, header=None)\n",
"\n",
"# Re-read the dev inputs and write a copy to `in.tsv` in the working directory\n",
"dev_data = pd.read_csv(\"dev-0/in.tsv\", sep=\"\\t\", header=None, on_bad_lines='skip')\n",
"dev_data.to_csv('in.tsv', sep=\"\\t\", index=False, header=None, encoding='utf-8')"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ed802c5c-f63f-4618-b30d-5d8c0fe5e16d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.9888114453411592\n"
]
}
],
"source": [
"# Fraction of dev examples whose predicted label equals the expected label\n",
"score = accuracy_score(\n",
"    y_true=dev_expected,\n",
"    y_pred=dev_predictions,\n",
")\n",
"print(score)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.14"
}
},
"nbformat": 4,
"nbformat_minor": 5
}