{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np\n", "from gensim.models import KeyedVectors\n", "# https://github.com/sdadas/polish-nlp-resources?tab=readme-ov-file#fasttext\n", "fasttext_model = KeyedVectors.load(\"fasttext_100_3_polish.bin\")" ] }, { "cell_type": "code", "execution_count": 6, "outputs": [ { "data": { "text/plain": "array([-0.10575686, 0.00275842, -0.15149923, 0.04684225, -0.09484185,\n 0.27445596, -0.13551135, -0.08899829, -0.16027121, 0.0851451 ,\n 0.18053234, -0.0375574 , 0.08118784, -0.03693336, 0.13899295,\n 0.0148702 , -0.03542297, 0.07549705, -0.06520785, -0.09142417,\n -0.00903711, 0.15116395, -0.100382 , 0.0491643 , 0.00223149,\n -0.06079744, 0.0959003 , -0.12254302, -0.06442568, -0.01175186,\n 0.14357556, 0.082293 , -0.00633075, 0.12356292, -0.12873764,\n 0.03582585, 0.00486956, 0.02265417, 0.09742602, 0.00961361,\n -0.07241934, 0.05235291, -0.15645239, 0.05410094, -0.03922489,\n -0.19014828, 0.05091096, -0.16526255, -0.04351336, 0.02157344,\n -0.15707618, -0.01369421, 0.05524002, -0.12716308, 0.10982089,\n 0.11500968, 0.00536837, 0.16475938, -0.13811931, -0.02000868,\n 0.06066024, -0.03149116, -0.12379967, -0.21108894, 0.07293601,\n -0.14373247, -0.10874739, -0.03041346, 0.24131383, 0.06944644,\n -0.00836486, -0.11847664, -0.04725966, 0.00336932, 0.04964857,\n 0.01957623, -0.02785001, -0.0883517 , -0.12014113, -0.02970322,\n -0.00858476, -0.0711842 , 0.04591263, -0.05298669, -0.0397255 ,\n 0.06707988, -0.01675842, 0.08076061, -0.01310711, 0.01628348,\n 0.03469754, -0.04314699, -0.00516709, 0.2871206 , 0.05852846,\n -0.18093199, -0.00342047, -0.147456 , -0.04751889, -0.02945601],\n dtype=float32)" }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fasttext_model.wv.get_vector('office', norm=True)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 8, "outputs": [ { "data": { "text/plain": "0.72575206" }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fasttext_model.wv.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 60, "outputs": [], "source": [ "import string\n", "import numpy as np\n", "def get_document_vector(document_string):\n", " processed = document_string.translate(str.maketrans('', '', string.punctuation)).split()\n", " try:\n", " processed = np.array(fasttext_model.wv.get_sentence_vector(processed))\n", " return processed\n", " except ValueError:\n", " return np.NAN" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 61, "outputs": [ { "data": { "text/plain": "array([ 0.01578879, -0.0966718 , -0.12096456, 0.03103824, 0.11989886,\n 0.08134278, -0.02491791, -0.04889391, -0.05900102, 0.05264781,\n 0.07366802, 0.05264994, 0.03538202, 0.03959122, 0.08029908,\n -0.05133899, -0.00391489, 0.05500277, 0.02347905, 0.08629225,\n -0.08096454, 0.033292 , -0.07492353, 0.03558746, -0.03898185,\n 0.01894082, -0.00977144, -0.02125431, -0.09896845, -0.07426734,\n 0.07132851, 0.05021148, 0.06596912, -0.02060991, -0.06927098,\n -0.0590184 , 0.03158417, -0.00033762, 0.18291356, 0.02761706,\n -0.0305428 , -0.07682855, -0.0167096 , 0.02518708, -0.01596445,\n -0.0379869 , 0.02503271, -0.10872342, -0.0715234 , -0.10176589,\n -0.03008098, -0.1061382 , 0.04008991, -0.01109458, 0.01513245,\n 0.00942784, 0.00155242, 0.05995774, -0.11261091, -0.06303023,\n 0.02372515, 0.00859607, -0.02200282, 0.02862521, -0.091718 ,\n 0.01269631, -0.02191854, -0.09026017, 0.03745283, -0.00393062,\n -0.02468689, -0.08132526, -0.0274496 , -0.09630067, 0.07670791,\n 0.01474745, -0.05055737, -0.00122033, -0.07364829, 0.01220732,\n -0.09696812, -0.13338262, 0.06731747, -0.03619792, 0.03923816,\n -0.03797578, 0.0150913 , -0.04379996, -0.01847179, 0.06803966,\n 0.0418974 , -0.0373757 , 0.0374969 , 0.08460734, 0.02028288,\n -0.0726779 , 0.04701586, 0.02269063, 0.09565686, 0.02680007],\n dtype=float32)" }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_document_vector(\"This, is - a test.\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 62, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\Adrian\\AppData\\Local\\Temp\\ipykernel_1428\\2569974358.py:3: ParserWarning: Skipping line 25706: expected 2 fields, saw 3\n", "Skipping line 58881: expected 2 fields, saw 3\n", "Skipping line 73761: expected 2 fields, saw 3\n", "\n", " training_file = pd.read_csv(\"train/train.tsv\", sep='\\t', on_bad_lines=\"warn\", names=[\"class\",\"text_data\"])\n" ] }, { "data": { "text/plain": " class text_data\n0 1 Mindaugas Budzinauskas wierzy w odbudowę formy...\n1 1 Przyjmujący reprezentacji Polski wrócił do PGE...\n2 0 FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...\n3 1 Aleksander Filipiak: Czuję się dobrze w nowym ...\n4 0 Victoria Carl i Aleksiej Czerwotkin mistrzami ...", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
classtext_data
01Mindaugas Budzinauskas wierzy w odbudowę formy...
11Przyjmujący reprezentacji Polski wrócił do PGE...
20FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...
31Aleksander Filipiak: Czuję się dobrze w nowym ...
40Victoria Carl i Aleksiej Czerwotkin mistrzami ...
\n
" }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "training_file = pd.read_csv(\"train/train.tsv\", sep='\\t', on_bad_lines=\"warn\", names=[\"class\",\"text_data\"])\n", "training_file.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 63, "outputs": [ { "data": { "text/plain": "0 Mindaugas Budzinauskas wierzy w odbudowę formy...\n1 Przyjmujący reprezentacji Polski wrócił do PGE...\n2 FEN 9 Zapowiedź walki Róża Gumienna vs Katarzy...\n3 Aleksander Filipiak Czuję się dobrze w nowym k...\n4 Victoria Carl i Aleksiej Czerwotkin mistrzami ...\n ... \n98124 Kamil Syprzak zaczyna kolekcjonować trofea FC ...\n98125 Holandia dwa gole Piotra Parzyszka Piotr Parzy...\n98126 Sparingowo Korona gorsza od Stali Lettieri spr...\n98127 Vive Wisła Ośmiu debiutantów w tegorocznej św...\n98128 WTA Miami Timea Bacsinszky pokonana Swietłana ...\nName: text_data, Length: 98129, dtype: object" }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_file[\"text_data\"].apply(lambda row: row.translate(str.maketrans('', '', string.punctuation)))" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 64, "outputs": [], "source": [ "training_file.dropna(inplace=True)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 65, "outputs": [ { "data": { "text/plain": "98129" }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(training_file.index)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 66, "outputs": [], "source": [ "training_file[\"vectorized\"] = training_file[\"text_data\"].apply(get_document_vector)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 67, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "98128\n" ] } ], "source": [ "training_file.dropna(inplace=True)\n", "print(len(training_file.index))" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 68, "outputs": [ { "data": { "text/plain": " class text_data \\\n0 1 Mindaugas Budzinauskas wierzy w odbudowę formy... \n1 1 Przyjmujący reprezentacji Polski wrócił do PGE... \n2 0 FEN 9: Zapowiedź walki Róża Gumienna vs Katarz... \n3 1 Aleksander Filipiak: Czuję się dobrze w nowym ... \n4 0 Victoria Carl i Aleksiej Czerwotkin mistrzami ... \n\n vectorized \n0 [-0.010491192, -0.058443062, -0.1072605, 0.068... \n1 [0.019159772, -0.03807462, -0.093816765, 0.080... \n2 [0.019561907, -0.09903135, -0.08141139, 0.0962... \n3 [0.0019692876, -0.040995505, -0.112910554, 0.0... \n4 [0.026810315, -0.07052034, -0.12447791, 0.0609... ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
classtext_datavectorized
01Mindaugas Budzinauskas wierzy w odbudowę formy...[-0.010491192, -0.058443062, -0.1072605, 0.068...
11Przyjmujący reprezentacji Polski wrócił do PGE...[0.019159772, -0.03807462, -0.093816765, 0.080...
20FEN 9: Zapowiedź walki Róża Gumienna vs Katarz...[0.019561907, -0.09903135, -0.08141139, 0.0962...
31Aleksander Filipiak: Czuję się dobrze w nowym ...[0.0019692876, -0.040995505, -0.112910554, 0.0...
40Victoria Carl i Aleksiej Czerwotkin mistrzami ...[0.026810315, -0.07052034, -0.12447791, 0.0609...
\n
" }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_file.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 69, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 98128 entries, 0 to 98128\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 class 98128 non-null int64 \n", " 1 text_data 98128 non-null object\n", " 2 vectorized 98128 non-null object\n", "dtypes: int64(1), object(2)\n", "memory usage: 3.0+ MB\n" ] } ], "source": [ "training_file.info()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 70, "outputs": [ { "data": { "text/plain": "array([-0.01049119, -0.05844306, -0.1072605 , 0.0680153 , 0.01738467,\n 0.02759303, 0.02690293, -0.02688588, -0.00051565, 0.04960843,\n 0.0267325 , -0.07590238, -0.00112739, -0.02663443, -0.01215785,\n -0.02335822, 0.00596362, 0.03255358, 0.03372947, -0.0320864 ,\n 0.06242761, 0.05441704, -0.10440411, 0.02391675, -0.02517564,\n -0.00581436, 0.01041707, -0.02866426, -0.07569201, -0.05691882,\n 0.01377875, 0.05586738, 0.02601947, -0.01073826, -0.07011177,\n 0.05394488, 0.00468541, -0.0290179 , 0.12888645, 0.05720428,\n -0.04035591, -0.05646745, -0.00185273, 0.01846331, 0.02260421,\n -0.05327827, -0.0299728 , -0.01459699, -0.01037856, -0.05196216,\n -0.02092045, -0.00421424, -0.0101665 , -0.01815657, 0.03365456,\n 0.01784232, -0.01427742, -0.05149295, -0.01840808, 0.08991119,\n 0.03609616, -0.03085677, -0.02868558, 0.0879923 , -0.08462378,\n -0.02428374, -0.06649223, 0.05328292, 0.09114845, -0.0074865 ,\n 0.07581685, 0.02017863, 0.01063073, 0.02651897, 0.00125264,\n -0.04205399, -0.15118514, -0.01358473, -0.04589266, 0.00465928,\n -0.01037135, -0.0240653 , 0.01271867, -0.00046581, -0.0062453 ,\n -0.01982017, -0.00213563, 0.0068075 , -0.01338028, -0.01335924,\n 0.11551541, 0.01461171, -0.0956174 , 0.09537749, 0.02394151,\n -0.1085504 , 0.0310267 , 0.02344807, 0.01435937, 0.03094357],\n dtype=float32)" }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "training_file[\"vectorized\"].iloc[0]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 71, "outputs": [], "source": [ "import tensorflow as tf\n", "from tensorflow import keras\n", "from keras import layers\n", "from keras.optimizers import Adam\n", "\n", "def create_model():\n", " inputs = keras.Input(shape=(100,))\n", " dense = layers.Dense(64, activation=\"relu\")(inputs)\n", " output = layers.Dense(1, activation=\"sigmoid\")(dense)\n", " model = keras.Model(inputs=inputs, outputs=output)\n", " model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])\n", " return model" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 113, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "training_file = training_file.sample(frac=1).reset_index(drop=True)\n", "train, valid = train_test_split(training_file, test_size=0.2)\n", "train_x = np.stack(train[\"vectorized\"].values)\n", "train_y = np.stack(train[\"class\"].values)\n", "valid_x = np.stack(valid[\"vectorized\"].values)\n", "valid_y = np.stack(valid[\"class\"].values)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 114, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11925 [0.00907336, -0.035000063, -0.046387862, 0.107...\n", "29681 [0.029005446, -0.062998086, -0.10763814, 0.059...\n", "39659 [0.0073042903, -0.028519068, -0.07020145, 0.08...\n", "26203 [-0.012138679, -0.036977977, -0.05332508, 0.05...\n", "93611 [0.034259614, -0.06937863, -0.09370455, 0.0414...\n", " ... \n", "90557 [0.015421399, -0.051549092, -0.118074715, 0.07...\n", "10805 [-0.017966524, -0.07279962, -0.10843535, 0.071...\n", "17336 [0.038043424, -0.024239093, -0.11319029, 0.066...\n", "39497 [0.03166563, -0.061132513, -0.09316901, 0.1028...\n", "87005 [0.02179843, -0.042094912, -0.078197055, 0.084...\n", "Name: vectorized, Length: 78502, dtype: object\n" ] } ], "source": [ "print(train[\"vectorized\"])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 115, "outputs": [ { "data": { "text/plain": " class text_data \\\n11925 1 Tam było czuć historię. Leo Beenhakker zaurocz... \n29681 1 Filip Dylewicz: Po raz pierwszy od 20 lat prow... \n39659 1 Czytaj w \"PN\". Finaliści MŚ. Piękno i rygor Hi... \n26203 1 Novak Djoković podał do sądu władze miasta Rio... \n93611 1 Wimbledon: Jelena Ostapenko nie stawiła oporu ... \n\n vectorized \n11925 [0.00907336, -0.035000063, -0.046387862, 0.107... \n29681 [0.029005446, -0.062998086, -0.10763814, 0.059... \n39659 [0.0073042903, -0.028519068, -0.07020145, 0.08... \n26203 [-0.012138679, -0.036977977, -0.05332508, 0.05... \n93611 [0.034259614, -0.06937863, -0.09370455, 0.0414... ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
classtext_datavectorized
119251Tam było czuć historię. Leo Beenhakker zaurocz...[0.00907336, -0.035000063, -0.046387862, 0.107...
296811Filip Dylewicz: Po raz pierwszy od 20 lat prow...[0.029005446, -0.062998086, -0.10763814, 0.059...
396591Czytaj w \"PN\". Finaliści MŚ. Piękno i rygor Hi...[0.0073042903, -0.028519068, -0.07020145, 0.08...
262031Novak Djoković podał do sądu władze miasta Rio...[-0.012138679, -0.036977977, -0.05332508, 0.05...
936111Wimbledon: Jelena Ostapenko nie stawiła oporu ...[0.034259614, -0.06937863, -0.09370455, 0.0414...
\n
" }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 116, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "78502\n" ] } ], "source": [ "print(len(train.index))" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 117, "outputs": [ { "data": { "text/plain": " class text_data \\\n87636 1 Szok w Hali Mistrzów, Energa Czarni w półfinal... \n96088 1 Postępy Kuby Błaszczykowskiego Jakub Błaszczyk... \n54386 1 Pobici piłkarze odchodzą ze Sportingu Lizbona.... \n29418 1 El. LE: polskie kluby znają potencjalnych rywa... \n80561 1 Było ofensywnie i efektownie. Polpharma Starog... \n\n vectorized \n87636 [0.0040451484, -0.034110088, -0.1111216, 0.050... \n96088 [0.01574161, -0.055649985, -0.077657014, 0.085... \n54386 [-0.013020566, -0.076468304, -0.127176, 0.0720... \n29418 [-0.005546203, -0.033757057, -0.10181239, 0.07... \n80561 [0.021034276, -0.06635279, -0.091047965, 0.054... ", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
classtext_datavectorized
876361Szok w Hali Mistrzów, Energa Czarni w półfinal...[0.0040451484, -0.034110088, -0.1111216, 0.050...
960881Postępy Kuby Błaszczykowskiego Jakub Błaszczyk...[0.01574161, -0.055649985, -0.077657014, 0.085...
543861Pobici piłkarze odchodzą ze Sportingu Lizbona....[-0.013020566, -0.076468304, -0.127176, 0.0720...
294181El. LE: polskie kluby znają potencjalnych rywa...[-0.005546203, -0.033757057, -0.10181239, 0.07...
805611Było ofensywnie i efektownie. Polpharma Starog...[0.021034276, -0.06635279, -0.091047965, 0.054...
\n
" }, "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], "source": [ "valid.head()" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 118, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "19626\n" ] } ], "source": [ "print(len(valid.index))" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 119, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/20\n", "2454/2454 [==============================] - 3s 1ms/step - loss: 0.3017 - accuracy: 0.8763 - val_loss: 0.2039 - val_accuracy: 0.9214\n", "Epoch 2/20\n", "2454/2454 [==============================] - 2s 946us/step - loss: 0.1920 - accuracy: 0.9256 - val_loss: 0.1896 - val_accuracy: 0.9268\n", "Epoch 3/20\n", "2454/2454 [==============================] - 2s 989us/step - loss: 0.1837 - accuracy: 0.9285 - val_loss: 0.1848 - val_accuracy: 0.9287\n", "Epoch 4/20\n", "2454/2454 [==============================] - 2s 954us/step - loss: 0.1795 - accuracy: 0.9298 - val_loss: 0.1820 - val_accuracy: 0.9301\n", "Epoch 5/20\n", "2454/2454 [==============================] - 3s 1ms/step - loss: 0.1768 - accuracy: 0.9308 - val_loss: 0.1804 - val_accuracy: 0.9304\n", "Epoch 6/20\n", "2454/2454 [==============================] - 3s 1ms/step - loss: 0.1733 - accuracy: 0.9320 - val_loss: 0.1756 - val_accuracy: 0.9324\n", "Epoch 7/20\n", "2454/2454 [==============================] - 2s 982us/step - loss: 0.1692 - accuracy: 0.9336 - val_loss: 0.1721 - val_accuracy: 0.9331\n", "Epoch 8/20\n", "2454/2454 [==============================] - 3s 1ms/step - loss: 0.1636 - accuracy: 0.9359 - val_loss: 0.1699 - val_accuracy: 0.9349\n", "Epoch 9/20\n", "2454/2454 [==============================] - 2s 952us/step - loss: 0.1578 - accuracy: 0.9379 - val_loss: 0.1671 - val_accuracy: 0.9358\n", "Epoch 10/20\n", "2454/2454 [==============================] - 3s 1ms/step - loss: 0.1519 - accuracy: 0.9413 - val_loss: 0.1549 - val_accuracy: 0.9400\n", "Epoch 11/20\n", "2454/2454 [==============================] - 2s 974us/step - loss: 0.1462 - accuracy: 0.9432 - val_loss: 0.1509 - val_accuracy: 0.9420\n", "Epoch 12/20\n", "2454/2454 [==============================] - 2s 967us/step - loss: 0.1413 - accuracy: 0.9450 - val_loss: 0.1460 - val_accuracy: 0.9437\n", "Epoch 13/20\n", "2454/2454 [==============================] - 2s 971us/step - loss: 0.1367 - accuracy: 0.9459 - val_loss: 0.1396 - val_accuracy: 0.9472\n", "Epoch 14/20\n", "2454/2454 [==============================] - 3s 1ms/step - loss: 0.1322 - accuracy: 0.9492 - val_loss: 0.1380 - val_accuracy: 0.9482\n", "Epoch 15/20\n", "2454/2454 [==============================] - 2s 986us/step - loss: 0.1293 - accuracy: 0.9501 - val_loss: 0.1343 - val_accuracy: 0.9495\n", "Epoch 16/20\n", "2454/2454 [==============================] - 2s 975us/step - loss: 0.1262 - accuracy: 0.9516 - val_loss: 0.1320 - val_accuracy: 0.9494\n", "Epoch 17/20\n", "2454/2454 [==============================] - 2s 975us/step - loss: 0.1236 - accuracy: 0.9523 - val_loss: 0.1289 - val_accuracy: 0.9510\n", "Epoch 18/20\n", "2454/2454 [==============================] - 2s 990us/step - loss: 0.1210 - accuracy: 0.9534 - val_loss: 0.1272 - val_accuracy: 0.9514\n", "Epoch 19/20\n", "2454/2454 [==============================] - 3s 1ms/step - loss: 0.1193 - accuracy: 0.9539 - val_loss: 0.1246 - val_accuracy: 0.9530\n", "Epoch 20/20\n", "2454/2454 [==============================] - 3s 1ms/step - loss: 0.1174 - accuracy: 0.9544 - val_loss: 0.1240 - val_accuracy: 0.9518\n" ] } ], "source": [ "callback = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=3, restore_best_weights=True)\n", "model = create_model()\n", "history = model.fit(train_x, train_y, validation_data=(valid_x, valid_y), epochs=20, callbacks=[callback])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 120, "outputs": [ { "data": { "text/plain": "" }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": "
", "image/png": "\n" }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from matplotlib import pyplot as plt\n", "plt.plot(history.history['accuracy'])\n", "plt.plot(history.history['val_accuracy'])\n", "plt.title('model accuracy')\n", "plt.ylabel('accuracy')\n", "plt.xlabel('epoch')\n", "plt.legend(['train', 'test'], loc='upper left')" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 121, "outputs": [ { "data": { "text/plain": "" }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" }, { "data": { "text/plain": "
", "image/png": "\n" }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from matplotlib import pyplot as plt\n", "plt.plot(history.history['loss'])\n", "plt.plot(history.history['val_loss'])\n", "plt.title('model loss')\n", "plt.ylabel('accuracy')\n", "plt.xlabel('epoch')\n", "plt.legend(['train', 'test'], loc='upper right')" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 138, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "171/171 [==============================] - 0s 591us/step\n" ] } ], "source": [ "def process_input(directory):\n", " with open(directory+\"/in.tsv\", encoding=\"utf-8\") as data_file:\n", " df = pd.DataFrame([], columns=['text_data'])\n", " for line in data_file:\n", " df = df._append({'text_data': line}, ignore_index=True)\n", " df[\"text_data\"].apply(lambda row: row.translate(str.maketrans('', '', string.punctuation)))\n", " df[\"vectorized\"] = df[\"text_data\"].apply(get_document_vector)\n", " data_x = np.stack(df[\"vectorized\"].values)\n", " predictions = model.predict(data_x)\n", " return predictions\n", "\n", "predictions = process_input(\"dev-0\")[:,0]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 139, "outputs": [ { "data": { "text/plain": "array([9.9900788e-01, 9.9998349e-01, 2.1855670e-03, ..., 2.0675772e-04,\n 9.9930727e-01, 9.8721308e-01], dtype=float32)" }, "execution_count": 139, "metadata": {}, "output_type": "execute_result" } ], "source": [ "predictions" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 142, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[1. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1.\n", " 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0.\n", " 1. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 0.\n", " 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 0.\n", " 1. 0. 0. 1.]\n" ] } ], "source": [ "print(np.rint(predictions)[:100])" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 144, "outputs": [], "source": [ "predictions_rounded = np.rint(predictions)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 148, "outputs": [], "source": [ "np.savetxt(\"dev-0/out.tsv\",predictions_rounded, fmt='%i')" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 149, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "171/171 [==============================] - 0s 619us/step\n" ] } ], "source": [ "predictions = process_input(\"test-A\")[:,0]" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 150, "outputs": [], "source": [ "predictions_rounded = np.rint(predictions)" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 151, "outputs": [], "source": [ "np.savetxt(\"test-A/out.tsv\",predictions_rounded, fmt='%i')" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 152, "outputs": [], "source": [ "model.save(\"model.keras\")" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [], "metadata": { "collapsed": false } } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }