# %% Imports and configuration
# Read in the data and clean up column names
import csv

import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.utils import pad_sequences
# NOTE(review): `keras.src` is a private namespace. It is kept for Tokenizer because no
# public import path for it is visible in this notebook -- confirm against the installed Keras.
from keras.src.legacy.preprocessing.text import Tokenizer

pd.set_option('display.max_colwidth', 100)

SEED = 42            # makes the train/validation split (and the Word2Vec init) repeatable
EMBEDDING_DIM = 500  # size of the Word2Vec vectors
NUM_WORDS = 7229     # Tokenizer cap: only word indices below this value are emitted
MAX_LEN = 75         # every document is padded / truncated to this many tokens

# %% Load the training set
# header=None: the column names are assigned by hand in the next cell, which suggests the
# file has no header row; with the default header inference the first example would be
# consumed as column names. NOTE(review): confirm train.tsv really is header-less.
# quoting=QUOTE_NONE: a stray double quote inside an article must not merge several
# physical lines into one field. Lines with an unexpected number of tab-separated fields
# are still skipped, as before.
data = pd.read_csv(
    "train.tsv",
    sep="\t",
    header=None,
    quoting=csv.QUOTE_NONE,
    on_bad_lines='skip',
)

# %% Name the columns
data.columns = ["label", "text"]
# Fail early if a header line (or anything else that is not a 0/1 label) slipped in.
assert data["label"].isin([0, 1]).all(), "unexpected values in the label column"
data.head()

# %% Tokenise every document into lower-cased word tokens
data['text_clean'] = data['text'].apply(gensim.utils.simple_preprocess)
data.head()

# %% Train Word2Vec on the tokenised corpus
# NOTE(review): the embeddings are fitted on all rows, including those that end up in
# X_test below -- confirm this is acceptable for the evaluation.
# The seed alone does not make training bit-for-bit repeatable while workers > 1.
w2v_model = gensim.models.Word2Vec(data["text_clean"],
                                   vector_size=EMBEDDING_DIM,
                                   window=5,
                                   min_count=2,
                                   workers=4,
                                   seed=SEED)

# %% Map tokens to integer ids and pad to a fixed length
token = Tokenizer(num_words=NUM_WORDS)
token.fit_on_texts(data['text_clean'])
text = token.texts_to_sequences(data['text_clean'])
text = pad_sequences(text, maxlen=MAX_LEN)

# %% Hold out 20% of the rows
# pad_sequences already returns a NumPy array, so no extra np.array() wrapper is needed.
X_train, X_test, y_train, y_test = train_test_split(
    text, data['label'], test_size=0.2, random_state=SEED
)
X_train
# %% Size of the embedding table
# `token.word_index` lists every word seen during fitting, but a Tokenizer created with
# `num_words` only ever emits indices below that cap. Rows past the cap would never be
# looked up, so the table is limited to the indices that can actually occur.
vocab_size = len(token.word_index) + 1  # +1: index 0 is the padding value
if token.num_words:
    vocab_size = min(vocab_size, token.num_words)

# %% Fill the table with the Word2Vec vectors
# The width is read from the trained model so it cannot drift from `vector_size`.
embedding_matrix = np.zeros((vocab_size, w2v_model.wv.vector_size))
for word, i in token.word_index.items():
    # Words dropped by Word2Vec (min_count) keep an all-zero row.
    if i < vocab_size and word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# %% Optimizer
import keras

opt = keras.optimizers.Adam(learning_rate=0.001)
# %% Build, compile and train the 1-D CNN classifier
# Layers come from the public `keras.layers` namespace rather than the private `keras.src`.
from keras.layers import Dropout, Dense, Activation, Embedding, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import Conv1D
from keras import Sequential

# The Embedding layer is initialised from `embedding_matrix`, so its dimensions are read
# from the matrix itself instead of being repeated as literals that could fall out of sync.
n_embedding_rows, embedding_dim = embedding_matrix.shape

keras_model = Sequential([
    # Frozen, pre-trained word vectors.
    Embedding(n_embedding_rows, embedding_dim, weights=[embedding_matrix], trainable=False),
    Dropout(0.4),
    # Three stages of paired width-3 convolutions with 50 -> 100 -> 200 filters.
    Conv1D(50, 3, activation='relu', padding='same', strides=1),
    Conv1D(50, 3, activation='relu', padding='same', strides=1),
    MaxPooling1D(),
    Dropout(0.4),
    Conv1D(100, 3, activation='relu', padding='same', strides=1),
    Conv1D(100, 3, activation='relu', padding='same', strides=1),
    MaxPooling1D(),
    Dropout(0.2),
    Conv1D(200, 3, activation='relu', padding='same', strides=1),
    Conv1D(200, 3, activation='relu', padding='same', strides=1),
    GlobalMaxPooling1D(),
    Dropout(0.4),
    # Classification head: one sigmoid unit.
    Dense(200),
    Activation('relu'),
    Dropout(0.4),
    Dense(1),
    Activation('sigmoid'),
])
keras_model.compile(loss='binary_crossentropy', metrics=['acc'], optimizer=opt)
# NOTE(review): X_test / y_test serve as validation data here and are used again further
# down to choose the decision threshold, so the val_* numbers are not an untouched estimate.
keras_model.fit(X_train, y_train, batch_size=64, epochs=10, validation_data=(X_test, y_test))
# %% Alias used by the inference and evaluation cells
model = keras_model

# %% Inference-time preprocessing
def preprocess(path, maxlen=75, header="infer"):
    """Turn a one-column TSV of raw texts into padded integer sequences.

    The texts are tokenised with ``gensim.utils.simple_preprocess``, mapped to ids with
    the notebook-level ``token`` Tokenizer and padded with ``pad_sequences``.

    Parameters
    ----------
    path : str
        Tab-separated file holding exactly one column of text.
    maxlen : int, default 75
        Sequence length passed to ``pad_sequences``. The default is the value that was
        previously hard-coded here.
    header : default "infer"
        Forwarded to ``pandas.read_csv``. With the default, pandas uses the first line of
        the file as the header, so that line produces no sequence. Pass ``header=None``
        for a header-less file. NOTE(review): the default is kept because the cells that
        compare the predictions with ``expected.tsv`` rely on the current row count --
        confirm which behaviour is wanted.

    Returns
    -------
    numpy.ndarray
        The array produced by ``pad_sequences``: one row per parsed line, ``maxlen`` columns.

    Notes
    -----
    Lines pandas cannot parse are skipped (``on_bad_lines='skip'``), so the number of
    returned rows can be lower than the number of lines in the file.
    """
    frame = pd.read_csv(path, sep="\t", header=header, on_bad_lines='skip')
    frame.columns = ["text"]  # raises if the file has more than one column
    tokens = frame["text"].apply(gensim.utils.simple_preprocess)
    sequences = token.texts_to_sequences(tokens)
    return pad_sequences(sequences, maxlen=maxlen)
# %% Encode the test-A texts with the fitted tokenizer
x = preprocess("test-A/in.tsv")

# %% Run the trained network on them
res = model.predict(x)

# %% Turn the model outputs into hard 0/1 labels
# 0.49 is the cut-off applied to the raw outputs.
# NOTE(review): it matches the threshold printed by the plotting cell further down, which
# suggests it was copied by hand -- confirm it is still valid after retraining.
y_predictions = (res >= 0.49).astype(int)

# %% Wrap the labels in a single-column frame
out = pd.DataFrame(data=y_predictions)

# %% Preview
out

# %% Export the predictions
# index=False drops the row index; the header is left at its default, so the file starts
# with a line holding the column label.
# NOTE(review): the dev-0 predictions are written next to their input ('dev-0/out.tsv'),
# while this file goes to the working directory -- confirm the intended location.
out.to_csv(path_or_buf='out.tsv', sep="\t", index=False)
# %% Threshold-selection plot
import pandas as pd
import pathlib
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from sklearn.metrics import confusion_matrix as cm_sklearn
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


def _positive_class_scores(clf, X):
    """Return a 1-D array holding the positive-class score of every sample in X."""
    # Estimators exposing predict_proba (scikit-learn classifiers) return hard labels from
    # predict(), which cannot be thresholded; models without it (e.g. a network with a
    # sigmoid output) are expected to return the scores from predict().
    raw = clf.predict_proba(X) if hasattr(clf, "predict_proba") else clf.predict(X)
    raw = np.asarray(raw)
    # 2-D output: the last column is taken as the positive class -- the only column of an
    # (n, 1) sigmoid output, or the second column of a binary predict_proba.
    return raw[:, -1] if raw.ndim == 2 else raw.ravel()


def _threshold_metrics(y_true, scores, thresholds):
    """Build a DataFrame of precision/recall/f1/fpr/fnr for `scores >= t`, indexed by t."""
    rows = []
    for threshold in thresholds:
        # class prediction at this threshold
        y_pred = np.where(scores >= threshold, 1, 0)

        # labels=[0, 1] keeps the matrix 2x2 even when y_true and y_pred contain a single
        # class, so the four-way unpacking cannot fail.
        tn, fp, fn, tp = cm_sklearn(y_true, y_pred, labels=[0, 1]).ravel()

        rows.append({
            # zero_division=0: the score is 0 without a warning when it is undefined
            # (e.g. no positive predictions at a high threshold).
            'precision': precision_score(y_true, y_pred, average='binary', zero_division=0),
            'recall': recall_score(y_true, y_pred, average='binary', zero_division=0),
            'f1': f1_score(y_true, y_pred, average='binary', zero_division=0),
            # NaN (not a division warning) when y_true has no negatives / no positives.
            'fpr': fp / (tn + fp) if (tn + fp) else np.nan,
            'fnr': fn / (tp + fn) if (tp + fn) else np.nan,
        })

    return pd.DataFrame(rows, index=thresholds,
                        columns=['precision', 'recall', 'f1', 'fpr', 'fnr'])


def plot_discrimination_threshold(clf, X_test, y_test, argmax='f1', title='Metrics vs Discriminant Threshold', fig_size=(10, 8), dpi=100, save_fig_path=None):
    """
    Plot precision, recall, f1-score, FPR and FNR against the decision threshold.

    Parameters
    ----------
    clf : fitted binary classifier
        Either an estimator with ``predict_proba`` (e.g. a scikit-learn classifier or
        Pipeline) or a model whose ``predict`` returns positive-class scores in [0, 1].
    X_test : array-like of shape (n_samples, n_features)
        Test features, passed unchanged to ``clf``.
    y_test : array-like of shape (n_samples,)
        True labels, encoded as 0 / 1.
    argmax : str, default: 'f1'
        Metric whose maximum selects the annotated threshold.
        Options: 'f1', 'precision', 'recall' (any column of the metrics table works).
    title : str, default = 'Metrics vs Discriminant Threshold'
        Plot title.
    fig_size : tuple, default = (10, 8)
        Size (inches) of the plot.
    dpi : int, default = 100
        Image DPI.
    save_fig_path : str, default = None
        Full path where to save the plot. Missing parent folders are created.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The figure.
    ax : matplotlib.axes.Axes
        The axes the curves are drawn on.
    disc_threshold : float
        Threshold that maximises ``argmax``, rounded to two decimals.

    Notes
    -----
    The three face-colour entries of ``plt.rcParams`` are set to white and stay that way
    after the call.
    """
    # 100 evenly spaced candidate thresholds in [0, 1]
    thresholds = np.linspace(0, 1, 100)

    probs = _positive_class_scores(clf, X_test)
    metrics = _threshold_metrics(y_test, probs, thresholds)

    plt.rcParams["figure.facecolor"] = 'white'
    plt.rcParams["axes.facecolor"] = 'white'
    plt.rcParams["savefig.facecolor"] = 'white'

    fig, ax = plt.subplots(1, 1, figsize=fig_size, dpi=dpi)
    ax.plot(metrics['precision'], label='Precision')
    ax.plot(metrics['recall'], label='Recall')
    ax.plot(metrics['f1'], label='f1')
    ax.plot(metrics['fpr'], label='False Positive Rate (FPR)', linestyle='dotted')
    ax.plot(metrics['fnr'], label='False Negative Rate (FNR)', linestyle='dotted')

    # Draw a threshold line at the best value of the selected metric
    best_threshold = metrics[argmax].idxmax()
    disc_threshold = round(best_threshold, 2)
    ax.axvline(x=best_threshold, color='black', linestyle='dashed', label="$t_r$="+str(disc_threshold))

    # Major ticks every 0.1 and minor ticks every 0.05 on both axes
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(MultipleLocator(0.1))
        axis.set_major_formatter('{x:.1f}')
        axis.set_minor_locator(MultipleLocator(0.05))

    ax.tick_params(which='both', width=2)
    ax.tick_params(which='major', length=7)
    ax.tick_params(which='minor', length=4, color='black')

    ax.grid(True)

    ax.set_xlabel('Probability Threshold', fontsize=18)
    ax.set_ylabel('Scores', fontsize=18)
    ax.set_title(title, fontsize=18)
    leg = ax.legend(loc='best', frameon=True, framealpha=0.7)
    leg_frame = leg.get_frame()
    leg_frame.set_color('gold')

    # Save before showing so the file never depends on what the backend does with the
    # figure once it has been displayed.
    if save_fig_path is not None:
        path = pathlib.Path(save_fig_path)
        path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_fig_path, dpi=dpi)

    plt.show()

    return fig, ax, disc_threshold
# %% Choose the decision threshold on the held-out split
# The returned threshold is captured so the cells below use the value computed for the
# current model instead of a number copied by hand from an earlier output.
fig, ax, best_threshold = plot_discrimination_threshold(model, X_test, y_test)
best_threshold

# %% Predict dev-0 and export the labels
x = preprocess("dev-0/in.tsv")
res = model.predict(x)
y_predictions = np.where(res >= best_threshold, 1, 0)
out = pd.DataFrame(y_predictions)
# index=False drops the row index; the header is left at its default, so the file starts
# with a line holding the column label.
out.to_csv('dev-0/out.tsv', sep="\t", index=False)

# %% Accuracy on dev-0
from sklearn.metrics import accuracy_score
# NOTE(review): read with pandas' default header handling, so the first line of
# expected.tsv becomes the column name rather than a label. preprocess() reads in.tsv the
# same way, which keeps `y` and `out` the same length -- confirm this is intended.
y = pd.read_csv("./dev-0/expected.tsv")
score = accuracy_score(y_true=y, y_pred=out)

# %%
score

# %% Convert the accuracy into points: accuracy times 7, rounded up
import math

points = math.ceil(score * 7.0)

# %%
points