dl_rnn/train.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": "                                               label  \\\n0  B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ...   \n1  O B-PER O O O O O O O O O B-LOC O O O O O O O ...   \n2  B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ...   \n3  B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ...   \n4  B-MISC O O O O O O O O O O O B-LOC O O B-MISC ...   \n\n                                            document  \n0  EU rejects German call to boycott British lamb...  \n1  Rare Hendrix song draft sells for almost $ 17,...  \n2  China says Taiwan spoils atmosphere for talks ...  \n3  China says time right for Taiwan talks . </S> ...  \n4  German July car registrations up 14.2 pct yr /...  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>label</th>\n      <th>document</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ...</td>\n      <td>EU rejects German call to boycott British lamb...</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>O B-PER O O O O O O O O O B-LOC O O O O O O O ...</td>\n      <td>Rare Hendrix song draft sells for almost $ 17,...</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ...</td>\n      <td>China says Taiwan spoils atmosphere for talks ...</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ...</td>\n      <td>China says time right for Taiwan talks . &lt;/S&gt; ...</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>B-MISC O O O O O O O O O O O B-LOC O O B-MISC ...</td>\n      <td>German July car registrations up 14.2 pct yr /...</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "TRAIN_TSV = \"en-ner-conll-2003/train/train.tsv\"\n",
    "\n",
    "# The TSV has no header row: column 0 holds the space-separated tags, column 1 the raw text.\n",
    "training_file = pd.read_csv(\n",
    "    TRAIN_TSV,\n",
    "    sep='\\t',\n",
    "    names=[\"label\", \"document\"],\n",
    "    on_bad_lines=\"warn\",\n",
    ")\n",
    "training_file.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "# Whitespace-split tags and tokens, then record both lengths so alignment can be checked.\n",
    "tag_lists = training_file[\"label\"].map(str.split)\n",
    "token_lists = training_file[\"document\"].map(str.split)\n",
    "training_file[\"tag_list\"] = tag_lists\n",
    "training_file[\"tokenized\"] = token_lists\n",
    "training_file[\"len_tags\"] = tag_lists.map(len)\n",
    "training_file[\"len_tokenized\"] = token_lists.map(len)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [
    {
     "data": {
      "text/plain": "Empty DataFrame\nColumns: [label, document, tag_list, tokenized, len_tags, len_tokenized]\nIndex: []",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>label</th>\n      <th>document</th>\n      <th>tag_list</th>\n      <th>tokenized</th>\n      <th>len_tags</th>\n      <th>len_tokenized</th>\n    </tr>\n  </thead>\n  <tbody>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows whose tag count differs from the token count; an empty frame means labels and tokens align.\n",
    "length_mismatch = training_file[\"len_tokenized\"].ne(training_file[\"len_tags\"])\n",
    "training_file.loc[length_mismatch]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [
    {
     "data": {
      "text/plain": "                                               label  \\\n0  B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ...   \n1  O B-PER O O O O O O O O O B-LOC O O O O O O O ...   \n2  B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ...   \n3  B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ...   \n4  B-MISC O O O O O O O O O O O B-LOC O O B-MISC ...   \n\n                                            document  \\\n0  EU rejects German call to boycott British lamb...   \n1  Rare Hendrix song draft sells for almost $ 17,...   \n2  China says Taiwan spoils atmosphere for talks ...   \n3  China says time right for Taiwan talks . </S> ...   \n4  German July car registrations up 14.2 pct yr /...   \n\n                                            tag_list  \\\n0  [B-ORG, O, B-MISC, O, O, O, B-MISC, O, O, O, B...   \n1  [O, B-PER, O, O, O, O, O, O, O, O, O, B-LOC, O...   \n2  [B-LOC, O, B-LOC, O, O, O, O, O, O, B-LOC, O, ...   \n3  [B-LOC, O, O, O, O, B-LOC, O, O, O, B-LOC, O, ...   \n4  [B-MISC, O, O, O, O, O, O, O, O, O, O, O, B-LO...   \n\n                                           tokenized  len_tags  len_tokenized  \n0  [EU, rejects, German, call, to, boycott, Briti...       489            489  \n1  [Rare, Hendrix, song, draft, sells, for, almos...       197            197  \n2  [China, says, Taiwan, spoils, atmosphere, for,...       248            248  \n3  [China, says, time, right, for, Taiwan, talks,...        80             80  \n4  [German, July, car, registrations, up, 14.2, p...       235            235  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>label</th>\n      <th>document</th>\n      <th>tag_list</th>\n      <th>tokenized</th>\n      <th>len_tags</th>\n      <th>len_tokenized</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ...</td>\n      <td>EU rejects German call to boycott British lamb...</td>\n      <td>[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O, O, B...</td>\n      <td>[EU, rejects, German, call, to, boycott, Briti...</td>\n      <td>489</td>\n      <td>489</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>O B-PER O O O O O O O O O B-LOC O O O O O O O ...</td>\n      <td>Rare Hendrix song draft sells for almost $ 17,...</td>\n      <td>[O, B-PER, O, O, O, O, O, O, O, O, O, B-LOC, O...</td>\n      <td>[Rare, Hendrix, song, draft, sells, for, almos...</td>\n      <td>197</td>\n      <td>197</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ...</td>\n      <td>China says Taiwan spoils atmosphere for talks ...</td>\n      <td>[B-LOC, O, B-LOC, O, O, O, O, O, O, B-LOC, O, ...</td>\n      <td>[China, says, Taiwan, spoils, atmosphere, for,...</td>\n      <td>248</td>\n      <td>248</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ...</td>\n      <td>China says time right for Taiwan talks . 
&lt;/S&gt; ...</td>\n      <td>[B-LOC, O, O, O, O, B-LOC, O, O, O, B-LOC, O, ...</td>\n      <td>[China, says, time, right, for, Taiwan, talks,...</td>\n      <td>80</td>\n      <td>80</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>B-MISC O O O O O O O O O O O B-LOC O O B-MISC ...</td>\n      <td>German July car registrations up 14.2 pct yr /...</td>\n      <td>[B-MISC, O, O, O, O, O, O, O, O, O, O, O, B-LO...</td>\n      <td>[German, July, car, registrations, up, 14.2, p...</td>\n      <td>235</td>\n      <td>235</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_file.head()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1532\n"
     ]
    }
   ],
   "source": [
    "# Longest document in tokens (1532 in the recorded run); later cells pad to 2048.\n",
    "max_length = training_file.loc[:, \"len_tokenized\"].max()\n",
    "print(max_length)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Testowanie wektoryzacji / dewektoryzacji tekstu"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor(\n",
      "[18792   316  1335   896     8   479  7287   284     3     2 18492     4\n",
      " 11364     3     2   137     2 18793 18637 20290   346    15    14    68\n",
      "    27     9  1335  9461    59  3210    42  5299   507     6    52  4906\n",
      "    71     7    64  1712   554    49   540     3     2    20   132    15\n",
      "    27   257     5   540     4    60   536   232    18     4    37  1257\n",
      "    52   234    71  1398  1164     6    64  2541 23235    65   880  5156\n",
      "   280  3526     3     2    20  5156    40  1257    17    52 22125    71\n",
      "     3     2  2016 18381     4   449   834  1318     6     5 13472    12\n",
      "  1339  2356   132     4    15     5  9461    13  1240    42  2542     8\n",
      "  2525     5   132    16  8166   666   724  1190    12  2129   618   622\n",
      "  5276    12   836     3    13     2], shape=(126,), dtype=int64)\n"
     ]
    }
   ],
   "source": [
    "# standardize=None keeps tokens verbatim (no lower-casing or punctuation stripping),\n",
    "# so the id sequence has one entry per whitespace-separated token.\n",
    "vectorize_layer = tf.keras.layers.TextVectorization(standardize=None)\n",
    "vectorize_layer.adapt(training_file[\"document\"])\n",
    "\n",
    "sample_document = training_file[\"document\"][20]\n",
    "sample_ids = vectorize_layer(sample_document)\n",
    "print(sample_ids)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [
    {
     "data": {
      "text/plain": "126"
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(training_file[\"document\"][20].split())"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [
    {
     "data": {
      "text/plain": "['',\n '[UNK]',\n '</S>',\n '.',\n ',',\n 'the',\n 'of',\n 'in',\n 'to',\n 'a',\n ')',\n '(',\n 'and',\n '\"',\n 'on',\n 'said',\n \"'s\",\n 'for',\n '1',\n '-',\n 'The',\n 'was',\n '2',\n '0',\n '3',\n 'at',\n 'with',\n 'that',\n 'from',\n 'by',\n 'is',\n ':',\n 'as',\n 'he',\n '4',\n 'had',\n 'has',\n 'it',\n 'his',\n 'not',\n 'were',\n 'be',\n 'an',\n 'have',\n 'after',\n 'who',\n 'will',\n '5',\n 'but',\n 'first',\n 'U.S.',\n 'been',\n '$',\n '--',\n 'two',\n 'their',\n 'are',\n '6',\n 'beat',\n 'would',\n 'which',\n 'up',\n 'I',\n 'they',\n 'its',\n 'percent',\n 'year',\n 'out',\n 'Thursday',\n 'this',\n 'last',\n 'million',\n 'over',\n 'Wednesday',\n 'one',\n '7',\n 'government',\n 'against',\n '/',\n 'police',\n 'when',\n 'second',\n 'also',\n 'Tuesday',\n 'He',\n 'It',\n 'A',\n 'three',\n 'told',\n 'new',\n '10',\n 'Monday',\n 'or',\n 'about',\n 'Friday',\n 'people',\n 'In',\n 'her',\n '9',\n '1996-08-28',\n 'no',\n 'won',\n 'we',\n 'New',\n 'into',\n 'under',\n 'some',\n 'Sunday',\n 'But',\n '8',\n 'more',\n 'before',\n 'week',\n \"'\",\n 'time',\n 'than',\n 'market',\n 'could',\n 'Germany',\n 'points',\n 'We',\n 'between',\n 'Australia',\n 'years',\n 'since',\n 'Britain',\n 'other',\n 'AT',\n 'SOCCER',\n 'played',\n 'all',\n 'state',\n 'company',\n 'France',\n 'England',\n 'Saturday',\n 'only',\n '1996-08-22',\n 'officials',\n 'group',\n '1996-08-29',\n 'there',\n 'round',\n '1996',\n 'South',\n 'Minister',\n '1996-08-27',\n '11',\n 'off',\n 'match',\n '13',\n 'six',\n 'four',\n 'down',\n '6-4',\n '6-3',\n 'because',\n '21',\n 'five',\n '15',\n 'him',\n 'Spain',\n '1996-08-26',\n 'next',\n 'President',\n 'official',\n 'former',\n 'she',\n 'home',\n 'United',\n 'third',\n 'do',\n 'spokesman',\n 'just',\n 'games',\n 'expected',\n 'did',\n 'day',\n 'win',\n 'through',\n 'statement',\n 'made',\n 'NEW',\n '70',\n '12',\n '1996-08-23',\n 'them',\n 'lost',\n '14',\n 'world',\n 'where',\n '6-2',\n '20',\n 'September',\n 'Russian',\n 'July',\n 
'shares',\n \"n't\",\n 'if',\n 'back',\n 'RESULTS',\n 'Italy',\n 'YORK',\n 'China',\n 'August',\n 'president',\n 'Cup',\n '3.',\n '2.',\n 'DIVISION',\n '1.',\n 'Clinton',\n 'British',\n 'while',\n 'seconds',\n 'any',\n 'LONDON',\n 'Japan',\n 'reported',\n 'billion',\n '69',\n 'matches',\n 'v',\n 'team',\n 'month',\n 'Russia',\n 'division',\n 'Pakistan',\n 'meeting',\n 'being',\n 'They',\n 'London',\n 'June',\n 'European',\n '30',\n 'news',\n 'added',\n 'German',\n '71',\n '1996-08-25',\n 'still',\n 'peace',\n 'metres',\n 'half',\n 'Results',\n 'At',\n '1/2',\n 'talks',\n 'set',\n 'earlier',\n 'tonnes',\n 'killed',\n 'season',\n 'now',\n 'Sweden',\n 'take',\n 'held',\n 'during',\n 'Reuters',\n 'should',\n 'part',\n 'around',\n 'India',\n 'party',\n 'elections',\n 'National',\n 'took',\n 'game',\n 'Bank',\n 'soccer',\n 'number',\n 'minutes',\n 'lead',\n 'innings',\n 'early',\n 'capital',\n '68',\n '6-1',\n 'saying',\n 'end',\n 'due',\n 'days',\n 'b',\n '7-6',\n 'results',\n 'Open',\n '100',\n 'so',\n 'foreign',\n 'you',\n 'political',\n 'per',\n 'international',\n 'final',\n 'can',\n 'York',\n 'West',\n 'Belgium',\n '22',\n 'well',\n 'victory',\n 'most',\n 'Newsroom',\n 'French',\n 'Netherlands',\n '50',\n 'visit',\n 'seven',\n 'country',\n 'champion',\n 'Iraq',\n '25',\n 'our',\n 'minute',\n 'Israel',\n 'American',\n 'says',\n 'left',\n 'Czech',\n 'Africa',\n '66',\n '1996-08-24',\n 'profit',\n 'play',\n 'LEAGUE',\n '4.',\n 'vs.',\n 'league',\n '67',\n '6.',\n '5.',\n 'very',\n 'local',\n 'leader',\n 'Republic',\n '7-5',\n '24',\n '1995',\n 'war',\n 'same',\n 'go',\n 'found',\n 'support',\n 'run',\n 'newsroom',\n 'close',\n 'Inc',\n 'then',\n 'say',\n 'meet',\n 'man',\n 'called',\n 'World',\n 'States',\n 'CHICAGO',\n 'what',\n 'town',\n 'singles',\n 'prices',\n 'military',\n 'lower',\n 'eight',\n 'both',\n 'ago',\n '64',\n 'runs',\n 'put',\n 'newspaper',\n 'deal',\n 'bank',\n 'Moscow',\n 'Mark',\n '72',\n 'trade',\n 'rate',\n 'race',\n 'make',\n 'goals',\n 
'cents',\n 'St',\n 'OF',\n 'Men',\n '60',\n '16',\n 'pct',\n 'months',\n 'issue',\n 'gave',\n 'behind',\n
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview only the first entries (padding, [UNK], then the most frequent tokens);\n",
    "# the full vocabulary has tens of thousands of tokens and floods the output.\n",
    "vectorize_layer.get_vocabulary()[:50]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [
    {
     "data": {
      "text/plain": "'Kindercare says debt buy to hit Q1 results . </S> MONTGOMERY , Ala . </S> 1996-08-22 </S> KinderCare Learning Centers Inc said on Thursday that a debt buyback would mean an extraordinary loss of $ 1.2 million in its fiscal 1997 first quarter . </S> The company said that during the quarter , which began June 1 , it bought $ 30 million par value of its outstanding 10-3/8 percent senior notes due 2001 . </S> The notes were bought for $ 31.5 million . </S> Philip Maslowe , chief financial officer of the preschool and child care company , said the buyback \" offered an opportunity to reduce the company \\'s weighted average interest costs and improve future cash flows and earnings . \" </S>'"
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "# Map the ids of one document back to tokens to confirm the vectorizer round-trips the text.\n",
    "vocabulary = vectorize_layer.get_vocabulary()\n",
    "vocab_arr = np.array(vocabulary)\n",
    "token_ids = vectorize_layer(training_file[\"document\"][20]).numpy()\n",
    "\" \".join(vocab_arr[token_ids])"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [
    {
     "data": {
      "text/plain": "'Kindercare says debt buy to hit Q1 results . </S> MONTGOMERY , Ala . </S> 1996-08-22 </S> KinderCare Learning Centers Inc said on Thursday that a debt buyback would mean an extraordinary loss of $ 1.2 million in its fiscal 1997 first quarter . </S> The company said that during the quarter , which began June 1 , it bought $ 30 million par value of its outstanding 10-3/8 percent senior notes due 2001 . </S> The notes were bought for $ 31.5 million . </S> Philip Maslowe , chief financial officer of the preschool and child care company , said the buyback \" offered an opportunity to reduce the company \\'s weighted average interest costs and improve future cash flows and earnings . \" </S>'"
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_file[\"document\"][20]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [],
   "source": [
    "# Separate vectorizer for input / output"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [
    {
     "data": {
      "text/plain": "<AxesSubplot:>"
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": "<Figure size 640x480 with 1 Axes>",
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGtCAYAAADqPVUWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA8+0lEQVR4nO3de3QU9eH//9duNtmEXMl1Ew0QL0CiCAiCsUi9pEREC0ov1Hj5tFRam+hHqCh8q1TRiqK1CB/qpfXawgcvp3AQNUJBRSUGiAUxYlBEEiCbACHZXDeb7P7+8JP5sRCUwOY2eT7OmQOZec/Me+Y9l9fOzsxafD6fTwAAACZh7e4KAAAABBLhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmEqHw83GjRt17bXXKiUlRRaLRatWrTquzM6dO/XjH/9Y0dHRCg8P10UXXaTS0lJjeFNTk3JzcxUXF6eIiAhNnTpVFRUVftMoLS3VpEmT1K9fPyUmJmr27NlqaWnp+BICAIA+xdbREerr6zV8+HD96le/0vXXX3/c8N27d2vcuHGaPn26HnjgAUVFRam4uFihoaFGmZkzZ+rNN9/Ua6+9pujoaOXl5en666/XRx99JElqbW3VpEmT5HA4tGnTJpWXl+vmm29WcHCwHn744ZOqp9fr1YEDBxQZGSmLxdLRxQQAAN3A5/OptrZWKSkpslpP8Qsm32mQ5Fu5cqVfv5///Oe+G2+88YTjVFdX+4KDg32vvfaa0W/nzp0+Sb6CggKfz+fzvfXWWz6r1epzOp1GmaeeesoXFRXlc7vdJ1W3srIynyQ6Ojo6Ojq6XtiVlZV1IJH46/CVm+/i9Xr15ptv6u6771Z2drb+85//KC0tTXPnztWUKVMkSUVFRfJ4PMrKyjLGGzp0qAYMGKCCggJdfPHFKigo0LBhw5SUlGSUyc7O1m233abi4mKNHDnyuHm73W653W7jb9///dh5WVmZoqKiArmYAACgk7hcLqWmpioyMvKUpxHQcFNZWam6ujo98sgjeuihh/Too48qPz9f119/vd5991398Ic/lNPpVEhIiGJiYvzGTUpKktPplCQ5nU6/YNM2vG1YexYsWKAHHnjguP5RUVGEGwAAepnTuaUkoE9Leb1eSdLkyZM1c+ZMjRgxQnPmzNE111yjp59+OpCzOs7cuXNVU1NjdGVlZZ06PwAA0DMFNNzEx8fLZrMpIyPDr396errxtJTD4VBzc7Oqq6v9ylRUVMjhcBhljn16qu3vtjLHstvtxlUartYAANB3BTTchISE6KKLLlJJSYlf/127dmngwIGSpFGjRik4OFjr1683hpeUlKi0tFSZmZmSpMzMTO3YsUOVlZVGmXXr1ikqKuq44AQAAHC0Dt9zU1dXp6+++sr4e8+ePdq2bZtiY2M1YMAAzZ49Wz//+c81fvx4XX755crPz9cbb7yh9957T5IUHR2t6dOna9asWYqNjVVUVJRuv/12ZWZm6uKLL5YkTZgwQRkZGbrpppu0cOFCOZ1O3XvvvcrNzZXdbg/MkgMAAHPq6ONV7777bruPbN1yyy1Gmeeee853zjnn+EJDQ33Dhw/3rVq1ym8ajY2Nvt/97ne+/v37+/r16+e77rrrfOXl5X5lvvnmG9/EiRN9YWFhvvj4eN/vf/97n8fjOel61tTU+CT5ampqOrqIAACgmwTi/G3x+f7vmWmTcblcio6OVk1NDfffAADQSwTi/M1vSwEAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAA
AFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFPpcLjZuHGjrr32WqWkpMhisWjVqlUnLPvb3/5WFotFixYt8utfVVWlnJwcRUVFKSYmRtOnT1ddXZ1fmU8//VSXXnqpQkNDlZqaqoULF3a0qgAAoA/qcLipr6/X8OHDtXTp0u8st3LlSn388cdKSUk5blhOTo6Ki4u1bt06rVmzRhs3btSMGTOM4S6XSxMmTNDAgQNVVFSkxx57TPfff7+effbZjlYXAAD0MbaOjjBx4kRNnDjxO8vs379ft99+u9555x1NmjTJb9jOnTuVn5+vLVu2aPTo0ZKkJUuW6Oqrr9bjjz+ulJQULVu2TM3NzXr++ecVEhKi8847T9u2bdMTTzzhF4IAAACOFfB7brxer2666SbNnj1b55133nHDCwoKFBMTYwQbScrKypLValVhYaFRZvz48QoJCTHKZGdnq6SkREeOHGl3vm63Wy6Xy68DAAB9T8DDzaOPPiqbzaY77rij3eFOp1OJiYl+/Ww2m2JjY+V0Oo0ySUlJfmXa/m4rc6wFCxYoOjra6FJTU093UQAAQC8U0HBTVFSkJ598Ui+++KIsFksgJ/295s6dq5qaGqMrKyvr0vkDAICeIaDh5oMPPlBlZaUGDBggm80mm82mvXv36ve//70GDRokSXI4HKqsrPQbr6WlRVVVVXI4HEaZiooKvzJtf7eVOZbdbldUVJRfBwAA+p6AhpubbrpJn376qbZt22Z0KSkpmj17tt555x1JUmZmpqqrq1VUVGSMt2HDBnm9Xo0dO9Yos3HjRnk8HqPMunXrNGTIEPXv3z+QVQYAACbT4ael6urq9NVXXxl/79mzR9u2bVNsbKwGDBiguLg4v/LBwcFyOBwaMmSIJCk9PV1XXXWVbr31Vj399NPyeDzKy8vTtGnTjMfGb7jhBj3wwAOaPn267rnnHn322Wd68skn9Ze//OV0lhUAAPQBHQ43W7du1eWXX278PWvWLEnSLbfcohdffPGkprFs2TLl5eXpyiuvlNVq1dSpU7V48WJjeHR0tNauXavc3FyNGjVK8fHxmjdvHo+BAwCA72Xx+Xy+7q5EZ3C5XIqOjlZNTQ333wAA0EsE4vzNb0sBAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABT6XC42bhxo6699lqlpKTIYrFo1apVxjCPx6N77rlHw4YNU3h4uFJSUnTzzTfrwIEDftOoqqpSTk6OoqKiFBMTo+nTp6uurs6vzKeffqpLL71UoaGhSk1N1cKFC09tCQEAQJ/S4XBTX1+v4cOHa+nSpccNa2ho0CeffKL77rtPn3zyif71r3+ppKREP/7xj/3K5eTkqLi4WOvWrdOaNWu0ceNGzZgxwxjucrk0YcIEDRw4UEVFRXrsscd0//3369lnnz2FRQQAAH2Jxefz+U55ZItFK1eu1JQpU05YZsuWLRozZoz27t2rAQMGaOfOncrIyNCWLVs0evRoSVJ+fr6uvvpq7du3TykpKXrqqaf0hz/8QU6nUyEhIZKkOXPmaNWqVfriiy9Oqm4ul0vR0dGqqalRVFTUqS4iAADoQoE4f3f6PTc1NTWyWCyKiYmRJBUUFCgmJsYINpKUlZUlq9WqwsJCo8z48eONYCNJ2dnZKikp0ZEjR9qdj9vtlsvl8usAAEDf06nhpqmpSffcc49+8YtfGOnL6XQqMTHRr5zNZlNs
bKycTqdRJikpya9M299tZY61YMECRUdHG11qamqgFwcAAPQCnRZuPB6Pfvazn8nn8+mpp57qrNkY5s6dq5qaGqMrKyvr9Hk
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# One bar per document. Title and axis labels let the figure stand alone; the per-row\n",
    "# tick labels are dropped because they are unreadable at this row count.\n",
    "ax = training_file[\"len_tokenized\"].plot.bar(\n",
    "    title=\"Document length in tokens\",\n",
    "    xlabel=\"Document (row order)\",\n",
    "    ylabel=\"Tokens\",\n",
    ")\n",
    "ax.set_xticks([]);"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Padding przykładów do 2048 słów"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor([18792   316  1335 ...     0     0     0], shape=(2048,), dtype=int64)\n"
     ]
    }
   ],
   "source": [
    "# Verbatim tokens again, now with every document padded or truncated to 2048 ids.\n",
    "sentence_vectorizer = tf.keras.layers.TextVectorization(\n",
    "    standardize=None,\n",
    "    output_sequence_length=2048,\n",
    ")\n",
    "sentence_vectorizer.adapt(training_file[\"document\"])\n",
    "\n",
    "padded_document_ids = sentence_vectorizer(training_file[\"document\"][20])\n",
    "print(padded_document_ids)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor([2 2 2 ... 0 0 0], shape=(2048,), dtype=int64)\n"
     ]
    }
   ],
   "source": [
    "# Tags get their own vocabulary; tag sequences use the same fixed length of 2048 positions.\n",
    "label_vectorizer = tf.keras.layers.TextVectorization(\n",
    "    standardize=None,\n",
    "    output_sequence_length=2048,\n",
    ")\n",
    "label_vectorizer.adapt(training_file[\"label\"])\n",
    "\n",
    "padded_tag_ids = label_vectorizer(training_file[\"label\"][20])\n",
    "print(padded_tag_ids)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "outputs": [],
   "source": [
    "# Vocabularies and their sizes for tags and tokens; both include the padding and [UNK] entries.\n",
    "tags_list, vocab_list = label_vectorizer.get_vocabulary(), sentence_vectorizer.get_vocabulary()\n",
    "tags_length, vocab_length = label_vectorizer.vocabulary_size(), sentence_vectorizer.vocabulary_size()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "outputs": [],
   "source": [
    "# Store the fixed-length id sequence of every row next to its source column.\n",
    "for source_column, vectorizer in ((\"document\", sentence_vectorizer), (\"label\", label_vectorizer)):\n",
    "    training_file[f\"{source_column}_vectorized\"] = training_file[source_column].map(vectorizer)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "outputs": [],
   "source": [
    "from keras.utils import to_categorical\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "def to_model_arrays(frame):\n",
    "    \"\"\"Return (token ids, one-hot tags) arrays built from the vectorized columns of frame.\"\"\"\n",
    "    token_ids = np.stack(frame[\"document_vectorized\"].values)\n",
    "    tag_ids = np.stack(frame[\"label_vectorized\"].values)\n",
    "    # to_categorical one-hot encodes the whole (rows, positions) array in a single call.\n",
    "    return token_ids, to_categorical(tag_ids, num_classes=tags_length)\n",
    "\n",
    "\n",
    "# Fixed seed so the train/validation split is the same on every run.\n",
    "train, valid = train_test_split(training_file, test_size=0.2, random_state=42)\n",
    "train_x, train_y = to_model_arrays(train)\n",
    "val_x, val_y = to_model_arrays(valid)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2014   19  122 ...    0    0    0]\n"
     ]
    }
   ],
   "source": [
    "print(val_x[0])"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "outputs": [
    {
     "data": {
      "text/plain": "(756, 2048)"
     },
     "execution_count": 80,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_x.shape"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "outputs": [
    {
     "data": {
      "text/plain": "(756, 2048, 11)"
     },
     "execution_count": 81,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_y.shape"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "outputs": [
    {
     "data": {
      "text/plain": "array([ 128,   19, 1368, ...,    0,    0,    0], dtype=int64)"
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_x[0]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "outputs": [
    {
     "data": {
      "text/plain": "array([[0., 0., 1., ..., 0., 0., 0.],\n       [0., 0., 1., ..., 0., 0., 0.],\n       [0., 0., 0., ..., 1., 0., 0.],\n       ...,\n       [1., 0., 0., ..., 0., 0., 0.],\n       [1., 0., 0., ..., 0., 0., 0.],\n       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)"
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_y[0]"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"model_14\"\n",
      "_________________________________________________________________\n",
      " Layer (type)                Output Shape              Param #   \n",
      "=================================================================\n",
      " input_16 (InputLayer)       [(None, 2048)]            0         \n",
      "                                                                 \n",
      " embedding_15 (Embedding)    (None, 2048, 128)         3024256   \n",
      "                                                                 \n",
      " lstm_20 (LSTM)              (None, 2048, 256)         394240    \n",
      "                                                                 \n",
      " time_distributed_18 (TimeDi  (None, 2048, 11)         2827      \n",
      " stributed)                                                      \n",
      "                                                                 \n",
      "=================================================================\n",
      "Total params: 3,421,323\n",
      "Trainable params: 3,421,323\n",
      "Non-trainable params: 0\n",
      "_________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "from keras.optimizers import Adam\n",
    "import keras.layers as layers\n",
    "import keras\n",
    "\n",
    "\n",
    "def create_model(seq_len=2048, embedding_dim=128, lstm_units=256, mask_zero=False):\n",
    "    \"\"\"Build and compile the token-level tagger: Embedding -> LSTM -> per-token softmax.\n",
    "\n",
    "    Reads the notebook globals vocab_length and tags_length.\n",
    "\n",
    "    Args:\n",
    "        seq_len: Number of token ids per example; must match the vectorizers'\n",
    "            output_sequence_length.\n",
    "        embedding_dim: Size of each token embedding.\n",
    "        lstm_units: Number of units in the LSTM layer.\n",
    "        mask_zero: Forwarded to the Embedding layer. When True, id 0 (padding) is\n",
    "            masked for downstream layers. Defaults to False, which matches the\n",
    "            original model, where padded positions count towards loss and accuracy.\n",
    "\n",
    "    Returns:\n",
    "        A compiled keras.Model mapping (batch, seq_len) ids to\n",
    "        (batch, seq_len, tags_length) tag probabilities.\n",
    "    \"\"\"\n",
    "    input_layer = layers.Input(shape=(seq_len,))\n",
    "    # vocab_length + 1 is kept from the original; vocabulary_size() already counts the\n",
    "    # padding and [UNK] ids, so this leaves one unused embedding row.\n",
    "    embedding_layer = layers.Embedding(\n",
    "        input_dim=vocab_length + 1,\n",
    "        output_dim=embedding_dim,\n",
    "        input_length=seq_len,\n",
    "        mask_zero=mask_zero,\n",
    "    )(input_layer)\n",
    "    lstm_layer = layers.LSTM(lstm_units, return_sequences=True)(embedding_layer)\n",
    "    output_layer = layers.TimeDistributed(\n",
    "        layers.Dense(tags_length, activation=\"softmax\")\n",
    "    )(lstm_layer)\n",
    "    model = keras.Model(inputs=input_layer, outputs=output_layer)\n",
    "    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])\n",
    "    return model\n",
    "model = create_model()\n",
    "model.summary()"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/50\n",
      "24/24 [==============================] - 29s 1s/step - loss: 0.6602 - accuracy: 0.8703 - val_loss: 0.2673 - val_accuracy: 0.9425\n",
      "Epoch 2/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.2500 - accuracy: 0.9653 - val_loss: 0.1613 - val_accuracy: 0.9781\n",
      "Epoch 3/50\n",
      "24/24 [==============================] - 28s 1s/step - loss: 0.1062 - accuracy: 0.9790 - val_loss: 0.0984 - val_accuracy: 0.9793\n",
      "Epoch 4/50\n",
      "24/24 [==============================] - 28s 1s/step - loss: 0.0920 - accuracy: 0.9806 - val_loss: 0.0936 - val_accuracy: 0.9799\n",
      "Epoch 5/50\n",
      "24/24 [==============================] - 28s 1s/step - loss: 0.0874 - accuracy: 0.9812 - val_loss: 0.0901 - val_accuracy: 0.9800\n",
      "Epoch 6/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0828 - accuracy: 0.9816 - val_loss: 0.0867 - val_accuracy: 0.9804\n",
      "Epoch 7/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0774 - accuracy: 0.9818 - val_loss: 0.0805 - val_accuracy: 0.9804\n",
      "Epoch 8/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0715 - accuracy: 0.9819 - val_loss: 0.0741 - val_accuracy: 0.9807\n",
      "Epoch 9/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0628 - accuracy: 0.9822 - val_loss: 0.0660 - val_accuracy: 0.9808\n",
      "Epoch 10/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0543 - accuracy: 0.9826 - val_loss: 0.0579 - val_accuracy: 0.9815\n",
      "Epoch 11/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0465 - accuracy: 0.9843 - val_loss: 0.0500 - val_accuracy: 0.9851\n",
      "Epoch 12/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0385 - accuracy: 0.9879 - val_loss: 0.0453 - val_accuracy: 0.9867\n",
      "Epoch 13/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0330 - accuracy: 0.9901 - val_loss: 0.0413 - val_accuracy: 0.9873\n",
      "Epoch 14/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0298 - accuracy: 0.9909 - val_loss: 0.0395 - val_accuracy: 0.9887\n",
      "Epoch 15/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0257 - accuracy: 0.9922 - val_loss: 0.0380 - val_accuracy: 0.9887\n",
      "Epoch 16/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0241 - accuracy: 0.9924 - val_loss: 0.0362 - val_accuracy: 0.9887\n",
      "Epoch 17/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0215 - accuracy: 0.9935 - val_loss: 0.0344 - val_accuracy: 0.9897\n",
      "Epoch 18/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0191 - accuracy: 0.9942 - val_loss: 0.0335 - val_accuracy: 0.9898\n",
      "Epoch 19/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0173 - accuracy: 0.9948 - val_loss: 0.0322 - val_accuracy: 0.9906\n",
      "Epoch 20/50\n",
      "24/24 [==============================] - 28s 1s/step - loss: 0.0160 - accuracy: 0.9952 - val_loss: 0.0322 - val_accuracy: 0.9908\n",
      "Epoch 21/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0147 - accuracy: 0.9958 - val_loss: 0.0338 - val_accuracy: 0.9900\n",
      "Epoch 22/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0133 - accuracy: 0.9962 - val_loss: 0.0307 - val_accuracy: 0.9915\n",
      "Epoch 23/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0117 - accuracy: 0.9968 - val_loss: 0.0303 - val_accuracy: 0.9918\n",
      "Epoch 24/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0105 - accuracy: 0.9973 - val_loss: 0.0289 - val_accuracy: 0.9922\n",
      "Epoch 25/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0094 - accuracy: 0.9977 - val_loss: 0.0315 - val_accuracy: 0.9917\n",
      "Epoch 26/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0084 - accuracy: 0.9980 - val_loss: 0.0300 - val_accuracy: 0.9924\n",
      "Epoch 27/50\n",
      "24/24 [==============================] - 27s 1s/step - loss: 0.0073 - accuracy: 0.9984 - val_loss: 0.0295 - val_accuracy: 0.9926\n"
     ]
    }
   ],
   "source": [
    "# Stop once val_loss has failed to improve for 3 consecutive epochs and\n",
    "# roll the model back to the weights of the best epoch seen.\n",
    "callback = keras.callbacks.EarlyStopping(\n",
    "    monitor='val_loss',\n",
    "    mode='min',\n",
    "    patience=3,\n",
    "    restore_best_weights=True,\n",
    ")\n",
    "history = model.fit(\n",
    "    train_x,\n",
    "    train_y,\n",
    "    validation_data=(val_x, val_y),\n",
    "    epochs=50,\n",
    "    callbacks=[callback],\n",
    ")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "outputs": [
    {
     "data": {
      "text/plain": "[('China', 'B-LOC'),\n ('says', 'O'),\n ('time', 'O'),\n ('right', 'O'),\n ('for', 'O'),\n ('Taiwan', 'B-LOC'),\n ('talks', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('BEIJING', 'B-LOC'),\n ('1996-08-22', 'O'),\n ('</S>', 'O'),\n ('China', 'B-LOC'),\n ('has', 'O'),\n ('said', 'O'),\n ('it', 'O'),\n ('was', 'O'),\n ('time', 'O'),\n ('for', 'O'),\n ('political', 'O'),\n ('talks', 'O'),\n ('with', 'O'),\n ('Taiwan', 'B-LOC'),\n ('and', 'O'),\n ('that', 'O'),\n ('the', 'O'),\n ('rival', 'O'),\n ('island', 'O'),\n ('should', 'O'),\n ('take', 'O'),\n ('practical', 'O'),\n ('steps', 'O'),\n ('towards', 'O'),\n ('that', 'O'),\n ('goal', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Consultations', 'O'),\n ('should', 'O'),\n ('be', 'O'),\n ('held', 'O'),\n ('to', 'O'),\n ('set', 'O'),\n ('the', 'O'),\n ('time', 'O'),\n ('and', 'O'),\n ('format', 'O'),\n ('of', 'O'),\n ('the', 'O'),\n ('talks', 'O'),\n (',', 'O'),\n ('the', 'O'),\n ('official', 'O'),\n ('Xinhua', 'B-ORG'),\n ('news', 'O'),\n ('agency', 'O'),\n ('quoted', 'O'),\n ('Tang', 'B-PER'),\n ('Shubei', 'I-PER'),\n (',', 'O'),\n ('executive', 'O'),\n ('vice', 'O'),\n ('chairman', 'O'),\n ('of', 'O'),\n ('the', 'O'),\n ('Association', 'B-ORG'),\n ('for', 'I-ORG'),\n ('Relations', 'O'),\n ('Across', 'I-ORG'),\n ('the', 'I-ORG'),\n ('Taiwan', 'I-ORG'),\n ('Straits', 'I-ORG'),\n (',', 'O'),\n ('as', 'O'),\n ('saying', 'O'),\n ('late', 'O'),\n ('on', 'O'),\n ('Wednesday', 'O'),\n ('.', 'O'),\n ('</S>', 'O')]"
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NumPy copy of tags_list so that tags can be looked up by integer index.\n",
    "tag_list_numpy = np.array(tags_list)\n",
    "def get_tag_from_int(input_integer):\n",
    "    \"\"\"Return the entry of ``tag_list_numpy`` at position ``input_integer``.\"\"\"\n",
    "    return tag_list_numpy[input_integer]\n",
    "def get_ner_output_single_sentence(input_sentence):\n",
    "    \"\"\"Predict one NER tag for every whitespace-separated token of ``input_sentence``.\n",
    "\n",
    "    Args:\n",
    "        input_sentence: Text whose tokens are separated by whitespace.\n",
    "\n",
    "    Returns:\n",
    "        A list with exactly ``len(input_sentence.split())`` tags. Token positions\n",
    "        for which the model returned no prediction are filled with \"O\".\n",
    "    \"\"\"\n",
    "    sentence_length = len(input_sentence.split())\n",
    "    vectorized = sentence_vectorizer(input_sentence)\n",
    "    # The model is called on a batch, so wrap the single sequence in a batch of one.\n",
    "    model_output = model(tf.expand_dims(vectorized, 0))\n",
    "    # Highest-scoring class per position; flatten() drops the batch dimension.\n",
    "    max_indices = np.argmax(model_output, axis=2).flatten()\n",
    "    predicted_tags = [get_tag_from_int(x) for x in max_indices[:sentence_length]]\n",
    "    # NOTE(review): the vectorizer appears to emit a fixed-length sequence, so an\n",
    "    # input with more tokens than that length yields fewer predictions than tokens.\n",
    "    # Pad with \"O\" so callers always get one tag per token and rows stay aligned.\n",
    "    missing = sentence_length - len(predicted_tags)\n",
    "    if missing > 0:\n",
    "        predicted_tags.extend([\"O\"] * missing)\n",
    "    return predicted_tags\n",
    "#get_ner_output_single_sentence(\"China says time right for Taiwan talks . </S> BEIJING 1996-08-22 </S> China has said it was time for political talks with Taiwan and that the rival island should take practical steps towards that goal . </S> Consultations should be held to set the time and format of the talks , the official Xinhua news agency quoted Tang Shubei , executive vice chairman of the Association for Relations Across the Taiwan Straits , as saying late on Wednesday . </S>\")\n",
    "\n",
    "def test_sentence(sentence):\n",
    "    \"\"\"Return ``(token, predicted_tag)`` pairs for the whitespace tokens of ``sentence``.\"\"\"\n",
    "    predicted_tags = get_ner_output_single_sentence(sentence)\n",
    "    return list(zip(sentence.split(), predicted_tags))\n",
    "\n",
    "test_sentence(\"China says time right for Taiwan talks . </S> BEIJING 1996-08-22 </S> China has said it was time for political talks with Taiwan and that the rival island should take practical steps towards that goal . </S> Consultations should be held to set the time and format of the talks , the official Xinhua news agency quoted Tang Shubei , executive vice chairman of the Association for Relations Across the Taiwan Straits , as saying late on Wednesday . </S>\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor([  128    19 18713 ...     0     0     0], shape=(2048,), dtype=int64)\n",
      "[[[3.0971142e-03 1.5280694e-03 9.8057139e-01 ... 3.6668889e-03\n",
      "   1.4106639e-03 3.3225205e-03]\n",
      "  [2.1369425e-04 1.2225067e-04 9.9616271e-01 ... 1.4002173e-03\n",
      "   1.0539902e-04 2.7582867e-04]\n",
      "  [6.3146334e-05 3.8070513e-05 9.9278271e-01 ... 2.4660169e-03\n",
      "   5.7447112e-05 1.3038449e-04]\n",
      "  ...\n",
      "  [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794704e-08\n",
      "   2.6146161e-08 5.0399006e-08]\n",
      "  [9.9999696e-01 1.7784757e-08 2.5151198e-07 ... 1.3794731e-08\n",
      "   2.6146161e-08 5.0399006e-08]\n",
      "  [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794704e-08\n",
      "   2.6146161e-08 5.0399006e-08]]]\n",
      "(1, 2048, 11)\n",
      "[2 2 2 ... 0 0 0]\n",
      "2048\n"
     ]
    },
    {
     "data": {
      "text/plain": "[('SOCCER', 'O'),\n ('-', 'O'),\n ('LATE', 'O'),\n ('GOALS', 'O'),\n ('GIVE', 'O'),\n ('JAPAN', 'O'),\n ('WIN', 'O'),\n ('OVER', 'O'),\n ('SYRIA', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('AL-AIN', 'O'),\n (',', 'O'),\n ('United', 'B-LOC'),\n ('Arab', 'I-LOC'),\n ('Emirates', 'I-LOC'),\n ('1996-12-06', 'O'),\n ('</S>', 'O'),\n ('Two', 'O'),\n ('goals', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('last', 'O'),\n ('six', 'O'),\n ('minutes', 'O'),\n ('gave', 'O'),\n ('holders', 'O'),\n ('Japan', 'B-LOC'),\n ('an', 'O'),\n ('uninspiring', 'O'),\n ('2-1', 'O'),\n ('Asian', 'B-LOC'),\n ('Cup', 'I-MISC'),\n ('victory', 'O'),\n ('over', 'O'),\n ('Syria', 'B-LOC'),\n ('on', 'O'),\n ('Friday', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Takuya', 'O'),\n ('Takagi', 'O'),\n ('headed', 'O'),\n ('the', 'O'),\n ('winner', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('88th', 'O'),\n ('minute', 'O'),\n ('of', 'O'),\n ('the', 'O'),\n ('group', 'O'),\n ('C', 'O'),\n ('game', 'O'),\n ('after', 'O'),\n ('goalkeeper', 'O'),\n ('Salem', 'O'),\n ('Bitar', 'O'),\n ('spoiled', 'O'),\n ('a', 'O'),\n ('mistake-free', 'O'),\n ('display', 'O'),\n ('by', 'O'),\n ('allowing', 'O'),\n ('the', 'O'),\n ('ball', 'O'),\n ('to', 'O'),\n ('slip', 'O'),\n ('under', 'O'),\n ('his', 'O'),\n ('body', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('It', 'O'),\n ('was', 'O'),\n ('the', 'O'),\n ('second', 'O'),\n ('Syrian', 'B-PER'),\n ('defensive', 'O'),\n ('blunder', 'O'),\n ('in', 'O'),\n ('four', 'O'),\n ('minutes', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Defender', 'O'),\n ('Hassan', 'B-PER'),\n ('Abbas', 'I-PER'),\n ('rose', 'O'),\n ('to', 'O'),\n ('intercept', 'O'),\n ('a', 'O'),\n ('long', 'O'),\n ('ball', 'O'),\n ('into', 'O'),\n ('the', 'O'),\n ('area', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('84th', 'O'),\n ('minute', 'O'),\n ('but', 'O'),\n ('only', 'O'),\n ('managed', 'O'),\n ('to', 'O'),\n ('divert', 'O'),\n ('it', 'O'),\n ('into', 'O'),\n ('the', 'O'),\n ('top', 'O'),\n ('corner', 'O'),\n 
('of', 'O'),\n ('Bitar', 'O'),\n (\"'s\", 'O'),\n ('goal', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Syria', 'B-ORG'),\n ('had', 'O'),\n ('taken', 'O'),\n ('the', 'O'),\n ('lead', 'O'),\n ('from', 'O'),\n ('their', 'O'),\n ('first', 'O'),\n ('serious', 'O'),\n ('attack', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('seventh', 'O'),\n ('minute', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Nader', 'O'),\n ('Jokhadar', 'O'),\n ('headed', 'O'),\n ('a', 'O'),\n ('cross', 'O'),\n ('from', 'O'),\n ('the', 'O'),\n ('right', 'O'),\n ('by', 'O'),\n ('Ammar', 'O'),\n ('Awad', 'O'),\n ('into', 'O'),\n ('the', 'O'),\n ('top', 'O'),\n ('right', 'O'),\n ('corner', 'O'),\n ('of', 'O'),\n ('Kenichi', 'O'),\n ('Shimokawa', 'O'),\n (\"'s\", 'O'),\n ('goal', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Japan', 'B-LOC'),\n ('then', 'O'),\n ('laid', 'O'),\n ('siege', 'O'),\n ('to', 'O'),\n ('the', 'O'),\n ('Syrian', 'B-ORG'),\n ('penalty', 'O'),\n ('area', 'O'),\n ('and', 'O'),\n ('had', 'O'),\n ('a', 'O'),\n ('goal', 'O'),\n ('disallowed', 'O'),\n ('for', 'O'),\n ('offside', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('16th', 'O'),\n ('minute', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('A', 'O'),\n ('minute', 'O'),\n ('later', 'O'),\n (',', 'O'),\n ('Bitar', 'O'),\n ('produced', 'O'),\n ('a', 'O'),\n ('good', 'O'),\n ('double', 'O'),\n ('save', 'O'),\n (',', 'O'),\n ('first', 'O'),\n ('from', 'O'),\n ('Kazuyoshi', 'O'),\n ('Miura', 'O'),\n (\"'s\", 'O'),\n ('header', 'O'),\n ('and', 'O'),\n ('then', 'O'),\n ('blocked', 'O'),\n ('a', 'O'),\n ('Takagi', 'O'),\n ('follow-up', 'O'),\n ('shot', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Bitar', 'O'),\n ('saved', 'O'),\n ('well', 'O'),\n ('again', 'O'),\n ('from', 'O'),\n ('Miura', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('37th', 'O'),\n ('minute', 'O'),\n (',', 'O'),\n ('parrying', 'O'),\n ('away', 'O'),\n ('his', 'O'),\n ('header', 'O'),\n ('from', 'O'),\n ('a', 'O'),\n ('corner', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Japan', 'B-ORG'),\n ('started', 'O'),\n 
('the', 'O'),\n ('second', 'O'),\n ('half', 'O'),\n ('brightly', 'O'),\n ('but', 'O'),\n ('Bitar', 'O
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_sentence(\"SOCCER - LATE GOALS GIVE JAPAN WIN OVER SYRIA . </S> AL-AIN , United Arab Emirates 1996-12-06 </S> Two goals in the last six minutes gave holders Japan an uninspiring 2-1 Asian Cup victory over Syria on Friday . </S> Takuya Takagi headed the winner in the 88th minute of the group C game after goalkeeper Salem Bitar spoiled a mistake-free display by allowing the ball to slip under his body . </S> It was the second Syrian defensive blunder in four minutes . </S> Defender Hassan Abbas rose to intercept a long ball into the area in the 84th minute but only managed to divert it into the top corner of Bitar 's goal . </S> Syria had taken the lead from their first serious attack in the seventh minute . </S> Nader Jokhadar headed a cross from the right by Ammar Awad into the top right corner of Kenichi Shimokawa 's goal . </S> Japan then laid siege to the Syrian penalty area and had a goal disallowed for offside in the 16th minute . </S> A minute later , Bitar produced a good double save , first from Kazuyoshi Miura 's header and then blocked a Takagi follow-up shot . </S> Bitar saved well again from Miura in the 37th minute , parrying away his header from a corner . </S> Japan started the second half brightly but Bitar denied them an equaliser when he dived to his right to save Naoki Soma 's low drive in the 53rd minute . </S> Japan : 19 - Kenichi Shimokawa , 2 - Hiroshige Yanagimoto , 3 - Naoki Soma , 4 - Masami Ihara , 5 - Norio Omura , 6 - Motohiro Yamaguchi , 8 - Masakiyo Maezono ( 7 - Yasuto Honda 71 ) , 9 - Takuya Takagi , 10 - Hiroshi Nanami , 11 - Kazuyoshi Miura , 15 - Hiroaki Morishima ( 14 - Masayuki Okano 75 ) . </S> Syria : 24 - Salem Bitar , 3 - Bachar Srour ; 4 - Hassan Abbas , 5 - Tarek Jabban , 6 - Ammar Awad ( 9 - Louay Taleb 69 ) , 8 - Nihad al-Boushi , 10 - Mohammed Afash , 12 - Ali Dib , 13 - Abdul Latif Helou ( 17 - Ammar Rihawiy 46 ) , 14 - Khaled Zaher ; 16 - Nader Jokhadar . </S>\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tf.Tensor([ 1 16  1 ...  0  0  0], shape=(2048,), dtype=int64)\n",
      "[[[9.1573365e-02 8.5647009e-02 1.1034752e-01 ... 8.8930450e-02\n",
      "   8.8644758e-02 8.9963131e-02]\n",
      "  [5.5477720e-02 4.6575051e-02 5.2461910e-01 ... 6.4232960e-02\n",
      "   4.4661559e-02 5.8426060e-02]\n",
      "  [4.9609054e-02 4.3161135e-02 4.3743923e-01 ... 9.0816177e-02\n",
      "   4.6578653e-02 5.5895649e-02]\n",
      "  ...\n",
      "  [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794731e-08\n",
      "   2.6146161e-08 5.0399006e-08]\n",
      "  [9.9999696e-01 1.7784757e-08 2.5151198e-07 ... 1.3794731e-08\n",
      "   2.6146161e-08 5.0399006e-08]\n",
      "  [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794731e-08\n",
      "   2.6146161e-08 5.0399006e-08]]]\n",
      "(1, 2048, 11)\n",
      "[2 2 2 ... 0 0 0]\n",
      "2048\n"
     ]
    },
    {
     "data": {
      "text/plain": "[('Mussolini', 'O'),\n (\"'s\", 'O'),\n ('granddaughter', 'O'),\n ('rejoins', 'O'),\n ('far-right', 'O'),\n ('party', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('ROME', 'B-LOC'),\n ('1996-12-06', 'O'),\n ('</S>', 'O'),\n ('Alessandra', 'O'),\n ('Mussolini', 'O'),\n (',', 'O'),\n ('the', 'O'),\n ('granddaughter', 'O'),\n ('of', 'O'),\n ('Italy', 'B-LOC'),\n (\"'s\", 'O'),\n ('Fascist', 'O'),\n ('dictator', 'O'),\n ('Benito', 'B-PER'),\n ('Mussolini', 'I-PER'),\n (',', 'O'),\n ('said', 'O'),\n ('on', 'O'),\n ('Friday', 'O'),\n ('she', 'O'),\n ('had', 'O'),\n ('rejoined', 'O'),\n ('the', 'O'),\n ('far-right', 'O'),\n ('National', 'B-PER'),\n ('Alliance', 'I-PER'),\n ('(', 'O'),\n ('AN', 'O'),\n (')', 'O'),\n ('party', 'O'),\n ('she', 'O'),\n ('quit', 'O'),\n ('over', 'O'),\n ('policy', 'O'),\n ('differences', 'O'),\n ('last', 'O'),\n ('month', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('\"', 'O'),\n ('I', 'O'),\n (\"'ve\", 'O'),\n ('gone', 'O'),\n ('back', 'O'),\n (',', 'O'),\n ('\"', 'O'),\n ('she', 'O'),\n ('told', 'O'),\n ('a', 'O'),\n ('radio', 'O'),\n ('show', 'O'),\n ('shortly', 'O'),\n ('after', 'O'),\n ('AN', 'O'),\n ('leader', 'O'),\n ('Gianfranco', 'B-PER'),\n ('Fini', 'I-PER'),\n (',', 'O'),\n ('who', 'O'),\n ('was', 'O'),\n ('being', 'O'),\n ('interviewed', 'O'),\n ('on', 'O'),\n ('the', 'O'),\n ('programme', 'O'),\n (',', 'O'),\n ('said', 'O'),\n ('the', 'O'),\n ('row', 'O'),\n ('had', 'O'),\n ('been', 'O'),\n ('resolved', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('\"', 'O'),\n ('He', 'O'),\n ('did', 'O'),\n (\"n't\", 'O'),\n ('want', 'O'),\n ('to', 'O'),\n ('lose', 'O'),\n ('me', 'O'),\n ('and', 'O'),\n ('I', 'O'),\n ('did', 'O'),\n (\"n't\", 'O'),\n ('want', 'O'),\n ('to', 'O'),\n ('lose', 'O'),\n ('him', 'O'),\n ('.', 'O'),\n ('\"', 'O'),\n ('</S>', 'O'),\n ('Fini', 'O'),\n ('told', 'O'),\n ('state', 'O'),\n ('radio', 'O'),\n ('RAI', 'B-PER'),\n ('he', 'O'),\n ('met', 'O'),\n ('Mussolini', 'O'),\n ('thanks', 'O'),\n ('to', 'O'),\n ('the', 
'O'),\n ('good', 'O'),\n ('offices', 'O'),\n ('of', 'O'),\n ('Giuseppe', 'B-PER'),\n ('Tatarella', 'I-PER'),\n (',', 'O'),\n ('AN', 'O'),\n (\"'s\", 'O'),\n ('leader', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('Chamber', 'B-PER'),\n ('of', 'O'),\n ('Deputies', 'O'),\n ('(', 'O'),\n ('lower', 'O'),\n ('house', 'O'),\n (')', 'O'),\n (',', 'O'),\n ('and', 'O'),\n ('had', 'O'),\n ('overcome', 'O'),\n ('their', 'O'),\n ('differences', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Mussolini', 'O'),\n (',', 'O'),\n ('33', 'O'),\n (',', 'O'),\n ('resigned', 'O'),\n ('from', 'O'),\n ('the', 'O'),\n ('parliamentary', 'O'),\n ('party', 'O'),\n ('group', 'O'),\n ('for', 'O'),\n ('what', 'O'),\n ('she', 'O'),\n ('said', 'O'),\n ('were', 'O'),\n ('strictly', 'O'),\n ('political', 'O'),\n ('reasons', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('The', 'O'),\n ('fiery', 'O'),\n ('politician', 'O'),\n (',', 'O'),\n ('who', 'O'),\n ('is', 'O'),\n ('also', 'O'),\n ('a', 'O'),\n ('niece', 'O'),\n ('of', 'O'),\n ('screen', 'O'),\n ('star', 'O'),\n ('Sophia', 'B-PER'),\n ('Loren', 'I-PER'),\n (',', 'O'),\n ('had', 'O'),\n ('accused', 'O'),\n ('AN', 'O'),\n ('leaders', 'O'),\n ('of', 'O'),\n ('stifling', 'O'),\n ('internal', 'O'),\n ('party', 'O'),\n ('debate', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Mussolini', 'O'),\n (',', 'O'),\n ('who', 'O'),\n ('sits', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('Chamber', 'B-PER'),\n (',', 'O'),\n ('told', 'O'),\n ('La', 'B-ORG'),\n ('Stampa', 'I-ORG'),\n ('newspaper', 'O'),\n ('last', 'O'),\n ('month', 'O'),\n ('after', 'O'),\n ('quitting', 'O'),\n ('AN', 'O'),\n (\"'s\", 'O'),\n ('parliamentary', 'O'),\n ('party', 'O'),\n ('that', 'O'),\n ('she', 'O'),\n ('was', 'O'),\n ('considering', 'O'),\n ('joining', 'O'),\n ('the', 'O'),\n ('neo-fascist', 'O'),\n ('Social', 'B-ORG'),\n ('Movement', 'I-ORG'),\n ('(', 'O'),\n ('MS-Fiamma', 'O'),\n (')', 'O'),\n ('formed', 'O'),\n ('by', 'O'),\n ('some', 'O'),\n ('of', 'O'),\n ('the', 'O'),\n ('Duce', 'O'),\n (\"'s\", 
'O'),\n ('World', 'B-ORG'),\n ('War', 'I-ORG'),\n ('Two', 'O'),\n ('followers', 'O'),\n ('.', 'O'),\n 
     },
     "execution_count": 127,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "news_string = \"\"\"Mussolini 's granddaughter rejoins far-right party . </S> ROME 1996-12-06 </S> Alessandra Mussolini , the granddaughter of Italy 's Fascist dictator Benito Mussolini , said on Friday she had rejoined the far-right National Alliance ( AN ) party she quit over policy differences last month . </S> \" I 've gone back , \" she told a radio show shortly after AN leader Gianfranco Fini , who was being interviewed on the programme , said the row had been resolved . </S> \" He did n't want to lose me and I did n't want to lose him . \" </S> Fini told state radio RAI he met Mussolini thanks to the good offices of Giuseppe Tatarella , AN 's leader in the Chamber of Deputies ( lower house ) , and had overcome their differences . </S> Mussolini , 33 , resigned from the parliamentary party group for what she said were strictly political reasons . </S> The fiery politician , who is also a niece of screen star Sophia Loren , had accused AN leaders of stifling internal party debate . </S> Mussolini , who sits in the Chamber , told La Stampa newspaper last month after quitting AN 's parliamentary party that she was considering joining the neo-fascist Social Movement ( MS-Fiamma ) formed by some of the Duce 's World War Two followers . </S>\"\"\"\n",
    "\n",
    "test_sentence(news_string)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "outputs": [],
   "source": [
    "model.save(\"model_v2.keras\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "outputs": [],
   "source": [
    "import keras\n",
    "model = keras.models.load_model('model_v2.keras')"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ERROR:tensorflow:==================================\n",
      "Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):\n",
      "<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x00000262307DCA00>\n",
      "If you want to mark it as used call its \"mark_used()\" method.\n",
      "It was originally created here:\n",
      "  File \"C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\keras\\backend.py\", line 5130, in <genexpr>\n",
      "    ta.write(ta_index_to_write, out)  File \"C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\tensorflow\\python\\util\\tf_should_use.py\", line 243, in wrapped\n",
      "    return _add_should_use_warning(fn(*args, **kwargs),\n",
      "==================================\n"
     ]
    }
   ],
   "source": [
    "# Tag every non-blank line of the dev-0 input and write one tag sequence per line.\n",
    "with open(\"en-ner-conll-2003/dev-0/in.tsv\", \"r\", encoding=\"utf-8\") as f:\n",
    "    lines = f.readlines()\n",
    "processed = [\n",
    "    \" \".join(get_ner_output_single_sentence(x))\n",
    "    for x in lines\n",
    "    if x.strip()\n",
    "]\n",
    "with open('en-ner-conll-2003/dev-0/out.tsv', 'w',encoding=\"utf-8\") as f:\n",
    "    f.writelines(f\"{line}\\n\" for line in processed)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "outputs": [],
   "source": [
    "# Tag every non-blank line of the test-A input and write one tag sequence per line.\n",
    "with open(\"en-ner-conll-2003/test-A/in.tsv\", \"r\", encoding=\"utf-8\") as f:\n",
    "    lines = f.readlines()\n",
    "processed = [\n",
    "    \" \".join(get_ner_output_single_sentence(x))\n",
    "    for x in lines\n",
    "    if x.strip()\n",
    "]\n",
    "with open('en-ner-conll-2003/test-A/out.tsv', 'w',encoding=\"utf-8\") as f:\n",
    "    f.writelines(f\"{line}\\n\" for line in processed)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "### Czyszczenie tagów"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'B-LOC', 'I-LOC', 'O', 'I-MISC', 'B-ORG', 'B-PER', 'I-PER', 'I-ORG', 'B-MISC'}\n"
     ]
    }
   ],
   "source": [
    "# Collect every distinct tag that occurs in dev-0/out.tsv.\n",
    "with open(\"en-ner-conll-2003/dev-0/out.tsv\", \"r\", encoding=\"utf-8\") as f:\n",
    "    lines = f.readlines()\n",
    "tag_set = set()\n",
    "for line in lines:\n",
    "    # Adding a tag that is already present is a no-op, so no membership test is needed.\n",
    "    tag_set.update(line.split())\n",
    "print(tag_set)"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [],
   "source": [
    "inter_to_begin_mapping = {\n",
    "    \"I-LOC\": \"B-LOC\",\n",
    "    \"I-MISC\": 'B-MISC',\n",
    "    'I-ORG': 'B-ORG',\n",
    "    'I-PER': 'B-PER'\n",
    "}\n",
    "begin_to_inter_mapping = {v: k for k, v in inter_to_begin_mapping.items()}"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [
    {
     "data": {
      "text/plain": "{'I-LOC': 'B-LOC', 'I-MISC': 'B-MISC', 'I-ORG': 'B-ORG', 'I-PER': 'B-PER'}"
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "inter_to_begin_mapping"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [
    {
     "data": {
      "text/plain": "{'B-LOC': 'I-LOC', 'B-MISC': 'I-MISC', 'B-ORG': 'I-ORG', 'B-PER': 'I-PER'}"
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "begin_to_inter_mapping"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [],
   "source": [
    "def _fix_tag_sequence(tags):\n",
    "    \"\"\"Return a repaired copy of ``tags`` with exactly one output tag per input tag.\n",
    "\n",
    "    Helper for ``fix_tags_in_file``; reads the module-level ``inter_to_begin_mapping``\n",
    "    and ``begin_to_inter_mapping`` dictionaries.\n",
    "    \"\"\"\n",
    "    fixed = []\n",
    "    for tag in tags:\n",
    "        # The start of a line behaves like a preceding \"O\": there is no entity to continue.\n",
    "        previous = fixed[-1] if fixed else \"O\"\n",
    "        if tag == \"O\" or tag in begin_to_inter_mapping:\n",
    "            # O and B-* tags are valid after any tag.\n",
    "            fixed.append(tag)\n",
    "        elif tag in inter_to_begin_mapping:\n",
    "            if previous == \"O\":\n",
    "                # An I-* tag with nothing to continue opens a new entity instead.\n",
    "                fixed.append(inter_to_begin_mapping[tag])\n",
    "            else:\n",
    "                # Continue the entity that is already open. When the types disagree the\n",
    "                # open entity wins: B-X becomes I-X, and an I-X predecessor is repeated.\n",
    "                fixed.append(begin_to_inter_mapping.get(previous, previous))\n",
    "        else:\n",
    "            # Unknown tag: report it, but still emit a placeholder so that the output\n",
    "            # keeps one tag per input tag and later look-behinds stay aligned.\n",
    "            print(\"This shouldn't happen\")\n",
    "            fixed.append(\"O\")\n",
    "    return fixed\n",
    "\n",
    "\n",
    "def fix_tags_in_file(filename, filename_fixed):\n",
    "    \"\"\"Rewrite a file of space-separated tags so every I-* tag follows a matching tag.\n",
    "\n",
    "    Each input line is repaired independently:\n",
    "      * \"O\" and B-* tags are kept as they are;\n",
    "      * an I-* tag at the start of a line or after \"O\" is turned into its B-* tag;\n",
    "      * an I-* tag after a B-*/I-* tag of another entity type takes over that type;\n",
    "      * any other tag is reported and replaced by \"O\".\n",
    "\n",
    "    Args:\n",
    "        filename: Path of the UTF-8 file to read, one tag sequence per line.\n",
    "        filename_fixed: Path the repaired sequences are written to, one per line.\n",
    "    \"\"\"\n",
    "    with open(filename, \"r\", encoding=\"utf-8\") as f:\n",
    "        lines_fixed = [\" \".join(_fix_tag_sequence(line.split())) for line in f]\n",
    "    with open(filename_fixed, \"w\", encoding=\"utf-8\") as f:\n",
    "        f.writelines(f\"{line}\\n\" for line in lines_fixed)\n",
    "fix_tags_in_file(\"en-ner-conll-2003/test-A/out.tsv\", \"en-ner-conll-2003/test-A/out_fixed.tsv\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [],
   "source": [
    "fix_tags_in_file(\"en-ner-conll-2003/dev-0/out.tsv\", \"en-ner-conll-2003/dev-0/out_fixed.tsv\")"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}