{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [
{
"data": {
"text/plain": " label \\\n0 B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ... \n1 O B-PER O O O O O O O O O B-LOC O O O O O O O ... \n2 B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ... \n3 B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ... \n4 B-MISC O O O O O O O O O O O B-LOC O O B-MISC ... \n\n document \n0 EU rejects German call to boycott British lamb... \n1 Rare Hendrix song draft sells for almost $ 17,... \n2 China says Taiwan spoils atmosphere for talks ... \n3 China says time right for Taiwan talks . </S> ... \n4 German July car registrations up 14.2 pct yr /... ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>label</th>\n <th>document</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ...</td>\n <td>EU rejects German call to boycott British lamb...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>O B-PER O O O O O O O O O B-LOC O O O O O O O ...</td>\n <td>Rare Hendrix song draft sells for almost $ 17,...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ...</td>\n <td>China says Taiwan spoils atmosphere for talks ...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ...</td>\n <td>China says time right for Taiwan talks . </S> ...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>B-MISC O O O O O O O O O O O B-LOC O O B-MISC ...</td>\n <td>German July car registrations up 14.2 pct yr /...</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"\n",
"training_file = pd.read_csv(\"en-ner-conll-2003/train/train.tsv\", sep='\\t', on_bad_lines=\"warn\", names=[\"label\",\"document\"])\n",
"training_file.head()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [],
"source": [
"import tensorflow as tf\n",
"training_file[\"tag_list\"] = training_file[\"label\"].apply(lambda x : x.split())\n",
"training_file[\"tokenized\"] = training_file[\"document\"].apply(lambda x : x.split())\n",
"training_file[\"len_tags\"] = training_file[\"tag_list\"].apply(len)\n",
"training_file[\"len_tokenized\"] = training_file[\"tokenized\"].apply(len)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"data": {
"text/plain": "Empty DataFrame\nColumns: [label, document, tag_list, tokenized, len_tags, len_tokenized]\nIndex: []",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>label</th>\n <th>document</th>\n <th>tag_list</th>\n <th>tokenized</th>\n <th>len_tags</th>\n <th>len_tokenized</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table>\n</div>"
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"training_file.loc[~(training_file['len_tokenized'] == training_file['len_tags'])]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"data": {
"text/plain": " label \\\n0 B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ... \n1 O B-PER O O O O O O O O O B-LOC O O O O O O O ... \n2 B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ... \n3 B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ... \n4 B-MISC O O O O O O O O O O O B-LOC O O B-MISC ... \n\n document \\\n0 EU rejects German call to boycott British lamb... \n1 Rare Hendrix song draft sells for almost $ 17,... \n2 China says Taiwan spoils atmosphere for talks ... \n3 China says time right for Taiwan talks . </S> ... \n4 German July car registrations up 14.2 pct yr /... \n\n tag_list \\\n0 [B-ORG, O, B-MISC, O, O, O, B-MISC, O, O, O, B... \n1 [O, B-PER, O, O, O, O, O, O, O, O, O, B-LOC, O... \n2 [B-LOC, O, B-LOC, O, O, O, O, O, O, B-LOC, O, ... \n3 [B-LOC, O, O, O, O, B-LOC, O, O, O, B-LOC, O, ... \n4 [B-MISC, O, O, O, O, O, O, O, O, O, O, O, B-LO... \n\n tokenized len_tags len_tokenized \n0 [EU, rejects, German, call, to, boycott, Briti... 489 489 \n1 [Rare, Hendrix, song, draft, sells, for, almos... 197 197 \n2 [China, says, Taiwan, spoils, atmosphere, for,... 248 248 \n3 [China, says, time, right, for, Taiwan, talks,... 80 80 \n4 [German, July, car, registrations, up, 14.2, p... 235 235 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>label</th>\n <th>document</th>\n <th>tag_list</th>\n <th>tokenized</th>\n <th>len_tags</th>\n <th>len_tokenized</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ...</td>\n <td>EU rejects German call to boycott British lamb...</td>\n <td>[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O, O, B...</td>\n <td>[EU, rejects, German, call, to, boycott, Briti...</td>\n <td>489</td>\n <td>489</td>\n </tr>\n <tr>\n <th>1</th>\n <td>O B-PER O O O O O O O O O B-LOC O O O O O O O ...</td>\n <td>Rare Hendrix song draft sells for almost $ 17,...</td>\n <td>[O, B-PER, O, O, O, O, O, O, O, O, O, B-LOC, O...</td>\n <td>[Rare, Hendrix, song, draft, sells, for, almos...</td>\n <td>197</td>\n <td>197</td>\n </tr>\n <tr>\n <th>2</th>\n <td>B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ...</td>\n <td>China says Taiwan spoils atmosphere for talks ...</td>\n <td>[B-LOC, O, B-LOC, O, O, O, O, O, O, B-LOC, O, ...</td>\n <td>[China, says, Taiwan, spoils, atmosphere, for,...</td>\n <td>248</td>\n <td>248</td>\n </tr>\n <tr>\n <th>3</th>\n <td>B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ...</td>\n <td>China says time right for Taiwan talks . </S> ...</td>\n <td>[B-LOC, O, O, O, O, B-LOC, O, O, O, B-LOC, O, ...</td>\n <td>[China, says, time, right, for, Taiwan, talks,...</td>\n <td>80</td>\n <td>80</td>\n </tr>\n <tr>\n <th>4</th>\n <td>B-MISC O O O O O O O O O O O B-LOC O O B-MISC ...</td>\n <td>German July car registrations up 14.2 pct yr /...</td>\n <td>[B-MISC, O, O, O, O, O, O, O, O, O, O, O, B-LO...</td>\n <td>[German, July, car, registrations, up, 14.2, p...</td>\n <td>235</td>\n <td>235</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"training_file.head()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1532\n"
]
}
],
"source": [
"max_length = training_file[\"len_tokenized\"].max()\n",
"print(max_length) # 1532 ---> ~2048"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Testowanie wektoryzacji / dewektoryzacji tekstu"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor(\n",
"[18792 316 1335 896 8 479 7287 284 3 2 18492 4\n",
" 11364 3 2 137 2 18793 18637 20290 346 15 14 68\n",
" 27 9 1335 9461 59 3210 42 5299 507 6 52 4906\n",
" 71 7 64 1712 554 49 540 3 2 20 132 15\n",
" 27 257 5 540 4 60 536 232 18 4 37 1257\n",
" 52 234 71 1398 1164 6 64 2541 23235 65 880 5156\n",
" 280 3526 3 2 20 5156 40 1257 17 52 22125 71\n",
" 3 2 2016 18381 4 449 834 1318 6 5 13472 12\n",
" 1339 2356 132 4 15 5 9461 13 1240 42 2542 8\n",
" 2525 5 132 16 8166 666 724 1190 12 2129 618 622\n",
" 5276 12 836 3 13 2], shape=(126,), dtype=int64)\n"
]
}
],
"source": [
"vectorize_layer = tf.keras.layers.TextVectorization(standardize=None)\n",
"vectorize_layer.adapt(training_file[\"document\"])\n",
"print(vectorize_layer(training_file[\"document\"][20]))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"data": {
"text/plain": "126"
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(training_file[\"document\"][20].split())"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": "['',\n '[UNK]',\n '</S>',\n '.',\n ',',\n 'the',\n 'of',\n 'in',\n 'to',\n 'a',\n ')',\n '(',\n 'and',\n '\"',\n 'on',\n 'said',\n \"'s\",\n 'for',\n '1',\n '-',\n 'The',\n 'was',\n '2',\n '0',\n '3',\n 'at',\n 'with',\n 'that',\n 'from',\n 'by',\n 'is',\n ':',\n 'as',\n 'he',\n '4',\n 'had',\n 'has',\n 'it',\n 'his',\n 'not',\n 'were',\n 'be',\n 'an',\n 'have',\n 'after',\n 'who',\n 'will',\n '5',\n 'but',\n 'first',\n 'U.S.',\n 'been',\n '$',\n '--',\n 'two',\n 'their',\n 'are',\n '6',\n 'beat',\n 'would',\n 'which',\n 'up',\n 'I',\n 'they',\n 'its',\n 'percent',\n 'year',\n 'out',\n 'Thursday',\n 'this',\n 'last',\n 'million',\n 'over',\n 'Wednesday',\n 'one',\n '7',\n 'government',\n 'against',\n '/',\n 'police',\n 'when',\n 'second',\n 'also',\n 'Tuesday',\n 'He',\n 'It',\n 'A',\n 'three',\n 'told',\n 'new',\n '10',\n 'Monday',\n 'or',\n 'about',\n 'Friday',\n 'people',\n 'In',\n 'her',\n '9',\n '1996-08-28',\n 'no',\n 'won',\n 'we',\n 'New',\n 'into',\n 'under',\n 'some',\n 'Sunday',\n 'But',\n '8',\n 'more',\n 'before',\n 'week',\n \"'\",\n 'time',\n 'than',\n 'market',\n 'could',\n 'Germany',\n 'points',\n 'We',\n 'between',\n 'Australia',\n 'years',\n 'since',\n 'Britain',\n 'other',\n 'AT',\n 'SOCCER',\n 'played',\n 'all',\n 'state',\n 'company',\n 'France',\n 'England',\n 'Saturday',\n 'only',\n '1996-08-22',\n 'officials',\n 'group',\n '1996-08-29',\n 'there',\n 'round',\n '1996',\n 'South',\n 'Minister',\n '1996-08-27',\n '11',\n 'off',\n 'match',\n '13',\n 'six',\n 'four',\n 'down',\n '6-4',\n '6-3',\n 'because',\n '21',\n 'five',\n '15',\n 'him',\n 'Spain',\n '1996-08-26',\n 'next',\n 'President',\n 'official',\n 'former',\n 'she',\n 'home',\n 'United',\n 'third',\n 'do',\n 'spokesman',\n 'just',\n 'games',\n 'expected',\n 'did',\n 'day',\n 'win',\n 'through',\n 'statement',\n 'made',\n 'NEW',\n '70',\n '12',\n '1996-08-23',\n 'them',\n 'lost',\n '14',\n 'world',\n 'where',\n '6-2',\n '20',\n 'September',\n 'Russian',\n 'July',\n 'shares',\n \"n't\",\n 'if',\n 'back',\n 'RESULTS',\n 'Italy',\n 'YORK',\n 'China',\n 'August',\n 'president',\n 'Cup',\n '3.',\n '2.',\n 'DIVISION',\n '1.',\n 'Clinton',\n 'British',\n 'while',\n 'seconds',\n 'any',\n 'LONDON',\n 'Japan',\n 'reported',\n 'billion',\n '69',\n 'matches',\n 'v',\n 'team',\n 'month',\n 'Russia',\n 'division',\n 'Pakistan',\n 'meeting',\n 'being',\n 'They',\n 'London',\n 'June',\n 'European',\n '30',\n 'news',\n 'added',\n 'German',\n '71',\n '1996-08-25',\n 'still',\n 'peace',\n 'metres',\n 'half',\n 'Results',\n 'At',\n '1/2',\n 'talks',\n 'set',\n 'earlier',\n 'tonnes',\n 'killed',\n 'season',\n 'now',\n 'Sweden',\n 'take',\n 'held',\n 'during',\n 'Reuters',\n 'should',\n 'part',\n 'around',\n 'India',\n 'party',\n 'elections',\n 'National',\n 'took',\n 'game',\n 'Bank',\n 'soccer',\n 'number',\n 'minutes',\n 'lead',\n 'innings',\n 'early',\n 'capital',\n '68',\n '6-1',\n 'saying',\n 'end',\n 'due',\n 'days',\n 'b',\n '7-6',\n 'results',\n 'Open',\n '100',\n 'so',\n 'foreign',\n 'you',\n 'political',\n 'per',\n 'international',\n 'final',\n 'can',\n 'York',\n 'West',\n 'Belgium',\n '22',\n 'well',\n 'victory',\n 'most',\n 'Newsroom',\n 'French',\n 'Netherlands',\n '50',\n 'visit',\n 'seven',\n 'country',\n 'champion',\n 'Iraq',\n '25',\n 'our',\n 'minute',\n 'Israel',\n 'American',\n 'says',\n 'left',\n 'Czech',\n 'Africa',\n '66',\n '1996-08-24',\n 'profit',\n 'play',\n 'LEAGUE',\n '4.',\n 'vs.',\n 'league',\n '67',\n '6.',\n '5.',\n 'very',\n 'local',\n 'leader',\n 'Republic',\n '7-5',\n '24',\n 
'1995',\n 'war',\n 'same',\n 'go',\n 'found',\n 'support',\n 'run',\n 'newsroom',\n 'close',\n 'Inc',\n 'then',\n 'say',\n 'meet',\n 'man',\n 'called',\n 'World',\n 'States',\n 'CHICAGO',\n 'what',\n 'town',\n 'singles',\n 'prices',\n 'military',\n 'lower',\n 'eight',\n 'both',\n 'ago',\n '64',\n 'runs',\n 'put',\n 'newspaper',\n 'deal',\n 'bank',\n 'Moscow',\n 'Mark',\n '72',\n 'trade',\n 'rate',\n 'race',\n 'make',\n 'goals',\n 'cents',\n 'St',\n 'OF',\n 'Men',\n '60',\n '16',\n 'pct',\n 'months',\n 'issue',\n 'gave',\n 'behind',\n
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectorize_layer.get_vocabulary()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"data": {
"text/plain": "'Kindercare says debt buy to hit Q1 results . </S> MONTGOMERY , Ala . </S> 1996-08-22 </S> KinderCare Learning Centers Inc said on Thursday that a debt buyback would mean an extraordinary loss of $ 1.2 million in its fiscal 1997 first quarter . </S> The company said that during the quarter , which began June 1 , it bought $ 30 million par value of its outstanding 10-3/8 percent senior notes due 2001 . </S> The notes were bought for $ 31.5 million . </S> Philip Maslowe , chief financial officer of the preschool and child care company , said the buyback \" offered an opportunity to reduce the company \\'s weighted average interest costs and improve future cash flows and earnings . \" </S>'"
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"vocabulary = vectorize_layer.get_vocabulary()\n",
"vocab_arr = np.asarray(vocabulary)\n",
"\" \".join(vocab_arr[vectorize_layer(training_file[\"document\"][20])])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": "'Kindercare says debt buy to hit Q1 results . </S> MONTGOMERY , Ala . </S> 1996-08-22 </S> KinderCare Learning Centers Inc said on Thursday that a debt buyback would mean an extraordinary loss of $ 1.2 million in its fiscal 1997 first quarter . </S> The company said that during the quarter , which began June 1 , it bought $ 30 million par value of its outstanding 10-3/8 percent senior notes due 2001 . </S> The notes were bought for $ 31.5 million . </S> Philip Maslowe , chief financial officer of the preschool and child care company , said the buyback \" offered an opportunity to reduce the company \\'s weighted average interest costs and improve future cash flows and earnings . \" </S>'"
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"training_file[\"document\"][20]"
],
"metadata": {
"collapsed": false
}
},
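{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Quick sanity check: the two outputs above are identical, so vectorizing and then\n",
"# devectorizing round-trips losslessly for this in-vocabulary example\n",
"\" \".join(vocab_arr[vectorize_layer(training_file[\"document\"][20])]) == training_file[\"document\"][20]"
],
"metadata": {
"collapsed": false
}
},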
{
"cell_type": "code",
"execution_count": 11,
"outputs": [],
"source": [
"# Separate vectorizer for input / output"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": "<AxesSubplot:>"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGtCAYAAADqPVUWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA8+0lEQVR4nO3de3QU9eH//9duNtmEXMl1Ew0QL0CiCAiCsUi9pEREC0ov1Hj5tFRam+hHqCh8q1TRiqK1CB/qpfXawgcvp3AQNUJBRSUGiAUxYlBEEiCbACHZXDeb7P7+8JP5sRCUwOY2eT7OmQOZec/Me+Y9l9fOzsxafD6fTwAAACZh7e4KAAAABBLhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmEqHw83GjRt17bXXKiUlRRaLRatWrTquzM6dO/XjH/9Y0dHRCg8P10UXXaTS0lJjeFNTk3JzcxUXF6eIiAhNnTpVFRUVftMoLS3VpEmT1K9fPyUmJmr27NlqaWnp+BICAIA+xdbREerr6zV8+HD96le/0vXXX3/c8N27d2vcuHGaPn26HnjgAUVFRam4uFihoaFGmZkzZ+rNN9/Ua6+9pujoaOXl5en666/XRx99JElqbW3VpEmT5HA4tGnTJpWXl+vmm29WcHCwHn744ZOqp9fr1YEDBxQZGSmLxdLRxQQAAN3A5/OptrZWKSkpslpP8Qsm32mQ5Fu5cqVfv5///Oe+G2+88YTjVFdX+4KDg32vvfaa0W/nzp0+Sb6CggKfz+fzvfXWWz6r1epzOp1GmaeeesoXFRXlc7vdJ1W3srIynyQ6Ojo6Ojq6XtiVlZV1IJH46/CVm+/i9Xr15ptv6u6771Z2drb+85//KC0tTXPnztWUKVMkSUVFRfJ4PMrKyjLGGzp0qAYMGKCCggJdfPHFKigo0LBhw5SUlGSUyc7O1m233abi4mKNHDnyuHm73W653W7jb9///dh5WVmZoqKiArmYAACgk7hcLqWmpioyMvKUpxHQcFNZWam6ujo98sgjeuihh/Too48qPz9f119/vd5991398Ic/lNPpVEhIiGJiYvzGTUpKktPplCQ5nU6/YNM2vG1YexYsWKAHHnjguP5RUVGEGwAAepnTuaUkoE9Leb1eSdLkyZM1c+ZMjRgxQnPmzNE111yjp59+OpCzOs7cuXNVU1NjdGVlZZ06PwAA0DMFNNzEx8fLZrMpIyPDr396errxtJTD4VBzc7Oqq6v9ylRUVMjhcBhljn16qu3vtjLHstvtxlUartYAANB3BTTchISE6KKLLlJJSYlf/127dmngwIGSpFGjRik4OFjr1683hpeUlKi0tFSZmZmSpMzMTO3YsUOVlZVGmXXr1ikqKuq44AQAAHC0Dt9zU1dXp6+++sr4e8+ePdq2bZtiY2M1YMAAzZ49Wz//+c81fvx4XX755crPz9cbb7yh9957T5IUHR2t6dOna9asWYqNjVVUVJRuv/12ZWZm6uKLL5YkTZgwQRkZGbrpppu0cOFCOZ1O3XvvvcrNzZXdbg/MkgMAAHPq6ONV7777bruPbN1yyy1Gmeeee853zjnn+EJDQ33Dhw/3rVq1ym8ajY2Nvt/97ne+/v37+/r16+e77rrrfOXl5X5lvvnmG9/EiRN9YWFhvvj4eN/vf/97n8fjOel61tTU+CT5ampqOrqIAACgmwTi/G3x+f7vmWmTcblcio6OVk1NDfffAADQSwTi/M1vSwEAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFPpcLjZuHGjrr32WqWkpMhisWjVqlUnLPvb3/5WFotFixYt8utfVVWlnJwcRUVFKSYmRtOnT1ddXZ1fmU8//VSXXnqpQkNDlZqaqoULF3a0qgAAoA/qcLipr6/X8OHDtXTp0u8st3LlSn388cdKSUk5blhOTo6Ki4u1bt06rVmzRhs3btSMGTOM4S6XSxMmTNDAgQNVVFSkxx57TPfff7+effbZjlYXAAD0MbaOjjBx4kRNnDjxO8vs379ft99+u9555x1NmjTJb9jOnTuVn5+vLVu2aPTo0ZKkJUuW6Oqrr9bjjz+ulJQULVu2TM3NzXr++ecVEhKi8847T9u2bdMTTzzhF4IAAACOFfB7brxer2666SbNnj1b55133nHDCwoKFBMTYwQbScrKypLValVhYaFRZvz48QoJCTHKZGdnq6SkREeOHGl3vm63Wy6Xy68DAAB9T8DDzaOPPiqbzaY77rij3eFOp1OJiYl+/Ww2m2JjY+V0Oo0ySUlJfmXa/m4rc6wFCxYoOjra6FJTU093UQAAQC8U0HBTVFSkJ598Ui+++KIsFksgJ/295s6dq5qaGqMrKyvr0vkDAICeIaDh5oMPPlBlZaUGDBggm80mm82mvXv36ve//70GDRokSXI4HKqsrPQbr6WlRVVVVXI4HEaZiooKvzJtf7eVOZbdbldUVJRfBwAA+p6AhpubbrpJn376qbZt22Z0KSkpmj17tt555x1JUmZmpqqrq1VUVGSMt2HDBnm9Xo0dO9Yos3HjRnk8HqPMunXrNGTIEPXv3z+QVQYAACbT4ael6urq9NVXXxl/79mzR9u2bVNsbKwGDBiguLg4v/LBwcFyOBwaMmSIJCk9PV1XXXWVbr31Vj399NPyeDzKy8vTtGnTjMfGb7jhBj3wwAOaPn267rnnHn322Wd68skn9Ze//OV0lhUAAPQBHQ43W7du1eWXX278PWvWLEnSLbfcohdffPGkprFs2TLl5eXpyiuvlNVq1dSpU7V48WJjeHR0tNauXavc3FyNGjVK8fHxmjdvHo+BAwCA72Xx+Xy+7q5EZ3C5XIqOjlZNTQ333wAA0EsE4vzNb0sBAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABT6XC42bhxo6699lqlpKTIYrFo1apVxjCPx6N77rlHw4YNU3h4uFJSUnTzzTfrwIEDftOoqqpSTk6OoqKiFBMTo+nTp6uurs6vzKeffqpLL71UoaGhSk1N1
cKFC09tCQEAQJ/S4XBTX1+v4cOHa+nSpccNa2ho0CeffKL77rtPn3zyif71r3+ppKREP/7xj/3K5eTkqLi4WOvWrdOaNWu0ceNGzZgxwxjucrk0YcIEDRw4UEVFRXrsscd0//3369lnnz2FRQQAAH2Jxefz+U55ZItFK1eu1JQpU05YZsuWLRozZoz27t2rAQMGaOfOncrIyNCWLVs0evRoSVJ+fr6uvvpq7du3TykpKXrqqaf0hz/8QU6nUyEhIZKkOXPmaNWqVfriiy9Oqm4ul0vR0dGqqalRVFTUqS4iAADoQoE4f3f6PTc1NTWyWCyKiYmRJBUUFCgmJsYINpKUlZUlq9WqwsJCo8z48eONYCNJ2dnZKikp0ZEjR9qdj9vtlsvl8usAAEDf06nhpqmpSffcc49+8YtfGOnL6XQqMTHRr5zNZlNsbKycTqdRJikpya9M299tZY61YMECRUdHG11qamqgFwcAAPQCnRZuPB6Pfvazn8nn8+mpp57qrNkY5s6dq5qaGqMrKyvr9Hk
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"training_file[\"len_tokenized\"].plot.bar()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Padding przykładów do 2048 słów"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor([18792 316 1335 ... 0 0 0], shape=(2048,), dtype=int64)\n"
]
}
],
"source": [
"sentence_vectorizer = tf.keras.layers.TextVectorization(standardize=None, output_sequence_length=2048)\n",
"sentence_vectorizer.adapt(training_file[\"document\"])\n",
"print(sentence_vectorizer(training_file[\"document\"][20]))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor([2 2 2 ... 0 0 0], shape=(2048,), dtype=int64)\n"
]
}
],
"source": [
"label_vectorizer = tf.keras.layers.TextVectorization(standardize=None, output_sequence_length=2048)\n",
"label_vectorizer.adapt(training_file[\"label\"])\n",
"print(label_vectorizer(training_file[\"label\"][20]))"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [],
"source": [
"tags_list = label_vectorizer.get_vocabulary()\n",
"tags_length = label_vectorizer.vocabulary_size()\n",
"\n",
"vocab_list = sentence_vectorizer.get_vocabulary()\n",
"vocab_length = sentence_vectorizer.vocabulary_size()"
],
"metadata": {
"collapsed": false
}
},
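{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# The label vocabulary holds the padding token '', '[UNK]' and the nine BIO tags,\n",
"# which gives the 11 output classes used by the model below (a quick check)\n",
"print(tags_length, tags_list)"
],
"metadata": {
"collapsed": false
}
},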
{
"cell_type": "code",
"execution_count": 16,
"outputs": [],
"source": [
"training_file[\"document_vectorized\"] = training_file[\"document\"].apply(sentence_vectorizer)\n",
"training_file[\"label_vectorized\"] = training_file[\"label\"].apply(label_vectorizer)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 79,
"outputs": [],
"source": [
"from keras.utils import to_categorical\n",
"from sklearn.model_selection import train_test_split\n",
"train, valid = train_test_split(training_file, test_size=0.2)\n",
"train_x = np.stack(train[\"document_vectorized\"].values)\n",
"train_y = np.stack(train[\"label_vectorized\"].values)\n",
"train_y = np.array([to_categorical(i,num_classes = tags_length) for i in train_y])\n",
"\n",
"val_x = np.stack(valid[\"document_vectorized\"].values)\n",
"val_y = np.stack(valid[\"label_vectorized\"].values)\n",
"val_y = np.array([to_categorical(i,num_classes = tags_length) for i in val_y])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 97,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2014 19 122 ... 0 0 0]\n"
]
}
],
"source": [
"print(val_x[0])"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 80,
"outputs": [
{
"data": {
"text/plain": "(756, 2048)"
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_x.shape"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 81,
"outputs": [
{
"data": {
"text/plain": "(756, 2048, 11)"
},
"execution_count": 81,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_y.shape"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 82,
"outputs": [
{
"data": {
"text/plain": "array([ 128, 19, 1368, ..., 0, 0, 0], dtype=int64)"
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_x[0]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 83,
"outputs": [
{
"data": {
"text/plain": "array([[0., 0., 1., ..., 0., 0., 0.],\n [0., 0., 1., ..., 0., 0., 0.],\n [0., 0., 0., ..., 1., 0., 0.],\n ...,\n [1., 0., 0., ..., 0., 0., 0.],\n [1., 0., 0., ..., 0., 0., 0.],\n [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)"
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_y[0]"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 122,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model: \"model_14\"\n",
"_________________________________________________________________\n",
" Layer (type) Output Shape Param # \n",
"=================================================================\n",
" input_16 (InputLayer) [(None, 2048)] 0 \n",
" \n",
" embedding_15 (Embedding) (None, 2048, 128) 3024256 \n",
" \n",
" lstm_20 (LSTM) (None, 2048, 256) 394240 \n",
" \n",
" time_distributed_18 (TimeDi (None, 2048, 11) 2827 \n",
" stributed) \n",
" \n",
"=================================================================\n",
"Total params: 3,421,323\n",
"Trainable params: 3,421,323\n",
"Non-trainable params: 0\n",
"_________________________________________________________________\n"
]
}
],
"source": [
"from keras.optimizers import Adam\n",
"import keras.layers as layers\n",
"import keras\n",
"\n",
"\n",
"def create_model():\n",
" input_layer = layers.Input(shape=(2048,))\n",
" embedding_layer = layers.Embedding(input_dim = vocab_length+1,output_dim = 128,input_length = 2048)(input_layer)\n",
" lstm_layer = layers.LSTM(256, return_sequences=True)(embedding_layer)\n",
" output_layer = layers.TimeDistributed(layers.Dense(tags_length,activation=\"softmax\"))(lstm_layer)\n",
" #out = layers.Dense(2048,activation=\"linear\")(dropout)\n",
" model = keras.Model(inputs=input_layer, outputs=output_layer)\n",
" model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])\n",
" return model\n",
"model = create_model()\n",
"model.summary()"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 123,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/50\n",
"24/24 [==============================] - 29s 1s/step - loss: 0.6602 - accuracy: 0.8703 - val_loss: 0.2673 - val_accuracy: 0.9425\n",
"Epoch 2/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.2500 - accuracy: 0.9653 - val_loss: 0.1613 - val_accuracy: 0.9781\n",
"Epoch 3/50\n",
"24/24 [==============================] - 28s 1s/step - loss: 0.1062 - accuracy: 0.9790 - val_loss: 0.0984 - val_accuracy: 0.9793\n",
"Epoch 4/50\n",
"24/24 [==============================] - 28s 1s/step - loss: 0.0920 - accuracy: 0.9806 - val_loss: 0.0936 - val_accuracy: 0.9799\n",
"Epoch 5/50\n",
"24/24 [==============================] - 28s 1s/step - loss: 0.0874 - accuracy: 0.9812 - val_loss: 0.0901 - val_accuracy: 0.9800\n",
"Epoch 6/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0828 - accuracy: 0.9816 - val_loss: 0.0867 - val_accuracy: 0.9804\n",
"Epoch 7/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0774 - accuracy: 0.9818 - val_loss: 0.0805 - val_accuracy: 0.9804\n",
"Epoch 8/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0715 - accuracy: 0.9819 - val_loss: 0.0741 - val_accuracy: 0.9807\n",
"Epoch 9/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0628 - accuracy: 0.9822 - val_loss: 0.0660 - val_accuracy: 0.9808\n",
"Epoch 10/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0543 - accuracy: 0.9826 - val_loss: 0.0579 - val_accuracy: 0.9815\n",
"Epoch 11/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0465 - accuracy: 0.9843 - val_loss: 0.0500 - val_accuracy: 0.9851\n",
"Epoch 12/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0385 - accuracy: 0.9879 - val_loss: 0.0453 - val_accuracy: 0.9867\n",
"Epoch 13/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0330 - accuracy: 0.9901 - val_loss: 0.0413 - val_accuracy: 0.9873\n",
"Epoch 14/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0298 - accuracy: 0.9909 - val_loss: 0.0395 - val_accuracy: 0.9887\n",
"Epoch 15/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0257 - accuracy: 0.9922 - val_loss: 0.0380 - val_accuracy: 0.9887\n",
"Epoch 16/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0241 - accuracy: 0.9924 - val_loss: 0.0362 - val_accuracy: 0.9887\n",
"Epoch 17/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0215 - accuracy: 0.9935 - val_loss: 0.0344 - val_accuracy: 0.9897\n",
"Epoch 18/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0191 - accuracy: 0.9942 - val_loss: 0.0335 - val_accuracy: 0.9898\n",
"Epoch 19/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0173 - accuracy: 0.9948 - val_loss: 0.0322 - val_accuracy: 0.9906\n",
"Epoch 20/50\n",
"24/24 [==============================] - 28s 1s/step - loss: 0.0160 - accuracy: 0.9952 - val_loss: 0.0322 - val_accuracy: 0.9908\n",
"Epoch 21/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0147 - accuracy: 0.9958 - val_loss: 0.0338 - val_accuracy: 0.9900\n",
"Epoch 22/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0133 - accuracy: 0.9962 - val_loss: 0.0307 - val_accuracy: 0.9915\n",
"Epoch 23/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0117 - accuracy: 0.9968 - val_loss: 0.0303 - val_accuracy: 0.9918\n",
"Epoch 24/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0105 - accuracy: 0.9973 - val_loss: 0.0289 - val_accuracy: 0.9922\n",
"Epoch 25/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0094 - accuracy: 0.9977 - val_loss: 0.0315 - val_accuracy: 0.9917\n",
"Epoch 26/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0084 - accuracy: 0.9980 - val_loss: 0.0300 - val_accuracy: 0.9924\n",
"Epoch 27/50\n",
"24/24 [==============================] - 27s 1s/step - loss: 0.0073 - accuracy: 0.9984 - val_loss: 0.0295 - val_accuracy: 0.9926\n"
]
}
],
"source": [
"callback = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=3, restore_best_weights=True)\n",
"history = model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=50, callbacks=[callback])"
],
"metadata": {
"collapsed": false
}
},
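{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# The accuracy reported above also counts the (trivially predictable) padding positions,\n",
"# so it overstates tagging quality. A minimal sketch of padding-aware accuracy on the\n",
"# validation split, assuming index 0 in the vectorized documents marks padding:\n",
"val_pred = np.argmax(model.predict(val_x), axis=2)\n",
"val_true = np.argmax(val_y, axis=2)\n",
"padding_mask = val_x != 0\n",
"print((val_pred[padding_mask] == val_true[padding_mask]).mean())"
],
"metadata": {
"collapsed": false
}
},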
{
"cell_type": "code",
"execution_count": 22,
"outputs": [
{
"data": {
"text/plain": "[('China', 'B-LOC'),\n ('says', 'O'),\n ('time', 'O'),\n ('right', 'O'),\n ('for', 'O'),\n ('Taiwan', 'B-LOC'),\n ('talks', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('BEIJING', 'B-LOC'),\n ('1996-08-22', 'O'),\n ('</S>', 'O'),\n ('China', 'B-LOC'),\n ('has', 'O'),\n ('said', 'O'),\n ('it', 'O'),\n ('was', 'O'),\n ('time', 'O'),\n ('for', 'O'),\n ('political', 'O'),\n ('talks', 'O'),\n ('with', 'O'),\n ('Taiwan', 'B-LOC'),\n ('and', 'O'),\n ('that', 'O'),\n ('the', 'O'),\n ('rival', 'O'),\n ('island', 'O'),\n ('should', 'O'),\n ('take', 'O'),\n ('practical', 'O'),\n ('steps', 'O'),\n ('towards', 'O'),\n ('that', 'O'),\n ('goal', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Consultations', 'O'),\n ('should', 'O'),\n ('be', 'O'),\n ('held', 'O'),\n ('to', 'O'),\n ('set', 'O'),\n ('the', 'O'),\n ('time', 'O'),\n ('and', 'O'),\n ('format', 'O'),\n ('of', 'O'),\n ('the', 'O'),\n ('talks', 'O'),\n (',', 'O'),\n ('the', 'O'),\n ('official', 'O'),\n ('Xinhua', 'B-ORG'),\n ('news', 'O'),\n ('agency', 'O'),\n ('quoted', 'O'),\n ('Tang', 'B-PER'),\n ('Shubei', 'I-PER'),\n (',', 'O'),\n ('executive', 'O'),\n ('vice', 'O'),\n ('chairman', 'O'),\n ('of', 'O'),\n ('the', 'O'),\n ('Association', 'B-ORG'),\n ('for', 'I-ORG'),\n ('Relations', 'O'),\n ('Across', 'I-ORG'),\n ('the', 'I-ORG'),\n ('Taiwan', 'I-ORG'),\n ('Straits', 'I-ORG'),\n (',', 'O'),\n ('as', 'O'),\n ('saying', 'O'),\n ('late', 'O'),\n ('on', 'O'),\n ('Wednesday', 'O'),\n ('.', 'O'),\n ('</S>', 'O')]"
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tag_list_numpy = np.array(tags_list)\n",
"def get_tag_from_int(input_integer):\n",
" return tag_list_numpy[input_integer]\n",
"def get_ner_output_single_sentence(input_sentence):\n",
" sentence_length = len(input_sentence.split())\n",
" vectorized = sentence_vectorizer(input_sentence)\n",
" #print(vectorized)\n",
" model_output = model(np.stack(tf.expand_dims(vectorized,0)))\n",
" #print(model_output.numpy())\n",
" #print(model_output.shape)\n",
" max_indices = np.argmax(model_output, axis=2).flatten()\n",
" #print(max_indices)\n",
" #print(len(max_indices))\n",
" #\" \".join(vocab_arr[vectorize_layer(training_file[\"document\"][20])])\n",
" tokenized = [get_tag_from_int(x) for x in max_indices[:]]\n",
" return tokenized[:sentence_length]\n",
"#get_ner_output_single_sentence(\"China says time right for Taiwan talks . </S> BEIJING 1996-08-22 </S> China has said it was time for political talks with Taiwan and that the rival island should take practical steps towards that goal . </S> Consultations should be held to set the time and format of the talks , the official Xinhua news agency quoted Tang Shubei , executive vice chairman of the Association for Relations Across the Taiwan Straits , as saying late on Wednesday . </S>\")\n",
"\n",
"def test_sentence(sentence):\n",
" model_output = get_ner_output_single_sentence(sentence)\n",
" input_tokens = sentence.split()\n",
" return list(zip(input_tokens, model_output))\n",
"\n",
"test_sentence(\"China says time right for Taiwan talks . </S> BEIJING 1996-08-22 </S> China has said it was time for political talks with Taiwan and that the rival island should take practical steps towards that goal . </S> Consultations should be held to set the time and format of the talks , the official Xinhua news agency quoted Tang Shubei , executive vice chairman of the Association for Relations Across the Taiwan Straits , as saying late on Wednesday . </S>\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor([ 128 19 18713 ... 0 0 0], shape=(2048,), dtype=int64)\n",
"[[[3.0971142e-03 1.5280694e-03 9.8057139e-01 ... 3.6668889e-03\n",
" 1.4106639e-03 3.3225205e-03]\n",
" [2.1369425e-04 1.2225067e-04 9.9616271e-01 ... 1.4002173e-03\n",
" 1.0539902e-04 2.7582867e-04]\n",
" [6.3146334e-05 3.8070513e-05 9.9278271e-01 ... 2.4660169e-03\n",
" 5.7447112e-05 1.3038449e-04]\n",
" ...\n",
" [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794704e-08\n",
" 2.6146161e-08 5.0399006e-08]\n",
" [9.9999696e-01 1.7784757e-08 2.5151198e-07 ... 1.3794731e-08\n",
" 2.6146161e-08 5.0399006e-08]\n",
" [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794704e-08\n",
" 2.6146161e-08 5.0399006e-08]]]\n",
"(1, 2048, 11)\n",
"[2 2 2 ... 0 0 0]\n",
"2048\n"
]
},
{
"data": {
"text/plain": "[('SOCCER', 'O'),\n ('-', 'O'),\n ('LATE', 'O'),\n ('GOALS', 'O'),\n ('GIVE', 'O'),\n ('JAPAN', 'O'),\n ('WIN', 'O'),\n ('OVER', 'O'),\n ('SYRIA', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('AL-AIN', 'O'),\n (',', 'O'),\n ('United', 'B-LOC'),\n ('Arab', 'I-LOC'),\n ('Emirates', 'I-LOC'),\n ('1996-12-06', 'O'),\n ('</S>', 'O'),\n ('Two', 'O'),\n ('goals', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('last', 'O'),\n ('six', 'O'),\n ('minutes', 'O'),\n ('gave', 'O'),\n ('holders', 'O'),\n ('Japan', 'B-LOC'),\n ('an', 'O'),\n ('uninspiring', 'O'),\n ('2-1', 'O'),\n ('Asian', 'B-LOC'),\n ('Cup', 'I-MISC'),\n ('victory', 'O'),\n ('over', 'O'),\n ('Syria', 'B-LOC'),\n ('on', 'O'),\n ('Friday', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Takuya', 'O'),\n ('Takagi', 'O'),\n ('headed', 'O'),\n ('the', 'O'),\n ('winner', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('88th', 'O'),\n ('minute', 'O'),\n ('of', 'O'),\n ('the', 'O'),\n ('group', 'O'),\n ('C', 'O'),\n ('game', 'O'),\n ('after', 'O'),\n ('goalkeeper', 'O'),\n ('Salem', 'O'),\n ('Bitar', 'O'),\n ('spoiled', 'O'),\n ('a', 'O'),\n ('mistake-free', 'O'),\n ('display', 'O'),\n ('by', 'O'),\n ('allowing', 'O'),\n ('the', 'O'),\n ('ball', 'O'),\n ('to', 'O'),\n ('slip', 'O'),\n ('under', 'O'),\n ('his', 'O'),\n ('body', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('It', 'O'),\n ('was', 'O'),\n ('the', 'O'),\n ('second', 'O'),\n ('Syrian', 'B-PER'),\n ('defensive', 'O'),\n ('blunder', 'O'),\n ('in', 'O'),\n ('four', 'O'),\n ('minutes', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Defender', 'O'),\n ('Hassan', 'B-PER'),\n ('Abbas', 'I-PER'),\n ('rose', 'O'),\n ('to', 'O'),\n ('intercept', 'O'),\n ('a', 'O'),\n ('long', 'O'),\n ('ball', 'O'),\n ('into', 'O'),\n ('the', 'O'),\n ('area', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('84th', 'O'),\n ('minute', 'O'),\n ('but', 'O'),\n ('only', 'O'),\n ('managed', 'O'),\n ('to', 'O'),\n ('divert', 'O'),\n ('it', 'O'),\n ('into', 'O'),\n ('the', 'O'),\n ('top', 'O'),\n ('corner', 'O'),\n ('of', 'O'),\n ('Bitar', 'O'),\n (\"'s\", 'O'),\n ('goal', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Syria', 'B-ORG'),\n ('had', 'O'),\n ('taken', 'O'),\n ('the', 'O'),\n ('lead', 'O'),\n ('from', 'O'),\n ('their', 'O'),\n ('first', 'O'),\n ('serious', 'O'),\n ('attack', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('seventh', 'O'),\n ('minute', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Nader', 'O'),\n ('Jokhadar', 'O'),\n ('headed', 'O'),\n ('a', 'O'),\n ('cross', 'O'),\n ('from', 'O'),\n ('the', 'O'),\n ('right', 'O'),\n ('by', 'O'),\n ('Ammar', 'O'),\n ('Awad', 'O'),\n ('into', 'O'),\n ('the', 'O'),\n ('top', 'O'),\n ('right', 'O'),\n ('corner', 'O'),\n ('of', 'O'),\n ('Kenichi', 'O'),\n ('Shimokawa', 'O'),\n (\"'s\", 'O'),\n ('goal', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Japan', 'B-LOC'),\n ('then', 'O'),\n ('laid', 'O'),\n ('siege', 'O'),\n ('to', 'O'),\n ('the', 'O'),\n ('Syrian', 'B-ORG'),\n ('penalty', 'O'),\n ('area', 'O'),\n ('and', 'O'),\n ('had', 'O'),\n ('a', 'O'),\n ('goal', 'O'),\n ('disallowed', 'O'),\n ('for', 'O'),\n ('offside', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('16th', 'O'),\n ('minute', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('A', 'O'),\n ('minute', 'O'),\n ('later', 'O'),\n (',', 'O'),\n ('Bitar', 'O'),\n ('produced', 'O'),\n ('a', 'O'),\n ('good', 'O'),\n ('double', 'O'),\n ('save', 'O'),\n (',', 'O'),\n ('first', 'O'),\n ('from', 'O'),\n ('Kazuyoshi', 'O'),\n ('Miura', 'O'),\n (\"'s\", 'O'),\n ('header', 'O'),\n ('and', 'O'),\n ('then', 'O'),\n ('blocked', 'O'),\n ('a', 'O'),\n ('Takagi', 'O'),\n ('follow-up', 
'O'),\n ('shot', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Bitar', 'O'),\n ('saved', 'O'),\n ('well', 'O'),\n ('again', 'O'),\n ('from', 'O'),\n ('Miura', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('37th', 'O'),\n ('minute', 'O'),\n (',', 'O'),\n ('parrying', 'O'),\n ('away', 'O'),\n ('his', 'O'),\n ('header', 'O'),\n ('from', 'O'),\n ('a', 'O'),\n ('corner', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Japan', 'B-ORG'),\n ('started', 'O'),\n ('the', 'O'),\n ('second', 'O'),\n ('half', 'O'),\n ('brightly', 'O'),\n ('but', 'O'),\n ('Bitar', 'O
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_sentence(\"SOCCER - LATE GOALS GIVE JAPAN WIN OVER SYRIA . </S> AL-AIN , United Arab Emirates 1996-12-06 </S> Two goals in the last six minutes gave holders Japan an uninspiring 2-1 Asian Cup victory over Syria on Friday . </S> Takuya Takagi headed the winner in the 88th minute of the group C game after goalkeeper Salem Bitar spoiled a mistake-free display by allowing the ball to slip under his body . </S> It was the second Syrian defensive blunder in four minutes . </S> Defender Hassan Abbas rose to intercept a long ball into the area in the 84th minute but only managed to divert it into the top corner of Bitar 's goal . </S> Syria had taken the lead from their first serious attack in the seventh minute . </S> Nader Jokhadar headed a cross from the right by Ammar Awad into the top right corner of Kenichi Shimokawa 's goal . </S> Japan then laid siege to the Syrian penalty area and had a goal disallowed for offside in the 16th minute . </S> A minute later , Bitar produced a good double save , first from Kazuyoshi Miura 's header and then blocked a Takagi follow-up shot . </S> Bitar saved well again from Miura in the 37th minute , parrying away his header from a corner . </S> Japan started the second half brightly but Bitar denied them an equaliser when he dived to his right to save Naoki Soma 's low drive in the 53rd minute . </S> Japan : 19 - Kenichi Shimokawa , 2 - Hiroshige Yanagimoto , 3 - Naoki Soma , 4 - Masami Ihara , 5 - Norio Omura , 6 - Motohiro Yamaguchi , 8 - Masakiyo Maezono ( 7 - Yasuto Honda 71 ) , 9 - Takuya Takagi , 10 - Hiroshi Nanami , 11 - Kazuyoshi Miura , 15 - Hiroaki Morishima ( 14 - Masayuki Okano 75 ) . </S> Syria : 24 - Salem Bitar , 3 - Bachar Srour ; 4 - Hassan Abbas , 5 - Tarek Jabban , 6 - Ammar Awad ( 9 - Louay Taleb 69 ) , 8 - Nihad al-Boushi , 10 - Mohammed Afash , 12 - Ali Dib , 13 - Abdul Latif Helou ( 17 - Ammar Rihawiy 46 ) , 14 - Khaled Zaher ; 16 - Nader Jokhadar . </S>\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 127,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tf.Tensor([ 1 16 1 ... 0 0 0], shape=(2048,), dtype=int64)\n",
"[[[9.1573365e-02 8.5647009e-02 1.1034752e-01 ... 8.8930450e-02\n",
" 8.8644758e-02 8.9963131e-02]\n",
" [5.5477720e-02 4.6575051e-02 5.2461910e-01 ... 6.4232960e-02\n",
" 4.4661559e-02 5.8426060e-02]\n",
" [4.9609054e-02 4.3161135e-02 4.3743923e-01 ... 9.0816177e-02\n",
" 4.6578653e-02 5.5895649e-02]\n",
" ...\n",
" [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794731e-08\n",
" 2.6146161e-08 5.0399006e-08]\n",
" [9.9999696e-01 1.7784757e-08 2.5151198e-07 ... 1.3794731e-08\n",
" 2.6146161e-08 5.0399006e-08]\n",
" [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794731e-08\n",
" 2.6146161e-08 5.0399006e-08]]]\n",
"(1, 2048, 11)\n",
"[2 2 2 ... 0 0 0]\n",
"2048\n"
]
},
{
"data": {
"text/plain": "[('Mussolini', 'O'),\n (\"'s\", 'O'),\n ('granddaughter', 'O'),\n ('rejoins', 'O'),\n ('far-right', 'O'),\n ('party', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('ROME', 'B-LOC'),\n ('1996-12-06', 'O'),\n ('</S>', 'O'),\n ('Alessandra', 'O'),\n ('Mussolini', 'O'),\n (',', 'O'),\n ('the', 'O'),\n ('granddaughter', 'O'),\n ('of', 'O'),\n ('Italy', 'B-LOC'),\n (\"'s\", 'O'),\n ('Fascist', 'O'),\n ('dictator', 'O'),\n ('Benito', 'B-PER'),\n ('Mussolini', 'I-PER'),\n (',', 'O'),\n ('said', 'O'),\n ('on', 'O'),\n ('Friday', 'O'),\n ('she', 'O'),\n ('had', 'O'),\n ('rejoined', 'O'),\n ('the', 'O'),\n ('far-right', 'O'),\n ('National', 'B-PER'),\n ('Alliance', 'I-PER'),\n ('(', 'O'),\n ('AN', 'O'),\n (')', 'O'),\n ('party', 'O'),\n ('she', 'O'),\n ('quit', 'O'),\n ('over', 'O'),\n ('policy', 'O'),\n ('differences', 'O'),\n ('last', 'O'),\n ('month', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('\"', 'O'),\n ('I', 'O'),\n (\"'ve\", 'O'),\n ('gone', 'O'),\n ('back', 'O'),\n (',', 'O'),\n ('\"', 'O'),\n ('she', 'O'),\n ('told', 'O'),\n ('a', 'O'),\n ('radio', 'O'),\n ('show', 'O'),\n ('shortly', 'O'),\n ('after', 'O'),\n ('AN', 'O'),\n ('leader', 'O'),\n ('Gianfranco', 'B-PER'),\n ('Fini', 'I-PER'),\n (',', 'O'),\n ('who', 'O'),\n ('was', 'O'),\n ('being', 'O'),\n ('interviewed', 'O'),\n ('on', 'O'),\n ('the', 'O'),\n ('programme', 'O'),\n (',', 'O'),\n ('said', 'O'),\n ('the', 'O'),\n ('row', 'O'),\n ('had', 'O'),\n ('been', 'O'),\n ('resolved', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('\"', 'O'),\n ('He', 'O'),\n ('did', 'O'),\n (\"n't\", 'O'),\n ('want', 'O'),\n ('to', 'O'),\n ('lose', 'O'),\n ('me', 'O'),\n ('and', 'O'),\n ('I', 'O'),\n ('did', 'O'),\n (\"n't\", 'O'),\n ('want', 'O'),\n ('to', 'O'),\n ('lose', 'O'),\n ('him', 'O'),\n ('.', 'O'),\n ('\"', 'O'),\n ('</S>', 'O'),\n ('Fini', 'O'),\n ('told', 'O'),\n ('state', 'O'),\n ('radio', 'O'),\n ('RAI', 'B-PER'),\n ('he', 'O'),\n ('met', 'O'),\n ('Mussolini', 'O'),\n ('thanks', 'O'),\n ('to', 'O'),\n ('the', 'O'),\n ('good', 'O'),\n ('offices', 'O'),\n ('of', 'O'),\n ('Giuseppe', 'B-PER'),\n ('Tatarella', 'I-PER'),\n (',', 'O'),\n ('AN', 'O'),\n (\"'s\", 'O'),\n ('leader', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('Chamber', 'B-PER'),\n ('of', 'O'),\n ('Deputies', 'O'),\n ('(', 'O'),\n ('lower', 'O'),\n ('house', 'O'),\n (')', 'O'),\n (',', 'O'),\n ('and', 'O'),\n ('had', 'O'),\n ('overcome', 'O'),\n ('their', 'O'),\n ('differences', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Mussolini', 'O'),\n (',', 'O'),\n ('33', 'O'),\n (',', 'O'),\n ('resigned', 'O'),\n ('from', 'O'),\n ('the', 'O'),\n ('parliamentary', 'O'),\n ('party', 'O'),\n ('group', 'O'),\n ('for', 'O'),\n ('what', 'O'),\n ('she', 'O'),\n ('said', 'O'),\n ('were', 'O'),\n ('strictly', 'O'),\n ('political', 'O'),\n ('reasons', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('The', 'O'),\n ('fiery', 'O'),\n ('politician', 'O'),\n (',', 'O'),\n ('who', 'O'),\n ('is', 'O'),\n ('also', 'O'),\n ('a', 'O'),\n ('niece', 'O'),\n ('of', 'O'),\n ('screen', 'O'),\n ('star', 'O'),\n ('Sophia', 'B-PER'),\n ('Loren', 'I-PER'),\n (',', 'O'),\n ('had', 'O'),\n ('accused', 'O'),\n ('AN', 'O'),\n ('leaders', 'O'),\n ('of', 'O'),\n ('stifling', 'O'),\n ('internal', 'O'),\n ('party', 'O'),\n ('debate', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Mussolini', 'O'),\n (',', 'O'),\n ('who', 'O'),\n ('sits', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('Chamber', 'B-PER'),\n (',', 'O'),\n ('told', 'O'),\n ('La', 'B-ORG'),\n ('Stampa', 'I-ORG'),\n ('newspaper', 'O'),\n ('last', 'O'),\n ('month', 'O'),\n ('after', 'O'),\n 
('quitting', 'O'),\n ('AN', 'O'),\n (\"'s\", 'O'),\n ('parliamentary', 'O'),\n ('party', 'O'),\n ('that', 'O'),\n ('she', 'O'),\n ('was', 'O'),\n ('considering', 'O'),\n ('joining', 'O'),\n ('the', 'O'),\n ('neo-fascist', 'O'),\n ('Social', 'B-ORG'),\n ('Movement', 'I-ORG'),\n ('(', 'O'),\n ('MS-Fiamma', 'O'),\n (')', 'O'),\n ('formed', 'O'),\n ('by', 'O'),\n ('some', 'O'),\n ('of', 'O'),\n ('the', 'O'),\n ('Duce', 'O'),\n (\"'s\", 'O'),\n ('World', 'B-ORG'),\n ('War', 'I-ORG'),\n ('Two', 'O'),\n ('followers', 'O'),\n ('.', 'O'),\n
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"news_string = \"\"\"Mussolini 's granddaughter rejoins far-right party . </S> ROME 1996-12-06 </S> Alessandra Mussolini , the granddaughter of Italy 's Fascist dictator Benito Mussolini , said on Friday she had rejoined the far-right National Alliance ( AN ) party she quit over policy differences last month . </S> \" I 've gone back , \" she told a radio show shortly after AN leader Gianfranco Fini , who was being interviewed on the programme , said the row had been resolved . </S> \" He did n't want to lose me and I did n't want to lose him . \" </S> Fini told state radio RAI he met Mussolini thanks to the good offices of Giuseppe Tatarella , AN 's leader in the Chamber of Deputies ( lower house ) , and had overcome their differences . </S> Mussolini , 33 , resigned from the parliamentary party group for what she said were strictly political reasons . </S> The fiery politician , who is also a niece of screen star Sophia Loren , had accused AN leaders of stifling internal party debate . </S> Mussolini , who sits in the Chamber , told La Stampa newspaper last month after quitting AN 's parliamentary party that she was considering joining the neo-fascist Social Movement ( MS-Fiamma ) formed by some of the Duce 's World War Two followers . </S>\"\"\"\n",
"\n",
"test_sentence(news_string)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 126,
"outputs": [],
"source": [
"model.save(\"model_v2.keras\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 18,
"outputs": [],
"source": [
"import keras\n",
"model = keras.models.load_model('model_v2.keras')"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 24,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"ERROR:tensorflow:==================================\n",
"Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):\n",
"<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x00000262307DCA00>\n",
"If you want to mark it as used call its \"mark_used()\" method.\n",
"It was originally created here:\n",
" File \"C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\keras\\backend.py\", line 5130, in <genexpr>\n",
" ta.write(ta_index_to_write, out) File \"C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\tensorflow\\python\\util\\tf_should_use.py\", line 243, in wrapped\n",
" return _add_should_use_warning(fn(*args, **kwargs),\n",
"==================================\n"
]
}
],
"source": [
"with open(\"en-ner-conll-2003/dev-0/in.tsv\", \"r\", encoding=\"utf-8\") as f:\n",
" lines = f.readlines()\n",
"processed = [\" \".join(get_ner_output_single_sentence(x)) for x in lines if len(x.strip())>0]\n",
"with open('en-ner-conll-2003/dev-0/out.tsv', 'w',encoding=\"utf-8\") as f:\n",
" for line in processed:\n",
" f.write(f\"{line}\\n\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 25,
"outputs": [],
"source": [
"with open(\"en-ner-conll-2003/test-A/in.tsv\", \"r\", encoding=\"utf-8\") as f:\n",
" lines = f.readlines()\n",
"processed = [\" \".join(get_ner_output_single_sentence(x)) for x in lines if len(x.strip())>0]\n",
"with open('en-ner-conll-2003/test-A/out.tsv', 'w',encoding=\"utf-8\") as f:\n",
" for line in processed:\n",
" f.write(f\"{line}\\n\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Czyszczenie tagów"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'B-LOC', 'I-LOC', 'O', 'I-MISC', 'B-ORG', 'B-PER', 'I-PER', 'I-ORG', 'B-MISC'}\n"
]
}
],
"source": [
"tag_set = set()\n",
"with open(\"en-ner-conll-2003/dev-0/out.tsv\", \"r\", encoding=\"utf-8\") as f:\n",
" lines = f.readlines()\n",
"for line in lines:\n",
" line_split = line.split()\n",
" for tag in line_split:\n",
" if tag not in tag_set:\n",
" tag_set.add(tag)\n",
"print(tag_set)"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [],
"source": [
"inter_to_begin_mapping = {\n",
" \"I-LOC\": \"B-LOC\",\n",
" \"I-MISC\": 'B-MISC',\n",
" 'I-ORG': 'B-ORG',\n",
" 'I-PER': 'B-PER'\n",
"}\n",
"begin_to_inter_mapping = {v: k for k, v in inter_to_begin_mapping.items()}"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": "{'I-LOC': 'B-LOC', 'I-MISC': 'B-MISC', 'I-ORG': 'B-ORG', 'I-PER': 'B-PER'}"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"inter_to_begin_mapping"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"data": {
"text/plain": "{'B-LOC': 'I-LOC', 'B-MISC': 'I-MISC', 'B-ORG': 'I-ORG', 'B-PER': 'I-PER'}"
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"begin_to_inter_mapping"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [],
"source": [
"def fix_tags_in_file(filename, filename_fixed):\n",
" lines_fixed = []\n",
" with open(filename, \"r\", encoding=\"utf-8\") as f:\n",
" lines = f.readlines()\n",
" lines_tokenized = [line.split() for line in lines]\n",
" for line in lines_tokenized:\n",
" line_fixed = []\n",
" for counter, element in enumerate(line):\n",
" if element==\"O\": # O tag can be placed anywhere\n",
" line_fixed.append(element)\n",
" elif element in inter_to_begin_mapping:\n",
" if counter==0: # Beginning of line, can't check previous tag\n",
" line_fixed.append(inter_to_begin_mapping[element])\n",
" else:\n",
" previous_element = line_fixed[counter-1]\n",
" if previous_element==element or previous_element==inter_to_begin_mapping[element]: # Tag was compatible (same inters or compatible B-->I)\n",
" line_fixed.append(element)\n",
" elif previous_element==\"O\": # O--> Inter\n",
" line_fixed.append(inter_to_begin_mapping[element])\n",
" elif previous_element in inter_to_begin_mapping and element in inter_to_begin_mapping and previous_element!=element: # Incompatible subsequent inter-tags\n",
" line_fixed.append(previous_element)\n",
" else: # Begin --> Incompatible inter\n",
" corrected_tag = begin_to_inter_mapping[previous_element]\n",
" line_fixed.append(corrected_tag)\n",
" elif element in begin_to_inter_mapping: # Beginning tag can be added safely\n",
" line_fixed.append(element)\n",
" else:\n",
" print(\"This shouldn't happen\")\n",
" lines_fixed.append(\" \".join(line_fixed))\n",
" with open(filename_fixed, \"w\", encoding=\"utf-8\") as f:\n",
" for line in lines_fixed:\n",
" f.write(f\"{line}\\n\")\n",
"fix_tags_in_file(\"en-ner-conll-2003/test-A/out.tsv\", \"en-ner-conll-2003/test-A/out_fixed.tsv\")"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [],
"source": [
"fix_tags_in_file(\"en-ner-conll-2003/dev-0/out.tsv\", \"en-ner-conll-2003/dev-0/out_fixed.tsv\")"
],
"metadata": {
"collapsed": false
}
},
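{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Sanity-check sketch: count BIO violations remaining in a fixed file, i.e. I-X tags\n",
"# whose predecessor is neither B-X nor I-X (should be 0 after fix_tags_in_file)\n",
"def count_bio_violations(filename):\n",
"    violations = 0\n",
"    with open(filename, \"r\", encoding=\"utf-8\") as f:\n",
"        for line in f:\n",
"            tags = line.split()\n",
"            for i, tag in enumerate(tags):\n",
"                if tag.startswith(\"I-\") and (i == 0 or tags[i - 1] not in (\"B-\" + tag[2:], tag)):\n",
"                    violations += 1\n",
"    return violations\n",
"count_bio_violations(\"en-ner-conll-2003/dev-0/out_fixed.tsv\")"
],
"metadata": {
"collapsed": false
}
},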
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}