969 lines
84 KiB
Plaintext
969 lines
84 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": " label \\\n0 B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ... \n1 O B-PER O O O O O O O O O B-LOC O O O O O O O ... \n2 B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ... \n3 B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ... \n4 B-MISC O O O O O O O O O O O B-LOC O O B-MISC ... \n\n document \n0 EU rejects German call to boycott British lamb... \n1 Rare Hendrix song draft sells for almost $ 17,... \n2 China says Taiwan spoils atmosphere for talks ... \n3 China says time right for Taiwan talks . </S> ... \n4 German July car registrations up 14.2 pct yr /... ",
|
|
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>label</th>\n <th>document</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ...</td>\n <td>EU rejects German call to boycott British lamb...</td>\n </tr>\n <tr>\n <th>1</th>\n <td>O B-PER O O O O O O O O O B-LOC O O O O O O O ...</td>\n <td>Rare Hendrix song draft sells for almost $ 17,...</td>\n </tr>\n <tr>\n <th>2</th>\n <td>B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ...</td>\n <td>China says Taiwan spoils atmosphere for talks ...</td>\n </tr>\n <tr>\n <th>3</th>\n <td>B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ...</td>\n <td>China says time right for Taiwan talks . </S> ...</td>\n </tr>\n <tr>\n <th>4</th>\n <td>B-MISC O O O O O O O O O O O B-LOC O O B-MISC ...</td>\n <td>German July car registrations up 14.2 pct yr /...</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
|
},
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import csv\n",
|
|
"\n",
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"# Each row of the TSV is one document: space-separated NER tags, a tab, then the space-separated tokens.\n",
|
|
"# A double-quote character is an ordinary token in this corpus, so CSV quote handling is disabled;\n",
|
|
"# with the default quoting a field that starts with a double quote is parsed as a quoted field,\n",
|
|
"# which can swallow tabs/newlines and produce the malformed rows that on_bad_lines would then hide.\n",
|
|
"training_file = pd.read_csv(\"en-ner-conll-2003/train/train.tsv\", sep='\\t', quoting=csv.QUOTE_NONE, on_bad_lines=\"warn\", names=[\"label\",\"document\"])\n",
|
|
"training_file.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"outputs": [],
|
|
"source": [
|
|
"import tensorflow as tf\n",
|
|
"\n",
|
|
"# Whitespace-split the tag string and the document string of every row into lists,\n",
|
|
"# then store the length of each list so the two can be compared row by row.\n",
|
|
"training_file[\"tag_list\"] = training_file[\"label\"].map(str.split)\n",
|
|
"training_file[\"tokenized\"] = training_file[\"document\"].map(str.split)\n",
|
|
"training_file[\"len_tags\"] = training_file[\"tag_list\"].map(len)\n",
|
|
"training_file[\"len_tokenized\"] = training_file[\"tokenized\"].map(len)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "Empty DataFrame\nColumns: [label, document, tag_list, tokenized, len_tags, len_tokenized]\nIndex: []",
|
|
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>label</th>\n <th>document</th>\n <th>tag_list</th>\n <th>tokenized</th>\n <th>len_tags</th>\n <th>len_tokenized</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table>\n</div>"
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Rows whose token count differs from their tag count (an empty result means the columns are aligned).\n",
|
|
"training_file.loc[training_file[\"len_tokenized\"].ne(training_file[\"len_tags\"])]"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": " label \\\n0 B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ... \n1 O B-PER O O O O O O O O O B-LOC O O O O O O O ... \n2 B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ... \n3 B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ... \n4 B-MISC O O O O O O O O O O O B-LOC O O B-MISC ... \n\n document \\\n0 EU rejects German call to boycott British lamb... \n1 Rare Hendrix song draft sells for almost $ 17,... \n2 China says Taiwan spoils atmosphere for talks ... \n3 China says time right for Taiwan talks . </S> ... \n4 German July car registrations up 14.2 pct yr /... \n\n tag_list \\\n0 [B-ORG, O, B-MISC, O, O, O, B-MISC, O, O, O, B... \n1 [O, B-PER, O, O, O, O, O, O, O, O, O, B-LOC, O... \n2 [B-LOC, O, B-LOC, O, O, O, O, O, O, B-LOC, O, ... \n3 [B-LOC, O, O, O, O, B-LOC, O, O, O, B-LOC, O, ... \n4 [B-MISC, O, O, O, O, O, O, O, O, O, O, O, B-LO... \n\n tokenized len_tags len_tokenized \n0 [EU, rejects, German, call, to, boycott, Briti... 489 489 \n1 [Rare, Hendrix, song, draft, sells, for, almos... 197 197 \n2 [China, says, Taiwan, spoils, atmosphere, for,... 248 248 \n3 [China, says, time, right, for, Taiwan, talks,... 80 80 \n4 [German, July, car, registrations, up, 14.2, p... 235 235 ",
|
|
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>label</th>\n <th>document</th>\n <th>tag_list</th>\n <th>tokenized</th>\n <th>len_tags</th>\n <th>len_tokenized</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ...</td>\n <td>EU rejects German call to boycott British lamb...</td>\n <td>[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O, O, B...</td>\n <td>[EU, rejects, German, call, to, boycott, Briti...</td>\n <td>489</td>\n <td>489</td>\n </tr>\n <tr>\n <th>1</th>\n <td>O B-PER O O O O O O O O O B-LOC O O O O O O O ...</td>\n <td>Rare Hendrix song draft sells for almost $ 17,...</td>\n <td>[O, B-PER, O, O, O, O, O, O, O, O, O, B-LOC, O...</td>\n <td>[Rare, Hendrix, song, draft, sells, for, almos...</td>\n <td>197</td>\n <td>197</td>\n </tr>\n <tr>\n <th>2</th>\n <td>B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ...</td>\n <td>China says Taiwan spoils atmosphere for talks ...</td>\n <td>[B-LOC, O, B-LOC, O, O, O, O, O, O, B-LOC, O, ...</td>\n <td>[China, says, Taiwan, spoils, atmosphere, for,...</td>\n <td>248</td>\n <td>248</td>\n </tr>\n <tr>\n <th>3</th>\n <td>B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ...</td>\n <td>China says time right for Taiwan talks . 
</S> ...</td>\n <td>[B-LOC, O, O, O, O, B-LOC, O, O, O, B-LOC, O, ...</td>\n <td>[China, says, time, right, for, Taiwan, talks,...</td>\n <td>80</td>\n <td>80</td>\n </tr>\n <tr>\n <th>4</th>\n <td>B-MISC O O O O O O O O O O O B-LOC O O B-MISC ...</td>\n <td>German July car registrations up 14.2 pct yr /...</td>\n <td>[B-MISC, O, O, O, O, O, O, O, O, O, O, O, B-LO...</td>\n <td>[German, July, car, registrations, up, 14.2, p...</td>\n <td>235</td>\n <td>235</td>\n </tr>\n </tbody>\n</table>\n</div>"
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Preview the first rows, now including the derived list and length columns.\n",
|
|
"training_file.head()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1532\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Length, in whitespace-separated tokens, of the longest training document.\n",
|
|
"max_length = training_file[\"len_tokenized\"].max()\n",
|
|
"print(max_length) # 1532 on this data; presumably 2048 (the next power of two) is the intended padded length -- TODO confirm"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"source": [
|
|
"### Testowanie wektoryzacji / dewektoryzacji tekstu"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"tf.Tensor(\n",
|
|
"[18792 316 1335 896 8 479 7287 284 3 2 18492 4\n",
|
|
" 11364 3 2 137 2 18793 18637 20290 346 15 14 68\n",
|
|
" 27 9 1335 9461 59 3210 42 5299 507 6 52 4906\n",
|
|
" 71 7 64 1712 554 49 540 3 2 20 132 15\n",
|
|
" 27 257 5 540 4 60 536 232 18 4 37 1257\n",
|
|
" 52 234 71 1398 1164 6 64 2541 23235 65 880 5156\n",
|
|
" 280 3526 3 2 20 5156 40 1257 17 52 22125 71\n",
|
|
" 3 2 2016 18381 4 449 834 1318 6 5 13472 12\n",
|
|
" 1339 2356 132 4 15 5 9461 13 1240 42 2542 8\n",
|
|
" 2525 5 132 16 8166 666 724 1190 12 2129 618 622\n",
|
|
" 5276 12 836 3 13 2], shape=(126,), dtype=int64)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# standardize=None turns off Keras' text standardization, so tokens keep their original case and punctuation.\n",
|
|
"vectorize_layer = tf.keras.layers.TextVectorization(standardize=None)\n",
|
|
"# Learn the vocabulary from every training document.\n",
|
|
"vectorize_layer.adapt(training_file[\"document\"])\n",
|
|
"# Sanity check: print the integer token ids produced for document 20.\n",
|
|
"print(vectorize_layer(training_file[\"document\"][20]))"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "126"
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Whitespace token count of document 20, to compare with the length of its vectorized form.\n",
|
|
"len(training_file.loc[20, \"document\"].split())"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "['',\n '[UNK]',\n '</S>',\n '.',\n ',',\n 'the',\n 'of',\n 'in',\n 'to',\n 'a',\n ')',\n '(',\n 'and',\n '\"',\n 'on',\n 'said',\n \"'s\",\n 'for',\n '1',\n '-',\n 'The',\n 'was',\n '2',\n '0',\n '3',\n 'at',\n 'with',\n 'that',\n 'from',\n 'by',\n 'is',\n ':',\n 'as',\n 'he',\n '4',\n 'had',\n 'has',\n 'it',\n 'his',\n 'not',\n 'were',\n 'be',\n 'an',\n 'have',\n 'after',\n 'who',\n 'will',\n '5',\n 'but',\n 'first',\n 'U.S.',\n 'been',\n '$',\n '--',\n 'two',\n 'their',\n 'are',\n '6',\n 'beat',\n 'would',\n 'which',\n 'up',\n 'I',\n 'they',\n 'its',\n 'percent',\n 'year',\n 'out',\n 'Thursday',\n 'this',\n 'last',\n 'million',\n 'over',\n 'Wednesday',\n 'one',\n '7',\n 'government',\n 'against',\n '/',\n 'police',\n 'when',\n 'second',\n 'also',\n 'Tuesday',\n 'He',\n 'It',\n 'A',\n 'three',\n 'told',\n 'new',\n '10',\n 'Monday',\n 'or',\n 'about',\n 'Friday',\n 'people',\n 'In',\n 'her',\n '9',\n '1996-08-28',\n 'no',\n 'won',\n 'we',\n 'New',\n 'into',\n 'under',\n 'some',\n 'Sunday',\n 'But',\n '8',\n 'more',\n 'before',\n 'week',\n \"'\",\n 'time',\n 'than',\n 'market',\n 'could',\n 'Germany',\n 'points',\n 'We',\n 'between',\n 'Australia',\n 'years',\n 'since',\n 'Britain',\n 'other',\n 'AT',\n 'SOCCER',\n 'played',\n 'all',\n 'state',\n 'company',\n 'France',\n 'England',\n 'Saturday',\n 'only',\n '1996-08-22',\n 'officials',\n 'group',\n '1996-08-29',\n 'there',\n 'round',\n '1996',\n 'South',\n 'Minister',\n '1996-08-27',\n '11',\n 'off',\n 'match',\n '13',\n 'six',\n 'four',\n 'down',\n '6-4',\n '6-3',\n 'because',\n '21',\n 'five',\n '15',\n 'him',\n 'Spain',\n '1996-08-26',\n 'next',\n 'President',\n 'official',\n 'former',\n 'she',\n 'home',\n 'United',\n 'third',\n 'do',\n 'spokesman',\n 'just',\n 'games',\n 'expected',\n 'did',\n 'day',\n 'win',\n 'through',\n 'statement',\n 'made',\n 'NEW',\n '70',\n '12',\n '1996-08-23',\n 'them',\n 'lost',\n '14',\n 'world',\n 'where',\n '6-2',\n '20',\n 'September',\n 'Russian',\n 'July',\n 
'shares',\n \"n't\",\n 'if',\n 'back',\n 'RESULTS',\n 'Italy',\n 'YORK',\n 'China',\n 'August',\n 'president',\n 'Cup',\n '3.',\n '2.',\n 'DIVISION',\n '1.',\n 'Clinton',\n 'British',\n 'while',\n 'seconds',\n 'any',\n 'LONDON',\n 'Japan',\n 'reported',\n 'billion',\n '69',\n 'matches',\n 'v',\n 'team',\n 'month',\n 'Russia',\n 'division',\n 'Pakistan',\n 'meeting',\n 'being',\n 'They',\n 'London',\n 'June',\n 'European',\n '30',\n 'news',\n 'added',\n 'German',\n '71',\n '1996-08-25',\n 'still',\n 'peace',\n 'metres',\n 'half',\n 'Results',\n 'At',\n '1/2',\n 'talks',\n 'set',\n 'earlier',\n 'tonnes',\n 'killed',\n 'season',\n 'now',\n 'Sweden',\n 'take',\n 'held',\n 'during',\n 'Reuters',\n 'should',\n 'part',\n 'around',\n 'India',\n 'party',\n 'elections',\n 'National',\n 'took',\n 'game',\n 'Bank',\n 'soccer',\n 'number',\n 'minutes',\n 'lead',\n 'innings',\n 'early',\n 'capital',\n '68',\n '6-1',\n 'saying',\n 'end',\n 'due',\n 'days',\n 'b',\n '7-6',\n 'results',\n 'Open',\n '100',\n 'so',\n 'foreign',\n 'you',\n 'political',\n 'per',\n 'international',\n 'final',\n 'can',\n 'York',\n 'West',\n 'Belgium',\n '22',\n 'well',\n 'victory',\n 'most',\n 'Newsroom',\n 'French',\n 'Netherlands',\n '50',\n 'visit',\n 'seven',\n 'country',\n 'champion',\n 'Iraq',\n '25',\n 'our',\n 'minute',\n 'Israel',\n 'American',\n 'says',\n 'left',\n 'Czech',\n 'Africa',\n '66',\n '1996-08-24',\n 'profit',\n 'play',\n 'LEAGUE',\n '4.',\n 'vs.',\n 'league',\n '67',\n '6.',\n '5.',\n 'very',\n 'local',\n 'leader',\n 'Republic',\n '7-5',\n '24',\n '1995',\n 'war',\n 'same',\n 'go',\n 'found',\n 'support',\n 'run',\n 'newsroom',\n 'close',\n 'Inc',\n 'then',\n 'say',\n 'meet',\n 'man',\n 'called',\n 'World',\n 'States',\n 'CHICAGO',\n 'what',\n 'town',\n 'singles',\n 'prices',\n 'military',\n 'lower',\n 'eight',\n 'both',\n 'ago',\n '64',\n 'runs',\n 'put',\n 'newspaper',\n 'deal',\n 'bank',\n 'Moscow',\n 'Mark',\n '72',\n 'trade',\n 'rate',\n 'race',\n 'make',\n 'goals',\n 
'cents',\n 'St',\n 'OF',\n 'Men',\n '60',\n '16',\n 'pct',\n 'months',\n 'issue',\n 'gave',\n 'behind',\n 'There',\n 'Prime',\n 'May',\n 'opposition',\n 'minister',\n 'good',\n 'ended',\n 'city',\n 'Women',\n 'Michael',\n 'League',\n 'Hong',\n 'FIRST',\n '75',\n 'tournament',\n 'report',\n 'rebels',\n 'leaders',\n 'Iraqi',\n 'Dutch',\n 'weekend',\n 'until',\n 'security',\n 'price',\n 'plan',\n 'northern',\n 'net',\n 'near',\n 'late',\n 'get',\n 'dollar',\n 'agreed',\n 'Kong',\n 'Australian',\n '74',\n '7.',\n 'top',\n 'record',\n 'players',\n 'going',\n 'agency',\n 'Attendance',\n 'African',\n ';',\n '73',\n 'want',\n 'start',\n 'refugees',\n 'miles',\n 'drawn',\n 'another',\n 'Sri',\n 'Paul',\n 'taking',\n 'sales',\n 'place',\n 'office',\n 'my',\n 'economic',\n 'court',\n 'chief',\n 'arrested',\n 'SAN',\n 'John',\n 'Democratic',\n 'David',\n 'CRICKET',\n '8.',\n 'those',\n 'quoted',\n 'demand',\n 'championship',\n 'allowed',\n 'Party',\n 'Palestinian',\n 'Israeli',\n 'GMT',\n 'Corp',\n 'Commission',\n 'Ahmed',\n 'women',\n 'several',\n 'many',\n 'including',\n 'central',\n 'already',\n 'IN',\n 'Foreign',\n 'television',\n 'km',\n 'hit',\n 'following',\n 'de',\n 'Yeltsin',\n 'Martin',\n 'Arafat',\n '28',\n '17',\n 'southern',\n 'men',\n 'may',\n 'later',\n 'forces',\n 'fell',\n 'authorities',\n 'ahead',\n 'Union',\n 'M.',\n 'Dole',\n '31',\n '26',\n '1-0',\n 'work',\n 'whether',\n 'weeks',\n 'way',\n 'troops',\n 'reporters',\n 'loss',\n 'hours',\n 'election',\n 'came',\n 'announced',\n 'Brazil',\n '19',\n 'vs',\n 'return',\n 'parliament',\n 'night',\n 'higher',\n 'general',\n 'closed',\n 'Zealand',\n 'Finland',\n 'Chicago',\n '65',\n '23',\n '1994',\n '18',\n '...',\n 'went',\n 'test',\n 'share',\n 'power',\n 'plans',\n 'national',\n 'decision',\n 'began',\n 'agreement',\n 'This',\n 'trading',\n 'quarter',\n 'oil',\n 'north',\n 'morning',\n 'ministry',\n 'like',\n 'head',\n 'few',\n 'countries',\n 'away',\n 'asked',\n 'Washington',\n 'Police',\n 'Lebed',\n 
'1997',\n 'taken',\n 'money',\n 'main',\n 'leading',\n 'index',\n 'fighting',\n 'Sydney',\n 'Olympic',\n 'English',\n 'Austria',\n 'such',\n 'signed',\n 'side',\n 'scored',\n 'rights',\n 'past',\n 'much',\n 'major',\n 'hits',\n 'current',\n 'c',\n 'business',\n 'budget',\n 'army',\n 'U.N.',\n 'STANDINGS',\n 'Canada',\n '63',\n 'think',\n 'nine',\n 'growth',\n 'area',\n 'Ukraine',\n 'Standings',\n 'Europe',\n 'East',\n '40',\n 'winning',\n 'total',\n 'strike',\n 'region',\n 'recent',\n 'previous',\n 'own',\n 'draw',\n 'campaign',\n 'attack',\n 'accused',\n 'Two',\n 'On',\n 'Lanka',\n 'Co',\n '96',\n '62',\n 'working',\n 'without',\n 'vote',\n 'these',\n 'seen',\n 'plane',\n 'led',\n 'hold',\n 'high',\n 'future',\n 'died',\n 'control',\n 'club',\n 'cash',\n 'best',\n 'available',\n 'again',\n 'White',\n 'PCT',\n 'Ireland',\n 'Akram',\n '9.',\n '27',\n '---',\n 'wickets',\n 'tour',\n 'sent',\n 'right',\n 'released',\n 'might',\n 'little',\n 'help',\n 'give',\n 'fourth',\n 'failed',\n 'does',\n 'conference',\n 'ceasefire',\n 'case',\n 'ban',\n 'Paris',\n 'March',\n 'Dutroux',\n \"'re\",\n 'us',\n 'started',\n 'prime',\n 'period',\n 'overs',\n 'me',\n 'manager',\n 'long',\n 'least',\n 'embassy',\n 'disease',\n 'cut',\n 'champions',\n 'average',\n 'No',\n 'Italian',\n 'City',\n 'An',\n '29',\n 'tennis',\n 'stories',\n 'service',\n 'production',\n 'planned',\n 'order',\n 'members',\n 'free',\n 'airport',\n 'across',\n 'Wasim',\n 'Thomas',\n 'October',\n 'Leading',\n 'Kurdish',\n 'Costa',\n 'Chechnya',\n 'Aug',\n 'Ajax',\n '59',\n 'yet',\n 'strong',\n 'shot',\n 'short',\n 'rose',\n 'public',\n 'press',\n 'given',\n 'declared',\n 'children',\n 'bonds',\n 'Slovakia',\n 'San',\n 'Romania',\n 'Republican',\n 'Ministry',\n 'Jordan',\n 'Bosnian',\n 'Bosnia',\n 'BASEBALL',\n 'April',\n '54',\n '10.',\n 'trying',\n 'tabulate',\n 'stock',\n 'standings',\n 'seed',\n 'reports',\n 'possible',\n 'must',\n 'markets',\n 'interest',\n 'hospital',\n 'further',\n 'State',\n 'Moslem',\n 
'Jerusalem',\n 'If',\n 'CITY',\n 'Amsterdam',\n 'A.',\n 'woman',\n 'used',\n 'term',\n 'series',\n 'received',\n 'rates',\n 'opening',\n 'law',\n 'known',\n 'industry',\n 'guerrillas',\n 'forced',\n 'fifth',\n 'face',\n 'death',\n 'come',\n 'coach',\n 'clear',\n 'charges',\n 'brought',\n 'Taiwan',\n 'TENNIS',\n 'She',\n 'Robert',\n 'Poland',\n 'Peter',\n 'Nigeria',\n 'Ltd',\n 'Kenya',\n '2-0',\n 'yen',\n 'train',\n 'squad',\n 'small',\n 'showed',\n 'private',\n 'point',\n 'passengers',\n 'old',\n 'likely',\n 'injured',\n 'immediately',\n 'estimated',\n 'details',\n 'despite',\n 'date',\n 'companies',\n 'call',\n 'Turkey',\n 'That',\n 'PARIS',\n 'Argentina',\n '48',\n '1,000',\n '*',\n \"'S\",\n 'workers',\n 'use',\n 'trip',\n 'result',\n 'process',\n 'policy',\n 'named',\n 'level',\n 'latest',\n 'human',\n 'groups',\n 'got',\n 'forecast',\n 'figures',\n 'each',\n 'daily',\n 'contract',\n 'captain',\n 'better',\n 'action',\n 'Wimbledon',\n 'One',\n 'North',\n 'Nations',\n 'L',\n 'Japanese',\n 'Iran',\n 'Egypt',\n 'California',\n '76',\n '61',\n '0-0',\n 'unless',\n 'soon',\n 'sold',\n 'seeding',\n 'see',\n 'rise',\n 'prison',\n 'pay',\n 'how',\n 'holiday',\n 'halftime',\n 'force',\n 'financial',\n 'exports',\n 'earnings',\n 'believed',\n 'analysts',\n 'Younis',\n 'Waqar',\n 'TORONTO',\n 'Serb',\n 'PSV',\n 'Mushtaq',\n 'Croft',\n 'Belgian',\n 'BALTIMORE',\n 'Atlanta',\n '6-0',\n 'within',\n 'violence',\n 'today',\n 'title',\n 'times',\n 'straight',\n 'scheduled',\n 'rule',\n 'road',\n 'pound',\n 'playing',\n 'nearly',\n 'making',\n 'levels',\n 'convention',\n 'confirmed',\n 'coming',\n 'chairman',\n 'border',\n 'Total',\n 'TO',\n 'Security',\n 'S.',\n 'International',\n 'Exchange',\n '56',\n 'tried',\n 'struck',\n 'south',\n 'services',\n 'senior',\n 'reached',\n 'position',\n 'nuclear',\n 'met',\n 'message',\n 'know',\n 'keep',\n 'inning',\n 'independence',\n 'illegal',\n 'homer',\n 'gold',\n 'completed',\n 'comment',\n 'charged',\n 'buy',\n 'Switzerland',\n 
'Saudi',\n 'OPEN',\n 'Net',\n 'Mullally',\n 'Khan',\n 'Indian',\n 'Halftime',\n 'Grand',\n 'First',\n 'Central',\n 'Bill',\n 'AMSTERDAM',\n '55',\n '53',\n 'wife',\n 'wheat',\n 'tie',\n 'sell',\n 'rebel',\n 'problem',\n 'prefix',\n 'poor',\n 'percentage',\n 'parties',\n 'outside',\n 'opened',\n 'letter',\n 'kept',\n 'island',\n 'here',\n 'health',\n 'ground',\n 'full',\n 'even',\n 'course',\n 'continue',\n 'conditions',\n 'civil',\n 'change',\n 'centre',\n 'based',\n 'attacks',\n 'arrived',\n 'areas',\n 'aggregate',\n 'able',\n 'PUK',\n 'OSCE',\n 'Netanyahu',\n 'Mexico',\n 'Grozny',\n 'Group',\n 'FOR',\n 'Chinese',\n 'Association',\n 'After',\n '58',\n '45',\n '2-1',\n '1-1',\n \"'m\",\n 'winner',\n 'village',\n 'treaty',\n 'too',\n 'system',\n 'step',\n 'stage',\n 'source',\n 'returned',\n 'radio',\n 'penalty',\n 'paper',\n 'needed',\n 'less',\n 'leg',\n 'leave',\n 'himself',\n 'great',\n 'goal',\n 'flight',\n 'economy',\n 'director',\n 'denotes',\n 'denied',\n 'break',\n 'bond',\n 'big',\n 'according',\n 'Williams',\n 'Turkish',\n 'Swiss',\n 'Result',\n 'PRESS',\n 'Johnson',\n 'House',\n 'FRANCISCO',\n 'EU',\n 'DIGEST',\n 'BOSTON',\n '77',\n '42',\n ...]"
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Show the learned vocabulary; in the output, index 0 is the empty string and index 1 is '[UNK]',\n",
|
|
"# followed by the corpus tokens (presumably ordered by descending frequency -- TODO confirm).\n",
|
|
"vectorize_layer.get_vocabulary()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "'Kindercare says debt buy to hit Q1 results . </S> MONTGOMERY , Ala . </S> 1996-08-22 </S> KinderCare Learning Centers Inc said on Thursday that a debt buyback would mean an extraordinary loss of $ 1.2 million in its fiscal 1997 first quarter . </S> The company said that during the quarter , which began June 1 , it bought $ 30 million par value of its outstanding 10-3/8 percent senior notes due 2001 . </S> The notes were bought for $ 31.5 million . </S> Philip Maslowe , chief financial officer of the preschool and child care company , said the buyback \" offered an opportunity to reduce the company \\'s weighted average interest costs and improve future cash flows and earnings . \" </S>'"
|
|
},
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"\n",
|
|
"# Reverse lookup: hold the vocabulary in an array so a vector of token ids can index it directly,\n",
|
|
"# then rebuild document 20 from its ids to check that vectorization round-trips the text.\n",
|
|
"vocabulary = vectorize_layer.get_vocabulary()\n",
|
|
"vocab_arr = np.array(vocabulary)\n",
|
|
"\" \".join(vocab_arr[vectorize_layer(training_file.loc[20, \"document\"]).numpy()])"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "'Kindercare says debt buy to hit Q1 results . </S> MONTGOMERY , Ala . </S> 1996-08-22 </S> KinderCare Learning Centers Inc said on Thursday that a debt buyback would mean an extraordinary loss of $ 1.2 million in its fiscal 1997 first quarter . </S> The company said that during the quarter , which began June 1 , it bought $ 30 million par value of its outstanding 10-3/8 percent senior notes due 2001 . </S> The notes were bought for $ 31.5 million . </S> Philip Maslowe , chief financial officer of the preschool and child care company , said the buyback \" offered an opportunity to reduce the company \\'s weighted average interest costs and improve future cash flows and earnings . \" </S>'"
|
|
},
|
|
"execution_count": 10,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Original text of document 20, for comparison with the string rebuilt from its token ids.\n",
|
|
"training_file.loc[20, \"document\"]"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"outputs": [],
|
|
"source": [
|
|
"# Separate vectorizer for input / output"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "<AxesSubplot:>"
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": "<Figure size 640x480 with 1 Axes>",
|
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGtCAYAAADqPVUWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/MnkTPAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA8+0lEQVR4nO3de3QU9eH//9duNtmEXMl1Ew0QL0CiCAiCsUi9pEREC0ov1Hj5tFRam+hHqCh8q1TRiqK1CB/qpfXawgcvp3AQNUJBRSUGiAUxYlBEEiCbACHZXDeb7P7+8JP5sRCUwOY2eT7OmQOZec/Me+Y9l9fOzsxafD6fTwAAACZh7e4KAAAABBLhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmArhBgAAmEqHw83GjRt17bXXKiUlRRaLRatWrTquzM6dO/XjH/9Y0dHRCg8P10UXXaTS0lJjeFNTk3JzcxUXF6eIiAhNnTpVFRUVftMoLS3VpEmT1K9fPyUmJmr27NlqaWnp+BICAIA+xdbREerr6zV8+HD96le/0vXXX3/c8N27d2vcuHGaPn26HnjgAUVFRam4uFihoaFGmZkzZ+rNN9/Ua6+9pujoaOXl5en666/XRx99JElqbW3VpEmT5HA4tGnTJpWXl+vmm29WcHCwHn744ZOqp9fr1YEDBxQZGSmLxdLRxQQAAN3A5/OptrZWKSkpslpP8Qsm32mQ5Fu5cqVfv5///Oe+G2+88YTjVFdX+4KDg32vvfaa0W/nzp0+Sb6CggKfz+fzvfXWWz6r1epzOp1GmaeeesoXFRXlc7vdJ1W3srIynyQ6Ojo6Ojq6XtiVlZV1IJH46/CVm+/i9Xr15ptv6u6771Z2drb+85//KC0tTXPnztWUKVMkSUVFRfJ4PMrKyjLGGzp0qAYMGKCCggJdfPHFKigo0LBhw5SUlGSUyc7O1m233abi4mKNHDnyuHm73W653W7jb9///dh5WVmZoqKiArmYAACgk7hcLqWmpioyMvKUpxHQcFNZWam6ujo98sgjeuihh/Too48qPz9f119/vd5991398Ic/lNPpVEhIiGJiYvzGTUpKktPplCQ5nU6/YNM2vG1YexYsWKAHHnjguP5RUVGEGwAAepnTuaUkoE9Leb1eSdLkyZM1c+ZMjRgxQnPmzNE111yjp59+OpCzOs7cuXNVU1NjdGVlZZ06PwAA0DMFNNzEx8fLZrMpIyPDr396errxtJTD4VBzc7Oqq6v9ylRUVMjhcBhljn16qu3vtjLHstvtxlUartYAANB3BTTchISE6KKLLlJJSYlf/127dmngwIGSpFGjRik4OFjr1683hpeUlKi0tFSZmZmSpMzMTO3YsUOVlZVGmXXr1ikqKuq44AQAAHC0Dt9zU1dXp6+++sr4e8+ePdq2bZtiY2M1YMAAzZ49Wz//+c81fvx4XX755crPz9cbb7yh9957T5IUHR2t6dOna9asWYqNjVVUVJRuv/12ZWZm6uKLL5YkTZgwQRkZGbrpppu0cOFCOZ1O3XvvvcrNzZXdbg/MkgMAAHPq6ONV7777bruPbN1yyy1Gmeeee853zjnn+EJDQ33Dhw/3rVq1ym8ajY2Nvt/97ne+/v37+/r16+e77rrrfOXl5X5lvvnmG9/EiRN9YWFhvvj4eN/vf/97n8fjOel61tTU+CT5ampqOrqIAACgmwTi/G3x+f7vmWmTcblcio6OVk1NDfffAADQSwTi/M1vSwEAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3A
AAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFMh3AAAAFPpcLjZuHGjrr32WqWkpMhisWjVqlUnLPvb3/5WFotFixYt8utfVVWlnJwcRUVFKSYmRtOnT1ddXZ1fmU8//VSXXnqpQkNDlZqaqoULF3a0qgAAoA/qcLipr6/X8OHDtXTp0u8st3LlSn388cdKSUk5blhOTo6Ki4u1bt06rVmzRhs3btSMGTOM4S6XSxMmTNDAgQNVVFSkxx57TPfff7+effbZjlYXAAD0MbaOjjBx4kRNnDjxO8vs379ft99+u9555x1NmjTJb9jOnTuVn5+vLVu2aPTo0ZKkJUuW6Oqrr9bjjz+ulJQULVu2TM3NzXr++ecVEhKi8847T9u2bdMTTzzhF4IAAACOFfB7brxer2666SbNnj1b55133nHDCwoKFBMTYwQbScrKypLValVhYaFRZvz48QoJCTHKZGdnq6SkREeOHGl3vm63Wy6Xy68DAAB9T8DDzaOPPiqbzaY77rij3eFOp1OJiYl+/Ww2m2JjY+V0Oo0ySUlJfmXa/m4rc6wFCxYoOjra6FJTU093UQAAQC8U0HBTVFSkJ598Ui+++KIsFksgJ/295s6dq5qaGqMrKyvr0vkDAICeIaDh5oMPPlBlZaUGDBggm80mm82mvXv36ve//70GDRokSXI4HKqsrPQbr6WlRVVVVXI4HEaZiooKvzJtf7eVOZbdbldUVJRfBwAA+p6AhpubbrpJn376qbZt22Z0KSkpmj17tt555x1JUmZmpqqrq1VUVGSMt2HDBnm9Xo0dO9Yos3HjRnk8HqPMunXrNGTIEPXv3z+QVQYAACbT4ael6urq9NVXXxl/79mzR9u2bVNsbKwGDBiguLg4v/LBwcFyOBwaMmSIJCk9PV1XXXWVbr31Vj399NPyeDzKy8vTtGnTjMfGb7jhBj3wwAOaPn267rnnHn322Wd68skn9Ze//OV0lhUAAPQBHQ43W7du1eWXX278PWvWLEnSLbfcohdffPGkprFs2TLl5eXpyiuvlNVq1dSpU7V48WJjeHR0tNauXavc3FyNGjVK8fHxmjdvHo+BAwCA72Xx+Xy+7q5EZ3C5XIqOjlZNTQ333wAA0EsE4vzNb0sBAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABTIdwAAABT6XC42bhxo6699lqlpKTIYrFo1apVxjCPx6N77rlHw4YNU3h4uFJSUnTzzTfrwIEDftOoqqpSTk6OoqKiFBMTo+nTp6uurs6vzKeffqpLL71UoaGhSk1N1cKFC09tCQEAQJ/S4XBTX1+v4cOHa+nSpccNa2ho0CeffKL77rtPn3zyif71r3+ppKREP/7xj/3K5eTkqLi4WOvWrdOaNWu0ceNGzZgxwxjucrk0YcIEDRw4UEVFRXrsscd0//3369lnnz2FRQQAAH2Jxefz+U55ZItFK1eu1JQpU05YZsuWLRozZoz27t2rAQMGaOfOncrIyNCWLVs0evRoSVJ+fr6uvvpq7du3TykpKXrqqaf0hz/8QU6nUyEhIZKkOXPmaNWqVfriiy9Oqm4ul0vR0dGqqalRVFTUqS4iAADoQoE4f3f6PTc1NTWyWCyKiYmRJBUUFCgmJsYINpKUlZUlq9WqwsJCo8z48eONYCNJ2dnZKikp0ZEjR9qdj9vtlsvl8usAAEDf06nhpqmpSffcc49+8YtfGOnL6XQqMTHRr5zNZlNsbKycTq
dRJikpya9M299tZY61YMECRUdHG11qamqgFwcAAPQCnRZuPB6Pfvazn8nn8+mpp57qrNkY5s6dq5qaGqMrKyvr9HkCAICex9YZE20LNnv37tWGDRv8vjNzOByqrKz0K9/S0qKqqio5HA6jTEVFhV+Ztr/byhzLbrfLbrcHcjEAAEAvFPArN23B5ssvv9S///1vxcXF+Q3PzMxUdXW1ioqKjH4bNmyQ1+vV2LFjjTIbN26Ux+Mxyqxbt05DhgxR//79A11lAABgIh0ON3V1ddq2bZu2bdsmSdqzZ4+2bdum0tJSeTwe/eQnP9HWrVu1bNkytba2yul0yul0qrm5WZKUnp6uq666Srfeeqs2b96sjz76SHl5eZo2bZpSUlIkSTfccINCQkI0ffp0FRcX65VXXtGTTz6pWbNmBW7JAQCAKXX4UfD33ntPl19++XH9b7nlFt1///1KS0trd7x3331Xl112maRvX+KXl5enN954Q1arVVOnTtXixYsVERFhlP/000+Vm5urLVu2KD4+Xrfffrvuueeek64nj4IDAND7BOL8fVrvuenJCDcAAPQ+veI9NwAAAF2JcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEyFcAMAAEylw+Fm48aNuvbaa5WSkiKLxaJVq1b5Dff5fJo3b56Sk5MVFhamrKwsffnll35lqqqqlJOTo6ioKMXExGj69Omqq6vzK/Ppp5/q0ksvVWhoqFJTU7Vw4cKOLx0AAOhzOhxu6uvrNXz4cC1durTd4QsXLtTixYv19NNPq7CwUOHh4crOzlZTU5NRJicnR8XFxVq3bp3WrFmjjRs3asaMGcZwl8ulCRMmaODAgSoqKtJjjz2m+++/X88+++wpLCIAAOhTfKdBkm/lypXG316v1+dwOHyPPfaY0a+6utpnt9t9//u//+vz+Xy+zz//3CfJt2XLFqPM22+/7bNYLL79+/f7fD6f769//auvf//+PrfbbZS55557fEOGDDnputXU1Pgk+Wpqak518QAAQBcLxPk7oPfc7NmzR06nU1lZWUa/6OhojR07VgUFBZKkgoICxcTEaPTo0UaZrKwsWa1WFRYWGmXGjx+vkJAQo0x2drZKSkp05MiRduftdrvlcrn8OgAA0PcENNw4nU5JUlJSkl//pKQkY5jT6VRiYqLfcJvNptjYWL8y7U3j6Hkca8GCBYqOjja61NTU018gAADQ65jmaam5c+eqpqbG6MrKyrq7SkCvNmjOm91dBQA4JQENNw6HQ5JUUVHh17+iosIY5nA4VFlZ6Te8paVFVVVVfmXam8bR8ziW3W5XVFSUXwcAAPqegIabtLQ0ORwOrV+/3ujncrlUWFiozMxMSVJmZqaqq6tVVFRklNmwYYO8Xq/Gjh1rlNm4caM8Ho9RZt26dRoyZIj69+8fyCoDAACT6XC4qaur07Zt27Rt2zZJ395EvG3bNpWWlspisejOO+/UQw89pNWrV2vHjh26+eablZKSoilTpkiS0tPTddVVV+nWW2/V5s2b9dFHHykvL0/Tpk1TSkqKJOmGG25QSEiIpk+fruLiYr3yyit68sknNWvWrIAtOAAAMCdbR0fYunWrLr/8cuPvtsBxyy236MUXX9Tdd9+t+vp6zZgxQ9XV1Ro3bpzy8/MVGhpqjLNs2TLl5eXpyiuvlNVq1dSpU7V48WJjeHR0tNauXavc3FyNGjVK8fHxmjdvnt+7cAAAANpj8fl8vu6uRGdwuVyKjo5WTU0N998Ap2DQnDf1zS
OTursaAPqYQJy/TfO0FAAAgES4AQAAJkO4AQAApkK4CRBeeAYAQM9AuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAEAAKZCuAGAUzBozpvdXQUAJ0C4AYD/Q2ABzIFwAwAATIVwAwAATIVwAwAATIVwAwAATIVwAwAATIVwAwAATIVwAwAATIVwAwAATIVwAwAATIVwAwAATIVwAwAATIVwAwAATCXg4aa1tVX33Xef0tLSFBYWprPPPlsPPvigfD6fUcbn82nevHlKTk5WWFiYsrKy9OWXX/pNp6qqSjk5OYqKilJMTIymT5+uurq6QFcXAACYTMDDzaOPPqqnnnpK//M//6OdO3fq0Ucf1cKFC7VkyRKjzMKFC7V48WI9/fTTKiwsVHh4uLKzs9XU1GSUycnJUXFxsdatW6c1a9Zo48aNmjFjRqCrCwAATCbg4WbTpk2aPHmyJk2apEGDBuknP/mJJkyYoM2bN0v69qrNokWLdO+992ry5Mm64IIL9PLLL+vAgQNatWqVJGnnzp3Kz8/X3//+d40dO1bjxo3TkiVLtGLFCh04cCDQVT4tg+a82d1VAAAARwl4uLnkkku0fv167dq1S5K0fft2ffjhh5o4caIkac+ePXI6ncrKyjLGiY6O1tixY1VQUCBJKigoUExMjEaPHm2UycrKktVqVWFhYbvzdbvdcrlcfh0AAOh7bIGe4Jw5c+RyuTR06FAFBQWptbVVf/rTn5STkyNJcjqdkqSkpCS/8ZKSkoxhTqdTiYmJ/hW12RQbG2uUOdaCBQv0wAMPBHpxAABALxPwKzevvvqqli1bpuXLl+uTTz7RSy+9pMcff1wvvfRSoGflZ+7cuaqpqTG6srKyTp0fAADomQJ+5Wb27NmaM2eOpk2bJkkaNmyY9u7dqwULFuiWW26Rw+GQJFVUVCg5OdkYr6KiQiNGjJAkORwOVVZW+k23paVFVVVVxvjHstvtstvtgV4cAADQywT8yk1DQ4OsVv/JBgUFyev1SpLS0tLkcDi0fv16Y7jL5VJhYaEyMzMlSZmZmaqurlZRUZFRZsOGDfJ6vRo7dmygqwwAAEwk4OHm2muv1Z/+9Ce9+eab+uabb7Ry5Uo98cQTuu666yRJFotFd955px566CGtXr1aO3bs0M0336yUlBRNmTJFkpSenq6rrrpKt956qzZv3qyPPvpIeXl5mjZtmlJSUgJdZQDoVDxVefJYVwiEgH8ttWTJEt1333363e9+p8rKSqWkpOg3v/mN5s2bZ5S5++67VV9frxkzZqi6ulrjxo1Tfn6+QkNDjTLLli1TXl6errzySlmtVk2dOlWLFy8OdHUBAIDJBDzcREZGatGiRVq0aNEJy1gsFs2fP1/z588/YZnY2FgtX7480NUDAAAmx29LAQAAUyHcAAAAUyHcoFfj5kMAwLEINwAAwFQINwCAPourv+ZEuAEAAKZCuAEAAKZCuAHQJbj8D6CrEG4AAICpEG4AAICpEG4AAICpEG6APoJ7XgD0FYQbAABgKoQbAABgKoQbABBf2wFmQrhBj8NJBgBwOgg3AADAVAg3AADAVAg3AIA+ja/CzYdwAwAATIVwAwAATIVwAwAATIVwA6BH4L4HAIFCuAEAAKZCuAEAAKZCuAEA9Ep8lYkTIdwAAABTIdx0MT5pAADQuQg3AADAVAg3AADAVAg3AADAVAg3AADAVAg36FG44RoAcLoINwDQiQjsQNcj3ADoMpzoAXSFTgk3+/fv14033qi4uDiFhYVp2LBh2rp1qzHc5/Np3rx5Sk5OVlhYmLKysvTll1/6TaOqqko5OTmKiopSTEyMpk+frrq6us6oLgAAMJGAh5sjR47oBz/4gYKDg/X222/r888/15///Gf179/fKLNw4UItXrxYTz
/9tAoLCxUeHq7s7Gw1NTUZZXJyclRcXKx169ZpzZo12rhxo2bMmBHo6gIAAJOxBXqCjz76qFJTU/XCCy8Y/dLS0oz/+3w+LVq0SPfee68mT54sSXr55ZeVlJSkVatWadq0adq5c6fy8/O1ZcsWjR49WpK0ZMkSXX311Xr88ceVkpIS6GoDAACTCPiVm9WrV2v06NH66U9/qsTERI0cOVJ/+9vfjOF79uyR0+lUVlaW0S86Olpjx45VQUGBJKmgoEAxMTFGsJGkrKwsWa1WFRYWtjtft9stl8vl1wEAgL4n4OHm66+/1lNPPaVzzz1X77zzjm677TbdcccdeumllyRJTqdTkpSUlOQ3XlJSkjHM6XQqMTHRb7jNZlNsbKxR5lgLFixQdHS00aWmpgZ60QAAQC8Q8HDj9Xp14YUX6uGHH9bIkSM1Y8YM3XrrrXr66acDPSs/c+fOVU1NjdGVlZV16vx6Ip5EAYDeieN3YAU83CQnJysjI8OvX3p6ukpLSyVJDodDklRRUeFXpqKiwhjmcDhUWVnpN7ylpUVVVVVGmWPZ7XZFRUX5dQAAoO8JeLj5wQ9+oJKSEr9+u3bt0sCBAyV9e3Oxw+HQ+vXrjeEul0uFhYXKzMyUJGVmZqq6ulpFRUVGmQ0bNsjr9Wrs2LGBrjIAADCRgD8tNXPmTF1yySV6+OGH9bOf/UybN2/Ws88+q2effVaSZLFYdOedd+qhhx7Sueeeq7S0NN13331KSUnRlClTJH17peeqq64yvs7yeDzKy8vTtGnTeFIKAAB8p4CHm4suukgrV67U3LlzNX/+fKWlpWnRokXKyckxytx9992qr6/XjBkzVF1drXHjxik/P1+hoaFGmWXLlikvL09XXnmlrFarpk6dqsWLFwe6ugAAwGQCHm4k6ZprrtE111xzwuEWi0Xz58/X/PnzT1gmNjZWy5cv74zqAQAAE+O3pYBeiCcrAODECDcAAMBUCDcAAMBUCDcAAMBUCDcAAMBUCDcAAMBUCDfoMjzhA6Crcdzpmwg3AADAVAg3AADAVAg3AADAVAg3AADAVAg3AADAVAg3ANBBPIHTMawvdDXCDQCYHOECfQ3hBgAAmArhBgAAmArhBgCAHoCvDwOHcAMAALpcZ4Y5wg0AADAVwg0AADAVwg0AAH2YGe/1IdwAAHoNM56IEXiEGwB9AidFoO8g3MD0OKkBQN9CuAE6EcEKALoe4QYAAJgK4QbAcbjiBKA3I9wAAABTIdwAAABTIdwAAABTIdwAAABTIdwA6HTcoAygKxFugE7GiR3AyeJ4ERiEGwAAYCqEGwAAYCqdHm4eeeQRWSwW3XnnnUa/pqYm5ebmKi4uThEREZo6daoqKir8xistLdWkSZPUr18/JSYmavbs2Wppaens6gIA8L34+qhn69Rws2XLFj3zzDO64IIL/PrPnDlTb7zxhl577TW9//77OnDggK6//npjeGtrqyZNmqTm5mZt2rRJL730kl588UXNmzevM6vbI7EDATA7jnMItE4LN3V1dcrJydHf/vY39e/f3+hfU1Oj5557Tk888YSuuOIKjRo1Si+88II2bdqkjz/+WJK0du1aff755/rnP/+pESNGaOLEiXrwwQe1dOlSNTc3d1aVAQCACXRauMnNzdWkSZOUlZXl17+oqEgej8ev/9ChQzVgwAAVFBRIkgoKCjRs2DAlJSUZZbKzs+VyuVRcXNzu/Nxut1wul18HAPh+XDmB2dg6Y6IrVqzQJ598oi1bthw3zOl0KiQkRDExMX79k5KS5HQ6jTJHB5u24W3D2rNgwQI98MADAag9AADozQJ+5aasrEz//d//rWXLlik0NDTQkz+huXPnqqamxujKysq6bN6AWfAJHoAZBDzcFBUVqbKyUhdeeKFsNptsNpvef/99LV68WDabTUlJSWpublZ1dbXfeBUVFXI4HJIkh8Nx3NNTbX+3lTmW3W5XVFSUXwcAAPqegIebK6+8Ujt27NC2bd
uMbvTo0crJyTH+HxwcrPXr1xvjlJSUqLS0VJmZmZKkzMxM7dixQ5WVlUaZdevWKSoqShkZGYGuMgAAMJGA33MTGRmp888/369feHi44uLijP7Tp0/XrFmzFBsbq6ioKN1+++3KzMzUxRdfLEmaMGGCMjIydNNNN2nhwoVyOp269957lZubK7vdHugqA+igQXPe1DePTOruagBAuzrlhuLv85e//EVWq1VTp06V2+1Wdna2/vrXvxrDg4KCtGbNGt12223KzMxUeHi4brnlFs2fP787qgsAAHqRLgk37733nt/foaGhWrp0qZYuXXrCcQYOHKi33nqrk2sGAOhtuHKI78NvSwEAAFMh3PRyPLoL9A7sq0DXIdwAAABTIdygT+BTMwD0HYQbAABgKoQbAOgkXDGEGfWG7Zpwgx6rN+xAHWXGZQKAnoZwg+/EyRgA0NsQboA+gJAaOKxL9HRso4QbAABgMoQbAABMqC9fwSHcBEBf3oAAAOhpCDcATklnh3o+NAA4VYQbAABOAoG79yDcAAAAUyHcnIbuTvHdPX8AAHoiws13IDwAANA1AnnOJdwAAABTIdz0QVyRAnAiHB9gBoQbAL0SJ2EAJ0K4AdrBiRMAei/CDXAaOiMEEaxOHusKPRnbZ/ch3PQC7CAAegqOR+gNCDd9FAco9CRsjx3HOgNOjHDThY4+GHFgAtATmPVYZMblMuMydRbCDboUOyfQ+7Efo6cj3CAgzHSwM9OyAD1FX9uv2pa3ry13T0G4AWBK7X0NbLYTjZmWx0zL0pf1lHYk3MA0espOdap6e/2BvoJ9tecj3JwAG6/59NU27avLDZhZoPZrsx4fCDcImEDuJGbd4SRzLxsA9ASEGwDoJQjGwMkh3ACQxInzaCe7LlhnXYv13bnMtH4JNwHWUzeOnlqvnubY9cR6Q1dgO+tcrN++h3ADAABMJeDhZsGCBbrooosUGRmpxMRETZkyRSUlJX5lmpqalJubq7i4OEVERGjq1KmqqKjwK1NaWqpJkyapX79+SkxM1OzZs9XS0hLo6qILmPVTk1mXC4HB9gF0n4CHm/fff1+5ubn6+OOPtW7dOnk8Hk2YMEH19fVGmZkzZ+qNN97Qa6+9pvfff18HDhzQ9ddfbwxvbW3VpEmT1NzcrE2bNumll17Siy++qHnz5gW6ugCA/0MgM4++3pa2QE8wPz/f7+8XX3xRiYmJKioq0vjx41VTU6PnnntOy5cv1xVXXCFJeuGFF5Senq6PP/5YF198sdauXavPP/9c//73v5WUlKQRI0bowQcf1D333KP7779fISEhga72dxo0501988ikLp0n0Jf19QMzgNPT6ffc1NTUSJJiY2MlSUVFRfJ4PMrKyjLKDB06VAMGDFBBQYEkqaCgQMOGDVNSUpJRJjs7Wy6XS8XFxe3Ox+12y+Vy+XW9HQd4c6E9O4cZ12t3L1N3zx84XZ0abrxer+6880794Ac/0Pnnny9JcjqdCgkJUUxMjF/ZpKQkOZ1Oo8zRwaZteNuw9ixYsEDR0dFGl5qaGuClAdDZOKmip2Lb7F06Ndzk5ubqs88+04oVKzpzNpKkuXPnqqamxujKyso6fZ4AAKDn6bRwk5eXpzVr1ujdd9/VmWeeafR3OBxqbm5WdXW1X/mKigo5HA6jzLFPT7X93VbmWHa7XVFRUX5dT9LR1M+nBPRVbPvm0V1tyTaEgIcbn8+nvLw8rVy5Uhs2bFBaWprf8FGjRik4OFjr1683+pWUlKi0tFSZmZmSpMzMTO3YsUOVlZVGmXXr1ikqKkoZGRmBrjJwWvragbSvLS+Ajuvu40TAw01ubq7++c9/avny5YqMjJTT6ZTT6VRjY6MkKTo6WtOnT9esWbP07rvvqqioSL/85S+VmZmpiy++WJI0YcIEZWRk6KabbtL27dv1zjvv6N5771Vubq7sdnugq9yjdPcGAQBAbxfwcPPUU0+ppqZGl112mZKTk43ulVdeMcr85S9/0T
XXXKOpU6dq/Pjxcjgc+te//mUMDwoK0po1axQUFKTMzEzdeOONuvnmmzV//vxAVxenqTeGsd5YZwDAyQv4e258Pt/3lgkNDdXSpUu1dOnSE5YZOHCg3nrrrUBWDSeJ9/rATAiz32K/Rl/Cb0sBOGVmCQ5mWQ70DGxP3Y9wA1PgYIK+gm0dPV1P2EYJNwAAwFT6XLjpCYmyM5h1uQAcj/3dXOvATMvSU/S5cNMTsCGbD22Knqivb5d9ffl7g85qI8INANPi5Ab0TX023PTFg15fXObehjZCoPSWbam31BO9S58NN2YSyINDVxxoAj0PDo49G+3Tvbp7/Xf3/HuC3rAOOqOO3bncfSbc9IaNq01vqisA9CU94cdAOUd8vz4TbtB5TmZH42oN0DnYF3ov2q7zEG4A+OGAi76I7d5c+mS4YSPG6WIbAoCeq0+GG8AMuuPrQByvL63jvrSsZtHdbdZd8yfcADhtXXkA6+6DdWcaNOdNUy8f0FUIN9+DAw3MhO0ZwKnobccOwg16tN62Q6FrsF0AJ6+3vQstEAg3AAD0Er0lXHQ3wg0A9ABmO2mZbXnQu9qUcNOO3tSAABAIHPe6B+u9cxBuAACAqRBu0KfwKQnAd+EYYQ6EG7SLF8QBAHorwg0AAOiQnv7hlnCDXotfGv9Wb603erfv2u7YJnsHM7cT4QbooXrLgae31BPoC3rr/hjoehNugBPoi2/1BG3Vl9H23aMz1jvhJoDYMdBd2PYA4P9HuAF6IMIKegO2U/RUhBsAnYaTH4DuQLgBejHCAwAcj3ADAABMpU+FGz7lsg4A9FwcnxAofSrcoGM40AAAeiPCDQAAMBXCDQAAMJUeHW6WLl2qQYMGKTQ0VGPHjtXmzZu7u0oAAKCH67Hh5pVXXtGsWbP0xz/+UZ988omGDx+u7OxsVVZWdnfVAABAD9Zjw80TTzyhW2+9Vb/85S+VkZGhp59+Wv369dPzzz/f3VUDAAA9mK27K9Ce5uZmFRUVae7cuUY/q9WqrKwsFRQUtDuO2+2W2+02/q6pqZEkuVwuSZLX3XDcOG3DjnWyZdsr15GyXTXN051/b5nm6c6/t0zzdOfPumfd98Zpnu78e8s0T3f+Zlj3beV9Pl+7450UXw+0f/9+nyTfpk2b/PrPnj3bN2bMmHbH+eMf/+iTREdHR0dHR2eCbvfu3aecI3rs11IdNXfuXNXU1Bjd3r17u7tKAADgFMXGxp7yuD3ya6n4+HgFBQWpoqLCr39FRYUcDke749jtdtnt9q6oHgAA6GRW66lff+mRV25CQkI0atQorV+/3ujn9Xq1fv16ZWZmdmPNAABAT9cjr9xI0qxZs3TLLbdo9OjRGjNmjBYtWqT6+nr98pe/7O6qAQCAHqzHhpuf//znOnjwoObNmyen06kRI0YoPz9fSUlJJzW+3W7XPffco40bN8rr9Rr9rVarxo4dq8LCQr/+xzrZct09ze6ef1+eZnfPvy9Ps7vn35en2d3z78vT7O75d9U0rVarfvjDH57WrSYWn+90nrUCAADoWXrkPTcAAACninADAABMhXADAABMhXADAABMpU+EG+6ZBgCg7+ixj4J31KFDh/T888+roKBATqdTkhQTEyOPx6MNGzYoLS1NoaGhOuusszRlyhSlp6frscce04ABA1RbW6uvv/5aBw8e1K5duxQaGqqIiAiNHj1aWVlZKi8v1+eff66YmBi53W7V1NTovffek81mk81mU1pamm677TYFBQXp8ccf1xlnnKGEhARFRUVp9+7d2rRpk+666y5ddtllGjdunFatWqXVq1dr06ZNGjx4sAYPHiy3263nnntONptNDodDkydP1rBhwzRz5kyNHj1aI0eO1KOPPqrg4OBuXtOnp6WlRcXFxSopKVFERIR8Pp8GDhyoadOmqampSXa7Xenp6brwwgu1f/9+vfzyy4qKilJkZKTS09M1ZswY/eMf/1BERIT69++vYcOG6cYbb9T06dPl8X
gUGRmps846S7NmzdKtt94qn88ni8WiMWPG6LrrrtM333yjBx54QAkJCQoODlZsbKyio6OVn5+vsLAwxcTEaNiwYZo2bZrmzZunhoYGhYeHy+Fw6JprrtGDDz6olpYWBQUF6eyzz9b48ePV3Nys5cuXKy4uTnFxcbrssst0ww03aNiwYd29uk9Je20UHh6u2267TS0tLbLb7XI4HJo2bZr+8Ic/GB8ehg8frhtvvFFvvPGGamtr5Xa7ZbfbNXToUA0ePFhLly71a7eJEyfqrrvuks1mU0hIiM466yz96le/0u23337ctvDyyy+rublZra2tCgsLU3p6us4//3wtX75cNptN0dHRGjBggFJSUvTyyy/LYrHIbrcrPj5eN910k5YsWaLg4GB5vV6NHTtWM2fO7NUvBHU6nSosLNTHH3+s2NhYY3099NBD8ng8CgoK0plnnqnU1FQdOnRI27ZtU0hIiEJDQxUbG6uBAwdq69atCgkJUWRkpAYMGKD09HS9/vrram1tVWhoqBISEjRlyhQ99dRTx23zr7/+uqqqqmSz2RQcHKzQ0FBZrVaVlpbKZrPJbrcrJSVFgwcP1rvvvqvW1lYFBwcrJiZG55xzjt5//31J376cNTExUYMGDdLOnTsVHBys5uZmRUdHa+TIkcrLy1N2dnY3r+1T05E2KigoUGtrq3w+n+x2+wnbaMiQIXr99df9tu9AtNH69evl8/kUEhKi6OjodtsoJSVFX3zxhbG/h4WF6YwzzlD//v318ccfq1+/frLb7cZ8ysrKZLVaFRoaqpSUFE2aNEkvvfSSWlpaZLFYdOaZZxp1l769CJGenq65c+cqISFB9913n37961/rZz/72Smtf1M8Cr5lyxZlZ2erublZYWFhqq+vV3Bw8Al/ibQ3S0pK0nXXXafXX39ddXV1CgoKUkJCgmJiYtTQ0KBdu3bJbrcrODhYKSkpGjlypPLz80+4M7RtZDfddJMWLVqkpqYmo196errq6+v1zjvvfO+GO27cOL366qvGNOPj4zVmzBitXr1ara2t8nq9Cg0NVVNTU3evwi7jcDgUGRmpmJgYDRgwQBMnTtT/+3//Tz6fTz6fT0OGDNGcOXP02GOPyePxqLm5WRkZGRo/frzCwsKMEBYREXHCgHDNNdcoLy9PTU1NCgoKUmpq6glDR3Nz83HBLjQ0VOvXr5fNZlNkZKS8Xq/279/fzWuua1x66aU6cOCArFbrd7bR7Nmz9eCDDx4XlKuqqnTfffedVBvl5ub6BeVTbaOIiAg1NTXp0KFD3bz2uk5iYqKio6O7fD/KycnRvffeaxy/hgwZookTJyo2NlYPPfQQbdSFtm/frgsuuKBD45gi3Fx88cUaOHCgXn311e6uCgAACCCr1arW1tYOjWOKr6W2b9+upqYmBQcHy+PxdHd1AABAgHz99dcdHscUV27S0tL0zTffdHc1AABAgJ1KTDHF01J33XWXJCkoKMjoFx8f313VAQAAp8hqPf1oYoorN5IUHh4ut9vd4e/lAABAzxEaGiq32y2bzSaPx3NKV25Mcc+NJN14443Kz8/XJZdcojVr1igyMlKHDh3iHpxeJCIiQuHh4YqNjZX07eP9TU1NGjRokA4cOKDm5ma53W5ZLBZFRESosbFRISEhqq+vl8/nU0xMjJqbmyVJDQ0N8nq9io2NNR4fbm1tVVNTk9xutzHNtie4goOD1a9fPzU3N8tqtaqmpkYtLS1KTk5WY2OjbDabGhsbZbFYNHDgQHk8Hh08eFAVFRXducq6lNVqNR4pjY2NVVVVlbGO24bt27dPbrdbsbGx8vl88nq9amhoUGtrq4KCghQeHn5cu7W1TWhoqBobG9Xa2qrk5GQ1NTUZ86mvr1dCQoJxY2FVVZXxZF5cXJxqamqUkJCg2tpaeTweJSQkqKWlRS6XSwcPHuzuVddl2l5n0PZYfFJSkrZv366KigqlpaUpIiJCX331lY4cOSLp2yvc9fX1io6Olt
PpVFNTkxISEmS322W1WrVv3z55vV5jm4+Li9P+/ftVW1urESNGqL6+3mjTuro6eb1e9e/f33is+Ouvv5bL5dJZZ50lq9WqlJQUlZaWqqGhQZmZmWpsbNS2bdv02WefdfOa6zrHtlFjY6O++uornXvuubJYLAoODj6pNmppafHrF4g2qq+vV0ZGhg4fPiyn02m0UXJysrZv367PPvtMZ599tiIjIxUXF6fS0lJVVlYa8/rmm28UGxt73HwsFou+/PJLNTQ0aODAgWpublZQUJAOHjyo2tpajRkzRgcPHlRKSop+8YtfqKamRu+++66mTJmiffv2ndJ6Ns2VmxN5/fXXdeTIEWVlZenDDz/U3XffbbwH51gjRoxQVlaW9uzZo+uvv15Wq1X5+fnavXu3Zs+erT179ujOO++U9O2Bvu3n2S0Wi37729+qtrZWtbW1mjZtmnbt2qW4uDj5fD7Fx8ervLzcaOj333/fb5pBQUHKz89XQUGBqqqqjquXxWJRTEyMLrnkEiUkJGjr1q06ePCgLrnkEh04cEAxMTEKDw/X/v37dfjwYY0cOVKHDx+Wz+czNqgT7QxtG1l1dbXq6uo0dOhQuVwuhYWFqbm5WXV1dXK73d+54R69g1x44YU6cuSI4uLi1NjYqMOHD+tHP/qRKioqFBYWprCwMFVVVfmtp0suuUSXXXaZbLbel7Xff/99rVixQuXl5YqPj9eBAwe0fft2VVdX+514jw0Dra2tam5uVm1trZqbm5WQkGB8UmlsbFRzc7N8Pp+Sk5NVWVkpi8WilpYWWa1WI9gdGwYaGxtlt9tVU1Mjt9sth8Mhq9VqBAmXy6Xm5mYj2PXr109er1c2m804gCYmJqq2tlYul0v/9V//pZKSEp155pmaNGmSLr/88u5e3afkdNqo7b1WLS0tfkHZ7XaroaHBCMC00enpaftRaGioGhoa1NTUpJSUFONDcm1trfHBqa+1UW9j+nDTnsOHD2vevHnasWOH9uzZI6/XqyNHjhif/FtbW+VyuYzw0lFHB5+jBQcHKysrS4cOHdKePXvU1NQkn8+n+vr6k552fHy8vvjiC8XFxZ1S3brb5s2btWjRIq1fv16HDx/2+xrRZrMpLi5OV1xxhe68806lpaVpxYoVamhoUFFRkbxerw4fPqyYmBiNHz9eDQ0NevPNN3X48GFjnbe2turgwYPGAc5iscjr9SouLk6XXnqpjhw5otraWlVXV6ukpOS4K3tWq1WDBg3S+eefL6fTKbfbraamJjmdTtXW1p5wm7BYLAoNDdXw4cM1f/58/ehHP+rU9diZXnjhBS1ZskQlJSVqamoylrntJJyQkKBx48bpjjvu0Nlnn63nn39en3zyiXGlprq6WgMGDFBGRoY8Ho/Wrl2rr776ynjXkdVq1d69exUcHCyr1arBgwcbLwzMyMhQa2urGhoaVFpaqoMHDx53SbrtRZdt+0B4eLg8Ho8+/fRTud3uEy5XUFCQ4uPjNXnyZN11110699xzO28ldqLm5matWLFCjz/+uHbu3KmWlpbjyoSEhCg9PV2///3vFRMTo2eeeUZff/21zjrrLO3YscN4n8uFF16o8vJy7dq1y7iC2XbSb2xslNfrlcVikfTty9yCgoIUFRUli8WikJAQ1dbWnvD4FR4eruTkZOOFji0tLUYw+C5Wq1WJiYmaNWuWZs+efforrBucahuVlJQoODhYiYmJ2rlzpywWiwYPHiyr1apPPvlEDQ0NxofdoKAgHTp0qFvaqD1Wq1WxsbGaNm2a8vLytGnTJj388MOSpNbWVjU2NkqSLrzwQjmdTpWXl+vIkSNqbW2V1WqVzWYzXnTodruVkJCgjRs3nvJ+2ufCzTvvvKOJEyea5icZjt5B2g5cf/7znyV9e8Ktr683PmU0NzervLxc1dXVxguw2jaoth2kLQyEhIQoLi5OdXV18vl8xldC7WlvB2n7JNSXWSwWhYWFaejQobr99tt188036/Dhw3
r++edVVFQk6dunAI4cOWKEAbfbrTVr1mjfvn3GDn9sGJC+PVj84he/UHl5ufFpcvfu3e0GsMjISI0YMUJut9tow+rqaiO89WXf10ZtHz5sNpsSExM1ePBg/ec//9Hu3btVX19vfFXZXht5PB5NmjRJVqtV9fX1qqiooI1OQVfvR/v379fBgwdpox4gISFBycnJmj59uu64444OjWvqcLN69WoVFhbqvffeM74KKS8vP+UrMgDQk/W1t4D3RrRRx6WlpXX4XTemDjdWq9U0V2gAoC9p+3oCfVdxcbEyMjJOaVxTvOfmRJKTk7u7CgCAU0CwwekwdbgZNWqUgoKCZLfbje/FJRnfwR790r/v0xN+jTsQLzYCAMDsTH22nD17tgYPHqzzzjtPqampio2Nld1ulyQj7LQFnDPOOOM7p9W/f3+FhIR8Z5m2O9Y7y8ncK9TZdehKbW3Utkx2u/07Hxcn/HW9oKAgv/2CNuh52h4UkL5tn7CwMPXv379LjhVmOh51pqPbyGKxKCoqSqGhob26jdrOrRaLRUFBQQoLC+vS40Pve7FIB1x66aV65plnjJcYFRYWSpLOP/98XXXVVZKk+vp6bd26VaNHj9YHH3wg6duXLP3whz/s0Lyqq6v1yCOP6I033tDu3bvbvaRqsViUlJSkcePGGe9g2Lt3r8rLy43H5CwWi/F4ctuGcLI3QLeFAZ/Pp6CgIHm9XgUHB3fJzWsWi+W07m9qeyIiOztbl19+uVpbW9WvXz9jeFJSkiZPnixJOnDggP7+97/7jX/w4EFdddVV+vDDD7VixQodOnRIFotFNptN/fr10xVXXKHHH39cdXV1Kigo0OrVq7VlyxZ5PB61tLQoJiZGI0aM0HXXXaef/vSnx+3wxcXFWrJkiT766CM1NTWptbVVHo9HTqez3cc8T6StXY6efm+5wb3tUc8f/OAHGjp0qJKSkhQfH6/4+Hjt3bvXaKPa2lqtW7dOn332mex2u8455xwVFxfr4MGDGjFihFauXKkPP/xQra2tioiIUEJCglwuly699FL9+c9/1t69e7V8+XJt3bpVBw4cMN6lFBMTo7S0NP3mN7/RZZdddlz91q5dqxdeeEE7duwwnhL0+Xw6dOhQh77iaGujtqcJJX1nG5/utn+6+vXrJ7fbbdT37LPPVl5eni6//PLjrjiHhobqzDPPlCQ1NTXpo48+8hve1NSk9PR0ffnll3rmmWf0xRdfGMeTyMhI/ehHP9LMmTMVEhKigwcPaseOHSorK1NTU5M8Ho9iYmJ0wQUXaPjw4X7779HT37x5s9577z2Vl5crOjpaFotFzzzzjPHSupPR2/Yjm83m94RqINpox44d+sc//qGdO3caj4B3VhuFhYWpvr5eqampio+PV1FRkXbt2qWqqiq53W71799fF1xwga6++mqdf/75crlcfsseGxurqKgotbS0aPfu3X4v5mt7/1plZaVWr16t8vJyHTp0SFdeeeVp3Vpi6huKzWLPnj16++239cEHH6isrExut1uhoaFKTU3VuHHjjttBbDaboqKijH+P3UHa3idTV1eniooKvfDCC9q1a1e7B7H+/fvL6XRqy5YtfqHtZHaQgwcP6p133tF//vMfeTweI2hcdNFFmjx5skJDQzt/5XWBffv26dlnn9Vbb72lvXv3Gu+i6NevnwYOHKjLLrtMw4cPV0NDgzFOeHi4RowYofPOO0+1tbVau3atiouLjeF2u11hYWEKDw/XypUrtXHjRuNFYUFBQUpMTNTrr7+u8847T1u2bNGyZcv04YcfqqqqynhRXGpqqsaMGaPc3FydddZZx9U7Pz/fCAPV1dXyer0KDw/X0KFDde211+onP/mJ4uPj/U4gNpvtpMLcyZbrrmmGhoYqKCjotNqovr5egwYNksPh0HPPPWe0Udujxke30apVq/TBBx+ooKBATqfzpNto1apVWrx4scrLy9Xc3KzGxkbjseij28gM9u3bp2eeeUZvv/
12t+5HW7duVWlpqTweT0D3o96svLxcCxcu1IoVK074EtxAanub9b333qtf//rXHbqFpA3hppcqKyvTbbfdpu3btxv9bDabzjvvPL9+7TnZcoEu6/P5jJ8/iIqKMt7G3HbFqe1tr0cLCgpSXFycKisrJX270ffr108tLS3HXZE6uuzJluvINNsr63a7/a66Wa1WHT58+HvXFXqOthOdzWZTa2urWlpaTnglpu0SeyDLdmSaR4/j8/lM/y6p8PBwRUdHn3D4Oeeco3379p3U1emTLXu60zznnHM0cuRIrVy5Uk6nU16v1+jQcaNHj9aGDRsUGRnZofEIN73U9u3bNWLEiO6uBgAAARcSEqKWlhaFhIRoxowZevLJJzs0PuGmh1q9erX+8Y9/6LPPPlNlZaXx44Otra18AgAAmF5kZKTq6uqUnJys/fv3d2hcwk0PxQsIAQB9WXBwsDwej/Hjph3Bc5s9VHJysvF4YFsXERHh9zcAAGZktVoVGRkpq9X6va9qaY+pHwXvzUaNGqUNGzaoubnZ6BcSEmL8iKXP5+vQI8hAR7V9agLQMd39eoDerF+/fkpPT9fAgQONn+CYNm1ah6dDuOmhZs+erTPPPFOlpaVGP4fDoZCQEJWWlqq1tVWVlZWqq6szhrc9ySN9+wRIdXV1u9NuK1dXV6eIiIiAl21v/lVVVaqpqTGe7mjb+YOCghQcHKzGxsY+cTBoe3dR27KeaJmPfsdR21W6QJZtb/5t7wVqe0S6rQ09Ho/xrpmMjAydd9553xl6wsPDlZqaqt27d39vODrZsqc6zbVr16q6urrdJ5A4AfV8R7/Esz3JyckqLy8/qWmdbNnTnabD4dDNN9+sZ555xngi1MzOOOMMvfrqq4qLizP6tb1wtb6+/nvHbyvb0tKi+Ph4RUREBKRe3HMDAABMhXtuAACAqRBuAACAqRBuAACAqRBuAACAqRBuAACAqRBuAACAqRBuAACAqRBuAACAqfx/aIDk5cd3KAgAAAAASUVORK5CYII=\n"
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"training_file[\"len_tokenized\"].plot.bar()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"source": [
|
|
"### Padding przykładów do 2048 słów"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"tf.Tensor([18792 316 1335 ... 0 0 0], shape=(2048,), dtype=int64)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Vectorizer for the document text: no standardization is requested\n",
|
|
"# (standardize=None) and every output sequence has length 2048.\n",
|
|
"sentence_vectorizer = tf.keras.layers.TextVectorization(\n",
|
|
"    standardize=None,\n",
|
|
"    output_sequence_length=2048,\n",
|
|
")\n",
|
|
"# Build the vocabulary from the training documents.\n",
|
|
"sentence_vectorizer.adapt(training_file[\"document\"])\n",
|
|
"# Sanity check: print the vectorized form of the document labelled 20.\n",
|
|
"print(sentence_vectorizer(training_file[\"document\"][20]))"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"tf.Tensor([2 2 2 ... 0 0 0], shape=(2048,), dtype=int64)\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Vectorizer for the tag sequences, configured like the document one:\n",
|
|
"# no standardization and an output sequence length of 2048.\n",
|
|
"label_vectorizer = tf.keras.layers.TextVectorization(\n",
|
|
"    standardize=None,\n",
|
|
"    output_sequence_length=2048,\n",
|
|
")\n",
|
|
"# Build the tag vocabulary from the training labels.\n",
|
|
"label_vectorizer.adapt(training_file[\"label\"])\n",
|
|
"# Sanity check: print the vectorized tags of the example labelled 20.\n",
|
|
"print(label_vectorizer(training_file[\"label\"][20]))"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"outputs": [],
|
|
"source": [
|
|
"# Vocabularies of the two fitted vectorizers.\n",
|
|
"tags_list, vocab_list = (\n",
|
|
"    label_vectorizer.get_vocabulary(),\n",
|
|
"    sentence_vectorizer.get_vocabulary(),\n",
|
|
")\n",
|
|
"# Matching vocabulary sizes; create_model reads both when sizing its layers.\n",
|
|
"tags_length, vocab_length = (\n",
|
|
"    label_vectorizer.vocabulary_size(),\n",
|
|
"    sentence_vectorizer.vocabulary_size(),\n",
|
|
")"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"outputs": [],
|
|
"source": [
|
|
"# Run each row through its vectorizer, then store the results as new columns.\n",
|
|
"document_ids = training_file[\"document\"].apply(sentence_vectorizer)\n",
|
|
"label_ids = training_file[\"label\"].apply(label_vectorizer)\n",
|
|
"training_file[\"document_vectorized\"] = document_ids\n",
|
|
"training_file[\"label_vectorized\"] = label_ids"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 79,
|
|
"outputs": [],
|
|
"source": [
|
|
"from keras.utils import to_categorical\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"\n",
|
|
"\n",
|
|
"def _to_model_arrays(frame):\n",
|
|
"    \"\"\"Convert the vectorized columns of ``frame`` into model inputs and targets.\n",
|
|
"\n",
|
|
"    Returns ``(x, y)``: ``x`` stacks the ``document_vectorized`` rows and ``y``\n",
|
|
"    is the one-hot encoding (``tags_length`` classes) of the stacked\n",
|
|
"    ``label_vectorized`` rows.\n",
|
|
"    \"\"\"\n",
|
|
"    x = np.stack(frame[\"document_vectorized\"].values)\n",
|
|
"    label_ids = np.stack(frame[\"label_vectorized\"].values)\n",
|
|
"    # One call one-hot encodes the whole integer array; no per-row loop needed.\n",
|
|
"    y = to_categorical(label_ids, num_classes=tags_length)\n",
|
|
"    return x, y\n",
|
|
"\n",
|
|
"\n",
|
|
"# A fixed random_state keeps the train/validation split identical across runs,\n",
|
|
"# so the reported validation metrics can be reproduced.\n",
|
|
"train, valid = train_test_split(training_file, test_size=0.2, random_state=42)\n",
|
|
"train_x, train_y = _to_model_arrays(train)\n",
|
|
"val_x, val_y = _to_model_arrays(valid)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 97,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"[2014 19 122 ... 0 0 0]\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(val_x[0])"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 80,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "(756, 2048)"
|
|
},
|
|
"execution_count": 80,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"train_x.shape"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 81,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "(756, 2048, 11)"
|
|
},
|
|
"execution_count": 81,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"train_y.shape"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 82,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "array([ 128, 19, 1368, ..., 0, 0, 0], dtype=int64)"
|
|
},
|
|
"execution_count": 82,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"train_x[0]"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 83,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "array([[0., 0., 1., ..., 0., 0., 0.],\n [0., 0., 1., ..., 0., 0., 0.],\n [0., 0., 0., ..., 1., 0., 0.],\n ...,\n [1., 0., 0., ..., 0., 0., 0.],\n [1., 0., 0., ..., 0., 0., 0.],\n [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)"
|
|
},
|
|
"execution_count": 83,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"train_y[0]"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 122,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Model: \"model_14\"\n",
|
|
"_________________________________________________________________\n",
|
|
" Layer (type) Output Shape Param # \n",
|
|
"=================================================================\n",
|
|
" input_16 (InputLayer) [(None, 2048)] 0 \n",
|
|
" \n",
|
|
" embedding_15 (Embedding) (None, 2048, 128) 3024256 \n",
|
|
" \n",
|
|
" lstm_20 (LSTM) (None, 2048, 256) 394240 \n",
|
|
" \n",
|
|
" time_distributed_18 (TimeDi (None, 2048, 11) 2827 \n",
|
|
" stributed) \n",
|
|
" \n",
|
|
"=================================================================\n",
|
|
"Total params: 3,421,323\n",
|
|
"Trainable params: 3,421,323\n",
|
|
"Non-trainable params: 0\n",
|
|
"_________________________________________________________________\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from keras.optimizers import Adam\n",
|
|
"import keras.layers as layers\n",
|
|
"import keras\n",
|
|
"\n",
|
|
"\n",
|
|
"def create_model(sequence_length=2048, embedding_dim=128, lstm_units=256):\n",
|
|
"    \"\"\"Build and compile the LSTM token tagger.\n",
|
|
"\n",
|
|
"    Architecture: Input -> Embedding -> LSTM (one output per time step) ->\n",
|
|
"    time-distributed softmax over ``tags_length`` classes.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        sequence_length: number of token ids per example.\n",
|
|
"        embedding_dim: size of each token embedding.\n",
|
|
"        lstm_units: number of LSTM units.\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        A compiled ``keras.Model`` using categorical cross-entropy, Adam and\n",
|
|
"        the accuracy metric.\n",
|
|
"    \"\"\"\n",
|
|
"    input_layer = layers.Input(shape=(sequence_length,))\n",
|
|
"    # mask_zero=True marks id 0 (the padding id) as masked, so padded time\n",
|
|
"    # steps are left out of the loss and accuracy instead of dominating them.\n",
|
|
"    # The sequence length is already fixed by the Input layer above.\n",
|
|
"    embedding_layer = layers.Embedding(\n",
|
|
"        input_dim=vocab_length + 1,\n",
|
|
"        output_dim=embedding_dim,\n",
|
|
"        mask_zero=True,\n",
|
|
"    )(input_layer)\n",
|
|
"    lstm_layer = layers.LSTM(lstm_units, return_sequences=True)(embedding_layer)\n",
|
|
"    output_layer = layers.TimeDistributed(\n",
|
|
"        layers.Dense(tags_length, activation=\"softmax\")\n",
|
|
"    )(lstm_layer)\n",
|
|
"    model = keras.Model(inputs=input_layer, outputs=output_layer)\n",
|
|
"    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])\n",
|
|
"    return model\n",
|
|
"model = create_model()\n",
|
|
"model.summary()"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 123,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Epoch 1/50\n",
|
|
"24/24 [==============================] - 29s 1s/step - loss: 0.6602 - accuracy: 0.8703 - val_loss: 0.2673 - val_accuracy: 0.9425\n",
|
|
"Epoch 2/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.2500 - accuracy: 0.9653 - val_loss: 0.1613 - val_accuracy: 0.9781\n",
|
|
"Epoch 3/50\n",
|
|
"24/24 [==============================] - 28s 1s/step - loss: 0.1062 - accuracy: 0.9790 - val_loss: 0.0984 - val_accuracy: 0.9793\n",
|
|
"Epoch 4/50\n",
|
|
"24/24 [==============================] - 28s 1s/step - loss: 0.0920 - accuracy: 0.9806 - val_loss: 0.0936 - val_accuracy: 0.9799\n",
|
|
"Epoch 5/50\n",
|
|
"24/24 [==============================] - 28s 1s/step - loss: 0.0874 - accuracy: 0.9812 - val_loss: 0.0901 - val_accuracy: 0.9800\n",
|
|
"Epoch 6/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0828 - accuracy: 0.9816 - val_loss: 0.0867 - val_accuracy: 0.9804\n",
|
|
"Epoch 7/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0774 - accuracy: 0.9818 - val_loss: 0.0805 - val_accuracy: 0.9804\n",
|
|
"Epoch 8/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0715 - accuracy: 0.9819 - val_loss: 0.0741 - val_accuracy: 0.9807\n",
|
|
"Epoch 9/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0628 - accuracy: 0.9822 - val_loss: 0.0660 - val_accuracy: 0.9808\n",
|
|
"Epoch 10/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0543 - accuracy: 0.9826 - val_loss: 0.0579 - val_accuracy: 0.9815\n",
|
|
"Epoch 11/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0465 - accuracy: 0.9843 - val_loss: 0.0500 - val_accuracy: 0.9851\n",
|
|
"Epoch 12/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0385 - accuracy: 0.9879 - val_loss: 0.0453 - val_accuracy: 0.9867\n",
|
|
"Epoch 13/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0330 - accuracy: 0.9901 - val_loss: 0.0413 - val_accuracy: 0.9873\n",
|
|
"Epoch 14/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0298 - accuracy: 0.9909 - val_loss: 0.0395 - val_accuracy: 0.9887\n",
|
|
"Epoch 15/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0257 - accuracy: 0.9922 - val_loss: 0.0380 - val_accuracy: 0.9887\n",
|
|
"Epoch 16/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0241 - accuracy: 0.9924 - val_loss: 0.0362 - val_accuracy: 0.9887\n",
|
|
"Epoch 17/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0215 - accuracy: 0.9935 - val_loss: 0.0344 - val_accuracy: 0.9897\n",
|
|
"Epoch 18/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0191 - accuracy: 0.9942 - val_loss: 0.0335 - val_accuracy: 0.9898\n",
|
|
"Epoch 19/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0173 - accuracy: 0.9948 - val_loss: 0.0322 - val_accuracy: 0.9906\n",
|
|
"Epoch 20/50\n",
|
|
"24/24 [==============================] - 28s 1s/step - loss: 0.0160 - accuracy: 0.9952 - val_loss: 0.0322 - val_accuracy: 0.9908\n",
|
|
"Epoch 21/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0147 - accuracy: 0.9958 - val_loss: 0.0338 - val_accuracy: 0.9900\n",
|
|
"Epoch 22/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0133 - accuracy: 0.9962 - val_loss: 0.0307 - val_accuracy: 0.9915\n",
|
|
"Epoch 23/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0117 - accuracy: 0.9968 - val_loss: 0.0303 - val_accuracy: 0.9918\n",
|
|
"Epoch 24/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0105 - accuracy: 0.9973 - val_loss: 0.0289 - val_accuracy: 0.9922\n",
|
|
"Epoch 25/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0094 - accuracy: 0.9977 - val_loss: 0.0315 - val_accuracy: 0.9917\n",
|
|
"Epoch 26/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0084 - accuracy: 0.9980 - val_loss: 0.0300 - val_accuracy: 0.9924\n",
|
|
"Epoch 27/50\n",
|
|
"24/24 [==============================] - 27s 1s/step - loss: 0.0073 - accuracy: 0.9984 - val_loss: 0.0295 - val_accuracy: 0.9926\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Early stopping watches val_loss with a patience of 3 epochs and restores\n",
|
|
"# the best weights it has seen.\n",
|
|
"callback = keras.callbacks.EarlyStopping(\n",
|
|
"    monitor='val_loss',\n",
|
|
"    mode='min',\n",
|
|
"    patience=3,\n",
|
|
"    restore_best_weights=True,\n",
|
|
")\n",
|
|
"history = model.fit(\n",
|
|
"    train_x,\n",
|
|
"    train_y,\n",
|
|
"    validation_data=(val_x, val_y),\n",
|
|
"    epochs=50,\n",
|
|
"    callbacks=[callback],\n",
|
|
")"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "[('China', 'B-LOC'),\n ('says', 'O'),\n ('time', 'O'),\n ('right', 'O'),\n ('for', 'O'),\n ('Taiwan', 'B-LOC'),\n ('talks', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('BEIJING', 'B-LOC'),\n ('1996-08-22', 'O'),\n ('</S>', 'O'),\n ('China', 'B-LOC'),\n ('has', 'O'),\n ('said', 'O'),\n ('it', 'O'),\n ('was', 'O'),\n ('time', 'O'),\n ('for', 'O'),\n ('political', 'O'),\n ('talks', 'O'),\n ('with', 'O'),\n ('Taiwan', 'B-LOC'),\n ('and', 'O'),\n ('that', 'O'),\n ('the', 'O'),\n ('rival', 'O'),\n ('island', 'O'),\n ('should', 'O'),\n ('take', 'O'),\n ('practical', 'O'),\n ('steps', 'O'),\n ('towards', 'O'),\n ('that', 'O'),\n ('goal', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Consultations', 'O'),\n ('should', 'O'),\n ('be', 'O'),\n ('held', 'O'),\n ('to', 'O'),\n ('set', 'O'),\n ('the', 'O'),\n ('time', 'O'),\n ('and', 'O'),\n ('format', 'O'),\n ('of', 'O'),\n ('the', 'O'),\n ('talks', 'O'),\n (',', 'O'),\n ('the', 'O'),\n ('official', 'O'),\n ('Xinhua', 'B-ORG'),\n ('news', 'O'),\n ('agency', 'O'),\n ('quoted', 'O'),\n ('Tang', 'B-PER'),\n ('Shubei', 'I-PER'),\n (',', 'O'),\n ('executive', 'O'),\n ('vice', 'O'),\n ('chairman', 'O'),\n ('of', 'O'),\n ('the', 'O'),\n ('Association', 'B-ORG'),\n ('for', 'I-ORG'),\n ('Relations', 'O'),\n ('Across', 'I-ORG'),\n ('the', 'I-ORG'),\n ('Taiwan', 'I-ORG'),\n ('Straits', 'I-ORG'),\n (',', 'O'),\n ('as', 'O'),\n ('saying', 'O'),\n ('late', 'O'),\n ('on', 'O'),\n ('Wednesday', 'O'),\n ('.', 'O'),\n ('</S>', 'O')]"
|
|
},
|
|
"execution_count": 22,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Array form of the tag vocabulary so tags can be looked up by integer id.\n",
|
|
"tag_list_numpy = np.asarray(tags_list)\n",
|
|
"\n",
|
|
"\n",
|
|
"def get_tag_from_int(input_integer):\n",
|
|
"    \"\"\"Return the entry of ``tag_list_numpy`` at index ``input_integer``.\"\"\"\n",
|
|
"    tag = tag_list_numpy[input_integer]\n",
|
|
"    return tag\n",
|
|
"def get_ner_output_single_sentence(input_sentence):\n",
|
|
"    \"\"\"Predict one tag per whitespace-separated token of ``input_sentence``.\n",
|
|
"\n",
|
|
"    The sentence is vectorized with ``sentence_vectorizer``, run through\n",
|
|
"    ``model`` as a batch of one, and the highest-scoring class at each\n",
|
|
"    position is mapped to its tag with ``get_tag_from_int``.\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        A list of tags holding at most ``len(input_sentence.split())`` entries.\n",
|
|
"    \"\"\"\n",
|
|
"    token_count = len(input_sentence.split())\n",
|
|
"    token_ids = sentence_vectorizer(input_sentence)\n",
|
|
"    # Add a leading batch axis so the single example can be fed to the model.\n",
|
|
"    class_scores = model(np.stack(tf.expand_dims(token_ids, 0)))\n",
|
|
"    best_class_ids = np.argmax(class_scores, axis=2).flatten()\n",
|
|
"    predicted_tags = list(map(get_tag_from_int, best_class_ids))\n",
|
|
"    # Positions past the last input token carry no token to tag; drop them.\n",
|
|
"    return predicted_tags[:token_count]\n",
|
|
"#get_ner_output_single_sentence(\"China says time right for Taiwan talks . </S> BEIJING 1996-08-22 </S> China has said it was time for political talks with Taiwan and that the rival island should take practical steps towards that goal . </S> Consultations should be held to set the time and format of the talks , the official Xinhua news agency quoted Tang Shubei , executive vice chairman of the Association for Relations Across the Taiwan Straits , as saying late on Wednesday . </S>\")\n",
|
|
"\n",
|
|
"def test_sentence(sentence):\n",
|
|
"    \"\"\"Pair each whitespace-separated token of ``sentence`` with its predicted tag.\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        A list of ``(token, tag)`` tuples; the tags come from\n",
|
|
"        ``get_ner_output_single_sentence``.\n",
|
|
"    \"\"\"\n",
|
|
"    predicted_tags = get_ner_output_single_sentence(sentence)\n",
|
|
"    return [(token, tag) for token, tag in zip(sentence.split(), predicted_tags)]\n",
|
|
"\n",
|
|
"test_sentence(\"China says time right for Taiwan talks . </S> BEIJING 1996-08-22 </S> China has said it was time for political talks with Taiwan and that the rival island should take practical steps towards that goal . </S> Consultations should be held to set the time and format of the talks , the official Xinhua news agency quoted Tang Shubei , executive vice chairman of the Association for Relations Across the Taiwan Straits , as saying late on Wednesday . </S>\")"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"tf.Tensor([ 128 19 18713 ... 0 0 0], shape=(2048,), dtype=int64)\n",
|
|
"[[[3.0971142e-03 1.5280694e-03 9.8057139e-01 ... 3.6668889e-03\n",
|
|
" 1.4106639e-03 3.3225205e-03]\n",
|
|
" [2.1369425e-04 1.2225067e-04 9.9616271e-01 ... 1.4002173e-03\n",
|
|
" 1.0539902e-04 2.7582867e-04]\n",
|
|
" [6.3146334e-05 3.8070513e-05 9.9278271e-01 ... 2.4660169e-03\n",
|
|
" 5.7447112e-05 1.3038449e-04]\n",
|
|
" ...\n",
|
|
" [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794704e-08\n",
|
|
" 2.6146161e-08 5.0399006e-08]\n",
|
|
" [9.9999696e-01 1.7784757e-08 2.5151198e-07 ... 1.3794731e-08\n",
|
|
" 2.6146161e-08 5.0399006e-08]\n",
|
|
" [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794704e-08\n",
|
|
" 2.6146161e-08 5.0399006e-08]]]\n",
|
|
"(1, 2048, 11)\n",
|
|
"[2 2 2 ... 0 0 0]\n",
|
|
"2048\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": "[('SOCCER', 'O'),\n ('-', 'O'),\n ('LATE', 'O'),\n ('GOALS', 'O'),\n ('GIVE', 'O'),\n ('JAPAN', 'O'),\n ('WIN', 'O'),\n ('OVER', 'O'),\n ('SYRIA', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('AL-AIN', 'O'),\n (',', 'O'),\n ('United', 'B-LOC'),\n ('Arab', 'I-LOC'),\n ('Emirates', 'I-LOC'),\n ('1996-12-06', 'O'),\n ('</S>', 'O'),\n ('Two', 'O'),\n ('goals', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('last', 'O'),\n ('six', 'O'),\n ('minutes', 'O'),\n ('gave', 'O'),\n ('holders', 'O'),\n ('Japan', 'B-LOC'),\n ('an', 'O'),\n ('uninspiring', 'O'),\n ('2-1', 'O'),\n ('Asian', 'B-LOC'),\n ('Cup', 'I-MISC'),\n ('victory', 'O'),\n ('over', 'O'),\n ('Syria', 'B-LOC'),\n ('on', 'O'),\n ('Friday', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Takuya', 'O'),\n ('Takagi', 'O'),\n ('headed', 'O'),\n ('the', 'O'),\n ('winner', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('88th', 'O'),\n ('minute', 'O'),\n ('of', 'O'),\n ('the', 'O'),\n ('group', 'O'),\n ('C', 'O'),\n ('game', 'O'),\n ('after', 'O'),\n ('goalkeeper', 'O'),\n ('Salem', 'O'),\n ('Bitar', 'O'),\n ('spoiled', 'O'),\n ('a', 'O'),\n ('mistake-free', 'O'),\n ('display', 'O'),\n ('by', 'O'),\n ('allowing', 'O'),\n ('the', 'O'),\n ('ball', 'O'),\n ('to', 'O'),\n ('slip', 'O'),\n ('under', 'O'),\n ('his', 'O'),\n ('body', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('It', 'O'),\n ('was', 'O'),\n ('the', 'O'),\n ('second', 'O'),\n ('Syrian', 'B-PER'),\n ('defensive', 'O'),\n ('blunder', 'O'),\n ('in', 'O'),\n ('four', 'O'),\n ('minutes', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Defender', 'O'),\n ('Hassan', 'B-PER'),\n ('Abbas', 'I-PER'),\n ('rose', 'O'),\n ('to', 'O'),\n ('intercept', 'O'),\n ('a', 'O'),\n ('long', 'O'),\n ('ball', 'O'),\n ('into', 'O'),\n ('the', 'O'),\n ('area', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('84th', 'O'),\n ('minute', 'O'),\n ('but', 'O'),\n ('only', 'O'),\n ('managed', 'O'),\n ('to', 'O'),\n ('divert', 'O'),\n ('it', 'O'),\n ('into', 'O'),\n ('the', 'O'),\n ('top', 'O'),\n ('corner', 'O'),\n ('of', 
'O'),\n ('Bitar', 'O'),\n (\"'s\", 'O'),\n ('goal', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Syria', 'B-ORG'),\n ('had', 'O'),\n ('taken', 'O'),\n ('the', 'O'),\n ('lead', 'O'),\n ('from', 'O'),\n ('their', 'O'),\n ('first', 'O'),\n ('serious', 'O'),\n ('attack', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('seventh', 'O'),\n ('minute', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Nader', 'O'),\n ('Jokhadar', 'O'),\n ('headed', 'O'),\n ('a', 'O'),\n ('cross', 'O'),\n ('from', 'O'),\n ('the', 'O'),\n ('right', 'O'),\n ('by', 'O'),\n ('Ammar', 'O'),\n ('Awad', 'O'),\n ('into', 'O'),\n ('the', 'O'),\n ('top', 'O'),\n ('right', 'O'),\n ('corner', 'O'),\n ('of', 'O'),\n ('Kenichi', 'O'),\n ('Shimokawa', 'O'),\n (\"'s\", 'O'),\n ('goal', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Japan', 'B-LOC'),\n ('then', 'O'),\n ('laid', 'O'),\n ('siege', 'O'),\n ('to', 'O'),\n ('the', 'O'),\n ('Syrian', 'B-ORG'),\n ('penalty', 'O'),\n ('area', 'O'),\n ('and', 'O'),\n ('had', 'O'),\n ('a', 'O'),\n ('goal', 'O'),\n ('disallowed', 'O'),\n ('for', 'O'),\n ('offside', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('16th', 'O'),\n ('minute', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('A', 'O'),\n ('minute', 'O'),\n ('later', 'O'),\n (',', 'O'),\n ('Bitar', 'O'),\n ('produced', 'O'),\n ('a', 'O'),\n ('good', 'O'),\n ('double', 'O'),\n ('save', 'O'),\n (',', 'O'),\n ('first', 'O'),\n ('from', 'O'),\n ('Kazuyoshi', 'O'),\n ('Miura', 'O'),\n (\"'s\", 'O'),\n ('header', 'O'),\n ('and', 'O'),\n ('then', 'O'),\n ('blocked', 'O'),\n ('a', 'O'),\n ('Takagi', 'O'),\n ('follow-up', 'O'),\n ('shot', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Bitar', 'O'),\n ('saved', 'O'),\n ('well', 'O'),\n ('again', 'O'),\n ('from', 'O'),\n ('Miura', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('37th', 'O'),\n ('minute', 'O'),\n (',', 'O'),\n ('parrying', 'O'),\n ('away', 'O'),\n ('his', 'O'),\n ('header', 'O'),\n ('from', 'O'),\n ('a', 'O'),\n ('corner', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Japan', 'B-ORG'),\n ('started', 'O'),\n ('the', 
'O'),\n ('second', 'O'),\n ('half', 'O'),\n ('brightly', 'O'),\n ('but', 'O'),\n ('Bitar', 'O'),\n ('denied', 'O'),\n ('them', 'O'),\n ('an', 'O'),\n ('equaliser', 'O'),\n ('when', 'O'),\n ('he', 'O'),\n ('dived', 'O'),\n ('to', 'O'),\n ('his', 'O'),\n ('right', 'O'),\n ('to', 'O'),\n ('save', 'O'),\n ('Naoki', 'O'),\n ('Soma', 'O'),\n (\"'s\", 'O'),\n ('low', 'O'),\n ('drive', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('53rd', 'O'),\n ('minute', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Japan', 'B-LOC'),\n (':', 'O'),\n ('19', 'O'),\n ('-', 'O'),\n ('Kenichi', 'O'),\n ('Shimokawa', 'O'),\n (',', 'O'),\n ('2', 'O'),\n ('-', 'O'),\n ('Hiroshige', 'O'),\n ('Yanagimoto', 'O'),\n (',', 'O'),\n ('3', 'O'),\n ('-', 'O'),\n ('Naoki', 'O'),\n ('Soma', 'O'),\n (',', 'O'),\n ('4', 'O'),\n ('-', 'O'),\n ('Masami', 'O'),\n ('Ihara', 'O'),\n (',', 'O'),\n ('5', 'O'),\n ('-', 'O'),\n ('Norio', 'O'),\n ('Omura', 'O'),\n (',', 'O'),\n ('6', 'O'),\n ('-', 'O'),\n ('Motohiro', 'O'),\n ('Yamaguchi', 'O'),\n (',', 'O'),\n ('8', 'O'),\n ('-', 'O'),\n ('Masakiyo', 'O'),\n ('Maezono', 'O'),\n ('(', 'O'),\n ('7', 'O'),\n ('-', 'O'),\n ('Yasuto', 'O'),\n ('Honda', 'B-ORG'),\n ('71', 'O'),\n (')', 'O'),\n (',', 'O'),\n ('9', 'O'),\n ('-', 'O'),\n ('Takuya', 'O'),\n ('Takagi', 'O'),\n (',', 'O'),\n ('10', 'O'),\n ('-', 'O'),\n ('Hiroshi', 'O'),\n ('Nanami', 'O'),\n (',', 'O'),\n ('11', 'O'),\n ('-', 'O'),\n ('Kazuyoshi', 'O'),\n ('Miura', 'O'),\n (',', 'O'),\n ('15', 'O'),\n ('-', 'O'),\n ('Hiroaki', 'O'),\n ('Morishima', 'O'),\n ('(', 'O'),\n ('14', 'O'),\n ('-', 'O'),\n ('Masayuki', 'O'),\n ('Okano', 'O'),\n ('75', 'O'),\n (')', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Syria', 'B-PER'),\n (':', 'O'),\n ('24', 'O'),\n ('-', 'O'),\n ('Salem', 'O'),\n ('Bitar', 'O'),\n (',', 'O'),\n ('3', 'O'),\n ('-', 'O'),\n ('Bachar', 'O'),\n ('Srour', 'O'),\n (';', 'O'),\n ('4', 'O'),\n ('-', 'O'),\n ('Hassan', 'B-PER'),\n ('Abbas', 'I-PER'),\n (',', 'O'),\n ('5', 'O'),\n ('-', 'O'),\n ('Tarek', 'O'),\n 
('Jabban', 'O'),\n (',', 'O'),\n ('6', 'O'),\n ('-', 'O'),\n ('Ammar', 'O'),\n ('Awad', 'O'),\n ('(', 'O'),\n ('9', 'O'),\n ('-', 'O'),\n ('Louay', 'O'),\n ('Taleb', 'O'),\n ('69', 'O'),\n (')', 'O'),\n (',', 'O'),\n ('8', 'O'),\n ('-', 'O'),\n ('Nihad', 'O'),\n ('al-Boushi', 'O'),\n (',', 'O'),\n ('10', 'O'),\n ('-', 'O'),\n ('Mohammed', 'B-PER'),\n ('Afash', 'I-PER'),\n (',', 'O'),\n ('12', 'O'),\n ('-', 'O'),\n ('Ali', 'B-PER'),\n ('Dib', 'I-PER'),\n (',', 'O'),\n ('13', 'O'),\n ('-', 'O'),\n ('Abdul', 'B-PER'),\n ('Latif', 'I-PER'),\n ('Helou', 'O'),\n ('(', 'O'),\n ('17', 'O'),\n ('-', 'O'),\n ('Ammar', 'O'),\n ('Rihawiy', 'O'),\n ('46', 'O'),\n (')', 'O'),\n (',', 'O'),\n ('14', 'O'),\n ('-', 'O'),\n ('Khaled', 'B-PER'),\n ('Zaher', 'I-PER'),\n (';', 'O'),\n ('16', 'O'),\n ('-', 'O'),\n ('Nader', 'O'),\n ('Jokhadar', 'O'),\n ('.', 'O'),\n ('</S>', 'O')]"
|
|
},
|
|
"execution_count": 20,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"test_sentence(\"SOCCER - LATE GOALS GIVE JAPAN WIN OVER SYRIA . </S> AL-AIN , United Arab Emirates 1996-12-06 </S> Two goals in the last six minutes gave holders Japan an uninspiring 2-1 Asian Cup victory over Syria on Friday . </S> Takuya Takagi headed the winner in the 88th minute of the group C game after goalkeeper Salem Bitar spoiled a mistake-free display by allowing the ball to slip under his body . </S> It was the second Syrian defensive blunder in four minutes . </S> Defender Hassan Abbas rose to intercept a long ball into the area in the 84th minute but only managed to divert it into the top corner of Bitar 's goal . </S> Syria had taken the lead from their first serious attack in the seventh minute . </S> Nader Jokhadar headed a cross from the right by Ammar Awad into the top right corner of Kenichi Shimokawa 's goal . </S> Japan then laid siege to the Syrian penalty area and had a goal disallowed for offside in the 16th minute . </S> A minute later , Bitar produced a good double save , first from Kazuyoshi Miura 's header and then blocked a Takagi follow-up shot . </S> Bitar saved well again from Miura in the 37th minute , parrying away his header from a corner . </S> Japan started the second half brightly but Bitar denied them an equaliser when he dived to his right to save Naoki Soma 's low drive in the 53rd minute . </S> Japan : 19 - Kenichi Shimokawa , 2 - Hiroshige Yanagimoto , 3 - Naoki Soma , 4 - Masami Ihara , 5 - Norio Omura , 6 - Motohiro Yamaguchi , 8 - Masakiyo Maezono ( 7 - Yasuto Honda 71 ) , 9 - Takuya Takagi , 10 - Hiroshi Nanami , 11 - Kazuyoshi Miura , 15 - Hiroaki Morishima ( 14 - Masayuki Okano 75 ) . </S> Syria : 24 - Salem Bitar , 3 - Bachar Srour ; 4 - Hassan Abbas , 5 - Tarek Jabban , 6 - Ammar Awad ( 9 - Louay Taleb 69 ) , 8 - Nihad al-Boushi , 10 - Mohammed Afash , 12 - Ali Dib , 13 - Abdul Latif Helou ( 17 - Ammar Rihawiy 46 ) , 14 - Khaled Zaher ; 16 - Nader Jokhadar . </S>\")"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 127,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"tf.Tensor([ 1 16 1 ... 0 0 0], shape=(2048,), dtype=int64)\n",
|
|
"[[[9.1573365e-02 8.5647009e-02 1.1034752e-01 ... 8.8930450e-02\n",
|
|
" 8.8644758e-02 8.9963131e-02]\n",
|
|
" [5.5477720e-02 4.6575051e-02 5.2461910e-01 ... 6.4232960e-02\n",
|
|
" 4.4661559e-02 5.8426060e-02]\n",
|
|
" [4.9609054e-02 4.3161135e-02 4.3743923e-01 ... 9.0816177e-02\n",
|
|
" 4.6578653e-02 5.5895649e-02]\n",
|
|
" ...\n",
|
|
" [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794731e-08\n",
|
|
" 2.6146161e-08 5.0399006e-08]\n",
|
|
" [9.9999696e-01 1.7784757e-08 2.5151198e-07 ... 1.3794731e-08\n",
|
|
" 2.6146161e-08 5.0399006e-08]\n",
|
|
" [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794731e-08\n",
|
|
" 2.6146161e-08 5.0399006e-08]]]\n",
|
|
"(1, 2048, 11)\n",
|
|
"[2 2 2 ... 0 0 0]\n",
|
|
"2048\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"text/plain": "[('Mussolini', 'O'),\n (\"'s\", 'O'),\n ('granddaughter', 'O'),\n ('rejoins', 'O'),\n ('far-right', 'O'),\n ('party', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('ROME', 'B-LOC'),\n ('1996-12-06', 'O'),\n ('</S>', 'O'),\n ('Alessandra', 'O'),\n ('Mussolini', 'O'),\n (',', 'O'),\n ('the', 'O'),\n ('granddaughter', 'O'),\n ('of', 'O'),\n ('Italy', 'B-LOC'),\n (\"'s\", 'O'),\n ('Fascist', 'O'),\n ('dictator', 'O'),\n ('Benito', 'B-PER'),\n ('Mussolini', 'I-PER'),\n (',', 'O'),\n ('said', 'O'),\n ('on', 'O'),\n ('Friday', 'O'),\n ('she', 'O'),\n ('had', 'O'),\n ('rejoined', 'O'),\n ('the', 'O'),\n ('far-right', 'O'),\n ('National', 'B-PER'),\n ('Alliance', 'I-PER'),\n ('(', 'O'),\n ('AN', 'O'),\n (')', 'O'),\n ('party', 'O'),\n ('she', 'O'),\n ('quit', 'O'),\n ('over', 'O'),\n ('policy', 'O'),\n ('differences', 'O'),\n ('last', 'O'),\n ('month', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('\"', 'O'),\n ('I', 'O'),\n (\"'ve\", 'O'),\n ('gone', 'O'),\n ('back', 'O'),\n (',', 'O'),\n ('\"', 'O'),\n ('she', 'O'),\n ('told', 'O'),\n ('a', 'O'),\n ('radio', 'O'),\n ('show', 'O'),\n ('shortly', 'O'),\n ('after', 'O'),\n ('AN', 'O'),\n ('leader', 'O'),\n ('Gianfranco', 'B-PER'),\n ('Fini', 'I-PER'),\n (',', 'O'),\n ('who', 'O'),\n ('was', 'O'),\n ('being', 'O'),\n ('interviewed', 'O'),\n ('on', 'O'),\n ('the', 'O'),\n ('programme', 'O'),\n (',', 'O'),\n ('said', 'O'),\n ('the', 'O'),\n ('row', 'O'),\n ('had', 'O'),\n ('been', 'O'),\n ('resolved', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('\"', 'O'),\n ('He', 'O'),\n ('did', 'O'),\n (\"n't\", 'O'),\n ('want', 'O'),\n ('to', 'O'),\n ('lose', 'O'),\n ('me', 'O'),\n ('and', 'O'),\n ('I', 'O'),\n ('did', 'O'),\n (\"n't\", 'O'),\n ('want', 'O'),\n ('to', 'O'),\n ('lose', 'O'),\n ('him', 'O'),\n ('.', 'O'),\n ('\"', 'O'),\n ('</S>', 'O'),\n ('Fini', 'O'),\n ('told', 'O'),\n ('state', 'O'),\n ('radio', 'O'),\n ('RAI', 'B-PER'),\n ('he', 'O'),\n ('met', 'O'),\n ('Mussolini', 'O'),\n ('thanks', 'O'),\n ('to', 'O'),\n ('the', 
'O'),\n ('good', 'O'),\n ('offices', 'O'),\n ('of', 'O'),\n ('Giuseppe', 'B-PER'),\n ('Tatarella', 'I-PER'),\n (',', 'O'),\n ('AN', 'O'),\n (\"'s\", 'O'),\n ('leader', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('Chamber', 'B-PER'),\n ('of', 'O'),\n ('Deputies', 'O'),\n ('(', 'O'),\n ('lower', 'O'),\n ('house', 'O'),\n (')', 'O'),\n (',', 'O'),\n ('and', 'O'),\n ('had', 'O'),\n ('overcome', 'O'),\n ('their', 'O'),\n ('differences', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Mussolini', 'O'),\n (',', 'O'),\n ('33', 'O'),\n (',', 'O'),\n ('resigned', 'O'),\n ('from', 'O'),\n ('the', 'O'),\n ('parliamentary', 'O'),\n ('party', 'O'),\n ('group', 'O'),\n ('for', 'O'),\n ('what', 'O'),\n ('she', 'O'),\n ('said', 'O'),\n ('were', 'O'),\n ('strictly', 'O'),\n ('political', 'O'),\n ('reasons', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('The', 'O'),\n ('fiery', 'O'),\n ('politician', 'O'),\n (',', 'O'),\n ('who', 'O'),\n ('is', 'O'),\n ('also', 'O'),\n ('a', 'O'),\n ('niece', 'O'),\n ('of', 'O'),\n ('screen', 'O'),\n ('star', 'O'),\n ('Sophia', 'B-PER'),\n ('Loren', 'I-PER'),\n (',', 'O'),\n ('had', 'O'),\n ('accused', 'O'),\n ('AN', 'O'),\n ('leaders', 'O'),\n ('of', 'O'),\n ('stifling', 'O'),\n ('internal', 'O'),\n ('party', 'O'),\n ('debate', 'O'),\n ('.', 'O'),\n ('</S>', 'O'),\n ('Mussolini', 'O'),\n (',', 'O'),\n ('who', 'O'),\n ('sits', 'O'),\n ('in', 'O'),\n ('the', 'O'),\n ('Chamber', 'B-PER'),\n (',', 'O'),\n ('told', 'O'),\n ('La', 'B-ORG'),\n ('Stampa', 'I-ORG'),\n ('newspaper', 'O'),\n ('last', 'O'),\n ('month', 'O'),\n ('after', 'O'),\n ('quitting', 'O'),\n ('AN', 'O'),\n (\"'s\", 'O'),\n ('parliamentary', 'O'),\n ('party', 'O'),\n ('that', 'O'),\n ('she', 'O'),\n ('was', 'O'),\n ('considering', 'O'),\n ('joining', 'O'),\n ('the', 'O'),\n ('neo-fascist', 'O'),\n ('Social', 'B-ORG'),\n ('Movement', 'I-ORG'),\n ('(', 'O'),\n ('MS-Fiamma', 'O'),\n (')', 'O'),\n ('formed', 'O'),\n ('by', 'O'),\n ('some', 'O'),\n ('of', 'O'),\n ('the', 'O'),\n ('Duce', 'O'),\n (\"'s\", 
'O'),\n ('World', 'B-ORG'),\n ('War', 'I-ORG'),\n ('Two', 'O'),\n ('followers', 'O'),\n ('.', 'O'),\n ('</S>', 'O')]"
|
|
},
|
|
"execution_count": 127,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"news_string = \"\"\"Mussolini 's granddaughter rejoins far-right party . </S> ROME 1996-12-06 </S> Alessandra Mussolini , the granddaughter of Italy 's Fascist dictator Benito Mussolini , said on Friday she had rejoined the far-right National Alliance ( AN ) party she quit over policy differences last month . </S> \" I 've gone back , \" she told a radio show shortly after AN leader Gianfranco Fini , who was being interviewed on the programme , said the row had been resolved . </S> \" He did n't want to lose me and I did n't want to lose him . \" </S> Fini told state radio RAI he met Mussolini thanks to the good offices of Giuseppe Tatarella , AN 's leader in the Chamber of Deputies ( lower house ) , and had overcome their differences . </S> Mussolini , 33 , resigned from the parliamentary party group for what she said were strictly political reasons . </S> The fiery politician , who is also a niece of screen star Sophia Loren , had accused AN leaders of stifling internal party debate . </S> Mussolini , who sits in the Chamber , told La Stampa newspaper last month after quitting AN 's parliamentary party that she was considering joining the neo-fascist Social Movement ( MS-Fiamma ) formed by some of the Duce 's World War Two followers . </S>\"\"\"\n",
|
|
"\n",
|
|
"test_sentence(news_string)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 126,
|
|
"outputs": [],
|
|
"source": [
|
|
"model.save(\"model_v2.keras\")"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"outputs": [],
|
|
"source": [
|
|
"import keras\n",
|
|
"model = keras.models.load_model('model_v2.keras')"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"ERROR:tensorflow:==================================\n",
|
|
"Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):\n",
|
|
"<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x00000262307DCA00>\n",
|
|
"If you want to mark it as used call its \"mark_used()\" method.\n",
|
|
"It was originally created here:\n",
|
|
" File \"C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\keras\\backend.py\", line 5130, in <genexpr>\n",
|
|
" ta.write(ta_index_to_write, out) File \"C:\\Users\\Adrian\\miniconda3\\lib\\site-packages\\tensorflow\\python\\util\\tf_should_use.py\", line 243, in wrapped\n",
|
|
" return _add_should_use_warning(fn(*args, **kwargs),\n",
|
|
"==================================\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Tag every non-blank line of the dev-0 input and store one space-joined result line per input line.\n",
"# get_ner_output_single_sentence comes from an earlier cell of this notebook.\n",
"with open(\"en-ner-conll-2003/dev-0/in.tsv\", \"r\", encoding=\"utf-8\") as f:\n",
"    lines = list(f)\n",
"processed = [\n",
"    \" \".join(tags)\n",
"    for tags in map(get_ner_output_single_sentence, filter(str.strip, lines))\n",
"]\n",
"with open('en-ner-conll-2003/dev-0/out.tsv', 'w',encoding=\"utf-8\") as f:\n",
"    for line in processed:\n",
"        print(line, file=f)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"outputs": [],
|
|
"source": [
|
|
"# Tag every non-blank line of the test-A input and store one space-joined result line per input line.\n",
"# get_ner_output_single_sentence comes from an earlier cell of this notebook.\n",
"with open(\"en-ner-conll-2003/test-A/in.tsv\", \"r\", encoding=\"utf-8\") as f:\n",
"    lines = list(f)\n",
"processed = [\n",
"    \" \".join(tags)\n",
"    for tags in map(get_ner_output_single_sentence, filter(str.strip, lines))\n",
"]\n",
"with open('en-ner-conll-2003/test-A/out.tsv', 'w',encoding=\"utf-8\") as f:\n",
"    for line in processed:\n",
"        print(line, file=f)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"source": [
|
|
"### Czyszczenie tagów"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"{'B-LOC', 'I-LOC', 'O', 'I-MISC', 'B-ORG', 'B-PER', 'I-PER', 'I-ORG', 'B-MISC'}\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Collect every distinct tag that occurs in the dev-0 predictions file.\n",
"tag_set = set()\n",
"with open(\"en-ner-conll-2003/dev-0/out.tsv\", \"r\", encoding=\"utf-8\") as f:\n",
"    lines = f.readlines()\n",
"for line in lines:\n",
"    # set.update ignores tags that are already present\n",
"    tag_set.update(line.split())\n",
"print(tag_set)"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"outputs": [],
|
|
"source": [
|
|
"# Lookup tables between the inside (I-*) and beginning (B-*) tag of each entity type.\n",
"inter_to_begin_mapping = dict([\n",
"    (\"I-LOC\", \"B-LOC\"),\n",
"    (\"I-MISC\", \"B-MISC\"),\n",
"    (\"I-ORG\", \"B-ORG\"),\n",
"    (\"I-PER\", \"B-PER\"),\n",
"])\n",
"# Inverse table: B-* tag -> I-* tag of the same entity type.\n",
"begin_to_inter_mapping = dict(zip(inter_to_begin_mapping.values(), inter_to_begin_mapping))"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "{'I-LOC': 'B-LOC', 'I-MISC': 'B-MISC', 'I-ORG': 'B-ORG', 'I-PER': 'B-PER'}"
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"inter_to_begin_mapping"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": "{'B-LOC': 'I-LOC', 'B-MISC': 'I-MISC', 'B-ORG': 'I-ORG', 'B-PER': 'I-PER'}"
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"begin_to_inter_mapping"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"outputs": [],
|
|
"source": [
|
|
"def _fix_tag_sequence(tags):\n",
"    \"\"\"Return a repaired copy of one line of BIO tags.\n",
"\n",
"    Each decision looks at the previously emitted (already repaired) tag:\n",
"\n",
"    * \"O\" and B-* tags are always kept.\n",
"    * An I-* tag at the start of the line or right after \"O\" becomes the\n",
"      matching B-* tag.\n",
"    * An I-* tag after a B-*/I-* tag takes the entity type of that preceding\n",
"      tag, so a running entity never changes type midway.\n",
"    * A tag missing from both mapping tables is reported and replaced with\n",
"      \"O\"; the result therefore always has exactly len(tags) items.\n",
"\n",
"    Relies on inter_to_begin_mapping / begin_to_inter_mapping, which are\n",
"    defined in an earlier cell of this notebook.\n",
"    \"\"\"\n",
"    fixed = []\n",
"    previous = \"O\"  # virtual tag before the first token, so a leading I-* opens an entity\n",
"    for tag in tags:\n",
"        if tag in inter_to_begin_mapping:\n",
"            if previous == \"O\":\n",
"                tag = inter_to_begin_mapping[tag]\n",
"            else:\n",
"                # previous is B-X or I-X here: keep extending entity X\n",
"                tag = begin_to_inter_mapping.get(previous, previous)\n",
"        elif tag != \"O\" and tag not in begin_to_inter_mapping:\n",
"            print(f\"This shouldn't happen: unknown tag {tag!r} replaced with 'O'\")\n",
"            tag = \"O\"\n",
"        fixed.append(tag)\n",
"        previous = tag\n",
"    return fixed\n",
"\n",
"\n",
"def fix_tags_in_file(filename, filename_fixed):\n",
"    \"\"\"Repair the tag sequences stored in a file and write them to another file.\n",
"\n",
"    Args:\n",
"        filename: UTF-8 text file with one whitespace-separated tag sequence per line.\n",
"        filename_fixed: destination path; receives one space-joined, repaired\n",
"            sequence per input line (an empty input line stays an empty line).\n",
"    \"\"\"\n",
"    with open(filename, \"r\", encoding=\"utf-8\") as f:\n",
"        # Read everything before the output is opened, so both paths may be the same file.\n",
"        lines_fixed = [\" \".join(_fix_tag_sequence(line.split())) for line in f]\n",
"    with open(filename_fixed, \"w\", encoding=\"utf-8\") as f:\n",
"        for line in lines_fixed:\n",
"            f.write(f\"{line}\\n\")\n",
|
|
"fix_tags_in_file(\"en-ner-conll-2003/test-A/out.tsv\", \"en-ner-conll-2003/test-A/out_fixed.tsv\")"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"outputs": [],
|
|
"source": [
|
|
"fix_tags_in_file(\"en-ner-conll-2003/dev-0/out.tsv\", \"en-ner-conll-2003/dev-0/out_fixed.tsv\")"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [],
|
|
"metadata": {
|
|
"collapsed": false
|
|
}
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 2
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython2",
|
|
"version": "2.7.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0
|
|
}
|