dl_rnn/train.ipynb at master

Adrian Klessa a43491188b Tag cleanup (remove incompatible subsequent tags in the output)

2024-05-27 21:28:31 +02:00

84 KiB

Raw Permalink Blame History

import pandas as pd

# Load the training split: a tab-separated file without a header row, whose
# two columns are named here "label" (the tag sequence) and "document" (the
# text). Malformed lines are reported as warnings instead of aborting the load.
training_file = pd.read_csv(
    "en-ner-conll-2003/train/train.tsv",
    names=["label", "document"],
    sep="\t",
    on_bad_lines="warn",
)
# Preview the first rows.
training_file.head()

	label	document
0	B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ...	EU rejects German call to boycott British lamb...
1	O B-PER O O O O O O O O O B-LOC O O O O O O O ...	Rare Hendrix song draft sells for almost $ 17,...
2	B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ...	China says Taiwan spoils atmosphere for talks ...
3	B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ...	China says time right for Taiwan talks . </S> ...
4	B-MISC O O O O O O O O O O O B-LOC O O B-MISC ...	German July car registrations up 14.2 pct yr /...

import tensorflow as tf
# Split tags and text on whitespace, then record both list lengths so that
# tag/token alignment can be verified row by row.
training_file["tag_list"] = training_file["label"].map(lambda tags: tags.split())
training_file["tokenized"] = training_file["document"].map(lambda text: text.split())
training_file["len_tags"] = training_file["tag_list"].map(len)
training_file["len_tokenized"] = training_file["tokenized"].map(len)

# Rows whose token count differs from their tag count; an empty result means
# every document is aligned with its tag sequence.
training_file[training_file["len_tokenized"] != training_file["len_tags"]]

	label	document	tag_list	tokenized	len_tags	len_tokenized

# Preview the frame again, now including the derived list and length columns.
training_file.head()

	label	document	tag_list	tokenized	len_tags	len_tokenized
0	B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ...	EU rejects German call to boycott British lamb...	[B-ORG, O, B-MISC, O, O, O, B-MISC, O, O, O, B...	[EU, rejects, German, call, to, boycott, Briti...	489	489
1	O B-PER O O O O O O O O O B-LOC O O O O O O O ...	Rare Hendrix song draft sells for almost $ 17,...	[O, B-PER, O, O, O, O, O, O, O, O, O, B-LOC, O...	[Rare, Hendrix, song, draft, sells, for, almos...	197	197
2	B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ...	China says Taiwan spoils atmosphere for talks ...	[B-LOC, O, B-LOC, O, O, O, O, O, O, B-LOC, O, ...	[China, says, Taiwan, spoils, atmosphere, for,...	248	248
3	B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ...	China says time right for Taiwan talks . </S> ...	[B-LOC, O, O, O, O, B-LOC, O, O, O, B-LOC, O, ...	[China, says, time, right, for, Taiwan, talks,...	80	80
4	B-MISC O O O O O O O O O O O B-LOC O O B-MISC ...	German July car registrations up 14.2 pct yr /...	[B-MISC, O, O, O, O, O, O, O, O, O, O, O, B-LO...	[German, July, car, registrations, up, 14.2, p...	235	235

# Longest document, measured in whitespace-separated tokens.
max_length = training_file["len_tokenized"].max()
print(max_length) # prints 1532 for this data; rounded up to 2048 for the padded sequence length

Testowanie wektoryzacji / dewektoryzacji tekstu

# Trial vectorizer: text standardization is disabled and the vocabulary is
# learned from the training documents.
vectorize_layer = tf.keras.layers.TextVectorization(standardize=None)
vectorize_layer.adapt(training_file["document"])
# Encode one sample document (row 20) to inspect the resulting token ids.
print(vectorize_layer(training_file["document"][20]))

tf.Tensor(
[18792   316  1335   896     8   479  7287   284     3     2 18492     4
 11364     3     2   137     2 18793 18637 20290   346    15    14    68
    27     9  1335  9461    59  3210    42  5299   507     6    52  4906
    71     7    64  1712   554    49   540     3     2    20   132    15
    27   257     5   540     4    60   536   232    18     4    37  1257
    52   234    71  1398  1164     6    64  2541 23235    65   880  5156
   280  3526     3     2    20  5156    40  1257    17    52 22125    71
     3     2  2016 18381     4   449   834  1318     6     5 13472    12
  1339  2356   132     4    15     5  9461    13  1240    42  2542     8
  2525     5   132    16  8166   666   724  1190    12  2129   618   622
  5276    12   836     3    13     2], shape=(126,), dtype=int64)

# Whitespace token count of the same sample document, to compare with the
# length of its vectorized form.
len(training_file["document"][20].split())

# Inspect the learned vocabulary; a token's position in this list is the id
# the vectorizer emits for it.
vectorize_layer.get_vocabulary()

['',
 '[UNK]',
 '</S>',
 '.',
 ',',
 'the',
 'of',
 'in',
 'to',
 'a',
 ')',
 '(',
 'and',
 '"',
 'on',
 'said',
 "'s",
 'for',
 '1',
 '-',
 'The',
 'was',
 '2',
 '0',
 '3',
 'at',
 'with',
 'that',
 'from',
 'by',
 'is',
 ':',
 'as',
 'he',
 '4',
 'had',
 'has',
 'it',
 'his',
 'not',
 'were',
 'be',
 'an',
 'have',
 'after',
 'who',
 'will',
 '5',
 'but',
 'first',
 'U.S.',
 'been',
 '$',
 '--',
 'two',
 'their',
 'are',
 '6',
 'beat',
 'would',
 'which',
 'up',
 'I',
 'they',
 'its',
 'percent',
 'year',
 'out',
 'Thursday',
 'this',
 'last',
 'million',
 'over',
 'Wednesday',
 'one',
 '7',
 'government',
 'against',
 '/',
 'police',
 'when',
 'second',
 'also',
 'Tuesday',
 'He',
 'It',
 'A',
 'three',
 'told',
 'new',
 '10',
 'Monday',
 'or',
 'about',
 'Friday',
 'people',
 'In',
 'her',
 '9',
 '1996-08-28',
 'no',
 'won',
 'we',
 'New',
 'into',
 'under',
 'some',
 'Sunday',
 'But',
 '8',
 'more',
 'before',
 'week',
 "'",
 'time',
 'than',
 'market',
 'could',
 'Germany',
 'points',
 'We',
 'between',
 'Australia',
 'years',
 'since',
 'Britain',
 'other',
 'AT',
 'SOCCER',
 'played',
 'all',
 'state',
 'company',
 'France',
 'England',
 'Saturday',
 'only',
 '1996-08-22',
 'officials',
 'group',
 '1996-08-29',
 'there',
 'round',
 '1996',
 'South',
 'Minister',
 '1996-08-27',
 '11',
 'off',
 'match',
 '13',
 'six',
 'four',
 'down',
 '6-4',
 '6-3',
 'because',
 '21',
 'five',
 '15',
 'him',
 'Spain',
 '1996-08-26',
 'next',
 'President',
 'official',
 'former',
 'she',
 'home',
 'United',
 'third',
 'do',
 'spokesman',
 'just',
 'games',
 'expected',
 'did',
 'day',
 'win',
 'through',
 'statement',
 'made',
 'NEW',
 '70',
 '12',
 '1996-08-23',
 'them',
 'lost',
 '14',
 'world',
 'where',
 '6-2',
 '20',
 'September',
 'Russian',
 'July',
 'shares',
 "n't",
 'if',
 'back',
 'RESULTS',
 'Italy',
 'YORK',
 'China',
 'August',
 'president',
 'Cup',
 '3.',
 '2.',
 'DIVISION',
 '1.',
 'Clinton',
 'British',
 'while',
 'seconds',
 'any',
 'LONDON',
 'Japan',
 'reported',
 'billion',
 '69',
 'matches',
 'v',
 'team',
 'month',
 'Russia',
 'division',
 'Pakistan',
 'meeting',
 'being',
 'They',
 'London',
 'June',
 'European',
 '30',
 'news',
 'added',
 'German',
 '71',
 '1996-08-25',
 'still',
 'peace',
 'metres',
 'half',
 'Results',
 'At',
 '1/2',
 'talks',
 'set',
 'earlier',
 'tonnes',
 'killed',
 'season',
 'now',
 'Sweden',
 'take',
 'held',
 'during',
 'Reuters',
 'should',
 'part',
 'around',
 'India',
 'party',
 'elections',
 'National',
 'took',
 'game',
 'Bank',
 'soccer',
 'number',
 'minutes',
 'lead',
 'innings',
 'early',
 'capital',
 '68',
 '6-1',
 'saying',
 'end',
 'due',
 'days',
 'b',
 '7-6',
 'results',
 'Open',
 '100',
 'so',
 'foreign',
 'you',
 'political',
 'per',
 'international',
 'final',
 'can',
 'York',
 'West',
 'Belgium',
 '22',
 'well',
 'victory',
 'most',
 'Newsroom',
 'French',
 'Netherlands',
 '50',
 'visit',
 'seven',
 'country',
 'champion',
 'Iraq',
 '25',
 'our',
 'minute',
 'Israel',
 'American',
 'says',
 'left',
 'Czech',
 'Africa',
 '66',
 '1996-08-24',
 'profit',
 'play',
 'LEAGUE',
 '4.',
 'vs.',
 'league',
 '67',
 '6.',
 '5.',
 'very',
 'local',
 'leader',
 'Republic',
 '7-5',
 '24',
 '1995',
 'war',
 'same',
 'go',
 'found',
 'support',
 'run',
 'newsroom',
 'close',
 'Inc',
 'then',
 'say',
 'meet',
 'man',
 'called',
 'World',
 'States',
 'CHICAGO',
 'what',
 'town',
 'singles',
 'prices',
 'military',
 'lower',
 'eight',
 'both',
 'ago',
 '64',
 'runs',
 'put',
 'newspaper',
 'deal',
 'bank',
 'Moscow',
 'Mark',
 '72',
 'trade',
 'rate',
 'race',
 'make',
 'goals',
 'cents',
 'St',
 'OF',
 'Men',
 '60',
 '16',
 'pct',
 'months',
 'issue',
 'gave',
 'behind',
 'There',
 'Prime',
 'May',
 'opposition',
 'minister',
 'good',
 'ended',
 'city',
 'Women',
 'Michael',
 'League',
 'Hong',
 'FIRST',
 '75',
 'tournament',
 'report',
 'rebels',
 'leaders',
 'Iraqi',
 'Dutch',
 'weekend',
 'until',
 'security',
 'price',
 'plan',
 'northern',
 'net',
 'near',
 'late',
 'get',
 'dollar',
 'agreed',
 'Kong',
 'Australian',
 '74',
 '7.',
 'top',
 'record',
 'players',
 'going',
 'agency',
 'Attendance',
 'African',
 ';',
 '73',
 'want',
 'start',
 'refugees',
 'miles',
 'drawn',
 'another',
 'Sri',
 'Paul',
 'taking',
 'sales',
 'place',
 'office',
 'my',
 'economic',
 'court',
 'chief',
 'arrested',
 'SAN',
 'John',
 'Democratic',
 'David',
 'CRICKET',
 '8.',
 'those',
 'quoted',
 'demand',
 'championship',
 'allowed',
 'Party',
 'Palestinian',
 'Israeli',
 'GMT',
 'Corp',
 'Commission',
 'Ahmed',
 'women',
 'several',
 'many',
 'including',
 'central',
 'already',
 'IN',
 'Foreign',
 'television',
 'km',
 'hit',
 'following',
 'de',
 'Yeltsin',
 'Martin',
 'Arafat',
 '28',
 '17',
 'southern',
 'men',
 'may',
 'later',
 'forces',
 'fell',
 'authorities',
 'ahead',
 'Union',
 'M.',
 'Dole',
 '31',
 '26',
 '1-0',
 'work',
 'whether',
 'weeks',
 'way',
 'troops',
 'reporters',
 'loss',
 'hours',
 'election',
 'came',
 'announced',
 'Brazil',
 '19',
 'vs',
 'return',
 'parliament',
 'night',
 'higher',
 'general',
 'closed',
 'Zealand',
 'Finland',
 'Chicago',
 '65',
 '23',
 '1994',
 '18',
 '...',
 'went',
 'test',
 'share',
 'power',
 'plans',
 'national',
 'decision',
 'began',
 'agreement',
 'This',
 'trading',
 'quarter',
 'oil',
 'north',
 'morning',
 'ministry',
 'like',
 'head',
 'few',
 'countries',
 'away',
 'asked',
 'Washington',
 'Police',
 'Lebed',
 '1997',
 'taken',
 'money',
 'main',
 'leading',
 'index',
 'fighting',
 'Sydney',
 'Olympic',
 'English',
 'Austria',
 'such',
 'signed',
 'side',
 'scored',
 'rights',
 'past',
 'much',
 'major',
 'hits',
 'current',
 'c',
 'business',
 'budget',
 'army',
 'U.N.',
 'STANDINGS',
 'Canada',
 '63',
 'think',
 'nine',
 'growth',
 'area',
 'Ukraine',
 'Standings',
 'Europe',
 'East',
 '40',
 'winning',
 'total',
 'strike',
 'region',
 'recent',
 'previous',
 'own',
 'draw',
 'campaign',
 'attack',
 'accused',
 'Two',
 'On',
 'Lanka',
 'Co',
 '96',
 '62',
 'working',
 'without',
 'vote',
 'these',
 'seen',
 'plane',
 'led',
 'hold',
 'high',
 'future',
 'died',
 'control',
 'club',
 'cash',
 'best',
 'available',
 'again',
 'White',
 'PCT',
 'Ireland',
 'Akram',
 '9.',
 '27',
 '---',
 'wickets',
 'tour',
 'sent',
 'right',
 'released',
 'might',
 'little',
 'help',
 'give',
 'fourth',
 'failed',
 'does',
 'conference',
 'ceasefire',
 'case',
 'ban',
 'Paris',
 'March',
 'Dutroux',
 "'re",
 'us',
 'started',
 'prime',
 'period',
 'overs',
 'me',
 'manager',
 'long',
 'least',
 'embassy',
 'disease',
 'cut',
 'champions',
 'average',
 'No',
 'Italian',
 'City',
 'An',
 '29',
 'tennis',
 'stories',
 'service',
 'production',
 'planned',
 'order',
 'members',
 'free',
 'airport',
 'across',
 'Wasim',
 'Thomas',
 'October',
 'Leading',
 'Kurdish',
 'Costa',
 'Chechnya',
 'Aug',
 'Ajax',
 '59',
 'yet',
 'strong',
 'shot',
 'short',
 'rose',
 'public',
 'press',
 'given',
 'declared',
 'children',
 'bonds',
 'Slovakia',
 'San',
 'Romania',
 'Republican',
 'Ministry',
 'Jordan',
 'Bosnian',
 'Bosnia',
 'BASEBALL',
 'April',
 '54',
 '10.',
 'trying',
 'tabulate',
 'stock',
 'standings',
 'seed',
 'reports',
 'possible',
 'must',
 'markets',
 'interest',
 'hospital',
 'further',
 'State',
 'Moslem',
 'Jerusalem',
 'If',
 'CITY',
 'Amsterdam',
 'A.',
 'woman',
 'used',
 'term',
 'series',
 'received',
 'rates',
 'opening',
 'law',
 'known',
 'industry',
 'guerrillas',
 'forced',
 'fifth',
 'face',
 'death',
 'come',
 'coach',
 'clear',
 'charges',
 'brought',
 'Taiwan',
 'TENNIS',
 'She',
 'Robert',
 'Poland',
 'Peter',
 'Nigeria',
 'Ltd',
 'Kenya',
 '2-0',
 'yen',
 'train',
 'squad',
 'small',
 'showed',
 'private',
 'point',
 'passengers',
 'old',
 'likely',
 'injured',
 'immediately',
 'estimated',
 'details',
 'despite',
 'date',
 'companies',
 'call',
 'Turkey',
 'That',
 'PARIS',
 'Argentina',
 '48',
 '1,000',
 '*',
 "'S",
 'workers',
 'use',
 'trip',
 'result',
 'process',
 'policy',
 'named',
 'level',
 'latest',
 'human',
 'groups',
 'got',
 'forecast',
 'figures',
 'each',
 'daily',
 'contract',
 'captain',
 'better',
 'action',
 'Wimbledon',
 'One',
 'North',
 'Nations',
 'L',
 'Japanese',
 'Iran',
 'Egypt',
 'California',
 '76',
 '61',
 '0-0',
 'unless',
 'soon',
 'sold',
 'seeding',
 'see',
 'rise',
 'prison',
 'pay',
 'how',
 'holiday',
 'halftime',
 'force',
 'financial',
 'exports',
 'earnings',
 'believed',
 'analysts',
 'Younis',
 'Waqar',
 'TORONTO',
 'Serb',
 'PSV',
 'Mushtaq',
 'Croft',
 'Belgian',
 'BALTIMORE',
 'Atlanta',
 '6-0',
 'within',
 'violence',
 'today',
 'title',
 'times',
 'straight',
 'scheduled',
 'rule',
 'road',
 'pound',
 'playing',
 'nearly',
 'making',
 'levels',
 'convention',
 'confirmed',
 'coming',
 'chairman',
 'border',
 'Total',
 'TO',
 'Security',
 'S.',
 'International',
 'Exchange',
 '56',
 'tried',
 'struck',
 'south',
 'services',
 'senior',
 'reached',
 'position',
 'nuclear',
 'met',
 'message',
 'know',
 'keep',
 'inning',
 'independence',
 'illegal',
 'homer',
 'gold',
 'completed',
 'comment',
 'charged',
 'buy',
 'Switzerland',
 'Saudi',
 'OPEN',
 'Net',
 'Mullally',
 'Khan',
 'Indian',
 'Halftime',
 'Grand',
 'First',
 'Central',
 'Bill',
 'AMSTERDAM',
 '55',
 '53',
 'wife',
 'wheat',
 'tie',
 'sell',
 'rebel',
 'problem',
 'prefix',
 'poor',
 'percentage',
 'parties',
 'outside',
 'opened',
 'letter',
 'kept',
 'island',
 'here',
 'health',
 'ground',
 'full',
 'even',
 'course',
 'continue',
 'conditions',
 'civil',
 'change',
 'centre',
 'based',
 'attacks',
 'arrived',
 'areas',
 'aggregate',
 'able',
 'PUK',
 'OSCE',
 'Netanyahu',
 'Mexico',
 'Grozny',
 'Group',
 'FOR',
 'Chinese',
 'Association',
 'After',
 '58',
 '45',
 '2-1',
 '1-1',
 "'m",
 'winner',
 'village',
 'treaty',
 'too',
 'system',
 'step',
 'stage',
 'source',
 'returned',
 'radio',
 'penalty',
 'paper',
 'needed',
 'less',
 'leg',
 'leave',
 'himself',
 'great',
 'goal',
 'flight',
 'economy',
 'director',
 'denotes',
 'denied',
 'break',
 'bond',
 'big',
 'according',
 'Williams',
 'Turkish',
 'Swiss',
 'Result',
 'PRESS',
 'Johnson',
 'House',
 'FRANCISCO',
 'EU',
 'DIGEST',
 'BOSTON',
 '77',
 '42',
 ...]

import numpy as np
# Round-trip check: turn the token ids of document 20 back into words by
# indexing the vocabulary array with them.
vocabulary = vectorize_layer.get_vocabulary()
vocab_arr = np.asarray(vocabulary)
" ".join(vocab_arr[vectorize_layer(training_file["document"][20])])

'Kindercare says debt buy to hit Q1 results . </S> MONTGOMERY , Ala . </S> 1996-08-22 </S> KinderCare Learning Centers Inc said on Thursday that a debt buyback would mean an extraordinary loss of $ 1.2 million in its fiscal 1997 first quarter . </S> The company said that during the quarter , which began June 1 , it bought $ 30 million par value of its outstanding 10-3/8 percent senior notes due 2001 . </S> The notes were bought for $ 31.5 million . </S> Philip Maslowe , chief financial officer of the preschool and child care company , said the buyback " offered an opportunity to reduce the company \'s weighted average interest costs and improve future cash flows and earnings . " </S>'

# Original text of document 20, for comparison with the decoded string.
training_file["document"][20]

'Kindercare says debt buy to hit Q1 results . </S> MONTGOMERY , Ala . </S> 1996-08-22 </S> KinderCare Learning Centers Inc said on Thursday that a debt buyback would mean an extraordinary loss of $ 1.2 million in its fiscal 1997 first quarter . </S> The company said that during the quarter , which began June 1 , it bought $ 30 million par value of its outstanding 10-3/8 percent senior notes due 2001 . </S> The notes were bought for $ 31.5 million . </S> Philip Maslowe , chief financial officer of the preschool and child care company , said the buyback " offered an opportunity to reduce the company \'s weighted average interest costs and improve future cash flows and earnings . " </S>'

# Separate vectorizer for input / output

# Bar chart with one bar per document showing its token count.
training_file["len_tokenized"].plot.bar()

<AxesSubplot:>

Padding przykładów do 2048 słów

# Input vectorizer: no standardization; every document is padded or truncated
# to exactly 2048 token ids.
sentence_vectorizer = tf.keras.layers.TextVectorization(standardize=None, output_sequence_length=2048)
sentence_vectorizer.adapt(training_file["document"])
# Sanity check on the sample document (row 20).
print(sentence_vectorizer(training_file["document"][20]))

tf.Tensor([18792   316  1335 ...     0     0     0], shape=(2048,), dtype=int64)

# Tag vectorizer: same settings as the input vectorizer, but its vocabulary is
# learned from the tag strings, so each tag maps to an integer class id.
label_vectorizer = tf.keras.layers.TextVectorization(standardize=None, output_sequence_length=2048)
label_vectorizer.adapt(training_file["label"])
# Sanity check on the tag sequence of the sample document (row 20).
print(label_vectorizer(training_file["label"][20]))

tf.Tensor([2 2 2 ... 0 0 0], shape=(2048,), dtype=int64)

# Tag vocabulary and its size (used as the number of output classes).
tags_list = label_vectorizer.get_vocabulary()
tags_length = label_vectorizer.vocabulary_size()

# Word vocabulary and its size (used to size the embedding table).
vocab_list = sentence_vectorizer.get_vocabulary()
vocab_length = sentence_vectorizer.vocabulary_size()

# Run each document and each tag string through its vectorizer, one row at a
# time, and store the resulting padded id tensor in a new column.
training_file["document_vectorized"] = training_file["document"].apply(sentence_vectorizer)
training_file["label_vectorized"] = training_file["label"].apply(label_vectorizer)

from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
# Hold out 20% of the documents for validation. The seed is fixed so the
# split, and therefore the reported validation metrics, is reproducible across
# notebook runs.
train, valid = train_test_split(training_file, test_size=0.2, random_state=42)

# Inputs: stack the per-row id tensors into one (documents, sequence) array.
train_x = np.stack(train["document_vectorized"].values)
# Targets: to_categorical accepts the whole id array and appends the class
# axis, so no per-row Python loop or second copy is needed.
train_y = to_categorical(np.stack(train["label_vectorized"].values), num_classes=tags_length)

val_x = np.stack(valid["document_vectorized"].values)
val_y = to_categorical(np.stack(valid["label_vectorized"].values), num_classes=tags_length)

# First validation input: token ids of one document.
print(val_x[0])

[2014   19  122 ...    0    0    0]

# Shape check: (training documents, sequence length).
train_x.shape

(756, 2048)

# Shape check: (training documents, sequence length, tag classes) after
# one-hot encoding.
train_y.shape

(756, 2048, 11)

# Token ids of the first training document.
train_x[0]

array([ 128,   19, 1368, ...,    0,    0,    0], dtype=int64)

# One-hot tag rows of the first training document (one row per position).
train_y[0]

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

from keras.optimizers import Adam
import keras.layers as layers
import keras


def create_model(sequence_length=2048, embedding_dim=128, lstm_units=256):
    """Build and compile the per-token tagging network.

    Architecture: Embedding -> LSTM emitting one output per time step ->
    softmax over the tag classes applied at every step. The defaults
    reproduce the previously hard-coded configuration, so ``create_model()``
    builds the same network as before.

    Args:
        sequence_length: Number of token ids in each input sequence.
        embedding_dim: Size of each token embedding vector.
        lstm_units: Number of units in the LSTM layer.

    Returns:
        A compiled ``keras.Model`` (categorical cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    input_layer = layers.Input(shape=(sequence_length,))
    # ``input_length`` is not passed: the Input layer already fixes the
    # sequence length and newer Keras releases deprecate that argument.
    # ``vocab_length + 1`` is kept so the embedding table shape is unchanged.
    # NOTE(review): no mask is applied here, so any zero-padded positions in
    # the inputs count towards loss and accuracy -- consider ``mask_zero=True``
    # if the metrics should cover real tokens only.
    embedding_layer = layers.Embedding(
        input_dim=vocab_length + 1,
        output_dim=embedding_dim,
    )(input_layer)
    lstm_layer = layers.LSTM(lstm_units, return_sequences=True)(embedding_layer)
    output_layer = layers.TimeDistributed(
        layers.Dense(tags_length, activation="softmax")
    )(lstm_layer)
    model = keras.Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    return model
# Instantiate the network and print its layer / parameter overview.
model = create_model()
model.summary()

Model: "model_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_16 (InputLayer)       [(None, 2048)]            0         
                                                                 
 embedding_15 (Embedding)    (None, 2048, 128)         3024256   
                                                                 
 lstm_20 (LSTM)              (None, 2048, 256)         394240    
                                                                 
 time_distributed_18 (TimeDi  (None, 2048, 11)         2827      
 stributed)                                                      
                                                                 
=================================================================
Total params: 3,421,323
Trainable params: 3,421,323
Non-trainable params: 0
_________________________________________________________________

# Stop training once val_loss has not improved for 3 epochs in a row and
# restore the weights of the best epoch.
callback = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=3, restore_best_weights=True)
# Train for at most 50 epochs, evaluating on the validation arrays each epoch.
history = model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=50, callbacks=[callback])

Epoch 1/50
24/24 [==============================] - 29s 1s/step - loss: 0.6602 - accuracy: 0.8703 - val_loss: 0.2673 - val_accuracy: 0.9425
Epoch 2/50
24/24 [==============================] - 27s 1s/step - loss: 0.2500 - accuracy: 0.9653 - val_loss: 0.1613 - val_accuracy: 0.9781
Epoch 3/50
24/24 [==============================] - 28s 1s/step - loss: 0.1062 - accuracy: 0.9790 - val_loss: 0.0984 - val_accuracy: 0.9793
Epoch 4/50
24/24 [==============================] - 28s 1s/step - loss: 0.0920 - accuracy: 0.9806 - val_loss: 0.0936 - val_accuracy: 0.9799
Epoch 5/50
24/24 [==============================] - 28s 1s/step - loss: 0.0874 - accuracy: 0.9812 - val_loss: 0.0901 - val_accuracy: 0.9800
Epoch 6/50
24/24 [==============================] - 27s 1s/step - loss: 0.0828 - accuracy: 0.9816 - val_loss: 0.0867 - val_accuracy: 0.9804
Epoch 7/50
24/24 [==============================] - 27s 1s/step - loss: 0.0774 - accuracy: 0.9818 - val_loss: 0.0805 - val_accuracy: 0.9804
Epoch 8/50
24/24 [==============================] - 27s 1s/step - loss: 0.0715 - accuracy: 0.9819 - val_loss: 0.0741 - val_accuracy: 0.9807
Epoch 9/50
24/24 [==============================] - 27s 1s/step - loss: 0.0628 - accuracy: 0.9822 - val_loss: 0.0660 - val_accuracy: 0.9808
Epoch 10/50
24/24 [==============================] - 27s 1s/step - loss: 0.0543 - accuracy: 0.9826 - val_loss: 0.0579 - val_accuracy: 0.9815
Epoch 11/50
24/24 [==============================] - 27s 1s/step - loss: 0.0465 - accuracy: 0.9843 - val_loss: 0.0500 - val_accuracy: 0.9851
Epoch 12/50
24/24 [==============================] - 27s 1s/step - loss: 0.0385 - accuracy: 0.9879 - val_loss: 0.0453 - val_accuracy: 0.9867
Epoch 13/50
24/24 [==============================] - 27s 1s/step - loss: 0.0330 - accuracy: 0.9901 - val_loss: 0.0413 - val_accuracy: 0.9873
Epoch 14/50
24/24 [==============================] - 27s 1s/step - loss: 0.0298 - accuracy: 0.9909 - val_loss: 0.0395 - val_accuracy: 0.9887
Epoch 15/50
24/24 [==============================] - 27s 1s/step - loss: 0.0257 - accuracy: 0.9922 - val_loss: 0.0380 - val_accuracy: 0.9887
Epoch 16/50
24/24 [==============================] - 27s 1s/step - loss: 0.0241 - accuracy: 0.9924 - val_loss: 0.0362 - val_accuracy: 0.9887
Epoch 17/50
24/24 [==============================] - 27s 1s/step - loss: 0.0215 - accuracy: 0.9935 - val_loss: 0.0344 - val_accuracy: 0.9897
Epoch 18/50
24/24 [==============================] - 27s 1s/step - loss: 0.0191 - accuracy: 0.9942 - val_loss: 0.0335 - val_accuracy: 0.9898
Epoch 19/50
24/24 [==============================] - 27s 1s/step - loss: 0.0173 - accuracy: 0.9948 - val_loss: 0.0322 - val_accuracy: 0.9906
Epoch 20/50
24/24 [==============================] - 28s 1s/step - loss: 0.0160 - accuracy: 0.9952 - val_loss: 0.0322 - val_accuracy: 0.9908
Epoch 21/50
24/24 [==============================] - 27s 1s/step - loss: 0.0147 - accuracy: 0.9958 - val_loss: 0.0338 - val_accuracy: 0.9900
Epoch 22/50
24/24 [==============================] - 27s 1s/step - loss: 0.0133 - accuracy: 0.9962 - val_loss: 0.0307 - val_accuracy: 0.9915
Epoch 23/50
24/24 [==============================] - 27s 1s/step - loss: 0.0117 - accuracy: 0.9968 - val_loss: 0.0303 - val_accuracy: 0.9918
Epoch 24/50
24/24 [==============================] - 27s 1s/step - loss: 0.0105 - accuracy: 0.9973 - val_loss: 0.0289 - val_accuracy: 0.9922
Epoch 25/50
24/24 [==============================] - 27s 1s/step - loss: 0.0094 - accuracy: 0.9977 - val_loss: 0.0315 - val_accuracy: 0.9917
Epoch 26/50
24/24 [==============================] - 27s 1s/step - loss: 0.0084 - accuracy: 0.9980 - val_loss: 0.0300 - val_accuracy: 0.9924
Epoch 27/50
24/24 [==============================] - 27s 1s/step - loss: 0.0073 - accuracy: 0.9984 - val_loss: 0.0295 - val_accuracy: 0.9926

# Tag vocabulary as an array so predicted class ids can index it directly.
tag_list_numpy = np.asarray(tags_list)
def get_tag_from_int(input_integer):
    """Return the tag stored at position ``input_integer`` of the tag vocabulary."""
    tag = tag_list_numpy[input_integer]
    return tag
def get_ner_output_single_sentence(input_sentence):
    """Predict one tag for every whitespace-separated token of a text.

    Args:
        input_sentence: Text whose tokens are separated by whitespace.

    Returns:
        A list of tag strings (plain ``str``), one per token, in token order.
        The list is never longer than the vectorizer's fixed output length.
    """
    sentence_length = len(input_sentence.split())
    vectorized = sentence_vectorizer(input_sentence)
    # The model expects a leading batch axis; feed a batch of one.
    model_output = model(tf.expand_dims(vectorized, 0))
    # Most likely class per position. Only the first ``sentence_length``
    # positions belong to real tokens, so the rest is dropped before decoding.
    tag_ids = np.argmax(model_output, axis=2).flatten()[:sentence_length]
    # One vectorized lookup instead of a per-position Python loop.
    return tag_list_numpy[tag_ids].tolist()
#get_ner_output_single_sentence("China says time right for Taiwan talks . </S> BEIJING 1996-08-22 </S> China has said it was time for political talks with Taiwan and that the rival island should take practical steps towards that goal . </S> Consultations should be held to set the time and format of the talks , the official Xinhua news agency quoted Tang Shubei , executive vice chairman of the Association for Relations Across the Taiwan Straits , as saying late on Wednesday . </S>")

def test_sentence(sentence):
    """Pair every whitespace-separated token of ``sentence`` with its predicted tag.

    Returns:
        A list of ``(token, tag)`` tuples; pairing stops at the shorter of the
        token list and the tag list.
    """
    predicted_tags = get_ner_output_single_sentence(sentence)
    return [(token, tag) for token, tag in zip(sentence.split(), predicted_tags)]

# Manual check: tag a sample document and display the resulting (token, tag) pairs.
test_sentence("China says time right for Taiwan talks . </S> BEIJING 1996-08-22 </S> China has said it was time for political talks with Taiwan and that the rival island should take practical steps towards that goal . </S> Consultations should be held to set the time and format of the talks , the official Xinhua news agency quoted Tang Shubei , executive vice chairman of the Association for Relations Across the Taiwan Straits , as saying late on Wednesday . </S>")

[('China', 'B-LOC'),
 ('says', 'O'),
 ('time', 'O'),
 ('right', 'O'),
 ('for', 'O'),
 ('Taiwan', 'B-LOC'),
 ('talks', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('BEIJING', 'B-LOC'),
 ('1996-08-22', 'O'),
 ('</S>', 'O'),
 ('China', 'B-LOC'),
 ('has', 'O'),
 ('said', 'O'),
 ('it', 'O'),
 ('was', 'O'),
 ('time', 'O'),
 ('for', 'O'),
 ('political', 'O'),
 ('talks', 'O'),
 ('with', 'O'),
 ('Taiwan', 'B-LOC'),
 ('and', 'O'),
 ('that', 'O'),
 ('the', 'O'),
 ('rival', 'O'),
 ('island', 'O'),
 ('should', 'O'),
 ('take', 'O'),
 ('practical', 'O'),
 ('steps', 'O'),
 ('towards', 'O'),
 ('that', 'O'),
 ('goal', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('Consultations', 'O'),
 ('should', 'O'),
 ('be', 'O'),
 ('held', 'O'),
 ('to', 'O'),
 ('set', 'O'),
 ('the', 'O'),
 ('time', 'O'),
 ('and', 'O'),
 ('format', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('talks', 'O'),
 (',', 'O'),
 ('the', 'O'),
 ('official', 'O'),
 ('Xinhua', 'B-ORG'),
 ('news', 'O'),
 ('agency', 'O'),
 ('quoted', 'O'),
 ('Tang', 'B-PER'),
 ('Shubei', 'I-PER'),
 (',', 'O'),
 ('executive', 'O'),
 ('vice', 'O'),
 ('chairman', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('Association', 'B-ORG'),
 ('for', 'I-ORG'),
 ('Relations', 'O'),
 ('Across', 'I-ORG'),
 ('the', 'I-ORG'),
 ('Taiwan', 'I-ORG'),
 ('Straits', 'I-ORG'),
 (',', 'O'),
 ('as', 'O'),
 ('saying', 'O'),
 ('late', 'O'),
 ('on', 'O'),
 ('Wednesday', 'O'),
 ('.', 'O'),
 ('</S>', 'O')]

# Manual check on a longer sports report: display the predicted (token, tag) pairs.
test_sentence("SOCCER - LATE GOALS GIVE JAPAN WIN OVER SYRIA . </S> AL-AIN , United Arab Emirates 1996-12-06 </S> Two goals in the last six minutes gave holders Japan an uninspiring 2-1 Asian Cup victory over Syria on Friday . </S> Takuya Takagi headed the winner in the 88th minute of the group C game after goalkeeper Salem Bitar spoiled a mistake-free display by allowing the ball to slip under his body . </S> It was the second Syrian defensive blunder in four minutes . </S> Defender Hassan Abbas rose to intercept a long ball into the area in the 84th minute but only managed to divert it into the top corner of Bitar 's goal . </S> Syria had taken the lead from their first serious attack in the seventh minute . </S> Nader Jokhadar headed a cross from the right by Ammar Awad into the top right corner of Kenichi Shimokawa 's goal . </S> Japan then laid siege to the Syrian penalty area and had a goal disallowed for offside in the 16th minute . </S> A minute later , Bitar produced a good double save , first from Kazuyoshi Miura 's header and then blocked a Takagi follow-up shot . </S> Bitar saved well again from Miura in the 37th minute , parrying away his header from a corner . </S> Japan started the second half brightly but Bitar denied them an equaliser when he dived to his right to save Naoki Soma 's low drive in the 53rd minute . </S> Japan : 19 - Kenichi Shimokawa , 2 - Hiroshige Yanagimoto , 3 - Naoki Soma , 4 - Masami Ihara , 5 - Norio Omura , 6 - Motohiro Yamaguchi , 8 - Masakiyo Maezono ( 7 - Yasuto Honda 71 ) , 9 - Takuya Takagi , 10 - Hiroshi Nanami , 11 - Kazuyoshi Miura , 15 - Hiroaki Morishima ( 14 - Masayuki Okano 75 ) . </S> Syria : 24 - Salem Bitar , 3 - Bachar Srour ; 4 - Hassan Abbas , 5 - Tarek Jabban , 6 - Ammar Awad ( 9 - Louay Taleb 69 ) , 8 - Nihad al-Boushi , 10 - Mohammed Afash , 12 - Ali Dib , 13 - Abdul Latif Helou ( 17 - Ammar Rihawiy 46 ) , 14 - Khaled Zaher ; 16 - Nader Jokhadar . </S>")

tf.Tensor([  128    19 18713 ...     0     0     0], shape=(2048,), dtype=int64)
[[[3.0971142e-03 1.5280694e-03 9.8057139e-01 ... 3.6668889e-03
   1.4106639e-03 3.3225205e-03]
  [2.1369425e-04 1.2225067e-04 9.9616271e-01 ... 1.4002173e-03
   1.0539902e-04 2.7582867e-04]
  [6.3146334e-05 3.8070513e-05 9.9278271e-01 ... 2.4660169e-03
   5.7447112e-05 1.3038449e-04]
  ...
  [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794704e-08
   2.6146161e-08 5.0399006e-08]
  [9.9999696e-01 1.7784757e-08 2.5151198e-07 ... 1.3794731e-08
   2.6146161e-08 5.0399006e-08]
  [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794704e-08
   2.6146161e-08 5.0399006e-08]]]
(1, 2048, 11)
[2 2 2 ... 0 0 0]
2048

[('SOCCER', 'O'),
 ('-', 'O'),
 ('LATE', 'O'),
 ('GOALS', 'O'),
 ('GIVE', 'O'),
 ('JAPAN', 'O'),
 ('WIN', 'O'),
 ('OVER', 'O'),
 ('SYRIA', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('AL-AIN', 'O'),
 (',', 'O'),
 ('United', 'B-LOC'),
 ('Arab', 'I-LOC'),
 ('Emirates', 'I-LOC'),
 ('1996-12-06', 'O'),
 ('</S>', 'O'),
 ('Two', 'O'),
 ('goals', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('last', 'O'),
 ('six', 'O'),
 ('minutes', 'O'),
 ('gave', 'O'),
 ('holders', 'O'),
 ('Japan', 'B-LOC'),
 ('an', 'O'),
 ('uninspiring', 'O'),
 ('2-1', 'O'),
 ('Asian', 'B-LOC'),
 ('Cup', 'I-MISC'),
 ('victory', 'O'),
 ('over', 'O'),
 ('Syria', 'B-LOC'),
 ('on', 'O'),
 ('Friday', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('Takuya', 'O'),
 ('Takagi', 'O'),
 ('headed', 'O'),
 ('the', 'O'),
 ('winner', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('88th', 'O'),
 ('minute', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('group', 'O'),
 ('C', 'O'),
 ('game', 'O'),
 ('after', 'O'),
 ('goalkeeper', 'O'),
 ('Salem', 'O'),
 ('Bitar', 'O'),
 ('spoiled', 'O'),
 ('a', 'O'),
 ('mistake-free', 'O'),
 ('display', 'O'),
 ('by', 'O'),
 ('allowing', 'O'),
 ('the', 'O'),
 ('ball', 'O'),
 ('to', 'O'),
 ('slip', 'O'),
 ('under', 'O'),
 ('his', 'O'),
 ('body', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('It', 'O'),
 ('was', 'O'),
 ('the', 'O'),
 ('second', 'O'),
 ('Syrian', 'B-PER'),
 ('defensive', 'O'),
 ('blunder', 'O'),
 ('in', 'O'),
 ('four', 'O'),
 ('minutes', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('Defender', 'O'),
 ('Hassan', 'B-PER'),
 ('Abbas', 'I-PER'),
 ('rose', 'O'),
 ('to', 'O'),
 ('intercept', 'O'),
 ('a', 'O'),
 ('long', 'O'),
 ('ball', 'O'),
 ('into', 'O'),
 ('the', 'O'),
 ('area', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('84th', 'O'),
 ('minute', 'O'),
 ('but', 'O'),
 ('only', 'O'),
 ('managed', 'O'),
 ('to', 'O'),
 ('divert', 'O'),
 ('it', 'O'),
 ('into', 'O'),
 ('the', 'O'),
 ('top', 'O'),
 ('corner', 'O'),
 ('of', 'O'),
 ('Bitar', 'O'),
 ("'s", 'O'),
 ('goal', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('Syria', 'B-ORG'),
 ('had', 'O'),
 ('taken', 'O'),
 ('the', 'O'),
 ('lead', 'O'),
 ('from', 'O'),
 ('their', 'O'),
 ('first', 'O'),
 ('serious', 'O'),
 ('attack', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('seventh', 'O'),
 ('minute', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('Nader', 'O'),
 ('Jokhadar', 'O'),
 ('headed', 'O'),
 ('a', 'O'),
 ('cross', 'O'),
 ('from', 'O'),
 ('the', 'O'),
 ('right', 'O'),
 ('by', 'O'),
 ('Ammar', 'O'),
 ('Awad', 'O'),
 ('into', 'O'),
 ('the', 'O'),
 ('top', 'O'),
 ('right', 'O'),
 ('corner', 'O'),
 ('of', 'O'),
 ('Kenichi', 'O'),
 ('Shimokawa', 'O'),
 ("'s", 'O'),
 ('goal', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('Japan', 'B-LOC'),
 ('then', 'O'),
 ('laid', 'O'),
 ('siege', 'O'),
 ('to', 'O'),
 ('the', 'O'),
 ('Syrian', 'B-ORG'),
 ('penalty', 'O'),
 ('area', 'O'),
 ('and', 'O'),
 ('had', 'O'),
 ('a', 'O'),
 ('goal', 'O'),
 ('disallowed', 'O'),
 ('for', 'O'),
 ('offside', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('16th', 'O'),
 ('minute', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('A', 'O'),
 ('minute', 'O'),
 ('later', 'O'),
 (',', 'O'),
 ('Bitar', 'O'),
 ('produced', 'O'),
 ('a', 'O'),
 ('good', 'O'),
 ('double', 'O'),
 ('save', 'O'),
 (',', 'O'),
 ('first', 'O'),
 ('from', 'O'),
 ('Kazuyoshi', 'O'),
 ('Miura', 'O'),
 ("'s", 'O'),
 ('header', 'O'),
 ('and', 'O'),
 ('then', 'O'),
 ('blocked', 'O'),
 ('a', 'O'),
 ('Takagi', 'O'),
 ('follow-up', 'O'),
 ('shot', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('Bitar', 'O'),
 ('saved', 'O'),
 ('well', 'O'),
 ('again', 'O'),
 ('from', 'O'),
 ('Miura', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('37th', 'O'),
 ('minute', 'O'),
 (',', 'O'),
 ('parrying', 'O'),
 ('away', 'O'),
 ('his', 'O'),
 ('header', 'O'),
 ('from', 'O'),
 ('a', 'O'),
 ('corner', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('Japan', 'B-ORG'),
 ('started', 'O'),
 ('the', 'O'),
 ('second', 'O'),
 ('half', 'O'),
 ('brightly', 'O'),
 ('but', 'O'),
 ('Bitar', 'O'),
 ('denied', 'O'),
 ('them', 'O'),
 ('an', 'O'),
 ('equaliser', 'O'),
 ('when', 'O'),
 ('he', 'O'),
 ('dived', 'O'),
 ('to', 'O'),
 ('his', 'O'),
 ('right', 'O'),
 ('to', 'O'),
 ('save', 'O'),
 ('Naoki', 'O'),
 ('Soma', 'O'),
 ("'s", 'O'),
 ('low', 'O'),
 ('drive', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('53rd', 'O'),
 ('minute', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('Japan', 'B-LOC'),
 (':', 'O'),
 ('19', 'O'),
 ('-', 'O'),
 ('Kenichi', 'O'),
 ('Shimokawa', 'O'),
 (',', 'O'),
 ('2', 'O'),
 ('-', 'O'),
 ('Hiroshige', 'O'),
 ('Yanagimoto', 'O'),
 (',', 'O'),
 ('3', 'O'),
 ('-', 'O'),
 ('Naoki', 'O'),
 ('Soma', 'O'),
 (',', 'O'),
 ('4', 'O'),
 ('-', 'O'),
 ('Masami', 'O'),
 ('Ihara', 'O'),
 (',', 'O'),
 ('5', 'O'),
 ('-', 'O'),
 ('Norio', 'O'),
 ('Omura', 'O'),
 (',', 'O'),
 ('6', 'O'),
 ('-', 'O'),
 ('Motohiro', 'O'),
 ('Yamaguchi', 'O'),
 (',', 'O'),
 ('8', 'O'),
 ('-', 'O'),
 ('Masakiyo', 'O'),
 ('Maezono', 'O'),
 ('(', 'O'),
 ('7', 'O'),
 ('-', 'O'),
 ('Yasuto', 'O'),
 ('Honda', 'B-ORG'),
 ('71', 'O'),
 (')', 'O'),
 (',', 'O'),
 ('9', 'O'),
 ('-', 'O'),
 ('Takuya', 'O'),
 ('Takagi', 'O'),
 (',', 'O'),
 ('10', 'O'),
 ('-', 'O'),
 ('Hiroshi', 'O'),
 ('Nanami', 'O'),
 (',', 'O'),
 ('11', 'O'),
 ('-', 'O'),
 ('Kazuyoshi', 'O'),
 ('Miura', 'O'),
 (',', 'O'),
 ('15', 'O'),
 ('-', 'O'),
 ('Hiroaki', 'O'),
 ('Morishima', 'O'),
 ('(', 'O'),
 ('14', 'O'),
 ('-', 'O'),
 ('Masayuki', 'O'),
 ('Okano', 'O'),
 ('75', 'O'),
 (')', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('Syria', 'B-PER'),
 (':', 'O'),
 ('24', 'O'),
 ('-', 'O'),
 ('Salem', 'O'),
 ('Bitar', 'O'),
 (',', 'O'),
 ('3', 'O'),
 ('-', 'O'),
 ('Bachar', 'O'),
 ('Srour', 'O'),
 (';', 'O'),
 ('4', 'O'),
 ('-', 'O'),
 ('Hassan', 'B-PER'),
 ('Abbas', 'I-PER'),
 (',', 'O'),
 ('5', 'O'),
 ('-', 'O'),
 ('Tarek', 'O'),
 ('Jabban', 'O'),
 (',', 'O'),
 ('6', 'O'),
 ('-', 'O'),
 ('Ammar', 'O'),
 ('Awad', 'O'),
 ('(', 'O'),
 ('9', 'O'),
 ('-', 'O'),
 ('Louay', 'O'),
 ('Taleb', 'O'),
 ('69', 'O'),
 (')', 'O'),
 (',', 'O'),
 ('8', 'O'),
 ('-', 'O'),
 ('Nihad', 'O'),
 ('al-Boushi', 'O'),
 (',', 'O'),
 ('10', 'O'),
 ('-', 'O'),
 ('Mohammed', 'B-PER'),
 ('Afash', 'I-PER'),
 (',', 'O'),
 ('12', 'O'),
 ('-', 'O'),
 ('Ali', 'B-PER'),
 ('Dib', 'I-PER'),
 (',', 'O'),
 ('13', 'O'),
 ('-', 'O'),
 ('Abdul', 'B-PER'),
 ('Latif', 'I-PER'),
 ('Helou', 'O'),
 ('(', 'O'),
 ('17', 'O'),
 ('-', 'O'),
 ('Ammar', 'O'),
 ('Rihawiy', 'O'),
 ('46', 'O'),
 (')', 'O'),
 (',', 'O'),
 ('14', 'O'),
 ('-', 'O'),
 ('Khaled', 'B-PER'),
 ('Zaher', 'I-PER'),
 (';', 'O'),
 ('16', 'O'),
 ('-', 'O'),
 ('Nader', 'O'),
 ('Jokhadar', 'O'),
 ('.', 'O'),
 ('</S>', 'O')]

# Sample news article (space-separated tokens, sentences closed by </S>) used for a manual check.
news_string = """Mussolini 's granddaughter rejoins far-right party . </S> ROME 1996-12-06 </S> Alessandra Mussolini , the granddaughter of Italy 's Fascist dictator Benito Mussolini , said on Friday she had rejoined the far-right National Alliance ( AN ) party she quit over policy differences last month . </S> " I 've gone back , " she told a radio show shortly after AN leader Gianfranco Fini , who was being interviewed on the programme , said the row had been resolved . </S> " He did n't want to lose me and I did n't want to lose him . " </S> Fini told state radio RAI he met Mussolini thanks to the good offices of Giuseppe Tatarella , AN 's leader in the Chamber of Deputies ( lower house ) , and had overcome their differences . </S> Mussolini , 33 , resigned from the parliamentary party group for what she said were strictly political reasons . </S> The fiery politician , who is also a niece of screen star Sophia Loren , had accused AN leaders of stifling internal party debate . </S> Mussolini , who sits in the Chamber , told La Stampa newspaper last month after quitting AN 's parliamentary party that she was considering joining the neo-fascist Social Movement ( MS-Fiamma ) formed by some of the Duce 's World War Two followers . </S>"""

# Display the predicted (token, tag) pairs for the article.
test_sentence(news_string)

tf.Tensor([ 1 16  1 ...  0  0  0], shape=(2048,), dtype=int64)
[[[9.1573365e-02 8.5647009e-02 1.1034752e-01 ... 8.8930450e-02
   8.8644758e-02 8.9963131e-02]
  [5.5477720e-02 4.6575051e-02 5.2461910e-01 ... 6.4232960e-02
   4.4661559e-02 5.8426060e-02]
  [4.9609054e-02 4.3161135e-02 4.3743923e-01 ... 9.0816177e-02
   4.6578653e-02 5.5895649e-02]
  ...
  [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794731e-08
   2.6146161e-08 5.0399006e-08]
  [9.9999696e-01 1.7784757e-08 2.5151198e-07 ... 1.3794731e-08
   2.6146161e-08 5.0399006e-08]
  [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794731e-08
   2.6146161e-08 5.0399006e-08]]]
(1, 2048, 11)
[2 2 2 ... 0 0 0]
2048

[('Mussolini', 'O'),
 ("'s", 'O'),
 ('granddaughter', 'O'),
 ('rejoins', 'O'),
 ('far-right', 'O'),
 ('party', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('ROME', 'B-LOC'),
 ('1996-12-06', 'O'),
 ('</S>', 'O'),
 ('Alessandra', 'O'),
 ('Mussolini', 'O'),
 (',', 'O'),
 ('the', 'O'),
 ('granddaughter', 'O'),
 ('of', 'O'),
 ('Italy', 'B-LOC'),
 ("'s", 'O'),
 ('Fascist', 'O'),
 ('dictator', 'O'),
 ('Benito', 'B-PER'),
 ('Mussolini', 'I-PER'),
 (',', 'O'),
 ('said', 'O'),
 ('on', 'O'),
 ('Friday', 'O'),
 ('she', 'O'),
 ('had', 'O'),
 ('rejoined', 'O'),
 ('the', 'O'),
 ('far-right', 'O'),
 ('National', 'B-PER'),
 ('Alliance', 'I-PER'),
 ('(', 'O'),
 ('AN', 'O'),
 (')', 'O'),
 ('party', 'O'),
 ('she', 'O'),
 ('quit', 'O'),
 ('over', 'O'),
 ('policy', 'O'),
 ('differences', 'O'),
 ('last', 'O'),
 ('month', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('"', 'O'),
 ('I', 'O'),
 ("'ve", 'O'),
 ('gone', 'O'),
 ('back', 'O'),
 (',', 'O'),
 ('"', 'O'),
 ('she', 'O'),
 ('told', 'O'),
 ('a', 'O'),
 ('radio', 'O'),
 ('show', 'O'),
 ('shortly', 'O'),
 ('after', 'O'),
 ('AN', 'O'),
 ('leader', 'O'),
 ('Gianfranco', 'B-PER'),
 ('Fini', 'I-PER'),
 (',', 'O'),
 ('who', 'O'),
 ('was', 'O'),
 ('being', 'O'),
 ('interviewed', 'O'),
 ('on', 'O'),
 ('the', 'O'),
 ('programme', 'O'),
 (',', 'O'),
 ('said', 'O'),
 ('the', 'O'),
 ('row', 'O'),
 ('had', 'O'),
 ('been', 'O'),
 ('resolved', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('"', 'O'),
 ('He', 'O'),
 ('did', 'O'),
 ("n't", 'O'),
 ('want', 'O'),
 ('to', 'O'),
 ('lose', 'O'),
 ('me', 'O'),
 ('and', 'O'),
 ('I', 'O'),
 ('did', 'O'),
 ("n't", 'O'),
 ('want', 'O'),
 ('to', 'O'),
 ('lose', 'O'),
 ('him', 'O'),
 ('.', 'O'),
 ('"', 'O'),
 ('</S>', 'O'),
 ('Fini', 'O'),
 ('told', 'O'),
 ('state', 'O'),
 ('radio', 'O'),
 ('RAI', 'B-PER'),
 ('he', 'O'),
 ('met', 'O'),
 ('Mussolini', 'O'),
 ('thanks', 'O'),
 ('to', 'O'),
 ('the', 'O'),
 ('good', 'O'),
 ('offices', 'O'),
 ('of', 'O'),
 ('Giuseppe', 'B-PER'),
 ('Tatarella', 'I-PER'),
 (',', 'O'),
 ('AN', 'O'),
 ("'s", 'O'),
 ('leader', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('Chamber', 'B-PER'),
 ('of', 'O'),
 ('Deputies', 'O'),
 ('(', 'O'),
 ('lower', 'O'),
 ('house', 'O'),
 (')', 'O'),
 (',', 'O'),
 ('and', 'O'),
 ('had', 'O'),
 ('overcome', 'O'),
 ('their', 'O'),
 ('differences', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('Mussolini', 'O'),
 (',', 'O'),
 ('33', 'O'),
 (',', 'O'),
 ('resigned', 'O'),
 ('from', 'O'),
 ('the', 'O'),
 ('parliamentary', 'O'),
 ('party', 'O'),
 ('group', 'O'),
 ('for', 'O'),
 ('what', 'O'),
 ('she', 'O'),
 ('said', 'O'),
 ('were', 'O'),
 ('strictly', 'O'),
 ('political', 'O'),
 ('reasons', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('The', 'O'),
 ('fiery', 'O'),
 ('politician', 'O'),
 (',', 'O'),
 ('who', 'O'),
 ('is', 'O'),
 ('also', 'O'),
 ('a', 'O'),
 ('niece', 'O'),
 ('of', 'O'),
 ('screen', 'O'),
 ('star', 'O'),
 ('Sophia', 'B-PER'),
 ('Loren', 'I-PER'),
 (',', 'O'),
 ('had', 'O'),
 ('accused', 'O'),
 ('AN', 'O'),
 ('leaders', 'O'),
 ('of', 'O'),
 ('stifling', 'O'),
 ('internal', 'O'),
 ('party', 'O'),
 ('debate', 'O'),
 ('.', 'O'),
 ('</S>', 'O'),
 ('Mussolini', 'O'),
 (',', 'O'),
 ('who', 'O'),
 ('sits', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('Chamber', 'B-PER'),
 (',', 'O'),
 ('told', 'O'),
 ('La', 'B-ORG'),
 ('Stampa', 'I-ORG'),
 ('newspaper', 'O'),
 ('last', 'O'),
 ('month', 'O'),
 ('after', 'O'),
 ('quitting', 'O'),
 ('AN', 'O'),
 ("'s", 'O'),
 ('parliamentary', 'O'),
 ('party', 'O'),
 ('that', 'O'),
 ('she', 'O'),
 ('was', 'O'),
 ('considering', 'O'),
 ('joining', 'O'),
 ('the', 'O'),
 ('neo-fascist', 'O'),
 ('Social', 'B-ORG'),
 ('Movement', 'I-ORG'),
 ('(', 'O'),
 ('MS-Fiamma', 'O'),
 (')', 'O'),
 ('formed', 'O'),
 ('by', 'O'),
 ('some', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('Duce', 'O'),
 ("'s", 'O'),
 ('World', 'B-ORG'),
 ('War', 'I-ORG'),
 ('Two', 'O'),
 ('followers', 'O'),
 ('.', 'O'),
 ('</S>', 'O')]

# Persist the trained model to model_v2.keras.
model.save("model_v2.keras")

import keras
# Rebind `model` to a copy loaded back from the file saved above.
model = keras.models.load_model('model_v2.keras')

# Tag every non-blank line of the dev-0 input and write one space-joined
# line of tags per document to out.tsv.
with open("en-ner-conll-2003/dev-0/in.tsv", "r", encoding="utf-8") as f:
    lines = f.readlines()
processed = [
    " ".join(get_ner_output_single_sentence(x))
    for x in lines
    if x.strip()
]
with open('en-ner-conll-2003/dev-0/out.tsv', 'w', encoding="utf-8") as f:
    for line in processed:
        print(line, file=f)

ERROR:tensorflow:==================================
Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x00000262307DCA00>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "C:\Users\Adrian\miniconda3\lib\site-packages\keras\backend.py", line 5130, in <genexpr>
    ta.write(ta_index_to_write, out)  File "C:\Users\Adrian\miniconda3\lib\site-packages\tensorflow\python\util\tf_should_use.py", line 243, in wrapped
    return _add_should_use_warning(fn(*args, **kwargs),
==================================

# Tag every non-blank line of the test-A input and write one space-joined
# line of tags per document to out.tsv.
with open("en-ner-conll-2003/test-A/in.tsv", "r", encoding="utf-8") as f:
    lines = f.readlines()
processed = [
    " ".join(get_ner_output_single_sentence(x))
    for x in lines
    if x.strip()
]
with open('en-ner-conll-2003/test-A/out.tsv', 'w', encoding="utf-8") as f:
    for line in processed:
        print(line, file=f)

Czyszczenie tagów

# Collect every distinct tag that occurs in the dev-0 predictions.
tag_set = set()
with open("en-ner-conll-2003/dev-0/out.tsv", "r", encoding="utf-8") as f:
    lines = f.readlines()
for line in lines:
    # set.update ignores tags that are already present.
    tag_set.update(line.split())
print(tag_set)

{'B-LOC', 'I-LOC', 'O', 'I-MISC', 'B-ORG', 'B-PER', 'I-PER', 'I-ORG', 'B-MISC'}

# Continuation ("inside") tag -> tag that opens an entity of the same type.
inter_to_begin_mapping = dict(
    [
        ("I-LOC", "B-LOC"),
        ("I-MISC", "B-MISC"),
        ("I-ORG", "B-ORG"),
        ("I-PER", "B-PER"),
    ]
)
# Reverse lookup: opening tag -> continuation tag of the same entity type.
begin_to_inter_mapping = dict(
    zip(inter_to_begin_mapping.values(), inter_to_begin_mapping.keys())
)

# Display the inside -> begin mapping for a quick visual check.
inter_to_begin_mapping

{'I-LOC': 'B-LOC', 'I-MISC': 'B-MISC', 'I-ORG': 'B-ORG', 'I-PER': 'B-PER'}

# Display the begin -> inside mapping for a quick visual check.
begin_to_inter_mapping

{'B-LOC': 'I-LOC', 'B-MISC': 'I-MISC', 'B-ORG': 'I-ORG', 'B-PER': 'I-PER'}

def _fix_tag_sequence(tags):
    """Return a copy of ``tags`` in which every inside tag continues a matching entity.

    Each decision looks at the previously *emitted* (already corrected) tag:

    * ``O`` and begin tags are kept unchanged.
    * An inside tag at the start of the line or right after ``O`` becomes the
      begin tag of its own type.
    * An inside tag that follows a begin/inside tag of the same type is kept.
    * An inside tag that follows a tag of another type is replaced by the
      inside tag of that preceding type, i.e. the running entity is extended.
    * A tag that is in neither mapping is reported and replaced by ``O`` so
      the result always has as many tags as the input.
    """
    fixed = []
    for tag in tags:
        # The start of a line behaves like an "O" predecessor.
        previous = fixed[-1] if fixed else "O"
        if tag == "O" or tag in begin_to_inter_mapping:
            fixed.append(tag)
        elif tag in inter_to_begin_mapping:
            begin_tag = inter_to_begin_mapping[tag]
            if previous == "O":
                # Nothing to continue: open a new entity instead.
                fixed.append(begin_tag)
            elif previous in (tag, begin_tag):
                fixed.append(tag)
            elif previous in inter_to_begin_mapping:
                # Inside tag of a different type: keep extending the running entity.
                fixed.append(previous)
            else:
                # Begin tag of a different type: continue that entity.
                fixed.append(begin_to_inter_mapping[previous])
        else:
            print("This shouldn't happen")
            # Dropping the tag would shift every following tag one token to the left.
            fixed.append("O")
    return fixed


def fix_tags_in_file(filename, filename_fixed):
    """Rewrite a file of predicted tags so that tag sequences are consistent.

    Every line of ``filename`` is read as a whitespace-separated tag sequence,
    corrected with ``_fix_tag_sequence`` and written to ``filename_fixed`` as a
    space-joined line. The number of lines and the number of tags per line are
    preserved.

    Args:
        filename: Path of the UTF-8 file with one tag sequence per line.
        filename_fixed: Path of the UTF-8 file that receives the corrected
            sequences; it is overwritten if it exists.
    """
    # Read everything before opening the output so both paths may be the same file.
    with open(filename, "r", encoding="utf-8") as f:
        lines_fixed = [" ".join(_fix_tag_sequence(line.split())) for line in f]
    with open(filename_fixed, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in lines_fixed)
# Clean the predicted tags of the test-A split and store them next to the raw output.
fix_tags_in_file("en-ner-conll-2003/test-A/out.tsv", "en-ner-conll-2003/test-A/out_fixed.tsv")

# Clean the predicted tags of the dev-0 split and store them next to the raw output.
fix_tags_in_file("en-ner-conll-2003/dev-0/out.tsv", "en-ner-conll-2003/dev-0/out_fixed.tsv")

84 KiB Raw Permalink Blame History

Testowanie wektoryzacji / dewektoryzacji tekstu

Padding przykładów do 2048 słów

Czyszczenie tagów

84 KiB

Raw Permalink Blame History