84 KiB
84 KiB
import pandas as pd
import csv

# Load the CoNLL-2003 NER training split. Each row is one document: the first
# column holds the space-separated tag sequence, the second the space-separated
# tokens.
training_file = pd.read_csv(
    "en-ner-conll-2003/train/train.tsv",
    sep="\t",
    names=["label", "document"],
    on_bad_lines="warn",
    # The text contains literal '"' tokens; disable CSV quoting so they are
    # always read as data and can never be interpreted as field quotes.
    quoting=csv.QUOTE_NONE,
)
training_file.head()
label | document | |
---|---|---|
0 | B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ... | EU rejects German call to boycott British lamb... |
1 | O B-PER O O O O O O O O O B-LOC O O O O O O O ... | Rare Hendrix song draft sells for almost $ 17,... |
2 | B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ... | China says Taiwan spoils atmosphere for talks ... |
3 | B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ... | China says time right for Taiwan talks . </S> ... |
4 | B-MISC O O O O O O O O O O O B-LOC O O B-MISC ... | German July car registrations up 14.2 pct yr /... |
import tensorflow as tf
# Split tags and text on whitespace so both become per-token lists.
training_file["tag_list"] = training_file["label"].map(str.split)
training_file["tokenized"] = training_file["document"].map(str.split)

# Record the number of tags and tokens per document.
training_file["len_tags"] = training_file["tag_list"].map(len)
training_file["len_tokenized"] = training_file["tokenized"].map(len)

# Sanity check: show every row whose tag count differs from its token count.
training_file.loc[training_file["len_tokenized"].ne(training_file["len_tags"])]
label | document | tag_list | tokenized | len_tags | len_tokenized |
---|
training_file.head()
label | document | tag_list | tokenized | len_tags | len_tokenized | |
---|---|---|---|---|---|---|
0 | B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER ... | EU rejects German call to boycott British lamb... | [B-ORG, O, B-MISC, O, O, O, B-MISC, O, O, O, B... | [EU, rejects, German, call, to, boycott, Briti... | 489 | 489 |
1 | O B-PER O O O O O O O O O B-LOC O O O O O O O ... | Rare Hendrix song draft sells for almost $ 17,... | [O, B-PER, O, O, O, O, O, O, O, O, O, B-LOC, O... | [Rare, Hendrix, song, draft, sells, for, almos... | 197 | 197 |
2 | B-LOC O B-LOC O O O O O O B-LOC O O B-LOC O O ... | China says Taiwan spoils atmosphere for talks ... | [B-LOC, O, B-LOC, O, O, O, O, O, O, B-LOC, O, ... | [China, says, Taiwan, spoils, atmosphere, for,... | 248 | 248 |
3 | B-LOC O O O O B-LOC O O O B-LOC O O B-LOC O O ... | China says time right for Taiwan talks . </S> ... | [B-LOC, O, O, O, O, B-LOC, O, O, O, B-LOC, O, ... | [China, says, time, right, for, Taiwan, talks,... | 80 | 80 |
4 | B-MISC O O O O O O O O O O O B-LOC O O B-MISC ... | German July car registrations up 14.2 pct yr /... | [B-MISC, O, O, O, O, O, O, O, O, O, O, O, B-LO... | [German, July, car, registrations, up, 14.2, p... | 235 | 235 |
# Token count of the longest training document.
max_length = training_file["len_tokenized"].max()
print(max_length) # prints 1532; sequences are padded to the larger round size 2048 further down
1532
Testowanie wektoryzacji / dewektoryzacji tekstu
# standardize=None disables the layer's text standardization, so tokens keep
# their original casing and punctuation tokens stay in the vocabulary.
vectorize_layer = tf.keras.layers.TextVectorization(standardize=None)
# Build the vocabulary from the training documents.
vectorize_layer.adapt(training_file["document"])
# Show the integer ids produced for the document at index 20.
print(vectorize_layer(training_file["document"][20]))
tf.Tensor( [18792 316 1335 896 8 479 7287 284 3 2 18492 4 11364 3 2 137 2 18793 18637 20290 346 15 14 68 27 9 1335 9461 59 3210 42 5299 507 6 52 4906 71 7 64 1712 554 49 540 3 2 20 132 15 27 257 5 540 4 60 536 232 18 4 37 1257 52 234 71 1398 1164 6 64 2541 23235 65 880 5156 280 3526 3 2 20 5156 40 1257 17 52 22125 71 3 2 2016 18381 4 449 834 1318 6 5 13472 12 1339 2356 132 4 15 5 9461 13 1240 42 2542 8 2525 5 132 16 8166 666 724 1190 12 2129 618 622 5276 12 836 3 13 2], shape=(126,), dtype=int64)
len(training_file["document"][20].split())
126
vectorize_layer.get_vocabulary()
['', '[UNK]', '</S>', '.', ',', 'the', 'of', 'in', 'to', 'a', ')', '(', 'and', '"', 'on', 'said', "'s", 'for', '1', '-', 'The', 'was', '2', '0', '3', 'at', 'with', 'that', 'from', 'by', 'is', ':', 'as', 'he', '4', 'had', 'has', 'it', 'his', 'not', 'were', 'be', 'an', 'have', 'after', 'who', 'will', '5', 'but', 'first', 'U.S.', 'been', '$', '--', 'two', 'their', 'are', '6', 'beat', 'would', 'which', 'up', 'I', 'they', 'its', 'percent', 'year', 'out', 'Thursday', 'this', 'last', 'million', 'over', 'Wednesday', 'one', '7', 'government', 'against', '/', 'police', 'when', 'second', 'also', 'Tuesday', 'He', 'It', 'A', 'three', 'told', 'new', '10', 'Monday', 'or', 'about', 'Friday', 'people', 'In', 'her', '9', '1996-08-28', 'no', 'won', 'we', 'New', 'into', 'under', 'some', 'Sunday', 'But', '8', 'more', 'before', 'week', "'", 'time', 'than', 'market', 'could', 'Germany', 'points', 'We', 'between', 'Australia', 'years', 'since', 'Britain', 'other', 'AT', 'SOCCER', 'played', 'all', 'state', 'company', 'France', 'England', 'Saturday', 'only', '1996-08-22', 'officials', 'group', '1996-08-29', 'there', 'round', '1996', 'South', 'Minister', '1996-08-27', '11', 'off', 'match', '13', 'six', 'four', 'down', '6-4', '6-3', 'because', '21', 'five', '15', 'him', 'Spain', '1996-08-26', 'next', 'President', 'official', 'former', 'she', 'home', 'United', 'third', 'do', 'spokesman', 'just', 'games', 'expected', 'did', 'day', 'win', 'through', 'statement', 'made', 'NEW', '70', '12', '1996-08-23', 'them', 'lost', '14', 'world', 'where', '6-2', '20', 'September', 'Russian', 'July', 'shares', "n't", 'if', 'back', 'RESULTS', 'Italy', 'YORK', 'China', 'August', 'president', 'Cup', '3.', '2.', 'DIVISION', '1.', 'Clinton', 'British', 'while', 'seconds', 'any', 'LONDON', 'Japan', 'reported', 'billion', '69', 'matches', 'v', 'team', 'month', 'Russia', 'division', 'Pakistan', 'meeting', 'being', 'They', 'London', 'June', 'European', '30', 'news', 'added', 'German', '71', '1996-08-25', 'still', 
'peace', 'metres', 'half', 'Results', 'At', '1/2', 'talks', 'set', 'earlier', 'tonnes', 'killed', 'season', 'now', 'Sweden', 'take', 'held', 'during', 'Reuters', 'should', 'part', 'around', 'India', 'party', 'elections', 'National', 'took', 'game', 'Bank', 'soccer', 'number', 'minutes', 'lead', 'innings', 'early', 'capital', '68', '6-1', 'saying', 'end', 'due', 'days', 'b', '7-6', 'results', 'Open', '100', 'so', 'foreign', 'you', 'political', 'per', 'international', 'final', 'can', 'York', 'West', 'Belgium', '22', 'well', 'victory', 'most', 'Newsroom', 'French', 'Netherlands', '50', 'visit', 'seven', 'country', 'champion', 'Iraq', '25', 'our', 'minute', 'Israel', 'American', 'says', 'left', 'Czech', 'Africa', '66', '1996-08-24', 'profit', 'play', 'LEAGUE', '4.', 'vs.', 'league', '67', '6.', '5.', 'very', 'local', 'leader', 'Republic', '7-5', '24', '1995', 'war', 'same', 'go', 'found', 'support', 'run', 'newsroom', 'close', 'Inc', 'then', 'say', 'meet', 'man', 'called', 'World', 'States', 'CHICAGO', 'what', 'town', 'singles', 'prices', 'military', 'lower', 'eight', 'both', 'ago', '64', 'runs', 'put', 'newspaper', 'deal', 'bank', 'Moscow', 'Mark', '72', 'trade', 'rate', 'race', 'make', 'goals', 'cents', 'St', 'OF', 'Men', '60', '16', 'pct', 'months', 'issue', 'gave', 'behind', 'There', 'Prime', 'May', 'opposition', 'minister', 'good', 'ended', 'city', 'Women', 'Michael', 'League', 'Hong', 'FIRST', '75', 'tournament', 'report', 'rebels', 'leaders', 'Iraqi', 'Dutch', 'weekend', 'until', 'security', 'price', 'plan', 'northern', 'net', 'near', 'late', 'get', 'dollar', 'agreed', 'Kong', 'Australian', '74', '7.', 'top', 'record', 'players', 'going', 'agency', 'Attendance', 'African', ';', '73', 'want', 'start', 'refugees', 'miles', 'drawn', 'another', 'Sri', 'Paul', 'taking', 'sales', 'place', 'office', 'my', 'economic', 'court', 'chief', 'arrested', 'SAN', 'John', 'Democratic', 'David', 'CRICKET', '8.', 'those', 'quoted', 'demand', 'championship', 'allowed', 'Party', 
'Palestinian', 'Israeli', 'GMT', 'Corp', 'Commission', 'Ahmed', 'women', 'several', 'many', 'including', 'central', 'already', 'IN', 'Foreign', 'television', 'km', 'hit', 'following', 'de', 'Yeltsin', 'Martin', 'Arafat', '28', '17', 'southern', 'men', 'may', 'later', 'forces', 'fell', 'authorities', 'ahead', 'Union', 'M.', 'Dole', '31', '26', '1-0', 'work', 'whether', 'weeks', 'way', 'troops', 'reporters', 'loss', 'hours', 'election', 'came', 'announced', 'Brazil', '19', 'vs', 'return', 'parliament', 'night', 'higher', 'general', 'closed', 'Zealand', 'Finland', 'Chicago', '65', '23', '1994', '18', '...', 'went', 'test', 'share', 'power', 'plans', 'national', 'decision', 'began', 'agreement', 'This', 'trading', 'quarter', 'oil', 'north', 'morning', 'ministry', 'like', 'head', 'few', 'countries', 'away', 'asked', 'Washington', 'Police', 'Lebed', '1997', 'taken', 'money', 'main', 'leading', 'index', 'fighting', 'Sydney', 'Olympic', 'English', 'Austria', 'such', 'signed', 'side', 'scored', 'rights', 'past', 'much', 'major', 'hits', 'current', 'c', 'business', 'budget', 'army', 'U.N.', 'STANDINGS', 'Canada', '63', 'think', 'nine', 'growth', 'area', 'Ukraine', 'Standings', 'Europe', 'East', '40', 'winning', 'total', 'strike', 'region', 'recent', 'previous', 'own', 'draw', 'campaign', 'attack', 'accused', 'Two', 'On', 'Lanka', 'Co', '96', '62', 'working', 'without', 'vote', 'these', 'seen', 'plane', 'led', 'hold', 'high', 'future', 'died', 'control', 'club', 'cash', 'best', 'available', 'again', 'White', 'PCT', 'Ireland', 'Akram', '9.', '27', '---', 'wickets', 'tour', 'sent', 'right', 'released', 'might', 'little', 'help', 'give', 'fourth', 'failed', 'does', 'conference', 'ceasefire', 'case', 'ban', 'Paris', 'March', 'Dutroux', "'re", 'us', 'started', 'prime', 'period', 'overs', 'me', 'manager', 'long', 'least', 'embassy', 'disease', 'cut', 'champions', 'average', 'No', 'Italian', 'City', 'An', '29', 'tennis', 'stories', 'service', 'production', 'planned', 'order', 
'members', 'free', 'airport', 'across', 'Wasim', 'Thomas', 'October', 'Leading', 'Kurdish', 'Costa', 'Chechnya', 'Aug', 'Ajax', '59', 'yet', 'strong', 'shot', 'short', 'rose', 'public', 'press', 'given', 'declared', 'children', 'bonds', 'Slovakia', 'San', 'Romania', 'Republican', 'Ministry', 'Jordan', 'Bosnian', 'Bosnia', 'BASEBALL', 'April', '54', '10.', 'trying', 'tabulate', 'stock', 'standings', 'seed', 'reports', 'possible', 'must', 'markets', 'interest', 'hospital', 'further', 'State', 'Moslem', 'Jerusalem', 'If', 'CITY', 'Amsterdam', 'A.', 'woman', 'used', 'term', 'series', 'received', 'rates', 'opening', 'law', 'known', 'industry', 'guerrillas', 'forced', 'fifth', 'face', 'death', 'come', 'coach', 'clear', 'charges', 'brought', 'Taiwan', 'TENNIS', 'She', 'Robert', 'Poland', 'Peter', 'Nigeria', 'Ltd', 'Kenya', '2-0', 'yen', 'train', 'squad', 'small', 'showed', 'private', 'point', 'passengers', 'old', 'likely', 'injured', 'immediately', 'estimated', 'details', 'despite', 'date', 'companies', 'call', 'Turkey', 'That', 'PARIS', 'Argentina', '48', '1,000', '*', "'S", 'workers', 'use', 'trip', 'result', 'process', 'policy', 'named', 'level', 'latest', 'human', 'groups', 'got', 'forecast', 'figures', 'each', 'daily', 'contract', 'captain', 'better', 'action', 'Wimbledon', 'One', 'North', 'Nations', 'L', 'Japanese', 'Iran', 'Egypt', 'California', '76', '61', '0-0', 'unless', 'soon', 'sold', 'seeding', 'see', 'rise', 'prison', 'pay', 'how', 'holiday', 'halftime', 'force', 'financial', 'exports', 'earnings', 'believed', 'analysts', 'Younis', 'Waqar', 'TORONTO', 'Serb', 'PSV', 'Mushtaq', 'Croft', 'Belgian', 'BALTIMORE', 'Atlanta', '6-0', 'within', 'violence', 'today', 'title', 'times', 'straight', 'scheduled', 'rule', 'road', 'pound', 'playing', 'nearly', 'making', 'levels', 'convention', 'confirmed', 'coming', 'chairman', 'border', 'Total', 'TO', 'Security', 'S.', 'International', 'Exchange', '56', 'tried', 'struck', 'south', 'services', 'senior', 'reached', 
'position', 'nuclear', 'met', 'message', 'know', 'keep', 'inning', 'independence', 'illegal', 'homer', 'gold', 'completed', 'comment', 'charged', 'buy', 'Switzerland', 'Saudi', 'OPEN', 'Net', 'Mullally', 'Khan', 'Indian', 'Halftime', 'Grand', 'First', 'Central', 'Bill', 'AMSTERDAM', '55', '53', 'wife', 'wheat', 'tie', 'sell', 'rebel', 'problem', 'prefix', 'poor', 'percentage', 'parties', 'outside', 'opened', 'letter', 'kept', 'island', 'here', 'health', 'ground', 'full', 'even', 'course', 'continue', 'conditions', 'civil', 'change', 'centre', 'based', 'attacks', 'arrived', 'areas', 'aggregate', 'able', 'PUK', 'OSCE', 'Netanyahu', 'Mexico', 'Grozny', 'Group', 'FOR', 'Chinese', 'Association', 'After', '58', '45', '2-1', '1-1', "'m", 'winner', 'village', 'treaty', 'too', 'system', 'step', 'stage', 'source', 'returned', 'radio', 'penalty', 'paper', 'needed', 'less', 'leg', 'leave', 'himself', 'great', 'goal', 'flight', 'economy', 'director', 'denotes', 'denied', 'break', 'bond', 'big', 'according', 'Williams', 'Turkish', 'Swiss', 'Result', 'PRESS', 'Johnson', 'House', 'FRANCISCO', 'EU', 'DIGEST', 'BOSTON', '77', '42', ...]
import numpy as np
# Round-trip check: look the ids of document 20 up in the vocabulary array and
# join the resulting tokens back into a single string.
vocabulary = vectorize_layer.get_vocabulary()
vocab_arr = np.asarray(vocabulary)
" ".join(vocab_arr[vectorize_layer(training_file["document"][20])])
'Kindercare says debt buy to hit Q1 results . </S> MONTGOMERY , Ala . </S> 1996-08-22 </S> KinderCare Learning Centers Inc said on Thursday that a debt buyback would mean an extraordinary loss of $ 1.2 million in its fiscal 1997 first quarter . </S> The company said that during the quarter , which began June 1 , it bought $ 30 million par value of its outstanding 10-3/8 percent senior notes due 2001 . </S> The notes were bought for $ 31.5 million . </S> Philip Maslowe , chief financial officer of the preschool and child care company , said the buyback " offered an opportunity to reduce the company \'s weighted average interest costs and improve future cash flows and earnings . " </S>'
training_file["document"][20]
'Kindercare says debt buy to hit Q1 results . </S> MONTGOMERY , Ala . </S> 1996-08-22 </S> KinderCare Learning Centers Inc said on Thursday that a debt buyback would mean an extraordinary loss of $ 1.2 million in its fiscal 1997 first quarter . </S> The company said that during the quarter , which began June 1 , it bought $ 30 million par value of its outstanding 10-3/8 percent senior notes due 2001 . </S> The notes were bought for $ 31.5 million . </S> Philip Maslowe , chief financial officer of the preschool and child care company , said the buyback " offered an opportunity to reduce the company \'s weighted average interest costs and improve future cash flows and earnings . " </S>'
# Separate vectorizer for input / output
training_file["len_tokenized"].plot.bar()
<AxesSubplot:>
Padding przykładów do 2048 słów
# Vectorizer for the input text: no standardization, and every document is
# emitted as exactly 2048 ids (shorter ones are filled with trailing zeros).
sentence_vectorizer = tf.keras.layers.TextVectorization(standardize=None, output_sequence_length=2048)
# Build the token vocabulary from the training documents.
sentence_vectorizer.adapt(training_file["document"])
# Show the fixed-length id vector for the document at index 20.
print(sentence_vectorizer(training_file["document"][20]))
tf.Tensor([18792 316 1335 ... 0 0 0], shape=(2048,), dtype=int64)
# Separate vectorizer for the tag sequences, with the same fixed output length
# of 2048 as the one used for the text.
label_vectorizer = tf.keras.layers.TextVectorization(standardize=None, output_sequence_length=2048)
# Build the tag vocabulary from the training labels.
label_vectorizer.adapt(training_file["label"])
# Show the fixed-length tag-id vector for the document at index 20.
print(label_vectorizer(training_file["label"][20]))
tf.Tensor([2 2 2 ... 0 0 0], shape=(2048,), dtype=int64)
# Tag vocabulary and its size; the size is used as the number of output classes.
tags_list = label_vectorizer.get_vocabulary()
tags_length = label_vectorizer.vocabulary_size()
# Token vocabulary and its size; the size is used to dimension the embedding.
vocab_list = sentence_vectorizer.get_vocabulary()
vocab_length = sentence_vectorizer.vocabulary_size()
# Vectorize every row: each cell becomes a fixed-length vector of token / tag ids.
training_file["document_vectorized"] = training_file["document"].apply(sentence_vectorizer)
training_file["label_vectorized"] = training_file["label"].apply(label_vectorizer)
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
# Hold out 20% of the documents for validation. The fixed seed makes the split
# (and therefore every metric reported below) reproducible across runs.
train, valid = train_test_split(training_file, test_size=0.2, random_state=42)

# Stack the per-row id vectors into (documents, sequence_length) matrices.
train_x = np.stack(train["document_vectorized"].values)
train_y = np.stack(train["label_vectorized"].values)
# to_categorical one-hot encodes the whole id matrix in one call, producing
# (documents, sequence_length, tags_length); no per-row Python loop is needed.
train_y = to_categorical(train_y, num_classes=tags_length)

val_x = np.stack(valid["document_vectorized"].values)
val_y = np.stack(valid["label_vectorized"].values)
val_y = to_categorical(val_y, num_classes=tags_length)

print(val_x[0])
[2014 19 122 ... 0 0 0]
train_x.shape
(756, 2048)
train_y.shape
(756, 2048, 11)
train_x[0]
array([ 128, 19, 1368, ..., 0, 0, 0], dtype=int64)
train_y[0]
array([[0., 0., 1., ..., 0., 0., 0.], [0., 0., 1., ..., 0., 0., 0.], [0., 0., 0., ..., 1., 0., 0.], ..., [1., 0., 0., ..., 0., 0., 0.], [1., 0., 0., ..., 0., 0., 0.], [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)
from keras.optimizers import Adam
import keras.layers as layers
import keras
def create_model(sequence_length=2048):
    """Build and compile the token-level NER tagger.

    The network is Embedding -> LSTM (one output per time step) -> per-step
    softmax over the tag vocabulary. It reads the module-level
    ``vocab_length`` and ``tags_length``.

    Args:
        sequence_length: Number of token ids in each padded input sequence.
            Defaults to 2048, the output length of the vectorizers.

    Returns:
        A compiled ``keras.Model`` that maps ``(batch, sequence_length)``
        token ids to ``(batch, sequence_length, tags_length)`` tag
        probabilities.
    """
    input_layer = layers.Input(shape=(sequence_length,))
    embedding_layer = layers.Embedding(
        # vocabulary_size() already counts the padding and OOV entries, so the
        # +1 only adds a spare row; it is kept so the layer size is unchanged.
        input_dim=vocab_length + 1,
        output_dim=128,
        # Id 0 is the padding token. Masking it keeps the padded tail of every
        # sequence out of the LSTM state updates and out of the loss, so the
        # loss and accuracy reflect real tokens rather than mostly padding.
        mask_zero=True,
    )(input_layer)
    lstm_layer = layers.LSTM(256, return_sequences=True)(embedding_layer)
    output_layer = layers.TimeDistributed(
        layers.Dense(tags_length, activation="softmax")
    )(lstm_layer)
    model = keras.Model(inputs=input_layer, outputs=output_layer)
    model.compile(
        loss="categorical_crossentropy",
        optimizer=Adam(),
        metrics=["accuracy"],
    )
    return model
model = create_model()
model.summary()
Model: "model_14" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_16 (InputLayer) [(None, 2048)] 0 embedding_15 (Embedding) (None, 2048, 128) 3024256 lstm_20 (LSTM) (None, 2048, 256) 394240 time_distributed_18 (TimeDi (None, 2048, 11) 2827 stributed) ================================================================= Total params: 3,421,323 Trainable params: 3,421,323 Non-trainable params: 0 _________________________________________________________________
# Stop training once the validation loss has not improved for 3 epochs in a
# row, and restore the weights from the best epoch seen.
callback = keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=3, restore_best_weights=True)
# Train for at most 50 epochs, evaluating on the held-out split after each one.
history = model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=50, callbacks=[callback])
Epoch 1/50 24/24 [==============================] - 29s 1s/step - loss: 0.6602 - accuracy: 0.8703 - val_loss: 0.2673 - val_accuracy: 0.9425 Epoch 2/50 24/24 [==============================] - 27s 1s/step - loss: 0.2500 - accuracy: 0.9653 - val_loss: 0.1613 - val_accuracy: 0.9781 Epoch 3/50 24/24 [==============================] - 28s 1s/step - loss: 0.1062 - accuracy: 0.9790 - val_loss: 0.0984 - val_accuracy: 0.9793 Epoch 4/50 24/24 [==============================] - 28s 1s/step - loss: 0.0920 - accuracy: 0.9806 - val_loss: 0.0936 - val_accuracy: 0.9799 Epoch 5/50 24/24 [==============================] - 28s 1s/step - loss: 0.0874 - accuracy: 0.9812 - val_loss: 0.0901 - val_accuracy: 0.9800 Epoch 6/50 24/24 [==============================] - 27s 1s/step - loss: 0.0828 - accuracy: 0.9816 - val_loss: 0.0867 - val_accuracy: 0.9804 Epoch 7/50 24/24 [==============================] - 27s 1s/step - loss: 0.0774 - accuracy: 0.9818 - val_loss: 0.0805 - val_accuracy: 0.9804 Epoch 8/50 24/24 [==============================] - 27s 1s/step - loss: 0.0715 - accuracy: 0.9819 - val_loss: 0.0741 - val_accuracy: 0.9807 Epoch 9/50 24/24 [==============================] - 27s 1s/step - loss: 0.0628 - accuracy: 0.9822 - val_loss: 0.0660 - val_accuracy: 0.9808 Epoch 10/50 24/24 [==============================] - 27s 1s/step - loss: 0.0543 - accuracy: 0.9826 - val_loss: 0.0579 - val_accuracy: 0.9815 Epoch 11/50 24/24 [==============================] - 27s 1s/step - loss: 0.0465 - accuracy: 0.9843 - val_loss: 0.0500 - val_accuracy: 0.9851 Epoch 12/50 24/24 [==============================] - 27s 1s/step - loss: 0.0385 - accuracy: 0.9879 - val_loss: 0.0453 - val_accuracy: 0.9867 Epoch 13/50 24/24 [==============================] - 27s 1s/step - loss: 0.0330 - accuracy: 0.9901 - val_loss: 0.0413 - val_accuracy: 0.9873 Epoch 14/50 24/24 [==============================] - 27s 1s/step - loss: 0.0298 - accuracy: 0.9909 - val_loss: 0.0395 - val_accuracy: 0.9887 Epoch 15/50 24/24 
[==============================] - 27s 1s/step - loss: 0.0257 - accuracy: 0.9922 - val_loss: 0.0380 - val_accuracy: 0.9887 Epoch 16/50 24/24 [==============================] - 27s 1s/step - loss: 0.0241 - accuracy: 0.9924 - val_loss: 0.0362 - val_accuracy: 0.9887 Epoch 17/50 24/24 [==============================] - 27s 1s/step - loss: 0.0215 - accuracy: 0.9935 - val_loss: 0.0344 - val_accuracy: 0.9897 Epoch 18/50 24/24 [==============================] - 27s 1s/step - loss: 0.0191 - accuracy: 0.9942 - val_loss: 0.0335 - val_accuracy: 0.9898 Epoch 19/50 24/24 [==============================] - 27s 1s/step - loss: 0.0173 - accuracy: 0.9948 - val_loss: 0.0322 - val_accuracy: 0.9906 Epoch 20/50 24/24 [==============================] - 28s 1s/step - loss: 0.0160 - accuracy: 0.9952 - val_loss: 0.0322 - val_accuracy: 0.9908 Epoch 21/50 24/24 [==============================] - 27s 1s/step - loss: 0.0147 - accuracy: 0.9958 - val_loss: 0.0338 - val_accuracy: 0.9900 Epoch 22/50 24/24 [==============================] - 27s 1s/step - loss: 0.0133 - accuracy: 0.9962 - val_loss: 0.0307 - val_accuracy: 0.9915 Epoch 23/50 24/24 [==============================] - 27s 1s/step - loss: 0.0117 - accuracy: 0.9968 - val_loss: 0.0303 - val_accuracy: 0.9918 Epoch 24/50 24/24 [==============================] - 27s 1s/step - loss: 0.0105 - accuracy: 0.9973 - val_loss: 0.0289 - val_accuracy: 0.9922 Epoch 25/50 24/24 [==============================] - 27s 1s/step - loss: 0.0094 - accuracy: 0.9977 - val_loss: 0.0315 - val_accuracy: 0.9917 Epoch 26/50 24/24 [==============================] - 27s 1s/step - loss: 0.0084 - accuracy: 0.9980 - val_loss: 0.0300 - val_accuracy: 0.9924 Epoch 27/50 24/24 [==============================] - 27s 1s/step - loss: 0.0073 - accuracy: 0.9984 - val_loss: 0.0295 - val_accuracy: 0.9926
# Tag vocabulary as an array so that tag ids can be used directly as indices.
tag_list_numpy = np.asarray(tags_list)


def get_tag_from_int(input_integer):
    """Return the tag stored at position ``input_integer`` of the tag vocabulary."""
    tag = tag_list_numpy[input_integer]
    return tag
def get_ner_output_single_sentence(input_sentence):
    """Predict one NER tag per whitespace-separated token of a document.

    Args:
        input_sentence: Pre-tokenized text whose tokens are separated by
            whitespace.

    Returns:
        A list of tag strings, one per token, in token order. The vectorizer
        emits a fixed number of positions per input, so at most that many
        tags are returned.
    """
    sentence_length = len(input_sentence.split())
    vectorized = sentence_vectorizer(input_sentence)
    # The model expects a leading batch axis; feed it a batch of one.
    model_output = model(tf.expand_dims(vectorized, 0))
    # Most probable tag id at every position of the single batch element.
    max_indices = np.argmax(model_output, axis=-1)[0]
    # Decode only the positions that correspond to real tokens; the rest of the
    # fixed-length output belongs to padding.
    return tag_list_numpy[max_indices[:sentence_length]].tolist()
#get_ner_output_single_sentence("China says time right for Taiwan talks . </S> BEIJING 1996-08-22 </S> China has said it was time for political talks with Taiwan and that the rival island should take practical steps towards that goal . </S> Consultations should be held to set the time and format of the talks , the official Xinhua news agency quoted Tang Shubei , executive vice chairman of the Association for Relations Across the Taiwan Straits , as saying late on Wednesday . </S>")
def test_sentence(sentence):
    """Pair each whitespace-separated token of ``sentence`` with its predicted tag.

    Returns a list of ``(token, tag)`` tuples in token order.
    """
    predicted_tags = get_ner_output_single_sentence(sentence)
    return [*zip(sentence.split(), predicted_tags)]
test_sentence("China says time right for Taiwan talks . </S> BEIJING 1996-08-22 </S> China has said it was time for political talks with Taiwan and that the rival island should take practical steps towards that goal . </S> Consultations should be held to set the time and format of the talks , the official Xinhua news agency quoted Tang Shubei , executive vice chairman of the Association for Relations Across the Taiwan Straits , as saying late on Wednesday . </S>")
[('China', 'B-LOC'), ('says', 'O'), ('time', 'O'), ('right', 'O'), ('for', 'O'), ('Taiwan', 'B-LOC'), ('talks', 'O'), ('.', 'O'), ('</S>', 'O'), ('BEIJING', 'B-LOC'), ('1996-08-22', 'O'), ('</S>', 'O'), ('China', 'B-LOC'), ('has', 'O'), ('said', 'O'), ('it', 'O'), ('was', 'O'), ('time', 'O'), ('for', 'O'), ('political', 'O'), ('talks', 'O'), ('with', 'O'), ('Taiwan', 'B-LOC'), ('and', 'O'), ('that', 'O'), ('the', 'O'), ('rival', 'O'), ('island', 'O'), ('should', 'O'), ('take', 'O'), ('practical', 'O'), ('steps', 'O'), ('towards', 'O'), ('that', 'O'), ('goal', 'O'), ('.', 'O'), ('</S>', 'O'), ('Consultations', 'O'), ('should', 'O'), ('be', 'O'), ('held', 'O'), ('to', 'O'), ('set', 'O'), ('the', 'O'), ('time', 'O'), ('and', 'O'), ('format', 'O'), ('of', 'O'), ('the', 'O'), ('talks', 'O'), (',', 'O'), ('the', 'O'), ('official', 'O'), ('Xinhua', 'B-ORG'), ('news', 'O'), ('agency', 'O'), ('quoted', 'O'), ('Tang', 'B-PER'), ('Shubei', 'I-PER'), (',', 'O'), ('executive', 'O'), ('vice', 'O'), ('chairman', 'O'), ('of', 'O'), ('the', 'O'), ('Association', 'B-ORG'), ('for', 'I-ORG'), ('Relations', 'O'), ('Across', 'I-ORG'), ('the', 'I-ORG'), ('Taiwan', 'I-ORG'), ('Straits', 'I-ORG'), (',', 'O'), ('as', 'O'), ('saying', 'O'), ('late', 'O'), ('on', 'O'), ('Wednesday', 'O'), ('.', 'O'), ('</S>', 'O')]
test_sentence("SOCCER - LATE GOALS GIVE JAPAN WIN OVER SYRIA . </S> AL-AIN , United Arab Emirates 1996-12-06 </S> Two goals in the last six minutes gave holders Japan an uninspiring 2-1 Asian Cup victory over Syria on Friday . </S> Takuya Takagi headed the winner in the 88th minute of the group C game after goalkeeper Salem Bitar spoiled a mistake-free display by allowing the ball to slip under his body . </S> It was the second Syrian defensive blunder in four minutes . </S> Defender Hassan Abbas rose to intercept a long ball into the area in the 84th minute but only managed to divert it into the top corner of Bitar 's goal . </S> Syria had taken the lead from their first serious attack in the seventh minute . </S> Nader Jokhadar headed a cross from the right by Ammar Awad into the top right corner of Kenichi Shimokawa 's goal . </S> Japan then laid siege to the Syrian penalty area and had a goal disallowed for offside in the 16th minute . </S> A minute later , Bitar produced a good double save , first from Kazuyoshi Miura 's header and then blocked a Takagi follow-up shot . </S> Bitar saved well again from Miura in the 37th minute , parrying away his header from a corner . </S> Japan started the second half brightly but Bitar denied them an equaliser when he dived to his right to save Naoki Soma 's low drive in the 53rd minute . </S> Japan : 19 - Kenichi Shimokawa , 2 - Hiroshige Yanagimoto , 3 - Naoki Soma , 4 - Masami Ihara , 5 - Norio Omura , 6 - Motohiro Yamaguchi , 8 - Masakiyo Maezono ( 7 - Yasuto Honda 71 ) , 9 - Takuya Takagi , 10 - Hiroshi Nanami , 11 - Kazuyoshi Miura , 15 - Hiroaki Morishima ( 14 - Masayuki Okano 75 ) . </S> Syria : 24 - Salem Bitar , 3 - Bachar Srour ; 4 - Hassan Abbas , 5 - Tarek Jabban , 6 - Ammar Awad ( 9 - Louay Taleb 69 ) , 8 - Nihad al-Boushi , 10 - Mohammed Afash , 12 - Ali Dib , 13 - Abdul Latif Helou ( 17 - Ammar Rihawiy 46 ) , 14 - Khaled Zaher ; 16 - Nader Jokhadar . </S>")
tf.Tensor([ 128 19 18713 ... 0 0 0], shape=(2048,), dtype=int64) [[[3.0971142e-03 1.5280694e-03 9.8057139e-01 ... 3.6668889e-03 1.4106639e-03 3.3225205e-03] [2.1369425e-04 1.2225067e-04 9.9616271e-01 ... 1.4002173e-03 1.0539902e-04 2.7582867e-04] [6.3146334e-05 3.8070513e-05 9.9278271e-01 ... 2.4660169e-03 5.7447112e-05 1.3038449e-04] ... [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794704e-08 2.6146161e-08 5.0399006e-08] [9.9999696e-01 1.7784757e-08 2.5151198e-07 ... 1.3794731e-08 2.6146161e-08 5.0399006e-08] [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794704e-08 2.6146161e-08 5.0399006e-08]]] (1, 2048, 11) [2 2 2 ... 0 0 0] 2048
[('SOCCER', 'O'), ('-', 'O'), ('LATE', 'O'), ('GOALS', 'O'), ('GIVE', 'O'), ('JAPAN', 'O'), ('WIN', 'O'), ('OVER', 'O'), ('SYRIA', 'O'), ('.', 'O'), ('</S>', 'O'), ('AL-AIN', 'O'), (',', 'O'), ('United', 'B-LOC'), ('Arab', 'I-LOC'), ('Emirates', 'I-LOC'), ('1996-12-06', 'O'), ('</S>', 'O'), ('Two', 'O'), ('goals', 'O'), ('in', 'O'), ('the', 'O'), ('last', 'O'), ('six', 'O'), ('minutes', 'O'), ('gave', 'O'), ('holders', 'O'), ('Japan', 'B-LOC'), ('an', 'O'), ('uninspiring', 'O'), ('2-1', 'O'), ('Asian', 'B-LOC'), ('Cup', 'I-MISC'), ('victory', 'O'), ('over', 'O'), ('Syria', 'B-LOC'), ('on', 'O'), ('Friday', 'O'), ('.', 'O'), ('</S>', 'O'), ('Takuya', 'O'), ('Takagi', 'O'), ('headed', 'O'), ('the', 'O'), ('winner', 'O'), ('in', 'O'), ('the', 'O'), ('88th', 'O'), ('minute', 'O'), ('of', 'O'), ('the', 'O'), ('group', 'O'), ('C', 'O'), ('game', 'O'), ('after', 'O'), ('goalkeeper', 'O'), ('Salem', 'O'), ('Bitar', 'O'), ('spoiled', 'O'), ('a', 'O'), ('mistake-free', 'O'), ('display', 'O'), ('by', 'O'), ('allowing', 'O'), ('the', 'O'), ('ball', 'O'), ('to', 'O'), ('slip', 'O'), ('under', 'O'), ('his', 'O'), ('body', 'O'), ('.', 'O'), ('</S>', 'O'), ('It', 'O'), ('was', 'O'), ('the', 'O'), ('second', 'O'), ('Syrian', 'B-PER'), ('defensive', 'O'), ('blunder', 'O'), ('in', 'O'), ('four', 'O'), ('minutes', 'O'), ('.', 'O'), ('</S>', 'O'), ('Defender', 'O'), ('Hassan', 'B-PER'), ('Abbas', 'I-PER'), ('rose', 'O'), ('to', 'O'), ('intercept', 'O'), ('a', 'O'), ('long', 'O'), ('ball', 'O'), ('into', 'O'), ('the', 'O'), ('area', 'O'), ('in', 'O'), ('the', 'O'), ('84th', 'O'), ('minute', 'O'), ('but', 'O'), ('only', 'O'), ('managed', 'O'), ('to', 'O'), ('divert', 'O'), ('it', 'O'), ('into', 'O'), ('the', 'O'), ('top', 'O'), ('corner', 'O'), ('of', 'O'), ('Bitar', 'O'), ("'s", 'O'), ('goal', 'O'), ('.', 'O'), ('</S>', 'O'), ('Syria', 'B-ORG'), ('had', 'O'), ('taken', 'O'), ('the', 'O'), ('lead', 'O'), ('from', 'O'), ('their', 'O'), ('first', 'O'), ('serious', 'O'), ('attack', 'O'), 
('in', 'O'), ('the', 'O'), ('seventh', 'O'), ('minute', 'O'), ('.', 'O'), ('</S>', 'O'), ('Nader', 'O'), ('Jokhadar', 'O'), ('headed', 'O'), ('a', 'O'), ('cross', 'O'), ('from', 'O'), ('the', 'O'), ('right', 'O'), ('by', 'O'), ('Ammar', 'O'), ('Awad', 'O'), ('into', 'O'), ('the', 'O'), ('top', 'O'), ('right', 'O'), ('corner', 'O'), ('of', 'O'), ('Kenichi', 'O'), ('Shimokawa', 'O'), ("'s", 'O'), ('goal', 'O'), ('.', 'O'), ('</S>', 'O'), ('Japan', 'B-LOC'), ('then', 'O'), ('laid', 'O'), ('siege', 'O'), ('to', 'O'), ('the', 'O'), ('Syrian', 'B-ORG'), ('penalty', 'O'), ('area', 'O'), ('and', 'O'), ('had', 'O'), ('a', 'O'), ('goal', 'O'), ('disallowed', 'O'), ('for', 'O'), ('offside', 'O'), ('in', 'O'), ('the', 'O'), ('16th', 'O'), ('minute', 'O'), ('.', 'O'), ('</S>', 'O'), ('A', 'O'), ('minute', 'O'), ('later', 'O'), (',', 'O'), ('Bitar', 'O'), ('produced', 'O'), ('a', 'O'), ('good', 'O'), ('double', 'O'), ('save', 'O'), (',', 'O'), ('first', 'O'), ('from', 'O'), ('Kazuyoshi', 'O'), ('Miura', 'O'), ("'s", 'O'), ('header', 'O'), ('and', 'O'), ('then', 'O'), ('blocked', 'O'), ('a', 'O'), ('Takagi', 'O'), ('follow-up', 'O'), ('shot', 'O'), ('.', 'O'), ('</S>', 'O'), ('Bitar', 'O'), ('saved', 'O'), ('well', 'O'), ('again', 'O'), ('from', 'O'), ('Miura', 'O'), ('in', 'O'), ('the', 'O'), ('37th', 'O'), ('minute', 'O'), (',', 'O'), ('parrying', 'O'), ('away', 'O'), ('his', 'O'), ('header', 'O'), ('from', 'O'), ('a', 'O'), ('corner', 'O'), ('.', 'O'), ('</S>', 'O'), ('Japan', 'B-ORG'), ('started', 'O'), ('the', 'O'), ('second', 'O'), ('half', 'O'), ('brightly', 'O'), ('but', 'O'), ('Bitar', 'O'), ('denied', 'O'), ('them', 'O'), ('an', 'O'), ('equaliser', 'O'), ('when', 'O'), ('he', 'O'), ('dived', 'O'), ('to', 'O'), ('his', 'O'), ('right', 'O'), ('to', 'O'), ('save', 'O'), ('Naoki', 'O'), ('Soma', 'O'), ("'s", 'O'), ('low', 'O'), ('drive', 'O'), ('in', 'O'), ('the', 'O'), ('53rd', 'O'), ('minute', 'O'), ('.', 'O'), ('</S>', 'O'), ('Japan', 'B-LOC'), (':', 'O'), ('19', 'O'), 
('-', 'O'), ('Kenichi', 'O'), ('Shimokawa', 'O'), (',', 'O'), ('2', 'O'), ('-', 'O'), ('Hiroshige', 'O'), ('Yanagimoto', 'O'), (',', 'O'), ('3', 'O'), ('-', 'O'), ('Naoki', 'O'), ('Soma', 'O'), (',', 'O'), ('4', 'O'), ('-', 'O'), ('Masami', 'O'), ('Ihara', 'O'), (',', 'O'), ('5', 'O'), ('-', 'O'), ('Norio', 'O'), ('Omura', 'O'), (',', 'O'), ('6', 'O'), ('-', 'O'), ('Motohiro', 'O'), ('Yamaguchi', 'O'), (',', 'O'), ('8', 'O'), ('-', 'O'), ('Masakiyo', 'O'), ('Maezono', 'O'), ('(', 'O'), ('7', 'O'), ('-', 'O'), ('Yasuto', 'O'), ('Honda', 'B-ORG'), ('71', 'O'), (')', 'O'), (',', 'O'), ('9', 'O'), ('-', 'O'), ('Takuya', 'O'), ('Takagi', 'O'), (',', 'O'), ('10', 'O'), ('-', 'O'), ('Hiroshi', 'O'), ('Nanami', 'O'), (',', 'O'), ('11', 'O'), ('-', 'O'), ('Kazuyoshi', 'O'), ('Miura', 'O'), (',', 'O'), ('15', 'O'), ('-', 'O'), ('Hiroaki', 'O'), ('Morishima', 'O'), ('(', 'O'), ('14', 'O'), ('-', 'O'), ('Masayuki', 'O'), ('Okano', 'O'), ('75', 'O'), (')', 'O'), ('.', 'O'), ('</S>', 'O'), ('Syria', 'B-PER'), (':', 'O'), ('24', 'O'), ('-', 'O'), ('Salem', 'O'), ('Bitar', 'O'), (',', 'O'), ('3', 'O'), ('-', 'O'), ('Bachar', 'O'), ('Srour', 'O'), (';', 'O'), ('4', 'O'), ('-', 'O'), ('Hassan', 'B-PER'), ('Abbas', 'I-PER'), (',', 'O'), ('5', 'O'), ('-', 'O'), ('Tarek', 'O'), ('Jabban', 'O'), (',', 'O'), ('6', 'O'), ('-', 'O'), ('Ammar', 'O'), ('Awad', 'O'), ('(', 'O'), ('9', 'O'), ('-', 'O'), ('Louay', 'O'), ('Taleb', 'O'), ('69', 'O'), (')', 'O'), (',', 'O'), ('8', 'O'), ('-', 'O'), ('Nihad', 'O'), ('al-Boushi', 'O'), (',', 'O'), ('10', 'O'), ('-', 'O'), ('Mohammed', 'B-PER'), ('Afash', 'I-PER'), (',', 'O'), ('12', 'O'), ('-', 'O'), ('Ali', 'B-PER'), ('Dib', 'I-PER'), (',', 'O'), ('13', 'O'), ('-', 'O'), ('Abdul', 'B-PER'), ('Latif', 'I-PER'), ('Helou', 'O'), ('(', 'O'), ('17', 'O'), ('-', 'O'), ('Ammar', 'O'), ('Rihawiy', 'O'), ('46', 'O'), (')', 'O'), (',', 'O'), ('14', 'O'), ('-', 'O'), ('Khaled', 'B-PER'), ('Zaher', 'I-PER'), (';', 'O'), ('16', 'O'), ('-', 'O'), ('Nader', 'O'), 
('Jokhadar', 'O'), ('.', 'O'), ('</S>', 'O')]
# Sample news text used as a manual smoke test: tokens are separated by single
# spaces and sentence boundaries are marked with the literal token "</S>".
# test_sentence is defined elsewhere in the notebook; presumably it prints the
# predicted tag for every token -- confirm against its definition.
news_string = """Mussolini 's granddaughter rejoins far-right party . </S> ROME 1996-12-06 </S> Alessandra Mussolini , the granddaughter of Italy 's Fascist dictator Benito Mussolini , said on Friday she had rejoined the far-right National Alliance ( AN ) party she quit over policy differences last month . </S> " I 've gone back , " she told a radio show shortly after AN leader Gianfranco Fini , who was being interviewed on the programme , said the row had been resolved . </S> " He did n't want to lose me and I did n't want to lose him . " </S> Fini told state radio RAI he met Mussolini thanks to the good offices of Giuseppe Tatarella , AN 's leader in the Chamber of Deputies ( lower house ) , and had overcome their differences . </S> Mussolini , 33 , resigned from the parliamentary party group for what she said were strictly political reasons . </S> The fiery politician , who is also a niece of screen star Sophia Loren , had accused AN leaders of stifling internal party debate . </S> Mussolini , who sits in the Chamber , told La Stampa newspaper last month after quitting AN 's parliamentary party that she was considering joining the neo-fascist Social Movement ( MS-Fiamma ) formed by some of the Duce 's World War Two followers . </S>"""
test_sentence(news_string)
tf.Tensor([ 1 16 1 ... 0 0 0], shape=(2048,), dtype=int64) [[[9.1573365e-02 8.5647009e-02 1.1034752e-01 ... 8.8930450e-02 8.8644758e-02 8.9963131e-02] [5.5477720e-02 4.6575051e-02 5.2461910e-01 ... 6.4232960e-02 4.4661559e-02 5.8426060e-02] [4.9609054e-02 4.3161135e-02 4.3743923e-01 ... 9.0816177e-02 4.6578653e-02 5.5895649e-02] ... [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794731e-08 2.6146161e-08 5.0399006e-08] [9.9999696e-01 1.7784757e-08 2.5151198e-07 ... 1.3794731e-08 2.6146161e-08 5.0399006e-08] [9.9999696e-01 1.7784757e-08 2.5151175e-07 ... 1.3794731e-08 2.6146161e-08 5.0399006e-08]]] (1, 2048, 11) [2 2 2 ... 0 0 0] 2048
[('Mussolini', 'O'), ("'s", 'O'), ('granddaughter', 'O'), ('rejoins', 'O'), ('far-right', 'O'), ('party', 'O'), ('.', 'O'), ('</S>', 'O'), ('ROME', 'B-LOC'), ('1996-12-06', 'O'), ('</S>', 'O'), ('Alessandra', 'O'), ('Mussolini', 'O'), (',', 'O'), ('the', 'O'), ('granddaughter', 'O'), ('of', 'O'), ('Italy', 'B-LOC'), ("'s", 'O'), ('Fascist', 'O'), ('dictator', 'O'), ('Benito', 'B-PER'), ('Mussolini', 'I-PER'), (',', 'O'), ('said', 'O'), ('on', 'O'), ('Friday', 'O'), ('she', 'O'), ('had', 'O'), ('rejoined', 'O'), ('the', 'O'), ('far-right', 'O'), ('National', 'B-PER'), ('Alliance', 'I-PER'), ('(', 'O'), ('AN', 'O'), (')', 'O'), ('party', 'O'), ('she', 'O'), ('quit', 'O'), ('over', 'O'), ('policy', 'O'), ('differences', 'O'), ('last', 'O'), ('month', 'O'), ('.', 'O'), ('</S>', 'O'), ('"', 'O'), ('I', 'O'), ("'ve", 'O'), ('gone', 'O'), ('back', 'O'), (',', 'O'), ('"', 'O'), ('she', 'O'), ('told', 'O'), ('a', 'O'), ('radio', 'O'), ('show', 'O'), ('shortly', 'O'), ('after', 'O'), ('AN', 'O'), ('leader', 'O'), ('Gianfranco', 'B-PER'), ('Fini', 'I-PER'), (',', 'O'), ('who', 'O'), ('was', 'O'), ('being', 'O'), ('interviewed', 'O'), ('on', 'O'), ('the', 'O'), ('programme', 'O'), (',', 'O'), ('said', 'O'), ('the', 'O'), ('row', 'O'), ('had', 'O'), ('been', 'O'), ('resolved', 'O'), ('.', 'O'), ('</S>', 'O'), ('"', 'O'), ('He', 'O'), ('did', 'O'), ("n't", 'O'), ('want', 'O'), ('to', 'O'), ('lose', 'O'), ('me', 'O'), ('and', 'O'), ('I', 'O'), ('did', 'O'), ("n't", 'O'), ('want', 'O'), ('to', 'O'), ('lose', 'O'), ('him', 'O'), ('.', 'O'), ('"', 'O'), ('</S>', 'O'), ('Fini', 'O'), ('told', 'O'), ('state', 'O'), ('radio', 'O'), ('RAI', 'B-PER'), ('he', 'O'), ('met', 'O'), ('Mussolini', 'O'), ('thanks', 'O'), ('to', 'O'), ('the', 'O'), ('good', 'O'), ('offices', 'O'), ('of', 'O'), ('Giuseppe', 'B-PER'), ('Tatarella', 'I-PER'), (',', 'O'), ('AN', 'O'), ("'s", 'O'), ('leader', 'O'), ('in', 'O'), ('the', 'O'), ('Chamber', 'B-PER'), ('of', 'O'), ('Deputies', 'O'), ('(', 'O'), ('lower', 
'O'), ('house', 'O'), (')', 'O'), (',', 'O'), ('and', 'O'), ('had', 'O'), ('overcome', 'O'), ('their', 'O'), ('differences', 'O'), ('.', 'O'), ('</S>', 'O'), ('Mussolini', 'O'), (',', 'O'), ('33', 'O'), (',', 'O'), ('resigned', 'O'), ('from', 'O'), ('the', 'O'), ('parliamentary', 'O'), ('party', 'O'), ('group', 'O'), ('for', 'O'), ('what', 'O'), ('she', 'O'), ('said', 'O'), ('were', 'O'), ('strictly', 'O'), ('political', 'O'), ('reasons', 'O'), ('.', 'O'), ('</S>', 'O'), ('The', 'O'), ('fiery', 'O'), ('politician', 'O'), (',', 'O'), ('who', 'O'), ('is', 'O'), ('also', 'O'), ('a', 'O'), ('niece', 'O'), ('of', 'O'), ('screen', 'O'), ('star', 'O'), ('Sophia', 'B-PER'), ('Loren', 'I-PER'), (',', 'O'), ('had', 'O'), ('accused', 'O'), ('AN', 'O'), ('leaders', 'O'), ('of', 'O'), ('stifling', 'O'), ('internal', 'O'), ('party', 'O'), ('debate', 'O'), ('.', 'O'), ('</S>', 'O'), ('Mussolini', 'O'), (',', 'O'), ('who', 'O'), ('sits', 'O'), ('in', 'O'), ('the', 'O'), ('Chamber', 'B-PER'), (',', 'O'), ('told', 'O'), ('La', 'B-ORG'), ('Stampa', 'I-ORG'), ('newspaper', 'O'), ('last', 'O'), ('month', 'O'), ('after', 'O'), ('quitting', 'O'), ('AN', 'O'), ("'s", 'O'), ('parliamentary', 'O'), ('party', 'O'), ('that', 'O'), ('she', 'O'), ('was', 'O'), ('considering', 'O'), ('joining', 'O'), ('the', 'O'), ('neo-fascist', 'O'), ('Social', 'B-ORG'), ('Movement', 'I-ORG'), ('(', 'O'), ('MS-Fiamma', 'O'), (')', 'O'), ('formed', 'O'), ('by', 'O'), ('some', 'O'), ('of', 'O'), ('the', 'O'), ('Duce', 'O'), ("'s", 'O'), ('World', 'B-ORG'), ('War', 'I-ORG'), ('Two', 'O'), ('followers', 'O'), ('.', 'O'), ('</S>', 'O')]
# Write the current model to "model_v2.keras", then rebind `model` to the copy
# loaded back from that file, so the cells below run against the saved artifact.
model.save("model_v2.keras")
import keras
model = keras.models.load_model('model_v2.keras')
# Produce predictions for the dev-0 split: every non-blank input line is passed
# to get_ner_output_single_sentence (defined elsewhere in the notebook) and the
# returned tags are written space-joined, one line per processed input line.
# Blank input lines are skipped, so they yield no output line.
with open("en-ner-conll-2003/dev-0/in.tsv", "r", encoding="utf-8") as f:
    lines = list(f)
processed = [
    " ".join(get_ner_output_single_sentence(x))
    for x in lines
    if x.strip()
]
with open('en-ner-conll-2003/dev-0/out.tsv', 'w', encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in processed)
ERROR:tensorflow:================================== Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>): <tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x00000262307DCA00> If you want to mark it as used call its "mark_used()" method. It was originally created here: File "C:\Users\Adrian\miniconda3\lib\site-packages\keras\backend.py", line 5130, in <genexpr> ta.write(ta_index_to_write, out) File "C:\Users\Adrian\miniconda3\lib\site-packages\tensorflow\python\util\tf_should_use.py", line 243, in wrapped return _add_should_use_warning(fn(*args, **kwargs), ==================================
# Produce predictions for the test-A split: every non-blank input line is
# passed to get_ner_output_single_sentence (defined elsewhere in the notebook)
# and the returned tags are written space-joined, one line per processed input
# line. Blank input lines are skipped, so they yield no output line.
with open("en-ner-conll-2003/test-A/in.tsv", "r", encoding="utf-8") as f:
    lines = list(f)
processed = [
    " ".join(get_ner_output_single_sentence(x))
    for x in lines
    if x.strip()
]
with open('en-ner-conll-2003/test-A/out.tsv', 'w', encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in processed)
Czyszczenie tagów
# Gather every distinct tag that occurs in the dev-0 predictions file.
tag_set = set()
with open("en-ner-conll-2003/dev-0/out.tsv", "r", encoding="utf-8") as f:
    lines = f.readlines()
for line in lines:
    # set.update ignores tags that are already present.
    tag_set.update(line.split())
print(tag_set)
{'B-LOC', 'I-LOC', 'O', 'I-MISC', 'B-ORG', 'B-PER', 'I-PER', 'I-ORG', 'B-MISC'}
# Lookup tables between the inside (I-) and beginning (B-) tag of each entity
# type, in both directions.
inter_to_begin_mapping = {
    f"I-{entity}": f"B-{entity}" for entity in ("LOC", "MISC", "ORG", "PER")
}
begin_to_inter_mapping = dict(
    zip(inter_to_begin_mapping.values(), inter_to_begin_mapping.keys())
)
inter_to_begin_mapping
{'I-LOC': 'B-LOC', 'I-MISC': 'B-MISC', 'I-ORG': 'B-ORG', 'I-PER': 'B-PER'}
begin_to_inter_mapping
{'B-LOC': 'I-LOC', 'B-MISC': 'I-MISC', 'B-ORG': 'I-ORG', 'B-PER': 'I-PER'}
def fix_tags_in_file(filename, filename_fixed):
    """Repair inconsistent BIO tag sequences in a predictions file.

    Every line of ``filename`` is split on whitespace into tags, corrected
    left to right, and written to ``filename_fixed`` as one space-joined line.
    The number of lines and the number of tags per line are preserved.

    Correction rules (the "previous tag" is always the already-corrected one):

    * ``O`` and ``B-*`` tags are kept unchanged.
    * ``I-X`` at the start of a line or after ``O`` becomes ``B-X``.
    * ``I-X`` after ``B-X`` or ``I-X`` is kept.
    * ``I-X`` after ``I-Y`` becomes ``I-Y`` (the running entity continues).
    * ``I-X`` after ``B-Y`` becomes ``I-Y``.
    * Any tag not present in the mapping tables is reported and replaced by
      ``O`` so that the output stays aligned with the input tokens.

    Relies on the module-level ``inter_to_begin_mapping`` and
    ``begin_to_inter_mapping`` dictionaries.

    Args:
        filename: Path of the UTF-8 file holding the raw tag sequences.
        filename_fixed: Path of the UTF-8 file to write the corrected
            sequences to (overwritten if it exists).
    """
    lines_fixed = []
    with open(filename, "r", encoding="utf-8") as f:
        lines_tokenized = [line.split() for line in f]

    for line in lines_tokenized:
        line_fixed = []
        for element in line:
            # Look at the tag actually emitted for the previous position.
            # Indexing by the input position would break as soon as the two
            # lists got out of step.
            previous_element = line_fixed[-1] if line_fixed else None

            if element == "O" or element in begin_to_inter_mapping:
                # O and B-* tags are valid after anything.
                line_fixed.append(element)
            elif element in inter_to_begin_mapping:
                begin_tag = inter_to_begin_mapping[element]
                if previous_element is None or previous_element == "O":
                    # Nothing to continue: the entity has to start here.
                    line_fixed.append(begin_tag)
                elif previous_element in (element, begin_tag):
                    # Compatible continuation (I-X after B-X or I-X).
                    line_fixed.append(element)
                elif previous_element in inter_to_begin_mapping:
                    # I-X after I-Y: keep extending the running entity.
                    line_fixed.append(previous_element)
                else:
                    # I-X after B-Y: continue the entity that was just opened.
                    line_fixed.append(begin_to_inter_mapping[previous_element])
            else:
                # Unknown tag: emit O instead of dropping the position, which
                # would shift every following tag against its token.
                print(
                    f"This shouldn't happen: unknown tag {element!r} "
                    "replaced with 'O'"
                )
                line_fixed.append("O")
        lines_fixed.append(" ".join(line_fixed))

    with open(filename_fixed, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in lines_fixed)
# Write a corrected copy of the predictions for both evaluation splits.
for split in ("test-A", "dev-0"):
    fix_tags_in_file(
        f"en-ner-conll-2003/{split}/out.tsv",
        f"en-ner-conll-2003/{split}/out_fixed.tsv",
    )