transformer_pipeline/en-ner-conll-2003/main.ipynb

475 lines
66 KiB
Plaintext
Raw Permalink Normal View History

2024-06-03 12:23:34 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Uczenie głębokie w przetwarzaniu tekstu – laboratoria\n",
"# 3. RNN"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"# %pip install torch\n",
"# %pip install ipywidgets\n",
"# %pip install pandas\n",
"# %pip install transformers"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]\n",
" [--paths] [--json] [--debug]\n",
" [subcommand]\n",
"\n",
"Jupyter: Interactive Computing\n",
"\n",
"positional arguments:\n",
" subcommand the subcommand to launch\n",
"\n",
"options:\n",
" -h, --help show this help message and exit\n",
" --version show the versions of core jupyter packages and exit\n",
" --config-dir show Jupyter config dir\n",
" --data-dir show Jupyter data dir\n",
" --runtime-dir show Jupyter runtime dir\n",
" --paths show all Jupyter paths. Add --json for machine-readable\n",
" format.\n",
" --json output paths as machine-readable json\n",
" --debug output debug information about paths\n",
"\n",
"Available subcommands: console dejavu events execute kernel kernelspec lab\n",
"labextension labhub migrate nbconvert notebook qtconsole run server\n",
"troubleshoot trust\n",
"\n",
"Jupyter command `jupyter-nbextension` not found.\n"
]
}
],
"source": [
"# ipywidgets ships its front-end extension pre-enabled on JupyterLab 3+ / Notebook 7+; the legacy\n",
"# `jupyter nbextension enable --py widgetsnbextension` command no longer exists there (it fails with\n",
"# \"Jupyter command `jupyter-nbextension` not found\"), so no manual enabling step is run here.\n",
"import csv\n",
"\n",
"import pandas as pd\n",
"import torch\n",
"from ipywidgets import FloatProgress"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']\n",
"- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
]
}
],
"source": [
"from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline\n",
"\n",
"MODEL_NAME = \"dslim/bert-base-NER\"\n",
"\n",
"# Load the tokenizer and the token-classification checkpoint published under the same hub id.\n",
"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
"model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)\n",
"\n",
"# Wrap both in a NER pipeline; called on a string it returns one dict per tagged word piece.\n",
"nlp = pipeline(task=\"ner\", model=model, tokenizer=tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wczytano dane treningowe\n",
"B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER O B-LOC O O O B-ORG I-ORG O O O O O O B-MISC O O O O O B-MISC O O O O O O O O O O O O O O O B-LOC O O O O B-ORG I-ORG O O O B-PER I-PER O O O O O O O O O O O B-LOC O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O O O B-PER I-PER I-PER I-PER O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG I-ORG O O O O O O O O O B-ORG O O B-PER I-PER O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-PER O B-MISC O O O O B-LOC O B-LOC O O O O O O O B-MISC I-MISC I-MISC O B-MISC O O O O O O O O B-PER O O O O O O O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-MISC O O B-PER I-PER I-PER O O O B-PER O O B-ORG O O O O O O O O O O O O O O O O O O B-LOC O B-LOC O B-PER O O O O O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-MISC O O O O O O O O O O O O O O O O B-MISC O O O O O O O O O O O O O O O O O O O B-MISC O O O O O O B-MISC O O O O O B-LOC O O O O O O O O O O O O O O O O O O O B-LOC O O O O B-ORG I-ORG I-ORG I-ORG I-ORG O B-ORG O O B-PER I-PER I-PER O O B-ORG I-ORG O O B-LOC O O O O O O O O O O O O O O O B-MISC O O O O O O O O O O O O O O O O O O B-LOC O O O O B-LOC O O O O O O O O O O O O O O O O B-MISC O O O O O O O O O O EU rejects German call to boycott British lamb . </S> Peter Blackburn </S> BRUSSELS 1996-08-22 </S> The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep . </S> Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer . </S> \" We do n't support any such recommendation because we do n't see any grounds for it , \" the Commission 's chief spokesman Nikolaus van der Pas told a news briefing . 
</S> He said further scientific study was required and if it was found that action was needed it should be taken by the European Union . </S> He said a proposal last month by EU Farm Commissioner Franz Fischler to ban sheep brains , spleens and spinal cords from the human and animal food chains was a highly specific and precautionary move to protect human health . </S> Fischler proposed EU-wide measures after reports from Britain and France that under laboratory conditions sheep could contract Bovine Spongiform Encephalopathy ( BSE ) -- mad cow disease . </S> But Fischler agreed to review his proposal after the EU 's standing veterinary committee , mational animal health officials , questioned if such action was justified as there was only a slight risk to human health . </S> Spanish Farm Minister Loyola de Palacio had earlier accused Fischler at an EU farm ministers ' meeting of causing unjustified alarm through \" dangerous generalisation . \" </S> . </S> Only France and Britain backed Fischler 's proposal . </S> The EU 's scientific veterinary and multidisciplinary committees are due to re-examine the issue early next month and make recommendations to the senior veterinary officials . </S> Sheep have long been known to contract scrapie , a brain-wasting disease similar to BSE which is believed to have been transferred to cattle through feed containing animal waste . </S> British farmers denied on Thursday there was any danger to human health from their sheep , but expressed concern that German government advice to consumers to avoid British lamb might influence consumers across Europe . </S> \" What we have to be extremely careful of is how other countries are going to take Germany 's lead , \" Welsh National Farmers ' Union ( NFU ) chairman John Lloyd Jones said on BBC radio . 
</S> Bonn has led efforts to protect public health after consumer confidence collapsed in March after a British report suggested humans could contract an illness similar to mad cow disease by eating contaminated beef . </S> Germany imported 47,600 sheep from Britain last year , nearly half of total imports . </S> It brought in
"489\n",
"489\n",
"EU rejects German call to boycott British lamb . </S> Peter Blackburn </S> BRUSSELS 1996-08-22 </S> The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep . </S> Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer . </S> \" We do n't support any such recommendation because we do n't see any grounds for it , \" the Commission 's chief spokesman Nikolaus van der Pas told a news briefing . </S> He said further scientific study was required and if it was found that action was needed it should be taken by the European Union . </S> He said a proposal last month by EU Farm Commissioner Franz Fischler to ban sheep brains , spleens and spinal cords from the human and animal food chains was a highly specific and precautionary move to protect human health . </S> Fischler proposed EU-wide measures after reports from Britain and France that under laboratory conditions sheep could contract Bovine Spongiform Encephalopathy ( BSE ) -- mad cow disease . </S> But Fischler agreed to review his proposal after the EU 's standing veterinary committee , mational animal health officials , questioned if such action was justified as there was only a slight risk to human health . </S> Spanish Farm Minister Loyola de Palacio had earlier accused Fischler at an EU farm ministers ' meeting of causing unjustified alarm through \" dangerous generalisation . \" </S> . </S> Only France and Britain backed Fischler 's proposal . </S> The EU 's scientific veterinary and multidisciplinary committees are due to re-examine the issue early next month and make recommendations to the senior veterinary officials . 
</S> Sheep have long been known to contract scrapie , a brain-wasting disease similar to BSE which is believed to have been transferred to cattle through feed containing animal waste . </S> British farmers denied on Thursday there was any danger to human health from their sheep , but expressed concern that German government advice to consumers to avoid British lamb might influence consumers across Europe . </S> \" What we have to be extremely careful of is how other countries are going to take Germany 's lead , \" Welsh National Farmers ' Union ( NFU ) chairman John Lloyd Jones said on BBC radio . </S> Bonn has led efforts to protect public health after consumer confidence collapsed in March after a British report suggested humans could contract an illness similar to mad cow disease by eating contaminated beef . </S> Germany imported 47,600 sheep from Britain last year , nearly half of total imports . </S> It brought in 4,275 tonnes of British mutton , some 10 percent of overall imports . </S>\n"
]
}
],
"source": [
"# Load the training data: one document per row, tab-separated, labels in column `y` and text in `x`.\n",
"# QUOTE_NONE keeps every double quote as a literal character; with the default quoting a text field\n",
"# that begins with '\"' is parsed as a quoted field, which silently drops tokens or merges rows.\n",
"train = pd.read_csv(\n",
"    'train/train.tsv',\n",
"    sep='\\t',\n",
"    names=['y', 'x'],\n",
"    header=None,\n",
"    quoting=csv.QUOTE_NONE,\n",
")\n",
"\n",
"print(\"wczytano dane treningowe\")\n",
"print(train[\"y\"][0], train[\"x\"][0])\n",
"\n",
"# Label count and token count of the first document; the two numbers should match.\n",
"print(len(train[\"y\"][0].split(\" \")))\n",
"print(len(train[\"x\"][0].split(\" \")))\n",
"\n",
"print(train[\"x\"][0])"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wczytano dane testowe dev-0\n",
"CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY . </S> LONDON 1996-08-30 </S> West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship . </S> Their stay on top , though , may be short-lived as title rivals Essex , Derbyshire and Surrey all closed in on victory while Kent made up for lost time in their rain-affected match against Nottinghamshire . </S> After bowling Somerset out for 83 on the opening morning at Grace Road , Leicestershire extended their first innings by 94 runs before being bowled out for 296 with England discard Andy Caddick taking three for 83 . </S> Trailing by 213 , Somerset got a solid start to their second innings before Simmons stepped in to bundle them out for 174 . </S> Essex , however , look certain to regain their top spot after Nasser Hussain and Peter Such gave them a firm grip on their match against Yorkshire at Headingley . </S> Hussain , considered surplus to England 's one-day requirements , struck 158 , his first championship century of the season , as Essex reached 372 and took a first innings lead of 82 . </S> By the close Yorkshire had turned that into a 37-run advantage but off-spinner Such had scuttled their hopes , taking four for 24 in 48 balls and leaving them hanging on 119 for five and praying for rain . </S> At the Oval , Surrey captain Chris Lewis , another man dumped by England , continued to silence his critics as he followed his four for 45 on Thursday with 80 not out on Friday in the match against Warwickshire . </S> He was well backed by England hopeful Mark Butcher who made 70 as Surrey closed on 429 for seven , a lead of 234 . </S> Derbyshire kept up the hunt for their first championship title since 1936 by reducing Worcestershire to 133 for five in their second innings , still 100 runs away from avoiding an innings defeat . 
</S> Australian Tom Moody took six for 82 but Chris Adams , 123 , and Tim O'Gorman , 109 , took Derbyshire to 471 and a first innings lead of 233 . </S> After the frustration of seeing the opening day of their match badly affected by the weather , Kent stepped up a gear to dismiss Nottinghamshire for 214 . </S> They were held up by a gritty 84 from Paul Johnson but ex-England fast bowler Martin McCague took four for 55 . </S> By stumps Kent had reached 108 for three . </S>\n"
]
}
],
"source": [
"# Load the dev-0 input texts (single column `x`, one document per row).\n",
"# QUOTE_NONE keeps double quotes literal: with the default quoting a document that begins with '\"'\n",
"# would lose tokens, and its word count would no longer match the expected labels.\n",
"test_dev0 = pd.read_csv(\n",
"    'dev-0/in.tsv',\n",
"    sep='\\t',\n",
"    names=['x'],\n",
"    header=None,\n",
"    quoting=csv.QUOTE_NONE,\n",
")\n",
"\n",
"print(\"wczytano dane testowe dev-0\")\n",
"print(test_dev0[\"x\"][0])"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wczytano dane testowe A\n",
"SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT . </S> Nadim Ladki </S> AL-AIN , United Arab Emirates 1996-12-06 </S> Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday . </S> But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan . </S> China controlled most of the match and saw several chances missed until the 78th minute when Uzbek striker Igor Shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing Chinese keeper and into an empty net . </S> Oleg Shatskiku made sure of the win in injury time , hitting an unstoppable left foot shot from just outside the area . </S> The former Soviet republic was playing in an Asian Cup finals tie for the first time . </S> Despite winning the Asian Games title two years ago , Uzbekistan are in the finals as outsiders . </S> Two goals from defensive errors in the last six minutes allowed Japan to come from behind and collect all three points from their opening meeting against Syria . </S> Takuya Takagi scored the winner in the 88th minute , rising to head a Hiroshige Yanagimoto cross towards the Syrian goal which goalkeeper Salem Bitar appeared to have covered but then allowed to slip into the net . </S> It was the second costly blunder by Syria in four minutes . </S> Defender Hassan Abbas rose to intercept a long ball into the area in the 84th minute but only managed to divert it into the top corner of Bitar 's goal . </S> Nader Jokhadar had given Syria the lead with a well-struck header in the seventh minute . </S> Japan then laid siege to the Syrian penalty area for most of the game but rarely breached the Syrian defence . </S> Bitar pulled off fine saves whenever they did . </S> Japan coach Shu Kamo said : ' ' The Syrian own goal proved lucky for us . 
</S> The Syrians scored early and then played defensively and adopted long balls which made it hard for us . ' </S> ' </S> Japan , co-hosts of the World Cup in 2002 and ranked 20th in the world by FIFA , are favourites to regain their title here . </S> Hosts UAE play Kuwait and South Korea take on Indonesia on Saturday in Group A matches . </S> All four teams are level with one point each from one game . </S>\n"
]
}
],
"source": [
"# Load the test-A input texts (single column `x`, one document per row).\n",
"# QUOTE_NONE keeps double quotes literal: with the default quoting a document that begins with '\"'\n",
"# would lose tokens or be merged with the rows that follow it.\n",
"test_A = pd.read_csv(\n",
"    'test-A/in.tsv',\n",
"    sep='\\t',\n",
"    names=['x'],\n",
"    header=None,\n",
"    quoting=csv.QUOTE_NONE,\n",
")\n",
"\n",
"print(\"wczytano dane testowe A\")\n",
"print(test_A[\"x\"][0])"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"456\n",
"O O B-ORG O O O O O O O O O B-LOC O O B-MISC I-MISC O B-PER I-PER O O O O O O O B-ORG O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O B-ORG O B-ORG O O O O O O B-ORG O O O O O O O O O O B-ORG O O O O B-ORG O O O O O O O O B-LOC I-LOC O B-ORG O O O O O O O O O O O O O O B-LOC O B-PER I-PER O O O O O O O O O O B-ORG O O O O O O O O O B-PER O O O O O O O O O O B-ORG O O O O O O O O O O O B-PER I-PER O B-PER I-PER O O O O O O O O O B-ORG O B-LOC O O B-PER O O O O B-LOC O O O O O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O O O B-ORG O O O O O O O O O B-PER O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-LOC O B-ORG O B-PER I-PER O O O O O B-LOC O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O O O O O O O B-LOC O B-PER I-PER O O O O B-ORG O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O O O O O O O O B-MISC B-PER I-PER O O O O O B-PER I-PER O O O O B-PER I-PER O O O O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O O O O O O B-ORG O O O O O O O O O O O O O B-PER I-PER O B-MISC O O B-PER I-PER O O O O O O O O B-ORG O O O O O O O\n"
]
}
],
"source": [
"# Gold labels for dev-0: one row per document, space-separated tags in column `y`.\n",
"labels_dev0 = pd.read_csv(\"dev-0/expected.tsv\", sep=\"\\t\", names=[\"y\"], header=None)\n",
"\n",
"# Show how many tags the first document has, then the tags themselves.\n",
"first_doc_labels = labels_dev0[\"y\"][0]\n",
"print(len(first_doc_labels.split(\" \")))\n",
"print(first_doc_labels)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY . </S> LONDON 1996-08-30 </S> West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship . </S> Their stay on top , though , may be short-lived as title rivals Essex , Derbyshire and Surrey all closed in on victory while Kent made up for lost time in their rain-affected match against Nottinghamshire . </S> After bowling Somerset out for 83 on the opening morning at Grace Road , Leicestershire extended their first innings by 94 runs before being bowled out for 296 with England discard Andy Caddick taking three for 83 . </S> Trailing by 213 , Somerset got a solid start to their second innings before Simmons stepped in to bundle them out for 174 . </S> Essex , however , look certain to regain their top spot after Nasser Hussain and Peter Such gave them a firm grip on their match against Yorkshire at Headingley . </S> Hussain , considered surplus to England 's one-day requirements , struck 158 , his first championship century of the season , as Essex reached 372 and took a first innings lead of 82 . </S> By the close Yorkshire had turned that into a 37-run advantage but off-spinner Such had scuttled their hopes , taking four for 24 in 48 balls and leaving them hanging on 119 for five and praying for rain . </S> At the Oval , Surrey captain Chris Lewis , another man dumped by England , continued to silence his critics as he followed his four for 45 on Thursday with 80 not out on Friday in the match against Warwickshire . </S> He was well backed by England hopeful Mark Butcher who made 70 as Surrey closed on 429 for seven , a lead of 234 . </S> Derbyshire kept up the hunt for their first championship title since 1936 by reducing Worcestershire to 133 for five in their second innings , still 100 runs away from avoiding an innings defeat . 
</S> Australian Tom Moody took six for 82 but Chris Adams , 123 , and Tim O'Gorman , 109 , took Derbyshire to 471 and a first innings lead of 233 . </S> After the frustration of seeing the opening day of their match badly affected by the weather , Kent stepped up a gear to dismiss Nottinghamshire for 214 . </S> They were held up by a gritty 84 from Paul Johnson but ex-England fast bowler Martin McCague took four for 55 . </S> By stumps Kent had reached 108 for three . </S>\n",
"O O B-ORG O O O O O O O O O B-LOC O O B-MISC I-MISC O B-PER I-PER O O O O O O O B-ORG O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O B-ORG O B-ORG O O O O O O B-ORG O O O O O O O O O O B-ORG O O O O B-ORG O O O O O O O O B-LOC I-LOC O B-ORG O O O O O O O O O O O O O O B-LOC O B-PER I-PER O O O O O O O O O O B-ORG O O O O O O O O O B-PER O O O O O O O O O O B-ORG O O O O O O O O O O O B-PER I-PER O B-PER I-PER O O O O O O O O O B-ORG O B-LOC O O B-PER O O O O B-LOC O O O O O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O O O B-ORG O O O O O O O O O B-PER O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-LOC O B-ORG O B-PER I-PER O O O O O B-LOC O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O O O O O O O B-LOC O B-PER I-PER O O O O B-ORG O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O O O O O O O O B-MISC B-PER I-PER O O O O O B-PER I-PER O O O O B-PER I-PER O O O O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O O O O O O B-ORG O O O O O O O O O O O O O B-PER I-PER O B-MISC O O B-PER I-PER O O O O O O O O B-ORG O O O O O O O\n",
"[{'entity': 'B-LOC', 'score': 0.69772226, 'index': 6, 'word': 'L', 'start': 10, 'end': 11}, {'entity': 'B-LOC', 'score': 0.99893564, 'index': 40, 'word': 'L', 'start': 71, 'end': 72}, {'entity': 'B-LOC', 'score': 0.618876, 'index': 41, 'word': '##ON', 'start': 72, 'end': 74}, {'entity': 'I-LOC', 'score': 0.8289433, 'index': 42, 'word': '##D', 'start': 74, 'end': 75}, {'entity': 'I-LOC', 'score': 0.98961467, 'index': 43, 'word': '##ON', 'start': 75, 'end': 77}, {'entity': 'B-MISC', 'score': 0.9995053, 'index': 53, 'word': 'West', 'start': 94, 'end': 98}, {'entity': 'I-MISC', 'score': 0.9968857, 'index': 54, 'word': 'Indian', 'start': 99, 'end': 105}, {'entity': 'B-PER', 'score': 0.99974555, 'index': 59, 'word': 'Phil', 'start': 118, 'end': 122}, {'entity': 'I-PER', 'score': 0.9996648, 'index': 60, 'word': 'Simmons', 'start': 123, 'end': 130}, {'entity': 'B-ORG', 'score': 0.9988734, 'index': 68, 'word': 'Leicestershire', 'start': 161, 'end': 175}, {'entity': 'B-ORG', 'score': 0.99889946, 'index': 70, 'word': 'Somerset', 'start': 181, 'end': 189}, {'entity': 'B-ORG', 'score': 0.99934953, 'index': 110, 'word': 'Essex', 'start': 351, 'end': 356}, {'entity': 'B-ORG', 'score': 0.9992531, 'index': 112, 'word': 'Derbyshire', 'start': 359, 'end': 369}, {'entity': 'B-ORG', 'score': 0.9992465, 'index': 114, 'word': 'Surrey', 'start': 374, 'end': 380}, {'entity': 'B-ORG', 'score': 0.9990815, 'index': 121, 'word': 'Kent', 'start': 412, 'end': 416}, {'entity': 'B-ORG', 'score': 0.99872905, 'index': 134, 'word': 'Nottinghamshire', 'start': 476, 'end': 491}, {'entity': 'B-ORG', 'score': 0.9989404, 'index': 142, 'word': 'Somerset', 'start': 513, 'end': 521}, {'entity': 'B-LOC', 'score': 0.99537045, 'index': 151, 'word': 'Grace', 'start': 559, 'end': 564}, {'entity': 'I-LOC', 'score': 0.99789894, 'index': 152, 'word': 'Road', 'start': 565, 'end': 569}, {'entity': 'B-ORG', 'score': 0.9989697, 'index': 154, 'word': 'Leicestershire', 'start': 572, 'end': 586}, {'entity': 'B-LOC', 
'score': 0.9996301, 'index': 170, 'word': 'England', 'start': 664, 'end': 671}, {'entity': 'B-PER', 'score': 0.99971694, 'index': 173, 'word': 'Andy', 'start': 680, 'end': 684}, {'entity': 'I-PER', 'score': 0.999627, 'index': 174, 'word': 'C', 'start': 685, 'end': 686}, {'entity': 'I-PER', 'score': 0.99152094, 'index': 175, 'word': '##ad', 'start': 686, 'end': 688}, {'entity': 'I-PER', 'score': 0.9895243, 'index': 176, 'word': '##dick', 'start': 688, 'end': 692}, {'entity': 'B-ORG', 'score': 0.99902487, 'index': 191, 'word': 'Somerset', 'start': 738, 'end': 746}, {'entity': 'B-PER', 'score': 0.99702805, 'index': 201, 'word': 'Simmons', 'start': 796, 'end': 803}, {'entity': 'B-ORG', 'score': 0.99917525, 'index': 215, 'word': 'Essex', 'start': 849, 'end': 854}, {'entity': 'B-PER', 'score': 0.999554, 'index': 227, 'word': 'Na', 'start': 911, 'end': 913}, {'entity': 'B-PER', 'score': 0.9975007, 'index': 228, 'word': '##sser', 'start': 913, 'end': 917}, {'entity': 'I-PER', 'score': 0.99959487, 'index': 229, 'word': 'Hussain', 'start': 918, 'end': 925}, {'entity': 'B-PER', 'score': 0.9997563, 'index': 231, 'word': 'Peter', 'start': 930, 'end': 935}, {'entity': 'I-PER', 'score': 0.9985538, 'index': 232, 'word': 'Such', 'start': 936, 'end': 940}, {'entity': 'B-ORG', 'score': 0.9986828, 'index': 242, 'word': 'Yorkshire', 'start': 986, 'end': 995}, {'entity': 'B-LOC', 'score': 0.99893683, 'index': 244, 'word': 'Head', 'start': 999, 'end': 1003}, {'entity': 'I-LOC', 'score': 0.9932749, 'index': 245, 'word': '##ing', 'start': 1003, 'end': 1006}, {'entity': 'I-LOC', 'score': 0.9981122, 'index': 246, 'word': '##ley', 'start': 1006, 'end': 1009}, {'entity': 'B-PER', 'score': 0.9949905, 'index': 252, 'word': 'Hussain', 'start': 1017, 'end': 1024}, {'entity': 'B-LOC', 'score': 0.99968874, 'index': 257, 'word': 'England', 'start': 1049, 'end': 1056}, {'entity': 'B-ORG', 'score': 0.9988887, 'index': 277, 'word': 'Essex', 'start': 1146, 'end': 1151}, {'entity': 'B-ORG', 'score': 
0.9990658, 'index': 297, 'word': 'Yorkshire', 'start': 1220, 'end': 1229}, {'entity': 'B-PER', 'score
]
}
],
"source": [
"# Predictions for dev-0.\n",
"print(test_dev0[\"x\"][0])\n",
"print(labels_dev0[\"y\"][0])\n",
"\n",
"SENTENCE_END = \"</S>\"\n",
"\n",
"\n",
"def predict_entities(text):\n",
"    \"\"\"Tag `text` one sentence at a time and return the entities with document-level offsets.\n",
"\n",
"    Whole documents can be longer than the model's maximum input length (512 word pieces for\n",
"    this BERT checkpoint); the pipeline truncates such inputs, so entities near the end of a\n",
"    long document would never be predicted. Feeding each `</S>`-delimited sentence separately\n",
"    avoids the truncation.\n",
"\n",
"    Returns a list of the pipeline's prediction dicts whose `start`/`end` are shifted so that\n",
"    they index into `text` itself; `index` stays relative to the sentence it came from.\n",
"    \"\"\"\n",
"    entities = []\n",
"    offset = 0  # position of the current sentence's first character within `text`\n",
"    for sentence in text.split(SENTENCE_END):\n",
"        if sentence.strip():  # nothing to tag in empty / whitespace-only pieces\n",
"            for entity in nlp(sentence):\n",
"                entities.append(\n",
"                    {**entity, \"start\": entity[\"start\"] + offset, \"end\": entity[\"end\"] + offset}\n",
"                )\n",
"        offset += len(sentence) + len(SENTENCE_END)\n",
"    return entities\n",
"\n",
"\n",
"predicted_labels_dev0 = [predict_entities(tekst) for tekst in test_dev0[\"x\"]]\n",
"\n",
"# Only the first few predictions are shown; the full list is very long.\n",
"print(predicted_labels_dev0[0][:10])"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY . </S> LONDON 1996-08-30 </S> West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship . </S> Their stay on top , though , may be short-lived as title rivals Essex , Derbyshire and Surrey all closed in on victory while Kent made up for lost time in their rain-affected match against Nottinghamshire . </S> After bowling Somerset out for 83 on the opening morning at Grace Road , Leicestershire extended their first innings by 94 runs before being bowled out for 296 with England discard Andy Caddick taking three for 83 . </S> Trailing by 213 , Somerset got a solid start to their second innings before Simmons stepped in to bundle them out for 174 . </S> Essex , however , look certain to regain their top spot after Nasser Hussain and Peter Such gave them a firm grip on their match against Yorkshire at Headingley . </S> Hussain , considered surplus to England 's one-day requirements , struck 158 , his first championship century of the season , as Essex reached 372 and took a first innings lead of 82 . </S> By the close Yorkshire had turned that into a 37-run advantage but off-spinner Such had scuttled their hopes , taking four for 24 in 48 balls and leaving them hanging on 119 for five and praying for rain . </S> At the Oval , Surrey captain Chris Lewis , another man dumped by England , continued to silence his critics as he followed his four for 45 on Thursday with 80 not out on Friday in the match against Warwickshire . </S> He was well backed by England hopeful Mark Butcher who made 70 as Surrey closed on 429 for seven , a lead of 234 . </S> Derbyshire kept up the hunt for their first championship title since 1936 by reducing Worcestershire to 133 for five in their second innings , still 100 runs away from avoiding an innings defeat . 
</S> Australian Tom Moody took six for 82 but Chris Adams , 123 , and Tim O'Gorman , 109 , took Derbyshire to 471 and a first innings lead of 233 . </S> After the frustration of seeing the opening day of their match badly affected by the weather , Kent stepped up a gear to dismiss Nottinghamshire for 214 . </S> They were held up by a gritty 84 from Paul Johnson but ex-England fast bowler Martin McCague took four for 55 . </S> By stumps Kent had reached 108 for three . </S>\n",
"[(0, 6), (8, 8), (10, 23), (25, 28), (30, 33), (35, 36), (38, 40), (42, 46), (48, 54), (56, 62), (64, 64), (66, 69), (71, 76), (78, 87), (89, 92), (94, 97), (99, 104), (106, 116), (118, 121), (123, 129), (131, 134), (136, 139), (141, 143), (145, 146), (148, 149), (151, 156), (158, 159), (161, 174), (176, 179), (181, 188), (190, 191), (193, 194), (196, 202), (204, 206), (208, 209), (211, 214), (216, 217), (219, 221), (223, 226), (228, 229), (231, 234), (236, 239), (241, 242), (244, 246), (248, 251), (253, 254), (256, 258), (260, 265), (267, 278), (280, 280), (282, 285), (287, 291), (293, 296), (298, 299), (301, 303), (305, 305), (307, 312), (314, 314), (316, 318), (320, 321), (323, 333), (335, 336), (338, 342), (344, 349), (351, 355), (357, 357), (359, 368), (370, 372), (374, 379), (381, 383), (385, 390), (392, 393), (395, 396), (398, 404), (406, 410), (412, 415), (417, 420), (422, 423), (425, 427), (429, 432), (434, 437), (439, 440), (442, 446), (448, 460), (462, 466), (468, 474), (476, 490), (492, 492), (494, 497), (499, 503), (505, 511), (513, 520), (522, 524), (526, 528), (530, 531), (533, 534), (536, 538), (540, 546), (548, 554), (556, 557), (559, 563), (565, 568), (570, 570), (572, 585), (587, 594), (596, 600), (602, 606), (608, 614), (616, 617), (619, 620), (622, 625), (627, 632), (634, 638), (640, 645), (647, 649), (651, 653), (655, 657), (659, 662), (664, 670), (672, 678), (680, 683), (685, 691), (693, 698), (700, 704), (706, 708), (710, 711), (713, 713), (715, 718), (720, 727), (729, 730), (732, 734), (736, 736), (738, 745), (747, 749), (751, 751), (753, 757), (759, 763), (765, 766), (768, 772), (774, 779), (781, 787), (789, 794), (796, 802), (804, 810), (812, 813), (815, 816), (818, 823), (825, 828), (830, 832), (834, 836), (838, 840), (842, 842), (844, 847), (849, 853), (855, 855), (857, 863), (865, 865), (867, 870), (872, 878), (880, 881), (883, 888), (890, 894), (896, 898), (900, 903), (905, 909), (911, 916), (918, 924), (926, 928), (930, 934), (936, 
939), (941, 944), (946, 949), (951, 951), (953, 956), (958, 961), (963, 964), (966, 970), (972, 976), (978, 984), (986, 994), (996, 997), (999, 1008), (1010, 1010), (1012, 1015), (1017, 1023), (1025, 1025), (1027, 1036), (1038, 1044), (1046, 1047), (1049, 1055), (1057, 1058), (1060, 1066), (1068, 1079), (1081, 1081), (1083, 1088), (1090, 1092), (1094, 1094), (1096, 1098), (1100, 1104), (1106, 1117), (1119, 1125), (1127, 1128), (1130, 1132), (1134, 1139), (1141, 1141), (1143, 1144), (1146, 1150), (1152, 1158), (1160, 1162), (1164, 1166), (1168, 1171), (1173, 1173), (1175, 1179), (1181, 1187), (1189, 1192), (1194, 1195), (1197, 1198), (1200, 1200), (1202, 1205), (1207, 1208), (1210, 1212), (1214, 1218), (1220, 1228), (1230, 1232), (1234, 1239), (1241, 1244), (1246, 1249), (1251, 1251), (1253, 1258), (1260, 1268), (1270, 1272), (1274, 1284), (1286, 1289), (1291, 1293), (1295, 1302), (1304, 1308), (1310, 1314), (1316, 1316), (1318, 1323), (1325, 1328), (1330, 1332), (1334, 1335), (1337, 1338), (1340, 1341), (1343, 1347), (1349, 1351), (1353, 1359), (1361, 1364), (1366, 1372), (1374, 1375), (1377, 1379), (1381, 1383), (1385, 1388), (1390, 1392), (1394, 1400), (1402, 1404), (1406, 1409), (1411, 1411), (1413, 1416), (1418, 1419), (1421, 1423), (1425, 1428), (1430, 1430), (1432, 1437), (1439, 1445), (1447, 1451), (1453, 1457), (1459, 1459), (1461, 1467), (1469, 1471), (1473, 1478), (1480, 1481), (1483, 1489), (1491, 1491), (1493, 1501), (1503, 1504), (1506, 1512), (1514, 1516), (1518, 1524), (1526, 1527), (1529, 1530), (1532, 1539), (1541, 1543), (1545, 1548), (1550, 1552), (1554, 1555), (1557, 1558), (1560, 1567), (1569, 1572), (1574, 1575), (1577, 1579), (1581, 1583), (1585, 1586), (1588, 1593), (1595, 1596), (1598, 1600), (1602, 1606), (1608, 1614), (1616, 1627), (1629, 1629), (1631, 1634), (1636, 1637), (1639, 1641), (1643, 1646), (1648, 1653), (1655, 1656), (1658, 1664), (1666, 1672), (1674, 1677), (1679, 1685), (1687, 1689), (1691, 1694), (1696, 1697), (1699, 1700), 
(1702, 1707), (1709, 1714), (1716, 1717), (1719, 1721), (1723, 1725), (1727, 1731), (1733,
"456\n",
"456\n"
]
}
],
"source": [
"# zakladamy ze teksty nie zaczynaj ani nie koncza sie na spacji, nie ma wielokrotnych spacji\n",
"zakresy_slow_tekstow = []\n",
"for tekst in test_dev0[\"x\"]:\n",
" zakresy_slow = []\n",
" poczatek = 0\n",
" koniec = len(tekst)-1\n",
" for numer_zakresu in range(len(tekst)):\n",
" znak = tekst[numer_zakresu]\n",
" if znak == \" \":\n",
" koniec = numer_zakresu-1\n",
" zakres = (poczatek, koniec)\n",
" zakresy_slow.append(zakres)\n",
" poczatek = numer_zakresu+1\n",
" koniec = len(tekst)-1\n",
" zakres = (poczatek, koniec)\n",
" zakresy_slow.append(zakres)\n",
" zakresy_slow_tekstow.append(zakresy_slow)\n",
"\n",
"print(test_dev0[\"x\"][0])\n",
"print(zakresy_slow_tekstow[0])\n",
"\n",
"print(len(labels_dev0[\"y\"][0].split(\" \")))\n",
"print(len(zakresy_slow_tekstow[0]))"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"poprawione_predicted_labels_dev0 = []\n",
"for linijka in range(len(test_dev0[\"x\"])):\n",
" zakresy_slow = zakresy_slow_tekstow[linijka]\n",
" przewidywania = predicted_labels_dev0[linijka]\n",
" etykiety = [\"O\" for _ in zakresy_slow]\n",
" for numer_zakresu in range(len(zakresy_slow)):\n",
" zakres = zakresy_slow[numer_zakresu]\n",
" for p in przewidywania:\n",
" start = p[\"start\"]\n",
" if start>=zakres[0] and start<=zakres[1]:\n",
" tag = p[\"entity\"]\n",
" if etykiety[numer_zakresu] == \"O\":\n",
" etykiety[numer_zakresu] = tag\n",
" poprawione_predicted_labels_dev0.append(\" \".join(etykiety).strip())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"O O B-ORG O O O O O O O O O B-LOC O O B-MISC I-MISC O B-PER I-PER O O O O O O O B-ORG O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O B-ORG O B-ORG O O O O O O B-ORG O O O O O O O O O O B-ORG O O O O B-ORG O O O O O O O O B-LOC I-LOC O B-ORG O O O O O O O O O O O O O O B-LOC O B-PER I-PER O O O O O O O O O O B-ORG O O O O O O O O O B-PER O O O O O O O O O O B-ORG O O O O O O O O O O O B-PER I-PER O B-PER I-PER O O O O O O O O O B-ORG O B-LOC O O B-PER O O O O B-LOC O O O O O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O O O B-ORG O O O O O O O O O B-PER O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-LOC O B-ORG O B-PER I-PER O O O O O B-LOC O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O O O O O O O B-LOC O B-PER I-PER O O O O B-ORG O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O O O O O O O O B-MISC B-PER I-PER O O O O O B-PER I-PER O O O O B-PER I-PER O O O O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O O O O O O B-ORG O O O O O O O O O O O O O B-PER I-PER O B-MISC O O B-PER I-PER O O O O O O O O B-ORG O O O O O O O\n",
"O O B-LOC O O O O O O O O O B-LOC O O B-MISC I-MISC O B-PER I-PER O O O O O O O B-ORG O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O B-ORG O B-ORG O O O O O O B-ORG O O O O O O O O O O B-ORG O O O O B-ORG O O O O O O O O B-LOC I-LOC O B-ORG O O O O O O O O O O O O O O B-LOC O B-PER I-PER O O O O O O O O O O B-ORG O O O O O O O O O B-PER O O O O O O O O O O B-ORG O O O O O O O O O O O B-PER I-PER O B-PER I-PER O O O O O O O O O B-ORG O B-LOC O O B-PER O O O O B-LOC O O O O O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O O O B-ORG O O O O O O O O O B-PER O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-LOC O B-ORG O B-PER I-PER O O O O O B-LOC O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O O O O O O O B-LOC O B-PER B-PER O O O O B-ORG O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O O O O O O O O B-MISC B-PER B-PER O O O O O B-PER B-PER O O O O B-PER B-PER O O O O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O\n",
"456\n",
"456\n"
]
}
],
"source": [
"print(labels_dev0[\"y\"][0])\n",
"print(poprawione_predicted_labels_dev0[0])\n",
"\n",
"print(len(labels_dev0[\"y\"][0].split(\" \")))\n",
"print(len(poprawione_predicted_labels_dev0[0].split(\" \")))"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"with open(\"dev-0/out.tsv\", \"w\", encoding=\"utf-8\") as uwu:\n",
" for linijka in poprawione_predicted_labels_dev0:\n",
" uwu.write(linijka)\n",
" uwu.write(str(\"\\n\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# test A"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT . </S> Nadim Ladki </S> AL-AIN , United Arab Emirates 1996-12-06 </S> Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday . </S> But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan . </S> China controlled most of the match and saw several chances missed until the 78th minute when Uzbek striker Igor Shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing Chinese keeper and into an empty net . </S> Oleg Shatskiku made sure of the win in injury time , hitting an unstoppable left foot shot from just outside the area . </S> The former Soviet republic was playing in an Asian Cup finals tie for the first time . </S> Despite winning the Asian Games title two years ago , Uzbekistan are in the finals as outsiders . </S> Two goals from defensive errors in the last six minutes allowed Japan to come from behind and collect all three points from their opening meeting against Syria . </S> Takuya Takagi scored the winner in the 88th minute , rising to head a Hiroshige Yanagimoto cross towards the Syrian goal which goalkeeper Salem Bitar appeared to have covered but then allowed to slip into the net . </S> It was the second costly blunder by Syria in four minutes . </S> Defender Hassan Abbas rose to intercept a long ball into the area in the 84th minute but only managed to divert it into the top corner of Bitar 's goal . </S> Nader Jokhadar had given Syria the lead with a well-struck header in the seventh minute . </S> Japan then laid siege to the Syrian penalty area for most of the game but rarely breached the Syrian defence . </S> Bitar pulled off fine saves whenever they did . </S> Japan coach Shu Kamo said : ' ' The Syrian own goal proved lucky for us . 
</S> The Syrians scored early and then played defensively and adopted long balls which made it hard for us . ' </S> ' </S> Japan , co-hosts of the World Cup in 2002 and ranked 20th in the world by FIFA , are favourites to regain their title here . </S> Hosts UAE play Kuwait and South Korea take on Indonesia on Saturday in Group A matches . </S> All four teams are level with one point each from one game . </S>\n",
"[{'entity': 'B-LOC', 'score': 0.7067536, 'index': 6, 'word': 'J', 'start': 9, 'end': 10}, {'entity': 'I-LOC', 'score': 0.58363664, 'index': 7, 'word': '##AP', 'start': 10, 'end': 12}, {'entity': 'B-PER', 'score': 0.30825755, 'index': 11, 'word': 'L', 'start': 19, 'end': 20}, {'entity': 'B-LOC', 'score': 0.61186683, 'index': 18, 'word': 'CH', 'start': 31, 'end': 33}, {'entity': 'I-LOC', 'score': 0.55468416, 'index': 19, 'word': '##IN', 'start': 33, 'end': 35}, {'entity': 'B-PER', 'score': 0.5897185, 'index': 36, 'word': 'Na', 'start': 63, 'end': 65}, {'entity': 'I-PER', 'score': 0.8697179, 'index': 39, 'word': 'La', 'start': 69, 'end': 71}, {'entity': 'I-LOC', 'score': 0.37895498, 'index': 40, 'word': '##d', 'start': 71, 'end': 72}, {'entity': 'B-LOC', 'score': 0.9993087, 'index': 46, 'word': 'AL', 'start': 80, 'end': 82}, {'entity': 'I-LOC', 'score': 0.7062251, 'index': 47, 'word': '-', 'start': 82, 'end': 83}, {'entity': 'I-LOC', 'score': 0.4951169, 'index': 48, 'word': 'AI', 'start': 83, 'end': 85}, {'entity': 'I-LOC', 'score': 0.9822494, 'index': 49, 'word': '##N', 'start': 85, 'end': 86}, {'entity': 'B-LOC', 'score': 0.9995478, 'index': 51, 'word': 'United', 'start': 89, 'end': 95}, {'entity': 'I-LOC', 'score': 0.9983919, 'index': 52, 'word': 'Arab', 'start': 96, 'end': 100}, {'entity': 'I-LOC', 'score': 0.9991522, 'index': 53, 'word': 'Emirates', 'start': 101, 'end': 109}, {'entity': 'B-LOC', 'score': 0.9998192, 'index': 63, 'word': 'Japan', 'start': 126, 'end': 131}, {'entity': 'B-MISC', 'score': 0.9982241, 'index': 69, 'word': 'Asian', 'start': 159, 'end': 164}, {'entity': 'I-MISC', 'score': 0.99690956, 'index': 70, 'word': 'Cup', 'start': 165, 'end': 168}, {'entity': 'B-LOC', 'score': 0.9997726, 'index': 80, 'word': 'Syria', 'start': 204, 'end': 209}, {'entity': 'I-MISC', 'score': 0.5675147, 'index': 84, 'word': 'C', 'start': 221, 'end': 222}, {'entity': 'B-LOC', 'score': 0.9998385, 'index': 95, 'word': 'China', 'start': 263, 'end': 268}, {'entity': 
'B-LOC', 'score': 0.9997861, 'index': 120, 'word': 'Uzbekistan', 'start': 378, 'end': 388}, {'entity': 'B-LOC', 'score': 0.99983716, 'index': 126, 'word': 'China', 'start': 396, 'end': 401}, {'entity': 'B-MISC', 'score': 0.95902425, 'index': 143, 'word': 'U', 'start': 489, 'end': 490}, {'entity': 'I-MISC', 'score': 0.7956443, 'index': 144, 'word': '##z', 'start': 490, 'end': 491}, {'entity': 'I-MISC', 'score': 0.98343384, 'index': 145, 'word': '##bek', 'start': 491, 'end': 494}, {'entity': 'B-PER', 'score': 0.99971837, 'index': 147, 'word': 'Igor', 'start': 503, 'end': 507}, {'entity': 'I-PER', 'score': 0.9996294, 'index': 148, 'word': 'S', 'start': 508, 'end': 509}, {'entity': 'I-PER', 'score': 0.9988809, 'index': 149, 'word': '##h', 'start': 509, 'end': 510}, {'entity': 'I-PER', 'score': 0.99662894, 'index': 150, 'word': '##k', 'start': 510, 'end': 511}, {'entity': 'I-PER', 'score': 0.95508987, 'index': 151, 'word': '##vy', 'start': 511, 'end': 513}, {'entity': 'B-MISC', 'score': 0.9997379, 'index': 172, 'word': 'Chinese', 'start': 601, 'end': 608}, {'entity': 'B-PER', 'score': 0.99747485, 'index': 184, 'word': 'Ole', 'start': 645, 'end': 648}, {'entity': 'B-PER', 'score': 0.9991104, 'index': 185, 'word': '##g', 'start': 648, 'end': 649}, {'entity': 'I-PER', 'score': 0.999617, 'index': 186, 'word': 'S', 'start': 650, 'end': 651}, {'entity': 'I-PER', 'score': 0.96610016, 'index': 187, 'word': '##hat', 'start': 651, 'end': 654}, {'entity': 'I-PER', 'score': 0.9820726, 'index': 188, 'word': '##ski', 'start': 654, 'end': 657}, {'entity': 'B-MISC', 'score': 0.99974626, 'index': 219, 'word': 'Soviet', 'start': 781, 'end': 787}, {'entity': 'B-MISC', 'score': 0.9986237, 'index': 225, 'word': 'Asian', 'start': 815, 'end': 820}, {'entity': 'I-MISC', 'score': 0.9958943, 'index': 226, 'word': 'Cup', 'start': 821, 'end': 824}, {'entity': 'B-MISC', 'score': 0.997851, 'index': 241, 'word': 'Asian', 'start': 882, 'end': 887}, {'entity': 'I-MISC', 'score': 0.9963637, 'index': 
242, 'word': 'Games', 'start': 888, 'end': 893}, {'entity': 'B-LOC', 'score': 0.9998166, 'index': 24
]
}
],
"source": [
"# przewidywania testA\n",
"print(test_A[\"x\"][0])\n",
"\n",
"predicted_labels_A = []\n",
"for tekst in test_A[\"x\"]:\n",
" predicted_labels_A.append(nlp(tekst))\n",
"\n",
"print(predicted_labels_A[0])"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT . </S> Nadim Ladki </S> AL-AIN , United Arab Emirates 1996-12-06 </S> Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday . </S> But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan . </S> China controlled most of the match and saw several chances missed until the 78th minute when Uzbek striker Igor Shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing Chinese keeper and into an empty net . </S> Oleg Shatskiku made sure of the win in injury time , hitting an unstoppable left foot shot from just outside the area . </S> The former Soviet republic was playing in an Asian Cup finals tie for the first time . </S> Despite winning the Asian Games title two years ago , Uzbekistan are in the finals as outsiders . </S> Two goals from defensive errors in the last six minutes allowed Japan to come from behind and collect all three points from their opening meeting against Syria . </S> Takuya Takagi scored the winner in the 88th minute , rising to head a Hiroshige Yanagimoto cross towards the Syrian goal which goalkeeper Salem Bitar appeared to have covered but then allowed to slip into the net . </S> It was the second costly blunder by Syria in four minutes . </S> Defender Hassan Abbas rose to intercept a long ball into the area in the 84th minute but only managed to divert it into the top corner of Bitar 's goal . </S> Nader Jokhadar had given Syria the lead with a well-struck header in the seventh minute . </S> Japan then laid siege to the Syrian penalty area for most of the game but rarely breached the Syrian defence . </S> Bitar pulled off fine saves whenever they did . </S> Japan coach Shu Kamo said : ' ' The Syrian own goal proved lucky for us . 
</S> The Syrians scored early and then played defensively and adopted long balls which made it hard for us . ' </S> ' </S> Japan , co-hosts of the World Cup in 2002 and ranked 20th in the world by FIFA , are favourites to regain their title here . </S> Hosts UAE play Kuwait and South Korea take on Indonesia on Saturday in Group A matches . </S> All four teams are level with one point each from one game . </S>\n",
"[(0, 5), (7, 7), (9, 13), (15, 17), (19, 23), (25, 27), (29, 29), (31, 35), (37, 38), (40, 47), (49, 54), (56, 56), (58, 61), (63, 67), (69, 73), (75, 78), (80, 85), (87, 87), (89, 94), (96, 99), (101, 108), (110, 119), (121, 124), (126, 130), (132, 136), (138, 140), (142, 148), (150, 151), (153, 157), (159, 163), (165, 167), (169, 173), (175, 178), (180, 180), (182, 186), (188, 190), (192, 194), (196, 202), (204, 208), (210, 211), (213, 213), (215, 219), (221, 221), (223, 234), (236, 240), (242, 243), (245, 250), (252, 252), (254, 257), (259, 261), (263, 267), (269, 271), (273, 277), (279, 282), (284, 289), (291, 294), (296, 297), (299, 301), (303, 308), (310, 314), (316, 317), (319, 321), (323, 327), (329, 329), (331, 338), (340, 341), (343, 343), (345, 352), (354, 356), (358, 363), (365, 366), (368, 376), (378, 387), (389, 389), (391, 394), (396, 400), (402, 411), (413, 416), (418, 419), (421, 423), (425, 429), (431, 433), (435, 437), (439, 445), (447, 453), (455, 460), (462, 466), (468, 470), (472, 475), (477, 482), (484, 487), (489, 493), (495, 501), (503, 506), (508, 515), (517, 520), (522, 530), (532, 533), (535, 535), (537, 547), (549, 557), (559, 564), (566, 567), (569, 571), (573, 575), (577, 580), (582, 585), (587, 589), (591, 599), (601, 607), (609, 614), (616, 618), (620, 623), (625, 626), (628, 632), (634, 636), (638, 638), (640, 643), (645, 648), (650, 658), (660, 663), (665, 668), (670, 671), (673, 675), (677, 679), (681, 682), (684, 689), (691, 694), (696, 696), (698, 704), (706, 707), (709, 719), (721, 724), (726, 729), (731, 734), (736, 739), (741, 744), (746, 752), (754, 756), (758, 761), (763, 763), (765, 768), (770, 772), (774, 779), (781, 786), (788, 795), (797, 799), (801, 807), (809, 810), (812, 813), (815, 819), (821, 823), (825, 830), (832, 834), (836, 838), (840, 842), (844, 848), (850, 853), (855, 855), (857, 860), (862, 868), (870, 876), (878, 880), (882, 886), (888, 892), (894, 898), (900, 902), (904, 908), (910, 912), (914, 914), 
(916, 925), (927, 929), (931, 932), (934, 936), (938, 943), (945, 946), (948, 956), (958, 958), (960, 963), (965, 967), (969, 973), (975, 978), (980, 988), (990, 995), (997, 998), (1000, 1002), (1004, 1007), (1009, 1011), (1013, 1019), (1021, 1027), (1029, 1033), (1035, 1036), (1038, 1041), (1043, 1046), (1048, 1053), (1055, 1057), (1059, 1065), (1067, 1069), (1071, 1075), (1077, 1082), (1084, 1087), (1089, 1093), (1095, 1101), (1103, 1109), (1111, 1117), (1119, 1123), (1125, 1125), (1127, 1130), (1132, 1137), (1139, 1144), (1146, 1151), (1153, 1155), (1157, 1162), (1164, 1165), (1167, 1169), (1171, 1174), (1176, 1181), (1183, 1183), (1185, 1190), (1192, 1193), (1195, 1198), (1200, 1200), (1202, 1210), (1212, 1221), (1223, 1227), (1229, 1235), (1237, 1239), (1241, 1246), (1248, 1251), (1253, 1257), (1259, 1268), (1270, 1274), (1276, 1280), (1282, 1289), (1291, 1292), (1294, 1297), (1299, 1305), (1307, 1309), (1311, 1314), (1316, 1322), (1324, 1325), (1327, 1330), (1332, 1335), (1337, 1339), (1341, 1343), (1345, 1345), (1347, 1350), (1352, 1353), (1355, 1357), (1359, 1361), (1363, 1368), (1370, 1375), (1377, 1383), (1385, 1386), (1388, 1392), (1394, 1395), (1397, 1400), (1402, 1408), (1410, 1410), (1412, 1415), (1417, 1424), (1426, 1431), (1433, 1437), (1439, 1442), (1444, 1445), (1447, 1455), (1457, 1457), (1459, 1462), (1464, 1467), (1469, 1472), (1474, 1476), (1478, 1481), (1483, 1484), (1486, 1488), (1490, 1493), (1495, 1500), (1502, 1504), (1506, 1509), (1511, 1517), (1519, 1520), (1522, 1527), (1529, 1530), (1532, 1535), (1537, 1539), (1541, 1543), (1545, 1550), (1552, 1553), (1555, 1559), (1561, 1562), (1564, 1567), (1569, 1569), (1571, 1574), (1576, 1580), (1582, 1589), (1591, 1593), (1595, 1599), (1601, 1605), (1607, 1609), (1611, 1614), (1616, 1619), (1621, 1621), (1623, 1633), (1635, 1640), (1642, 1643), (1645, 1647), (1649, 1655), (1657, 1662), (1664, 1664), (1666, 1669), (1671, 1675), (1677, 1680), (1682, 1685), (1687, 1691), (1693, 1694), (1696, 1698), 
(1700, 1705), (1707, 1713), (1715, 1718), (1720, 1722), (1724, 1727), (1729, 1730), (1732, 1
"441\n",
"441\n"
]
}
],
"source": [
"# zakladamy ze teksty nie zaczynaj ani nie koncza sie na spacji, nie ma wielokrotnych spacji\n",
"zakresy_slow_tekstow = []\n",
"for tekst in test_A[\"x\"]:\n",
" zakresy_slow = []\n",
" poczatek = 0\n",
" koniec = len(tekst)-1\n",
" for numer_zakresu in range(len(tekst)):\n",
" znak = tekst[numer_zakresu]\n",
" if znak == \" \":\n",
" koniec = numer_zakresu-1\n",
" zakres = (poczatek, koniec)\n",
" zakresy_slow.append(zakres)\n",
" poczatek = numer_zakresu+1\n",
" koniec = len(tekst)-1\n",
" zakres = (poczatek, koniec)\n",
" zakresy_slow.append(zakres)\n",
" zakresy_slow_tekstow.append(zakresy_slow)\n",
"\n",
"print(test_A[\"x\"][0])\n",
"print(zakresy_slow_tekstow[0])\n",
"\n",
"print(len(test_A[\"x\"][0].split(\" \")))\n",
"print(len(zakresy_slow_tekstow[0]))"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"poprawione_predicted_labels_A = []\n",
"for linijka in range(len(test_A[\"x\"])):\n",
" zakresy_slow = zakresy_slow_tekstow[linijka]\n",
" przewidywania = predicted_labels_A[linijka]\n",
" etykiety = [\"O\" for _ in zakresy_slow]\n",
" for numer_zakresu in range(len(zakresy_slow)):\n",
" zakres = zakresy_slow[numer_zakresu]\n",
" for p in przewidywania:\n",
" start = p[\"start\"]\n",
" if start>=zakres[0] and start<=zakres[1]:\n",
" tag = p[\"entity\"]\n",
" if etykiety[numer_zakresu] == \"O\":\n",
" etykiety[numer_zakresu] = tag\n",
" poprawione_predicted_labels_A.append(\" \".join(etykiety).strip())"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT . </S> Nadim Ladki </S> AL-AIN , United Arab Emirates 1996-12-06 </S> Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday . </S> But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan . </S> China controlled most of the match and saw several chances missed until the 78th minute when Uzbek striker Igor Shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing Chinese keeper and into an empty net . </S> Oleg Shatskiku made sure of the win in injury time , hitting an unstoppable left foot shot from just outside the area . </S> The former Soviet republic was playing in an Asian Cup finals tie for the first time . </S> Despite winning the Asian Games title two years ago , Uzbekistan are in the finals as outsiders . </S> Two goals from defensive errors in the last six minutes allowed Japan to come from behind and collect all three points from their opening meeting against Syria . </S> Takuya Takagi scored the winner in the 88th minute , rising to head a Hiroshige Yanagimoto cross towards the Syrian goal which goalkeeper Salem Bitar appeared to have covered but then allowed to slip into the net . </S> It was the second costly blunder by Syria in four minutes . </S> Defender Hassan Abbas rose to intercept a long ball into the area in the 84th minute but only managed to divert it into the top corner of Bitar 's goal . </S> Nader Jokhadar had given Syria the lead with a well-struck header in the seventh minute . </S> Japan then laid siege to the Syrian penalty area for most of the game but rarely breached the Syrian defence . </S> Bitar pulled off fine saves whenever they did . </S> Japan coach Shu Kamo said : ' ' The Syrian own goal proved lucky for us . 
</S> The Syrians scored early and then played defensively and adopted long balls which made it hard for us . ' </S> ' </S> Japan , co-hosts of the World Cup in 2002 and ranked 20th in the world by FIFA , are favourites to regain their title here . </S> Hosts UAE play Kuwait and South Korea take on Indonesia on Saturday in Group A matches . </S> All four teams are level with one point each from one game . </S>\n",
"O O B-LOC O B-PER O O B-LOC O O O O O B-PER I-PER O B-LOC O B-LOC I-LOC I-LOC O O B-LOC O O O O O B-MISC I-MISC O O O O O O O B-LOC O O O I-MISC O O O O O O O B-LOC O O O O O O O O O O O O O O O O O O O O O B-LOC O O B-LOC O O O O O O O O O O O O O O O B-MISC O B-PER I-PER O O O O O O O O O O O O O O B-MISC O O O O O O O O B-PER I-PER O O O O O O O O O O O O O O O O O O O O O O O O B-MISC O O O O O B-MISC I-MISC O O O O O O O O O O O B-MISC I-MISC O O O O O B-LOC O O O O O O O O O O O O O O O O O O O B-LOC O O O O O O O O O O O O O O B-LOC O O B-PER I-PER O O O O O O O O O O O O B-PER I-PER O O O B-MISC O O O B-PER I-PER O O O O O O O O O O O O O O O O O O O O O B-LOC O O O O O O B-PER I-PER O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-LOC O O O O O O O O O O O O B-LOC O O O O O B-MISC O O O O O O O O O O O B-MISC O O O O O O O O O O O O O B-LOC O B-PER O O O O O O B-MISC O O O O O O O O O B-MISC O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O\n",
"441\n",
"441\n"
]
}
],
"source": [
"print(test_A[\"x\"][0])\n",
"print(poprawione_predicted_labels_A[0])\n",
"\n",
"print(len(test_A[\"x\"][0].split(\" \")))\n",
"print(len(poprawione_predicted_labels_A[0].split(\" \")))"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"with open(\"test-A/out.tsv\", \"w\", encoding=\"utf-8\") as uwu:\n",
" for linijka in poprawione_predicted_labels_A:\n",
" uwu.write(linijka)\n",
" uwu.write(str(\"\\n\"))"
]
}
],
"metadata": {
"author": "Jakub Pokrywka",
"email": "kubapok@wmi.amu.edu.pl",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"lang": "pl",
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
},
"subtitle": "11.NER RNN[ćwiczenia]",
"title": "Ekstrakcja informacji",
"year": "2021"
},
"nbformat": 4,
"nbformat_minor": 4
}