en-ner-conll-2003/3_RNN — kopia.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Uczenie głębokie przetwarzanie tekstu laboratoria\n",
"# 3. RNN"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]\n",
" [--paths] [--json] [--debug]\n",
" [subcommand]\n",
"\n",
"Jupyter: Interactive Computing\n",
"\n",
"positional arguments:\n",
" subcommand the subcommand to launch\n",
"\n",
"options:\n",
" -h, --help show this help message and exit\n",
" --version show the versions of core jupyter packages and exit\n",
" --config-dir show Jupyter config dir\n",
" --data-dir show Jupyter data dir\n",
" --runtime-dir show Jupyter runtime dir\n",
" --paths show all Jupyter paths. Add --json for machine-readable\n",
" format.\n",
" --json output paths as machine-readable json\n",
" --debug output debug information about paths\n",
"\n",
"Available subcommands: console dejavu events execute kernel kernelspec lab\n",
"labextension labhub migrate nbconvert notebook qtconsole run server\n",
"troubleshoot trust\n",
"\n",
"Jupyter command `jupyter-nbextension` not found.\n"
]
}
],
"source": [
"!jupyter nbextension enable --py widgetsnbextension\n",
"\n",
"from collections import Counter\n",
"import torch\n",
"from torchtext.vocab import vocab\n",
"from tqdm import tqdm\n",
"from ipywidgets import FloatProgress\n",
"\n",
"import pandas as pd\n",
"from nltk.tokenize import word_tokenize\n",
"from unidecode import unidecode"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wczytano dane treningowe\n",
2024-05-25 17:47:40 +02:00
"B-ORG O B-MISC O O O B-MISC O O O B-PER I-PER O B-LOC O O O B-ORG I-ORG O O O O O O B-MISC O O O O O B-MISC O O O O O O O O O O O O O O O B-LOC O O O O B-ORG I-ORG O O O B-PER I-PER O O O O O O O O O O O B-LOC O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O O O B-PER I-PER I-PER I-PER O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG I-ORG O O O O O O O O O B-ORG O O B-PER I-PER O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-PER O B-MISC O O O O B-LOC O B-LOC O O O O O O O B-MISC I-MISC I-MISC O B-MISC O O O O O O O O B-PER O O O O O O O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-MISC O O B-PER I-PER I-PER O O O B-PER O O B-ORG O O O O O O O O O O O O O O O O O O B-LOC O B-LOC O B-PER O O O O O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-MISC O O O O O O O O O O O O O O O O B-MISC O O O O O O O O O O O O O O O O O O O B-MISC O O O O O O B-MISC O O O O O B-LOC O O O O O O O O O O O O O O O O O O O B-LOC O O O O B-ORG I-ORG I-ORG I-ORG I-ORG O B-ORG O O B-PER I-PER I-PER O O B-ORG I-ORG O O B-LOC O O O O O O O O O O O O O O O B-MISC O O O O O O O O O O O O O O O O O O B-LOC O O O O B-LOC O O O O O O O O O O O O O O O O B-MISC O O O O O O O O O O EU rejects German call to boycott British lamb . </S> Peter Blackburn </S> BRUSSELS 1996-08-22 </S> The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep . </S> Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer . </S> \" We do n't support any such recommendation because we do n't see any grounds for it , \" the Commission 's chief spokesman Nikolaus van der Pas told a news briefing . </S> He said further scientific study was required and if it was found that action was needed it should be taken by the European Union . </S> He said a proposal last month by EU Farm Commissioner Franz Fischler to ban sheep brains , spleens and spinal cords from the human and animal food chains was a highly specific and precautionary move to protect human health . </S> Fischler proposed EU-wide measures after reports from Britain and France that under laboratory conditions sheep could contract Bovine Spongiform Encephalopathy ( BSE ) -- mad cow disease . </S> But Fischler agreed to review his proposal after the EU 's standing veterinary committee , mational animal health officials , questioned if such action was justified as there was only a slight risk to human health . </S> Spanish Farm Minister Loyola de Palacio had earlier accused Fischler at an EU farm ministers ' meeting of causing unjustified alarm through \" dangerous generalisation . \" </S> . </S> Only France and Britain backed Fischler 's proposal . </S> The EU 's scientific veterinary and multidisciplinary committees are due to re-examine the issue early next month and make recommendations to the senior veterinary officials . </S> Sheep have long been known to contract scrapie , a brain-wasting disease similar to BSE which is believed to have been transferred to cattle through feed containing animal waste . 
</S> British farmers denied on Thursday there was any danger to human health from their sheep , but expressed concern that German government advice to consumers to avoid British lamb might influence consumers across Europe . </S> \" What we have to be extremely careful of is how other countries are going to take Germany 's lead , \" Welsh National Farmers ' Union ( NFU ) chairman John Lloyd Jones said on BBC radio . </S> Bonn has led efforts to protect public health after consumer confidence collapsed in March after a British report suggested humans could contract an illness similar to mad cow disease by eating contaminated beef . </S> Germany imported 47,600 sheep from Britain last year , nearly half of total imports . </S> It brought in
"489\n",
"489\n",
"training data split into words\n",
"['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '</s>', 'peter', 'blackburn', '</s>', 'brussels', '1996-08-22', '</s>', 'the', 'european', 'commission', 'said', 'on', 'thursday', 'it', 'disagreed', 'with', 'german', 'advice', 'to', 'consumers', 'to', 'shun', 'british', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.', '</s>', 'germany', \"'s\", 'representative', 'to', 'the', 'european', 'union', \"'s\", 'veterinary', 'committee', 'werner', 'zwingmann', 'said', 'on', 'wednesday', 'consumers', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'britain', 'until', 'the', 'scientific', 'advice', 'was', 'clearer', '.', '</s>', '\"', 'we', 'do', \"n't\", 'support', 'any', 'such', 'recommendation', 'because', 'we', 'do', \"n't\", 'see', 'any', 'grounds', 'for', 'it', ',', '\"', 'the', 'commission', \"'s\", 'chief', 'spokesman', 'nikolaus', 'van', 'der', 'pas', 'told', 'a', 'news', 'briefing', '.', '</s>', 'he', 'said', 'further', 'scientific', 'study', 'was', 'required', 'and', 'if', 'it', 'was', 'found', 'that', 'action', 'was', 'needed', 'it', 'should', 'be', 'taken', 'by', 'the', 'european', 'union', '.', '</s>', 'he', 'said', 'a', 'proposal', 'last', 'month', 'by', 'eu', 'farm', 'commissioner', 'franz', 'fischler', 'to', 'ban', 'sheep', 'brains', ',', 'spleens', 'and', 'spinal', 'cords', 'from', 'the', 'human', 'and', 'animal', 'food', 'chains', 'was', 'a', 'highly', 'specific', 'and', 'precautionary', 'move', 'to', 'protect', 'human', 'health', '.', '</s>', 'fischler', 'proposed', 'eu-wide', 'measures', 'after', 'reports', 'from', 'britain', 'and', 'france', 'that', 'under', 'laboratory', 'conditions', 'sheep', 'could', 'contract', 'bovine', 'spongiform', 'encephalopathy', '(', 'bse', ')', '--', 'mad', 'cow', 'disease', '.', '</s>', 'but', 'fischler', 'agreed', 'to', 'review', 'his', 'proposal', 'after', 'the', 'eu', \"'s\", 'standing', 'veterinary', 'committee', ',', 'mational', 'animal', 'health', 'officials', ',', 'questioned', 'if', 'such', 'action', 'was', 'justified', 'as', 'there', 'was', 'only', 'a', 'slight', 'risk', 'to', 'human', 'health', '.', '</s>', 'spanish', 'farm', 'minister', 'loyola', 'de', 'palacio', 'had', 'earlier', 'accused', 'fischler', 'at', 'an', 'eu', 'farm', 'ministers', \"'\", 'meeting', 'of', 'causing', 'unjustified', 'alarm', 'through', '\"', 'dangerous', 'generalisation', '.', '\"', '</s>', '.', '</s>', 'only', 'france', 'and', 'britain', 'backed', 'fischler', \"'s\", 'proposal', '.', '</s>', 'the', 'eu', \"'s\", 'scientific', 'veterinary', 'and', 'multidisciplinary', 'committees', 'are', 'due', 'to', 're-examine', 'the', 'issue', 'early', 'next', 'month', 'and', 'make', 'recommendations', 'to', 'the', 'senior', 'veterinary', 'officials', '.', '</s>', 'sheep', 'have', 'long', 'been', 'known', 'to', 'contract', 'scrapie', ',', 'a', 'brain-wasting', 'disease', 'similar', 'to', 'bse', 'which', 'is', 'believed', 'to', 'have', 'been', 'transferred', 'to', 'cattle', 'through', 'feed', 'containing', 'animal', 'waste', '.', '</s>', 'british', 'farmers', 'denied', 'on', 'thursday', 'there', 'was', 'any', 'danger', 'to', 'human', 'health', 'from', 'their', 'sheep', ',', 'but', 'expressed', 'concern', 'that', 'german', 'government', 'advice', 'to', 'consumers', 'to', 'avoid', 'british', 'lamb', 'might', 'influence', 'consumers', 'across', 'europe', '.', '</s>', '\"', 'what', 'we', 'have', 'to', 'be', 'extremely', 'careful', 'of', 'is', 'how', 
'other', 'countries', 'are', 'going', 'to', 'take', 'germany', \"'s\", 'lead', ',', '\"', 'welsh', 'national', 'farmers', \"'\", 'union', '(', 'nfu', ')', 'chairman', 'john', 'lloyd', 'jones', 'said', 'on', 'bbc', 'radio', '.', '</s>', 'bonn', 'has', 'led', 'efforts', 'to', 'protect', 'public', 'health', 'after', 'consumer', 'confidence', 'collapsed', 'in', 'march', 'after', 'a', 'british', 'report', 'suggested', 'humans', 'could', 'contract', 'an', 'illness', 'similar', 'to', 'mad', 'cow', 'disease', 'by', 'eating', 'contaminated', '
"489\n"
]
}
],
"source": [
"# odczytaj dane treningowe\n",
2024-05-25 17:47:40 +02:00
"train = pd.read_csv('train/train.tsv', sep='\\t', names=['y', 'x'], header=None)\n",
"\n",
2024-05-23 21:25:06 +02:00
"print(\"wczytano dane treningowe\")\n",
"print(train[\"y\"][0], train[\"x\"][0])\n",
"\n",
2024-05-25 17:47:40 +02:00
"print(len(train[\"y\"][0].split(\" \")))\n",
"print(len(train[\"x\"][0].split(\" \")))\n",
"\n",
2024-05-23 21:25:06 +02:00
"# podziel dane treningowe na słowa\n",
"# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/\n",
"slowa_train = []\n",
"for tekst in train[\"x\"]:\n",
" pom = []\n",
" for slowo in tekst.split(\" \"):\n",
" #if slowo not in (\"<\",\"/s\",\">\",\"/S\",\"``\"):\n",
" pom.append(slowo.lower())\n",
" slowa_train.append(pom)\n",
"print(\"podzielono dane treningowe na słowa\")\n",
2024-05-25 17:47:40 +02:00
"\n",
"print(slowa_train[0])\n",
"print(len(slowa_train[0]))"
2024-05-23 21:25:06 +02:00
]
},
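{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (an added sketch, not part of the original lab code): the tokenization loop above is equivalent to lowercasing each document and splitting on spaces, so the assert below should pass."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# equivalent one-liner; produces the same token lists as the loop above\n",
"assert slowa_train == [t.lower().split(' ') for t in train['x']]"
]
},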
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wczytano dane testowe dev-0\n",
2024-05-25 17:47:40 +02:00
"CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY . </S> LONDON 1996-08-30 </S> West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship . </S> Their stay on top , though , may be short-lived as title rivals Essex , Derbyshire and Surrey all closed in on victory while Kent made up for lost time in their rain-affected match against Nottinghamshire . </S> After bowling Somerset out for 83 on the opening morning at Grace Road , Leicestershire extended their first innings by 94 runs before being bowled out for 296 with England discard Andy Caddick taking three for 83 . </S> Trailing by 213 , Somerset got a solid start to their second innings before Simmons stepped in to bundle them out for 174 . </S> Essex , however , look certain to regain their top spot after Nasser Hussain and Peter Such gave them a firm grip on their match against Yorkshire at Headingley . </S> Hussain , considered surplus to England 's one-day requirements , struck 158 , his first championship century of the season , as Essex reached 372 and took a first innings lead of 82 . </S> By the close Yorkshire had turned that into a 37-run advantage but off-spinner Such had scuttled their hopes , taking four for 24 in 48 balls and leaving them hanging on 119 for five and praying for rain . </S> At the Oval , Surrey captain Chris Lewis , another man dumped by England , continued to silence his critics as he followed his four for 45 on Thursday with 80 not out on Friday in the match against Warwickshire . </S> He was well backed by England hopeful Mark Butcher who made 70 as Surrey closed on 429 for seven , a lead of 234 . </S> Derbyshire kept up the hunt for their first championship title since 1936 by reducing Worcestershire to 133 for five in their second innings , still 100 runs away from avoiding an innings defeat . </S> Australian Tom Moody took six for 82 but Chris Adams , 123 , and Tim O'Gorman , 109 , took Derbyshire to 471 and a first innings lead of 233 . </S> After the frustration of seeing the opening day of their match badly affected by the weather , Kent stepped up a gear to dismiss Nottinghamshire for 214 . </S> They were held up by a gritty 84 from Paul Johnson but ex-England fast bowler Martin McCague took four for 55 . </S> By stumps Kent had reached 108 for three . </S>\n",
"dev-0 test data split into words\n",
"['cricket', '-', 'leicestershire', 'take', 'over', 'at', 'top', 'after', 'innings', 'victory', '.', '</s>', 'london', '1996-08-30', '</s>', 'west', 'indian', 'all-rounder', 'phil', 'simmons', 'took', 'four', 'for', '38', 'on', 'friday', 'as', 'leicestershire', 'beat', 'somerset', 'by', 'an', 'innings', 'and', '39', 'runs', 'in', 'two', 'days', 'to', 'take', 'over', 'at', 'the', 'head', 'of', 'the', 'county', 'championship', '.', '</s>', 'their', 'stay', 'on', 'top', ',', 'though', ',', 'may', 'be', 'short-lived', 'as', 'title', 'rivals', 'essex', ',', 'derbyshire', 'and', 'surrey', 'all', 'closed', 'in', 'on', 'victory', 'while', 'kent', 'made', 'up', 'for', 'lost', 'time', 'in', 'their', 'rain-affected', 'match', 'against', 'nottinghamshire', '.', '</s>', 'after', 'bowling', 'somerset', 'out', 'for', '83', 'on', 'the', 'opening', 'morning', 'at', 'grace', 'road', ',', 'leicestershire', 'extended', 'their', 'first', 'innings', 'by', '94', 'runs', 'before', 'being', 'bowled', 'out', 'for', '296', 'with', 'england', 'discard', 'andy', 'caddick', 'taking', 'three', 'for', '83', '.', '</s>', 'trailing', 'by', '213', ',', 'somerset', 'got', 'a', 'solid', 'start', 'to', 'their', 'second', 'innings', 'before', 'simmons', 'stepped', 'in', 'to', 'bundle', 'them', 'out', 'for', '174', '.', '</s>', 'essex', ',', 'however', ',', 'look', 'certain', 'to', 'regain', 'their', 'top', 'spot', 'after', 'nasser', 'hussain', 'and', 'peter', 'such', 'gave', 'them', 'a', 'firm', 'grip', 'on', 'their', 'match', 'against', 'yorkshire', 'at', 'headingley', '.', '</s>', 'hussain', ',', 'considered', 'surplus', 'to', 'england', \"'s\", 'one-day', 'requirements', ',', 'struck', '158', ',', 'his', 'first', 'championship', 'century', 'of', 'the', 'season', ',', 'as', 'essex', 'reached', '372', 'and', 'took', 'a', 'first', 'innings', 'lead', 'of', '82', '.', '</s>', 'by', 'the', 'close', 'yorkshire', 'had', 'turned', 'that', 'into', 'a', '37-run', 'advantage', 'but', 'off-spinner', 'such', 'had', 'scuttled', 'their', 'hopes', ',', 'taking', 'four', 'for', '24', 'in', '48', 'balls', 'and', 'leaving', 'them', 'hanging', 'on', '119', 'for', 'five', 'and', 'praying', 'for', 'rain', '.', '</s>', 'at', 'the', 'oval', ',', 'surrey', 'captain', 'chris', 'lewis', ',', 'another', 'man', 'dumped', 'by', 'england', ',', 'continued', 'to', 'silence', 'his', 'critics', 'as', 'he', 'followed', 'his', 'four', 'for', '45', 'on', 'thursday', 'with', '80', 'not', 'out', 'on', 'friday', 'in', 'the', 'match', 'against', 'warwickshire', '.', '</s>', 'he', 'was', 'well', 'backed', 'by', 'england', 'hopeful', 'mark', 'butcher', 'who', 'made', '70', 'as', 'surrey', 'closed', 'on', '429', 'for', 'seven', ',', 'a', 'lead', 'of', '234', '.', '</s>', 'derbyshire', 'kept', 'up', 'the', 'hunt', 'for', 'their', 'first', 'championship', 'title', 'since', '1936', 'by', 'reducing', 'worcestershire', 'to', '133', 'for', 'five', 'in', 'their', 'second', 'innings', ',', 'still', '100', 'runs', 'away', 'from', 'avoiding', 'an', 'innings', 'defeat', '.', '</s>', 'australian', 'tom', 'moody', 'took', 'six', 'for', '82', 'but', 'chris', 'adams', ',', '123', ',', 'and', 'tim', \"o'gorman\", ',', '109', ',', 'took', 'derbyshire', 'to', '471', 'and', 'a', 'first', 'innings', 'lead', 'of', '233', '.', '</s>', 'after', 'the', 'frustration', 'of', 'seeing', 'the', 'opening', 'day', 'of', 'their', 'match', 'badly', 'affected', 'by', 'the', 'weather', ',', 'kent', 'stepped', 'up', 'a', 'gear', 'to', 'dismiss', 'nottinghamshire', 'for', '214', '.', '</s>', 'they', 
'were', 'held', 'up', 'by', 'a', 'gritty', '84', 'from', 'paul', 'johnson', 'but', 'ex-england', 'fast', 'bowler', 'martin', 'mccague', 'took', 'four', 'for', '55', '.', '</s>', 'by', 'stumps', 'kent', 'had', 'reached', '108', 'for', 'three', '.', '</s>']\n",
"456\n"
]
}
],
"source": [
"# odczytaj dane testowe dev-0\n",
2024-05-25 17:47:40 +02:00
"test_dev0 = pd.read_csv('dev-0/in.tsv', sep='\\t', names=['x'], header=None)\n",
"\n",
2024-05-23 21:25:06 +02:00
"print(\"wczytano dane testowe dev-0\")\n",
"print(test_dev0[\"x\"][0])\n",
"\n",
"# podziel dane testowe na słowa\n",
"# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/\n",
"slowa_test_dev0 = []\n",
"for tekst in test_dev0[\"x\"]:\n",
" pom = []\n",
" for slowo in tekst.split(\" \"):\n",
" #if slowo not in (\"<\",\"/s\",\">\",\"/S\",\"``\"):\n",
" pom.append(slowo.lower())\n",
" slowa_test_dev0.append(pom)\n",
"print(\"podzielono dane treningowe na słowa\")\n",
2024-05-25 17:47:40 +02:00
"\n",
"print(slowa_test_dev0[0])\n",
"print(len(slowa_test_dev0[0]))"
2024-05-23 21:25:06 +02:00
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"wczytano dane testowe A\n",
2024-05-25 17:47:40 +02:00
"SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT . </S> Nadim Ladki </S> AL-AIN , United Arab Emirates 1996-12-06 </S> Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday . </S> But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan . </S> China controlled most of the match and saw several chances missed until the 78th minute when Uzbek striker Igor Shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing Chinese keeper and into an empty net . </S> Oleg Shatskiku made sure of the win in injury time , hitting an unstoppable left foot shot from just outside the area . </S> The former Soviet republic was playing in an Asian Cup finals tie for the first time . </S> Despite winning the Asian Games title two years ago , Uzbekistan are in the finals as outsiders . </S> Two goals from defensive errors in the last six minutes allowed Japan to come from behind and collect all three points from their opening meeting against Syria . </S> Takuya Takagi scored the winner in the 88th minute , rising to head a Hiroshige Yanagimoto cross towards the Syrian goal which goalkeeper Salem Bitar appeared to have covered but then allowed to slip into the net . </S> It was the second costly blunder by Syria in four minutes . </S> Defender Hassan Abbas rose to intercept a long ball into the area in the 84th minute but only managed to divert it into the top corner of Bitar 's goal . </S> Nader Jokhadar had given Syria the lead with a well-struck header in the seventh minute . </S> Japan then laid siege to the Syrian penalty area for most of the game but rarely breached the Syrian defence . </S> Bitar pulled off fine saves whenever they did . </S> Japan coach Shu Kamo said : ' ' The Syrian own goal proved lucky for us . </S> The Syrians scored early and then played defensively and adopted long balls which made it hard for us . ' </S> ' </S> Japan , co-hosts of the World Cup in 2002 and ranked 20th in the world by FIFA , are favourites to regain their title here . </S> Hosts UAE play Kuwait and South Korea take on Indonesia on Saturday in Group A matches . </S> All four teams are level with one point each from one game . </S>\n",
"test-A data split into words\n",
"['soccer', '-', 'japan', 'get', 'lucky', 'win', ',', 'china', 'in', 'surprise', 'defeat', '.', '</s>', 'nadim', 'ladki', '</s>', 'al-ain', ',', 'united', 'arab', 'emirates', '1996-12-06', '</s>', 'japan', 'began', 'the', 'defence', 'of', 'their', 'asian', 'cup', 'title', 'with', 'a', 'lucky', '2-1', 'win', 'against', 'syria', 'in', 'a', 'group', 'c', 'championship', 'match', 'on', 'friday', '.', '</s>', 'but', 'china', 'saw', 'their', 'luck', 'desert', 'them', 'in', 'the', 'second', 'match', 'of', 'the', 'group', ',', 'crashing', 'to', 'a', 'surprise', '2-0', 'defeat', 'to', 'newcomers', 'uzbekistan', '.', '</s>', 'china', 'controlled', 'most', 'of', 'the', 'match', 'and', 'saw', 'several', 'chances', 'missed', 'until', 'the', '78th', 'minute', 'when', 'uzbek', 'striker', 'igor', 'shkvyrin', 'took', 'advantage', 'of', 'a', 'misdirected', 'defensive', 'header', 'to', 'lob', 'the', 'ball', 'over', 'the', 'advancing', 'chinese', 'keeper', 'and', 'into', 'an', 'empty', 'net', '.', '</s>', 'oleg', 'shatskiku', 'made', 'sure', 'of', 'the', 'win', 'in', 'injury', 'time', ',', 'hitting', 'an', 'unstoppable', 'left', 'foot', 'shot', 'from', 'just', 'outside', 'the', 'area', '.', '</s>', 'the', 'former', 'soviet', 'republic', 'was', 'playing', 'in', 'an', 'asian', 'cup', 'finals', 'tie', 'for', 'the', 'first', 'time', '.', '</s>', 'despite', 'winning', 'the', 'asian', 'games', 'title', 'two', 'years', 'ago', ',', 'uzbekistan', 'are', 'in', 'the', 'finals', 'as', 'outsiders', '.', '</s>', 'two', 'goals', 'from', 'defensive', 'errors', 'in', 'the', 'last', 'six', 'minutes', 'allowed', 'japan', 'to', 'come', 'from', 'behind', 'and', 'collect', 'all', 'three', 'points', 'from', 'their', 'opening', 'meeting', 'against', 'syria', '.', '</s>', 'takuya', 'takagi', 'scored', 'the', 'winner', 'in', 'the', '88th', 'minute', ',', 'rising', 'to', 'head', 'a', 'hiroshige', 'yanagimoto', 'cross', 'towards', 'the', 'syrian', 'goal', 'which', 'goalkeeper', 'salem', 'bitar', 'appeared', 'to', 'have', 'covered', 'but', 'then', 'allowed', 'to', 'slip', 'into', 'the', 'net', '.', '</s>', 'it', 'was', 'the', 'second', 'costly', 'blunder', 'by', 'syria', 'in', 'four', 'minutes', '.', '</s>', 'defender', 'hassan', 'abbas', 'rose', 'to', 'intercept', 'a', 'long', 'ball', 'into', 'the', 'area', 'in', 'the', '84th', 'minute', 'but', 'only', 'managed', 'to', 'divert', 'it', 'into', 'the', 'top', 'corner', 'of', 'bitar', \"'s\", 'goal', '.', '</s>', 'nader', 'jokhadar', 'had', 'given', 'syria', 'the', 'lead', 'with', 'a', 'well-struck', 'header', 'in', 'the', 'seventh', 'minute', '.', '</s>', 'japan', 'then', 'laid', 'siege', 'to', 'the', 'syrian', 'penalty', 'area', 'for', 'most', 'of', 'the', 'game', 'but', 'rarely', 'breached', 'the', 'syrian', 'defence', '.', '</s>', 'bitar', 'pulled', 'off', 'fine', 'saves', 'whenever', 'they', 'did', '.', '</s>', 'japan', 'coach', 'shu', 'kamo', 'said', ':', \"'\", \"'\", 'the', 'syrian', 'own', 'goal', 'proved', 'lucky', 'for', 'us', '.', '</s>', 'the', 'syrians', 'scored', 'early', 'and', 'then', 'played', 'defensively', 'and', 'adopted', 'long', 'balls', 'which', 'made', 'it', 'hard', 'for', 'us', '.', \"'\", '</s>', \"'\", '</s>', 'japan', ',', 'co-hosts', 'of', 'the', 'world', 'cup', 'in', '2002', 'and', 'ranked', '20th', 'in', 'the', 'world', 'by', 'fifa', ',', 'are', 'favourites', 'to', 'regain', 'their', 'title', 'here', '.', '</s>', 'hosts', 'uae', 'play', 'kuwait', 'and', 'south', 'korea', 'take', 'on', 'indonesia', 'on', 'saturday', 'in', 'group', 'a', 'matches', '.', '</s>', 
'all', 'four', 'teams', 'are', 'level', 'with', 'one', 'point', 'each', 'from', 'one', 'game', '.', '</s>']\n"
]
}
],
"source": [
"# odczytaj dane testowe A\n",
2024-05-25 17:47:40 +02:00
"test_A = pd.read_csv('test-A/in.tsv', sep='\\t', names=['x'], header=None)\n",
"\n",
2024-05-23 21:25:06 +02:00
"print(\"wczytano dane testowe A\")\n",
"print(test_A[\"x\"][0])\n",
"\n",
"# podziel dane testowe na słowa\n",
"# https://www.geeksforgeeks.org/python-word-embedding-using-word2vec/\n",
"slowa_test_A = []\n",
"for tekst in test_A[\"x\"]:\n",
" pom = []\n",
" for slowo in tekst.split(\" \"):\n",
" #if slowo not in (\"<\",\"/s\",\">\",\"/S\",\"``\"):\n",
" pom.append(slowo.lower())\n",
" slowa_test_A.append(pom)\n",
"print(\"podzielono dane treningowe na słowa\")\n",
2024-05-25 17:47:40 +02:00
"\n",
2024-05-23 21:25:06 +02:00
"print(slowa_test_A[0])"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"def build_vocab(dataset):\n",
" counter = Counter()\n",
" for document in dataset:\n",
" counter.update(document)\n",
" return vocab(counter, specials=[\"<unk>\", \"<pad>\", \"<bos>\", \"<eos>\"])"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"21014\n",
"['<unk>', '<pad>', '<bos>', '<eos>', 'eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.', '</s>', 'peter', 'blackburn', 'brussels', '1996-08-22', 'the', 'european', 'commission', 'said', 'on', 'thursday', 'it', 'disagreed', 'with', 'advice', 'consumers', 'shun', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'sheep', 'germany', \"'s\", 'representative', 'union', 'veterinary', 'committee', 'werner', 'zwingmann', 'wednesday', 'should', 'buy', 'sheepmeat', 'from', 'countries', 'other', 'than', 'britain', 'scientific', 'was', 'clearer', '\"', 'we', 'do', \"n't\", 'support', 'any', 'such', 'recommendation', 'because', 'see', 'grounds', 'for', ',', 'chief', 'spokesman', 'nikolaus', 'van', 'der', 'pas', 'told', 'a', 'news', 'briefing', 'he', 'further', 'study', 'required', 'and', 'if', 'found', 'that', 'action', 'needed', 'taken', 'by', 'proposal', 'last', 'month', 'farm', 'commissioner', 'franz', 'fischler', 'ban', 'brains', 'spleens', 'spinal', 'cords', 'human', 'animal', 'food', 'chains', 'highly', 'specific', 'precautionary', 'move', 'protect', 'health', 'proposed', 'eu-wide', 'measures', 'after', 'reports', 'france', 'under', 'laboratory', 'conditions', 'could', 'contract', 'bovine', 'spongiform', 'encephalopathy', '(', 'bse', ')', '--', 'but', 'agreed', 'review', 'his', 'standing', 'mational', 'officials', 'questioned', 'justified', 'as', 'there', 'only', 'slight', 'risk', 'spanish', 'minister', 'loyola', 'de', 'palacio', 'had', 'earlier', 'accused', 'at', 'an', 'ministers', \"'\", 'meeting', 'of', 'causing', 'unjustified', 'alarm', 'through', 'dangerous', 'generalisation', 'backed', 'multidisciplinary', 'committees', 'are', 'due', 're-examine', 'issue', 'early', 'next', 'make', 'recommendations', 'senior', 'have', 'long', 'been', 'known', 'scrapie', 'brain-wasting', 'similar', 'which', 'is', 'believed', 'transferred', 'cattle', 'feed', 'containing', 'waste', 'farmers', 'denied', 'danger', 'their', 'expressed', 'concern', 'government', 'avoid', 'might', 'influence', 'across', 'europe', 'what', 'extremely', 'careful', 'how', 'going', 'take', 'lead', 'welsh', 'national', 'nfu', 'chairman', 'john', 'lloyd', 'jones', 'bbc', 'radio', 'bonn', 'has', 'led', 'efforts', 'public', 'consumer', 'confidence', 'collapsed', 'in', 'march', 'report', 'suggested', 'humans', 'illness', 'eating', 'contaminated', 'beef', 'imported', '47,600', 'year', 'nearly', 'half', 'total', 'imports', 'brought', '4,275', 'tonnes', 'mutton', 'some', '10', 'percent', 'overall', 'rare', 'hendrix', 'song', 'draft', 'sells', 'almost', '$', '17,000', 'london', 'handwritten', 'u.s.', 'guitar', 'legend', 'jimi', 'sold', 'auction', 'late', 'musician', 'favourite', 'possessions', 'florida', 'restaurant', 'paid', '10,925', 'pounds', '16,935', 'ai', 'no', 'telling', 'penned', 'piece', 'hotel', 'stationery', '1966', 'end', 'january', '1967', 'concert', 'english', 'city', 'nottingham', 'threw', 'sheet', 'paper', 'into', 'audience', 'where', 'retrieved', 'fan', 'buyers', 'also', 'snapped', 'up', '16', 'items', 'were', 'put', 'former', 'girlfriend', 'kathy', 'etchingham', 'who', 'lived', 'him', '1969', 'they', 'included', 'black', 'lacquer', 'mother', 'pearl', 'inlaid', 'box', 'used', 'store', 'drugs', 'anonymous', 'australian', 'purchaser', 'bought', '5,060', '7,845', 'guitarist', 'died', 'overdose', '1970', 'aged', '27', 'china', 'says', 'taiwan', 'spoils', 'atmosphere', 'talks', 'beijing', 'taipei', 'spoiling', 'resumption', 'strait', 'visit', 'ukraine', 
'taiwanese', 'vice', 'president', 'lien', 'chan', 'this', 'week', 'infuriated', 'speaking', 'hours', 'chinese', 'state', 'media', 'time', 'right', 'engage', 'political', 'foreign', 'ministry', 'shen', 'guofang', 'reuters', ':', 'necessary', 'opening', 'disrupted', 'authorities', 'quoted', 'top', 'negotiator', 'tang', 'shubei', 'visiting', 'group', 'rivals', 'hold', 'now', 'two', 'sides', '...', 'hostility', 'overseas', 'edition', 'people', 'daily', 'saying', 'television', 'interview', 'read', 'comments', 'gave', 'details', 'why', 'consi
]
}
],
"source": [
"v = build_vocab(slowa_train)\n",
"v.set_default_index(v[\"<unk>\"])\n",
"itos = v.get_itos() # mapowanie indeksów na tokeny\n",
"print(len(itos)) # liczba różnych tokenów w słowniku\n",
"print(itos)"
]
},
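{
"cell_type": "markdown",
"metadata": {},
"source": [
"A round-trip check on the vocabulary (an added sketch; `zzz-not-in-vocab` is a made-up token): known tokens map to the indices shown above, and anything unseen falls back to `<unk>` thanks to `set_default_index`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(v['eu'], itos[v['eu']]) # 4 eu, matching the printout above\n",
"print(v['zzz-not-in-vocab']) # 0, the <unk> index"
]
},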
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'B-ORG': 0, 'O': 1, 'B-MISC': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}\n"
]
}
],
"source": [
"# slownik etykiety - kody etykiet\n",
"etykieta_na_kod = {}\n",
"licznik = 0\n",
"for tekst in train[\"y\"]:\n",
" for etykieta in tekst.split(\" \"):\n",
" if etykieta not in etykieta_na_kod:\n",
" etykieta_na_kod[etykieta] = licznik\n",
" licznik+=1\n",
"print(etykieta_na_kod)"
]
},
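{
"cell_type": "markdown",
"metadata": {},
"source": [
"The inverse mapping (an added sketch) is handy when decoding predictions back to label strings; it is equivalent to the search loop used later in `zamien_przewidziane_kody_na_etykiety`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# invert the label -> code dictionary (the codes are unique by construction)\n",
"kod_na_etykiete = {k: e for e, k in etykieta_na_kod.items()}\n",
"print(kod_na_etykiete[0]) # 'B-ORG'"
]
},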
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0, 1, 2, 1, 1, 1, 2, 1, 1, 1, 3, 4, 1, 5, 1, 1, 1, 0, 6, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 0, 6, 1, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 3, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 5, 1, 5, 1, 1, 1, 1, 1, 1, 1, 2, 7, 7, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 4, 4, 1, 1, 1, 3, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 5, 1, 3, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 0, 6, 6, 6, 6, 1, 0, 1, 1, 3, 4, 4, 1, 1, 0, 6, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n"
]
}
],
"source": [
"# podziel etykiety\n",
"kody_etykiet_train = []\n",
"for tekst in train[\"y\"]:\n",
" pom = []\n",
" for etykieta in tekst.split(\" \"):\n",
" pom.append(etykieta_na_kod[etykieta])\n",
" kody_etykiet_train.append(pom)\n",
"print(kody_etykiet_train[0])"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"O O B-ORG O O O O O O O O O B-LOC O O B-MISC I-MISC O B-PER I-PER O O O O O O O B-ORG O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O B-ORG O B-ORG O O O O O O B-ORG O O O O O O O O O O B-ORG O O O O B-ORG O O O O O O O O B-LOC I-LOC O B-ORG O O O O O O O O O O O O O O B-LOC O B-PER I-PER O O O O O O O O O O B-ORG O O O O O O O O O B-PER O O O O O O O O O O B-ORG O O O O O O O O O O O B-PER I-PER O B-PER I-PER O O O O O O O O O B-ORG O B-LOC O O B-PER O O O O B-LOC O O O O O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O O O B-ORG O O O O O O O O O B-PER O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-LOC O B-ORG O B-PER I-PER O O O O O B-LOC O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O O O O O O O B-LOC O B-PER I-PER O O O O B-ORG O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O B-ORG O O O O O O O O O O O O O O O O O O O O B-MISC B-PER I-PER O O O O O B-PER I-PER O O O O B-PER I-PER O O O O B-ORG O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-ORG O O O O O O B-ORG O O O O O O O O O O O O O B-PER I-PER O B-MISC O O B-PER I-PER O O O O O O O O B-ORG O O O O O O O\n",
"456\n",
"[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 2, 7, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 5, 8, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 5, 1, 1, 3, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 0, 1, 3, 4, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 5, 1, 3, 4, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 1, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 1, 2, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]\n",
"456\n"
]
}
],
"source": [
"# odczytaj etykiety dev-0\n",
2024-05-25 17:47:40 +02:00
"labels_dev0 = pd.read_csv('dev-0/expected.tsv', sep='\\t', names=['y'], header=None)\n",
"\n",
2024-05-23 21:25:06 +02:00
"print(labels_dev0[\"y\"][0])\n",
2024-05-25 17:47:40 +02:00
"print(len(labels_dev0[\"y\"][0].split(\" \")))\n",
2024-05-23 21:25:06 +02:00
"\n",
"# podziel etykiety\n",
"kody_etykiet_dev0 = []\n",
"for tekst in labels_dev0[\"y\"]:\n",
" pom = []\n",
" for etykieta in tekst.split(\" \"):\n",
" pom.append(etykieta_na_kod[etykieta])\n",
" kody_etykiet_dev0.append(pom)\n",
"\n",
"print(kody_etykiet_dev0[0])\n",
"print(len(kody_etykiet_dev0[0]))"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"def data_process(dt):\n",
" # Wektoryzacja dokumentów tekstowych.\n",
" return [\n",
" torch.tensor(\n",
" [v[\"<bos>\"]] + [v[token] for token in document] + [v[\"<eos>\"]],\n",
" dtype=torch.long,\n",
" )\n",
" for document in dt\n",
" ]\n",
"\n",
"def labels_process(dt):\n",
" # Wektoryzacja etykiet (NER)\n",
" return [torch.tensor([0] + document + [0], dtype=torch.long) for document in dt]"
]
},
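{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small check of the vectorization (an added sketch, using tokens from the vocabulary printout above): each token becomes its index and the document is wrapped in `<bos>` (2) and `<eos>` (3)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(data_process([['eu', 'rejects', 'german']]))\n",
"# expected: [tensor([2, 4, 5, 6, 3])]"
]
},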
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"train_tokens_ids = data_process(slowa_train)\n",
"test_dev0_tokens_ids = data_process(slowa_test_dev0)\n",
"test_A_tokens_ids = data_process(slowa_test_A)\n",
"\n",
"train_labels = labels_process(kody_etykiet_train)\n",
"test_dev0_labels = labels_process(kody_etykiet_dev0)"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"945 491\n",
"215 458\n",
"230 443\n",
"tensor([ 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 13,\n",
" 16, 17, 13, 18, 19, 20, 21, 22, 23, 24, 25, 26, 6, 27,\n",
" 8, 28, 8, 29, 10, 11, 30, 31, 32, 33, 34, 35, 36, 37,\n",
" 38, 39, 8, 40, 12, 13, 41, 42, 43, 8, 18, 19, 44, 42,\n",
" 45, 46, 47, 48, 21, 22, 49, 28, 50, 51, 52, 53, 54, 55,\n",
" 56, 57, 30, 18, 58, 27, 59, 60, 12, 13, 61, 62, 63, 64,\n",
" 65, 66, 67, 68, 69, 62, 63, 64, 70, 66, 71, 72, 24, 73,\n",
" 61, 18, 20, 42, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,\n",
" 12, 13, 84, 21, 85, 58, 86, 59, 87, 88, 89, 24, 59, 90,\n",
" 91, 92, 59, 93, 24, 50, 38, 94, 95, 18, 19, 44, 12, 13,\n",
" 84, 21, 81, 96, 97, 98, 95, 4, 99, 100, 101, 102, 8, 103,\n",
" 40, 104, 73, 105, 88, 106, 107, 53, 18, 108, 88, 109, 110, 111,\n",
" 59, 81, 112, 113, 88, 114, 115, 8, 116, 108, 117, 12, 13, 102,\n",
" 118, 119, 120, 121, 122, 53, 57, 88, 123, 91, 124, 125, 126, 40,\n",
" 127, 128, 129, 130, 131, 132, 133, 134, 135, 34, 35, 36, 12, 13,\n",
" 136, 102, 137, 8, 138, 139, 96, 121, 18, 4, 42, 140, 45, 46,\n",
" 73, 141, 109, 117, 142, 73, 143, 89, 67, 92, 59, 144, 145, 146,\n",
" 59, 147, 81, 148, 149, 8, 108, 117, 12, 13, 150, 99, 151, 152,\n",
" 153, 154, 155, 156, 157, 102, 158, 159, 4, 99, 160, 161, 162, 163,\n",
" 164, 165, 166, 167, 61, 168, 169, 12, 61, 13, 12, 13, 147, 123,\n",
" 88, 57, 170, 102, 42, 96, 12, 13, 18, 4, 42, 58, 45, 88,\n",
" 171, 172, 173, 174, 8, 175, 18, 176, 177, 178, 98, 88, 179, 180,\n",
" 8, 18, 181, 45, 142, 12, 13, 40, 182, 183, 184, 185, 8, 128,\n",
" 186, 73, 81, 187, 36, 188, 8, 133, 189, 190, 191, 8, 182, 184,\n",
" 192, 8, 193, 167, 194, 195, 109, 196, 12, 13, 10, 197, 198, 22,\n",
" 23, 146, 59, 66, 199, 8, 108, 117, 53, 200, 40, 73, 136, 201,\n",
" 202, 91, 6, 203, 27, 8, 28, 8, 204, 10, 11, 205, 206, 28,\n",
" 207, 208, 12, 13, 61, 209, 62, 182, 8, 38, 210, 211, 163, 190,\n",
" 212, 55, 54, 173, 213, 8, 214, 41, 42, 215, 73, 61, 216, 217,\n",
" 197, 161, 44, 132, 218, 134, 219, 220, 221, 222, 21, 22, 223, 224,\n",
" 12, 13, 225, 226, 227, 228, 8, 116, 229, 117, 121, 230, 231, 232,\n",
" 233, 234, 121, 81, 10, 235, 236, 237, 127, 128, 159, 238, 188, 8,\n",
" 34, 35, 36, 95, 239, 240, 241, 12, 13, 41, 242, 243, 40, 53,\n",
" 57, 97, 244, 73, 245, 246, 163, 247, 248, 12, 13, 24, 249, 233,\n",
" 250, 251, 163, 10, 252, 73, 253, 254, 255, 163, 256, 248, 12, 13,\n",
" 3])\n",
"tensor([ 2, 2005, 640, 2049, 214, 779, 158, 386, 121, 2094,\n",
" 1802, 12, 13, 265, 19358, 13, 1005, 5480, 2289, 1697,\n",
" 2052, 768, 2171, 72, 8937, 22, 1098, 145, 2049, 1762,\n",
" 2021, 95, 159, 2094, 88, 3721, 2987, 233, 395, 951,\n",
" 8, 214, 779, 158, 18, 1815, 163, 18, 2006, 2007,\n",
" 12, 13, 200, 4835, 22, 386, 73, 6024, 73, 1066,\n",
" 38, 14326, 145, 1975, 392, 2038, 73, 9340, 88, 2074,\n",
" 416, 1362, 233, 22, 1802, 481, 2041, 3394, 309, 72,\n",
" 2855, 371, 233, 200, 16878, 1772, 746, 2068, 12, 13,\n",
" 121, 2137, 2021, 788, 72, 6261, 22, 18, 382, 3407,\n",
" 158, 14885, 3408, 73, 2049, 1200, 200, 652, 2094, 95,\n",
" 2112, 2987, 1023, 1592, 6184, 788, 72, 0, 26, 1348,\n",
" 0, 2828, 14773, 1982, 992, 72, 6261, 12, 13, 8333,\n",
" 95, 0, 73, 2021, 2967, 81, 5593, 1575, 8, 200,\n",
" 501, 2094, 1023, 2052, 6760, 233, 8, 0, 1088, 788,\n",
" 72, 16349, 12, 13, 2038, 73, 1287, 73, 2539, 3442,\n",
" 8, 14563, 200, 386, 975, 121, 14761, 2102, 88, 14,\n",
" 67, 408, 1088, 81, 1125, 13917, 22, 200, 1772, 746,\n",
" 2084, 158, 2083, 12, 13, 2102, 73, 411, 1532, 8,\n",
" 1348, 42, 9847, 5710, 73, 6469, 14794, 73, 139, 652,\n",
" 2007, 6162, 163, 18, 1844, 73, 145, 2038, 6329, 0,\n",
" 88, 768, 81, 652, 2094, 215, 163, 2869, 12, 13,\n",
" 95, 18, 1599, 2084, 155, 4486, 91, 301, 81, 0,\n",
" 6485, 136, 7294, 67, 155, 0, 200, 5382, 73, 1982,\n",
" 2171, 72, 677, 233, 9000, 3384, 88, 1971, 1088, 9164,\n",
" 22, 3383, 72, 1881, 88, 9896, 72, 6164, 12, 13,\n",
" 158, 18, 2093, 73, 2074, 2526, 1825, 2115, 73, 2167,\n",
" 2168, 0, 95, 1348, 73, 1241, 8, 4778, 139, 4601,\n",
" 145, 84, 5703, 139, 2171, 72, 1355, 22, 23, 26,\n",
" 5493, 790, 788, 22, 1098, 233, 18, 1772, 746, 2076,\n",
" 12, 13, 84, 59, 1490, 170, 95, 1348, 17445, 1684,\n",
" 0, 318, 3394, 1663, 145, 2074, 1362, 22, 0, 72,\n",
" 1095, 73, 81, 215, 163, 0, 12, 13, 9340, 844,\n",
" 309, 18, 5713, 72, 200, 652, 2007, 1975, 765, 19457,\n",
" 95, 5429, 2082, 8, 6697, 72, 1881, 233, 200, 501,\n",
" 2094, 73, 712, 550, 2987, 1836, 53, 8607, 159, 2094,\n",
" 3778, 12, 13, 334, 1721, 584, 768, 996, 72, 2869,\n",
" 136, 1825, 2772, 73, 0, 73, 88, 1686, 0, 73,\n",
" 6272, 73, 768, 9340, 8, 0, 88, 81, 652, 2094,\n",
" 215, 163, 0, 12, 13, 121, 18, 0, 163, 6016,\n",
" 18, 382, 1451, 163, 200, 1772, 12355, 6360, 95, 18,\n",
" 6167, 73, 2041, 6760, 309, 81, 6675, 8, 0, 2068,\n",
" 72, 0, 12, 13, 322, 312, 428, 309, 95, 81,\n",
" 0, 5763, 53, 1671, 2807, 136, 0, 2284, 2285, 1949,\n",
" 0, 768, 2171, 72, 3892, 12, 13, 95, 0, 2041,\n",
" 155, 6329, 2053, 72, 992, 12, 13, 3])\n",
"tensor([ 2, 1759, 640, 1677, 1997, 6422, 2306, 73, 345, 233,\n",
" 3805, 3778, 12, 13, 0, 0, 13, 0, 73, 820,\n",
" 1077, 1078, 0, 13, 1677, 1315, 18, 3104, 163, 200,\n",
" 5613, 1770, 1975, 26, 81, 6422, 1763, 2306, 746, 667,\n",
" 233, 81, 391, 2103, 2007, 1772, 22, 1098, 12, 13,\n",
" 136, 345, 4535, 200, 3815, 4790, 1088, 233, 18, 501,\n",
" 1772, 163, 18, 391, 73, 7709, 8, 81, 3805, 2328,\n",
" 3778, 8, 17348, 0, 12, 13, 345, 1114, 939, 163,\n",
" 18, 1772, 88, 4535, 3692, 3916, 6349, 30, 18, 2403,\n",
" 2302, 1108, 0, 2164, 0, 0, 768, 6485, 163, 81,\n",
" 0, 754, 9477, 8, 11483, 18, 6366, 779, 18, 3888,\n",
" 368, 11451, 88, 301, 159, 7667, 1373, 12, 13, 0,\n",
" 0, 3394, 4647, 163, 18, 2306, 233, 3841, 371, 73,\n",
" 6365, 159, 0, 1156, 6322, 2940, 53, 1876, 923, 18,\n",
" 3741, 12, 13, 18, 314, 5957, 1126, 59, 3817, 233,\n",
" 159, 5613, 1770, 3123, 2488, 72, 18, 652, 371, 12,\n",
" 13, 781, 2856, 18, 5613, 1839, 1975, 395, 1201, 2577,\n",
" 73, 0, 173, 233, 18, 3123, 145, 12225, 12, 13,\n",
" 395, 2357, 53, 754, 12990, 233, 18, 97, 996, 3097,\n",
" 2999, 1677, 8, 3918, 53, 1572, 88, 8962, 416, 992,\n",
" 1469, 53, 200, 382, 162, 746, 667, 12, 13, 0,\n",
" 0, 2159, 18, 1832, 233, 18, 0, 2302, 73, 3776,\n",
" 8, 1815, 81, 0, 0, 5287, 425, 18, 710, 426,\n",
" 189, 9691, 0, 0, 691, 8, 182, 14169, 136, 2000,\n",
" 2999, 8, 17948, 301, 18, 1373, 12, 13, 24, 59,\n",
" 18, 501, 6799, 17399, 95, 667, 233, 2171, 3097, 12,\n",
" 13, 7236, 875, 18629, 472, 8, 0, 81, 183, 6366,\n",
" 301, 18, 3741, 233, 18, 1781, 2302, 136, 147, 10460,\n",
" 8, 0, 24, 301, 18, 386, 11605, 163, 0, 42,\n",
" 426, 12, 13, 0, 0, 155, 1635, 667, 18, 215,\n",
" 26, 81, 0, 9477, 233, 18, 2983, 2302, 12, 13,\n",
" 1677, 2000, 6670, 0, 8, 18, 710, 2338, 3741, 72,\n",
" 939, 163, 18, 2210, 136, 20403, 0, 18, 710, 3104,\n",
" 12, 13, 0, 5634, 661, 4851, 0, 0, 322, 1121,\n",
" 12, 13, 1677, 2532, 0, 0, 21, 380, 161, 161,\n",
" 18, 710, 961, 426, 15063, 6422, 72, 596, 12, 13,\n",
" 18, 714, 2159, 177, 88, 2000, 2489, 0, 88, 8579,\n",
" 183, 3384, 189, 3394, 24, 5616, 72, 596, 12, 161,\n",
" 13, 161, 13, 1677, 73, 0, 163, 18, 1654, 1770,\n",
" 233, 1590, 88, 3806, 11627, 233, 18, 1654, 95, 12663,\n",
" 73, 173, 12675, 8, 14563, 200, 1975, 719, 12, 13,\n",
" 0, 1071, 2008, 933, 88, 1705, 2589, 214, 22, 2609,\n",
" 22, 4025, 233, 391, 81, 2010, 12, 13, 416, 2171,\n",
" 2957, 173, 912, 26, 808, 3894, 3024, 53, 808, 2210,\n",
" 12, 13, 3])\n"
]
}
],
"source": [
"print(len(train_tokens_ids), len(train_tokens_ids[0]))\n",
"print(len(test_dev0_tokens_ids), len(test_dev0_tokens_ids[0]))\n",
"print(len(test_A_tokens_ids), len(test_A_tokens_ids[0]))\n",
"\n",
"print(train_tokens_ids[0])\n",
"print(test_dev0_tokens_ids[0])\n",
"print(test_A_tokens_ids[0])"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"945 491\n",
"215 458\n",
"tensor([0, 0, 1, 2, 1, 1, 1, 2, 1, 1, 1, 3, 4, 1, 5, 1, 1, 1, 0, 6, 1, 1, 1, 1,\n",
" 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 5, 1, 1, 1, 1, 0, 6, 1, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 0, 1, 1, 1, 3, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 6, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 0, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 5, 1, 5, 1,\n",
" 1, 1, 1, 1, 1, 1, 2, 7, 7, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1,\n",
" 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 3, 4, 4, 1, 1, 1, 3, 1, 1, 0, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 5, 1, 3, 1, 1, 1, 1,\n",
" 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 0, 6, 6, 6,\n",
" 6, 1, 0, 1, 1, 3, 4, 4, 1, 1, 0, 6, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 5, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])\n",
"tensor([0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 2, 7, 1, 3, 4, 1, 1, 1,\n",
" 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,\n",
" 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 5, 8, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5,\n",
" 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4,\n",
" 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 5, 1, 1, 3, 1, 1, 1, 1, 5, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1,\n",
" 0, 1, 3, 4, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 5, 1, 3, 4, 1,\n",
" 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 2, 3, 4, 1, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 0,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 3, 4, 1, 2, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,\n",
" 1, 0])\n"
]
}
],
"source": [
"print(len(train_labels), len(train_labels[0]))\n",
"print(len(test_dev0_labels), len(test_dev0_labels[0]))\n",
"\n",
"print(train_labels[0])\n",
"print(test_dev0_labels[0])"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"def get_scores(y_true, y_pred):\n",
" # Funkcja zwraca precyzję, pokrycie i F1\n",
" acc_score = 0\n",
" tp = 0\n",
" fp = 0\n",
" selected_items = 0\n",
" relevant_items = 0\n",
"\n",
" for p, t in zip(y_pred, y_true):\n",
" if p == t:\n",
" acc_score += 1\n",
"\n",
" if p > 0 and p == t:\n",
" tp += 1\n",
"\n",
" if p > 0:\n",
" selected_items += 1\n",
"\n",
" if t > 0:\n",
" relevant_items += 1\n",
"\n",
" if selected_items == 0:\n",
" precision = 1.0\n",
" else:\n",
" precision = tp / selected_items\n",
"\n",
" if relevant_items == 0:\n",
" recall = 1.0\n",
" else:\n",
" recall = tp / relevant_items\n",
"\n",
" if precision + recall == 0.0:\n",
" f1 = 0.0\n",
" else:\n",
" f1 = 2 * precision * recall / (precision + recall)\n",
"\n",
" return precision, recall, f1"
]
},
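{
"cell_type": "markdown",
"metadata": {},
"source": [
"A toy run of `get_scores` (an added sketch with made-up code sequences): three positions are predicted non-zero (selected), three are truly non-zero (relevant), and two of them match, so precision = recall = 2/3."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# y_true, y_pred are made-up label-code sequences\n",
"print(get_scores([1, 0, 2, 1], [1, 0, 1, 1]))\n",
"# -> (0.666..., 0.666..., 0.666...)"
]
},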
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"num_tags = len(etykieta_na_kod.keys())\n",
"\n",
"class LSTM(torch.nn.Module):\n",
"\n",
" def __init__(self):\n",
" super(LSTM, self).__init__()\n",
" self.emb = torch.nn.Embedding(len(v.get_itos()), 100)\n",
" self.rec = torch.nn.LSTM(100, 256, 1, batch_first=True)\n",
" self.fc1 = torch.nn.Linear(256, num_tags)\n",
"\n",
" def forward(self, x):\n",
" emb = torch.relu(self.emb(x))\n",
" lstm_output, (h_n, c_n) = self.rec(emb)\n",
" out_weights = self.fc1(lstm_output)\n",
" return out_weights"
]
},
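{
"cell_type": "markdown",
"metadata": {},
"source": [
"A shape check on an untrained instance (an added sketch; the random input is hypothetical): a batch of one document with 5 token ids yields one 9-way logit vector per token."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"m = LSTM()\n",
"x = torch.randint(0, len(v.get_itos()), (1, 5))\n",
"print(m(x).shape) # torch.Size([1, 5, 9]), since num_tags == 9"
]
},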
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"def eval_model(dataset_tokens, dataset_labels, model):\n",
" Y_true = []\n",
" Y_pred = []\n",
" for i in tqdm(range(len(dataset_labels))):\n",
" batch_tokens = dataset_tokens[i].unsqueeze(0)\n",
" tags = list(dataset_labels[i].numpy())\n",
" Y_true += tags\n",
"\n",
" Y_batch_pred_weights = model(batch_tokens).squeeze(0)\n",
" Y_batch_pred = torch.argmax(Y_batch_pred_weights, 1)\n",
" Y_pred += list(Y_batch_pred.numpy())\n",
"\n",
" return get_scores(Y_true, Y_pred)"
]
},
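{
"cell_type": "markdown",
"metadata": {},
"source": [
"`eval_model` runs the forward pass with autograd enabled; wrapping the call in `torch.no_grad()` (an added sketch, not part of the original lab code) skips building the gradient graph while scoring."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def eval_model_nograd(dataset_tokens, dataset_labels, model):\n",
"    # same scores as eval_model, just without gradient tracking\n",
"    with torch.no_grad():\n",
"        return eval_model(dataset_tokens, dataset_labels, model)"
]
},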
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"lstm = LSTM()\n",
"criterion = torch.nn.CrossEntropyLoss()\n",
"optimizer = torch.optim.Adam(lstm.parameters())\n",
2024-05-25 17:47:40 +02:00
"NUM_EPOCHS = 2 #100 #50"
2024-05-23 21:25:06 +02:00
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 945/945 [00:25<00:00, 36.80it/s]\n",
"100%|██████████| 945/945 [00:31<00:00, 30.23it/s]\n"
]
}
],
"source": [
"for i in range(NUM_EPOCHS):\n",
" lstm.train()\n",
" # for i in tqdm(range(500)):\n",
" for i in tqdm(range(len(train_labels))):\n",
" batch_tokens = train_tokens_ids[i].unsqueeze(0)\n",
" tags = train_labels[i].unsqueeze(1)\n",
"\n",
" predicted_tags = lstm(batch_tokens)\n",
"\n",
" optimizer.zero_grad()\n",
" loss = criterion(predicted_tags.squeeze(0), tags.squeeze(1))\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" lstm.eval()"
]
},
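{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally persist the trained weights (an added sketch; the file name is a hypothetical choice) so the evaluation and prediction cells below can be re-run without retraining."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# save the trained parameters (path is hypothetical)\n",
"torch.save(lstm.state_dict(), 'lstm_ner.pt')\n",
"# to restore later: lstm.load_state_dict(torch.load('lstm_ner.pt'))"
]
},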
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 215/215 [00:00<00:00, 238.89it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.8949339082089975, 0.8914443942133815, 0.8931857430003114)\n"
]
}
],
"source": [
"print(eval_model(test_dev0_tokens_ids, test_dev0_labels, lstm))"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"def zwroc_przewidywania(tokeny):\n",
" Y_pred = []\n",
" for i in tqdm(range(len(tokeny))):\n",
" pom1 = lstm(tokeny[i])\n",
" #print(pom1)\n",
" pom2 = torch.argmax(pom1,1)\n",
" #print(pom2)\n",
" pom3 = list(pom2.numpy())\n",
" #print(pom3)\n",
" Y_pred.append(pom3)\n",
" return Y_pred"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"# decode the predicted codes back to label strings and drop <bos>/<eos>\n",
"def zamien_przewidziane_kody_na_etykiety(przewidywania):\n",
" etykiety = []\n",
" for lista in przewidywania:\n",
" pom = []\n",
" for kod in lista:\n",
" etykieta = None\n",
" for e, k in etykieta_na_kod.items():\n",
" if kod == k:\n",
" etykieta = e\n",
" pom.append(etykieta)\n",
" etykiety.append(pom)\n",
2024-05-25 17:47:40 +02:00
" del pom[0]\n",
" del pom[-1]\n",
2024-05-23 21:25:06 +02:00
" return etykiety"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 215/215 [00:00<00:00, 247.70it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.8949339082089975, 0.8914443942133815, 0.8931857430003114)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 215/215 [00:00<00:00, 239.42it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5, 8, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 8, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 5, 1, 1, 1, 1, 1, 6, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 6, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 5, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 6, 1, 1, 1, 1, 1, 8, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 1, 5, 6, 1, 6, 6, 1, 1, 1, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 0]\n",
"458\n",
"['O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'I-ORG', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-PER', 'O', 'B-LOC', 'I-ORG', 'O', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n",
"456\n",
"tensor([0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 2, 7, 1, 3, 4, 1, 1, 1,\n",
" 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,\n",
" 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 5, 8, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5,\n",
" 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4,\n",
" 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 5, 1, 1, 3, 1, 1, 1, 1, 5, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1,\n",
" 0, 1, 3, 4, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 5, 1, 3, 4, 1,\n",
" 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 2, 3, 4, 1, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 0,\n",
" 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
" 1, 3, 4, 1, 2, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,\n",
" 1, 0])\n",
"458\n",
"[1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 2, 7, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 5, 8, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 5, 1, 1, 3, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 0, 1, 3, 4, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 5, 1, 3, 4, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 1, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 3, 4, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4, 1, 2, 1, 1, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]\n",
"456\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"print(eval_model(test_dev0_tokens_ids, test_dev0_labels, lstm))\n",
"przewidywania_kody_test_dev0 = zwroc_przewidywania(test_dev0_tokens_ids)\n",
"print(przewidywania_kody_test_dev0[0])\n",
"print(len(przewidywania_kody_test_dev0[0]))\n",
"przewidywania_etykiety_test_dev0 = zamien_przewidziane_kody_na_etykiety(przewidywania_kody_test_dev0)\n",
"print(przewidywania_etykiety_test_dev0[0])\n",
"print(len(przewidywania_etykiety_test_dev0[0]))\n",
"\n",
"print(test_dev0_labels[0])\n",
"print(len(test_dev0_labels[0]))\n",
"print(kody_etykiet_dev0[0])\n",
"print(len(kody_etykiet_dev0[0]))"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"with open(\"dev-0/out.tsv\", \"w\", encoding=\"utf-8\") as uwu:\n",
" for lista in przewidywania_etykiety_test_dev0:\n",
" for etykieta in lista:\n",
" uwu.write(str(etykieta) + \" \")\n",
" uwu.write(str(\"\\n\"))"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 230/230 [00:00<00:00, 278.11it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0, 1, 1, 5, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 5, 8, 1, 5, 1, 5, 8, 6, 6, 1, 5, 1, 1, 1, 1, 1, 1, 7, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 0, 8, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 5, 8, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 0, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 0, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 5, 1, 1, 1, 7, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 4, 1, 6, 1, 5, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]\n",
"['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-ORG', 'I-ORG', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'I-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-ORG', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'I-MISC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-PER', 'O', 'I-ORG', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"przewidywania_kody_test_A = zwroc_przewidywania(test_A_tokens_ids)\n",
"print(przewidywania_kody_test_A[0])\n",
"przewidywania_etykiety_test_A = zamien_przewidziane_kody_na_etykiety(przewidywania_kody_test_A)\n",
"print(przewidywania_etykiety_test_A[0])"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"with open(\"test-A/out.tsv\", \"w\", encoding=\"utf-8\") as uwu:\n",
" for lista in przewidywania_etykiety_test_A:\n",
" for etykieta in lista:\n",
" uwu.write(str(etykieta) + \" \")\n",
" uwu.write(str(\"\\n\"))"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 945/945 [00:03<00:00, 273.52it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(0.9134328639027648, 0.919004869645874, 0.9162103952046085)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 945/945 [00:03<00:00, 289.35it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0, 5, 6, 2, 1, 1, 1, 2, 1, 1, 1, 5, 8, 1, 5, 1, 1, 1, 2, 6, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 2, 6, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 5, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 3, 4, 1, 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 5, 1, 3, 1, 1, 1, 1, 1, 0, 1, 1, 4, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]\n",
"['B-LOC', 'I-ORG', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'B-MISC', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'B-MISC', 'I-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-LOC', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'I-PER', 'O', 'O', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']\n"
]
}
],
"source": [
"print(eval_model(train_tokens_ids, train_labels, lstm))\n",
"przewidywania_kody_test_train = zwroc_przewidywania(train_tokens_ids)\n",
"print(przewidywania_kody_test_train[0])\n",
"przewidywania_etykiety_test_train = zamien_przewidziane_kody_na_etykiety(przewidywania_kody_test_train)\n",
"print(przewidywania_etykiety_test_train[0])"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"with open(\"train/out.tsv\", \"w\", encoding=\"utf-8\") as uwu:\n",
" for lista in przewidywania_etykiety_test_train:\n",
" for etykieta in lista:\n",
" uwu.write(str(etykieta) + \" \")\n",
" uwu.write(str(\"\\n\"))"
]
}
],
"metadata": {
"author": "Jakub Pokrywka",
"email": "kubapok@wmi.amu.edu.pl",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"lang": "pl",
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
},
"subtitle": "11.NER RNN[ćwiczenia]",
"title": "Ekstrakcja informacji",
"year": "2021"
},
"nbformat": 4,
"nbformat_minor": 4
}