From 72c6fbcbf683026ba45e33f74e31a2d3789f1adc Mon Sep 17 00:00:00 2001 From: Filip Gralinski Date: Mon, 27 Sep 2021 07:36:37 +0200 Subject: [PATCH] Fixes --- wyk/01_Wyszukiwarki-wprowadzenie.ipynb | 25 +++++- wyk/14_pretrenowanie.ipynb | 113 ++++++++++++++++--------- 2 files changed, 94 insertions(+), 44 deletions(-) diff --git a/wyk/01_Wyszukiwarki-wprowadzenie.ipynb b/wyk/01_Wyszukiwarki-wprowadzenie.ipynb index dba39ef..934186d 100644 --- a/wyk/01_Wyszukiwarki-wprowadzenie.ipynb +++ b/wyk/01_Wyszukiwarki-wprowadzenie.ipynb @@ -3,6 +3,22 @@ { "cell_type": "markdown", "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

1. Wyszukiwarki - wprowadzenie [wykład]

\n", + "

Filip Graliński (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Wyszukiwarki - wprowadzenie\n", "\n", @@ -13,7 +29,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, "source": [ "## Wyszukiwarki\n", "\n", @@ -1676,7 +1695,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1690,7 +1709,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.9.6" } }, "nbformat": 4, diff --git a/wyk/14_pretrenowanie.ipynb b/wyk/14_pretrenowanie.ipynb index ae7721a..b47815e 100644 --- a/wyk/14_pretrenowanie.ipynb +++ b/wyk/14_pretrenowanie.ipynb @@ -24,7 +24,7 @@ "Jest kilka sposobów na pretrenowanie modelu, w każdym razie sprowadza\n", "się do odgadywania następnego bądź zamaskowanego słowa.\n", "W każdym razie zawsze stosujemy softmax (być może ze „sztuczkami” takimi jak\n", - "negatywne próbkowanie albo hierarchiczny softamx) na pewnej **representecji kontekstowej**:\n", + "negatywne próbkowanie albo hierarchiczny softamx) na pewnej **reprezentacji kontekstowej**:\n", "\n", "$$\\vec{p} = \\operatorname{softmax}(f(\\vec{c})).$$\n", "\n", @@ -66,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -79,39 +79,39 @@ { "data": { "text/plain": [ - "[('Ġon', 0.6786560416221619),\n", - " ('Ġupon', 0.04339785501360893),\n", - " ('Ġheavily', 0.02208443358540535),\n", - " ('Ġin', 0.021049050614237785),\n", - " (',', 0.020188499242067337),\n", - " ('Ġa', 0.01833895780146122),\n", - " ('Ġvery', 0.017935041338205338),\n", - " ('Ġentirely', 0.017528969794511795),\n", - " ('Ġlargely', 0.016769640147686005),\n", - " ('Ġto', 0.01009418722242117),\n", - " ('Ġgreatly', 0.010009866207838058),\n", - " ('Ġnot', 0.009016563184559345),\n", - " ('Ġmore', 0.005853226874023676),\n", - " ('Ġprimarily', 0.005203146021813154),\n", - " ('Ġstrongly', 0.0034501152113080025),\n", - " ('Ġpartly', 0.0033184229396283627),\n", - " ('Ġmuch', 0.0033095215912908316),\n", - " ('Ġmostly', 0.0032150144688785076),\n", - " ('Ġmainly', 0.0030899408739060163),\n", - " ('Ġfor', 0.003034428460523486),\n", - " ('.', 0.0028878094162791967),\n", - " ('Ġboth', 0.0028405177872627974),\n", - " ('Ġsomewhat', 0.0028194624464958906),\n", - " ('Ġcru', 0.002263976726680994),\n", - " ('Ġas', 0.00221616611815989),\n", - " ('Ġof', 0.0022000609897077084),\n", - " ('Ġalmost', 0.001968063646927476),\n", - " ('Ġat', 0.0018015997484326363),\n", - " ('Ġhighly', 0.0017461496172472835),\n", - " ('Ġcompletely', 0.001692073536105454)]" + "[('Âł', 0.6182783842086792),\n", + " ('È', 0.1154019758105278),\n", + " ('Ñģ', 0.026960616931319237),\n", + " ('_____', 0.024418892338871956),\n", + " ('________', 0.014962316490709782),\n", + " ('ÃĤ', 0.010653386823832989),\n", + " ('ä¸Ń', 0.008340531960129738),\n", + " ('Ñ', 0.007557711564004421),\n", + " ('Ê', 0.007046067621558905),\n", + " ('ãĢ', 0.006875576451420784),\n", + " ('ile', 0.006685272324830294),\n", + " ('____', 0.006307446397840977),\n", + " ('âĢĭ', 0.006306538358330727),\n", + " ('ÑĢ', 0.006197483278810978),\n", + " ('ĠBelarus', 0.006108700763434172),\n", + " ('Æ', 0.005720408633351326),\n", + " ('ĠPoland', 0.0053678699769079685),\n", + " ('á¹', 
0.004606408067047596),\n", + " ('îĢ', 0.004161055199801922),\n", + " ('????', 0.004056799225509167),\n", + " ('_______', 0.0038176667876541615),\n", + " ('ä¸', 0.0036082742735743523),\n", + " ('Ì', 0.003221835708245635),\n", + " ('urs', 0.003080119378864765),\n", + " ('________________', 0.0027312245219945908),\n", + " ('ĠLithuania', 0.0023860156070441008),\n", + " ('ich', 0.0021211160346865654),\n", + " ('iz', 0.002069818088784814),\n", + " ('vern', 0.002001357264816761),\n", + " ('ÅĤ', 0.001717406208626926)]" ] }, - "execution_count": 5, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -121,12 +121,11 @@ "from transformers import GPT2Tokenizer, GPT2LMHeadModel\n", "tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')\n", "model = GPT2LMHeadModel.from_pretrained('gpt2-large')\n", - "text = \"This issue depends\"\n", + "text = 'Warsaw is the capital city of'\n", "encoded_input = tokenizer(text, return_tensors='pt')\n", "output = model(**encoded_input)\n", "next_token_probs = torch.softmax(output[0][:, -1, :][0], dim=0)\n", "\n", - "next_token_probs\n", "nb_of_tokens = next_token_probs.size()[0]\n", "print(nb_of_tokens)\n", "\n", @@ -198,11 +197,28 @@ "execution_count": 1, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/filipg/.local/lib/python3.9/site-packages/transformers/models/auto/modeling_auto.py:806: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", + " warnings.warn(\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "# Out[3]:" + "W którym państwie leży Bombaj? W USA. (score: 0.16715531051158905)\n", + "W którym państwie leży Bombaj? W India. (score: 0.09912960231304169)\n", + "W którym państwie leży Bombaj? W Indian. (score: 0.039642028510570526)\n", + "W którym państwie leży Bombaj? W Nepal. (score: 0.027137665078043938)\n", + "W którym państwie leży Bombaj? W Pakistan. (score: 0.027065709233283997)\n", + "W którym państwie leży Bombaj? W Polsce. (score: 0.023737527430057526)\n", + "W którym państwie leży Bombaj? W .... (score: 0.02306722290813923)\n", + "W którym państwie leży Bombaj? W Bangladesh. (score: 0.022106658667325974)\n", + "W którym państwie leży Bombaj? W .... (score: 0.01628892682492733)\n", + "W którym państwie leży Bombaj? W Niemczech. (score: 0.014501162804663181)\n" ] } ], @@ -213,7 +229,7 @@ "tokenizer = AutoTokenizer.from_pretrained(\"xlm-roberta-large\")\n", "model = AutoModelWithLMHead.from_pretrained(\"xlm-roberta-large\")\n", "\n", - "sequence = f'II wojna światowa zakończyła się w {tokenizer.mask_token} roku.'\n", + "sequence = f'W którym państwie leży Bombaj? 
W {tokenizer.mask_token}.'\n", "\n", "input_ids = tokenizer.encode(sequence, return_tensors=\"pt\")\n", "mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]\n", @@ -262,9 +278,24 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['World War II ended in World War II.',\n", + " 'World War II ended in 1945..',\n", + " 'World War II ended in 1945.',\n", + " 'World War II ended in 1945.',\n", + " 'World War II ended in 1945.']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration\n", "\n", @@ -276,7 +307,7 @@ "\n", "slot = '<extra_id_0>'\n", "\n", - "text = f'Warsaw is the {slot} of Poland.'\n", + "text = f'World War II ended in {slot}.'\n", "\n", "encoded = t5_tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')\n", "input_ids = encoded['input_ids']\n", @@ -334,5 +365,5 @@ "org": null }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 }
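
Reviewer note: the patched GPT-2 cell is a direct instance of the formula this commit touches in 14_pretrenowanie.ipynb, $\vec{p} = \operatorname{softmax}(f(\vec{c}))$ — the contextual representation $\vec{c}$ is the hidden state after the last input token, and the softmax over $f(\vec{c})$ yields a distribution over the whole vocabulary. A minimal sketch of that computation, assuming the small 'gpt2' checkpoint for speed (the notebook itself loads 'gpt2-large') and torch.topk in place of the cell's full sorted list:

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Assumption: 'gpt2' stands in for the notebook's 'gpt2-large'.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

encoded_input = tokenizer('Warsaw is the capital city of', return_tensors='pt')
output = model(**encoded_input)

# Logits at the last position, turned into a distribution over all
# 50257 vocabulary items (the number the cell prints as nb_of_tokens).
next_token_probs = torch.softmax(output.logits[0, -1, :], dim=0)

top = torch.topk(next_token_probs, 10)
for prob, token_id in zip(top.values, top.indices):
    print(repr(tokenizer.decode([int(token_id)])), float(prob))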
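
The xlm-roberta hunk follows the same recipe, except that the softmax is taken at the position of the mask token rather than after the last token. A sketch of that variant, using AutoModelForMaskedLM — the replacement that the FutureWarning captured in the new cell output recommends over the deprecated AutoModelWithLMHead:

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-large")

sequence = f'W którym państwie leży Bombaj? W {tokenizer.mask_token}.'
input_ids = tokenizer.encode(sequence, return_tensors="pt")

# Index of the <mask> token; its contextual representation feeds the softmax.
mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]

logits = model(input_ids).logits
probs = torch.softmax(logits[0, mask_token_index[0], :], dim=0)

top = torch.topk(probs, 5)
for prob, token_id in zip(top.values, top.indices):
    candidate = tokenizer.decode([int(token_id)]).strip()
    print(sequence.replace(tokenizer.mask_token, candidate), f'(score: {float(prob)})')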
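
The T5 hunk fills a masked span rather than a single token: the <extra_id_0> sentinel marks the span, and generate() proposes completions for it. A hedged sketch, assuming the 't5-base' checkpoint (the line loading the model lies outside the hunks shown in this patch) and beam search with five returned sequences, matching the captured output:

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Assumption: 't5-base'; the notebook's own checkpoint is not visible in this patch.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

text = 'World War II ended in <extra_id_0>.'
input_ids = t5_tokenizer.encode(text, return_tensors='pt')

# Beam-search several candidate fillers for the sentinel span.
outputs = t5_model.generate(input_ids, num_beams=5, num_return_sequences=5, max_length=16)
for seq in outputs:
    print(t5_tokenizer.decode(seq, skip_special_tokens=True))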