diff --git a/wyk/01_Wyszukiwarki-wprowadzenie.ipynb b/wyk/01_Wyszukiwarki-wprowadzenie.ipynb
index dba39ef..934186d 100644
--- a/wyk/01_Wyszukiwarki-wprowadzenie.ipynb
+++ b/wyk/01_Wyszukiwarki-wprowadzenie.ipynb
@@ -3,6 +3,22 @@
{
"cell_type": "markdown",
"metadata": {},
+ "source": [
+ "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
+ "<div class=\"alert alert-block alert-info\">\n",
+ "<h1> Ekstrakcja informacji </h1>\n",
+ "<h2> 1. <i>Wyszukiwarki - wprowadzenie</i> [wykład]</h2> \n",
+ "<h3> Filip Graliński (2021)</h3>\n",
+ "</div>\n",
+ "\n",
+ "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
"# Wyszukiwarki - wprowadzenie\n",
"\n",
@@ -13,7 +29,10 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true,
+ "tags": []
+ },
"source": [
"## Wyszukiwarki\n",
"\n",
@@ -1676,7 +1695,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -1690,7 +1709,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.1"
+ "version": "3.9.6"
}
},
"nbformat": 4,
diff --git a/wyk/14_pretrenowanie.ipynb b/wyk/14_pretrenowanie.ipynb
index ae7721a..b47815e 100644
--- a/wyk/14_pretrenowanie.ipynb
+++ b/wyk/14_pretrenowanie.ipynb
@@ -24,7 +24,7 @@
"Jest kilka sposobów na pretrenowanie modelu, w każdym razie sprowadza\n",
"się do odgadywania następnego bądź zamaskowanego słowa.\n",
"W każdym razie zawsze stosujemy softmax (być może ze „sztuczkami” takimi jak\n",
- "negatywne próbkowanie albo hierarchiczny softamx) na pewnej **representecji kontekstowej**:\n",
+ "negatywne próbkowanie albo hierarchiczny softmax) na pewnej **reprezentacji kontekstowej**:\n",
"\n",
"$$\\vec{p} = \\operatorname{softmax}(f(\\vec{c})).$$\n",
"\n",
@@ -66,7 +66,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -79,39 +79,39 @@
{
"data": {
"text/plain": [
- "[('Ġon', 0.6786560416221619),\n",
- " ('Ġupon', 0.04339785501360893),\n",
- " ('Ġheavily', 0.02208443358540535),\n",
- " ('Ġin', 0.021049050614237785),\n",
- " (',', 0.020188499242067337),\n",
- " ('Ġa', 0.01833895780146122),\n",
- " ('Ġvery', 0.017935041338205338),\n",
- " ('Ġentirely', 0.017528969794511795),\n",
- " ('Ġlargely', 0.016769640147686005),\n",
- " ('Ġto', 0.01009418722242117),\n",
- " ('Ġgreatly', 0.010009866207838058),\n",
- " ('Ġnot', 0.009016563184559345),\n",
- " ('Ġmore', 0.005853226874023676),\n",
- " ('Ġprimarily', 0.005203146021813154),\n",
- " ('Ġstrongly', 0.0034501152113080025),\n",
- " ('Ġpartly', 0.0033184229396283627),\n",
- " ('Ġmuch', 0.0033095215912908316),\n",
- " ('Ġmostly', 0.0032150144688785076),\n",
- " ('Ġmainly', 0.0030899408739060163),\n",
- " ('Ġfor', 0.003034428460523486),\n",
- " ('.', 0.0028878094162791967),\n",
- " ('Ġboth', 0.0028405177872627974),\n",
- " ('Ġsomewhat', 0.0028194624464958906),\n",
- " ('Ġcru', 0.002263976726680994),\n",
- " ('Ġas', 0.00221616611815989),\n",
- " ('Ġof', 0.0022000609897077084),\n",
- " ('Ġalmost', 0.001968063646927476),\n",
- " ('Ġat', 0.0018015997484326363),\n",
- " ('Ġhighly', 0.0017461496172472835),\n",
- " ('Ġcompletely', 0.001692073536105454)]"
+ "[('Âł', 0.6182783842086792),\n",
+ " ('È', 0.1154019758105278),\n",
+ " ('Ñģ', 0.026960616931319237),\n",
+ " ('_____', 0.024418892338871956),\n",
+ " ('________', 0.014962316490709782),\n",
+ " ('ÃĤ', 0.010653386823832989),\n",
+ " ('ä¸Ń', 0.008340531960129738),\n",
+ " ('Ñ', 0.007557711564004421),\n",
+ " ('Ê', 0.007046067621558905),\n",
+ " ('ãĢ', 0.006875576451420784),\n",
+ " ('ile', 0.006685272324830294),\n",
+ " ('____', 0.006307446397840977),\n",
+ " ('âĢĭ', 0.006306538358330727),\n",
+ " ('ÑĢ', 0.006197483278810978),\n",
+ " ('ĠBelarus', 0.006108700763434172),\n",
+ " ('Æ', 0.005720408633351326),\n",
+ " ('ĠPoland', 0.0053678699769079685),\n",
+ " ('á¹', 0.004606408067047596),\n",
+ " ('îĢ', 0.004161055199801922),\n",
+ " ('????', 0.004056799225509167),\n",
+ " ('_______', 0.0038176667876541615),\n",
+ " ('ä¸', 0.0036082742735743523),\n",
+ " ('Ì', 0.003221835708245635),\n",
+ " ('urs', 0.003080119378864765),\n",
+ " ('________________', 0.0027312245219945908),\n",
+ " ('ĠLithuania', 0.0023860156070441008),\n",
+ " ('ich', 0.0021211160346865654),\n",
+ " ('iz', 0.002069818088784814),\n",
+ " ('vern', 0.002001357264816761),\n",
+ " ('ÅĤ', 0.001717406208626926)]"
]
},
- "execution_count": 5,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -121,12 +121,11 @@
"from transformers import GPT2Tokenizer, GPT2LMHeadModel\n",
"tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')\n",
"model = GPT2LMHeadModel.from_pretrained('gpt2-large')\n",
- "text = \"This issue depends\"\n",
+ "text = 'Warsaw is the capital city of'\n",
"encoded_input = tokenizer(text, return_tensors='pt')\n",
"output = model(**encoded_input)\n",
"next_token_probs = torch.softmax(output[0][:, -1, :][0], dim=0)\n",
"\n",
- "next_token_probs\n",
"nb_of_tokens = next_token_probs.size()[0]\n",
"print(nb_of_tokens)\n",
"\n",
@@ -198,11 +197,28 @@
"execution_count": 1,
"metadata": {},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/filipg/.local/lib/python3.9/site-packages/transformers/models/auto/modeling_auto.py:806: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n",
+ " warnings.warn(\n"
+ ]
+ },
{
"name": "stdout",
"output_type": "stream",
"text": [
- "# Out[3]:"
+ "W którym państwie leży Bombaj? W USA. (score: 0.16715531051158905)\n",
+ "W którym państwie leży Bombaj? W India. (score: 0.09912960231304169)\n",
+ "W którym państwie leży Bombaj? W Indian. (score: 0.039642028510570526)\n",
+ "W którym państwie leży Bombaj? W Nepal. (score: 0.027137665078043938)\n",
+ "W którym państwie leży Bombaj? W Pakistan. (score: 0.027065709233283997)\n",
+ "W którym państwie leży Bombaj? W Polsce. (score: 0.023737527430057526)\n",
+ "W którym państwie leży Bombaj? W .... (score: 0.02306722290813923)\n",
+ "W którym państwie leży Bombaj? W Bangladesh. (score: 0.022106658667325974)\n",
+ "W którym państwie leży Bombaj? W .... (score: 0.01628892682492733)\n",
+ "W którym państwie leży Bombaj? W Niemczech. (score: 0.014501162804663181)\n"
]
}
],
@@ -213,7 +229,7 @@
"tokenizer = AutoTokenizer.from_pretrained(\"xlm-roberta-large\")\n",
"model = AutoModelWithLMHead.from_pretrained(\"xlm-roberta-large\")\n",
"\n",
- "sequence = f'II wojna światowa zakończyła się w {tokenizer.mask_token} roku.'\n",
+ "sequence = f'W którym państwie leży Bombaj? W {tokenizer.mask_token}.'\n",
"\n",
"input_ids = tokenizer.encode(sequence, return_tensors=\"pt\")\n",
"mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]\n",
@@ -262,9 +278,24 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 2,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['World War II ended in World War II.',\n",
+ " 'World War II ended in 1945..',\n",
+ " 'World War II ended in 1945.',\n",
+ " 'World War II ended in 1945.',\n",
+ " 'World War II ended in 1945.']"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration\n",
"\n",
@@ -276,7 +307,7 @@
"\n",
"slot = '<extra_id_0>'\n",
"\n",
- "text = f'Warsaw is the {slot} of Poland.'\n",
+ "text = f'World War II ended in {slot}.'\n",
"\n",
"encoded = t5_tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')\n",
"input_ids = encoded['input_ids']\n",
@@ -334,5 +365,5 @@
"org": null
},
"nbformat": 4,
- "nbformat_minor": 1
+ "nbformat_minor": 4
}