Filip Gralinski 2021-09-27 07:36:37 +02:00
parent a45fd570e5
commit 72c6fbcbf6
2 changed files with 94 additions and 44 deletions

File 1 of 2

@@ -3,6 +3,22 @@
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
"<div class=\"alert alert-block alert-info\">\n",
"<h1>Ekstrakcja informacji</h1>\n",
"<h2>1. <i>Wyszukiwarki - wprowadzenie</i> [wykład]</h2> \n",
"<h3>Filip Graliński (2021)</h3>\n",
"</div>\n",
"\n",
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Wyszukiwarki - wprowadzenie\n",
"\n",
@@ -13,7 +29,10 @@
},
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"jp-MarkdownHeadingCollapsed": true,
"tags": []
},
"source": [
"## Wyszukiwarki\n",
"\n",
@@ -1676,7 +1695,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -1690,7 +1709,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.9.6"
}
},
"nbformat": 4,

File 2 of 2

@@ -24,7 +24,7 @@
"Jest kilka sposobów na pretrenowanie modelu, w każdym razie sprowadza\n",
"się do odgadywania następnego bądź zamaskowanego słowa.\n",
"W każdym razie zawsze stosujemy softmax (być może ze „sztuczkami” takimi jak\n",
"negatywne próbkowanie albo hierarchiczny softamx) na pewnej **representecji kontekstowej**:\n",
"negatywne próbkowanie albo hierarchiczny softamx) na pewnej **reprezentacji kontekstowej**:\n",
"\n",
"$$\\vec{p} = \\operatorname{softmax}(f(\\vec{c})).$$\n",
"\n",
@@ -66,7 +66,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -79,39 +79,39 @@
{
"data": {
"text/plain": [
"[('Ġon', 0.6786560416221619),\n",
" ('Ġupon', 0.04339785501360893),\n",
" ('Ġheavily', 0.02208443358540535),\n",
" ('Ġin', 0.021049050614237785),\n",
" (',', 0.020188499242067337),\n",
" ('Ġa', 0.01833895780146122),\n",
" ('Ġvery', 0.017935041338205338),\n",
" ('Ġentirely', 0.017528969794511795),\n",
" ('Ġlargely', 0.016769640147686005),\n",
" ('Ġto', 0.01009418722242117),\n",
" ('Ġgreatly', 0.010009866207838058),\n",
" ('Ġnot', 0.009016563184559345),\n",
" ('Ġmore', 0.005853226874023676),\n",
" ('Ġprimarily', 0.005203146021813154),\n",
" ('Ġstrongly', 0.0034501152113080025),\n",
" ('Ġpartly', 0.0033184229396283627),\n",
" ('Ġmuch', 0.0033095215912908316),\n",
" ('Ġmostly', 0.0032150144688785076),\n",
" ('Ġmainly', 0.0030899408739060163),\n",
" ('Ġfor', 0.003034428460523486),\n",
" ('.', 0.0028878094162791967),\n",
" ('Ġboth', 0.0028405177872627974),\n",
" ('Ġsomewhat', 0.0028194624464958906),\n",
" ('Ġcru', 0.002263976726680994),\n",
" ('Ġas', 0.00221616611815989),\n",
" ('Ġof', 0.0022000609897077084),\n",
" ('Ġalmost', 0.001968063646927476),\n",
" ('Ġat', 0.0018015997484326363),\n",
" ('Ġhighly', 0.0017461496172472835),\n",
" ('Ġcompletely', 0.001692073536105454)]"
"[('Âł', 0.6182783842086792),\n",
" ('È', 0.1154019758105278),\n",
" ('Ñģ', 0.026960616931319237),\n",
" ('_____', 0.024418892338871956),\n",
" ('________', 0.014962316490709782),\n",
" ('ÃĤ', 0.010653386823832989),\n",
" ('ä¸Ń', 0.008340531960129738),\n",
" ('Ñ', 0.007557711564004421),\n",
" ('Ê', 0.007046067621558905),\n",
" ('ãĢ', 0.006875576451420784),\n",
" ('ile', 0.006685272324830294),\n",
" ('____', 0.006307446397840977),\n",
" ('âĢĭ', 0.006306538358330727),\n",
" ('ÑĢ', 0.006197483278810978),\n",
" ('ĠBelarus', 0.006108700763434172),\n",
" ('Æ', 0.005720408633351326),\n",
" ('ĠPoland', 0.0053678699769079685),\n",
" ('á¹', 0.004606408067047596),\n",
" ('îĢ', 0.004161055199801922),\n",
" ('????', 0.004056799225509167),\n",
" ('_______', 0.0038176667876541615),\n",
" ('ä¸', 0.0036082742735743523),\n",
" ('Ì', 0.003221835708245635),\n",
" ('urs', 0.003080119378864765),\n",
" ('________________', 0.0027312245219945908),\n",
" ('ĠLithuania', 0.0023860156070441008),\n",
" ('ich', 0.0021211160346865654),\n",
" ('iz', 0.002069818088784814),\n",
" ('vern', 0.002001357264816761),\n",
" ('ÅĤ', 0.001717406208626926)]"
]
},
"execution_count": 5,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -121,12 +121,11 @@
"from transformers import GPT2Tokenizer, GPT2LMHeadModel\n",
"tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')\n",
"model = GPT2LMHeadModel.from_pretrained('gpt2-large')\n",
"text = \"This issue depends\"\n",
"text = 'Warsaw is the capital city of'\n",
"encoded_input = tokenizer(text, return_tensors='pt')\n",
"output = model(**encoded_input)\n",
"next_token_probs = torch.softmax(output[0][:, -1, :][0], dim=0)\n",
"\n",
"next_token_probs\n",
"nb_of_tokens = next_token_probs.size()[0]\n",
"print(nb_of_tokens)\n",
"\n",
@@ -198,11 +197,28 @@
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/filipg/.local/lib/python3.9/site-packages/transformers/models/auto/modeling_auto.py:806: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"# Out[3]:"
"W którym państwie leży Bombaj? W USA. (score: 0.16715531051158905)\n",
"W którym państwie leży Bombaj? W India. (score: 0.09912960231304169)\n",
"W którym państwie leży Bombaj? W Indian. (score: 0.039642028510570526)\n",
"W którym państwie leży Bombaj? W Nepal. (score: 0.027137665078043938)\n",
"W którym państwie leży Bombaj? W Pakistan. (score: 0.027065709233283997)\n",
"W którym państwie leży Bombaj? W Polsce. (score: 0.023737527430057526)\n",
"W którym państwie leży Bombaj? W .... (score: 0.02306722290813923)\n",
"W którym państwie leży Bombaj? W Bangladesh. (score: 0.022106658667325974)\n",
"W którym państwie leży Bombaj? W .... (score: 0.01628892682492733)\n",
"W którym państwie leży Bombaj? W Niemczech. (score: 0.014501162804663181)\n"
]
}
],
@@ -213,7 +229,7 @@
"tokenizer = AutoTokenizer.from_pretrained(\"xlm-roberta-large\")\n",
"model = AutoModelWithLMHead.from_pretrained(\"xlm-roberta-large\")\n",
"\n",
"sequence = f'II wojna światowa zakończyła się w {tokenizer.mask_token} roku.'\n",
"sequence = f'W którym państwie leży Bombaj? W {tokenizer.mask_token}.'\n",
"\n",
"input_ids = tokenizer.encode(sequence, return_tensors=\"pt\")\n",
"mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]\n",
@@ -262,9 +278,24 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"['World War II ended in World War II.',\n",
" 'World War II ended in 1945..',\n",
" 'World War II ended in 1945.',\n",
" 'World War II ended in 1945.',\n",
" 'World War II ended in 1945.']"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration\n",
"\n",
@@ -276,7 +307,7 @@
"\n",
"slot = '<extra_id_0>'\n",
"\n",
"text = f'Warsaw is the {slot} of Poland.'\n",
"text = f'World War II ended in {slot}.'\n",
"\n",
"encoded = t5_tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')\n",
"input_ids = encoded['input_ids']\n",
@@ -334,5 +365,5 @@
"org": null
},
"nbformat": 4,
"nbformat_minor": 1
"nbformat_minor": 4
}