From 85d14a1c1044e0c43669966ef2066f2f1d5299c0 Mon Sep 17 00:00:00 2001 From: Jakub Pokrywka Date: Tue, 5 Jul 2022 11:24:56 +0200 Subject: [PATCH] update --- cw/00_Informacje_na_temat_przedmiotu.ipynb | 7 - cw/01_Kodowanie_tekstu.ipynb | 9 +- cw/02_Jezyk.ipynb | 7547 ---------------- cw/02_Język.ipynb | 7652 +++++++++++++++++ cw/03_statystyczny_model_językowy.ipynb | 176 - cw/04_statystyczny_model_językowy.ipynb | 246 +- ...tystyczny_model_językowy_część_2.ipynb | 272 + ... 06_wygładzanie_modeli_językowych.ipynb} | 2 +- ...teki_STM.ipynb => 07_biblioteki_STM.ipynb} | 2 +- ...nb => 08_neuronowe_modele_językowe.ipynb} | 2 +- ...=> 09_Model_neuronowy_typu_word2vec.ipynb} | 6 +- ... => 10_Model_neuronowy_rekurencyjny.ipynb} | 34 +- cw/11_Model_rekurencyjny_z_atencją.ipynb | 517 -- cw/11_regularyzacja_modeli_neuronowych.ipynb | 126 + ...ncyjny2.ipynb => 12_Ensemble_modeli.ipynb} | 6 +- ...el_neuronowy_rekurencyjny_część_2.ipynb | 59 + cw/14_Model_rekurencyjny_z_atencją.ipynb | 955 ++ ...15_Model_transformer_autoregresywny.ipynb} | 6 +- 18 files changed, 9185 insertions(+), 8439 deletions(-) delete mode 100644 cw/02_Jezyk.ipynb create mode 100644 cw/02_Język.ipynb delete mode 100644 cw/03_statystyczny_model_językowy.ipynb create mode 100644 cw/05_statystyczny_model_językowy_część_2.ipynb rename cw/{05_wygładzanie_modeli_językowych.ipynb => 06_wygładzanie_modeli_językowych.ipynb} (99%) rename cw/{06_biblioteki_STM.ipynb => 07_biblioteki_STM.ipynb} (99%) rename cw/{07_neuronowe_modele_językowe.ipynb => 08_neuronowe_modele_językowe.ipynb} (98%) rename cw/{08_Model_neuronowy_typu_word2vec.ipynb => 09_Model_neuronowy_typu_word2vec.ipynb} (98%) rename cw/{09_Model_neuronowy_rekurencyjny.ipynb => 10_Model_neuronowy_rekurencyjny.ipynb} (98%) delete mode 100644 cw/11_Model_rekurencyjny_z_atencją.ipynb create mode 100644 cw/11_regularyzacja_modeli_neuronowych.ipynb rename cw/{10_Ensemble_oraz_Model_neuronowy_rekurencyjny2.ipynb => 12_Ensemble_modeli.ipynb} (98%) create mode 100644 
cw/13_Model_neuronowy_rekurencyjny_część_2.ipynb create mode 100644 cw/14_Model_rekurencyjny_z_atencją.ipynb rename cw/{12_Model_transformer_autoregresywny.ipynb => 15_Model_transformer_autoregresywny.ipynb} (99%) diff --git a/cw/00_Informacje_na_temat_przedmiotu.ipynb b/cw/00_Informacje_na_temat_przedmiotu.ipynb index 7ab3bc1..172a69d 100644 --- a/cw/00_Informacje_na_temat_przedmiotu.ipynb +++ b/cw/00_Informacje_na_temat_przedmiotu.ipynb @@ -70,13 +70,6 @@ "\n", "**Żeby zaliczyć przedmiot należy pojawiać się na laboratoriach. Maksymalna liczba nieobecności to 3. Obecność będę sprawdzał co zajęcia. Jeżeli kogoś nie będzie więcej niż 3 razy, to nie będzie miał zaliczonego przedmiotu** \n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/cw/01_Kodowanie_tekstu.ipynb b/cw/01_Kodowanie_tekstu.ipynb index 033c435..a4456e1 100644 --- a/cw/01_Kodowanie_tekstu.ipynb +++ b/cw/01_Kodowanie_tekstu.ipynb @@ -7,7 +7,7 @@ "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", "
\n", "

Ekstrakcja informacji

\n", - "

0. Kodowanie tekstu [ćwiczenia]

\n", + "

1. Kodowanie tekstu [ćwiczenia]

\n", "

Jakub Pokrywka (2022)

\n", "
\n", "\n", @@ -733,13 +733,6 @@ "- następnie wygeneruj z notebooka PDF (File → Download As → PDF via Latex).\n", "- notebook z kodem oraz PDF zamieść w zakładce zadań w MS TEAMS" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/cw/02_Jezyk.ipynb b/cw/02_Jezyk.ipynb deleted file mode 100644 index ffdad79..0000000 --- a/cw/02_Jezyk.ipynb +++ /dev/null @@ -1,7547 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", - "
\n", - "

Ekstrakcja informacji

\n", - "

0. Jezyk [ćwiczenia]

\n", - "

Jakub Pokrywka (2022)

\n", - "
\n", - "\n", - "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" - ] - }, - { - "cell_type": "code", - "execution_count": 278, - "metadata": {}, - "outputs": [], - "source": [ - "NR_INDEKSU = 375985" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": {}, - "outputs": [], - "source": [ - "import random\n", - "import plotly.express as px\n", - "import numpy as np\n", - "import pandas as pd\n", - "import nltk" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "program : program\n", - "programs : program\n", - "programmer : programm\n", - "programming : program\n", - "programmers : programm\n" - ] - } - ], - "source": [ - "ps = nltk.stem.PorterStemmer()\n", - "\n", - "for w in [\"program\", \"programs\", \"programmer\", \"programming\", \"programmers\"]:\n", - " print(w, \" : \", ps.stem(w))" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[nltk_data] Downloading package punkt to /home/kuba/nltk_data...\n", - "[nltk_data] Package punkt is already up-to-date!\n", - "[nltk_data] Downloading package stopwords to /home/kuba/nltk_data...\n", - "[nltk_data] Unzipping corpora/stopwords.zip.\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 77, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "nltk.download('punkt')\n", - "nltk.download('stopwords')" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Python',\n", - " 'is',\n", - " 'dynamically-typed',\n", - " 'and',\n", - " 'garbage-collected',\n", - " '.',\n", - " 'It',\n", - " 'supports',\n", - " 'multiple',\n", - " 'programming',\n", - " 'paradigms',\n", - " ',',\n", - " 'including',\n", - " 
'structured',\n", - " '(',\n", - " 'particularly',\n", - " ',',\n", - " 'procedural',\n", - " ')',\n", - " ',',\n", - " 'object-oriented',\n", - " 'and',\n", - " 'functional',\n", - " 'programming',\n", - " '.',\n", - " 'It',\n", - " 'is',\n", - " 'often',\n", - " 'described',\n", - " 'as',\n", - " 'a',\n", - " '``',\n", - " 'batteries',\n", - " 'included',\n", - " \"''\",\n", - " 'language',\n", - " 'due',\n", - " 'to',\n", - " 'its',\n", - " 'comprehensive',\n", - " 'standard',\n", - " 'library',\n", - " '.']" - ] - }, - "execution_count": 78, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "text = \"\"\"Python is dynamically-typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly, procedural), object-oriented and functional programming. It is often described as a \"batteries included\" language due to its comprehensive standard library.\"\"\"\n", - "nltk.tokenize.word_tokenize(text)" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Python is dynamically-typed and garbage-collected.',\n", - " 'It supports multiple programming paradigms, including structured (particularly, procedural), object-oriented and functional programming.',\n", - " 'It is often described as a \"batteries included\" language due to its comprehensive standard library.']" - ] - }, - "execution_count": 79, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "nltk.tokenize.sent_tokenize(text)" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['aber',\n", - " 'alle',\n", - " 'allem',\n", - " 'allen',\n", - " 'aller',\n", - " 'alles',\n", - " 'als',\n", - " 'also',\n", - " 'am',\n", - " 'an',\n", - " 'ander',\n", - " 'andere',\n", - " 'anderem',\n", - " 'anderen',\n", - " 'anderer',\n", - " 'anderes',\n", - " 'anderm',\n", - " 
'andern',\n", - " 'anderr',\n", - " 'anders',\n", - " 'auch',\n", - " 'auf',\n", - " 'aus',\n", - " 'bei',\n", - " 'bin',\n", - " 'bis',\n", - " 'bist',\n", - " 'da',\n", - " 'damit',\n", - " 'dann',\n", - " 'der',\n", - " 'den',\n", - " 'des',\n", - " 'dem',\n", - " 'die',\n", - " 'das',\n", - " 'dass',\n", - " 'daß',\n", - " 'derselbe',\n", - " 'derselben',\n", - " 'denselben',\n", - " 'desselben',\n", - " 'demselben',\n", - " 'dieselbe',\n", - " 'dieselben',\n", - " 'dasselbe',\n", - " 'dazu',\n", - " 'dein',\n", - " 'deine',\n", - " 'deinem',\n", - " 'deinen',\n", - " 'deiner',\n", - " 'deines',\n", - " 'denn',\n", - " 'derer',\n", - " 'dessen',\n", - " 'dich',\n", - " 'dir',\n", - " 'du',\n", - " 'dies',\n", - " 'diese',\n", - " 'diesem',\n", - " 'diesen',\n", - " 'dieser',\n", - " 'dieses',\n", - " 'doch',\n", - " 'dort',\n", - " 'durch',\n", - " 'ein',\n", - " 'eine',\n", - " 'einem',\n", - " 'einen',\n", - " 'einer',\n", - " 'eines',\n", - " 'einig',\n", - " 'einige',\n", - " 'einigem',\n", - " 'einigen',\n", - " 'einiger',\n", - " 'einiges',\n", - " 'einmal',\n", - " 'er',\n", - " 'ihn',\n", - " 'ihm',\n", - " 'es',\n", - " 'etwas',\n", - " 'euer',\n", - " 'eure',\n", - " 'eurem',\n", - " 'euren',\n", - " 'eurer',\n", - " 'eures',\n", - " 'für',\n", - " 'gegen',\n", - " 'gewesen',\n", - " 'hab',\n", - " 'habe',\n", - " 'haben',\n", - " 'hat',\n", - " 'hatte',\n", - " 'hatten',\n", - " 'hier',\n", - " 'hin',\n", - " 'hinter',\n", - " 'ich',\n", - " 'mich',\n", - " 'mir',\n", - " 'ihr',\n", - " 'ihre',\n", - " 'ihrem',\n", - " 'ihren',\n", - " 'ihrer',\n", - " 'ihres',\n", - " 'euch',\n", - " 'im',\n", - " 'in',\n", - " 'indem',\n", - " 'ins',\n", - " 'ist',\n", - " 'jede',\n", - " 'jedem',\n", - " 'jeden',\n", - " 'jeder',\n", - " 'jedes',\n", - " 'jene',\n", - " 'jenem',\n", - " 'jenen',\n", - " 'jener',\n", - " 'jenes',\n", - " 'jetzt',\n", - " 'kann',\n", - " 'kein',\n", - " 'keine',\n", - " 'keinem',\n", - " 'keinen',\n", - " 'keiner',\n", - " 
'keines',\n", - " 'können',\n", - " 'könnte',\n", - " 'machen',\n", - " 'man',\n", - " 'manche',\n", - " 'manchem',\n", - " 'manchen',\n", - " 'mancher',\n", - " 'manches',\n", - " 'mein',\n", - " 'meine',\n", - " 'meinem',\n", - " 'meinen',\n", - " 'meiner',\n", - " 'meines',\n", - " 'mit',\n", - " 'muss',\n", - " 'musste',\n", - " 'nach',\n", - " 'nicht',\n", - " 'nichts',\n", - " 'noch',\n", - " 'nun',\n", - " 'nur',\n", - " 'ob',\n", - " 'oder',\n", - " 'ohne',\n", - " 'sehr',\n", - " 'sein',\n", - " 'seine',\n", - " 'seinem',\n", - " 'seinen',\n", - " 'seiner',\n", - " 'seines',\n", - " 'selbst',\n", - " 'sich',\n", - " 'sie',\n", - " 'ihnen',\n", - " 'sind',\n", - " 'so',\n", - " 'solche',\n", - " 'solchem',\n", - " 'solchen',\n", - " 'solcher',\n", - " 'solches',\n", - " 'soll',\n", - " 'sollte',\n", - " 'sondern',\n", - " 'sonst',\n", - " 'über',\n", - " 'um',\n", - " 'und',\n", - " 'uns',\n", - " 'unsere',\n", - " 'unserem',\n", - " 'unseren',\n", - " 'unser',\n", - " 'unseres',\n", - " 'unter',\n", - " 'viel',\n", - " 'vom',\n", - " 'von',\n", - " 'vor',\n", - " 'während',\n", - " 'war',\n", - " 'waren',\n", - " 'warst',\n", - " 'was',\n", - " 'weg',\n", - " 'weil',\n", - " 'weiter',\n", - " 'welche',\n", - " 'welchem',\n", - " 'welchen',\n", - " 'welcher',\n", - " 'welches',\n", - " 'wenn',\n", - " 'werde',\n", - " 'werden',\n", - " 'wie',\n", - " 'wieder',\n", - " 'will',\n", - " 'wir',\n", - " 'wird',\n", - " 'wirst',\n", - " 'wo',\n", - " 'wollen',\n", - " 'wollte',\n", - " 'würde',\n", - " 'würden',\n", - " 'zu',\n", - " 'zum',\n", - " 'zur',\n", - " 'zwar',\n", - " 'zwischen']" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "nltk.corpus.stopwords.words('german')" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[('Python', 'is'), ('is', 'dynamically-typed'), 
('dynamically-typed', 'and'), ('and', 'garbage-collected'), ('garbage-collected', '.'), ('.', 'It'), ('It', 'supports'), ('supports', 'multiple'), ('multiple', 'programming'), ('programming', 'paradigms'), ('paradigms', ','), (',', 'including'), ('including', 'structured'), ('structured', '('), ('(', 'particularly'), ('particularly', ','), (',', 'procedural'), ('procedural', ')'), (')', ','), (',', 'object-oriented'), ('object-oriented', 'and'), ('and', 'functional'), ('functional', 'programming'), ('programming', '.'), ('.', 'It'), ('It', 'is'), ('is', 'often'), ('often', 'described'), ('described', 'as'), ('as', 'a'), ('a', '``'), ('``', 'batteries'), ('batteries', 'included'), ('included', \"''\"), (\"''\", 'language'), ('language', 'due'), ('due', 'to'), ('to', 'its'), ('its', 'comprehensive'), ('comprehensive', 'standard'), ('standard', 'library'), ('library', '.')]\n" - ] - } - ], - "source": [ - "nltk_tokens = nltk.word_tokenize(text)\n", - "print(list(nltk.bigrams(nltk_tokens)))" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "alignmentgroup": "True", - "hovertemplate": "słowo=%{x}
liczba=%{y}", - "legendgroup": "", - "marker": { - "color": "#636efa", - "pattern": { - "shape": "" - } - }, - "name": "", - "offsetgroup": "", - "orientation": "v", - "showlegend": false, - "textposition": "auto", - "type": "bar", - "x": [ - "ma", - "ala", - "psa", - "kota" - ], - "xaxis": "x", - "y": [ - 20, - 15, - 10, - 10 - ], - "yaxis": "y" - } - ], - "layout": { - "barmode": "relative", - "legend": { - "tracegroupgap": 0 - }, - "margin": { - "t": 60 - }, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - 
"outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - 
"#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ 
- 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], 
- [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - 
"standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - } - } - }, - "xaxis": { - "anchor": "y", - "domain": [ - 0, - 1 - ], - "title": { - "text": "słowo" - } - }, - "yaxis": { - "anchor": "x", - "domain": [ - 0, - 1 - ], - "title": { - "text": "liczba" - } - } - } - }, - "text/html": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df = pd.DataFrame([['ma', 20], ['ala', 15], ['psa', 10], ['kota', 10]], columns=['słowo', 'liczba'])\n", - "fig = px.bar(df, x=\"słowo\", y=\"liczba\")\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "alignmentgroup": "True", - "bingroup": "x", - "hovertemplate": "jezyk=polski
dlugosc=%{x}
count=%{y}", - "legendgroup": "", - "marker": { - "color": "#636efa", - "pattern": { - "shape": "" - } - }, - "name": "", - "nbinsx": 50, - "offsetgroup": "", - "orientation": "v", - "showlegend": false, - "type": "histogram", - "x": [ - 8, - 4, - 3, - 8, - 7, - 2, - 6, - 7, - 7, - 6, - 3, - 9, - 11, - 9, - 7, - 2, - 10, - 6, - 1, - 2, - 1, - 1, - 2, - 8, - 5, - 2, - 1, - 3, - 1, - 2, - 8, - 1, - 6, - 3, - 4, - 11, - 1, - 2, - 4, - 1, - 6, - 4, - 5, - 4, - 6, - 2, - 5, - 4, - 5, - 6, - 14, - 2, - 3, - 6, - 2, - 6, - 3, - 8, - 11, - 2, - 6, - 4, - 17, - 6, - 5, - 1, - 6, - 2, - 6, - 2, - 7, - 3, - 6, - 2, - 2, - 3, - 11, - 3, - 1, - 1, - 8, - 12, - 4, - 8, - 3, - 5, - 3, - 5, - 5, - 1, - 8, - 2, - 15, - 1, - 3, - 1, - 6, - 1, - 7, - 5, - 1, - 7, - 11, - 2, - 4, - 1, - 1, - 1, - 3, - 3, - 3, - 1, - 2, - 1, - 2, - 1, - 13, - 3, - 2, - 1, - 1, - 6, - 2, - 7, - 3, - 4, - 7, - 5, - 12, - 3, - 6, - 5, - 2, - 1, - 2, - 4, - 1, - 1, - 4, - 1, - 1, - 1, - 5, - 3, - 3, - 9, - 9, - 1, - 3, - 1, - 4, - 5, - 9, - 2, - 1, - 1, - 2, - 1, - 3, - 1, - 1, - 4, - 2, - 1, - 2, - 8, - 1, - 2, - 6, - 1, - 5, - 1, - 6, - 2, - 3, - 15, - 4, - 11, - 2, - 14, - 3, - 2, - 10, - 1, - 13, - 5, - 3, - 9, - 8, - 13, - 7, - 12, - 1, - 5, - 6, - 1, - 1, - 10, - 9, - 1, - 6, - 16, - 5, - 2, - 2, - 8, - 2, - 8, - 1, - 1, - 7, - 1, - 2, - 1, - 18, - 10, - 5, - 3, - 8, - 14, - 5, - 1, - 5, - 10, - 27, - 1, - 4, - 3, - 1, - 11, - 4, - 7, - 7, - 1, - 1, - 1, - 10, - 3, - 3, - 2, - 2, - 1, - 7, - 3, - 3, - 1, - 6, - 11, - 5, - 6, - 7, - 2, - 1, - 2, - 4, - 8, - 9, - 2, - 1, - 3, - 6, - 8, - 6, - 11, - 2, - 3, - 16, - 2, - 2, - 1, - 22, - 4, - 4, - 12, - 8, - 3, - 5, - 3, - 2, - 9, - 4, - 5, - 5, - 1, - 2, - 1, - 1, - 9, - 6, - 1, - 5, - 2, - 7, - 8, - 16, - 17, - 6, - 2, - 3, - 3, - 7, - 4, - 5, - 5, - 2, - 1, - 4, - 4, - 9, - 1, - 2, - 5, - 3, - 1, - 9, - 2, - 1, - 3, - 2, - 3, - 15, - 1, - 2, - 13, - 7, - 2, - 2, - 8, - 7, - 4, - 1, - 3, - 7, - 4, - 6, - 2, - 6, - 7, - 4, - 4, - 1, - 4, - 1, - 1, - 2, - 
1, - 6, - 13, - 6, - 11, - 14, - 31, - 6, - 5, - 7, - 10, - 3, - 12, - 2, - 1, - 3, - 1, - 9, - 12, - 8, - 1, - 13, - 6, - 2, - 4, - 2, - 8, - 3, - 8, - 10, - 6, - 32, - 4, - 1, - 5, - 1, - 5, - 5, - 6, - 1, - 1, - 5, - 8, - 8, - 3, - 1, - 10, - 2, - 4, - 8, - 15, - 7, - 11, - 1, - 2, - 1, - 7, - 5, - 1, - 5, - 3, - 2, - 2, - 11, - 2, - 17, - 2, - 5, - 14, - 16, - 8, - 9, - 9, - 3, - 1, - 4, - 2, - 2, - 6, - 4, - 3, - 3, - 3, - 3, - 7, - 8, - 2, - 4, - 7, - 3, - 1, - 3, - 5, - 4, - 4, - 1, - 4, - 4, - 3, - 5, - 2, - 2, - 1, - 3, - 2, - 2, - 22, - 9, - 3, - 1, - 8, - 2, - 10, - 6, - 6, - 2, - 12, - 5, - 4, - 1, - 11, - 16, - 11, - 2, - 1, - 3, - 8, - 5, - 2, - 6, - 6, - 5, - 1, - 3, - 4, - 3, - 8, - 7, - 3, - 7, - 3, - 6, - 1, - 1, - 2, - 2, - 12, - 8, - 2, - 2, - 7, - 3, - 1, - 9, - 1, - 1, - 1, - 7, - 1, - 4, - 2, - 2, - 1, - 1, - 6, - 7, - 7, - 10, - 5, - 4, - 6, - 1, - 1, - 9, - 3, - 3, - 4, - 5, - 4, - 3, - 1, - 2, - 9, - 2, - 7, - 2, - 10, - 7, - 1, - 2, - 2, - 4, - 2, - 13, - 8, - 5, - 15, - 3, - 10, - 3, - 7, - 3, - 3, - 2, - 7, - 1, - 30, - 1, - 5, - 3, - 2, - 8, - 1, - 7, - 7, - 3, - 7, - 1, - 9, - 7, - 14, - 1, - 5, - 2, - 3, - 1, - 2, - 11, - 4, - 7, - 5, - 1, - 2, - 1, - 5, - 7, - 4, - 3, - 1, - 3, - 1, - 5, - 2, - 4, - 2, - 1, - 7, - 4, - 1, - 10, - 5, - 3, - 1, - 1, - 1, - 10, - 6, - 2, - 3, - 3, - 5, - 2, - 1, - 6, - 2, - 5, - 3, - 2, - 1, - 15, - 7, - 10, - 14, - 3, - 1, - 5, - 2, - 1, - 4, - 1, - 2, - 3, - 4, - 6, - 12, - 2, - 4, - 1, - 9, - 2, - 8, - 7, - 3, - 18, - 10, - 3, - 6, - 8, - 1, - 8, - 1, - 5, - 6, - 1, - 1, - 5, - 5, - 14, - 1, - 4, - 2, - 1, - 2, - 1, - 7, - 3, - 1, - 2, - 5, - 3, - 11, - 3, - 4, - 2, - 7, - 29, - 3, - 3, - 1, - 10, - 3, - 4, - 2, - 3, - 12, - 2, - 1, - 2, - 1, - 14, - 2, - 5, - 1, - 2, - 13, - 3, - 4, - 1, - 2, - 3, - 2, - 3, - 1, - 2, - 1, - 2, - 8, - 12, - 5, - 5, - 2, - 6, - 9, - 1, - 11, - 1, - 8, - 1, - 2, - 2, - 2, - 8, - 9, - 3, - 1, - 4, - 3, - 1, - 4, - 10, - 1, - 4, - 6, - 3, - 1, - 13, - 18, - 4, - 1, - 1, 
- 2, - 8, - 3, - 1, - 1, - 5, - 6, - 22, - 13, - 7, - 1, - 12, - 3, - 2, - 1, - 5, - 3, - 6, - 6, - 2, - 2, - 15, - 16, - 3, - 3, - 13, - 1, - 4, - 5, - 1, - 2, - 2, - 23, - 10, - 5, - 9, - 2, - 1, - 2, - 5, - 2, - 2, - 2, - 7, - 3, - 3, - 1, - 3, - 1, - 5, - 1, - 1, - 4, - 1, - 2, - 1, - 6, - 14, - 2, - 6, - 6, - 6, - 1, - 2, - 10, - 7, - 7, - 2, - 5, - 1, - 2, - 5, - 10, - 2, - 1, - 1, - 7, - 2, - 1, - 14, - 3, - 2, - 18, - 3, - 5, - 1, - 6, - 3, - 4, - 3, - 6, - 7, - 5, - 6, - 11, - 7, - 6, - 1, - 2, - 1, - 1, - 4, - 3, - 2, - 4, - 8, - 5, - 4, - 4, - 3, - 2, - 4, - 5, - 4, - 1, - 2, - 3, - 3, - 17, - 4, - 8, - 4, - 7, - 4, - 2, - 20, - 1, - 1, - 7, - 7, - 3, - 1, - 16, - 4, - 1, - 4, - 5, - 13, - 4, - 5, - 3, - 2, - 6, - 2, - 1, - 16, - 1, - 1, - 5, - 2, - 7, - 6, - 4, - 1, - 8, - 6, - 1, - 7, - 5, - 3, - 6, - 4, - 1, - 1, - 7, - 2, - 3, - 10, - 7, - 6, - 3, - 7, - 2, - 3, - 2, - 1, - 4, - 1, - 2, - 19, - 1, - 10, - 1, - 1, - 2, - 1, - 8, - 8, - 4, - 5, - 1, - 1, - 8, - 8, - 4, - 7, - 1, - 3, - 7, - 4, - 1, - 1, - 4, - 8, - 2, - 4, - 5, - 2, - 2, - 4, - 5, - 6, - 5, - 5, - 2, - 6, - 6, - 1, - 5, - 3, - 12, - 6, - 5, - 4, - 22, - 11, - 1, - 3, - 4, - 1, - 9, - 11, - 5, - 1, - 1, - 2, - 9, - 8, - 4, - 26, - 2, - 3, - 5, - 3, - 1, - 5, - 7, - 1, - 7, - 3, - 1, - 2, - 2, - 5, - 2, - 1, - 3, - 10, - 12, - 4, - 2, - 4, - 2, - 4, - 2, - 1, - 3, - 8, - 23, - 3, - 3, - 2, - 1, - 4, - 18, - 5, - 5, - 12, - 1, - 23, - 7, - 7, - 1, - 9, - 4, - 8, - 4, - 2, - 3, - 4, - 9, - 9, - 5, - 2, - 4, - 5, - 5, - 5, - 1, - 7, - 8, - 2, - 1, - 5, - 2, - 5, - 2, - 1, - 1, - 13, - 1, - 9, - 4, - 1, - 2, - 3, - 3, - 1, - 10, - 1, - 4, - 3, - 2, - 9, - 2, - 8, - 8, - 5, - 7, - 7, - 3, - 5, - 3, - 2, - 1, - 11, - 2, - 3, - 3, - 6, - 2, - 2, - 5, - 2, - 1, - 19, - 1, - 6, - 6, - 5, - 14, - 8, - 1, - 2, - 18, - 7, - 2, - 9, - 3, - 2, - 13, - 8, - 4, - 6, - 17, - 9, - 7, - 2, - 2, - 1, - 10, - 5, - 5, - 11, - 5, - 10, - 6, - 1, - 1, - 2, - 3, - 5, - 13, - 7, - 11, - 1, - 12, - 1, - 2, - 1, - 
2, - 1, - 6, - 2, - 7, - 3, - 5, - 3, - 1, - 2, - 2, - 3, - 8, - 1, - 15, - 5, - 2, - 2, - 2, - 1, - 1, - 2, - 4, - 11, - 3, - 2, - 2, - 4, - 18, - 3, - 14, - 1, - 1, - 2, - 7, - 1, - 7, - 1, - 4, - 1, - 1, - 6, - 3, - 3, - 4, - 1, - 12, - 7, - 5, - 16, - 2, - 7, - 7, - 4, - 6, - 3, - 6, - 2, - 3, - 8, - 10, - 3, - 1, - 3, - 4, - 8, - 2, - 7, - 1, - 1, - 1, - 4, - 1, - 9, - 2, - 1, - 2, - 2, - 3, - 2, - 6, - 1, - 1, - 5, - 2, - 3, - 5, - 6, - 5, - 7, - 3, - 4, - 8, - 3, - 1, - 2, - 5, - 6, - 9, - 17, - 9, - 8, - 2, - 1, - 7, - 1, - 4, - 1, - 10, - 3, - 6, - 5, - 2, - 11, - 7, - 11, - 4, - 3, - 5, - 3, - 10, - 4, - 3, - 5, - 2, - 2, - 1, - 6, - 2, - 8, - 4, - 1, - 3, - 2, - 1, - 1, - 1, - 5, - 10, - 2, - 4, - 7, - 5, - 17, - 10, - 1, - 1, - 2, - 4, - 9, - 5, - 6, - 4, - 4, - 6, - 7, - 8, - 1, - 4, - 10, - 11, - 4, - 1, - 2, - 1, - 2, - 4, - 5, - 2, - 2, - 1, - 8, - 5, - 1, - 28, - 2, - 3, - 1, - 3, - 3, - 3, - 11, - 15, - 2, - 1, - 20, - 2, - 7, - 5, - 10, - 3, - 4, - 2, - 3, - 2, - 1, - 8, - 10, - 8, - 2, - 6, - 3, - 16, - 2, - 5, - 4, - 12, - 3, - 1, - 4, - 2, - 1, - 2, - 2, - 1, - 5, - 5, - 3, - 7, - 2, - 1, - 3, - 2, - 2, - 15, - 2, - 2, - 1, - 1, - 4, - 9, - 6, - 9, - 6, - 1, - 7, - 4, - 4, - 8, - 12, - 6, - 1, - 1, - 10, - 7, - 9, - 9, - 1, - 2, - 3, - 1, - 7, - 6, - 1, - 4, - 5, - 3, - 1, - 1, - 2, - 4, - 3, - 6, - 4, - 7, - 2, - 6, - 5, - 2, - 10, - 3, - 8, - 7, - 6, - 1, - 1, - 9, - 3, - 3, - 4, - 13, - 7, - 17, - 3, - 9, - 8, - 10, - 1, - 4, - 4, - 5, - 1, - 1, - 3, - 3, - 10, - 7, - 3, - 2, - 5, - 8, - 6, - 2, - 4, - 13, - 9, - 11, - 2, - 7, - 7, - 1, - 3, - 8, - 3, - 3, - 8, - 3, - 1, - 12, - 3, - 3, - 3, - 7, - 1, - 9, - 3, - 8, - 1, - 2, - 6, - 5, - 6, - 4, - 6, - 6, - 10, - 4, - 3, - 2, - 1, - 4, - 1, - 8, - 4, - 4, - 4, - 5, - 1, - 1, - 3, - 5, - 4, - 10, - 6, - 2, - 5, - 11, - 13, - 6, - 5, - 2, - 5, - 6, - 9, - 4, - 7, - 1, - 2, - 5, - 7, - 4, - 8, - 1, - 20, - 12, - 2, - 5, - 11, - 2, - 1, - 3, - 10, - 5, - 7, - 3, - 4, - 7, - 1, - 1, - 2, - 2, - 
3, - 3, - 9, - 2, - 5, - 4, - 3, - 5, - 1, - 9, - 2, - 5, - 1, - 4, - 14, - 5, - 9, - 5, - 3, - 1, - 5, - 3, - 3, - 3, - 2, - 2, - 2, - 11, - 5, - 5, - 5, - 7, - 30, - 7, - 3, - 2, - 1, - 5, - 2, - 3, - 12, - 11, - 17, - 3, - 4, - 11, - 2, - 1, - 4, - 1, - 14, - 5, - 7, - 9, - 10, - 10, - 3, - 6, - 13, - 2, - 6, - 7, - 9, - 2, - 3, - 2, - 1, - 4, - 9, - 1, - 15, - 1, - 6, - 4, - 16, - 2, - 4, - 3, - 2, - 2, - 13, - 5, - 5, - 5, - 2, - 4, - 10, - 3, - 14, - 1, - 1, - 1, - 7, - 5, - 21, - 2, - 1, - 5, - 7, - 3, - 5, - 1, - 1, - 4, - 2, - 5, - 2, - 10, - 2, - 13, - 1, - 3, - 15, - 13, - 8, - 4, - 3, - 6, - 5, - 4, - 2, - 4, - 3, - 3, - 2, - 4, - 10, - 5, - 5, - 1, - 3, - 2, - 2, - 5, - 8, - 6, - 6, - 2, - 13, - 4, - 13, - 19, - 3, - 2, - 3, - 2, - 2, - 8, - 1, - 2, - 18, - 1, - 3, - 1, - 1, - 11, - 9, - 4, - 12, - 1, - 1, - 1, - 6, - 5, - 7, - 5, - 2, - 4, - 2, - 2, - 4, - 4, - 3, - 22, - 1, - 12, - 3, - 3, - 1, - 3, - 4, - 1, - 2, - 4, - 1, - 7, - 10, - 2, - 4, - 5, - 1, - 4, - 3, - 7, - 4, - 3, - 7, - 5, - 1, - 1, - 5, - 14, - 2, - 3 - ], - "xaxis": "x3", - "yaxis": "y3" - }, - { - "alignmentgroup": "True", - "bingroup": "x", - "hovertemplate": "jezyk=hiszp
dlugosc=%{x}
count=%{y}", - "legendgroup": "", - "marker": { - "color": "#636efa", - "pattern": { - "shape": "" - } - }, - "name": "", - "nbinsx": 50, - "offsetgroup": "", - "orientation": "v", - "showlegend": false, - "type": "histogram", - "x": [ - 13, - 2, - 4, - 3, - 11, - 1, - 1, - 8, - 10, - 11, - 2, - 1, - 6, - 3, - 1, - 3, - 3, - 1, - 5, - 1, - 3, - 1, - 4, - 4, - 2, - 8, - 4, - 5, - 1, - 1, - 2, - 2, - 10, - 2, - 17, - 8, - 5, - 2, - 17, - 5, - 1, - 2, - 1, - 8, - 4, - 2, - 2, - 15, - 1, - 6, - 3, - 4, - 9, - 4, - 1, - 2, - 4, - 7, - 12, - 3, - 9, - 6, - 1, - 4, - 1, - 3, - 1, - 6, - 3, - 3, - 9, - 10, - 2, - 14, - 3, - 19, - 8, - 4, - 6, - 7, - 6, - 1, - 2, - 14, - 2, - 3, - 9, - 2, - 2, - 12, - 12, - 11, - 5, - 5, - 3, - 3, - 1, - 4, - 6, - 4, - 5, - 3, - 2, - 1, - 2, - 6, - 12, - 10, - 16, - 4, - 15, - 7, - 6, - 17, - 15, - 2, - 9, - 2, - 3, - 3, - 2, - 3, - 2, - 4, - 14, - 2, - 8, - 13, - 35, - 2, - 1, - 1, - 6, - 2, - 8, - 3, - 3, - 17, - 11, - 3, - 1, - 1, - 3, - 2, - 6, - 2, - 3, - 4, - 7, - 4, - 3, - 7, - 1, - 2, - 4, - 2, - 3, - 4, - 8, - 1, - 4, - 7, - 6, - 2, - 2, - 15, - 1, - 7, - 1, - 4, - 3, - 13, - 6, - 4, - 4, - 20, - 3, - 2, - 4, - 5, - 8, - 7, - 7, - 9, - 4, - 1, - 13, - 2, - 6, - 6, - 1, - 1, - 3, - 14, - 5, - 5, - 3, - 9, - 4, - 1, - 1, - 2, - 3, - 9, - 2, - 2, - 9, - 5, - 2, - 2, - 4, - 2, - 2, - 6, - 12, - 4, - 10, - 4, - 3, - 4, - 6, - 3, - 1, - 2, - 6, - 8, - 1, - 1, - 10, - 2, - 19, - 1, - 4, - 9, - 4, - 10, - 2, - 2, - 4, - 5, - 7, - 4, - 4, - 4, - 7, - 1, - 10, - 2, - 4, - 8, - 10, - 3, - 7, - 7, - 4, - 5, - 9, - 2, - 3, - 5, - 7, - 1, - 4, - 6, - 5, - 5, - 3, - 8, - 3, - 3, - 1, - 1, - 1, - 6, - 13, - 3, - 1, - 4, - 4, - 1, - 6, - 2, - 3, - 7, - 9, - 5, - 1, - 5, - 16, - 9, - 5, - 1, - 10, - 1, - 7, - 1, - 1, - 22, - 3, - 7, - 3, - 3, - 3, - 13, - 13, - 1, - 20, - 9, - 1, - 23, - 3, - 1, - 6, - 13, - 5, - 6, - 1, - 8, - 2, - 10, - 2, - 12, - 7, - 2, - 1, - 16, - 4, - 7, - 2, - 2, - 20, - 5, - 11, - 8, - 3, - 4, - 5, - 3, - 3, - 8, - 5, - 16, 
- 2, - 6, - 3, - 11, - 2, - 1, - 3, - 5, - 3, - 1, - 6, - 1, - 12, - 4, - 5, - 1, - 18, - 1, - 2, - 2, - 4, - 19, - 2, - 4, - 7, - 1, - 14, - 3, - 9, - 6, - 4, - 17, - 8, - 2, - 10, - 4, - 10, - 8, - 2, - 3, - 2, - 9, - 1, - 5, - 12, - 2, - 6, - 4, - 3, - 1, - 5, - 3, - 2, - 9, - 4, - 3, - 14, - 2, - 2, - 3, - 3, - 4, - 2, - 3, - 7, - 6, - 4, - 9, - 2, - 6, - 3, - 5, - 2, - 1, - 1, - 6, - 9, - 6, - 6, - 1, - 12, - 11, - 2, - 3, - 4, - 1, - 15, - 2, - 1, - 13, - 3, - 3, - 2, - 2, - 3, - 3, - 1, - 1, - 3, - 3, - 9, - 3, - 1, - 6, - 5, - 2, - 2, - 14, - 4, - 8, - 3, - 4, - 2, - 13, - 1, - 2, - 5, - 2, - 16, - 2, - 1, - 2, - 5, - 7, - 1, - 3, - 1, - 4, - 7, - 5, - 2, - 4, - 2, - 1, - 14, - 3, - 6, - 7, - 1, - 1, - 5, - 2, - 4, - 11, - 2, - 2, - 3, - 9, - 6, - 10, - 11, - 1, - 5, - 3, - 3, - 1, - 1, - 3, - 10, - 1, - 16, - 5, - 4, - 19, - 2, - 3, - 1, - 2, - 6, - 4, - 3, - 9, - 4, - 7, - 3, - 4, - 5, - 9, - 1, - 2, - 1, - 3, - 1, - 3, - 7, - 5, - 4, - 1, - 4, - 14, - 7, - 22, - 6, - 1, - 3, - 1, - 5, - 36, - 3, - 1, - 1, - 3, - 5, - 2, - 9, - 13, - 1, - 2, - 20, - 2, - 7, - 8, - 3, - 1, - 4, - 4, - 13, - 1, - 4, - 4, - 1, - 2, - 4, - 5, - 4, - 10, - 11, - 3, - 3, - 2, - 5, - 3, - 3, - 3, - 1, - 3, - 1, - 2, - 4, - 4, - 2, - 4, - 2, - 21, - 6, - 1, - 9, - 4, - 2, - 3, - 5, - 9, - 1, - 2, - 5, - 4, - 5, - 11, - 4, - 1, - 4, - 4, - 2, - 6, - 1, - 7, - 5, - 4, - 7, - 3, - 4, - 5, - 10, - 1, - 5, - 1, - 2, - 3, - 9, - 8, - 7, - 8, - 1, - 1, - 3, - 3, - 7, - 7, - 3, - 2, - 1, - 1, - 11, - 11, - 16, - 3, - 2, - 10, - 8, - 3, - 2, - 25, - 3, - 10, - 9, - 9, - 4, - 7, - 2, - 3, - 3, - 6, - 1, - 1, - 4, - 6, - 1, - 3, - 1, - 11, - 4, - 1, - 2, - 4, - 3, - 4, - 9, - 1, - 10, - 4, - 4, - 5, - 3, - 6, - 4, - 5, - 1, - 2, - 8, - 4, - 5, - 4, - 3, - 5, - 8, - 1, - 7, - 1, - 6, - 3, - 4, - 19, - 1, - 3, - 9, - 11, - 6, - 5, - 5, - 2, - 3, - 18, - 3, - 6, - 8, - 3, - 7, - 1, - 13, - 3, - 5, - 2, - 4, - 8, - 6, - 4, - 1, - 6, - 1, - 14, - 2, - 2, - 1, - 1, - 4, - 1, - 5, - 2, - 2, - 2, - 
6, - 4, - 2, - 4, - 14, - 2, - 3, - 6, - 6, - 1, - 1, - 3, - 1, - 4, - 17, - 8, - 1, - 6, - 4, - 9, - 2, - 8, - 4, - 1, - 10, - 3, - 12, - 3, - 1, - 3, - 1, - 7, - 1, - 14, - 3, - 13, - 4, - 3, - 3, - 3, - 1, - 1, - 3, - 2, - 10, - 7, - 1, - 2, - 8, - 4, - 6, - 7, - 1, - 12, - 12, - 4, - 3, - 7, - 4, - 16, - 1, - 2, - 9, - 1, - 1, - 5, - 1, - 9, - 4, - 8, - 1, - 1, - 5, - 3, - 2, - 1, - 1, - 1, - 2, - 1, - 6, - 5, - 2, - 11, - 3, - 2, - 8, - 1, - 4, - 2, - 4, - 3, - 2, - 7, - 3, - 8, - 3, - 8, - 1, - 1, - 5, - 12, - 1, - 1, - 4, - 2, - 2, - 4, - 4, - 8, - 7, - 5, - 1, - 4, - 1, - 13, - 8, - 1, - 4, - 2, - 2, - 4, - 4, - 4, - 2, - 6, - 2, - 4, - 7, - 2, - 1, - 1, - 3, - 6, - 4, - 11, - 5, - 1, - 19, - 10, - 12, - 4, - 10, - 13, - 1, - 5, - 14, - 8, - 2, - 1, - 6, - 13, - 1, - 7, - 3, - 7, - 12, - 2, - 7, - 6, - 2, - 5, - 1, - 3, - 5, - 3, - 1, - 4, - 15, - 4, - 1, - 1, - 9, - 4, - 4, - 6, - 7, - 1, - 13, - 1, - 2, - 3, - 1, - 5, - 3, - 10, - 2, - 1, - 5, - 4, - 1, - 6, - 1, - 3, - 4, - 3, - 5, - 1, - 4, - 6, - 3, - 4, - 2, - 1, - 6, - 3, - 3, - 2, - 2, - 5, - 1, - 2, - 1, - 26, - 5, - 2, - 2, - 1, - 1, - 2, - 4, - 5, - 6, - 2, - 1, - 3, - 6, - 2, - 3, - 6, - 8, - 4, - 1, - 3, - 1, - 4, - 4, - 3, - 10, - 4, - 1, - 12, - 18, - 7, - 8, - 7, - 1, - 7, - 4, - 3, - 2, - 13, - 9, - 1, - 14, - 2, - 2, - 13, - 7, - 6, - 1, - 1, - 9, - 3, - 4, - 2, - 6, - 5, - 1, - 2, - 1, - 1, - 2, - 6, - 6, - 22, - 4, - 2, - 2, - 4, - 4, - 4, - 3, - 5, - 2, - 2, - 1, - 5, - 1, - 16, - 11, - 3, - 1, - 8, - 5, - 5, - 14, - 8, - 3, - 2, - 3, - 2, - 1, - 13, - 2, - 4, - 3, - 6, - 3, - 2, - 11, - 1, - 8, - 3, - 2, - 7, - 2, - 5, - 4, - 3, - 5, - 4, - 3, - 7, - 2, - 3, - 1, - 10, - 8, - 8, - 1, - 1, - 1, - 2, - 1, - 9, - 1, - 12, - 4, - 1, - 3, - 2, - 1, - 1, - 2, - 7, - 9, - 2, - 4, - 1, - 9, - 14, - 1, - 6, - 2, - 2, - 1, - 6, - 6, - 5, - 7, - 46, - 4, - 1, - 3, - 15, - 2, - 2, - 4, - 2, - 9, - 8, - 3, - 1, - 6, - 2, - 3, - 12, - 1, - 7, - 2, - 1, - 3, - 5, - 1, - 16, - 1, - 2, - 1, - 1, - 10, 
- 1, - 1, - 3, - 1, - 3, - 1, - 7, - 20, - 2, - 3, - 3, - 2, - 1, - 3, - 1, - 2, - 2, - 3, - 1, - 1, - 3, - 7, - 2, - 12, - 6, - 14, - 3, - 1, - 9, - 12, - 6, - 5, - 6, - 8, - 3, - 3, - 1, - 3, - 49, - 11, - 2, - 16, - 6, - 10, - 2, - 4, - 6, - 2, - 12, - 4, - 12, - 10, - 8, - 7, - 3, - 3, - 1, - 3, - 3, - 2, - 5, - 4, - 2, - 11, - 4, - 8, - 4, - 4, - 2, - 9, - 2, - 7, - 1, - 6, - 7, - 7, - 4, - 2, - 4, - 8, - 8, - 5, - 18, - 6, - 3, - 15, - 1, - 5, - 1, - 9, - 2, - 1, - 3, - 9, - 4, - 7, - 4, - 3, - 5, - 9, - 3, - 1, - 7, - 6, - 2, - 9, - 10, - 4, - 2, - 2, - 4, - 9, - 4, - 4, - 7, - 12, - 5, - 3, - 26, - 3, - 5, - 10, - 13, - 1, - 1, - 2, - 11, - 1, - 13, - 15, - 1, - 6, - 1, - 1, - 4, - 3, - 6, - 3, - 4, - 3, - 9, - 1, - 2, - 19, - 2, - 3, - 2, - 22, - 5, - 2, - 1, - 1, - 5, - 1, - 10, - 9, - 6, - 10, - 3, - 5, - 5, - 9, - 1, - 3, - 11, - 12, - 10, - 2, - 1, - 4, - 8, - 7, - 11, - 3, - 7, - 3, - 5, - 3, - 4, - 8, - 2, - 5, - 2, - 4, - 13, - 3, - 3, - 8, - 5, - 8, - 9, - 10, - 7, - 5, - 3, - 3, - 2, - 1, - 2, - 2, - 11, - 12, - 2, - 2, - 4, - 5, - 12, - 11, - 12, - 5, - 4, - 5, - 11, - 2, - 5, - 1, - 19, - 9, - 5, - 3, - 6, - 5, - 1, - 2, - 4, - 3, - 1, - 2, - 1, - 2, - 2, - 5, - 2, - 2, - 1, - 9, - 7, - 5, - 4, - 8, - 5, - 4, - 3, - 2, - 8, - 11, - 8, - 3, - 2, - 2, - 6, - 1, - 7, - 13, - 4, - 2, - 5, - 1, - 3, - 1, - 10, - 1, - 1, - 1, - 3, - 8, - 5, - 8, - 2, - 2, - 4, - 1, - 1, - 4, - 9, - 3, - 1, - 1, - 1, - 2, - 2, - 3, - 3, - 2, - 4, - 5, - 1, - 5, - 15, - 2, - 1, - 17, - 10, - 1, - 2, - 7, - 3, - 5, - 3, - 7, - 1, - 8, - 8, - 7, - 1, - 8, - 3, - 1, - 1, - 17, - 2, - 2, - 5, - 7, - 2, - 4, - 1, - 1, - 13, - 4, - 1, - 7, - 27, - 1, - 4, - 10, - 14, - 8, - 4, - 4, - 3, - 9, - 3, - 4, - 5, - 1, - 8, - 2, - 1, - 3, - 4, - 2, - 1, - 2, - 10, - 2, - 9, - 1, - 6, - 10, - 3, - 3, - 4, - 1, - 5, - 5, - 1, - 5, - 3, - 4, - 15, - 3, - 3, - 7, - 2, - 4, - 4, - 18, - 4, - 4, - 2, - 2, - 2, - 4, - 7, - 3, - 1, - 9, - 3, - 5, - 2, - 15, - 8, - 6, - 2, - 2, - 1, - 2, - 8, 
- 3, - 3, - 4, - 8, - 6, - 1, - 9, - 7, - 1, - 11, - 7, - 1, - 5, - 9, - 5, - 2, - 12, - 14, - 4, - 5, - 3, - 4, - 16, - 2, - 3, - 2, - 1, - 4, - 3, - 2, - 4, - 14, - 1, - 1, - 1, - 3, - 6, - 8, - 2, - 2, - 7, - 1, - 1, - 13, - 1, - 5, - 6, - 8, - 3, - 1, - 1, - 4, - 7, - 5, - 2, - 8, - 1, - 7, - 6, - 6, - 9, - 22, - 2, - 2, - 3, - 5, - 2, - 6, - 7, - 4, - 11, - 1, - 1, - 4, - 2, - 7, - 5, - 9, - 1, - 4, - 1, - 9, - 13, - 3, - 3, - 2, - 6, - 5, - 6, - 1, - 2, - 13, - 5, - 7, - 7, - 12, - 1, - 3, - 12, - 24, - 7, - 19, - 5, - 2, - 4, - 3, - 3, - 7, - 5, - 1, - 1, - 1, - 5, - 6, - 9, - 8, - 5, - 6, - 3, - 1, - 7, - 3, - 12, - 3, - 2, - 7, - 6, - 8 - ], - "xaxis": "x2", - "yaxis": "y2" - }, - { - "alignmentgroup": "True", - "bingroup": "x", - "hovertemplate": "jezyk=ang
dlugosc=%{x}
count=%{y}", - "legendgroup": "", - "marker": { - "color": "#636efa", - "pattern": { - "shape": "" - } - }, - "name": "", - "nbinsx": 50, - "offsetgroup": "", - "orientation": "v", - "showlegend": false, - "type": "histogram", - "x": [ - 7, - 2, - 12, - 1, - 12, - 1, - 2, - 3, - 2, - 18, - 2, - 4, - 3, - 8, - 3, - 2, - 8, - 4, - 2, - 9, - 4, - 9, - 16, - 8, - 4, - 2, - 1, - 2, - 3, - 2, - 3, - 1, - 4, - 1, - 7, - 5, - 2, - 3, - 1, - 14, - 2, - 1, - 2, - 9, - 6, - 1, - 3, - 1, - 1, - 2, - 7, - 3, - 4, - 4, - 7, - 3, - 2, - 3, - 25, - 3, - 6, - 8, - 11, - 6, - 3, - 1, - 9, - 5, - 4, - 4, - 4, - 3, - 7, - 4, - 8, - 2, - 2, - 1, - 7, - 1, - 6, - 4, - 4, - 3, - 3, - 5, - 2, - 8, - 7, - 3, - 6, - 6, - 5, - 1, - 2, - 1, - 4, - 7, - 1, - 3, - 2, - 1, - 2, - 1, - 4, - 2, - 9, - 1, - 3, - 3, - 16, - 1, - 4, - 1, - 4, - 4, - 3, - 6, - 2, - 5, - 1, - 10, - 2, - 3, - 4, - 5, - 1, - 6, - 3, - 9, - 6, - 8, - 6, - 5, - 5, - 7, - 2, - 8, - 5, - 2, - 3, - 9, - 8, - 1, - 11, - 2, - 2, - 8, - 8, - 4, - 3, - 7, - 1, - 4, - 1, - 3, - 4, - 19, - 2, - 1, - 10, - 4, - 3, - 6, - 8, - 3, - 9, - 7, - 1, - 1, - 9, - 9, - 3, - 1, - 1, - 1, - 1, - 6, - 1, - 3, - 7, - 7, - 1, - 7, - 4, - 1, - 2, - 2, - 2, - 13, - 6, - 1, - 4, - 3, - 3, - 1, - 6, - 6, - 4, - 8, - 5, - 1, - 14, - 2, - 1, - 19, - 1, - 2, - 3, - 6, - 2, - 2, - 1, - 3, - 4, - 4, - 8, - 14, - 2, - 4, - 2, - 6, - 2, - 2, - 1, - 4, - 2, - 6, - 15, - 3, - 1, - 6, - 19, - 17, - 9, - 1, - 9, - 2, - 3, - 3, - 4, - 4, - 1, - 2, - 1, - 19, - 2, - 25, - 2, - 9, - 2, - 8, - 5, - 6, - 7, - 7, - 5, - 10, - 5, - 1, - 11, - 1, - 1, - 4, - 5, - 3, - 4, - 1, - 9, - 3, - 1, - 2, - 1, - 3, - 9, - 11, - 1, - 1, - 11, - 15, - 5, - 7, - 4, - 6, - 11, - 5, - 4, - 9, - 5, - 1, - 13, - 1, - 6, - 17, - 3, - 4, - 2, - 1, - 4, - 2, - 8, - 13, - 5, - 6, - 6, - 3, - 9, - 2, - 3, - 4, - 3, - 2, - 1, - 1, - 2, - 2, - 12, - 2, - 2, - 1, - 18, - 1, - 1, - 6, - 11, - 2, - 1, - 1, - 1, - 1, - 43, - 5, - 2, - 6, - 2, - 4, - 6, - 2, - 7, - 5, - 3, - 6, - 4, - 5, - 5, - 5, 
- 6, - 17, - 3, - 11, - 2, - 3, - 6, - 5, - 1, - 2, - 26, - 1, - 8, - 7, - 7, - 4, - 4, - 4, - 1, - 3, - 2, - 2, - 1, - 1, - 7, - 1, - 1, - 5, - 5, - 11, - 12, - 2, - 7, - 6, - 1, - 7, - 6, - 7, - 8, - 6, - 4, - 17, - 2, - 2, - 1, - 7, - 5, - 3, - 7, - 7, - 2, - 2, - 8, - 1, - 7, - 12, - 4, - 15, - 1, - 6, - 6, - 4, - 5, - 3, - 12, - 8, - 13, - 3, - 16, - 19, - 11, - 8, - 4, - 5, - 2, - 5, - 18, - 4, - 2, - 19, - 6, - 6, - 3, - 5, - 2, - 1, - 5, - 4, - 2, - 1, - 5, - 3, - 7, - 3, - 2, - 4, - 5, - 3, - 8, - 1, - 14, - 2, - 14, - 6, - 6, - 1, - 3, - 5, - 2, - 2, - 8, - 1, - 3, - 1, - 3, - 1, - 10, - 1, - 1, - 11, - 2, - 3, - 8, - 1, - 13, - 1, - 5, - 6, - 8, - 6, - 11, - 6, - 13, - 9, - 1, - 1, - 3, - 2, - 1, - 10, - 1, - 2, - 3, - 2, - 7, - 5, - 1, - 6, - 2, - 3, - 2, - 7, - 2, - 2, - 1, - 2, - 3, - 2, - 13, - 1, - 13, - 2, - 1, - 9, - 9, - 11, - 16, - 5, - 2, - 2, - 7, - 7, - 1, - 2, - 1, - 1, - 1, - 5, - 1, - 11, - 3, - 5, - 2, - 2, - 3, - 9, - 4, - 11, - 1, - 2, - 9, - 8, - 1, - 7, - 1, - 5, - 3, - 7, - 6, - 2, - 1, - 5, - 3, - 1, - 2, - 7, - 12, - 3, - 4, - 4, - 3, - 3, - 1, - 4, - 7, - 18, - 1, - 2, - 1, - 13, - 2, - 2, - 1, - 1, - 1, - 6, - 2, - 15, - 8, - 3, - 1, - 1, - 3, - 4, - 9, - 2, - 3, - 9, - 1, - 1, - 2, - 5, - 14, - 6, - 5, - 8, - 3, - 2, - 4, - 5, - 2, - 4, - 1, - 2, - 1, - 1, - 6, - 3, - 1, - 4, - 5, - 9, - 2, - 9, - 1, - 1, - 2, - 5, - 5, - 4, - 15, - 7, - 4, - 6, - 2, - 9, - 1, - 2, - 1, - 2, - 2, - 3, - 4, - 6, - 1, - 8, - 4, - 22, - 1, - 8, - 2, - 2, - 6, - 6, - 5, - 8, - 11, - 3, - 3, - 8, - 4, - 7, - 3, - 1, - 1, - 1, - 6, - 6, - 7, - 2, - 2, - 5, - 1, - 15, - 2, - 3, - 11, - 2, - 7, - 2, - 12, - 7, - 3, - 1, - 1, - 2, - 3, - 3, - 7, - 2, - 5, - 2, - 5, - 1, - 5, - 10, - 1, - 2, - 2, - 2, - 2, - 3, - 13, - 3, - 10, - 9, - 5, - 4, - 1, - 2, - 1, - 6, - 2, - 1, - 9, - 15, - 1, - 8, - 1, - 10, - 2, - 5, - 15, - 3, - 2, - 1, - 4, - 4, - 1, - 2, - 1, - 18, - 5, - 15, - 3, - 11, - 12, - 8, - 2, - 1, - 3, - 1, - 4, - 1, - 9, - 5, - 1, - 9, - 8, - 4, 
- 14, - 4, - 4, - 1, - 5, - 3, - 2, - 1, - 3, - 13, - 6, - 2, - 4, - 3, - 13, - 2, - 11, - 2, - 1, - 2, - 12, - 4, - 1, - 5, - 6, - 2, - 4, - 2, - 1, - 4, - 2, - 5, - 8, - 2, - 4, - 1, - 1, - 11, - 2, - 3, - 12, - 1, - 4, - 1, - 8, - 1, - 7, - 2, - 2, - 2, - 5, - 3, - 1, - 3, - 11, - 1, - 1, - 1, - 1, - 2, - 16, - 1, - 3, - 2, - 1, - 11, - 1, - 6, - 9, - 6, - 7, - 13, - 7, - 3, - 8, - 2, - 7, - 3, - 6, - 10, - 6, - 2, - 3, - 1, - 5, - 8, - 2, - 4, - 2, - 1, - 2, - 1, - 5, - 4, - 7, - 1, - 1, - 1, - 4, - 1, - 2, - 5, - 4, - 6, - 4, - 3, - 3, - 1, - 1, - 3, - 3, - 21, - 9, - 6, - 1, - 1, - 4, - 1, - 14, - 6, - 3, - 1, - 4, - 1, - 1, - 16, - 12, - 1, - 9, - 7, - 1, - 1, - 3, - 1, - 2, - 6, - 18, - 11, - 17, - 1, - 5, - 3, - 15, - 3, - 3, - 14, - 5, - 2, - 2, - 5, - 4, - 1, - 8, - 8, - 6, - 2, - 1, - 7, - 4, - 1, - 8, - 6, - 10, - 9, - 1, - 7, - 1, - 1, - 6, - 1, - 16, - 1, - 4, - 2, - 2, - 2, - 1, - 2, - 3, - 1, - 8, - 3, - 3, - 12, - 4, - 1, - 5, - 6, - 4, - 16, - 9, - 3, - 8, - 9, - 7, - 9, - 5, - 2, - 2, - 5, - 3, - 16, - 8, - 1, - 2, - 2, - 2, - 1, - 2, - 17, - 5, - 2, - 2, - 2, - 3, - 2, - 4, - 6, - 11, - 13, - 1, - 6, - 1, - 1, - 5, - 1, - 2, - 2, - 2, - 4, - 7, - 10, - 3, - 8, - 6, - 2, - 8, - 4, - 2, - 10, - 1, - 5, - 2, - 6, - 14, - 17, - 19, - 1, - 1, - 8, - 3, - 9, - 3, - 8, - 1, - 3, - 1, - 4, - 1, - 7, - 5, - 19, - 3, - 2, - 4, - 17, - 1, - 10, - 10, - 1, - 1, - 6, - 5, - 6, - 4, - 1, - 6, - 1, - 2, - 2, - 6, - 5, - 10, - 12, - 3, - 11, - 1, - 6, - 19, - 4, - 6, - 3, - 1, - 2, - 4, - 2, - 2, - 4, - 2, - 3, - 3, - 1, - 15, - 22, - 8, - 1, - 2, - 1, - 1, - 16, - 1, - 3, - 2, - 1, - 17, - 3, - 1, - 9, - 2, - 5, - 9, - 8, - 2, - 8, - 6, - 2, - 8, - 2, - 3, - 4, - 1, - 4, - 12, - 3, - 3, - 1, - 2, - 2, - 5, - 26, - 8, - 2, - 3, - 7, - 12, - 6, - 9, - 6, - 8, - 1, - 4, - 2, - 3, - 4, - 4, - 5, - 4, - 4, - 5, - 4, - 1, - 5, - 13, - 8, - 5, - 2, - 9, - 5, - 13, - 5, - 2, - 9, - 1, - 1, - 3, - 8, - 7, - 1, - 9, - 15, - 8, - 4, - 14, - 1, - 16, - 1, - 12, - 2, - 4, 
- 1, - 1, - 3, - 25, - 3, - 1, - 1, - 4, - 1, - 8, - 1, - 4, - 3, - 2, - 1, - 4, - 3, - 2, - 3, - 7, - 2, - 1, - 5, - 2, - 5, - 5, - 8, - 21, - 2, - 2, - 5, - 1, - 2, - 1, - 5, - 10, - 4, - 13, - 7, - 8, - 4, - 2, - 6, - 7, - 1, - 1, - 4, - 16, - 18, - 11, - 3, - 4, - 13, - 3, - 17, - 12, - 7, - 4, - 1, - 14, - 5, - 4, - 11, - 7, - 2, - 6, - 1, - 4, - 1, - 1, - 5, - 1, - 1, - 6, - 5, - 4, - 2, - 14, - 8, - 5, - 1, - 9, - 1, - 1, - 4, - 4, - 6, - 2, - 3, - 5, - 2, - 4, - 3, - 1, - 2, - 1, - 12, - 1, - 1, - 11, - 2, - 3, - 20, - 2, - 18, - 1, - 1, - 4, - 2, - 9, - 3, - 4, - 4, - 3, - 2, - 2, - 1, - 4, - 1, - 4, - 4, - 2, - 1, - 2, - 2, - 3, - 3, - 1, - 1, - 10, - 3, - 3, - 2, - 16, - 3, - 2, - 2, - 3, - 1, - 2, - 25, - 6, - 5, - 16, - 7, - 1, - 2, - 2, - 5, - 3, - 7, - 16, - 10, - 2, - 5, - 7, - 2, - 3, - 9, - 6, - 1, - 1, - 1, - 9, - 2, - 3, - 6, - 5, - 3, - 7, - 4, - 5, - 6, - 2, - 4, - 1, - 3, - 3, - 6, - 9, - 2, - 1, - 1, - 3, - 4, - 9, - 6, - 1, - 1, - 2, - 2, - 4, - 1, - 3, - 4, - 3, - 4, - 10, - 7, - 6, - 9, - 4, - 1, - 1, - 4, - 1, - 11, - 4, - 7, - 1, - 6, - 5, - 6, - 3, - 2, - 2, - 2, - 3, - 17, - 1, - 4, - 2, - 4, - 4, - 1, - 4, - 1, - 1, - 3, - 20, - 12, - 2, - 2, - 1, - 3, - 5, - 3, - 2, - 3, - 5, - 3, - 2, - 5, - 4, - 11, - 4, - 2, - 2, - 1, - 9, - 5, - 3, - 2, - 3, - 2, - 1, - 23, - 1, - 4, - 1, - 20, - 3, - 3, - 7, - 14, - 10, - 5, - 1, - 1, - 3, - 4, - 3, - 7, - 7, - 4, - 1, - 25, - 1, - 3, - 1, - 11, - 6, - 1, - 4, - 4, - 1, - 5, - 6, - 10, - 4, - 2, - 1, - 6, - 10, - 1, - 2, - 21, - 1, - 13, - 3, - 8, - 3, - 2, - 3, - 1, - 11, - 4, - 12, - 2, - 2, - 4, - 8, - 5, - 2, - 4, - 2, - 2, - 1, - 2, - 4, - 1, - 5, - 6, - 5, - 7, - 1, - 3, - 10, - 3, - 1, - 3, - 1, - 12, - 6, - 1, - 1, - 8, - 1, - 3, - 4, - 2, - 2, - 6, - 2, - 1, - 2, - 3, - 5, - 15, - 4, - 3, - 3, - 3, - 2, - 8, - 20, - 10, - 10, - 1, - 11, - 4, - 1, - 20, - 6, - 20, - 10, - 4, - 5, - 5, - 3, - 3, - 15, - 10, - 4, - 1, - 8, - 6, - 5, - 1, - 8, - 11, - 3, - 2, - 13, - 2, - 6, - 11, - 4, - 
4, - 14, - 5, - 3, - 2, - 1, - 6, - 2, - 2, - 7, - 7, - 5, - 3, - 1, - 8, - 1, - 3, - 3, - 2, - 1, - 1, - 1, - 16, - 6, - 10, - 2, - 10, - 16, - 7, - 7, - 8, - 3, - 7, - 3, - 5, - 2, - 1, - 17, - 4, - 1, - 7, - 3, - 3, - 3, - 1, - 1, - 3, - 2, - 10, - 1, - 4, - 4, - 2, - 5, - 3, - 2, - 2, - 1, - 9, - 6, - 5, - 1, - 2, - 23, - 6, - 3, - 1, - 6, - 10, - 3, - 19, - 3, - 6, - 11, - 5, - 13, - 6, - 13, - 7, - 3, - 2, - 6, - 18, - 10, - 9, - 16, - 10, - 3, - 4, - 8, - 6, - 8, - 3, - 7, - 1, - 1, - 2, - 1, - 1, - 4, - 5, - 2, - 1, - 3, - 4, - 5, - 6, - 4, - 3, - 1, - 1, - 3, - 9, - 3, - 6, - 3, - 4, - 4, - 4, - 2, - 5, - 2, - 9, - 1, - 4, - 6, - 3 - ], - "xaxis": "x", - "yaxis": "y" - } - ], - "layout": { - "annotations": [ - { - "font": {}, - "showarrow": false, - "text": "jezyk=ang", - "textangle": 90, - "x": 0.98, - "xanchor": "left", - "xref": "paper", - "y": 0.15666666666666665, - "yanchor": "middle", - "yref": "paper" - }, - { - "font": {}, - "showarrow": false, - "text": "jezyk=hiszp", - "textangle": 90, - "x": 0.98, - "xanchor": "left", - "xref": "paper", - "y": 0.4999999999999999, - "yanchor": "middle", - "yref": "paper" - }, - { - "font": {}, - "showarrow": false, - "text": "jezyk=polski", - "textangle": 90, - "x": 0.98, - "xanchor": "left", - "xref": "paper", - "y": 0.8433333333333332, - "yanchor": "middle", - "yref": "paper" - } - ], - "barmode": "relative", - "legend": { - "tracegroupgap": 0 - }, - "margin": { - "t": 60 - }, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - 
"endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - 
"#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - 
"colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], 
- [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": 
"#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - } - } - }, - "xaxis": { - "anchor": "y", - "domain": [ - 0, - 0.98 - ], - "title": { - "text": "dlugosc" - } - }, - "xaxis2": { - "anchor": "y2", - "domain": [ - 0, - 0.98 - ], - "matches": "x", - "showticklabels": false - }, - "xaxis3": { - "anchor": "y3", - "domain": [ - 0, - 0.98 - ], - "matches": "x", - "showticklabels": false - }, - "yaxis": { - "anchor": "x", - "domain": [ - 0, - 0.3133333333333333 - ], - "title": { - "text": "count" - } - }, - "yaxis2": { - "anchor": "x2", - "domain": [ - 0.34333333333333327, - 0.6566666666666665 - ], - "matches": "y", - "title": { - "text": "count" - } - }, - "yaxis3": { - "anchor": "x3", - "domain": [ - 0.6866666666666665, - 0.9999999999999998 - ], - "matches": "y", - "title": { - "text": "count" - } - } - } - }, - "text/html": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df = pd.DataFrame([[random.choice(['ang','polski','hiszp']), np.random.geometric(0.2)] for i in range(5000) ], columns=['jezyk', 'dlugosc'])\n", - "fig = px.histogram(df, x=\"dlugosc\",facet_row='jezyk',nbins=50, hover_data=df.columns)\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [], - "source": [ - "?px.histogram" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ZADANIE 1 \n", - "\n", - "ZNAJDŹ PRZYKŁAD TEKSTÓW Z TEJ SAMEJ DOMENY 1_000_000 słów (20 punktów):\n", - "- język angielski \n", - "- język polski\n", - "- język z rodziny romańskich\n", - "\n", - "Narzędzia:\n", - "- nltk, plotly express\n", - "\n", - "\n", - "Dla każdego z języków:\n", - "- policz ilosć unikalnych lowercase słów (ze stemmingiem i bez)\n", - "- policz ilosć znaków\n", - "- policz ilosć unikalnych znaków\n", - "- policz ilosć zdań zdań\n", - "- policz ilosć unikalnych zdań\n", - "- podaj min, max, średnią oraz medianę ilości znaków w słowie \n", - "- podaj min, max, średnią oraz medianę ilości słów w zdaniu\n", - "- wygeneruj word cloud (normalnie i po usunięciu stopwordów)\n", - "- wypisz 20 najbardziej popularnych słów (normalnie i po usunięciu stopwordów) (lowercase)\n", - "- wypisz 20 najbardziej popularnych bigramów (normalnie i po usunięciu stopwordów)\n", - "- narysuj wykres częstotliwości słów (histogram) w taki sposób żeby był maksymalnie czytelny, wypróbuj skali logarytmicznej x, y, usuwanie słów poniżej limitu wystąpień itp. 
\n", - "- dla próbki 10000 zdań sprawdź jak często langdetect https://pypi.org/project/langdetect/ się myli i jakie języki odgaduje \n", - "- zilustruj prawo zipfa ( px.line z zaznaczonymi punktami)\n", - "- napisz wnioski (20-50 zdań)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### START ZADANIA" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### KONIEC ZADANIA" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ZADANIE\n", - "\n", - "Weź teksty w języku polskim (mają składać sie po 5 osobnych dokumentów każdy:\n", - "- tekst prawny\n", - "- tekst naukowy\n", - "- tekst z polskiego z powieści (np. wolne lektury)\n", - "- tekst z polskiego internetu (reddit, wykop, komentarze)\n", - "- transkrypcja tekstu mówionego\n", - "\n", - "\n", - "- zilustruj gunning_fog INDEX z https://pypi.org/project/textstat/ (oś y) i średnią długość zdania (oś x) na jednym wykresie narysuj dla wszystkich tekstów na jednym wykresie , domeny oznacz kolorami (px.scatter)\n", - "- zilustruj prawo Heap'a dla wszystkich tekstów na jednym wykresie, domeny oznacz kolorami (px.scatter)\n", - "- napisz wnioski (20-50 zdań)\n", - "\n", - "NAPISZ WNIOSKI\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### START ZADANIA" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### KONIEC ZADANIA" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## WYKONANIE ZADAŃ\n", - "Zgodnie z instrukcją 01_Kodowanie_tekstu.ipynb" - ] - } - ], - "metadata": { - "author": "Jakub Pokrywka", - "email": "kubapok@wmi.amu.edu.pl", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "lang": "pl", - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": 
"ipython3", - "version": "3.8.3" - }, - "subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]", - "title": "Ekstrakcja informacji", - "year": "2021" - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/cw/02_Język.ipynb b/cw/02_Język.ipynb new file mode 100644 index 0000000..1199148 --- /dev/null +++ b/cw/02_Język.ipynb @@ -0,0 +1,7652 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Modelowanie Języka

\n", + "

2. Język [ćwiczenia]

\n", + "

Jakub Pokrywka (2022)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import random\n", + "import plotly.express as px\n", + "import numpy as np\n", + "import pandas as pd\n", + "import nltk" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://github.com/sdadas/polish-nlp-resources" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "program : program\n", + "programs : program\n", + "programmer : programm\n", + "programming : program\n", + "programmers : programm\n" + ] + } + ], + "source": [ + "ps = nltk.stem.PorterStemmer()\n", + "\n", + "for w in [\"program\", \"programs\", \"programmer\", \"programming\", \"programmers\"]:\n", + " print(w, \" : \", ps.stem(w))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt to /home/kuba/nltk_data...\n", + "[nltk_data] Package punkt is already up-to-date!\n", + "[nltk_data] Downloading package stopwords to /home/kuba/nltk_data...\n", + "[nltk_data] Package stopwords is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nltk.download('punkt')\n", + "nltk.download('stopwords')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Python',\n", + " 'is',\n", + " 'dynamically-typed',\n", + " 'and',\n", + " 'garbage-collected',\n", + " '.',\n", + " 'It',\n", + " 'supports',\n", + " 'multiple',\n", + " 'programming',\n", + " 'paradigms',\n", + " ',',\n", + " 'including',\n", + " 
'structured',\n", + " '(',\n", + " 'particularly',\n", + " ',',\n", + " 'procedural',\n", + " ')',\n", + " ',',\n", + " 'object-oriented',\n", + " 'and',\n", + " 'functional',\n", + " 'programming',\n", + " '.',\n", + " 'It',\n", + " 'is',\n", + " 'often',\n", + " 'described',\n", + " 'as',\n", + " 'a',\n", + " '``',\n", + " 'batteries',\n", + " 'included',\n", + " \"''\",\n", + " 'language',\n", + " 'due',\n", + " 'to',\n", + " 'its',\n", + " 'comprehensive',\n", + " 'standard',\n", + " 'library',\n", + " '.']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text = \"\"\"Python is dynamically-typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly, procedural), object-oriented and functional programming. It is often described as a \"batteries included\" language due to its comprehensive standard library.\"\"\"\n", + "nltk.tokenize.word_tokenize(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Python is dynamically-typed and garbage-collected.',\n", + " 'It supports multiple programming paradigms, including structured (particularly, procedural), object-oriented and functional programming.',\n", + " 'It is often described as a \"batteries included\" language due to its comprehensive standard library.']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nltk.tokenize.sent_tokenize(text)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['aber',\n", + " 'alle',\n", + " 'allem',\n", + " 'allen',\n", + " 'aller',\n", + " 'alles',\n", + " 'als',\n", + " 'also',\n", + " 'am',\n", + " 'an',\n", + " 'ander',\n", + " 'andere',\n", + " 'anderem',\n", + " 'anderen',\n", + " 'anderer',\n", + " 'anderes',\n", + " 'anderm',\n", + " 
'andern',\n", + " 'anderr',\n", + " 'anders',\n", + " 'auch',\n", + " 'auf',\n", + " 'aus',\n", + " 'bei',\n", + " 'bin',\n", + " 'bis',\n", + " 'bist',\n", + " 'da',\n", + " 'damit',\n", + " 'dann',\n", + " 'der',\n", + " 'den',\n", + " 'des',\n", + " 'dem',\n", + " 'die',\n", + " 'das',\n", + " 'dass',\n", + " 'daß',\n", + " 'derselbe',\n", + " 'derselben',\n", + " 'denselben',\n", + " 'desselben',\n", + " 'demselben',\n", + " 'dieselbe',\n", + " 'dieselben',\n", + " 'dasselbe',\n", + " 'dazu',\n", + " 'dein',\n", + " 'deine',\n", + " 'deinem',\n", + " 'deinen',\n", + " 'deiner',\n", + " 'deines',\n", + " 'denn',\n", + " 'derer',\n", + " 'dessen',\n", + " 'dich',\n", + " 'dir',\n", + " 'du',\n", + " 'dies',\n", + " 'diese',\n", + " 'diesem',\n", + " 'diesen',\n", + " 'dieser',\n", + " 'dieses',\n", + " 'doch',\n", + " 'dort',\n", + " 'durch',\n", + " 'ein',\n", + " 'eine',\n", + " 'einem',\n", + " 'einen',\n", + " 'einer',\n", + " 'eines',\n", + " 'einig',\n", + " 'einige',\n", + " 'einigem',\n", + " 'einigen',\n", + " 'einiger',\n", + " 'einiges',\n", + " 'einmal',\n", + " 'er',\n", + " 'ihn',\n", + " 'ihm',\n", + " 'es',\n", + " 'etwas',\n", + " 'euer',\n", + " 'eure',\n", + " 'eurem',\n", + " 'euren',\n", + " 'eurer',\n", + " 'eures',\n", + " 'für',\n", + " 'gegen',\n", + " 'gewesen',\n", + " 'hab',\n", + " 'habe',\n", + " 'haben',\n", + " 'hat',\n", + " 'hatte',\n", + " 'hatten',\n", + " 'hier',\n", + " 'hin',\n", + " 'hinter',\n", + " 'ich',\n", + " 'mich',\n", + " 'mir',\n", + " 'ihr',\n", + " 'ihre',\n", + " 'ihrem',\n", + " 'ihren',\n", + " 'ihrer',\n", + " 'ihres',\n", + " 'euch',\n", + " 'im',\n", + " 'in',\n", + " 'indem',\n", + " 'ins',\n", + " 'ist',\n", + " 'jede',\n", + " 'jedem',\n", + " 'jeden',\n", + " 'jeder',\n", + " 'jedes',\n", + " 'jene',\n", + " 'jenem',\n", + " 'jenen',\n", + " 'jener',\n", + " 'jenes',\n", + " 'jetzt',\n", + " 'kann',\n", + " 'kein',\n", + " 'keine',\n", + " 'keinem',\n", + " 'keinen',\n", + " 'keiner',\n", + " 
'keines',\n", + " 'können',\n", + " 'könnte',\n", + " 'machen',\n", + " 'man',\n", + " 'manche',\n", + " 'manchem',\n", + " 'manchen',\n", + " 'mancher',\n", + " 'manches',\n", + " 'mein',\n", + " 'meine',\n", + " 'meinem',\n", + " 'meinen',\n", + " 'meiner',\n", + " 'meines',\n", + " 'mit',\n", + " 'muss',\n", + " 'musste',\n", + " 'nach',\n", + " 'nicht',\n", + " 'nichts',\n", + " 'noch',\n", + " 'nun',\n", + " 'nur',\n", + " 'ob',\n", + " 'oder',\n", + " 'ohne',\n", + " 'sehr',\n", + " 'sein',\n", + " 'seine',\n", + " 'seinem',\n", + " 'seinen',\n", + " 'seiner',\n", + " 'seines',\n", + " 'selbst',\n", + " 'sich',\n", + " 'sie',\n", + " 'ihnen',\n", + " 'sind',\n", + " 'so',\n", + " 'solche',\n", + " 'solchem',\n", + " 'solchen',\n", + " 'solcher',\n", + " 'solches',\n", + " 'soll',\n", + " 'sollte',\n", + " 'sondern',\n", + " 'sonst',\n", + " 'über',\n", + " 'um',\n", + " 'und',\n", + " 'uns',\n", + " 'unsere',\n", + " 'unserem',\n", + " 'unseren',\n", + " 'unser',\n", + " 'unseres',\n", + " 'unter',\n", + " 'viel',\n", + " 'vom',\n", + " 'von',\n", + " 'vor',\n", + " 'während',\n", + " 'war',\n", + " 'waren',\n", + " 'warst',\n", + " 'was',\n", + " 'weg',\n", + " 'weil',\n", + " 'weiter',\n", + " 'welche',\n", + " 'welchem',\n", + " 'welchen',\n", + " 'welcher',\n", + " 'welches',\n", + " 'wenn',\n", + " 'werde',\n", + " 'werden',\n", + " 'wie',\n", + " 'wieder',\n", + " 'will',\n", + " 'wir',\n", + " 'wird',\n", + " 'wirst',\n", + " 'wo',\n", + " 'wollen',\n", + " 'wollte',\n", + " 'würde',\n", + " 'würden',\n", + " 'zu',\n", + " 'zum',\n", + " 'zur',\n", + " 'zwar',\n", + " 'zwischen']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nltk.corpus.stopwords.words('german')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[('Python', 'is'), ('is', 'dynamically-typed'), ('dynamically-typed', 
'and'), ('and', 'garbage-collected'), ('garbage-collected', '.'), ('.', 'It'), ('It', 'supports'), ('supports', 'multiple'), ('multiple', 'programming'), ('programming', 'paradigms'), ('paradigms', ','), (',', 'including'), ('including', 'structured'), ('structured', '('), ('(', 'particularly'), ('particularly', ','), (',', 'procedural'), ('procedural', ')'), (')', ','), (',', 'object-oriented'), ('object-oriented', 'and'), ('and', 'functional'), ('functional', 'programming'), ('programming', '.'), ('.', 'It'), ('It', 'is'), ('is', 'often'), ('often', 'described'), ('described', 'as'), ('as', 'a'), ('a', '``'), ('``', 'batteries'), ('batteries', 'included'), ('included', \"''\"), (\"''\", 'language'), ('language', 'due'), ('due', 'to'), ('to', 'its'), ('its', 'comprehensive'), ('comprehensive', 'standard'), ('standard', 'library'), ('library', '.')]\n" + ] + } + ], + "source": [ + "nltk_tokens = nltk.word_tokenize(text)\n", + "print(list(nltk.bigrams(nltk_tokens)))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + " \n", + " " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "plotlyServerURL": "https://plot.ly" + }, + "data": [ + { + "alignmentgroup": "True", + "hovertemplate": "słowo=%{x}
liczba=%{y}", + "legendgroup": "", + "marker": { + "color": "#636efa", + "pattern": { + "shape": "" + } + }, + "name": "", + "offsetgroup": "", + "orientation": "v", + "showlegend": false, + "textposition": "auto", + "type": "bar", + "x": [ + "ma", + "ala", + "psa", + "kota" + ], + "xaxis": "x", + "y": [ + 20, + 15, + 10, + 10 + ], + "yaxis": "y" + } + ], + "layout": { + "barmode": "relative", + "legend": { + "tracegroupgap": 0 + }, + "margin": { + "t": 60 + }, + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + 
"outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + 
"#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ 
+ 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], 
+ [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + 
"standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "xaxis": { + "anchor": "y", + "domain": [ + 0, + 1 + ], + "title": { + "text": "słowo" + } + }, + "yaxis": { + "anchor": "x", + "domain": [ + 0, + 1 + ], + "title": { + "text": "liczba" + } + } + } + }, + "text/html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df = pd.DataFrame([['ma', 20], ['ala', 15], ['psa', 10], ['kota', 10]], columns=['słowo', 'liczba'])\n", + "fig = px.bar(df, x=\"słowo\", y=\"liczba\")\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "plotlyServerURL": "https://plot.ly" + }, + "data": [ + { + "alignmentgroup": "True", + "bingroup": "x", + "hovertemplate": "jezyk=polski
dlugosc=%{x}
count=%{y}", + "legendgroup": "", + "marker": { + "color": "#636efa", + "pattern": { + "shape": "" + } + }, + "name": "", + "nbinsx": 50, + "offsetgroup": "", + "orientation": "v", + "showlegend": false, + "type": "histogram", + "x": [ + 6, + 3, + 4, + 1, + 1, + 7, + 7, + 2, + 1, + 6, + 4, + 7, + 2, + 2, + 15, + 2, + 1, + 3, + 14, + 5, + 3, + 11, + 7, + 1, + 3, + 1, + 4, + 1, + 1, + 1, + 2, + 1, + 2, + 7, + 1, + 8, + 4, + 1, + 7, + 4, + 5, + 10, + 7, + 4, + 1, + 4, + 2, + 5, + 6, + 8, + 3, + 8, + 2, + 3, + 10, + 5, + 3, + 8, + 5, + 1, + 2, + 3, + 10, + 8, + 19, + 10, + 5, + 1, + 4, + 7, + 2, + 11, + 1, + 2, + 11, + 3, + 3, + 5, + 4, + 3, + 1, + 1, + 2, + 13, + 11, + 2, + 13, + 1, + 1, + 7, + 9, + 1, + 7, + 1, + 4, + 2, + 2, + 3, + 5, + 8, + 4, + 2, + 12, + 6, + 19, + 7, + 2, + 7, + 4, + 9, + 7, + 2, + 13, + 19, + 5, + 5, + 5, + 8, + 4, + 5, + 2, + 2, + 1, + 1, + 10, + 3, + 1, + 7, + 1, + 3, + 13, + 5, + 2, + 4, + 2, + 3, + 5, + 2, + 2, + 14, + 5, + 4, + 10, + 3, + 5, + 5, + 6, + 4, + 9, + 2, + 3, + 29, + 17, + 12, + 3, + 3, + 1, + 1, + 5, + 3, + 1, + 1, + 3, + 2, + 7, + 3, + 6, + 11, + 5, + 3, + 1, + 2, + 12, + 2, + 1, + 1, + 2, + 7, + 2, + 1, + 2, + 2, + 4, + 6, + 10, + 1, + 7, + 1, + 8, + 2, + 5, + 1, + 3, + 2, + 4, + 6, + 3, + 18, + 2, + 8, + 10, + 2, + 2, + 5, + 2, + 7, + 3, + 2, + 6, + 8, + 4, + 5, + 1, + 2, + 4, + 3, + 2, + 11, + 5, + 2, + 7, + 12, + 8, + 2, + 7, + 3, + 25, + 12, + 7, + 1, + 1, + 17, + 4, + 6, + 7, + 7, + 2, + 1, + 15, + 3, + 1, + 2, + 3, + 2, + 3, + 1, + 8, + 1, + 6, + 4, + 3, + 7, + 14, + 5, + 10, + 3, + 3, + 9, + 1, + 3, + 5, + 1, + 2, + 3, + 2, + 11, + 2, + 3, + 1, + 3, + 5, + 1, + 2, + 5, + 2, + 12, + 3, + 1, + 3, + 7, + 5, + 1, + 4, + 3, + 1, + 6, + 1, + 4, + 5, + 10, + 4, + 8, + 6, + 7, + 8, + 3, + 10, + 2, + 4, + 5, + 1, + 12, + 7, + 11, + 5, + 6, + 2, + 6, + 6, + 4, + 12, + 1, + 2, + 1, + 5, + 2, + 17, + 1, + 2, + 3, + 5, + 3, + 1, + 9, + 6, + 7, + 2, + 3, + 7, + 1, + 2, + 2, + 4, + 10, + 5, + 10, + 1, + 5, + 1, + 4, + 1, + 8, + 8, + 
3, + 1, + 2, + 1, + 1, + 3, + 1, + 7, + 3, + 4, + 3, + 5, + 3, + 2, + 5, + 7, + 2, + 1, + 1, + 15, + 3, + 3, + 7, + 4, + 2, + 2, + 2, + 2, + 3, + 1, + 6, + 19, + 6, + 4, + 1, + 2, + 8, + 4, + 8, + 2, + 13, + 1, + 15, + 2, + 4, + 3, + 11, + 1, + 1, + 2, + 2, + 1, + 1, + 4, + 18, + 1, + 5, + 3, + 1, + 3, + 1, + 2, + 11, + 11, + 10, + 6, + 12, + 2, + 10, + 5, + 1, + 6, + 9, + 1, + 12, + 4, + 3, + 3, + 14, + 5, + 5, + 14, + 10, + 5, + 2, + 2, + 1, + 3, + 5, + 1, + 3, + 5, + 3, + 1, + 5, + 4, + 1, + 6, + 1, + 6, + 6, + 1, + 2, + 7, + 5, + 3, + 1, + 7, + 1, + 7, + 1, + 21, + 4, + 1, + 4, + 1, + 5, + 21, + 20, + 7, + 2, + 2, + 2, + 6, + 2, + 5, + 5, + 13, + 1, + 1, + 6, + 20, + 2, + 1, + 12, + 2, + 12, + 2, + 1, + 6, + 8, + 1, + 13, + 11, + 8, + 13, + 3, + 13, + 6, + 1, + 4, + 3, + 4, + 1, + 7, + 1, + 9, + 5, + 2, + 9, + 3, + 6, + 2, + 2, + 1, + 2, + 1, + 4, + 17, + 3, + 2, + 2, + 4, + 4, + 1, + 5, + 6, + 2, + 1, + 8, + 2, + 8, + 4, + 6, + 5, + 6, + 9, + 2, + 10, + 13, + 3, + 6, + 3, + 7, + 5, + 7, + 15, + 2, + 4, + 2, + 7, + 1, + 8, + 18, + 1, + 3, + 5, + 5, + 3, + 3, + 9, + 19, + 1, + 9, + 7, + 1, + 4, + 2, + 3, + 2, + 1, + 3, + 2, + 13, + 2, + 3, + 4, + 1, + 4, + 1, + 2, + 14, + 6, + 7, + 7, + 15, + 3, + 5, + 1, + 2, + 7, + 4, + 7, + 1, + 3, + 6, + 5, + 10, + 6, + 1, + 1, + 2, + 3, + 4, + 2, + 1, + 2, + 7, + 3, + 1, + 14, + 4, + 7, + 1, + 8, + 9, + 9, + 2, + 2, + 1, + 2, + 3, + 19, + 9, + 1, + 2, + 6, + 7, + 7, + 2, + 21, + 12, + 14, + 9, + 6, + 2, + 1, + 8, + 2, + 3, + 10, + 1, + 1, + 15, + 5, + 1, + 1, + 14, + 5, + 11, + 17, + 1, + 2, + 12, + 8, + 12, + 7, + 4, + 2, + 9, + 2, + 2, + 6, + 7, + 7, + 8, + 4, + 8, + 6, + 10, + 1, + 4, + 7, + 2, + 4, + 1, + 1, + 4, + 3, + 15, + 18, + 2, + 11, + 10, + 2, + 1, + 2, + 9, + 1, + 14, + 9, + 2, + 10, + 3, + 2, + 4, + 1, + 6, + 1, + 17, + 2, + 3, + 17, + 11, + 7, + 4, + 15, + 4, + 1, + 3, + 20, + 5, + 5, + 10, + 3, + 4, + 1, + 2, + 3, + 3, + 1, + 4, + 4, + 12, + 1, + 1, + 3, + 1, + 23, + 4, + 2, + 3, + 11, + 26, + 1, + 4, + 2, + 
10, + 2, + 1, + 6, + 1, + 4, + 5, + 7, + 7, + 1, + 1, + 1, + 7, + 15, + 9, + 3, + 6, + 5, + 1, + 1, + 8, + 3, + 2, + 8, + 7, + 4, + 9, + 13, + 2, + 1, + 5, + 4, + 1, + 1, + 8, + 4, + 6, + 5, + 2, + 4, + 7, + 7, + 5, + 4, + 7, + 3, + 1, + 3, + 13, + 6, + 17, + 5, + 4, + 9, + 15, + 15, + 13, + 3, + 2, + 12, + 1, + 1, + 8, + 11, + 4, + 4, + 2, + 14, + 4, + 5, + 12, + 4, + 3, + 22, + 7, + 5, + 1, + 3, + 1, + 3, + 1, + 1, + 3, + 6, + 1, + 1, + 1, + 6, + 7, + 2, + 6, + 2, + 16, + 1, + 4, + 2, + 8, + 11, + 5, + 3, + 4, + 3, + 1, + 2, + 15, + 8, + 3, + 2, + 7, + 23, + 4, + 10, + 1, + 6, + 7, + 5, + 1, + 7, + 3, + 5, + 2, + 4, + 2, + 1, + 4, + 2, + 6, + 7, + 1, + 11, + 2, + 1, + 17, + 3, + 2, + 3, + 3, + 3, + 1, + 2, + 8, + 3, + 3, + 1, + 2, + 4, + 4, + 4, + 1, + 19, + 6, + 3, + 1, + 1, + 1, + 4, + 11, + 3, + 4, + 5, + 2, + 4, + 2, + 1, + 1, + 1, + 2, + 6, + 7, + 4, + 3, + 3, + 7, + 2, + 1, + 1, + 3, + 1, + 11, + 8, + 6, + 3, + 1, + 2, + 4, + 6, + 1, + 8, + 7, + 11, + 7, + 4, + 4, + 12, + 3, + 1, + 2, + 15, + 3, + 1, + 13, + 2, + 4, + 5, + 6, + 25, + 4, + 13, + 1, + 4, + 2, + 7, + 7, + 4, + 4, + 9, + 4, + 6, + 1, + 11, + 6, + 1, + 10, + 15, + 1, + 6, + 6, + 7, + 1, + 1, + 5, + 1, + 1, + 4, + 1, + 5, + 3, + 6, + 26, + 2, + 3, + 5, + 3, + 2, + 4, + 1, + 7, + 2, + 1, + 4, + 8, + 1, + 12, + 2, + 4, + 3, + 18, + 1, + 1, + 5, + 11, + 2, + 4, + 3, + 11, + 2, + 2, + 1, + 6, + 3, + 12, + 8, + 3, + 5, + 7, + 5, + 3, + 9, + 11, + 8, + 8, + 1, + 1, + 10, + 6, + 10, + 2, + 9, + 2, + 1, + 9, + 4, + 5, + 1, + 5, + 3, + 2, + 5, + 5, + 4, + 1, + 6, + 13, + 4, + 11, + 1, + 2, + 3, + 9, + 2, + 15, + 3, + 4, + 1, + 3, + 2, + 5, + 6, + 6, + 1, + 8, + 6, + 2, + 1, + 3, + 4, + 12, + 2, + 4, + 10, + 4, + 1, + 5, + 3, + 1, + 5, + 2, + 14, + 1, + 6, + 8, + 9, + 2, + 9, + 10, + 4, + 1, + 4, + 7, + 2, + 1, + 8, + 1, + 2, + 15, + 1, + 1, + 4, + 3, + 4, + 3, + 8, + 8, + 11, + 10, + 3, + 7, + 1, + 2, + 8, + 5, + 2, + 12, + 3, + 14, + 6, + 1, + 8, + 1, + 1, + 9, + 6, + 2, + 5, + 2, + 14, + 15, + 1, + 1, + 
8, + 6, + 1, + 4, + 7, + 1, + 1, + 1, + 2, + 9, + 1, + 10, + 10, + 2, + 8, + 6, + 6, + 21, + 2, + 8, + 4, + 8, + 5, + 14, + 1, + 3, + 5, + 5, + 6, + 1, + 1, + 7, + 2, + 1, + 3, + 6, + 1, + 5, + 7, + 4, + 3, + 1, + 4, + 14, + 1, + 15, + 6, + 2, + 4, + 9, + 9, + 8, + 6, + 3, + 6, + 13, + 9, + 4, + 3, + 1, + 3, + 2, + 7, + 4, + 7, + 2, + 8, + 5, + 8, + 1, + 3, + 2, + 8, + 1, + 3, + 1, + 2, + 2, + 3, + 2, + 1, + 2, + 1, + 2, + 11, + 1, + 8, + 5, + 7, + 3, + 3, + 14, + 1, + 2, + 2, + 9, + 4, + 5, + 3, + 1, + 4, + 1, + 7, + 1, + 9, + 5, + 4, + 2, + 5, + 9, + 7, + 1, + 17, + 19, + 4, + 1, + 3, + 11, + 2, + 1, + 3, + 3, + 1, + 3, + 1, + 2, + 5, + 17, + 1, + 4, + 2, + 8, + 1, + 2, + 13, + 6, + 5, + 3, + 7, + 1, + 1, + 5, + 4, + 11, + 1, + 4, + 2, + 9, + 5, + 1, + 10, + 5, + 10, + 6, + 12, + 5, + 1, + 5, + 1, + 3, + 1, + 9, + 12, + 1, + 7, + 2, + 7, + 6, + 3, + 1, + 4, + 5, + 5, + 4, + 4, + 12, + 8, + 17, + 1, + 3, + 6, + 12, + 2, + 4, + 1, + 5, + 5, + 5, + 1, + 3, + 2, + 9, + 3, + 2, + 3, + 1, + 7, + 1, + 6, + 4, + 4, + 2, + 3, + 1, + 2, + 4, + 1, + 1, + 1, + 1, + 1, + 5, + 1, + 1, + 3, + 4, + 3, + 5, + 4, + 6, + 7, + 13, + 1, + 8, + 4, + 4, + 6, + 2, + 1, + 3, + 5, + 7, + 11, + 1, + 1, + 4, + 1, + 7, + 1, + 1, + 5, + 8, + 4, + 1, + 2, + 22, + 6, + 2, + 5, + 1, + 1, + 1, + 6, + 5, + 10, + 13, + 4, + 10, + 5, + 1, + 3, + 2, + 5, + 1, + 2, + 3, + 15, + 1, + 2, + 4, + 6, + 2, + 11, + 9, + 9, + 14, + 6, + 3, + 5, + 6, + 1, + 13, + 2, + 3, + 5, + 1, + 3, + 3, + 2, + 2, + 2, + 3, + 5, + 1, + 7, + 2, + 15, + 1, + 14, + 10, + 15, + 1, + 2, + 1, + 8, + 4, + 3, + 6, + 11, + 3, + 7, + 4, + 8, + 8, + 3, + 2, + 3, + 15, + 1, + 3, + 10, + 7, + 7, + 11, + 3, + 5, + 10, + 1, + 3, + 1, + 8, + 11, + 7, + 6, + 4, + 3, + 8, + 3, + 6, + 14, + 3, + 3, + 1, + 1, + 2, + 1, + 1, + 3, + 2, + 1, + 2, + 6, + 5, + 11, + 1, + 11, + 4, + 3, + 3, + 3, + 3, + 20, + 3, + 2, + 8, + 3, + 5, + 1, + 1, + 6, + 5, + 13, + 2, + 4, + 2, + 8, + 6, + 10, + 8, + 3, + 2, + 4, + 4, + 3, + 1, + 2, + 1, + 11, + 10, + 5, + 
4, + 1, + 1, + 5, + 4, + 11, + 7, + 12, + 4, + 4, + 1, + 2, + 11, + 1, + 3, + 4, + 2, + 3, + 3, + 15, + 4, + 9, + 5, + 1, + 11, + 3, + 3, + 5, + 6, + 1, + 3, + 1, + 3, + 5, + 6, + 18, + 4, + 6, + 2, + 5, + 3, + 3, + 1, + 4, + 4, + 4, + 4, + 2, + 2, + 12, + 5, + 8, + 6, + 1, + 1, + 5, + 9, + 1, + 2, + 9, + 5, + 1, + 7, + 2, + 1, + 2, + 8, + 9, + 18, + 4, + 11, + 1, + 3, + 2, + 1, + 1, + 8, + 4, + 4, + 9, + 4, + 9, + 8, + 4, + 2, + 8, + 1, + 1, + 1, + 7, + 9, + 6, + 2, + 3, + 4, + 1, + 5, + 1, + 4, + 1, + 6, + 5, + 9, + 7, + 6, + 3, + 1, + 2, + 4, + 2, + 2, + 1, + 5, + 1, + 4, + 1, + 3, + 1, + 5, + 14, + 1, + 1, + 5, + 9, + 1, + 4, + 3 + ], + "xaxis": "x3", + "yaxis": "y3" + }, + { + "alignmentgroup": "True", + "bingroup": "x", + "hovertemplate": "jezyk=ang
dlugosc=%{x}
count=%{y}", + "legendgroup": "", + "marker": { + "color": "#636efa", + "pattern": { + "shape": "" + } + }, + "name": "", + "nbinsx": 50, + "offsetgroup": "", + "orientation": "v", + "showlegend": false, + "type": "histogram", + "x": [ + 10, + 3, + 2, + 1, + 5, + 15, + 3, + 2, + 6, + 11, + 17, + 5, + 5, + 1, + 6, + 9, + 2, + 4, + 2, + 2, + 17, + 4, + 3, + 3, + 2, + 3, + 5, + 13, + 2, + 2, + 2, + 5, + 9, + 4, + 19, + 3, + 1, + 6, + 2, + 5, + 2, + 7, + 14, + 1, + 4, + 7, + 5, + 12, + 4, + 2, + 6, + 1, + 10, + 1, + 2, + 3, + 3, + 6, + 4, + 5, + 21, + 8, + 9, + 1, + 15, + 3, + 1, + 2, + 1, + 3, + 1, + 1, + 15, + 5, + 1, + 2, + 2, + 1, + 1, + 1, + 3, + 1, + 4, + 1, + 7, + 4, + 7, + 11, + 7, + 12, + 3, + 1, + 11, + 2, + 3, + 1, + 5, + 5, + 2, + 2, + 2, + 22, + 1, + 2, + 1, + 2, + 8, + 7, + 3, + 8, + 2, + 3, + 10, + 3, + 3, + 9, + 4, + 3, + 2, + 1, + 13, + 1, + 13, + 1, + 1, + 2, + 4, + 8, + 1, + 4, + 9, + 1, + 6, + 2, + 9, + 2, + 6, + 4, + 1, + 5, + 3, + 2, + 2, + 9, + 3, + 2, + 15, + 1, + 6, + 1, + 2, + 1, + 1, + 1, + 3, + 2, + 3, + 12, + 19, + 7, + 1, + 4, + 4, + 11, + 2, + 1, + 2, + 3, + 6, + 4, + 28, + 7, + 13, + 8, + 1, + 2, + 2, + 7, + 1, + 14, + 5, + 5, + 5, + 10, + 2, + 4, + 12, + 2, + 3, + 19, + 3, + 1, + 2, + 4, + 2, + 1, + 7, + 8, + 7, + 1, + 8, + 1, + 10, + 6, + 3, + 3, + 7, + 3, + 6, + 2, + 14, + 2, + 1, + 9, + 3, + 9, + 2, + 4, + 1, + 1, + 2, + 4, + 5, + 7, + 2, + 3, + 1, + 1, + 10, + 3, + 15, + 4, + 7, + 1, + 8, + 4, + 6, + 3, + 7, + 9, + 9, + 2, + 3, + 6, + 2, + 8, + 32, + 10, + 2, + 7, + 1, + 4, + 2, + 6, + 3, + 14, + 6, + 6, + 7, + 7, + 2, + 6, + 10, + 7, + 7, + 3, + 7, + 6, + 6, + 5, + 6, + 5, + 4, + 4, + 7, + 2, + 7, + 7, + 9, + 4, + 9, + 1, + 6, + 6, + 3, + 3, + 4, + 2, + 5, + 4, + 5, + 6, + 5, + 14, + 6, + 11, + 5, + 7, + 3, + 2, + 4, + 13, + 4, + 3, + 2, + 1, + 1, + 1, + 2, + 1, + 8, + 10, + 5, + 1, + 1, + 7, + 3, + 8, + 4, + 1, + 3, + 4, + 4, + 4, + 2, + 7, + 3, + 1, + 1, + 5, + 6, + 3, + 2, + 2, + 2, + 2, + 2, + 1, + 6, + 1, + 1, + 1, + 1, + 1, + 
6, + 5, + 8, + 1, + 9, + 2, + 9, + 15, + 1, + 10, + 2, + 1, + 1, + 2, + 2, + 3, + 10, + 8, + 8, + 11, + 5, + 2, + 3, + 4, + 10, + 13, + 1, + 2, + 2, + 1, + 3, + 8, + 21, + 6, + 13, + 14, + 11, + 6, + 6, + 1, + 8, + 3, + 4, + 3, + 15, + 4, + 2, + 1, + 3, + 14, + 11, + 1, + 11, + 10, + 3, + 1, + 2, + 2, + 3, + 1, + 4, + 10, + 3, + 22, + 7, + 10, + 2, + 2, + 1, + 1, + 11, + 20, + 3, + 5, + 5, + 2, + 6, + 8, + 1, + 10, + 4, + 1, + 9, + 4, + 2, + 1, + 2, + 2, + 12, + 5, + 5, + 1, + 5, + 8, + 2, + 1, + 1, + 4, + 6, + 3, + 6, + 5, + 2, + 2, + 3, + 2, + 2, + 1, + 4, + 2, + 2, + 2, + 6, + 2, + 12, + 7, + 1, + 5, + 1, + 11, + 9, + 18, + 2, + 1, + 1, + 3, + 3, + 6, + 4, + 4, + 2, + 7, + 11, + 4, + 1, + 5, + 2, + 4, + 6, + 6, + 5, + 3, + 5, + 7, + 1, + 11, + 2, + 6, + 8, + 1, + 3, + 2, + 4, + 5, + 6, + 9, + 1, + 18, + 7, + 6, + 2, + 1, + 1, + 18, + 3, + 12, + 7, + 5, + 1, + 2, + 2, + 3, + 10, + 1, + 4, + 5, + 2, + 2, + 7, + 2, + 2, + 3, + 3, + 10, + 3, + 2, + 4, + 1, + 11, + 2, + 3, + 2, + 1, + 1, + 4, + 1, + 12, + 3, + 1, + 6, + 5, + 2, + 5, + 2, + 5, + 22, + 6, + 3, + 2, + 3, + 2, + 1, + 5, + 1, + 5, + 4, + 10, + 3, + 4, + 16, + 2, + 5, + 5, + 6, + 1, + 12, + 5, + 5, + 1, + 7, + 3, + 3, + 3, + 14, + 1, + 3, + 5, + 3, + 1, + 2, + 6, + 1, + 6, + 3, + 3, + 3, + 4, + 3, + 4, + 8, + 6, + 3, + 9, + 3, + 4, + 6, + 6, + 17, + 2, + 3, + 2, + 4, + 1, + 4, + 3, + 1, + 2, + 16, + 4, + 2, + 4, + 15, + 11, + 9, + 10, + 1, + 5, + 6, + 2, + 11, + 13, + 3, + 4, + 4, + 1, + 12, + 1, + 2, + 6, + 10, + 14, + 9, + 4, + 1, + 3, + 1, + 4, + 1, + 1, + 3, + 11, + 9, + 4, + 3, + 5, + 2, + 3, + 1, + 2, + 1, + 9, + 2, + 1, + 3, + 2, + 4, + 3, + 11, + 6, + 1, + 4, + 4, + 7, + 6, + 3, + 4, + 5, + 13, + 1, + 5, + 2, + 20, + 8, + 3, + 8, + 8, + 2, + 3, + 13, + 3, + 1, + 8, + 7, + 8, + 1, + 14, + 2, + 4, + 1, + 2, + 1, + 6, + 3, + 3, + 1, + 1, + 5, + 6, + 5, + 3, + 1, + 7, + 4, + 6, + 3, + 1, + 19, + 10, + 3, + 2, + 1, + 5, + 1, + 1, + 3, + 3, + 1, + 1, + 2, + 4, + 4, + 1, + 1, + 35, + 6, + 2, + 2, + 2, + 
2, + 3, + 12, + 6, + 34, + 5, + 2, + 11, + 13, + 7, + 6, + 12, + 7, + 9, + 6, + 3, + 2, + 1, + 3, + 2, + 8, + 1, + 8, + 1, + 2, + 5, + 17, + 3, + 1, + 1, + 3, + 3, + 7, + 1, + 14, + 1, + 6, + 6, + 2, + 2, + 3, + 8, + 4, + 1, + 8, + 4, + 7, + 4, + 4, + 7, + 1, + 3, + 8, + 4, + 1, + 1, + 7, + 1, + 1, + 6, + 9, + 1, + 1, + 1, + 1, + 1, + 4, + 4, + 2, + 13, + 6, + 5, + 4, + 2, + 13, + 10, + 5, + 3, + 13, + 1, + 1, + 7, + 7, + 16, + 4, + 9, + 7, + 11, + 1, + 2, + 1, + 5, + 13, + 1, + 5, + 3, + 6, + 3, + 4, + 8, + 5, + 14, + 3, + 8, + 7, + 6, + 5, + 7, + 1, + 5, + 6, + 4, + 23, + 1, + 1, + 2, + 1, + 7, + 2, + 4, + 6, + 3, + 3, + 8, + 1, + 1, + 1, + 3, + 4, + 5, + 10, + 3, + 1, + 18, + 4, + 7, + 1, + 2, + 1, + 1, + 1, + 5, + 2, + 2, + 6, + 7, + 9, + 12, + 5, + 2, + 3, + 3, + 1, + 14, + 6, + 4, + 1, + 4, + 1, + 5, + 21, + 9, + 5, + 5, + 7, + 8, + 11, + 5, + 1, + 6, + 2, + 4, + 2, + 2, + 4, + 3, + 3, + 1, + 3, + 5, + 1, + 7, + 9, + 8, + 2, + 1, + 2, + 4, + 4, + 19, + 4, + 3, + 1, + 14, + 5, + 4, + 1, + 8, + 10, + 2, + 11, + 3, + 3, + 1, + 11, + 3, + 6, + 3, + 4, + 3, + 2, + 1, + 4, + 21, + 2, + 3, + 9, + 3, + 2, + 1, + 8, + 4, + 2, + 1, + 3, + 11, + 4, + 6, + 8, + 7, + 3, + 9, + 9, + 4, + 2, + 3, + 3, + 11, + 5, + 7, + 3, + 11, + 4, + 8, + 2, + 9, + 7, + 1, + 7, + 26, + 2, + 20, + 14, + 1, + 3, + 12, + 2, + 17, + 11, + 4, + 1, + 12, + 4, + 1, + 7, + 2, + 14, + 2, + 6, + 1, + 4, + 7, + 6, + 3, + 3, + 4, + 1, + 2, + 8, + 5, + 6, + 1, + 8, + 3, + 1, + 5, + 1, + 6, + 7, + 10, + 6, + 4, + 3, + 5, + 5, + 14, + 3, + 2, + 2, + 3, + 1, + 3, + 4, + 13, + 9, + 7, + 2, + 7, + 2, + 3, + 4, + 4, + 2, + 3, + 2, + 4, + 9, + 5, + 8, + 4, + 6, + 4, + 5, + 22, + 14, + 1, + 3, + 3, + 5, + 3, + 8, + 1, + 2, + 1, + 9, + 5, + 15, + 13, + 11, + 13, + 2, + 2, + 4, + 4, + 8, + 2, + 2, + 1, + 1, + 1, + 3, + 1, + 4, + 4, + 17, + 4, + 5, + 2, + 2, + 1, + 6, + 18, + 1, + 12, + 6, + 1, + 1, + 2, + 1, + 1, + 22, + 9, + 2, + 9, + 3, + 14, + 8, + 35, + 1, + 1, + 3, + 6, + 9, + 21, + 5, + 1, + 1, + 19, + 3, 
+ 3, + 4, + 8, + 2, + 22, + 3, + 2, + 2, + 5, + 5, + 3, + 1, + 1, + 4, + 1, + 1, + 1, + 4, + 16, + 6, + 4, + 16, + 8, + 7, + 2, + 6, + 2, + 2, + 5, + 6, + 3, + 5, + 19, + 29, + 2, + 3, + 3, + 1, + 3, + 1, + 1, + 2, + 3, + 7, + 11, + 12, + 2, + 23, + 3, + 2, + 15, + 4, + 5, + 1, + 2, + 4, + 1, + 5, + 1, + 7, + 6, + 4, + 7, + 4, + 1, + 1, + 7, + 8, + 1, + 9, + 11, + 10, + 6, + 14, + 6, + 13, + 4, + 8, + 2, + 4, + 7, + 4, + 3, + 7, + 4, + 1, + 7, + 2, + 4, + 5, + 6, + 4, + 1, + 2, + 1, + 1, + 5, + 4, + 4, + 13, + 5, + 1, + 1, + 3, + 5, + 2, + 1, + 4, + 1, + 2, + 7, + 5, + 11, + 5, + 1, + 1, + 1, + 2, + 4, + 1, + 4, + 2, + 1, + 1, + 2, + 2, + 2, + 2, + 5, + 2, + 2, + 1, + 1, + 5, + 13, + 3, + 3, + 6, + 1, + 3, + 1, + 3, + 1, + 2, + 1, + 2, + 4, + 7, + 3, + 13, + 2, + 12, + 8, + 5, + 1, + 1, + 1, + 2, + 1, + 8, + 5, + 1, + 4, + 1, + 7, + 8, + 2, + 17, + 2, + 12, + 9, + 1, + 3, + 2, + 2, + 5, + 6, + 8, + 8, + 1, + 1, + 8, + 1, + 1, + 5, + 7, + 6, + 3, + 7, + 2, + 11, + 12, + 2, + 1, + 1, + 2, + 1, + 6, + 5, + 2, + 11, + 1, + 4, + 3, + 5, + 2, + 1, + 5, + 2, + 15, + 2, + 2, + 4, + 2, + 1, + 2, + 3, + 6, + 2, + 2, + 7, + 1, + 5, + 7, + 1, + 1, + 1, + 11, + 2, + 6, + 5, + 2, + 3, + 10, + 1, + 2, + 2, + 16, + 6, + 6, + 4, + 2, + 23, + 4, + 9, + 2, + 3, + 6, + 13, + 4, + 17, + 1, + 5, + 2, + 4, + 1, + 8, + 31, + 1, + 2, + 3, + 9, + 4, + 1, + 7, + 1, + 2, + 7, + 1, + 3, + 3, + 1, + 3, + 2, + 1, + 8, + 7, + 10, + 2, + 1, + 2, + 2, + 7, + 7, + 3, + 7, + 1, + 11, + 1, + 1, + 6, + 2, + 2, + 14, + 11, + 4, + 6, + 2, + 3, + 5, + 7, + 14, + 4, + 7, + 2, + 5, + 4, + 3, + 9, + 2, + 4, + 4, + 1, + 6, + 2, + 7, + 6, + 3, + 5, + 1, + 2, + 3, + 1, + 9, + 6, + 10, + 8, + 2, + 1, + 20, + 7, + 2, + 6, + 6, + 21, + 4, + 9, + 4, + 8, + 7, + 3, + 13, + 1, + 3, + 1, + 10, + 2, + 2, + 6, + 11, + 1, + 2, + 3, + 1, + 3, + 9, + 8, + 2, + 2, + 1, + 1, + 1, + 3, + 3, + 11, + 1, + 5, + 3, + 1, + 10, + 29, + 13, + 4, + 7, + 3, + 3, + 8, + 1, + 3, + 4, + 11, + 2, + 10, + 2, + 16, + 6, + 3, + 10, + 5, + 1, 
+ 1, + 12, + 2, + 10, + 3, + 3, + 2, + 12, + 18, + 3, + 3, + 1, + 3, + 10, + 4, + 1, + 3, + 8, + 1, + 1, + 4, + 5, + 2, + 8, + 7, + 1, + 8, + 5, + 3, + 4, + 2, + 2, + 1, + 1, + 2, + 5, + 23, + 6, + 2, + 6, + 3, + 3, + 1, + 3, + 2, + 3, + 1, + 3, + 11, + 6, + 2, + 5, + 2, + 19, + 1, + 4, + 1, + 1, + 6, + 10, + 2, + 4, + 12, + 4, + 2, + 7, + 8, + 2, + 17, + 37, + 1, + 4, + 2, + 7, + 3, + 2, + 5, + 4, + 16, + 3, + 5, + 1, + 2, + 1, + 6, + 9, + 2, + 7, + 10, + 4, + 1, + 6, + 3, + 6, + 1, + 1, + 11, + 1, + 1, + 8, + 3, + 2, + 13, + 4, + 2, + 3, + 2, + 4, + 2, + 7, + 2, + 3, + 1, + 10, + 1, + 1, + 1, + 2, + 9, + 3, + 3, + 2, + 2, + 1, + 4, + 18, + 4, + 1, + 1, + 2, + 3, + 4, + 1, + 6, + 3, + 7, + 6, + 1, + 7, + 3, + 1, + 3, + 6, + 2, + 4, + 9, + 4, + 5, + 10, + 1, + 7, + 7, + 2 + ], + "xaxis": "x2", + "yaxis": "y2" + }, + { + "alignmentgroup": "True", + "bingroup": "x", + "hovertemplate": "jezyk=hiszp
dlugosc=%{x}
count=%{y}", + "legendgroup": "", + "marker": { + "color": "#636efa", + "pattern": { + "shape": "" + } + }, + "name": "", + "nbinsx": 50, + "offsetgroup": "", + "orientation": "v", + "showlegend": false, + "type": "histogram", + "x": [ + 5, + 11, + 8, + 5, + 2, + 6, + 3, + 2, + 10, + 7, + 4, + 12, + 1, + 19, + 3, + 9, + 1, + 1, + 7, + 5, + 11, + 4, + 10, + 10, + 8, + 2, + 3, + 8, + 9, + 7, + 5, + 2, + 3, + 13, + 4, + 14, + 2, + 2, + 1, + 1, + 1, + 5, + 9, + 3, + 5, + 4, + 2, + 1, + 10, + 8, + 3, + 2, + 7, + 6, + 2, + 11, + 2, + 15, + 2, + 2, + 19, + 1, + 1, + 3, + 1, + 8, + 5, + 5, + 1, + 6, + 5, + 13, + 9, + 3, + 1, + 3, + 4, + 2, + 4, + 8, + 17, + 5, + 3, + 1, + 4, + 5, + 2, + 5, + 2, + 4, + 6, + 1, + 2, + 3, + 2, + 10, + 2, + 5, + 5, + 9, + 6, + 6, + 5, + 6, + 3, + 2, + 1, + 3, + 1, + 1, + 8, + 3, + 10, + 1, + 1, + 9, + 4, + 1, + 1, + 2, + 1, + 6, + 4, + 3, + 1, + 6, + 1, + 8, + 1, + 2, + 8, + 7, + 14, + 3, + 3, + 4, + 3, + 3, + 6, + 1, + 4, + 3, + 2, + 4, + 16, + 5, + 14, + 6, + 1, + 4, + 6, + 1, + 3, + 6, + 1, + 3, + 8, + 2, + 4, + 3, + 12, + 6, + 1, + 1, + 7, + 2, + 3, + 10, + 3, + 3, + 6, + 5, + 3, + 3, + 1, + 4, + 10, + 1, + 3, + 1, + 3, + 3, + 8, + 8, + 2, + 28, + 1, + 1, + 8, + 18, + 35, + 4, + 17, + 2, + 16, + 7, + 2, + 4, + 8, + 1, + 1, + 2, + 1, + 9, + 5, + 8, + 1, + 9, + 5, + 1, + 6, + 14, + 4, + 17, + 4, + 2, + 3, + 9, + 7, + 2, + 4, + 4, + 1, + 3, + 3, + 1, + 10, + 5, + 1, + 9, + 13, + 3, + 17, + 4, + 19, + 1, + 6, + 8, + 3, + 9, + 2, + 12, + 2, + 1, + 2, + 5, + 3, + 11, + 20, + 8, + 1, + 1, + 1, + 18, + 2, + 2, + 4, + 5, + 3, + 6, + 8, + 5, + 7, + 5, + 2, + 5, + 2, + 6, + 5, + 20, + 5, + 2, + 2, + 1, + 6, + 3, + 17, + 4, + 5, + 2, + 8, + 2, + 6, + 7, + 4, + 2, + 2, + 7, + 3, + 2, + 4, + 2, + 3, + 1, + 3, + 1, + 1, + 11, + 6, + 8, + 8, + 14, + 3, + 4, + 3, + 9, + 5, + 9, + 5, + 11, + 3, + 1, + 3, + 13, + 1, + 1, + 13, + 4, + 1, + 3, + 1, + 13, + 2, + 1, + 11, + 3, + 5, + 15, + 4, + 3, + 4, + 8, + 2, + 10, + 2, + 3, + 2, + 20, + 1, + 10, + 2, + 2, + 
11, + 5, + 4, + 4, + 2, + 12, + 3, + 12, + 10, + 2, + 4, + 3, + 3, + 1, + 5, + 4, + 10, + 1, + 10, + 4, + 6, + 2, + 3, + 4, + 3, + 4, + 4, + 4, + 1, + 7, + 9, + 1, + 3, + 10, + 7, + 5, + 3, + 11, + 1, + 2, + 10, + 2, + 7, + 5, + 8, + 4, + 3, + 3, + 2, + 1, + 4, + 14, + 4, + 12, + 3, + 8, + 6, + 6, + 4, + 2, + 4, + 3, + 3, + 11, + 7, + 3, + 1, + 4, + 9, + 2, + 2, + 1, + 6, + 3, + 11, + 1, + 1, + 4, + 6, + 4, + 1, + 3, + 4, + 11, + 4, + 1, + 2, + 8, + 2, + 2, + 1, + 1, + 2, + 4, + 2, + 7, + 6, + 2, + 3, + 3, + 5, + 4, + 3, + 3, + 4, + 1, + 2, + 3, + 7, + 1, + 1, + 8, + 5, + 1, + 2, + 10, + 4, + 14, + 4, + 3, + 3, + 15, + 3, + 2, + 4, + 5, + 7, + 1, + 3, + 1, + 3, + 4, + 8, + 3, + 16, + 5, + 6, + 3, + 2, + 1, + 6, + 6, + 12, + 3, + 5, + 2, + 3, + 1, + 1, + 4, + 6, + 2, + 3, + 1, + 8, + 2, + 4, + 4, + 2, + 3, + 3, + 1, + 4, + 8, + 4, + 2, + 8, + 3, + 2, + 6, + 2, + 4, + 13, + 4, + 2, + 12, + 5, + 8, + 4, + 1, + 2, + 2, + 16, + 3, + 11, + 3, + 3, + 15, + 5, + 6, + 6, + 2, + 9, + 5, + 6, + 16, + 1, + 4, + 2, + 8, + 17, + 5, + 12, + 5, + 1, + 1, + 9, + 1, + 6, + 8, + 2, + 1, + 4, + 2, + 3, + 1, + 13, + 2, + 2, + 2, + 6, + 1, + 1, + 2, + 14, + 2, + 13, + 4, + 10, + 2, + 4, + 20, + 1, + 1, + 6, + 4, + 2, + 10, + 15, + 1, + 4, + 4, + 2, + 2, + 4, + 13, + 9, + 3, + 17, + 4, + 5, + 1, + 1, + 1, + 1, + 1, + 2, + 4, + 5, + 12, + 1, + 4, + 16, + 1, + 1, + 5, + 7, + 5, + 3, + 3, + 1, + 7, + 3, + 1, + 7, + 3, + 13, + 1, + 1, + 4, + 4, + 15, + 2, + 3, + 22, + 3, + 6, + 16, + 3, + 1, + 1, + 4, + 1, + 3, + 3, + 6, + 1, + 1, + 13, + 1, + 3, + 6, + 12, + 17, + 5, + 1, + 5, + 3, + 7, + 2, + 3, + 1, + 1, + 3, + 24, + 12, + 7, + 2, + 3, + 18, + 1, + 7, + 1, + 3, + 17, + 1, + 3, + 2, + 1, + 8, + 9, + 10, + 3, + 7, + 4, + 3, + 3, + 2, + 5, + 2, + 4, + 2, + 6, + 2, + 5, + 9, + 5, + 5, + 7, + 4, + 6, + 7, + 1, + 5, + 1, + 1, + 9, + 6, + 24, + 9, + 21, + 5, + 5, + 6, + 1, + 5, + 1, + 2, + 1, + 1, + 1, + 13, + 3, + 5, + 6, + 7, + 3, + 8, + 4, + 1, + 5, + 2, + 7, + 1, + 3, + 3, + 2, + 10, + 2, + 
2, + 3, + 5, + 2, + 3, + 2, + 1, + 1, + 7, + 3, + 5, + 1, + 1, + 1, + 8, + 16, + 5, + 3, + 4, + 3, + 2, + 2, + 1, + 22, + 1, + 2, + 2, + 4, + 9, + 2, + 6, + 3, + 4, + 7, + 3, + 1, + 2, + 2, + 14, + 1, + 2, + 8, + 2, + 9, + 24, + 4, + 1, + 8, + 4, + 1, + 10, + 2, + 1, + 2, + 5, + 6, + 11, + 3, + 2, + 8, + 5, + 2, + 7, + 1, + 3, + 3, + 6, + 6, + 10, + 3, + 1, + 1, + 1, + 7, + 3, + 1, + 12, + 3, + 9, + 1, + 1, + 2, + 2, + 6, + 7, + 2, + 2, + 5, + 4, + 1, + 1, + 2, + 14, + 7, + 20, + 3, + 4, + 16, + 7, + 7, + 6, + 20, + 8, + 2, + 2, + 11, + 4, + 2, + 1, + 4, + 2, + 1, + 8, + 4, + 4, + 5, + 12, + 7, + 2, + 2, + 1, + 6, + 3, + 2, + 1, + 2, + 9, + 3, + 1, + 3, + 9, + 1, + 16, + 3, + 8, + 8, + 5, + 11, + 15, + 2, + 2, + 2, + 3, + 3, + 15, + 3, + 3, + 5, + 19, + 6, + 18, + 15, + 9, + 3, + 2, + 8, + 1, + 1, + 5, + 4, + 3, + 4, + 4, + 5, + 1, + 4, + 6, + 5, + 1, + 3, + 3, + 14, + 1, + 1, + 3, + 3, + 1, + 10, + 2, + 9, + 4, + 2, + 6, + 7, + 6, + 1, + 3, + 6, + 1, + 2, + 11, + 5, + 1, + 4, + 2, + 10, + 3, + 2, + 7, + 10, + 1, + 4, + 16, + 2, + 2, + 2, + 4, + 1, + 5, + 3, + 10, + 3, + 5, + 7, + 9, + 1, + 7, + 3, + 2, + 3, + 19, + 3, + 4, + 2, + 5, + 4, + 6, + 5, + 1, + 3, + 1, + 1, + 6, + 3, + 7, + 2, + 4, + 3, + 4, + 1, + 3, + 5, + 1, + 1, + 8, + 11, + 3, + 1, + 6, + 9, + 1, + 5, + 2, + 1, + 5, + 5, + 1, + 3, + 4, + 3, + 2, + 6, + 5, + 5, + 5, + 3, + 19, + 2, + 1, + 3, + 23, + 14, + 4, + 1, + 4, + 2, + 6, + 8, + 2, + 4, + 3, + 7, + 13, + 1, + 3, + 1, + 11, + 8, + 5, + 1, + 1, + 5, + 1, + 1, + 2, + 1, + 3, + 2, + 2, + 13, + 7, + 17, + 6, + 6, + 4, + 2, + 3, + 7, + 5, + 1, + 5, + 5, + 1, + 1, + 3, + 7, + 2, + 2, + 3, + 3, + 7, + 1, + 2, + 9, + 10, + 9, + 3, + 1, + 2, + 5, + 2, + 8, + 2, + 3, + 4, + 6, + 2, + 14, + 6, + 1, + 4, + 1, + 14, + 7, + 3, + 1, + 1, + 1, + 8, + 6, + 13, + 14, + 9, + 1, + 3, + 6, + 4, + 12, + 9, + 3, + 1, + 8, + 9, + 5, + 5, + 3, + 3, + 4, + 4, + 5, + 14, + 6, + 4, + 5, + 2, + 4, + 5, + 4, + 7, + 7, + 14, + 5, + 4, + 7, + 1, + 17, + 4, + 2, + 1, + 3, + 11, 
+ 1, + 1, + 2, + 1, + 3, + 1, + 3, + 4, + 5, + 3, + 3, + 7, + 5, + 5, + 1, + 3, + 6, + 1, + 8, + 4, + 4, + 19, + 6, + 4, + 3, + 4, + 2, + 7, + 2, + 9, + 3, + 1, + 13, + 4, + 9, + 2, + 7, + 3, + 2, + 6, + 3, + 1, + 1, + 2, + 7, + 9, + 5, + 7, + 15, + 1, + 2, + 2, + 4, + 7, + 15, + 4, + 5, + 3, + 7, + 7, + 18, + 7, + 7, + 2, + 10, + 1, + 11, + 2, + 1, + 9, + 6, + 5, + 10, + 8, + 3, + 2, + 1, + 3, + 4, + 2, + 5, + 5, + 9, + 1, + 5, + 2, + 4, + 4, + 1, + 6, + 1, + 2, + 2, + 3, + 4, + 5, + 4, + 1, + 7, + 11, + 1, + 1, + 2, + 2, + 3, + 3, + 2, + 2, + 7, + 3, + 3, + 1, + 3, + 1, + 1, + 1, + 3, + 5, + 5, + 5, + 5, + 8, + 3, + 9, + 2, + 2, + 2, + 3, + 4, + 1, + 3, + 2, + 3, + 4, + 8, + 6, + 3, + 4, + 1, + 1, + 3, + 1, + 5, + 4, + 5, + 3, + 1, + 5, + 10, + 1, + 7, + 4, + 17, + 2, + 6, + 1, + 7, + 1, + 1, + 8, + 3, + 2, + 12, + 6, + 1, + 7, + 1, + 15, + 2, + 3, + 2, + 1, + 5, + 6, + 1, + 4, + 4, + 2, + 3, + 4, + 8, + 2, + 5, + 1, + 13, + 7, + 3, + 1, + 5, + 4, + 19, + 1, + 6, + 7, + 2, + 3, + 2, + 5, + 3, + 8, + 2, + 7, + 5, + 1, + 6, + 2, + 4, + 3, + 4, + 4, + 3, + 2, + 4, + 3, + 15, + 6, + 6, + 6, + 3, + 1, + 7, + 1, + 8, + 7, + 9, + 3, + 3, + 2, + 15, + 2, + 1, + 2, + 2, + 6, + 4, + 6, + 1, + 12, + 3, + 14, + 4, + 6, + 2, + 6, + 3, + 1, + 1, + 1, + 6, + 2, + 3, + 4, + 3, + 5, + 12, + 3, + 1, + 3, + 11, + 7, + 6, + 1, + 1, + 2, + 6, + 3, + 1, + 10, + 15, + 1, + 5, + 3, + 1, + 1, + 1, + 4, + 2, + 1, + 5, + 6, + 1, + 2, + 2, + 9, + 1, + 2, + 6, + 9, + 1, + 5, + 2, + 5, + 2, + 7, + 5, + 2, + 9, + 4, + 5, + 2, + 3, + 2, + 1, + 10, + 2, + 10, + 6, + 7, + 11, + 1, + 9, + 5, + 3, + 8, + 2, + 3, + 1, + 2, + 1, + 4, + 1, + 1, + 7, + 1, + 8, + 7, + 1, + 4, + 21, + 2, + 4, + 1, + 1, + 5, + 1, + 3, + 6, + 5, + 4, + 3, + 1, + 2, + 6, + 5, + 3, + 1, + 6, + 4, + 1, + 2, + 1, + 4, + 1, + 2, + 2, + 3, + 7, + 19, + 2, + 2, + 1, + 4, + 1, + 4, + 10, + 14, + 2, + 3, + 6, + 4, + 6, + 5, + 1, + 2, + 8, + 4, + 3, + 11, + 7, + 4, + 1, + 6, + 13, + 6, + 22, + 2, + 7, + 2, + 3, + 3, + 3, + 1, + 3, + 
4, + 4, + 1, + 8, + 3, + 6, + 4, + 3, + 7, + 3, + 7, + 2, + 1, + 1, + 2, + 1, + 6, + 3, + 4, + 1, + 2, + 6, + 2, + 11, + 6, + 1, + 5, + 14, + 2, + 12, + 3, + 14, + 2, + 2, + 10, + 2, + 8, + 5, + 2, + 11, + 2, + 17, + 1, + 6, + 2, + 3, + 2, + 2, + 5, + 2, + 5, + 6, + 3, + 2, + 5, + 5, + 4, + 4, + 6, + 4, + 6, + 3, + 2, + 2, + 8, + 1, + 1, + 2, + 8, + 2, + 13, + 19, + 7, + 4, + 7, + 2, + 6, + 10, + 2, + 6, + 17, + 4, + 11, + 2, + 1, + 4, + 9, + 4, + 14, + 1, + 4, + 2, + 5, + 13, + 2, + 4, + 4, + 1, + 2, + 1, + 2, + 6, + 5, + 2, + 7, + 2, + 1, + 10, + 2, + 1, + 3, + 3, + 5, + 6, + 4, + 10, + 3, + 4, + 2, + 2, + 31, + 5, + 5, + 2, + 1, + 8, + 5, + 7, + 1, + 4, + 4, + 4, + 1, + 1, + 9, + 11, + 3, + 1, + 8, + 1, + 14, + 7, + 8, + 2, + 3, + 7, + 3, + 2, + 1, + 9, + 1, + 5, + 3, + 6, + 5, + 1, + 2, + 5, + 3, + 4, + 7, + 29, + 4, + 1, + 3, + 3, + 3, + 3, + 1, + 5, + 1, + 1, + 12, + 2, + 13, + 13, + 5, + 1, + 7, + 2, + 8, + 2, + 3, + 1, + 4 + ], + "xaxis": "x", + "yaxis": "y" + } + ], + "layout": { + "annotations": [ + { + "font": {}, + "showarrow": false, + "text": "jezyk=hiszp", + "textangle": 90, + "x": 0.98, + "xanchor": "left", + "xref": "paper", + "y": 0.15666666666666665, + "yanchor": "middle", + "yref": "paper" + }, + { + "font": {}, + "showarrow": false, + "text": "jezyk=ang", + "textangle": 90, + "x": 0.98, + "xanchor": "left", + "xref": "paper", + "y": 0.4999999999999999, + "yanchor": "middle", + "yref": "paper" + }, + { + "font": {}, + "showarrow": false, + "text": "jezyk=polski", + "textangle": 90, + "x": 0.98, + "xanchor": "left", + "xref": "paper", + "y": 0.8433333333333332, + "yanchor": "middle", + "yref": "paper" + } + ], + "barmode": "relative", + "legend": { + "tracegroupgap": 0 + }, + "margin": { + "t": 60 + }, + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + 
"solidity": 0.2 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 
0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + 
}, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + 
"arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + 
"linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "xaxis": { + "anchor": "y", + "domain": [ + 0, + 0.98 + ], + "title": { + "text": "dlugosc" + } + }, + "xaxis2": { + "anchor": "y2", + "domain": [ + 0, + 0.98 + ], + "matches": "x", + "showticklabels": false + }, + "xaxis3": { + "anchor": "y3", + "domain": [ + 0, + 0.98 + ], + "matches": "x", + "showticklabels": false + }, + "yaxis": { + "anchor": "x", + "domain": [ + 0, + 0.3133333333333333 + ], + "title": { + "text": "count" + } + }, + "yaxis2": { + "anchor": "x2", + "domain": [ + 0.34333333333333327, + 0.6566666666666665 + ], + "matches": "y", + "title": { + "text": "count" + } + }, + "yaxis3": { + "anchor": "x3", 
+ "domain": [ + 0.6866666666666665, + 0.9999999999999998 + ], + "matches": "y", + "title": { + "text": "count" + } + } + } + }, + "text/html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df = pd.DataFrame([[random.choice(['ang','polski','hiszp']), np.random.geometric(0.2)] for i in range(5000) ], columns=['jezyk', 'dlugosc'])\n", + "fig = px.histogram(df, x=\"dlugosc\",facet_row='jezyk',nbins=50, hover_data=df.columns)\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "?px.histogram" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ZADANIE 1 \n", + "\n", + "(40 punktów)\n", + "\n", + "ZNAJDŹ PRZYKŁAD TEKSTÓW Z TEJ SAMEJ DOMENY 1_000_000 słów albo nawet tłumaczenie :\n", + "- język angielski \n", + "- język polski\n", + "- język z rodziny romańskich\n", + "\n", + "Proponowane narzędzia:\n", + "- nltk\n", + "- plotly express\n", + "- biblioteka collections\n", + "- spacy (niekoniecznie)\n", + "\n", + "\n", + "Dla każdego z języków:\n", + "- policz ilosć unikalnych lowercase słów (ze stemmingiem i bez)\n", + "- policz ilosć znaków\n", + "- policz ilosć unikalnych znaków\n", + "- policz ilosć zdań zdań\n", + "- policz ilosć unikalnych zdań\n", + "- podaj min, max, średnią oraz medianę ilości znaków w słowie \n", + "- podaj min, max, średnią oraz medianę ilości słów w zdaniu, znajdz najkrotsze i najdluzsze zdania\n", + "- wygeneruj word cloud (normalnie i po usunięciu stopwordów)\n", + "- wypisz 20 najbardziej popularnych słów (normalnie i po usunięciu stopwordów) (lowercase)\n", + "- wypisz 20 najbardziej popularnych bigramów (normalnie i po usunięciu stopwordów)\n", + "- narysuj wykres częstotliwości słów (histogram lub linie) w taki sposób żeby był czytelny, wypróbuj skali logarytmicznej dla osi x (ale na razie nie dla y), usuwanie słów poniżej limitu wystąpień itp.\n", + "- punkt jak wyżej, tylko dla bigramów\n", + "- punkt jak wyżej, tylko dla znaków\n", + "- narysuj wykres barplot dla części mowy (PART OF SPEECH TAGS, tylko pierwszy stopień 
zagłębienia)\n", + "- dla próbki 10000 zdań sprawdź jak często langdetect https://pypi.org/project/langdetect/ się myli i w jaki sposób.\n", + "- zilustruj prawo Zipfa (px.line z zaznaczonymi punktami)\n", + "- napisz wnioski (10-50 zdań)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### START ZADANIA" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### KONIEC ZADANIA" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://github.com/sdadas/polish-nlp-resources\n", + "\n", + "## Indeks czytelności Foga\n", + "\n", + "\n", + "Indeks czytelności Foga (Fog index) ilustruje stopień trudności tekstu. Im wyższa liczba, tym trudniejszy jest tekst w odbiorze. Ze względu na charakterystyki różnych języków nie powinno się porównywać tekstów pisanych w różnych językach. Indeks służy do porównywania różnych tekstów w tym samym języku.\n", + "\n", + "$$FOG = 0.4\\left(\\frac{liczba\\ słów}{liczba\\ zdań} + 100\\left(\\frac{liczba\\ słów\\ skomplikowanych}{liczba\\ słów}\\right) \\right)$$\n", + "\n", + "Słowa skomplikowane mogą pochodzić ze specjalnej listy, jeżeli nie ma takiej listy, to można przyjąć, że są to słowa składające się z więcej niż 3 sylab (dla języka polskiego)\n", + "\n", + "Indeks czytelności Foga jest skutecznym narzędziem przy pisaniu tekstów. Jeżeli indeks jest zbyt wysoki, należy uprościć tekst. Można to zrobić przez redukcję średniej długości zdania $\\frac{liczba\\ słów}{liczba\\ zdań}$ oraz uproszczenie skomplikowanych lub długich słów.\n", + "\n", + "## Prawo Heapsa\n", + "\n", + "Prawo Heapsa to empiryczne prawo lingwistyczne. 
Stanowi, że liczba odmiennych słów rośnie potęgowo (z wykładnikiem <1) względem długości dokumentu.\n", + "\n", + "Ilość odmiennych słów $V_R$ względem całkowitej ilości słów w tekście $n$ można opisać wzorem:\n", + "\n", + "$$V_R(n) = Kn^{\\beta},$$\n", + "\n", + "gdzie $K$ i $\\beta$ to parametry wyznaczone empirycznie.\n", + "\n", + "Podobnie do Indeksu czytelności Foga nie powinno się porównywać różnych tekstów w różnych językach. Jednak porównanie tego samego tekstu przetłumaczonego na różne języki daje wgląd w ich charakterystykę.\n", + "\n", + "## ZADANIE 1\n", + "\n", + "(50 punktów)\n", + "\n", + "Znajdź teksty w języku polskim (mają składać się z 5 osobnych dokumentów każdy, długości powinny być różne):\n", + "- tekst prawny\n", + "- tekst naukowy\n", + "- tekst z polskiej powieści (np. wolne lektury)\n", + "- tekst z polskiego internetu (reddit, wykop, komentarze)\n", + "- transkrypcja tekstu mówionego\n", + "\n", + "\n", + "\n", + "ZADANIA:\n", + "- zilustruj gunning fog index (oś y) i średnią długość zdania (oś x) na jednym wykresie dla wszystkich tekstów, domeny oznacz kolorami (px.scatter), dla języka polskiego traktuj wyrazy długie jako te powyżej 3 sylab, możesz użyć https://pyphen.org/ do liczenia sylab\n", + "- zilustruj prawo Heapsa dla wszystkich tekstów na jednym wykresie, domeny oznacz kolorami (px.scatter)\n", + "- napisz wnioski (10-50 zdań)\n", + "\n", + "\n", + "#### START ZADANIA\n", + "\n", + "#### KONIEC ZADANIA\n", + "\n", + "## WYKONANIE ZADAŃ\n", + "Zgodnie z instrukcją 01_Kodowanie_tekstu.ipynb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "author": "Jakub Pokrywka", + "email": "kubapok@wmi.amu.edu.pl", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "lang": "pl", + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + }, + "subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]", + "title": "Ekstrakcja informacji", + "year": "2021" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/cw/03_statystyczny_model_językowy.ipynb b/cw/03_statystyczny_model_językowy.ipynb deleted file mode 100644 index 9ce2cd4..0000000 --- a/cw/03_statystyczny_model_językowy.ipynb +++ /dev/null @@ -1,176 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", - "
\n", - "

Ekstrakcja informacji

\n", - "

0. Jezyk [ćwiczenia]

\n", - "

Jakub Pokrywka (2022)

\n", - "
\n", - "\n", - "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" - ] - }, - { - "cell_type": "code", - "execution_count": 278, - "metadata": {}, - "outputs": [], - "source": [ - "NR_INDEKSU = 375985" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "https://web.stanford.edu/~jurafsky/slp3/3.pdf" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "class Model():\n", - " \n", - " def __init__(self, vocab_size=30_000, UNK_token= ''):\n", - " pass\n", - " \n", - " def train(corpus:list) -> None:\n", - " pass\n", - " \n", - " def get_conditional_prob_for_word(text: list, word: str) -> float:\n", - " pass\n", - " \n", - " def get_prob_for_text(text: list) -> float:\n", - " pass\n", - " \n", - " def most_probable_next_word(text:list) -> str:\n", - " 'nie powinien zwracań nigdy '\n", - " pass\n", - " \n", - " def high_probable_next_word(text:list) -> str:\n", - " 'nie powinien zwracań nigdy '\n", - " pass\n", - " \n", - " def generate_text(text_beggining:list, length: int, greedy: bool) -> list:\n", - " 'nie powinien zwracań nigdy '\n", - " pass" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "def get_ppl(text: list) -> float:\n", - " pass" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "def get_entropy(text: list) -> float:\n", - " pass" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- wybierz tekst w dowolnym języku (10_000_000 słów)\n", - "- podziel zbiór na train/test w proporcji 90/100\n", - "- stworzyć unigramowy model językowy\n", - "- stworzyć bigramowy model językowy\n", - "- stworzyć trigramowy model językowy\n", - "- wymyśl 5 krótkich zdań. 
Policz ich prawdopodobieństwo\n", - "- napisz włąsnoręcznie funkcję, która liczy perplexity na korpusie i policz perplexity na każdym z modeli dla train i test\n", - "- wygeneruj tekst, zaczynając od wymyślonych 5 początków. Postaraj się, żeby dla obu funkcji, a przynajmniej dla high_probable_next_word teksty były orginalne. Czy wynik będzię sie róźnił dla tekstów np.\n", - "`We sketch how Loomis–Whitney follows from this: Indeed, let X be a uniformly distributed random variable with values` oraz `random variable with values`?\n", - "- stwórz model dla korpusu z ZADANIE 1 i policz perplexity dla każdego z tekstów (zrób split 90/10) dla train i test\n", - "\n", - "- klasyfikacja za pomocą modelu językowego\n", - "- wygładzanie metodą laplace'a" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### START ZADANIA" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### KONIEC ZADANIA" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- znajdź duży zbiór danych dla klasyfikacji binarnej, wytrenuj osobne modele dla każdej z klas i użyj dla klasyfikacji. 
Warunkiem zaliczenia jest uzyskanie wyniku większego niż baseline (zwracanie zawsze bardziej licznej klasy)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## WYKONANIE ZADAŃ\n", - "Zgodnie z instrukcją 01_Kodowanie_tekstu.ipynb" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Teoria informacji" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Wygładzanie modeli językowych" - ] - } - ], - "metadata": { - "author": "Jakub Pokrywka", - "email": "kubapok@wmi.amu.edu.pl", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "lang": "pl", - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.3" - }, - "subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]", - "title": "Ekstrakcja informacji", - "year": "2021" - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/cw/04_statystyczny_model_językowy.ipynb b/cw/04_statystyczny_model_językowy.ipynb index aba0f1d..889a655 100644 --- a/cw/04_statystyczny_model_językowy.ipynb +++ b/cw/04_statystyczny_model_językowy.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 278, "metadata": {}, "outputs": [], "source": [ @@ -32,25 +32,40 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "class Model():\n", " \n", - " def __init__(self, vocab_size, UNK_token= ''):\n", + " def __init__(self, vocab_size=30_000, UNK_token= ''):\n", " pass\n", " \n", " def train(corpus:list) -> None:\n", " pass\n", " \n", - " def predict(text: list, probs: str) -> float:\n", + " def get_conditional_prob_for_word(text: list, word: str) -> float:\n", + " pass\n", + " \n", + " def get_prob_for_text(text: list) -> 
float:\n", + " pass\n", + " \n", + " def most_probable_next_word(text:list) -> str:\n", + " 'nie powinien zwracań nigdy '\n", + " pass\n", + " \n", + " def high_probable_next_word(text:list) -> str:\n", + " 'nie powinien zwracań nigdy '\n", + " pass\n", + " \n", + " def generate_text(text_beggining:list, length: int, greedy: bool) -> list:\n", + " 'nie powinien zwracań nigdy '\n", " pass" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -60,186 +75,75 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ - "text = 'Pani Ala ma kota oraz ładnego pieska i 3 chomiki'" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "text_splitted = text.split(' ')" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['Pani', 'Ala', 'ma', 'kota', 'oraz', 'ładnego', 'pieska', 'i', '3', 'chomiki']" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "text_splitted" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "text_masked = text_splitted[:4] + [''] + text_splitted[5:]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Pani',\n", - " 'Ala',\n", - " 'ma',\n", - " 'kota',\n", - " '',\n", - " 'ładnego',\n", - " 'pieska',\n", - " 'i',\n", - " '3',\n", - " 'chomiki']" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "text_masked" + "def get_entropy(text: list) -> float:\n", + " pass" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "trigram_model działa na ['ma', 'kota', <'MASK>']" - ] - }, - { - "cell_type": "markdown", 
- "metadata": {}, - "source": [ - "trigram_model.predict(['ma', 'kota']) → 'i:0.55 oraz:0.25 czarnego:0.1 :0.1'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ZADANIE:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "g1 = [470618, 415366, 434695, 470611, 470607]\n", - "g2 = [440054, 434742, 434760, 434784, 434788]\n", - "g3 = [434804, 430705, 470609, 470619, 434704]\n", - "g4 = [434708, 470629, 434732, 434749, 426206]\n", - "g5 = [434766, 470628, 437622, 434780, 470627, 440058]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "model trigramowy odwrotny\n" - ] - } - ], - "source": [ - "if NR_INDEKSU in g1:\n", - " print('model bigramowy standardowy')\n", - "elif NR_INDEKSU in g2:\n", - " print('model bigramowy odwrotny')\n", - "elif NR_INDEKSU in g3:\n", - " print('model trigramowy')\n", - "elif NR_INDEKSU in g4:\n", - " print('model trigramowy odwrotny')\n", - "elif NR_INDEKSU in g5:\n", - " print('model trigramowy ze zgadywaniem środka')\n", - "else:\n", - " print('proszę zgłosić się do prowadzącego')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### gonito:\n", - "- zapisanie do achievmentu przez start working\n", - "- send to review" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### ZADANIE\n", + "- wybierz tekst w dowolnym języku (10_000_000 słów)\n", + "- podziel zbiór na train/test w proporcji 90/10\n", + "- stworzyć unigramowy model językowy\n", + "- stworzyć bigramowy model językowy\n", + "- stworzyć trigramowy model językowy\n", + "- wymyśl 5 krótkich zdań. Policz ich prawdopodobieństwo\n", + "- napisz własnoręcznie funkcję, która liczy perplexity na korpusie i policz perplexity na każdym z modeli dla train i test\n", + "- wygeneruj tekst, zaczynając od wymyślonych 5 początków. 
Postaraj się, żeby dla obu funkcji, a przynajmniej dla high_probable_next_word teksty były oryginalne. Czy wynik będzie się różnił dla tekstów np.\n", + "`We sketch how Loomis–Whitney follows from this: Indeed, let X be a uniformly distributed random variable with values` oraz `random variable with values`?\n", + "- stwórz model dla korpusu z ZADANIE 1 i policz perplexity dla każdego z tekstów (zrób split 90/10) dla train i test\n", "\n", - "Proszę stworzyć rozwiązanie modelu (komórka wyżej) dla https://gonito.net/challenge/challenging-america-word-gap-prediction i umieścić je na platformie gonito\n", - " \n", - "Warunki zaliczenia:\n", - "- wynik widoczny na platformie zarówno dla dev i dla test\n", - "- wynik dla dev i test lepszy (niższy) od 1024.00\n", - "- deadline do końca dnia 27.04\n", - "- commitując rozwiązanie proszę również umieścić rozwiązanie w pliku /run.py (czyli na szczycie katalogu). Można przekonwertować jupyter do pliku python przez File → Download as → Python. Rozwiązanie nie musi być w pythonie, może być w innym języku.\n", - "- zadania wykonujemy samodzielnie\n", - "- w nazwie commita podaj nr indeksu\n", - "- w tagach podaj \"n-grams\" (należy zatwierdzić przecinkiem po wybraniu tagu)!\n", - "\n", - "Uwagi:\n", - "\n", - "- warto wymyślić jakąś metodę wygładazania, bez tego może być bardzo kiepski wynik\n", - "- nie trzeba korzystać z całego zbioru trenującego\n", - "- zadanie to 50 punktów, za najlepsze rozwiązanie w swojej grupie (g1,g2,g3,g4,g5), przyznaję dodatkowo 40 punktów\n", - "- punkty będą przyznane na gonito\n", - "- warto monitorować RAM, próbować z różnym vocab_size, można skorzystać z pythonowego Counter\n", - "- warto sobie zrobić dodatkowo model unigramowy w ramach ćwiczenia" + "- klasyfikacja za pomocą modelu językowego\n", + "- wygładzanie metodą laplace'a" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "#### START ZADANIA" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### KONIEC ZADANIA" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- znajdź duży zbiór danych dla klasyfikacji binarnej, wytrenuj osobne modele dla każdej z klas i użyj dla klasyfikacji. Warunkiem zaliczenia jest uzyskanie wyniku większego niż baseline (zwracanie zawsze bardziej licznej klasy)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## WYKONANIE ZADAŃ\n", + "Zgodnie z instrukcją 01_Kodowanie_tekstu.ipynb" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Teoria informacji" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wygładzanie modeli językowych" + ] } ], "metadata": { diff --git a/cw/05_statystyczny_model_językowy_część_2.ipynb b/cw/05_statystyczny_model_językowy_część_2.ipynb new file mode 100644 index 0000000..640a5dc --- /dev/null +++ b/cw/05_statystyczny_model_językowy_część_2.ipynb @@ -0,0 +1,272 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

5. Statystyczny model językowy część 2 [ćwiczenia]

\n", + "

Jakub Pokrywka (2022)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "NR_INDEKSU = 375985" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://web.stanford.edu/~jurafsky/slp3/3.pdf" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "class Model():\n", + " \n", + " def __init__(self, vocab_size, UNK_token= ''):\n", + " pass\n", + " \n", + " def train(corpus:list) -> None:\n", + " pass\n", + " \n", + " def predict(text: list, probs: str) -> float:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def get_ppl(text: list) -> float:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "text = 'Pani Ala ma kota oraz ładnego pieska i 3 chomiki'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "text_splitted = text.split(' ')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['Pani', 'Ala', 'ma', 'kota', 'oraz', 'ładnego', 'pieska', 'i', '3', 'chomiki']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_splitted" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "text_masked = text_splitted[:4] + [''] + text_splitted[5:]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Pani',\n", + " 'Ala',\n", + " 'ma',\n", + " 'kota',\n", + " '',\n", + " 'ładnego',\n", + " 'pieska',\n", + " 'i',\n", + " '3',\n", + " 'chomiki']" + ] + 
}, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_masked" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "trigram_model działa na ['ma', 'kota', <'MASK>']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "trigram_model.predict(['ma', 'kota']) → 'i:0.55 oraz:0.25 czarnego:0.1 :0.1'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ZADANIE:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "g1 = [470618, 415366, 434695, 470611, 470607]\n", + "g2 = [440054, 434742, 434760, 434784, 434788]\n", + "g3 = [434804, 430705, 470609, 470619, 434704]\n", + "g4 = [434708, 470629, 434732, 434749, 426206]\n", + "g5 = [434766, 470628, 437622, 434780, 470627, 440058]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "model trigramowy odwrotny\n" + ] + } + ], + "source": [ + "if NR_INDEKSU in g1:\n", + " print('model bigramowy standardowy')\n", + "elif NR_INDEKSU in g2:\n", + " print('model bigramowy odwrotny')\n", + "elif NR_INDEKSU in g3:\n", + " print('model trigramowy')\n", + "elif NR_INDEKSU in g4:\n", + " print('model trigramowy odwrotny')\n", + "elif NR_INDEKSU in g5:\n", + " print('model trigramowy ze zgadywaniem środka')\n", + "else:\n", + " print('proszę zgłosić się do prowadzącego')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### gonito:\n", + "- zapisanie do achievmentu przez start working\n", + "- send to review" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ZADANIE\n", + "\n", + "Proszę stworzyć rozwiązanie modelu (komórka wyżej) dla https://gonito.net/challenge/challenging-america-word-gap-prediction i umieścić je na platformie gonito\n", + " \n", + "Warunki zaliczenia:\n", + "- wynik widoczny na 
platformie zarówno dla dev i dla test\n", + "- wynik dla dev i test lepszy (niższy) od 1024.00\n", + "- deadline do końca dnia 27.04\n", + "- commitując rozwiązanie proszę również umieścić rozwiązanie w pliku /run.py (czyli na szczycie katalogu). Można przekonwertować jupyter do pliku python przez File → Download as → Python. Rozwiązanie nie musi być w pythonie, może być w innym języku.\n", + "- zadania wykonujemy samodzielnie\n", + "- w nazwie commita podaj nr indeksu\n", + "- w tagach podaj \"n-grams\" (należy zatwierdzić przecinkiem po wybraniu tagu)!\n", + "\n", + "Uwagi:\n", + "\n", + "- warto wymyślić jakąś metodę wygładzania, bez tego może być bardzo kiepski wynik\n", + "- nie trzeba korzystać z całego zbioru trenującego\n", + "- zadanie to 70 punktów, za najlepsze rozwiązanie w swojej grupie przyznaję dodatkowo 40 punktów\n", + "- punkty będą przyznane na gonito\n", + "- warto monitorować RAM, próbować z różnym vocab_size, można skorzystać z pythonowego Counter\n", + "- warto sobie zrobić dodatkowo model unigramowy w ramach ćwiczenia" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "author": "Jakub Pokrywka", + "email": "kubapok@wmi.amu.edu.pl", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "lang": "pl", + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + }, + "subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]", + "title": "Ekstrakcja informacji", + "year": "2021" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/cw/05_wygładzanie_modeli_językowych.ipynb b/cw/06_wygładzanie_modeli_językowych.ipynb similarity index 99% rename from cw/05_wygładzanie_modeli_językowych.ipynb rename to 
cw/06_wygładzanie_modeli_językowych.ipynb index 09261d2..838378c 100644 --- a/cw/05_wygładzanie_modeli_językowych.ipynb +++ b/cw/06_wygładzanie_modeli_językowych.ipynb @@ -7,7 +7,7 @@ "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", "
\n", "

Modelowanie Języka

\n", - "

5. Wygłazanie modeli językowych [ćwiczenia]

\n", + "

6. Wygładzanie modeli językowych [ćwiczenia]

\n", "

Jakub Pokrywka (2022)

\n", "
\n", "\n", diff --git a/cw/06_biblioteki_STM.ipynb b/cw/07_biblioteki_STM.ipynb similarity index 99% rename from cw/06_biblioteki_STM.ipynb rename to cw/07_biblioteki_STM.ipynb index d737d9c..2aec6e1 100644 --- a/cw/06_biblioteki_STM.ipynb +++ b/cw/07_biblioteki_STM.ipynb @@ -7,7 +7,7 @@ "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", "
\n", "

Modelowanie Języka

\n", - "

6. biblioteki LM [ćwiczenia]

\n", + "

7. biblioteki STM [ćwiczenia]

\n", "

Jakub Pokrywka (2022)

\n", "
\n", "\n", diff --git a/cw/07_neuronowe_modele_językowe.ipynb b/cw/08_neuronowe_modele_językowe.ipynb similarity index 98% rename from cw/07_neuronowe_modele_językowe.ipynb rename to cw/08_neuronowe_modele_językowe.ipynb index e3c1174..06e7f2e 100644 --- a/cw/07_neuronowe_modele_językowe.ipynb +++ b/cw/08_neuronowe_modele_językowe.ipynb @@ -7,7 +7,7 @@ "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", "
\n", "

Modelowanie Języka

\n", - "

7. Model neuronowy ff [ćwiczenia]

\n", + "

8. Neuronowe modele językowe [ćwiczenia]

\n", "

Jakub Pokrywka (2022)

\n", "
\n", "\n", diff --git a/cw/08_Model_neuronowy_typu_word2vec.ipynb b/cw/09_Model_neuronowy_typu_word2vec.ipynb similarity index 98% rename from cw/08_Model_neuronowy_typu_word2vec.ipynb rename to cw/09_Model_neuronowy_typu_word2vec.ipynb index 3c6a62b..77bf725 100644 --- a/cw/08_Model_neuronowy_typu_word2vec.ipynb +++ b/cw/09_Model_neuronowy_typu_word2vec.ipynb @@ -7,7 +7,7 @@ "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", "
\n", "

Modelowanie Języka

\n", - "

8. Model neuronowy typu word2vec [ćwiczenia]

\n", + "

9. Model neuronowy typu word2vec [ćwiczenia]

\n", "

Jakub Pokrywka (2022)

\n", "
\n", "\n", @@ -133,7 +133,7 @@ "author": "Jakub Pokrywka", "email": "kubapok@wmi.amu.edu.pl", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -148,7 +148,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.8.3" }, "subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]", "title": "Ekstrakcja informacji", diff --git a/cw/09_Model_neuronowy_rekurencyjny.ipynb b/cw/10_Model_neuronowy_rekurencyjny.ipynb similarity index 98% rename from cw/09_Model_neuronowy_rekurencyjny.ipynb rename to cw/10_Model_neuronowy_rekurencyjny.ipynb index cb0302c..0280683 100644 --- a/cw/09_Model_neuronowy_rekurencyjny.ipynb +++ b/cw/10_Model_neuronowy_rekurencyjny.ipynb @@ -7,7 +7,7 @@ "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", "
\n", "

Modelowanie Języka

\n", - "

9. Model neuronowy rekurencyjny [ćwiczenia]

\n", + "

10. Model neuronowy rekurencyjny [ćwiczenia]

\n", "

Jakub Pokrywka (2022)

\n", "
\n", "\n", @@ -952,6 +952,38 @@ "source": [ "predict(dataset, model, 'kmicic szedł')" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ZADANIE 1\n", + "\n", + "Stworzyć sieć rekurencyjną GRU dla Challenging America word-gap prediction. Wymogi takie jak zawsze, zadanie widoczne na gonito" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ZADANIE 2\n", + "\n", + "Podjąć wyzwanie na https://gonito.net/challenge/precipitation-pl i/lub https://gonito.net/challenge/book-dialogues-pl\n", + "\n", + "\n", + "**KONIECZNIE** należy je zgłosić do końca następnego piątku, czyli 20 maja! Za późniejsze zgłoszenia (nawet minutę) nie przyznaję punktów.\n", + " \n", + "Za każde zgłoszenie lepsze niż baseline przyznaję 40 punktów.\n", + "\n", + "Zamiast tych 40 punktów za najlepsze miejsca:\n", + "- 1. miejsce 150 punktów\n", + "- 2. miejsce 100 punktów\n", + "- 3. miejsce 70 punktów\n", + "\n", + "Można brać udział w 2 wyzwaniach jednocześnie.\n", + "\n", + "Zadania nie będą widoczne w gonito w achievements. Nie trzeba udostępniać kodu, należy jednak przestrzegać regulaminu wyzwań." + ] } ], "metadata": { diff --git a/cw/11_Model_rekurencyjny_z_atencją.ipynb b/cw/11_Model_rekurencyjny_z_atencją.ipynb deleted file mode 100644 index 3cedddb..0000000 --- a/cw/11_Model_rekurencyjny_z_atencją.ipynb +++ /dev/null @@ -1,517 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", - "
\n", - "

Modelowanie Języka

\n", - "

10. Model rekurencyjny z atencją [ćwiczenia]

\n", - "

Jakub Pokrywka (2022)

\n", - "
\n", - "\n", - "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "notebook na podstawie:\n", - "\n", - "# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from __future__ import unicode_literals, print_function, division\n", - "from io import open\n", - "import unicodedata\n", - "import string\n", - "import re\n", - "import random\n", - "\n", - "import torch\n", - "import torch.nn as nn\n", - "from torch import optim\n", - "import torch.nn.functional as F\n", - "\n", - "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "SOS_token = 0\n", - "EOS_token = 1\n", - "\n", - "class Lang:\n", - " def __init__(self):\n", - " self.word2index = {}\n", - " self.word2count = {}\n", - " self.index2word = {0: \"SOS\", 1: \"EOS\"}\n", - " self.n_words = 2 # Count SOS and EOS\n", - "\n", - " def addSentence(self, sentence):\n", - " for word in sentence.split(' '):\n", - " self.addWord(word)\n", - "\n", - " def addWord(self, word):\n", - " if word not in self.word2index:\n", - " self.word2index[word] = self.n_words\n", - " self.word2count[word] = 1\n", - " self.index2word[self.n_words] = word\n", - " self.n_words += 1\n", - " else:\n", - " self.word2count[word] += 1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pairs = []\n", - "with open('data/eng-pol.txt') as f:\n", - " for line in f:\n", - " eng_line, pol_line = line.lower().rstrip().split('\\t')\n", - "\n", - " eng_line = re.sub(r\"([.!?])\", r\" \\1\", eng_line)\n", - " eng_line = re.sub(r\"[^a-zA-Z.!?]+\", r\" \", eng_line)\n", - "\n", - " pol_line = 
re.sub(r\"([.!?])\", r\" \\1\", pol_line)\n", - " pol_line = re.sub(r\"[^a-zA-Z.!?ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]+\", r\" \", pol_line)\n", - "\n", - " pairs.append([eng_line, pol_line])\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pairs[1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "MAX_LENGTH = 10\n", - "eng_prefixes = (\n", - " \"i am \", \"i m \",\n", - " \"he is\", \"he s \",\n", - " \"she is\", \"she s \",\n", - " \"you are\", \"you re \",\n", - " \"we are\", \"we re \",\n", - " \"they are\", \"they re \"\n", - ")\n", - "\n", - "pairs = [p for p in pairs if len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH]\n", - "pairs = [p for p in pairs if p[0].startswith(eng_prefixes)]\n", - "\n", - "eng_lang = Lang()\n", - "pol_lang = Lang()\n", - "\n", - "for pair in pairs:\n", - " eng_lang.addSentence(pair[0])\n", - " pol_lang.addSentence(pair[1])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pairs[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pairs[1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pairs[2]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "eng_lang.n_words" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pol_lang.n_words" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class EncoderRNN(nn.Module):\n", - " def __init__(self, input_size, embedding_size, hidden_size):\n", - " super(EncoderRNN, self).__init__()\n", - " self.embedding_size = 200\n", - " self.hidden_size = 
hidden_size\n", - "\n", - " self.embedding = nn.Embedding(input_size, self.embedding_size)\n", - " self.gru = nn.GRU(self.embedding_size, hidden_size)\n", - "\n", - " def forward(self, input, hidden):\n", - " embedded = self.embedding(input).view(1, 1, -1)\n", - " output = embedded\n", - " output, hidden = self.gru(output, hidden)\n", - " return output, hidden\n", - "\n", - " def initHidden(self):\n", - " return torch.zeros(1, 1, self.hidden_size, device=device)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class DecoderRNN(nn.Module):\n", - " def __init__(self, embedding_size, hidden_size, output_size):\n", - " super(DecoderRNN, self).__init__()\n", - " self.embedding_size = embedding_size\n", - " self.hidden_size = hidden_size\n", - "\n", - " self.embedding = nn.Embedding(output_size, self.embedding_size)\n", - " self.gru = nn.GRU(self.embedding_size, hidden_size)\n", - " self.out = nn.Linear(hidden_size, output_size)\n", - " self.softmax = nn.LogSoftmax(dim=1)\n", - "\n", - " def forward(self, input, hidden):\n", - " output = self.embedding(input).view(1, 1, -1)\n", - " output = F.relu(output)\n", - " output, hidden = self.gru(output, hidden)\n", - " output = self.softmax(self.out(output[0]))\n", - " return output, hidden\n", - "\n", - " def initHidden(self):\n", - " return torch.zeros(1, 1, self.hidden_size, device=device)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class AttnDecoderRNN(nn.Module):\n", - " def __init__(self, embedding_size, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):\n", - " super(AttnDecoderRNN, self).__init__()\n", - " self.embedding_size = embedding_size\n", - " self.hidden_size = hidden_size\n", - " self.output_size = output_size\n", - " self.dropout_p = dropout_p\n", - " self.max_length = max_length\n", - "\n", - " self.embedding = nn.Embedding(self.output_size, 
self.embedding_size)\n", - " self.attn = nn.Linear(self.hidden_size + self.embedding_size, self.max_length)\n", - " self.attn_combine = nn.Linear(self.hidden_size + self.embedding_size, self.embedding_size)\n", - " self.dropout = nn.Dropout(self.dropout_p)\n", - " self.gru = nn.GRU(self.embedding_size, self.hidden_size)\n", - " self.out = nn.Linear(self.hidden_size, self.output_size)\n", - "\n", - " def forward(self, input, hidden, encoder_outputs):\n", - " embedded = self.embedding(input).view(1, 1, -1)\n", - " embedded = self.dropout(embedded)\n", - "\n", - " attn_weights = F.softmax(\n", - " self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)\n", - " attn_applied = torch.bmm(attn_weights.unsqueeze(0),\n", - " encoder_outputs.unsqueeze(0))\n", - " #import pdb; pdb.set_trace()\n", - "\n", - " output = torch.cat((embedded[0], attn_applied[0]), 1)\n", - " output = self.attn_combine(output).unsqueeze(0)\n", - "\n", - " output = F.relu(output)\n", - " output, hidden = self.gru(output, hidden)\n", - "\n", - " output = F.log_softmax(self.out(output[0]), dim=1)\n", - " return output, hidden, attn_weights\n", - "\n", - " def initHidden(self):\n", - " return torch.zeros(1, 1, self.hidden_size, device=device)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def tensorFromSentence(sentence, lang):\n", - " indexes = [lang.word2index[word] for word in sentence.split(' ')]\n", - " indexes.append(EOS_token)\n", - " return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "teacher_forcing_ratio = 0.5\n", - "\n", - "def train_one_batch(input_tensor, target_tensor, encoder, decoder, optimizer, criterion, max_length=MAX_LENGTH):\n", - " encoder_hidden = encoder.initHidden()\n", - "\n", - "\n", - " optimizer.zero_grad()\n", - "\n", - " input_length = input_tensor.size(0)\n", - " 
target_length = target_tensor.size(0)\n", - "\n", - " encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)\n", - "\n", - " loss = 0\n", - "\n", - " for ei in range(input_length):\n", - " encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)\n", - " encoder_outputs[ei] = encoder_output[0, 0]\n", - "\n", - " decoder_input = torch.tensor([[SOS_token]], device=device)\n", - "\n", - " decoder_hidden = encoder_hidden\n", - "\n", - " use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False\n", - "\n", - " if use_teacher_forcing:\n", - " for di in range(target_length):\n", - " decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)\n", - " loss += criterion(decoder_output, target_tensor[di])\n", - " decoder_input = target_tensor[di] # Teacher forcing\n", - "\n", - " else:\n", - " for di in range(target_length):\n", - " decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)\n", - " topv, topi = decoder_output.topk(1)\n", - " decoder_input = topi.squeeze().detach() # detach from history as input\n", - "\n", - " loss += criterion(decoder_output, target_tensor[di])\n", - " if decoder_input.item() == EOS_token:\n", - " break\n", - "\n", - " loss.backward()\n", - "\n", - " optimizer.step()\n", - "\n", - " return loss.item() / target_length" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01):\n", - " print_loss_total = 0 # Reset every print_every\n", - " encoder.train()\n", - " decoder.train()\n", - "\n", - " optimizer = optim.SGD(list(encoder.parameters()) + list(decoder.parameters()), lr=learning_rate)\n", - " \n", - " training_pairs = [random.choice(pairs) for _ in range(n_iters)]\n", - " training_pairs = [(tensorFromSentence(p[0], eng_lang), 
tensorFromSentence(p[1], pol_lang)) for p in training_pairs]\n", - " \n", - " criterion = nn.NLLLoss()\n", - "\n", - " for i in range(1, n_iters + 1):\n", - " training_pair = training_pairs[i - 1]\n", - " input_tensor = training_pair[0]\n", - " target_tensor = training_pair[1]\n", - "\n", - " loss = train_one_batch(input_tensor,\n", - " target_tensor,\n", - " encoder,\n", - " decoder,\n", - " optimizer,\n", - "\n", - " criterion)\n", - " \n", - " print_loss_total += loss\n", - "\n", - " if i % print_every == 0:\n", - " print_loss_avg = print_loss_total / print_every\n", - " print_loss_total = 0\n", - " print(f'iter: {i}, loss: {print_loss_avg}')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):\n", - " encoder.eval()\n", - " decoder.eval()\n", - " with torch.no_grad():\n", - " input_tensor = tensorFromSentence(sentence, eng_lang)\n", - " input_length = input_tensor.size()[0]\n", - " encoder_hidden = encoder.initHidden()\n", - "\n", - " encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)\n", - "\n", - " for ei in range(input_length):\n", - " encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)\n", - " encoder_outputs[ei] += encoder_output[0, 0]\n", - "\n", - " decoder_input = torch.tensor([[SOS_token]], device=device)\n", - "\n", - " decoder_hidden = encoder_hidden\n", - "\n", - " decoded_words = []\n", - " decoder_attentions = torch.zeros(max_length, max_length)\n", - "\n", - " for di in range(max_length):\n", - " decoder_output, decoder_hidden, decoder_attention = decoder(\n", - " decoder_input, decoder_hidden, encoder_outputs)\n", - " decoder_attentions[di] = decoder_attention.data\n", - " topv, topi = decoder_output.data.topk(1)\n", - " if topi.item() == EOS_token:\n", - " decoded_words.append('')\n", - " break\n", - " else:\n", - " 
decoded_words.append(pol_lang.index2word[topi.item()])\n", - "\n", - " decoder_input = topi.squeeze().detach()\n", - "\n", - " return decoded_words, decoder_attentions[:di + 1]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def evaluateRandomly(encoder, decoder, n=10):\n", - " for i in range(n):\n", - " pair = random.choice(pairs)\n", - " print('>', pair[0])\n", - " print('=', pair[1])\n", - " output_words, attentions = evaluate(encoder, decoder, pair[0])\n", - " output_sentence = ' '.join(output_words)\n", - " print('<', output_sentence)\n", - " print('')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embedding_size = 200\n", - "hidden_size = 256\n", - "encoder1 = EncoderRNN(eng_lang.n_words, embedding_size, hidden_size).to(device)\n", - "attn_decoder1 = AttnDecoderRNN(embedding_size, hidden_size, pol_lang.n_words, dropout_p=0.1).to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainIters(encoder1, attn_decoder1, 10_000, print_every=50)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "evaluateRandomly(encoder1, attn_decoder1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## ZADANIE\n", - "\n", - "Gonito \"WMT2017 Czech-English machine translation challenge for news \"\n", - "\n", - "Proszę wytrenować najpierw model german -> english, a później dotrenować na czech-> english.\n", - "Można wziąć inicjalizować enkoder od nowa lub nie. Proszę w każdym razie użyć wytrenowanego dekodera." 
- ] - } - ], - "metadata": { - "author": "Jakub Pokrywka", - "email": "kubapok@wmi.amu.edu.pl", - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "lang": "pl", - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.4" - }, - "subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]", - "title": "Ekstrakcja informacji", - "year": "2021" - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/cw/11_regularyzacja_modeli_neuronowych.ipynb b/cw/11_regularyzacja_modeli_neuronowych.ipynb new file mode 100644 index 0000000..5ffa351 --- /dev/null +++ b/cw/11_regularyzacja_modeli_neuronowych.ipynb @@ -0,0 +1,126 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Modelowanie Języka

\n", + "

11. Regularyzacja modeli neuronowych [ćwiczenia]

\n", + "

Jakub Pokrywka (2022)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overfitting modeli" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Trenując model uczenia maszynowego zależy nam, aby model miał dobrą zdolność predykcji. Zdolności predykcyjne powinny być wysokie na jakichkolwiek danych, a nie wyłącznie na tych, na których model się uczył. \n", + "\n", + "\n", + "Zjawiskiem overfittingu modeli nazywamy nadmierne dopasowanie modelu do zbioru trenującego. Skutkuje to tym, że model świetnie działa na zbiorze trenującym, ale źle dla innych danych, na których się nie uczył.\n", + "\n", + "Overfitting modelu łatwo sprawdzić jako różnicę w metrykach między zbiorem trenującym a zbiorem deweloperskim/testowym. Im większa jest ta różnica, tym większy overfitting modelu.\n", + "\n", + "Zazwyczaj overfitting będzie występował do pewnego stopnia. Nie należy się tym przejmować. Najważniejsze jest, aby model miał jak najlepszy wynik metryki na zbiorze deweloperskim/testowym. 
Nawet kosztem overfittingu.\n", + "\n", + "Aby zmniejszyć overfitting (a tym samym zwiększyć wyniki modelu na zbiorze deweloperskim/testowym), korzysta się z metod regularyzacji.\n", + "\n", + "## Regularyzacja modelu\n", + "\n", + "Najbardziej powszechne metody regularyzacji to:\n", + "\n", + "- regularyzacja L1\n", + "- regularyzacja L2\n", + "- dropout" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### regularyzacja L1\n", + "\n", + "Czynnik regularyzacyjny to $\\lambda \\sum_{i=1}^{N}|w_i|$, gdzie $0<\\lambda$ to parametr, a $w_i$ to parametry modelu.\n", + "\n", + "Wtedy funkcja kosztu powinna wyglądać: $L(x) = Error(y,\\bar{y}) + \\lambda \\sum_{i=1}^{N}|w_i|$.\n", + "\n", + "\n", + "### regularyzacja L2\n", + "\n", + "\n", + "Czynnik regularyzacyjny to $\\lambda \\sum_{i=1}^{N}(w_i)^2$, gdzie $0<\\lambda$ to parametr, a $w_i$ to parametry modelu.\n", + "\n", + "Wtedy funkcja kosztu powinna wyglądać: $L(x) = Error(y,\\bar{y}) + \\lambda \\sum_{i=1}^{N}(w_i)^2$." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dropout\n", + "\n", + "Dropout to technika polegająca na losowym wygaszaniu wyjść z neuronów (przyjmowanie wartości $0$) podczas treningu. Prawdopodobieństwo ignorowania to parametr $p$. Podczas inferencji nie wygasza się wyjścia, natomiast wszystkie wartości przemnaża się przez $1-p$." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Zadanie 1 \n", + "\n", + "Wzorując się na poprzednich zajęciach zaimplementować powyższe metody regularyzacji i zgłosić na gonito.\n", + "\n", + "Warunki zaliczenia:\n", + "- wynik widoczny na platformie zarówno dla dev i dla test\n", + "- wynik dla dev i test lepszy (niższy) niż 1024.00 (liczone przy pomocy geval)\n", + "- deadline do końca dnia 24.04\n", + "- commitując rozwiązanie proszę również umieścić rozwiązanie w pliku /run.py (czyli na szczycie katalogu). 
Można przekonwertować jupyter do pliku python przez File → Download as → Python. Rozwiązanie nie musi być w pythonie, może być w innym języku.\n", + "- zadania wykonujemy samodzielnie\n", + "- w nazwie commita podaj nr indeksu\n", + "- w tagach podaj **neural-network** oraz **bigram**!\n", + "- uwaga na specjalne znaki \\\\n w pliku 'in.tsv' oraz pierwsze kolumny pliku in.tsv (które należy usunąć)\n", + "\n", + "Punktacja:\n", + "- 50 punktów za najlepszy wynik z 2 grup\n" + ] + } + ], + "metadata": { + "author": "Jakub Pokrywka", + "email": "kubapok@wmi.amu.edu.pl", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "lang": "pl", + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + }, + "subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]", + "title": "Ekstrakcja informacji", + "year": "2021" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/cw/10_Ensemble_oraz_Model_neuronowy_rekurencyjny2.ipynb b/cw/12_Ensemble_modeli.ipynb similarity index 98% rename from cw/10_Ensemble_oraz_Model_neuronowy_rekurencyjny2.ipynb rename to cw/12_Ensemble_modeli.ipynb index a19ed12..c761174 100644 --- a/cw/10_Ensemble_oraz_Model_neuronowy_rekurencyjny2.ipynb +++ b/cw/12_Ensemble_modeli.ipynb @@ -7,7 +7,7 @@ "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", "
\n", "

Modelowanie Języka

\n", - "

10. Model neuronowy rekurencyjny [ćwiczenia]

\n", + "

12. Model neuronowy rekurencyjny [ćwiczenia]

\n", "

Jakub Pokrywka (2022)

\n", "
\n", "\n", @@ -308,7 +308,7 @@ "author": "Jakub Pokrywka", "email": "kubapok@wmi.amu.edu.pl", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -323,7 +323,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + "version": "3.8.3" }, "subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]", "title": "Ekstrakcja informacji", diff --git a/cw/13_Model_neuronowy_rekurencyjny_część_2.ipynb b/cw/13_Model_neuronowy_rekurencyjny_część_2.ipynb new file mode 100644 index 0000000..02cbfcf --- /dev/null +++ b/cw/13_Model_neuronowy_rekurencyjny_część_2.ipynb @@ -0,0 +1,59 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Modelowanie Języka

\n", + "

13. Model neuronowy rekurencyjny część 2 [ćwiczenia]

\n", + "

Jakub Pokrywka (2022)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ZADANIE\n", + "\n", + "Proszę zrobić 1 model rekurencyjny dwuwarstwowy BiLSTM z rekurencyjnym dropoutem oraz analogiczny model GRU.\n", + "Proszę zaimplementować early stopping i wykorzystać do treningu. Następnie proszę zrobić ensemble tych 2 modeli.\n", + "\n", + "Zadanie widoczne na gonito\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "author": "Jakub Pokrywka", + "email": "kubapok@wmi.amu.edu.pl", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "lang": "pl", + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + }, + "subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]", + "title": "Ekstrakcja informacji", + "year": "2021" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/cw/14_Model_rekurencyjny_z_atencją.ipynb b/cw/14_Model_rekurencyjny_z_atencją.ipynb new file mode 100644 index 0000000..6ff44ac --- /dev/null +++ b/cw/14_Model_rekurencyjny_z_atencją.ipynb @@ -0,0 +1,955 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Modelowanie Języka

\n", + "

14. Model rekurencyjny z atencją [ćwiczenia]

\n", + "

Jakub Pokrywka (2022)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "notebook na podstawie:\n", + "\n", + "# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import unicode_literals, print_function, division\n", + "from io import open\n", + "import unicodedata\n", + "import string\n", + "import re\n", + "import random\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "from torch import optim\n", + "import torch.nn.functional as F\n", + "\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "SOS_token = 0\n", + "EOS_token = 1\n", + "\n", + "class Lang:\n", + " def __init__(self):\n", + " self.word2index = {}\n", + " self.word2count = {}\n", + " self.index2word = {0: \"SOS\", 1: \"EOS\"}\n", + " self.n_words = 2 # Count SOS and EOS\n", + "\n", + " def addSentence(self, sentence):\n", + " for word in sentence.split(' '):\n", + " self.addWord(word)\n", + "\n", + " def addWord(self, word):\n", + " if word not in self.word2index:\n", + " self.word2index[word] = self.n_words\n", + " self.word2count[word] = 1\n", + " self.index2word[self.n_words] = word\n", + " self.n_words += 1\n", + " else:\n", + " self.word2count[word] += 1" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "pairs = []\n", + "with open('data/eng-pol.txt') as f:\n", + " for line in f:\n", + " eng_line, pol_line = line.lower().rstrip().split('\\t')\n", + "\n", + " eng_line = re.sub(r\"([.!?])\", r\" \\1\", eng_line)\n", + " eng_line = re.sub(r\"[^a-zA-Z.!?]+\", r\" \", eng_line)\n", + "\n", + " pol_line = re.sub(r\"([.!?])\", r\" 
\\1\", pol_line)\n", + " pol_line = re.sub(r\"[^a-zA-Z.!?ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]+\", r\" \", pol_line)\n", + "\n", + " pairs.append([eng_line, pol_line])\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['hi .', 'cześć .']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pairs[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "MAX_LENGTH = 10\n", + "eng_prefixes = (\n", + " \"i am \", \"i m \",\n", + " \"he is\", \"he s \",\n", + " \"she is\", \"she s \",\n", + " \"you are\", \"you re \",\n", + " \"we are\", \"we re \",\n", + " \"they are\", \"they re \"\n", + ")\n", + "\n", + "pairs = [p for p in pairs if len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH]\n", + "pairs = [p for p in pairs if p[0].startswith(eng_prefixes)]\n", + "\n", + "eng_lang = Lang()\n", + "pol_lang = Lang()\n", + "\n", + "for pair in pairs:\n", + " eng_lang.addSentence(pair[0])\n", + " pol_lang.addSentence(pair[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['i m ok .', 'ze mną wszystko w porządku .']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pairs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['i m up .', 'wstałem .']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pairs[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['i m tom .', 'jestem tom .']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"pairs[2]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1828" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eng_lang.n_words" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2883" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pol_lang.n_words" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "class EncoderRNN(nn.Module):\n", + " def __init__(self, input_size, embedding_size, hidden_size):\n", + " super(EncoderRNN, self).__init__()\n", + " self.embedding_size = 200\n", + " self.hidden_size = hidden_size\n", + "\n", + " self.embedding = nn.Embedding(input_size, self.embedding_size)\n", + " self.gru = nn.GRU(self.embedding_size, hidden_size)\n", + "\n", + " def forward(self, input, hidden):\n", + " embedded = self.embedding(input).view(1, 1, -1)\n", + " output = embedded\n", + " output, hidden = self.gru(output, hidden)\n", + " return output, hidden\n", + "\n", + " def initHidden(self):\n", + " return torch.zeros(1, 1, self.hidden_size, device=device)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "class DecoderRNN(nn.Module):\n", + " def __init__(self, embedding_size, hidden_size, output_size):\n", + " super(DecoderRNN, self).__init__()\n", + " self.embedding_size = embedding_size\n", + " self.hidden_size = hidden_size\n", + "\n", + " self.embedding = nn.Embedding(output_size, self.embedding_size)\n", + " self.gru = nn.GRU(self.embedding_size, hidden_size)\n", + " self.out = nn.Linear(hidden_size, output_size)\n", + " self.softmax = nn.LogSoftmax(dim=1)\n", + "\n", + " def forward(self, input, hidden):\n", + " output = 
self.embedding(input).view(1, 1, -1)\n", + " output = F.relu(output)\n", + " output, hidden = self.gru(output, hidden)\n", + " output = self.softmax(self.out(output[0]))\n", + " return output, hidden\n", + "\n", + " def initHidden(self):\n", + " return torch.zeros(1, 1, self.hidden_size, device=device)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "class AttnDecoderRNN(nn.Module):\n", + " def __init__(self, embedding_size, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):\n", + " super(AttnDecoderRNN, self).__init__()\n", + " self.embedding_size = embedding_size\n", + " self.hidden_size = hidden_size\n", + " self.output_size = output_size\n", + " self.dropout_p = dropout_p\n", + " self.max_length = max_length\n", + "\n", + " self.embedding = nn.Embedding(self.output_size, self.embedding_size)\n", + " self.attn = nn.Linear(self.hidden_size + self.embedding_size, self.max_length)\n", + " self.attn_combine = nn.Linear(self.hidden_size + self.embedding_size, self.embedding_size)\n", + " self.dropout = nn.Dropout(self.dropout_p)\n", + " self.gru = nn.GRU(self.embedding_size, self.hidden_size)\n", + " self.out = nn.Linear(self.hidden_size, self.output_size)\n", + "\n", + " def forward(self, input, hidden, encoder_outputs):\n", + " embedded = self.embedding(input).view(1, 1, -1)\n", + " embedded = self.dropout(embedded)\n", + "\n", + " attn_weights = F.softmax(\n", + " self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)\n", + " attn_applied = torch.bmm(attn_weights.unsqueeze(0),\n", + " encoder_outputs.unsqueeze(0))\n", + " import pdb; pdb.set_trace()\n", + "\n", + " output = torch.cat((embedded[0], attn_applied[0]), 1)\n", + " output = self.attn_combine(output).unsqueeze(0)\n", + "\n", + " output = F.relu(output)\n", + " output, hidden = self.gru(output, hidden)\n", + "\n", + " output = F.log_softmax(self.out(output[0]), dim=1)\n", + " return output, hidden, attn_weights\n", + "\n", + 
" def initHidden(self):\n", + " return torch.zeros(1, 1, self.hidden_size, device=device)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "def tensorFromSentence(sentence, lang):\n", + " indexes = [lang.word2index[word] for word in sentence.split(' ')]\n", + " indexes.append(EOS_token)\n", + " return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "teacher_forcing_ratio = 0.5\n", + "\n", + "def train_one_batch(input_tensor, target_tensor, encoder, decoder, optimizer, criterion, max_length=MAX_LENGTH):\n", + " encoder_hidden = encoder.initHidden()\n", + "\n", + "\n", + " optimizer.zero_grad()\n", + "\n", + " input_length = input_tensor.size(0)\n", + " target_length = target_tensor.size(0)\n", + "\n", + " encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)\n", + "\n", + " loss = 0\n", + "\n", + " for ei in range(input_length):\n", + " encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)\n", + " encoder_outputs[ei] = encoder_output[0, 0]\n", + "\n", + " decoder_input = torch.tensor([[SOS_token]], device=device)\n", + "\n", + " decoder_hidden = encoder_hidden\n", + "\n", + " use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False\n", + "\n", + " if use_teacher_forcing:\n", + " for di in range(target_length):\n", + " decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)\n", + " loss += criterion(decoder_output, target_tensor[di])\n", + " decoder_input = target_tensor[di] # Teacher forcing\n", + "\n", + " else:\n", + " for di in range(target_length):\n", + " decoder_output, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_hidden, encoder_outputs)\n", + " topv, topi = decoder_output.topk(1)\n", + " decoder_input = 
topi.squeeze().detach() # detach from history as input\n", + "\n", + " loss += criterion(decoder_output, target_tensor[di])\n", + " if decoder_input.item() == EOS_token:\n", + " break\n", + "\n", + " loss.backward()\n", + "\n", + " optimizer.step()\n", + "\n", + " return loss.item() / target_length" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.01):\n", + " print_loss_total = 0 # Reset every print_every\n", + " encoder.train()\n", + " decoder.train()\n", + "\n", + " optimizer = optim.SGD(list(encoder.parameters()) + list(decoder.parameters()), lr=learning_rate)\n", + " \n", + " training_pairs = [random.choice(pairs) for _ in range(n_iters)]\n", + " training_pairs = [(tensorFromSentence(p[0], eng_lang), tensorFromSentence(p[1], pol_lang)) for p in training_pairs]\n", + " \n", + " criterion = nn.NLLLoss()\n", + "\n", + " for i in range(1, n_iters + 1):\n", + " training_pair = training_pairs[i - 1]\n", + " input_tensor = training_pair[0]\n", + " target_tensor = training_pair[1]\n", + "\n", + " loss = train_one_batch(input_tensor,\n", + " target_tensor,\n", + " encoder,\n", + " decoder,\n", + " optimizer,\n", + "\n", + " criterion)\n", + " \n", + " print_loss_total += loss\n", + "\n", + " if i % print_every == 0:\n", + " print_loss_avg = print_loss_total / print_every\n", + " print_loss_total = 0\n", + " print(f'iter: {i}, loss: {print_loss_avg}')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):\n", + " encoder.eval()\n", + " decoder.eval()\n", + " with torch.no_grad():\n", + " input_tensor = tensorFromSentence(sentence, eng_lang)\n", + " input_length = input_tensor.size()[0]\n", + " encoder_hidden = encoder.initHidden()\n", + "\n", + " encoder_outputs = torch.zeros(max_length, encoder.hidden_size, 
device=device)\n", + "\n", + " for ei in range(input_length):\n", + " encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)\n", + " encoder_outputs[ei] += encoder_output[0, 0]\n", + "\n", + " decoder_input = torch.tensor([[SOS_token]], device=device)\n", + "\n", + " decoder_hidden = encoder_hidden\n", + "\n", + " decoded_words = []\n", + " decoder_attentions = torch.zeros(max_length, max_length)\n", + "\n", + " for di in range(max_length):\n", + " decoder_output, decoder_hidden, decoder_attention = decoder(\n", + " decoder_input, decoder_hidden, encoder_outputs)\n", + " decoder_attentions[di] = decoder_attention.data\n", + " topv, topi = decoder_output.data.topk(1)\n", + " if topi.item() == EOS_token:\n", + " decoded_words.append('')\n", + " break\n", + " else:\n", + " decoded_words.append(pol_lang.index2word[topi.item()])\n", + "\n", + " decoder_input = topi.squeeze().detach()\n", + "\n", + " return decoded_words, decoder_attentions[:di + 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "def evaluateRandomly(encoder, decoder, n=10):\n", + " for i in range(n):\n", + " pair = random.choice(pairs)\n", + " print('>', pair[0])\n", + " print('=', pair[1])\n", + " output_words, attentions = evaluate(encoder, decoder, pair[0])\n", + " output_sentence = ' '.join(output_words)\n", + " print('<', output_sentence)\n", + " print('')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_size = 200\n", + "hidden_size = 256\n", + "encoder1 = EncoderRNN(eng_lang.n_words, embedding_size, hidden_size).to(device)\n", + "attn_decoder1 = AttnDecoderRNN(embedding_size, hidden_size, pol_lang.n_words, dropout_p=0.1).to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> 
\u001b[0;32m/tmp/ipykernel_41821/2519748186.py\u001b[0m(27)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 25 \u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 26 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m---> 27 \u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0membedded\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattn_applied\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 28 \u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattn_combine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munsqueeze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 29 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n", + "ipdb> embedded\n", + "tensor([[[-0.7259, 0.0000, 2.2112, 1.1947, -0.1261, -1.0427, -1.4295,\n", + " 0.1567, -0.3949, -1.0815, 1.1206, 2.0630, 2.8148, -1.8538,\n", + " -1.5486, -0.4900, -0.0000, 0.0000, -1.5046, 2.0329, -0.5872,\n", + " 1.5764, -0.0000, 1.1447, -0.4200, -0.1560, 0.1723, 1.5950,\n", + " 1.2955, -0.5796, -0.0000, -0.8989, 0.4737, 1.7037, 0.8787,\n", + " -0.2064, 1.9589, 2.0400, -1.0883, 1.0515, 0.0540, 0.1436,\n", + " 1.2383, 0.4912, -1.7719, 1.6435, 1.5523, 2.3576, 
0.0000,\n", + " 0.4063, -0.0821, -1.2872, 0.8372, -0.5638, 0.0706, 0.4151,\n", + " -0.0000, 1.1651, 1.7333, -0.1684, -0.0000, -0.8560, -0.0000,\n", + " 2.7717, -0.4485, -0.8488, 0.8165, 2.1787, -1.0720, -0.3146,\n", + " 1.5798, -0.6788, 0.0000, 0.5609, 0.7415, -0.5585, 2.0659,\n", + " 0.7054, 1.3791, -0.2697, -0.0458, 1.6028, -0.0304, -0.6326,\n", + " -1.3258, -0.8370, 0.6533, 2.2756, -0.5393, 0.4752, 0.4479,\n", + " -0.0186, -0.7785, -1.7858, 0.2345, 1.9794, -0.0314, -0.8594,\n", + " -0.0000, 0.0596, -2.6836, -1.9927, 0.2714, -1.4617, -0.8142,\n", + " -0.7790, 0.5029, -0.6001, -0.7932, 1.3418, 0.1305, -0.0000,\n", + " -1.2961, -2.7107, -2.3360, -0.7960, 0.5207, 1.6896, 0.9285,\n", + " 0.0000, 1.8187, -0.0000, 1.5908, 0.2745, -0.2589, 0.4066,\n", + " -0.0000, -1.3145, -0.5903, 0.3696, -1.9539, -1.9995, -0.8219,\n", + " 0.3937, -0.6068, 0.7947, 1.3940, 0.5513, 0.7498, 1.4578,\n", + " -0.0000, -0.5037, -0.6856, 0.7723, -0.6553, 1.0936, -0.2788,\n", + " -1.9658, 1.5950, 0.8480, 1.1166, 1.3168, -0.0000, 0.2597,\n", + " 1.0813, 0.1827, -1.6485, 0.5743, -0.4952, 0.7176, -0.4468,\n", + " -1.7915, -0.6303, 0.2046, 0.7791, 0.1586, 0.2322, -2.3935,\n", + " 1.3643, -1.2023, -1.6792, 0.5582, -2.0117, -0.6245, 2.4039,\n", + " 2.3736, 0.0559, 0.9173, 0.6446, -0.2068, -0.8805, -0.3070,\n", + " 0.7318, 1.9806, 1.9318, -1.1276, -0.1307, 0.0243, 0.8480,\n", + " 0.4865, -1.5352, 0.8082, 1.7595, -0.2168, 2.0735, -1.0444,\n", + " -0.0000, 1.0729, -0.2194, 0.5439]]], grad_fn=)\n", + "ipdb> embedded.shape\n", + "torch.Size([1, 1, 200])\n", + "ipdb> attn_weights\n", + "tensor([[0.0817, 0.1095, 0.1425, 0.1611, 0.0574, 0.0546, 0.0374, 0.0621, 0.0703,\n", + " 0.2234]], grad_fn=)\n", + "ipdb> attn_applied\n", + "tensor([[[ 0.0354, -0.0156, -0.0048, -0.0936, 0.0637, 0.1516, 0.1419,\n", + " 0.1106, 0.0511, 0.0235, -0.0622, 0.0725, 0.0709, -0.0624,\n", + " 0.1407, -0.0069, -0.1602, -0.1883, -0.1707, -0.1528, -0.0296,\n", + " -0.0500, 0.2115, 0.0705, -0.1385, -0.0487, -0.0165, -0.0128,\n", + " 
-0.0594, 0.0209, -0.1081, 0.0509, 0.0655, 0.1314, -0.0455,\n", + " -0.0049, -0.1527, -0.1900, -0.0019, 0.0295, -0.0308, 0.0886,\n", + " 0.1369, -0.1571, 0.0518, -0.0991, -0.0310, -0.1781, -0.0290,\n", + " 0.0558, 0.0585, -0.1045, -0.0027, -0.0476, -0.0377, -0.1026,\n", + " 0.0481, 0.0398, -0.0956, 0.0655, -0.1449, 0.0193, -0.0380,\n", + " 0.0401, 0.0491, -0.1925, 0.0669, 0.0774, 0.0604, 0.1187,\n", + " -0.0401, 0.1094, 0.0706, 0.0474, 0.0178, -0.0888, -0.0632,\n", + " 0.1180, -0.0257, -0.0180, -0.0807, 0.0867, -0.0428, -0.0982,\n", + " -0.0129, 0.1326, -0.0868, -0.0118, 0.0923, -0.0634, -0.1758,\n", + " -0.0835, -0.2328, 0.0578, 0.0184, 0.0602, -0.1132, -0.1089,\n", + " -0.1371, -0.0996, -0.0758, -0.1615, 0.0474, -0.0595, 0.1130,\n", + " -0.1329, 0.0068, -0.0485, -0.0376, 0.0170, 0.0743, 0.0284,\n", + " -0.1708, 0.0283, -0.0161, 0.1138, -0.0223, -0.0504, -0.0068,\n", + " 0.1297, 0.0962, 0.1806, -0.1773, -0.1658, 0.1612, 0.0569,\n", + " 0.0703, -0.0321, -0.1741, -0.0983, -0.0848, 0.0342, 0.1021,\n", + " -0.1319, 0.1122, -0.0467, 0.0927, -0.0528, -0.0696, 0.0227,\n", + " 0.0445, 0.0268, 0.1563, 0.0008, 0.0296, 0.0112, -0.0863,\n", + " -0.1705, -0.0137, -0.0336, -0.0533, 0.0015, -0.0134, -0.0530,\n", + " 0.0995, 0.0445, -0.1190, -0.1675, 0.1295, -0.1072, 0.0954,\n", + " 0.0559, 0.0572, 0.1595, 0.0054, -0.1020, 0.0309, -0.0821,\n", + " 0.0230, -0.1480, -0.0815, -0.0013, -0.0012, 0.1046, 0.0248,\n", + " 0.1121, 0.0055, 0.1006, -0.0891, -0.0237, -0.0231, -0.0891,\n", + " 0.0234, 0.0164, -0.0080, -0.0431, -0.0041, 0.2627, -0.2110,\n", + " 0.1026, -0.0049, 0.0077, -0.1126, 0.0161, 0.0039, 0.0700,\n", + " 0.0353, -0.0941, 0.0770, 0.1015, -0.1124, -0.1738, 0.0232,\n", + " 0.1839, -0.2329, 0.0488, 0.0791, 0.2002, 0.0389, -0.0985,\n", + " -0.0744, 0.1392, 0.0052, 0.1119, 0.0851, -0.1062, -0.0948,\n", + " 0.0718, 0.0308, 0.0136, 0.2036, -0.0510, 0.0615, 0.1164,\n", + " 0.0242, -0.0717, 0.0955, -0.0796, 0.0856, 0.0040, -0.1370,\n", + " -0.1614, 0.0605, -0.1396, -0.0286, 0.0295, 
0.0515, -0.0880,\n", + " 0.0249, -0.2263, 0.0048, -0.0381, -0.0019, 0.0186, -0.0209,\n", + " -0.0929, -0.1371, 0.0052, -0.1237, -0.1090, -0.0606, 0.0524,\n", + " 0.0351, 0.0283, 0.0264, 0.0866]]], grad_fn=)\n", + "ipdb> attn_applied.shape\n", + "torch.Size([1, 1, 256])\n", + "ipdb> attn_applied.shape\n", + "torch.Size([1, 1, 256])\n", + "ipdb> attn_weights.shape\n", + "torch.Size([1, 10])\n", + "ipdb> encoder_outputs.shape\n", + "torch.Size([10, 256])\n", + "ipdb> attn_applied.shape\n", + "torch.Size([1, 1, 256])\n", + "ipdb> attn_applied\n", + "tensor([[[ 0.0354, -0.0156, -0.0048, -0.0936, 0.0637, 0.1516, 0.1419,\n", + " 0.1106, 0.0511, 0.0235, -0.0622, 0.0725, 0.0709, -0.0624,\n", + " 0.1407, -0.0069, -0.1602, -0.1883, -0.1707, -0.1528, -0.0296,\n", + " -0.0500, 0.2115, 0.0705, -0.1385, -0.0487, -0.0165, -0.0128,\n", + " -0.0594, 0.0209, -0.1081, 0.0509, 0.0655, 0.1314, -0.0455,\n", + " -0.0049, -0.1527, -0.1900, -0.0019, 0.0295, -0.0308, 0.0886,\n", + " 0.1369, -0.1571, 0.0518, -0.0991, -0.0310, -0.1781, -0.0290,\n", + " 0.0558, 0.0585, -0.1045, -0.0027, -0.0476, -0.0377, -0.1026,\n", + " 0.0481, 0.0398, -0.0956, 0.0655, -0.1449, 0.0193, -0.0380,\n", + " 0.0401, 0.0491, -0.1925, 0.0669, 0.0774, 0.0604, 0.1187,\n", + " -0.0401, 0.1094, 0.0706, 0.0474, 0.0178, -0.0888, -0.0632,\n", + " 0.1180, -0.0257, -0.0180, -0.0807, 0.0867, -0.0428, -0.0982,\n", + " -0.0129, 0.1326, -0.0868, -0.0118, 0.0923, -0.0634, -0.1758,\n", + " -0.0835, -0.2328, 0.0578, 0.0184, 0.0602, -0.1132, -0.1089,\n", + " -0.1371, -0.0996, -0.0758, -0.1615, 0.0474, -0.0595, 0.1130,\n", + " -0.1329, 0.0068, -0.0485, -0.0376, 0.0170, 0.0743, 0.0284,\n", + " -0.1708, 0.0283, -0.0161, 0.1138, -0.0223, -0.0504, -0.0068,\n", + " 0.1297, 0.0962, 0.1806, -0.1773, -0.1658, 0.1612, 0.0569,\n", + " 0.0703, -0.0321, -0.1741, -0.0983, -0.0848, 0.0342, 0.1021,\n", + " -0.1319, 0.1122, -0.0467, 0.0927, -0.0528, -0.0696, 0.0227,\n", + " 0.0445, 0.0268, 0.1563, 0.0008, 0.0296, 0.0112, -0.0863,\n", + " -0.1705, 
-0.0137, -0.0336, -0.0533, 0.0015, -0.0134, -0.0530,\n", + " 0.0995, 0.0445, -0.1190, -0.1675, 0.1295, -0.1072, 0.0954,\n", + " 0.0559, 0.0572, 0.1595, 0.0054, -0.1020, 0.0309, -0.0821,\n", + " 0.0230, -0.1480, -0.0815, -0.0013, -0.0012, 0.1046, 0.0248,\n", + " 0.1121, 0.0055, 0.1006, -0.0891, -0.0237, -0.0231, -0.0891,\n", + " 0.0234, 0.0164, -0.0080, -0.0431, -0.0041, 0.2627, -0.2110,\n", + " 0.1026, -0.0049, 0.0077, -0.1126, 0.0161, 0.0039, 0.0700,\n", + " 0.0353, -0.0941, 0.0770, 0.1015, -0.1124, -0.1738, 0.0232,\n", + " 0.1839, -0.2329, 0.0488, 0.0791, 0.2002, 0.0389, -0.0985,\n", + " -0.0744, 0.1392, 0.0052, 0.1119, 0.0851, -0.1062, -0.0948,\n", + " 0.0718, 0.0308, 0.0136, 0.2036, -0.0510, 0.0615, 0.1164,\n", + " 0.0242, -0.0717, 0.0955, -0.0796, 0.0856, 0.0040, -0.1370,\n", + " -0.1614, 0.0605, -0.1396, -0.0286, 0.0295, 0.0515, -0.0880,\n", + " 0.0249, -0.2263, 0.0048, -0.0381, -0.0019, 0.0186, -0.0209,\n", + " -0.0929, -0.1371, 0.0052, -0.1237, -0.1090, -0.0606, 0.0524,\n", + " 0.0351, 0.0283, 0.0264, 0.0866]]], grad_fn=)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ipdb> attn_weights.shape\n", + "torch.Size([1, 10])\n", + "ipdb> encoder_outputs.shape\n", + "torch.Size([10, 256])\n", + "ipdb> embedded.shape\n", + "torch.Size([1, 1, 200])\n", + "ipdb> attn_applied.shape\n", + "torch.Size([1, 1, 256])\n", + "ipdb> output = torch.cat((embedded[0], attn_applied[0]), 1)\n", + "ipdb> output.shape\n", + "torch.Size([1, 456])\n", + "ipdb> output = self.attn_combine(output).unsqueeze(0)\n", + "ipdb> output.shape\n", + "torch.Size([1, 1, 200])\n", + "ipdb> attn_weights\n", + "tensor([[0.0817, 0.1095, 0.1425, 0.1611, 0.0574, 0.0546, 0.0374, 0.0621, 0.0703,\n", + " 0.2234]], grad_fn=)\n", + "ipdb> attn_weights.shape\n", + "torch.Size([1, 10])\n", + "ipdb> attn_applied.shape\n", + "torch.Size([1, 1, 256])\n", + "ipdb> attn_applied.shape\n", + "torch.Size([1, 1, 256])\n", + "ipdb> attn_applied\n", + "tensor([[[ 0.0354, -0.0156, -0.0048, 
-0.0936, 0.0637, 0.1516, 0.1419,\n", + " 0.1106, 0.0511, 0.0235, -0.0622, 0.0725, 0.0709, -0.0624,\n", + " 0.1407, -0.0069, -0.1602, -0.1883, -0.1707, -0.1528, -0.0296,\n", + " -0.0500, 0.2115, 0.0705, -0.1385, -0.0487, -0.0165, -0.0128,\n", + " -0.0594, 0.0209, -0.1081, 0.0509, 0.0655, 0.1314, -0.0455,\n", + " -0.0049, -0.1527, -0.1900, -0.0019, 0.0295, -0.0308, 0.0886,\n", + " 0.1369, -0.1571, 0.0518, -0.0991, -0.0310, -0.1781, -0.0290,\n", + " 0.0558, 0.0585, -0.1045, -0.0027, -0.0476, -0.0377, -0.1026,\n", + " 0.0481, 0.0398, -0.0956, 0.0655, -0.1449, 0.0193, -0.0380,\n", + " 0.0401, 0.0491, -0.1925, 0.0669, 0.0774, 0.0604, 0.1187,\n", + " -0.0401, 0.1094, 0.0706, 0.0474, 0.0178, -0.0888, -0.0632,\n", + " 0.1180, -0.0257, -0.0180, -0.0807, 0.0867, -0.0428, -0.0982,\n", + " -0.0129, 0.1326, -0.0868, -0.0118, 0.0923, -0.0634, -0.1758,\n", + " -0.0835, -0.2328, 0.0578, 0.0184, 0.0602, -0.1132, -0.1089,\n", + " -0.1371, -0.0996, -0.0758, -0.1615, 0.0474, -0.0595, 0.1130,\n", + " -0.1329, 0.0068, -0.0485, -0.0376, 0.0170, 0.0743, 0.0284,\n", + " -0.1708, 0.0283, -0.0161, 0.1138, -0.0223, -0.0504, -0.0068,\n", + " 0.1297, 0.0962, 0.1806, -0.1773, -0.1658, 0.1612, 0.0569,\n", + " 0.0703, -0.0321, -0.1741, -0.0983, -0.0848, 0.0342, 0.1021,\n", + " -0.1319, 0.1122, -0.0467, 0.0927, -0.0528, -0.0696, 0.0227,\n", + " 0.0445, 0.0268, 0.1563, 0.0008, 0.0296, 0.0112, -0.0863,\n", + " -0.1705, -0.0137, -0.0336, -0.0533, 0.0015, -0.0134, -0.0530,\n", + " 0.0995, 0.0445, -0.1190, -0.1675, 0.1295, -0.1072, 0.0954,\n", + " 0.0559, 0.0572, 0.1595, 0.0054, -0.1020, 0.0309, -0.0821,\n", + " 0.0230, -0.1480, -0.0815, -0.0013, -0.0012, 0.1046, 0.0248,\n", + " 0.1121, 0.0055, 0.1006, -0.0891, -0.0237, -0.0231, -0.0891,\n", + " 0.0234, 0.0164, -0.0080, -0.0431, -0.0041, 0.2627, -0.2110,\n", + " 0.1026, -0.0049, 0.0077, -0.1126, 0.0161, 0.0039, 0.0700,\n", + " 0.0353, -0.0941, 0.0770, 0.1015, -0.1124, -0.1738, 0.0232,\n", + " 0.1839, -0.2329, 0.0488, 0.0791, 0.2002, 0.0389, -0.0985,\n", 
+ " -0.0744, 0.1392, 0.0052, 0.1119, 0.0851, -0.1062, -0.0948,\n", + " 0.0718, 0.0308, 0.0136, 0.2036, -0.0510, 0.0615, 0.1164,\n", + " 0.0242, -0.0717, 0.0955, -0.0796, 0.0856, 0.0040, -0.1370,\n", + " -0.1614, 0.0605, -0.1396, -0.0286, 0.0295, 0.0515, -0.0880,\n", + " 0.0249, -0.2263, 0.0048, -0.0381, -0.0019, 0.0186, -0.0209,\n", + " -0.0929, -0.1371, 0.0052, -0.1237, -0.1090, -0.0606, 0.0524,\n", + " 0.0351, 0.0283, 0.0264, 0.0866]]], grad_fn=)\n", + "ipdb> torch.cat((embedded[0], attn_applied[0]), 1)\n", + "tensor([[-7.2585e-01, 0.0000e+00, 2.2112e+00, 1.1947e+00, -1.2609e-01,\n", + " -1.0427e+00, -1.4295e+00, 1.5669e-01, -3.9488e-01, -1.0815e+00,\n", + " 1.1206e+00, 2.0630e+00, 2.8148e+00, -1.8538e+00, -1.5486e+00,\n", + " -4.8997e-01, -0.0000e+00, 0.0000e+00, -1.5046e+00, 2.0329e+00,\n", + " -5.8720e-01, 1.5764e+00, -0.0000e+00, 1.1447e+00, -4.2003e-01,\n", + " -1.5600e-01, 1.7233e-01, 1.5950e+00, 1.2955e+00, -5.7964e-01,\n", + " -0.0000e+00, -8.9891e-01, 4.7372e-01, 1.7037e+00, 8.7866e-01,\n", + " -2.0642e-01, 1.9589e+00, 2.0400e+00, -1.0883e+00, 1.0515e+00,\n", + " 5.3959e-02, 1.4358e-01, 1.2383e+00, 4.9123e-01, -1.7719e+00,\n", + " 1.6435e+00, 1.5523e+00, 2.3576e+00, 0.0000e+00, 4.0628e-01,\n", + " -8.2075e-02, -1.2872e+00, 8.3723e-01, -5.6378e-01, 7.0637e-02,\n", + " 4.1508e-01, -0.0000e+00, 1.1651e+00, 1.7333e+00, -1.6842e-01,\n", + " -0.0000e+00, -8.5601e-01, -0.0000e+00, 2.7717e+00, -4.4849e-01,\n", + " -8.4885e-01, 8.1650e-01, 2.1787e+00, -1.0720e+00, -3.1463e-01,\n", + " 1.5798e+00, -6.7880e-01, 0.0000e+00, 5.6090e-01, 7.4153e-01,\n", + " -5.5849e-01, 2.0659e+00, 7.0539e-01, 1.3791e+00, -2.6968e-01,\n", + " -4.5789e-02, 1.6028e+00, -3.0432e-02, -6.3259e-01, -1.3258e+00,\n", + " -8.3697e-01, 6.5333e-01, 2.2756e+00, -5.3934e-01, 4.7520e-01,\n", + " 4.4788e-01, -1.8612e-02, -7.7847e-01, -1.7858e+00, 2.3452e-01,\n", + " 1.9794e+00, -3.1421e-02, -8.5938e-01, -0.0000e+00, 5.9576e-02,\n", + " -2.6836e+00, -1.9927e+00, 2.7139e-01, -1.4617e+00, 
-8.1419e-01,\n", + " -7.7900e-01, 5.0293e-01, -6.0008e-01, -7.9323e-01, 1.3418e+00,\n", + " 1.3053e-01, -0.0000e+00, -1.2961e+00, -2.7107e+00, -2.3360e+00,\n", + " -7.9603e-01, 5.2071e-01, 1.6896e+00, 9.2845e-01, 0.0000e+00,\n", + " 1.8187e+00, -0.0000e+00, 1.5908e+00, 2.7451e-01, -2.5888e-01,\n", + " 4.0663e-01, -0.0000e+00, -1.3145e+00, -5.9031e-01, 3.6964e-01,\n", + " -1.9539e+00, -1.9995e+00, -8.2193e-01, 3.9374e-01, -6.0678e-01,\n", + " 7.9467e-01, 1.3940e+00, 5.5134e-01, 7.4983e-01, 1.4578e+00,\n", + " -0.0000e+00, -5.0368e-01, -6.8556e-01, 7.7229e-01, -6.5534e-01,\n", + " 1.0936e+00, -2.7885e-01, -1.9658e+00, 1.5950e+00, 8.4796e-01,\n", + " 1.1166e+00, 1.3168e+00, -0.0000e+00, 2.5968e-01, 1.0813e+00,\n", + " 1.8274e-01, -1.6485e+00, 5.7433e-01, -4.9516e-01, 7.1760e-01,\n", + " -4.4680e-01, -1.7915e+00, -6.3027e-01, 2.0462e-01, 7.7905e-01,\n", + " 1.5859e-01, 2.3222e-01, -2.3935e+00, 1.3643e+00, -1.2023e+00,\n", + " -1.6792e+00, 5.5823e-01, -2.0117e+00, -6.2452e-01, 2.4039e+00,\n", + " 2.3736e+00, 5.5896e-02, 9.1725e-01, 6.4464e-01, -2.0675e-01,\n", + " -8.8049e-01, -3.0703e-01, 7.3178e-01, 1.9806e+00, 1.9318e+00,\n", + " -1.1276e+00, -1.3072e-01, 2.4253e-02, 8.4797e-01, 4.8654e-01,\n", + " -1.5352e+00, 8.0822e-01, 1.7595e+00, -2.1682e-01, 2.0735e+00,\n", + " -1.0444e+00, -0.0000e+00, 1.0729e+00, -2.1940e-01, 5.4391e-01,\n", + " 3.5435e-02, -1.5585e-02, -4.8357e-03, -9.3600e-02, 6.3727e-02,\n", + " 1.5162e-01, 1.4191e-01, 1.1063e-01, 5.1059e-02, 2.3501e-02,\n", + " -6.2207e-02, 7.2538e-02, 7.0922e-02, -6.2352e-02, 1.4066e-01,\n", + " -6.8974e-03, -1.6019e-01, -1.8832e-01, -1.7067e-01, -1.5275e-01,\n", + " -2.9574e-02, -5.0036e-02, 2.1154e-01, 7.0534e-02, -1.3852e-01,\n", + " -4.8703e-02, -1.6496e-02, -1.2794e-02, -5.9357e-02, 2.0857e-02,\n", + " -1.0812e-01, 5.0935e-02, 6.5458e-02, 1.3136e-01, -4.5476e-02,\n", + " -4.8890e-03, -1.5270e-01, -1.9004e-01, -1.9268e-03, 2.9531e-02,\n", + " -3.0820e-02, 8.8608e-02, 1.3690e-01, -1.5715e-01, 5.1807e-02,\n", + " 
-9.9062e-02, -3.0984e-02, -1.7808e-01, -2.8995e-02, 5.5791e-02,\n", + " 5.8522e-02, -1.0453e-01, -2.7097e-03, -4.7650e-02, -3.7730e-02,\n", + " -1.0258e-01, 4.8142e-02, 3.9797e-02, -9.5571e-02, 6.5458e-02,\n", + " -1.4489e-01, 1.9339e-02, -3.8005e-02, 4.0136e-02, 4.9097e-02,\n", + " -1.9247e-01, 6.6852e-02, 7.7364e-02, 6.0379e-02, 1.1870e-01,\n", + " -4.0057e-02, 1.0945e-01, 7.0648e-02, 4.7377e-02, 1.7824e-02,\n", + " -8.8779e-02, -6.3218e-02, 1.1804e-01, -2.5733e-02, -1.7959e-02,\n", + " -8.0674e-02, 8.6741e-02, -4.2754e-02, -9.8244e-02, -1.2859e-02,\n", + " 1.3257e-01, -8.6784e-02, -1.1774e-02, 9.2331e-02, -6.3417e-02,\n", + " -1.7581e-01, -8.3526e-02, -2.3277e-01, 5.7765e-02, 1.8407e-02,\n", + " 6.0199e-02, -1.1321e-01, -1.0885e-01, -1.3705e-01, -9.9638e-02,\n", + " -7.5838e-02, -1.6146e-01, 4.7433e-02, -5.9514e-02, 1.1298e-01,\n", + " -1.3286e-01, 6.7797e-03, -4.8545e-02, -3.7572e-02, 1.7049e-02,\n", + " 7.4291e-02, 2.8442e-02, -1.7075e-01, 2.8328e-02, -1.6143e-02,\n", + " 1.1376e-01, -2.2335e-02, -5.0417e-02, -6.8320e-03, 1.2967e-01,\n", + " 9.6223e-02, 1.8056e-01, -1.7727e-01, -1.6582e-01, 1.6121e-01,\n", + " 5.6873e-02, 7.0338e-02, -3.2107e-02, -1.7414e-01, -9.8330e-02,\n", + " -8.4751e-02, 3.4170e-02, 1.0213e-01, -1.3191e-01, 1.1224e-01,\n", + " -4.6743e-02, 9.2736e-02, -5.2760e-02, -6.9552e-02, 2.2712e-02,\n", + " 4.4459e-02, 2.6758e-02, 1.5629e-01, 8.4847e-04, 2.9560e-02,\n", + " 1.1163e-02, -8.6294e-02, -1.7045e-01, -1.3690e-02, -3.3578e-02,\n", + " -5.3289e-02, 1.4815e-03, -1.3354e-02, -5.3049e-02, 9.9541e-02,\n", + " 4.4520e-02, -1.1904e-01, -1.6747e-01, 1.2955e-01, -1.0718e-01,\n", + " 9.5381e-02, 5.5950e-02, 5.7216e-02, 1.5949e-01, 5.4154e-03,\n", + " -1.0203e-01, 3.0928e-02, -8.2072e-02, 2.2982e-02, -1.4800e-01,\n", + " -8.1458e-02, -1.3399e-03, -1.2277e-03, 1.0457e-01, 2.4771e-02,\n", + " 1.1215e-01, 5.4644e-03, 1.0059e-01, -8.9117e-02, -2.3669e-02,\n", + " -2.3117e-02, -8.9104e-02, 2.3379e-02, 1.6435e-02, -8.0299e-03,\n", + " -4.3092e-02, 
-4.1300e-03, 2.6272e-01, -2.1100e-01, 1.0265e-01,\n", + " -4.9496e-03, 7.7325e-03, -1.1258e-01, 1.6118e-02, 3.8591e-03,\n", + " 6.9952e-02, 3.5275e-02, -9.4110e-02, 7.6992e-02, 1.0149e-01,\n", + " -1.1243e-01, -1.7381e-01, 2.3158e-02, 1.8389e-01, -2.3291e-01,\n", + " 4.8788e-02, 7.9070e-02, 2.0018e-01, 3.8932e-02, -9.8458e-02,\n", + " -7.4388e-02, 1.3917e-01, 5.1577e-03, 1.1188e-01, 8.5138e-02,\n", + " -1.0618e-01, -9.4835e-02, 7.1822e-02, 3.0813e-02, 1.3624e-02,\n", + " 2.0363e-01, -5.0962e-02, 6.1539e-02, 1.1643e-01, 2.4200e-02,\n", + " -7.1730e-02, 9.5475e-02, -7.9572e-02, 8.5584e-02, 3.9502e-03,\n", + " -1.3701e-01, -1.6142e-01, 6.0496e-02, -1.3962e-01, -2.8607e-02,\n", + " 2.9515e-02, 5.1506e-02, -8.7967e-02, 2.4942e-02, -2.2634e-01,\n", + " 4.7778e-03, -3.8064e-02, -1.9145e-03, 1.8559e-02, -2.0943e-02,\n", + " -9.2896e-02, -1.3714e-01, 5.1929e-03, -1.2374e-01, -1.0901e-01,\n", + " -6.0571e-02, 5.2448e-02, 3.5082e-02, 2.8269e-02, 2.6405e-02,\n", + " 8.6625e-02]], grad_fn=)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ipdb> torch.cat((embedded[0], attn_applied[0]), 1).shape\n", + "torch.Size([1, 456])\n", + "ipdb> attnn_weights\n", + "*** NameError: name 'attnn_weights' is not defined\n", + "ipdb> attn_weights.shape\n", + "torch.Size([1, 10])\n", + "ipdb> attn_applied\n", + "tensor([[[ 0.0354, -0.0156, -0.0048, -0.0936, 0.0637, 0.1516, 0.1419,\n", + " 0.1106, 0.0511, 0.0235, -0.0622, 0.0725, 0.0709, -0.0624,\n", + " 0.1407, -0.0069, -0.1602, -0.1883, -0.1707, -0.1528, -0.0296,\n", + " -0.0500, 0.2115, 0.0705, -0.1385, -0.0487, -0.0165, -0.0128,\n", + " -0.0594, 0.0209, -0.1081, 0.0509, 0.0655, 0.1314, -0.0455,\n", + " -0.0049, -0.1527, -0.1900, -0.0019, 0.0295, -0.0308, 0.0886,\n", + " 0.1369, -0.1571, 0.0518, -0.0991, -0.0310, -0.1781, -0.0290,\n", + " 0.0558, 0.0585, -0.1045, -0.0027, -0.0476, -0.0377, -0.1026,\n", + " 0.0481, 0.0398, -0.0956, 0.0655, -0.1449, 0.0193, -0.0380,\n", + " 0.0401, 0.0491, -0.1925, 0.0669, 0.0774, 
0.0604, 0.1187,\n", + " -0.0401, 0.1094, 0.0706, 0.0474, 0.0178, -0.0888, -0.0632,\n", + " 0.1180, -0.0257, -0.0180, -0.0807, 0.0867, -0.0428, -0.0982,\n", + " -0.0129, 0.1326, -0.0868, -0.0118, 0.0923, -0.0634, -0.1758,\n", + " -0.0835, -0.2328, 0.0578, 0.0184, 0.0602, -0.1132, -0.1089,\n", + " -0.1371, -0.0996, -0.0758, -0.1615, 0.0474, -0.0595, 0.1130,\n", + " -0.1329, 0.0068, -0.0485, -0.0376, 0.0170, 0.0743, 0.0284,\n", + " -0.1708, 0.0283, -0.0161, 0.1138, -0.0223, -0.0504, -0.0068,\n", + " 0.1297, 0.0962, 0.1806, -0.1773, -0.1658, 0.1612, 0.0569,\n", + " 0.0703, -0.0321, -0.1741, -0.0983, -0.0848, 0.0342, 0.1021,\n", + " -0.1319, 0.1122, -0.0467, 0.0927, -0.0528, -0.0696, 0.0227,\n", + " 0.0445, 0.0268, 0.1563, 0.0008, 0.0296, 0.0112, -0.0863,\n", + " -0.1705, -0.0137, -0.0336, -0.0533, 0.0015, -0.0134, -0.0530,\n", + " 0.0995, 0.0445, -0.1190, -0.1675, 0.1295, -0.1072, 0.0954,\n", + " 0.0559, 0.0572, 0.1595, 0.0054, -0.1020, 0.0309, -0.0821,\n", + " 0.0230, -0.1480, -0.0815, -0.0013, -0.0012, 0.1046, 0.0248,\n", + " 0.1121, 0.0055, 0.1006, -0.0891, -0.0237, -0.0231, -0.0891,\n", + " 0.0234, 0.0164, -0.0080, -0.0431, -0.0041, 0.2627, -0.2110,\n", + " 0.1026, -0.0049, 0.0077, -0.1126, 0.0161, 0.0039, 0.0700,\n", + " 0.0353, -0.0941, 0.0770, 0.1015, -0.1124, -0.1738, 0.0232,\n", + " 0.1839, -0.2329, 0.0488, 0.0791, 0.2002, 0.0389, -0.0985,\n", + " -0.0744, 0.1392, 0.0052, 0.1119, 0.0851, -0.1062, -0.0948,\n", + " 0.0718, 0.0308, 0.0136, 0.2036, -0.0510, 0.0615, 0.1164,\n", + " 0.0242, -0.0717, 0.0955, -0.0796, 0.0856, 0.0040, -0.1370,\n", + " -0.1614, 0.0605, -0.1396, -0.0286, 0.0295, 0.0515, -0.0880,\n", + " 0.0249, -0.2263, 0.0048, -0.0381, -0.0019, 0.0186, -0.0209,\n", + " -0.0929, -0.1371, 0.0052, -0.1237, -0.1090, -0.0606, 0.0524,\n", + " 0.0351, 0.0283, 0.0264, 0.0866]]], grad_fn=)\n", + "ipdb> attn_applied.shape\n", + "torch.Size([1, 1, 256])\n", + "ipdb> torch.cat((embedded[0], attn_applied[0]), 1).shape\n", + "torch.Size([1, 456])\n", + "ipdb> 
self.attn_combine(output).unsqueeze(0).shape\n", + "*** RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x200 and 456x200)\n", + "ipdb> output = self.attn_combine(output).unsqueeze(0)\n", + "*** RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x200 and 456x200)\n", + "ipdb> output = torch.cat((embedded[0], attn_applied[0]), 1)\n", + "ipdb> output = torch.cat((embedded[0], attn_applied[0]), 1)\n", + "ipdb> c\n", + "> \u001b[0;32m/tmp/ipykernel_41821/2519748186.py\u001b[0m(27)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 25 \u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 26 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m---> 27 \u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0membedded\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattn_applied\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 28 \u001b[0;31m \u001b[0moutput\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattn_combine\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munsqueeze\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 29 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\n", + "ipdb> output = torch.cat((embedded[0], 
attn_applied[0]), 1)\n", + "ipdb> attn_weights.shape\n", + "torch.Size([1, 10])\n", + "ipdb> attn_applied.shape\n", + "torch.Size([1, 1, 256])\n", + "ipdb> output.shape\n", + "torch.Size([1, 456])\n", + "ipdb> self.attn_combine(output).unsqueeze(0).shape\n", + "torch.Size([1, 1, 200])\n" + ] + } + ], + "source": [ + "trainIters(encoder1, attn_decoder1, 10_000, print_every=50)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "evaluateRandomly(encoder1, attn_decoder1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "## ZADANIE\n", + "\n", + "Gonito \"WMT2017 Czech-English machine translation challenge for news \"\n", + "\n", + "Proszę wytrenować najpierw model german -> english, a później dotrenować na czech-> english.\n", + "Można wziąć inicjalizować enkoder od nowa lub nie. Proszę w każdym razie użyć wytrenowanego dekodera." + ] + } + ], + "metadata": { + "author": "Jakub Pokrywka", + "email": "kubapok@wmi.amu.edu.pl", + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "lang": "pl", + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + }, + "subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]", + "title": "Ekstrakcja informacji", + "year": "2021" + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/cw/12_Model_transformer_autoregresywny.ipynb b/cw/15_Model_transformer_autoregresywny.ipynb similarity index 99% rename from cw/12_Model_transformer_autoregresywny.ipynb rename to cw/15_Model_transformer_autoregresywny.ipynb index 5d519c5..3cb18ed 100644 --- a/cw/12_Model_transformer_autoregresywny.ipynb +++ b/cw/15_Model_transformer_autoregresywny.ipynb @@ -7,7 +7,7 
@@ "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", "
\n", "

Modelowanie Języka

\n", - "

12. Model rekurencyjny z atencją [ćwiczenia]

\n", + "

15. Model transformer autoregresywny [ćwiczenia]

\n", "

Jakub Pokrywka (2022)

\n", "
\n", "\n", @@ -4638,7 +4638,7 @@ "author": "Jakub Pokrywka", "email": "kubapok@wmi.amu.edu.pl", "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -4653,7 +4653,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.4" + "version": "3.8.3" }, "subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]", "title": "Ekstrakcja informacji",