diff --git a/add-metadata.py b/add-metadata.py index 6a3a737..a3239cb 100755 --- a/add-metadata.py +++ b/add-metadata.py @@ -3,7 +3,7 @@ #procedura napisywania plików ipynb (generowanie nagłówka i metadanych) import json import sys - +import re def modjup(filen,numer,tytul,typ,author,email,lang,title,year): zerocell=['![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n', @@ -35,7 +35,7 @@ def modjup(filen,numer,tytul,typ,author,email,lang,title,year): #zmodyfikuj te dane filen=sys.argv[1] -numer="2" +numer=re.match(r'^(?:\D+/)?0*(\d+)', filen).group(1) tytul=sys.argv[2] typ="wykład" diff --git a/wyk/03_Tfidf.ipynb b/wyk/03_Tfidf.ipynb index e5ce3b6..03d83ab 100644 --- a/wyk/03_Tfidf.ipynb +++ b/wyk/03_Tfidf.ipynb @@ -1,4550 +1,4572 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Wyszukiwarka - szybka i sensowna" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Roboczy przykład\n", - "\n", - "Zakładamy, że mamy pewną kolekcję dokumentów $D = {d_1, \\ldots, d_N}$. ($N$ - liczba dokumentów w kolekcji)." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Ala ma kota." - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "{-# LANGUAGE OverloadedStrings #-}\n", - "\n", - "import Data.Text hiding(map, filter, zip)\n", - "import Prelude hiding(words, take)\n", - "\n", - "collectionD :: [Text]\n", - "collectionD = [\"Ala ma kota.\", \"Podobno jest kot w butach.\", \"Ty chyba masz kota!\", \"But chyba zgubiłem.\", \"Kot ma kota.\"]\n", - "\n", - "-- Operator (!!) zwraca element listy o podanym indeksie\n", - "-- (Przy większych listach będzie nieefektywne, ale nie będziemy komplikować)\n", - "Prelude.head collectionD" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Wydobycie tekstu\n", - "\n", - "Przykładowe narzędzia:\n", - "\n", - "* pdftotext\n", - "* antiword\n", - "* Tesseract OCR\n", - "* Apache Tika - uniwersalne narzędzie do wydobywania tekstu z różnych formatów\n", - "\n", - "## Normalizacja tekstu\n", - "\n", - "Cokolwiek robimy z tekstem, najpierw musimy go _znormalizować_." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Tokenizacja\n", - "\n", - "Po pierwsze musimy podzielić tekst na _tokeny_, czyli wyrazapodobne jednostki.\n", - "Może po prostu podzielić po spacjach?" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Ala" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ma" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kota." - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenizeStupidly :: Text -> [Text]\n", - "-- words to funkcja z Data.Text, która dzieli po spacjach\n", - "tokenizeStupidly = words\n", - "\n", - "tokenizeStupidly $ Prelude.head collectionD" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A, trzeba _chociaż_ odsunąć znaki interpunkcyjne. Najprościej użyć wyrażenia regularnego. Warto użyć [unikodowych własności](https://en.wikipedia.org/wiki/Unicode_character_property) znaków i konstrukcji `\\p{...}`. 
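Na marginesie: podobny (choć mniej elastyczny) efekt można uzyskać bez biblioteki regexowej, samymi predykatami z `Data.Char`. Poniżej luźny szkic; nazwa `tokenizeSimple` jest umowna, a podział na klasy „znak alfanumeryczny vs reszta” to tylko przybliżenie tego, co robi wyrażenie regularne z następnej komórki:

```haskell
import Data.Char (isAlphaNum, isSpace)
import qualified Data.Text as T

-- szkic: grupujemy sąsiednie znaki tej samej klasy (alfanumeryczne vs pozostałe),
-- a następnie odrzucamy grupy złożone z samych spacji
tokenizeSimple :: T.Text -> [T.Text]
tokenizeSimple = filter (not . T.all isSpace)
               . T.groupBy (\a b -> isAlphaNum a == isAlphaNum b)

-- tokenizeSimple "Ala ma kota."  ==>  ["Ala","ma","kota","."]
```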
" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "But" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "chyba" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zgubiłem" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "." - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "{-# LANGUAGE QuasiQuotes #-}\n", - "\n", - "import Text.Regex.PCRE.Heavy\n", - "\n", - "tokenize :: Text -> [Text]\n", - "tokenize = map fst . scan [re|C\\+\\+|[\\p{L}0-9]+|\\p{P}|]\n", - "\n", - "tokenize $ collectionD !! 3\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Cała kolekcja stokenizowana:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Ala" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ma" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kota" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "." - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Podobno" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "jest" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "butach" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "." - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Ty" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "chyba" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "masz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kota" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "!" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "But" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "chyba" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zgubiłem" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "." - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ma" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kota" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "." 
- ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "map tokenize collectionD" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Problemy z tokenizacją\n", - "\n", - "##### Język angielski" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "I" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "use" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "a" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "data" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "-" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "base" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenize \"I use a data-base\"" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "I" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "use" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "a" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "database" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenize \"I use a database\"" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "I" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "use" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "a" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "data" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "base" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenize \"I use a data base\"" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "I" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "don" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "t" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "like" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Python" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenize \"'I don't like Python'\"" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "I" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "can" - ] - }, - "metadata": {}, - "output_type": 
"display_data" - }, - { - "data": { - "text/plain": [ - "see" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "the" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Johnes" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "house" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenize \"I can see the Johnes' house\"" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "I" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "do" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "not" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "like" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Python" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenize \"I do not like Python\"" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0018" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "555" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "-" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "555" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "-" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "122" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenize \"+0018 555-555-122\"" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0018555555122" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenize \"+0018555555122\"" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Which" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "one" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "is" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "better" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - ":" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "C++" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "or" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "C" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "#" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "?" 
- ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenize \"Which one is better: C++ or C#?\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Inne języki?" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Rechtsschutzversicherungsgesellschaften" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "wie" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "die" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "HUK" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "-" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Coburg" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "machen" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "es" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "bereits" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "seit" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "geraumer" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Zeit" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "vor" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - ":" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenize \"Rechtsschutzversicherungsgesellschaften wie die HUK-Coburg machen es bereits seit geraumer Zeit vor:\"" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "今日波兹南是贸易" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "、" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "工业及教育的中心" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "。" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "波兹南是波兰第五大的城市及第四大的工业中心" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "," - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "波兹南亦是大波兰省的行政首府" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "。" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "也舉辦有不少展覽會" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "。" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "是波蘭西部重要的交通中心都市" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "。" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenize \"今日波兹南是贸易、工业及教育的中心。波兹南是波兰第五大的城市及第四大的工业中心,波兹南亦是大波兰省的行政首府。也舉辦有不少展覽會。是波蘭西部重要的交通中心都市。\"" - ] - }, - { - "cell_type": "code", - 
"execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "l" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ordinateur" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "tokenize \"l'ordinateur\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Lematyzacja" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "_Lematyzacja_ to sprowadzenie do formy podstawowej (_lematu_), np. \"krześle\" do \"krzesło\", \"zrobimy\" do \"zrobić\" dla języka polskiego, \"chairs\" do \"chair\", \"made\" do \"make\" dla języka angielskiego." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Lematyzacja dla języka polskiego jest bardzo trudna, praktycznie nie sposób wykonać ją regułowo, po prostu musimy się postarać o bardzo obszerny _słownik form fleksyjnych_.\n", - "\n", - "Na potrzeby tego wykładu stwórzmy sobie mały słownik form fleksyjnych w postaci tablicy asocjacyjnej (haszującej)." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Use head
Found:
collectionD !! 0
Why Not:
head collectionD
" - ], - "text/plain": [ - "Line 22: Use head\n", - "Found:\n", - "collectionD !! 0\n", - "Why not:\n", - "head collectionD" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "but" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "butami" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Ala" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mieć" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "." - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Wczoraj" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kupiłem" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "." - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import Data.Map as Map hiding(take, map, filter)\n", - "\n", - "mockInflectionDictionary :: Map Text Text\n", - "mockInflectionDictionary = Map.fromList [\n", - " (\"kota\", \"kot\"),\n", - " (\"butach\", \"but\"),\n", - " (\"masz\", \"mieć\"),\n", - " (\"ma\", \"mieć\"),\n", - " (\"buta\", \"but\"),\n", - " (\"zgubiłem\", \"zgubić\")]\n", - "\n", - "lemmatizeWord :: Map Text Text -> Text -> Text\n", - "lemmatizeWord dict w = findWithDefault w w dict\n", - "\n", - "lemmatizeWord mockInflectionDictionary \"butach\"\n", - "-- a tego nie ma w naszym słowniczku, więc zwracamy to samo\n", - "lemmatizeWord mockInflectionDictionary \"butami\"\n", - "\n", - "lemmatize :: Map Text Text -> [Text] -> [Text]\n", - "lemmatize dict = map (lemmatizeWord dict)\n", - "\n", - "lemmatize mockInflectionDictionary $ tokenize $ collectionD !! 0 \n", - "\n", - "lemmatize mockInflectionDictionary $ tokenize \"Wczoraj kupiłem kota.\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Pytanie**: Nawet w naszym słowniczku mamy problemy z niejednoznacznością lematyzacji. Jakie?\n", - "\n", - "Obszerny słownik form fleksyjnych dla języka polskiego: http://zil.ipipan.waw.pl/PoliMorf?action=AttachFile&do=view&target=PoliMorf-0.6.7.tab.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Stemowanie\n", - "\n", - "Stemowanie (rdzeniowanie) obcina wyraz do _rdzenia_ niekoniecznie będącego sensownym wyrazem, np. \"krześle\" może być rdzeniowane do \"krześl\", \"krześ\" albo \"krzes\", \"zrobimy\" do \"zrobi\".\n", - "\n", - "* stemowanie nie jest tak dobrze określone jak lematyzacja (można robić na wiele sposobów)\n", - "* bardziej podatne na metody regułowe (choć dla polskiego i tak trudno)\n", - "* dla angielskiego istnieją znane algorytmy stemowania, np. [algorytm Portera](https://tartarus.org/martin/PorterStemmer/def.txt)\n", - "* zob. też [program Snowball](https://snowballstem.org/) z regułami dla wielu języków\n", - "\n", - "Prosty stemmer \"dla ubogich\" dla języka polskiego to obcinanie do sześciu znaków." 
- ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "zrobim" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "komput" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "butach" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "źdźbła" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "poorMansStemming :: Text -> Text\n", - "poorMansStemming = Data.Text.take 6\n", - "\n", - "poorMansStemming \"zrobimy\"\n", - "poorMansStemming \"komputerami\"\n", - "poorMansStemming \"butach\"\n", - "poorMansStemming \"źdźbłami\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### _Stop words_\n", - "\n", - "Często wyszukiwarki pomijają krótkie, częste i nieniosące znaczenia słowa - _stop words_ (_słowa przestankowe_)." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "isStopWord :: Text -> Bool\n", - "isStopWord \"w\" = True\n", - "isStopWord \"jest\" = True\n", - "isStopWord \"że\" = True\n", - "-- przy okazji możemy pozbyć się znaków interpunkcyjnych\n", - "isStopWord w = w ≈ [re|^\\p{P}+$|]\n", - "\n", - "isStopWord \"kot\"\n", - "isStopWord \"!\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Ala" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ma" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kota" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "removeStopWords :: [Text] -> [Text]\n", - "removeStopWords = filter (not . isStopWord)\n", - "\n", - "removeStopWords $ tokenize $ Prelude.head collectionD " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Pytanie**: Jakim zapytaniom usuwanie _stop words_ może szkodzić? Podać przykłady dla języka polskiego i angielskiego. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Normalizacja - różności\n", - "\n", - "W skład normalizacji może też wchodzić:\n", - "\n", - "* poprawianie błędów literowych\n", - "* sprowadzanie do małych liter (lower-casing czy raczej case-folding)\n", - "* usuwanie znaków diakrytycznych\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "żdźbło" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "toLower \"ŻDŹBŁO\"" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "źdźbło" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "toCaseFold \"ŹDŹBŁO\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Pytanie:** Kiedy _case-folding_ da inny wynik niż _lower-casing_? Jakie to ma praktyczne znaczenie?" 
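Drobna ilustracja do powyższego pytania (szkic na przykładzie niemieckiego „ß”): `toCaseFold` rozwija „ß” do „ss”, a `toLower` zostawia je bez zmian, więc dopiero _case-folding_ utożsamia oba zapisy:

```haskell
{-# LANGUAGE OverloadedStrings #-}
import Data.Text (toCaseFold, toLower)

-- porównanie "bez rozróżniania wielkości liter":
-- case-folding utożsamia "Maße" z "MASSE", zwykłe lower-casing już nie
example1 = toCaseFold "Maße" == toCaseFold "MASSE"   -- True  ("masse" == "masse")
example2 = toLower    "Maße" == toLower    "MASSE"   -- False ("maße" /= "masse")
```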
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Normalizacja jako całościowy proces\n", - "\n", - "Najważniejsza zasada: dokumenty w naszej kolekcji powinny być normalizowane w dokładnie taki sposób, jak zapytania.\n", - "\n", - "Efektem normalizacji jest zamiana dokumentu na ciąg _termów_ (ang. _terms_), czyli znormalizowanych wyrazów.\n", - "\n", - "Innymi słowy po normalizacji dokument $d_i$ traktujemy jako ciąg termów $t_i^1,\\dots,t_i^{|d_i|}$." - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ala" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mieć" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "podobn" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "but" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ty" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "chyba" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mieć" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "but" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "chyba" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zgubić" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mieć" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "normalize :: Text -> [Text]\n", - "normalize = map poorMansStemming . removeStopWords . map toLower . lemmatize mockInflectionDictionary . tokenize\n", - "\n", - "map normalize collectionD" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Zbiór wszystkich termów w kolekcji dokumentów nazywamy słownikiem (ang. _vocabulary_), nie mylić ze słownikiem jako strukturą danych w Pythonie (_dictionary_).\n", - "\n", - "$$V = \\bigcup_{i=1}^N \\{t_i^1,\\dots,t_i^{|d_i|}\\}$$\n", - "\n", - "(To zbiór, więc liczymy bez powtórzeń!)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "fromList [\"ala\",\"but\",\"chyba\",\"kot\",\"mie\\263\",\"podobn\",\"ty\",\"zgubi\\263\"]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import Data.Set as Set hiding(map)\n", - "\n", - "getVocabulary :: [Text] -> Set Text \n", - "getVocabulary = Set.unions . map (Set.fromList . normalize) \n", - "\n", - "getVocabulary collectionD" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Jak wyszukiwarka może być szybka?\n", - "\n", - "_Odwrócony indeks_ (ang. 
_inverted index_) pozwala wyszukiwarce szybko szukać w milionach dokumentów. Odwrócony indeks to po prostu... indeks, jaki znamy z książek (mapowanie słów na numery stron/dokumentów).\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Use tuple-section
Found:
\\ t -> (t, ix)
Why Not:
(, ix)
" - ], - "text/plain": [ - "Line 4: Use tuple-section\n", - "Found:\n", - "\\ t -> (t, ix)\n", - "Why not:\n", - "(, ix)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "fromList [(\"chyba\",2),(\"kot\",2),(\"mie\\263\",2),(\"ty\",2)]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "collectionDNormalized = map normalize collectionD\n", - "\n", - "documentToPostings :: ([Text], Int) -> Set (Text, Int)\n", - "documentToPostings (d, ix) = Set.fromList $ map (\\t -> (t, ix)) d\n", - "\n", - "documentToPostings (collectionDNormalized !! 2, 2) \n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Use zipWith
Found:
map documentToPostings $ Prelude.zip coll [0 .. ]
Why Not:
zipWith (curry documentToPostings) coll [0 .. ]
" - ], - "text/plain": [ - "Line 2: Use zipWith\n", - "Found:\n", - "map documentToPostings $ Prelude.zip coll [0 .. ]\n", - "Why not:\n", - "zipWith (curry documentToPostings) coll [0 .. ]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "fromList [(\"ala\",0),(\"but\",1),(\"but\",3),(\"chyba\",2),(\"chyba\",3),(\"kot\",0),(\"kot\",1),(\"kot\",2),(\"kot\",4),(\"mie\\263\",0),(\"mie\\263\",2),(\"mie\\263\",4),(\"podobn\",1),(\"ty\",2),(\"zgubi\\263\",3)]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "collectionToPostings :: [[Text]] -> Set (Text, Int)\n", - "collectionToPostings coll = Set.unions $ map documentToPostings $ Prelude.zip coll [0..]\n", - "\n", - "collectionToPostings collectionDNormalized" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Eta reduce
Found:
updateInvertedIndex (t, ix) invIndex\n", - " = insertWith (++) t [ix] invIndex
Why Not:
updateInvertedIndex (t, ix) = insertWith (++) t [ix]
" - ], - "text/plain": [ - "Line 2: Eta reduce\n", - "Found:\n", - "updateInvertedIndex (t, ix) invIndex\n", - " = insertWith (++) t [ix] invIndex\n", - "Why not:\n", - "updateInvertedIndex (t, ix) = insertWith (++) t [ix]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "fromList [(\"ala\",[0]),(\"but\",[1,3]),(\"chyba\",[2,3]),(\"kot\",[0,1,2,4]),(\"mie\\263\",[0,2,4]),(\"podobn\",[1]),(\"ty\",[2]),(\"zgubi\\263\",[3])]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[0,1,2,4]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "updateInvertedIndex :: (Text, Int) -> Map Text [Int] -> Map Text [Int]\n", - "updateInvertedIndex (t, ix) invIndex = insertWith (++) t [ix] invIndex\n", - "\n", - "getInvertedIndex :: [[Text]] -> Map Text [Int]\n", - "getInvertedIndex = Prelude.foldr updateInvertedIndex Map.empty . Set.toList . collectionToPostings\n", - "\n", - "ind = getInvertedIndex collectionDNormalized\n", - "ind\n", - "ind ! \"kot\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Relewantność\n", - "\n", - "Potrafimy szybko przeszukiwać znormalizowane dokumenty, ale które dokumenty są ważne (_relewantne_) względem potrzeby informacyjnej użytkownika?\n", - "\n", - "### Zapytania boole'owskie\n", - "\n", - "* `pizzeria Poznań dowóz` to `pizzeria AND Poznań AND dowóz` czy `pizzeria OR Poznań OR dowóz`\n", - "* `(pizzeria OR pizza OR tratoria) AND Poznań AND dowóz\n", - "* `pizzeria AND Poznań AND dowóz AND NOT golonka`\n", - "\n", - "Jak domyślnie interpretować zapytanie?\n", - "\n", - "* jako zapytanie AND -- być może za mało dokumentów\n", - "* rozwiązanie pośrednie?\n", - "* jako zapytanie OR -- być może za dużo dokumentów\n", - "\n", - "Możemy jakieś miary dopasowania dokumentu do zapytania, żeby móc posortować dokumenty...\n", - "\n", - "### Mierzenie dopasowania dokumentu do zapytania\n", - "\n", - "Potrzebujemy jakieś funkcji $\\sigma : Q x D \\rightarrow \\mathbb{R}$. \n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Musimy jakoś zamienić dokumenty na liczby, tj. dokumenty na wektory liczb, a całą kolekcję na macierz.\n", - "\n", - "Po pierwsze ponumerujmy wszystkie termy ze słownika." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "fromList [(0,\"ala\"),(1,\"but\"),(2,\"chyba\"),(3,\"kot\"),(4,\"mie\\263\"),(5,\"podobn\"),(6,\"ty\"),(7,\"zgubi\\263\")]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "fromList [(\"ala\",0),(\"but\",1),(\"chyba\",2),(\"kot\",3),(\"mie\\263\",4),(\"podobn\",5),(\"ty\",6),(\"zgubi\\263\",7)]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ala" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "2" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "voc = getVocabulary collectionD\n", - "\n", - "vocD :: Map Int Text\n", - "vocD = Map.fromList $ zip [0..] $ Set.toList voc\n", - "\n", - "invvocD :: Map Text Int\n", - "invvocD = Map.fromList $ zip (Set.toList voc) [0..]\n", - "\n", - "vocD\n", - "\n", - "invvocD\n", - "\n", - "vocD ! 0\n", - "invvocD ! 
\"chyba\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Napiszmy funkcję, która _wektoryzuje_ znormalizowany dokument.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Redundant $
Found:
map (\\ i -> count (v ! i) doc) $ [0 .. (vecSize - 1)]
Why Not:
map (\\ i -> count (v ! i) doc) [0 .. (vecSize - 1)]
Redundant bracket
Found:
(collectionDNormalized !! 2)
Why Not:
collectionDNormalized !! 2
" - ], - "text/plain": [ - "Line 2: Redundant $\n", - "Found:\n", - "map (\\ i -> count (v ! i) doc) $ [0 .. (vecSize - 1)]\n", - "Why not:\n", - "map (\\ i -> count (v ! i) doc) [0 .. (vecSize - 1)]Line 9: Redundant bracket\n", - "Found:\n", - "(collectionDNormalized !! 2)\n", - "Why not:\n", - "collectionDNormalized !! 2" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ty" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "chyba" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mieć" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "vectorize :: Int -> Map Int Text -> [Text] -> [Double]\n", - "vectorize vecSize v doc = map (\\i -> count (v ! i) doc) $ [0..(vecSize-1)]\n", - " where count t doc \n", - " | t `elem` doc = 1.0\n", - " | otherwise = 0.0\n", - " \n", - "vocSize = Set.size voc\n", - "\n", - "(collectionDNormalized !! 2)\n", - "vectorize vocSize vocD (collectionDNormalized !! 2)\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ![image](./macierz.png)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Jak inaczej uwzględnić częstość wyrazów?\n", - "\n", - "
\n", - " $\n", - " \\newcommand{\\idf}{\\mathop{\\rm idf}\\nolimits}\n", - " \\newcommand{\\tf}{\\mathop{\\rm tf}\\nolimits}\n", - " \\newcommand{\\df}{\\mathop{\\rm df}\\nolimits}\n", - " \\newcommand{\\tfidf}{\\mathop{\\rm tfidf}\\nolimits}\n", - " $\n", - "
\n", - "\n", - "* $\\tf_{t,d}$ - term frequency\n", - "\n", - "* $1+\\log(\\tf_{t,d})$\n", - "\n", - "* $0.5 + \\frac{0.5 \\times \\tf_{t,d}}{max_t(\\tf_{t,d})}$" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Redundant $
Found:
map (\\ i -> count (v ! i) doc) $ [0 .. (vecSize - 1)]
Why Not:
map (\\ i -> count (v ! i) doc) [0 .. (vecSize - 1)]
Redundant bracket
Found:
(collectionDNormalized !! 4)
Why Not:
collectionDNormalized !! 4
" - ], - "text/plain": [ - "Line 2: Redundant $\n", - "Found:\n", - "map (\\ i -> count (v ! i) doc) $ [0 .. (vecSize - 1)]\n", - "Why not:\n", - "map (\\ i -> count (v ! i) doc) [0 .. (vecSize - 1)]Line 7: Redundant bracket\n", - "Found:\n", - "(collectionDNormalized !! 4)\n", - "Why not:\n", - "collectionDNormalized !! 4" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mieć" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "vectorizeTf :: Int -> Map Int Text -> [Text] -> [Double]\n", - "vectorizeTf vecSize v doc = map (\\i -> count (v ! i) doc) $ [0..(vecSize-1)]\n", - " where count t doc = fromIntegral $ (Prelude.length . Prelude.filter (== t)) doc\n", - "\n", - "vocSize = Set.size voc\n", - "\n", - "(collectionDNormalized !! 4)\n", - "vectorize vocSize vocD (collectionDNormalized !! 4)\n", - "vectorizeTf vocSize vocD (collectionDNormalized !! 4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - " $\n", - " \\newcommand{\\idf}{\\mathop{\\rm idf}\\nolimits}\n", - " \\newcommand{\\tf}{\\mathop{\\rm tf}\\nolimits}\n", - " \\newcommand{\\df}{\\mathop{\\rm df}\\nolimits}\n", - " \\newcommand{\\tfidf}{\\mathop{\\rm tfidf}\\nolimits}\n", - " $\n", - "
\n", - "\n", - "### Odwrotna częstość dokumentowa\n", - "\n", - "Czy wszystkie wyrazy są tak samo ważne?\n", - "\n", - "**NIE.** Wyrazy pojawiające się w wielu dokumentach są mniej ważne.\n", - "\n", - "Aby to uwzględnić, przemnażamy frekwencję wyrazu przez _odwrotną\n", - " częstość w dokumentach_ (_inverse document frequency_):\n", - "\n", - "$$\\idf_t = \\log \\frac{N}{\\df_t},$$\n", - "\n", - "gdzie:\n", - "\n", - "* $\\idf_t$ - odwrotna częstość wyrazu $t$ w dokumentach\n", - "\n", - "* $N$ - liczba dokumentów w kolekcji\n", - "\n", - "* $\\df_f$ - w ilu dokumentach wystąpił wyraz $t$?\n", - "\n", - "#### Dlaczego idf?\n", - "\n", - "term $t$ wystąpił...\n", - "\n", - "* w 1 dokumencie, $\\idf_t = \\log N/1 = \\log N$\n", - "* 2 razy w kolekcji, $\\idf_t = \\log N/2$ lub $\\log N$\n", - "* w połowie dokumentów kolekcji, $\\idf_t = \\log N/(N/2) = \\log 2$\n", - "* we wszystkich dokumentach, $\\idf_t = \\log N/N = \\log 1 = 0$\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.22314355131420976" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "idf :: [[Text]] -> Text -> Double\n", - "idf coll t = log (fromIntegral n / fromIntegral df)\n", - " where df = Prelude.length $ Prelude.filter (\\d -> t `elem` d) coll\n", - " n = Prelude.length coll\n", - " \n", - "idf collectionDNormalized \"kot\" " - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.9162907318741551" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "idf collectionDNormalized \"chyba\" " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Co z tego wynika?\n", - "\n", - "Zamiast $\\tf_{t,d}$ będziemy w wektorach rozpatrywać wartości:\n", - "\n", - "$$\\tfidf_{t,d} = \\tf_{t,d} \\times \\idf_{t}$$\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mieć" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kot" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[0.0,0.0,0.0,0.44628710262841953,0.5108256237659907,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "vectorizeTfIdf :: Int -> [[Text]] -> Map Int Text -> [Text] -> [Double]\n", - "vectorizeTfIdf vecSize coll v doc = map (\\i -> count (v ! i) doc * idf coll (v ! i)) [0..(vecSize-1)]\n", - " where count t doc = fromIntegral $ (Prelude.length . Prelude.filter (== t)) doc\n", - "\n", - "vocSize = Set.size voc\n", - "\n", - "collectionDNormalized !! 4\n", - "vectorize vocSize vocD (collectionDNormalized !! 4)\n", - "vectorizeTf vocSize vocD (collectionDNormalized !! 4)\n", - "vectorizeTfIdf vocSize collectionDNormalized vocD (collectionDNormalized !! 
4)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[[1.6094379124341003,0.0,0.0,0.22314355131420976,0.5108256237659907,0.0,0.0,0.0],[0.0,0.9162907318741551,0.0,0.22314355131420976,0.0,1.6094379124341003,0.0,0.0],[0.0,0.0,0.9162907318741551,0.22314355131420976,0.5108256237659907,0.0,1.6094379124341003,0.0],[0.0,0.9162907318741551,0.9162907318741551,0.0,0.0,0.0,0.0,1.6094379124341003],[0.0,0.0,0.0,0.44628710262841953,0.5108256237659907,0.0,0.0,0.0]]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "map (vectorizeTfIdf vocSize collectionDNormalized vocD) collectionDNormalized" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Teraz zdefiniujemy _overlap score measure_:\n", - "\n", - "$$\\sigma(q,d) = \\sum_{t \\in q} \\tfidf_{t,d}$$" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Podobieństwo kosinusowe\n", - "\n", - "_Overlap score measure_ nie jest jedyną możliwą metryką, za pomocą której możemy mierzyć dopasowanie dokumentu do zapytania. Możemy również sięgnąć po intuicje geometryczne (skoro mamy do czynienia z wektorami).\n", - "\n", - "**Pytanie**: Ile wymiarów mają wektory, na których operujemy? Jak \"wyglądają\" te wektory? Czy możemy wykonywać na nich standardowe operacje geometryczne czy te, które znamy z geometrii liniowej?\n", - "\n", - "#### Podobieństwo między dokumentami\n", - "\n", - "Zajmijmy się teraz poszukiwaniem miary mierzącej podobieństwo między dokumentami $d_1$ i $d_2$ (czyli poszukujemy sensownej funkcji $\\sigma : D x D \\rightarrow \\mathbb{R}$).\n", - "\n", - "**Uwaga** Pojęcia \"miary\" używamy nieformalnie, nie spełnia ona założeń znanych z teorii miary.\n", - "\n", - "Rozpatrzmy zbiorek tekstów legend miejskich z .\n", - "\n", - "(To autentyczne teksty z Internentu, z językiem potocznym, wulgarnym itd.)\n", - "\n", - "```\n", - " git clone git://gonito.net/polish-urban-legends\n", - " paste polish-urban-legends/dev-0/expected.tsv polish-urban-legends/dev-0/in.tsv > legendy.txt\n", - "``` " - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "na_ak" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Opowieść prawdziwa... Olsztyn, akademik, 7 piętro, impreza u Mariusza, jak to na polskiej najebce bywa ktoś rzucił tekstem: \"Mariusz nie zjedziesz na nartach po schodach\". Sprawa ucichla, studencii wrocili do tego co lubia i w sumie umieją najbardziej czyli picia, lecz nad ranem kolo godziny 6.00 ludzia przypomnialo sie ze Mariusz miał zjechać na nartach po schodach. Tu warto wspomnieć że Mariusz był zapalonym narciarzem stąd właśnie w jego pokoju znalezc można bylo narty, bo po ki huj komuś narty w Olsztynie! Tak wracajac do historii nasz bohater odział się w sprzet, podszed do schodow i niestety dał radę zjechać jedynie w połowie, gdyż jak to powiedzial \"no kurwa potknąłem sie\", ale nieustraszoony Mariusz próbowal dalej. Nastepny zjazd byl perfekcyjny, jedno pietro zanim, niestety pomiedzy 6 a 5 pietrem Mariusza natrafil na Pania sprzątaczke, która potrącił i zwiał z miejsca wypadku. Ok godziny 10.00 nastopilo przebudzenie Mariusza, ktory zaraz po obudzeniu uslyszal co narobił, mianowicie o skutkach potracenia, Pani sprzataczka złamala rękę i trafiła do szpitala. 
Mogły powstać przez to cieżkie konsekwencje, Mariusz mógł wyleciec z akademika jeżeli kierownik dowie sie o calym zajściu. Wiec koledzy poradzili narciażowi, aby kupił kwiaty i bombonierkę i poszedł do szpitala z przeprosinami. Po szybkich zakupach w sasiedniej Biedrące, Mariusz byl przygotowany na konfrontacje z Pania sprzątaczka, ale nie mogło pojść pięknie i gładko. Po wejściu do szpitala nasz bohater skierowal swoje kroki do recepcji pytajac się o ciocię, która miała wypadek w akademiku, recepcjonistka skierowała go do lekarza, gdzie czekał na jego wyjście ok 15 minut, gdy lekarz już wyszedł ten odrazu podleciał do niego, żeby spytać się o stan zdrowia Pani sprzątaczki. Wnet uslyszla od lekarz, niestety Pani teraz jest u psychiatry po twierdzi, że ktoś potracil ja zjeżdzajac na nartach w akademiku. Po uslyszeniu tej wiadomosci Mariusz odwrocił się, wybiegł, kupił piecie i szybko pobiegł do akademika pić dalej! Morał... student potrafi!" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import System.IO\n", - "import Data.List.Split as SP\n", - "\n", - "legendsh <- openFile \"legendy.txt\" ReadMode\n", - "hSetEncoding legendsh utf8\n", - "contents <- hGetContents legendsh\n", - "ls = Prelude.lines contents\n", - "items = map (map pack . SP.splitOn \"\\t\") ls\n", - "Prelude.head items" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "87" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "nbOfLegends = Prelude.length items\n", - "nbOfLegends" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "na_ak" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lud" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ba_hy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lap" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ne_dz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "be_wy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mo_zu" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "be_wy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ba_hy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mo_zu" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "be_wy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lud" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ne_dz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ta_ab" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ta_ab" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ta_ab" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { 
- "data": { - "text/plain": [ - "w_lap" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ba_hy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ne_dz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ba_hy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "tr_su" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ne_dz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ba_hy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mo_zu" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "tr_su" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ne_dz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ne_dz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lud" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ne_dz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ta_ab" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lud" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "na_ak" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lap" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "be_wy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "na_ak" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lap" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "na_ak" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ba_hy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lud" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" 
- }, - { - "data": { - "text/plain": [ - "mo_zu" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ba_hy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "tr_su" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "na_ak" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ba_hy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lud" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lud" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "tr_su" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lud" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "be_wy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "tr_su" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "na_ak" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ba_hy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ne_dz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ba_hy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "na_ak" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lud" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mo_zu" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mo_zu" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "na_ak" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lap" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ne_dz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ba_hy" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mo_zu" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ba_hy" - ] - }, - "metadata": {}, - "output_type": 
"display_data" - }, - { - "data": { - "text/plain": [ - "ne_dz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zw_oz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "tr_su" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ne_dz" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "w_lud" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Ja podejrzewam że o polowaniu nie było mowy, po prostu znalazł martwego szczupaka i skorzystał z okazji! Mnie mocno zdziwiła jego siła żeby taki pół kilogramowy okaz szczupaka przesuwać o parę metrów i to w trzcinach! Szacuneczek. Przypomniala mi sie historia którą kiedys zaslyszalem o wlascicielce pytona, ktory nagle polozyl sie wzdluz jej łóżka. Leżał tak wyciągniety jak struna dłuższy czas jak nieżywy (a był długości łóżka), więc kobitka zadzonila do weterynarza co ma robić. Usłyszała że ma szybko zamknąć się w łazience i poczekać na niego bo pyton ją mierzy jako potencjalną ofiarę (czy mu się zmieści w brzuchu...). Wierzyć, nie wierzyć? Kiedyś nie wierzyłem ale od kilku dni mam wątpliwosci... Pozdrawiam" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "labelsL = map Prelude.head items\n", - "labelsL\n", - "collectionL = map (!!1) items\n", - "items !! 1" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "348" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "collectionLNormalized = map normalize collectionL\n", - "voc' = getVocabulary collectionL\n", - "\n", - "vocLSize = Prelude.length voc'\n", - "\n", - "vocL :: Map Int Text\n", - "vocL = Map.fromList $ zip [0..] $ Set.toList voc'\n", - "\n", - "invvocL :: Map Text Int\n", - "invvocL = Map.fromList $ zip (Set.toList voc') [0..]\n", - "\n", - "vocL ! 0\n", - "invvocL ! 
\"chyba\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Wektoryzujemy całą kolekcję:" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38837067474886433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.752336051950276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0647107369924282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2078115806331018,0.0,0.0,0.0,0.0,0.0,1.247032293786383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5947071077466928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.268683541318364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2078115806331018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.7578579175523736,0.0,0.0,0.0,0.0,0.0,0.3550342544812725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9395475940384223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21437689194643514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2878542883066382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2745334443309775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.079613757534693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.330413902725434,0.0,1.247032293786383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.330413902725434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,2.5199979695992702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.6741486494265287,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5199979695992702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.6741486494265287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.079613757534693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.386466576974748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.856470206220483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,1.0319209141694374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,2.340142505300509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.7578579175523736,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5214691394881432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.388148398070203e-2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9810014688665833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6096847248398047,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.575536360758419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.079613757534693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1847155011136463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0319209141694374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,2.856470206220483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.079613757534693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.322773392263051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.079613757534693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.163323025660538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.900958761193047,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,3.079613757534693,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.340142505300509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.710068508962545,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.931816237309167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5199979695992702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
.0,0.0,0.0,0.0,1.0319209141694374,0.0,2.163323025660538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26121549926361765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.6741486494265287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.386466576974748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.238841272604079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.330413902725434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.163323025660538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12210269680089991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.068012845856213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.856470206220483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.856470206220483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.079613757534693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.712940412440966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.068012845856213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "lVectorized = map (vectorizeTfIdf vocLSize collectionLNormalized vocL) collectionLNormalized\n", - "lVectorized !! 1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Szukamy funkcji $sigma$, która da wysoką wartość dla tekstów dotyczących tego samego wątku legendowego (np. $d_1$ i $d_2$ mówią o wężu przymierzającym się do zjedzenia swojej właścicielki) i niską dla tekstów z różnych wątków (np. $d_1$ opowiada o wężu ludojadzie, $d_2$ - bałwanku na hydrancie)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Może po prostu odległość euklidesowa, skoro to punkty w wielowymiarowej przestrzeni?" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Eta reduce
Found:
formatNumber x = printf \"% 7.2f\" x
Why Not:
formatNumber = printf \"% 7.2f\"
" - ], - "text/plain": [ - "Line 5: Eta reduce\n", - "Found:\n", - "formatNumber x = printf \"% 7.2f\" x\n", - "Why not:\n", - "formatNumber = printf \"% 7.2f\"" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - " 0.00 79.93 78.37 76.57 87.95 81.15 82.77 127.50 124.54 76.42 84.19 78.90 90.90" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import Text.Printf\n", - "import Data.List (take)\n", - "\n", - "formatNumber :: Double -> String\n", - "formatNumber x = printf \"% 7.2f\" x\n", - "\n", - "similarTo :: ([Double] -> [Double] -> Double) -> [[Double]] -> Int -> Text\n", - "similarTo simFun vs ix = pack $ Prelude.unwords $ map (formatNumber . ((vs !! ix) `simFun`)) vs\n", - "\n", - "euclDistance :: [Double] -> [Double] -> Double\n", - "euclDistance v1 v2 = sqrt $ sum $ Prelude.zipWith (\\x1 x2 -> (x1 - x2)**2) v1 v2\n", - "\n", - "limit = 13\n", - "labelsLimited = Data.List.take limit labelsL\n", - "limitedL = Data.List.take limit lVectorized\n", - "\n", - "similarTo euclDistance limitedL 0\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Move brackets to avoid $
Found:
\"\\n\"\n", - " <>\n", - " (Data.Text.unlines\n", - " $ map (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix)\n", - " $ zip labels [0 .. (Prelude.length vs - 1)])
Why Not:
\"\\n\"\n", - " <>\n", - " Data.Text.unlines\n", - " (map (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix)\n", - " $ zip labels [0 .. (Prelude.length vs - 1)])
Use zipWith
Found:
map (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix)\n", - " $ zip labels [0 .. (Prelude.length vs - 1)]
Why Not:
zipWith\n", - " (curry (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix))\n", - " labels [0 .. (Prelude.length vs - 1)]
Move brackets to avoid $
Found:
\" \"\n", - " <> (Data.Text.unwords $ map (\\ l -> pack $ printf \"% 7s\" l) labels)
Why Not:
\" \"\n", - " <> Data.Text.unwords (map (\\ l -> pack $ printf \"% 7s\" l) labels)
Avoid lambda
Found:
\\ l -> pack $ printf \"% 7s\" l
Why Not:
pack . printf \"% 7s\"
" - ], - "text/plain": [ - "Line 2: Move brackets to avoid $\n", - "Found:\n", - "\"\\n\"\n", - " <>\n", - " (Data.Text.unlines\n", - " $ map (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix)\n", - " $ zip labels [0 .. (Prelude.length vs - 1)])\n", - "Why not:\n", - "\"\\n\"\n", - " <>\n", - " Data.Text.unlines\n", - " (map (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix)\n", - " $ zip labels [0 .. (Prelude.length vs - 1)])Line 2: Use zipWith\n", - "Found:\n", - "map (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix)\n", - " $ zip labels [0 .. (Prelude.length vs - 1)]\n", - "Why not:\n", - "zipWith\n", - " (curry (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix))\n", - " labels [0 .. (Prelude.length vs - 1)]Line 3: Move brackets to avoid $\n", - "Found:\n", - "\" \"\n", - " <> (Data.Text.unwords $ map (\\ l -> pack $ printf \"% 7s\" l) labels)\n", - "Why not:\n", - "\" \"\n", - " <> Data.Text.unwords (map (\\ l -> pack $ printf \"% 7s\" l) labels)Line 3: Avoid lambda\n", - "Found:\n", - "\\ l -> pack $ printf \"% 7s\" l\n", - "Why not:\n", - "pack . printf \"% 7s\"" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - " na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", - "na_ak 0.00 79.93 78.37 76.57 87.95 81.15 82.77 127.50 124.54 76.42 84.19 78.90 90.90\n", - "w_lud 79.93 0.00 38.92 34.35 56.48 44.89 47.21 109.24 104.82 35.33 49.88 39.98 60.20\n", - "ba_hy 78.37 38.92 0.00 30.37 54.23 40.93 43.83 108.15 102.91 27.37 46.95 35.81 58.99\n", - "w_lap 76.57 34.35 30.37 0.00 51.54 37.46 40.86 107.43 103.22 25.22 43.66 32.10 56.53\n", - "ne_dz 87.95 56.48 54.23 51.54 0.00 57.98 60.32 113.66 109.59 50.96 62.17 54.84 70.70\n", - "be_wy 81.15 44.89 40.93 37.46 57.98 0.00 49.55 110.37 100.50 37.77 51.54 37.09 62.92\n", - "zw_oz 82.77 47.21 43.83 40.86 60.32 49.55 0.00 111.11 107.57 41.02 54.07 45.23 64.65\n", - "mo_zu 127.50 109.24 108.15 107.43 113.66 110.37 111.11 0.00 139.57 107.38 109.91 108.20 117.07\n", - "be_wy 124.54 104.82 102.91 103.22 109.59 100.50 107.57 139.57 0.00 102.69 108.32 99.06 113.25\n", - "ba_hy 76.42 35.33 27.37 25.22 50.96 37.77 41.02 107.38 102.69 0.00 43.83 32.08 56.68\n", - "mo_zu 84.19 49.88 46.95 43.66 62.17 51.54 54.07 109.91 108.32 43.83 0.00 47.87 66.40\n", - "be_wy 78.90 39.98 35.81 32.10 54.84 37.09 45.23 108.20 99.06 32.08 47.87 0.00 59.66\n", - "w_lud 90.90 60.20 58.99 56.53 70.70 62.92 64.65 117.07 113.25 56.68 66.40 59.66 0.00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "paintMatrix :: ([Double] -> [Double] -> Double) -> [Text] -> [[Double]] -> Text\n", - "paintMatrix simFun labels vs = header <> \"\\n\" <> (Data.Text.unlines $ map (\\(lab, ix) -> lab <> \" \" <> similarTo simFun vs ix) $ zip labels [0..(Prelude.length vs - 1)])\n", - " where header = \" \" <> (Data.Text.unwords $ map (\\l -> pack $ printf \"% 7s\" l) labels)\n", - " \n", - "paintMatrix euclDistance labelsLimited limitedL" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Problem: za dużo zależy od długości tekstu.\n", - "\n", - "Rozwiązanie: znormalizować wektor $v$ do wektora jednostkowego.\n", - "\n", - "$$ \\vec{1}(v) = \\frac{v}{|v|} $$\n", - "\n", - "Taki wektor ma długość 1!" 
- ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1.0" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - " na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", - "na_ak 10.00 0.67 0.66 0.66 0.67 0.67 0.67 0.67 0.67 0.67 0.66 0.67 0.67\n", - "w_lud 0.67 10.00 0.67 0.68 0.67 0.66 0.67 0.67 0.68 0.66 0.67 0.67 0.68\n", - "ba_hy 0.66 0.67 10.00 0.66 0.67 0.67 0.67 0.67 0.69 0.74 0.66 0.67 0.66\n", - "w_lap 0.66 0.68 0.66 10.00 0.66 0.66 0.66 0.66 0.67 0.66 0.66 0.66 0.66\n", - "ne_dz 0.67 0.67 0.67 0.66 10.00 0.67 0.67 0.68 0.69 0.68 0.67 0.67 0.68\n", - "be_wy 0.67 0.66 0.67 0.66 0.67 10.00 0.66 0.67 0.74 0.66 0.67 0.76 0.66\n", - "zw_oz 0.67 0.67 0.67 0.66 0.67 0.66 10.00 0.67 0.67 0.66 0.66 0.67 0.67\n", - "mo_zu 0.67 0.67 0.67 0.66 0.68 0.67 0.67 10.00 0.69 0.67 0.69 0.68 0.67\n", - "be_wy 0.67 0.68 0.69 0.67 0.69 0.74 0.67 0.69 10.00 0.68 0.67 0.75 0.67\n", - "ba_hy 0.67 0.66 0.74 0.66 0.68 0.66 0.66 0.67 0.68 10.00 0.66 0.67 0.66\n", - "mo_zu 0.66 0.67 0.66 0.66 0.67 0.67 0.66 0.69 0.67 0.66 10.00 0.67 0.67\n", - "be_wy 0.67 0.67 0.67 0.66 0.67 0.76 0.67 0.68 0.75 0.67 0.67 10.00 0.67\n", - "w_lud 0.67 0.68 0.66 0.66 0.68 0.66 0.67 0.67 0.67 0.66 0.67 0.67 10.00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "vectorNorm :: [Double] -> Double\n", - "vectorNorm vs = sqrt $ sum $ map (\\x -> x * x) vs\n", - "\n", - "toUnitVector :: [Double] -> [Double]\n", - "toUnitVector vs = map (/ n) vs\n", - " where n = vectorNorm vs\n", - "\n", - "vectorNorm (toUnitVector [3.0, 4.0])\n", - "\n", - "euclDistanceNormalized :: [Double] -> [Double] -> Double\n", - "euclDistanceNormalized v1 v2 = toUnitVector v1 `euclDistance` toUnitVector v2\n", - "\n", - "euclSim v1 v2 = 1 / (d + 0.1)\n", - " where d = euclDistanceNormalized v1 v2\n", - "\n", - "paintMatrix euclSim labelsLimited limitedL" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Podobieństwo kosinusowe\n", - "\n", - "Częściej zamiast odległości euklidesowej stosuje się podobieństwo kosinusowe, czyli kosinus kąta między wektorami.\n", - "\n", - "Wektor dokumentu ($\\vec{V}(d)$) - wektor, którego składowe odpowiadają wyrazom.\n", - "\n", - "$$\\sigma(d_1,d_2) = \\cos\\theta(\\vec{V}(d_1),\\vec{V}(d_2)) = \\frac{\\vec{V}(d_1) \\cdot \\vec{V}(d_2)}{|\\vec{V}(d_1)||\\vec{V}(d_2)|} $$\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Zauważmy, że jest to iloczyn skalarny znormalizowanych wektorów!\n", - "\n", - "$$\\sigma(d_1,d_2) = \\vec{1}(\\vec{V}(d_1)) \\times \\vec{1}(\\vec{V}(d_2)) $$" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1.0" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "(✕) :: [Double] -> [Double] -> Double\n", - "(✕) v1 v2 = sum $ Prelude.zipWith (*) v1 v2\n", - "\n", - "[2, 1, 0] ✕ [-2, 5, 10]" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - " na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", - "na_ak 1.00 0.02 0.01 0.01 0.03 0.02 0.02 0.04 0.03 0.02 0.01 0.02 0.03\n", - "w_lud 0.02 1.00 0.02 0.05 0.04 0.01 0.03 0.04 0.06 0.01 0.02 0.03 0.06\n", - "ba_hy 0.01 0.02 1.00 0.01 0.02 0.03 0.03 0.04 0.08 0.22 0.01 0.04 0.01\n", - 
"w_lap 0.01 0.05 0.01 1.00 0.01 0.01 0.00 0.01 0.02 0.00 0.00 0.00 0.00\n", - "ne_dz 0.03 0.04 0.02 0.01 1.00 0.04 0.03 0.07 0.08 0.06 0.03 0.03 0.05\n", - "be_wy 0.02 0.01 0.03 0.01 0.04 1.00 0.01 0.03 0.21 0.01 0.02 0.25 0.01\n", - "zw_oz 0.02 0.03 0.03 0.00 0.03 0.01 1.00 0.04 0.03 0.00 0.01 0.02 0.02\n", - "mo_zu 0.04 0.04 0.04 0.01 0.07 0.03 0.04 1.00 0.10 0.02 0.09 0.05 0.04\n", - "be_wy 0.03 0.06 0.08 0.02 0.08 0.21 0.03 0.10 1.00 0.05 0.03 0.24 0.04\n", - "ba_hy 0.02 0.01 0.22 0.00 0.06 0.01 0.00 0.02 0.05 1.00 0.01 0.02 0.00\n", - "mo_zu 0.01 0.02 0.01 0.00 0.03 0.02 0.01 0.09 0.03 0.01 1.00 0.01 0.02\n", - "be_wy 0.02 0.03 0.04 0.00 0.03 0.25 0.02 0.05 0.24 0.02 0.01 1.00 0.02\n", - "w_lud 0.03 0.06 0.01 0.00 0.05 0.01 0.02 0.04 0.04 0.00 0.02 0.02 1.00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "cosineSim v1 v2 = toUnitVector v1 ✕ toUnitVector v2\n", - "\n", - "paintMatrix cosineSim labelsLimited limitedL" - ] - }, - { - "cell_type": "code", - "execution_count": 140, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "na tylnym siedzeniu w autobusie siedzi matka z 7-8 letnim synkiem. naprzeciwko synka siedzi kobieta (zwrócona twarzą do dzieciaka). synek co chwile wymachuje nogami i kopie kobietę, matka widząc to nie reaguje na to wogóle. wreszcie kobieta zwraca uwagę matce, żeby ta powiedziała coś synowi a matka do niej: nie mogę, bo wychowuję syna bezstresowo!!! ...chłopak, który stał w pobliżu i widział i słyszał całe to zajście wypluł z ust gumę do żucia i przykleił matce na czoło i powiedział: ja też byłem bezstresowo wychowywany... autentyczny przypadek w londyńskim autobusie (a tym co przykleił matce gumę na czoło był chyba nawet młody Polak)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "collectionL !! 5" - ] - }, - { - "cell_type": "code", - "execution_count": 141, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Krótko zwięźle i na temat. Zastanawia mnie jak ludzie wychowują dzieci. Co prawda sam nie mam potomstwa i nie zamierzam mieć jak narazie (bo to trochę głupie mieć 17-letniego tatusia), ale niestety mam przyjemność oglądać efekty wychowawcze niektórych par (dzięki znajomym rodziców w różnym wieku). Są trzy najbardziej znane mi modele wychowania. Surowe, bezstresowe (w moim znaczeniu) i \"bezstresowe\" w mowie potocznej. Zaczynam od tego pierwszego. Jak nazwa wskazuje, jest to surowe wychowanie, oparte na karach cielesnych lub torturach umysłowych. Nie uważam tego za dobre wychowanie, bo dziecko jak będzie nieco starsze będzie się bało wszystkiego, bo uzna, ż jak zrobi coś żle to spotka je kara. Więc bicie za różne rzeczy odpada (no chyba, że dzieciak na serio nabroi to oczywiście). Wychowanie bezstresowe z mojego słownika oznacza nienarażanie dziecka na stresy, pocieszanie w trudnych sytuacjach, załatwianie problemów przez rozmowę oraz stały kontakt z dzieckiem. I to chyba najlepsze. Sam zostałem tak wychowany i cieszę się z tego powodu. I oczywiście \"wychowanie bezstresowe\". A tu się normalnie rozpiszę. Po pierwsze geneza. Więc jak dochodzi do takiego wychowania? Odpowiedź. Mamusi i tatusiowi się zachciało bobaska bo to takie malutkie fajniutkie i ooo. Oboje zazdroszczą innym parom bo one mają, a oni nie, więc oni też chcą. No więc rodzi im się bobasek, chuchają dmuchają na niego póki małe. Ale przychodzi ten okres, kiedy dziecko trzeba wychować i kiedy ma się na dzieciaka największy wpływ. I tu się zaczynają schody. 
Nagle oboje nie mają czasu i mówią \"Wychowamy go/ją/ich (niepotrzebne skreślić) bezstresowo.\" Po drugie. Decyzja o sposobie wychowania podjęta. A więc jak to wygląda? Odpowiedź. Totalna olewka! Mama i tata balują, a dzieciaka zostawiają samemu sobie, albo pod opiekę babci, która również leje na dziecko ciepłym moczem. Dzieciak rośnie i rośnie, nie wie co dobre a co złe. Przypomniała mi się pewna, podobno autentyczna scenka. Chłopak jedzie ze szwagrem autobusem czy tam tramwajem. Na jednym miejscu siedzi starowinka, a na przeciwko niej siedzi lafirynda z brzdącem na kolanach. No i sobie dzieciak macha nóżkami i tu ciach i kopnął staruszkę w nogę. Babcia nic sobie z tego nie zrobiła, a dzieciak nie widząc reakcji zaczął ją już celowo kopać. Staruszka: Może pani powiedzieć coś synkowi żeby mnie nie kopał. Matka: Nie bo ja go wychowuję bezstresowo. Szwagier wyciąga z ust gumę do żucia i przykleja mamusi na czoło mówiąc: Moja mama też mnie wychowała bezstresowo. Ciekaw jestem ile w tym prawdy było, a jeżeli 100% to czy mamusi się odmieniły poglądy. Kto go wie? Po trzecie. Dorosły wychowany bezstresowo. Jaki on jest? Odpowiedź. Zupełnie inny. Myśli, że jest pępkiem świata i że wszystko musi być pod jego dyktando. Pracując w Szwajcarii przy pielęgnacji winogron, syn polskiego kolegi taty zaczął rzucać we mnie winogronami. Miałem ochotę wbić mu nożyczki (którymi podcinałem liście) w oczy. A to byłby ciekawy widok. Dzieciak o białych włosach, skórze i niebieskich oczach stałby sie albinosem (bo z niebieskich oczu stałyby sie czerwone jak u białych szczurów i myszek). Ojciec sie co prawda na niego wydzierał, żeby nie przeszkadzał, ale jak widać dzieciak miał to po prostu w dupie. Więc skoro dziecko nie słucha się nawet rodzica, to jak w szkole posłucha nauczyciela? Jak znajdzie pracę, w której będzie jakiś szef (chyba, że sam sobie będzie szefem)? W ten oto sposób jak dowiaduję się o tym, że ktoś wychowuje dzieciaka bezstresowo, ciary przechodzą mi po plecach, a tegoż rodzica mam ochotę palnąć mu w łeb tak żeby się przekręcił (zarówno łeb jak i poglądy). A jak mnie wychowano? Byłem często sam sobie zostawiany. Ale nie oznacza że to byla wspomniana olewka. Jako, że rodzice pracowali, a rodzeństwo chodziło do szkoły, podrzucali mnie do babci. A wieczorami się mną opiekowali. Gadali jak miałem problemy i nie bili bo ponoć byłem spokojnym dzieckiem. No i tyle. Do 17 urodzin 2 dni, a szczura chyba nie dostanę. A sam nie kupię!;(" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "collectionL !! 8" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Z powrotem do wyszukiwarek\n", - "\n", - "Możemy potraktować zapytanie jako bardzo krótki dokument, dokonać jego wektoryzacji i policzyć cosinus kąta między zapytaniem a dokumentem." - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ja za to znam przypadek, że koleżanka mieszkala w bloku parę lat temu, pewnego razu wchodzi do łazienki w samej bieliźnie a tam ogromny wąż na podłodze i tak się wystraszyła że wybiegła z wrzaskiem z mieszkania i wyleciała przed blok w samej bieliźnie i uciekła do babci swojej, która mieszkala gdzieś niedaleko. 
a potem się okazało, że jej sąsiad z dołu hodował sobie węża i tak właśnie swobodnie go \"pasał\" po mieszkaniu i wąż mu spierdzielił przez rurę w łazience :cool :" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Pewna dziewczyna, wieku mi nieznanego, w mieście stołecznym - rozwiodła się. Była sama i samotna, więc zapragnęła kupić sobie zwierzę, aby swą miłą obecnością rozjaśniało jej puste wieczory i takież poranki. Dziewczyna była najwyraźniej ekscentryczką, bo zamiast rozkosznego, miękkiego kociaka z czerwonym kłębuszkiem wełenki lub kudłatego pieska , co sika na parkiet i gryzie skarpetki - kupiła sobie ... węża. Wąż zamieszkał z dziewczyną, i dobrze im było. Gad jadł, spał i rósł, a po pierwszym okresie obojętności ( zwłaszcza ze strony węża ) nawiązała się między nimi nić porozumienia. Przynajmniej dziewczyna odczuwała tę nić wyraźnie, gdyż wąż reagował na jej obecność, a nocą spał zwinięty w kłębek w nogach jej łóżka. Po dwóch latach wspólnego bytowania, nie przerywanych żadnym znaczącym wydarzeniem w ich wzajemnych relacjach, dziewczyna zauważyła, że wąż stał się osowiały. Przestał jeść, chował się po kątach, a nocami, zamiast w nogach łóżka - sypiał wyciągnięty wzdłuż jej boku. Martwiła się o swojego gada i poszła z nim do weterynarza. Weterynarz zbadał go, zapisał leki na poprawę apetytu ( ciekawe, jak się bada węża ? ) i odesłał do domu. Zdrowie śliskiego pacjenta nie poprawiło się, więc troskliwa dziewczyna postanowiła zasięgnąć porady u znawcy gadów i gadzich obyczajów. Znawca wysłuchał opisu niepokojących objawów, i powiedział : - Proszę pani. Ten wąż nie jest chory. On teraz pości. A leży wzdłuż pani nocą, bo sprawdza, czy pani się zmieści. To prawdziwa historia. Opowiedziała nam ją dziś klientka. Leżę na łóżku, pisze tego posta, i patrzę na drzemiącą obok mnie kotkę. Trochę mała jest. Raczej nie ma szans, żebym sie zmieściła, jakby co.." - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Anakonda. Czy to kolejna miejska legenda? Jakiś czas temu koleżanka na jednej z imprez towarzyskich opowiedziała mrożącą krew w żyłach historię o dziewczynie ze swojej pracy, która w Warszawie na dyskotece w Dekadzie poznała chłopaka. Spotykała się z nim na kawę i po drugiej randce doszło do pocałunków. Umówiła się na trzecią randkę, ale zanim do niej doszło wyskoczył jej jakiś pryszcz na twarzy. Poszła do lekarza, a ten... zawiadomił policję, prokuraturę itd. , bo rozpoznał zarażenie... jadem trupim! Rozpoczęto przesłuchanie dziewczyny i po wyjaśnieniach trafiono do chłopaka, z którym się całowała. W jego domu odkryto rozkładające się zwłoki dwóch dziewczyn. Byłam ta historią wstrząśnięta. Następnego dnia opowiedziałam ją w pracy, a koleżanka Justyna przyznała, że już o tym slyszała. To mnie utwierdziło, że historia jest prawdziwa, ale... tylko do wieczora. Coś mi nie dawało spokoju. Uwaga TVN nic? Interwencja Polsatu - nic? Nasz rodzimy Telekurier nic? Zaczęłam sprawdzać w internecie co to jest jad trupi, opryszczka od zakażenia tymże jadem i tak... trafiłam na miejską legendę. Historia wydarzyła się nie tylko w Warszawie, ale i w Olsztynie, Toruniu, Wrocławiu i Krakowie, a być może w ogóle za granicą. Choć prawdopodobne jest, że nie wydarzyła się nigdy. Głośno o niej było na miejskch forach. Za każdym razem ofiara była czyjąś znajomą. Po przeczytaniu kolejnej wersji historii zadzwoniłam do koleżanki, która opowiedziała mi tę historię i sklęłam czym świat stoi. 
Dlatego kiedy kilka dni temu inna koleżanka opowiedziała kolejną mrożącą krew w żyłach historię - tym razem o anakondzie - rozpoczęłam poszukiwania w internecie czy to nie jest następna miejska legenda. Nic nie znalazłam. Jednak coś mi nie pasuje, choć ta historia może brzmieć wielce prawdopodobnie. Zwłaszcza, gdy ktoś oglądał głupawy film z J. Lo. Zainteresowało mnie to, bo siedząc nad powieścią \"Dzika\" poczytałam trochę o wężach. A o jaką historię mi chodzi? Pewna kobieta (podobno sąsiadka tej mojej koleżanki z pracy, która historię opowiadała) hodowała w domu węża - anakondę. Hodowała ją pięć lat i nie trzymała w terrarium. Anakonda chodziła (pełzała) samopas po domu i co kilka dni dostawała chomika, szczura, mysz lub królika do zjedzenia. Pewnego dnia przestała jeść i zaczęła się dziwnie zachowywać. Każdego ranka po przebudzeniu właścicielka znajdowała ją w swoim łóżku wyprostowaną jak struna. Po dwóch tygodniach takich zachowań ze strony anakondy właścicielka zaniepokojona stanem zdrowia ukochanego węża poszła z nim do lekarza. Ten wysłuchał objawów \"choroby\" i powiedział, że anakonda głodziła się, by zjeść... włascicielkę. Kładzenie się koło niej było mierzeniem ile jeszcze głodzić się trzeba, by właścicielka zmieściła się w pysku no i badaniem od której strony trzeba ją zaatakować. Wężowi chodziło bowiem o to, by smakowity i duży obiad się za bardzo nie bronił. Ja domyśliłam się od razu do czego zmierza ta historia (lektura artykułów o wężach zrobiła swoje), ale dla reszty, którzy słuchali było to szokiem. Mnie szokuje co innego. Po co trzymać węża skoro nie ma z nim człowiek żadnego kontaktu? To nie pies, kot czy inny ssak. To nie ptak. Wąż to wąż! Nie przyjdzie na zawołanie. Jaby ktoś nie wiedział to... Węże są mięsożerne. Połykają ofiary w całości, mimo że często wielokrotnie są one większe od samego węża. Połykanie polega na nasuwaniu się węża na swoją ofiarę. A anakonda... żyje zwykle w wodzie i na drzewach, żywiąc się ssakami (m.in. tapiry, dziki, kapibary, jelenie!, gryzonie, niekiedy nawet jaguary), gadami (kajmany), rybami i ptakami, polując zazwyczaj w nocy. Jest w stanie połknąć ofiarę znacznie szerszą od swojego ciała, co jest możliwe dzięki rozciągnięciu szczęk. Trawienie jest bardzo powolne - po posiłku wąż trawi większą ofiarę przez wiele dni, a potem może pościć przez szereg tygodni lub miesięcy. Zanotowany rekord postu, w przypadku anakondy znajdującej się w niewoli, wynosi 2 lata. Z historii wynika, że gdyby nie interwencja u weterynarza mogłaby rodzina przez kilka lat szukać właścicielki anakondy. Myśleliby, że jest na wycieczce a ona w brzuszku w postaci obiadku. Jest tylko jedno ale. Nigdzie nie znalazłam jednak śladu, ani nawet wzmianki o tym, że anakonda zjadła człowieka. I dlatego ciągle w sumie mam wątpliwości. ps. Dalszy los anakondy \"sąsiadki\" koleżanki nie jest mi znany." - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import Data.Ord\n", - "import Data.List\n", - "\n", - "legendVectorizer = vectorizeTfIdf vocLSize collectionLNormalized vocL . normalize\n", - "\n", - "\n", - "query vs vzer q = map ((collectionL !!) . snd) $ Data.List.take 3 $ sortBy (\\a b -> fst b `compare` fst a) $ zip (map (`cosineSim` qvec) vs) [0..] 
\n", - " where qvec = vzer q \n", - "\n", - "query lVectorized legendVectorizer \"wąż przymierza się do zjedzenia właścicielki\"\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Haskell", - "language": "haskell", - "name": "haskell" - }, - "language_info": { - "codemirror_mode": "ihaskell", - "file_extension": ".hs", - "mimetype": "text/x-haskell", - "name": "haskell", - "pygments_lexer": "Haskell", - "version": "8.10.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n",
+ "Ekstrakcja informacji\n",
+ "3. Wyszukiwarki \u2014 TF-IDF [wyk\u0142ad]\n",
+ "Filip Grali\u0144ski (2021)\n",
+ "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Wyszukiwarka - szybka i sensowna" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Roboczy przyk\u0142ad\n", + "\n", + "Zak\u0142adamy, \u017ce mamy pewn\u0105 kolekcj\u0119 dokument\u00f3w $D = {d_1, \\ldots, d_N}$. ($N$ - liczba dokument\u00f3w w kolekcji)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ala ma kota." + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "{-# LANGUAGE OverloadedStrings #-}\n", + "\n", + "import Data.Text hiding(map, filter, zip)\n", + "import Prelude hiding(words, take)\n", + "\n", + "collectionD :: [Text]\n", + "collectionD = [\"Ala ma kota.\", \"Podobno jest kot w butach.\", \"Ty chyba masz kota!\", \"But chyba zgubi\u0142em.\", \"Kot ma kota.\"]\n", + "\n", + "-- Operator (!!) zwraca element listy o podanym indeksie\n", + "-- (Przy wi\u0119kszych listach b\u0119dzie nieefektywne, ale nie b\u0119dziemy komplikowa\u0107)\n", + "Prelude.head collectionD" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Wydobycie tekstu\n", + "\n", + "Przyk\u0142adowe narz\u0119dzia:\n", + "\n", + "* pdftotext\n", + "* antiword\n", + "* Tesseract OCR\n", + "* Apache Tika - uniwersalne narz\u0119dzie do wydobywania tekstu z r\u00f3\u017cnych format\u00f3w\n", + "\n", + "## Normalizacja tekstu\n", + "\n", + "Cokolwiek robimy z tekstem, najpierw musimy go _znormalizowa\u0107_." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tokenizacja\n", + "\n", + "Po pierwsze musimy podzieli\u0107 tekst na _tokeny_, czyli wyrazapodobne jednostki.\n", + "Mo\u017ce po prostu podzieli\u0107 po spacjach?" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ala" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ma" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kota." + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tokenizeStupidly :: Text -> [Text]\n", + "-- words to funkcja z Data.Text, kt\u00f3ra dzieli po spacjach\n", + "tokenizeStupidly = words\n", + "\n", + "tokenizeStupidly $ Prelude.head collectionD" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A, trzeba _chocia\u017c_ odsun\u0105\u0107 znaki interpunkcyjne. Najpro\u015bciej u\u017cy\u0107 wyra\u017cenia regularnego. Warto u\u017cy\u0107 [unikodowych w\u0142asno\u015bci](https://en.wikipedia.org/wiki/Unicode_character_property) znak\u00f3w i konstrukcji `\\p{...}`. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "But" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "chyba" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zgubi\u0142em" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "." 
+ ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "{-# LANGUAGE QuasiQuotes #-}\n", + "\n", + "import Text.Regex.PCRE.Heavy\n", + "\n", + "tokenize :: Text -> [Text]\n", + "tokenize = map fst . scan [re|C\\+\\+|[\\p{L}0-9]+|\\p{P}|]\n", + "\n", + "tokenize $ collectionD !! 3\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ca\u0142a kolekcja stokenizowana:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ala" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ma" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kota" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "." + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Podobno" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "jest" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "butach" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "." + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Ty" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "chyba" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "masz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kota" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "!" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "But" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "chyba" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zgubi\u0142em" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "." + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ma" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kota" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "." 
+ ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "map tokenize collectionD" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Problemy z tokenizacj\u0105\n", + "\n", + "##### J\u0119zyk angielski" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "I" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "use" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "a" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "data" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "-" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "base" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tokenize \"I use a data-base\"" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "I" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "use" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "a" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "database" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tokenize \"I use a database\"" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "I" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "use" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "a" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "data" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "base" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tokenize \"I use a data base\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "I" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "don" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "t" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "like" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Python" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tokenize \"'I don't like Python'\"" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "I" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "can" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "see" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "the" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Johnes" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "house" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tokenize \"I can see the Johnes' house\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "I" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "do" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "not" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "like" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Python" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tokenize \"I do not like Python\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0018" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "555" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "-" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "555" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "-" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "122" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tokenize \"+0018 555-555-122\"" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0018555555122" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tokenize \"+0018555555122\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Which" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "one" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "is" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "better" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + ":" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "C++" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "or" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "C" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "#" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "?" 
+ ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tokenize \"Which one is better: C++ or C#?\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Inne j\u0119zyki?" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Rechtsschutzversicherungsgesellschaften" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "wie" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "die" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "HUK" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "-" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Coburg" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "machen" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "es" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "bereits" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "seit" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "geraumer" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Zeit" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "vor" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + ":" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tokenize \"Rechtsschutzversicherungsgesellschaften wie die HUK-Coburg machen es bereits seit geraumer Zeit vor:\"" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u4eca\u65e5\u6ce2\u5179\u5357\u662f\u8d38\u6613" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "\u3001" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "\u5de5\u4e1a\u53ca\u6559\u80b2\u7684\u4e2d\u5fc3" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "\u3002" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "\u6ce2\u5179\u5357\u662f\u6ce2\u5170\u7b2c\u4e94\u5927\u7684\u57ce\u5e02\u53ca\u7b2c\u56db\u5927\u7684\u5de5\u4e1a\u4e2d\u5fc3" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "\uff0c" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "\u6ce2\u5179\u5357\u4ea6\u662f\u5927\u6ce2\u5170\u7701\u7684\u884c\u653f\u9996\u5e9c" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "\u3002" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "\u4e5f\u8209\u8fa6\u6709\u4e0d\u5c11\u5c55\u89bd\u6703" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "\u3002" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + 
"\u662f\u6ce2\u862d\u897f\u90e8\u91cd\u8981\u7684\u4ea4\u901a\u4e2d\u5fc3\u90fd\u5e02" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "\u3002" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tokenize \"\u4eca\u65e5\u6ce2\u5179\u5357\u662f\u8d38\u6613\u3001\u5de5\u4e1a\u53ca\u6559\u80b2\u7684\u4e2d\u5fc3\u3002\u6ce2\u5179\u5357\u662f\u6ce2\u5170\u7b2c\u4e94\u5927\u7684\u57ce\u5e02\u53ca\u7b2c\u56db\u5927\u7684\u5de5\u4e1a\u4e2d\u5fc3\uff0c\u6ce2\u5179\u5357\u4ea6\u662f\u5927\u6ce2\u5170\u7701\u7684\u884c\u653f\u9996\u5e9c\u3002\u4e5f\u8209\u8fa6\u6709\u4e0d\u5c11\u5c55\u89bd\u6703\u3002\u662f\u6ce2\u862d\u897f\u90e8\u91cd\u8981\u7684\u4ea4\u901a\u4e2d\u5fc3\u90fd\u5e02\u3002\"" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "l" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ordinateur" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tokenize \"l'ordinateur\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Lematyzacja" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "_Lematyzacja_ to sprowadzenie do formy podstawowej (_lematu_), np. \"krze\u015ble\" do \"krzes\u0142o\", \"zrobimy\" do \"zrobi\u0107\" dla j\u0119zyka polskiego, \"chairs\" do \"chair\", \"made\" do \"make\" dla j\u0119zyka angielskiego." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lematyzacja dla j\u0119zyka polskiego jest bardzo trudna, praktycznie nie spos\u00f3b wykona\u0107 j\u0105 regu\u0142owo, po prostu musimy si\u0119 postara\u0107 o bardzo obszerny _s\u0142ownik form fleksyjnych_.\n", + "\n", + "Na potrzeby tego wyk\u0142adu stw\u00f3rzmy sobie ma\u0142y s\u0142ownik form fleksyjnych w postaci tablicy asocjacyjnej (haszuj\u0105cej)." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Line 22: Use head\n", + "Found:\n", + "collectionD !! 0\n", + "Why not:\n", + "head collectionD" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "but" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "butami" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Ala" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mie\u0107" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "." + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Wczoraj" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kupi\u0142em" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "." + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import Data.Map as Map hiding(take, map, filter)\n", + "\n", + "mockInflectionDictionary :: Map Text Text\n", + "mockInflectionDictionary = Map.fromList [\n", + " (\"kota\", \"kot\"),\n", + " (\"butach\", \"but\"),\n", + " (\"masz\", \"mie\u0107\"),\n", + " (\"ma\", \"mie\u0107\"),\n", + " (\"buta\", \"but\"),\n", + " (\"zgubi\u0142em\", \"zgubi\u0107\")]\n", + "\n", + "lemmatizeWord :: Map Text Text -> Text -> Text\n", + "lemmatizeWord dict w = findWithDefault w w dict\n", + "\n", + "lemmatizeWord mockInflectionDictionary \"butach\"\n", + "-- a tego nie ma w naszym s\u0142owniczku, wi\u0119c zwracamy to samo\n", + "lemmatizeWord mockInflectionDictionary \"butami\"\n", + "\n", + "lemmatize :: Map Text Text -> [Text] -> [Text]\n", + "lemmatize dict = map (lemmatizeWord dict)\n", + "\n", + "lemmatize mockInflectionDictionary $ tokenize $ collectionD !! 0 \n", + "\n", + "lemmatize mockInflectionDictionary $ tokenize \"Wczoraj kupi\u0142em kota.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Pytanie**: Nawet w naszym s\u0142owniczku mamy problemy z niejednoznaczno\u015bci\u0105 lematyzacji. Jakie?\n", + "\n", + "Obszerny s\u0142ownik form fleksyjnych dla j\u0119zyka polskiego: http://zil.ipipan.waw.pl/PoliMorf?action=AttachFile&do=view&target=PoliMorf-0.6.7.tab.gz" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Stemowanie\n", + "\n", + "Stemowanie (rdzeniowanie) obcina wyraz do _rdzenia_ niekoniecznie b\u0119d\u0105cego sensownym wyrazem, np. \"krze\u015ble\" mo\u017ce by\u0107 rdzeniowane do \"krze\u015bl\", \"krze\u015b\" albo \"krzes\", \"zrobimy\" do \"zrobi\".\n", + "\n", + "* stemowanie nie jest tak dobrze okre\u015blone jak lematyzacja (mo\u017cna robi\u0107 na wiele sposob\u00f3w)\n", + "* bardziej podatne na metody regu\u0142owe (cho\u0107 dla polskiego i tak trudno)\n", + "* dla angielskiego istniej\u0105 znane algorytmy stemowania, np. [algorytm Portera](https://tartarus.org/martin/PorterStemmer/def.txt)\n", + "* zob. te\u017c [program Snowball](https://snowballstem.org/) z regu\u0142ami dla wielu j\u0119zyk\u00f3w\n", + "\n", + "Prosty stemmer \"dla ubogich\" dla j\u0119zyka polskiego to obcinanie do sze\u015bciu znak\u00f3w." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "zrobim" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "komput" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "butach" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "\u017ad\u017ab\u0142a" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "poorMansStemming :: Text -> Text\n", + "poorMansStemming = Data.Text.take 6\n", + "\n", + "poorMansStemming \"zrobimy\"\n", + "poorMansStemming \"komputerami\"\n", + "poorMansStemming \"butach\"\n", + "poorMansStemming \"\u017ad\u017ab\u0142ami\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### _Stop words_\n", + "\n", + "Cz\u0119sto wyszukiwarki pomijaj\u0105 kr\u00f3tkie, cz\u0119ste i nienios\u0105ce znaczenia s\u0142owa - _stop words_ (_s\u0142owa przestankowe_)." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "isStopWord :: Text -> Bool\n", + "isStopWord \"w\" = True\n", + "isStopWord \"jest\" = True\n", + "isStopWord \"\u017ce\" = True\n", + "-- przy okazji mo\u017cemy pozby\u0107 si\u0119 znak\u00f3w interpunkcyjnych\n", + "isStopWord w = w \u2248 [re|^\\p{P}+$|]\n", + "\n", + "isStopWord \"kot\"\n", + "isStopWord \"!\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Ala" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ma" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kota" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "removeStopWords :: [Text] -> [Text]\n", + "removeStopWords = filter (not . isStopWord)\n", + "\n", + "removeStopWords $ tokenize $ Prelude.head collectionD " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Pytanie**: Jakim zapytaniom usuwanie _stop words_ mo\u017ce szkodzi\u0107? Poda\u0107 przyk\u0142ady dla j\u0119zyka polskiego i angielskiego. 
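W praktyce lista słów przestankowych jest znacznie dłuższa niż trzy przypadki zapisane powyżej i zwykle trzymamy ją w zbiorze o szybkim wyszukiwaniu. Poniżej tylko luźny szkic takiego podejścia (nazwy `polishStopWords` i `isStopWord'` są umowne, a zawartość listy jedynie przykładowa; zakładamy włączone rozszerzenie `OverloadedStrings`):

```haskell
import qualified Data.Set as S
import Data.Text (Text)

-- przykładowa (hipotetyczna) lista; w praktyce wczytalibyśmy ją z pliku
polishStopWords :: S.Set Text
polishStopWords = S.fromList ["a", "i", "w", "z", "na", "się", "jest", "że"]

-- sprawdzenie w zbiorze zamiast osobnej klauzuli na każde słowo
isStopWord' :: Text -> Bool
isStopWord' w = w `S.member` polishStopWords
```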
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Normalizacja - r\u00f3\u017cno\u015bci\n", + "\n", + "W sk\u0142ad normalizacji mo\u017ce te\u017c wchodzi\u0107:\n", + "\n", + "* poprawianie b\u0142\u0119d\u00f3w literowych\n", + "* sprowadzanie do ma\u0142ych liter (lower-casing czy raczej case-folding)\n", + "* usuwanie znak\u00f3w diakrytycznych\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u017cd\u017ab\u0142o" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "toLower \"\u017bD\u0179B\u0141O\"" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\u017ad\u017ab\u0142o" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "toCaseFold \"\u0179D\u0179B\u0141O\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Pytanie:** Kiedy _case-folding_ da inny wynik ni\u017c _lower-casing_? Jakie to ma praktyczne znaczenie?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Normalizacja jako ca\u0142o\u015bciowy proces\n", + "\n", + "Najwa\u017cniejsza zasada: dokumenty w naszej kolekcji powinny by\u0107 normalizowane w dok\u0142adnie taki spos\u00f3b, jak zapytania.\n", + "\n", + "Efektem normalizacji jest zamiana dokumentu na ci\u0105g _term\u00f3w_ (ang. _terms_), czyli znormalizowanych wyraz\u00f3w.\n", + "\n", + "Innymi s\u0142owy po normalizacji dokument $d_i$ traktujemy jako ci\u0105g term\u00f3w $t_i^1,\\dots,t_i^{|d_i|}$." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ala" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mie\u0107" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "podobn" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "but" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ty" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "chyba" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mie\u0107" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "but" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "chyba" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zgubi\u0107" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mie\u0107" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "normalize :: Text -> [Text]\n", + 
"normalize = map poorMansStemming . removeStopWords . map toLower . lemmatize mockInflectionDictionary . tokenize\n", + "\n", + "map normalize collectionD" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Zbi\u00f3r wszystkich term\u00f3w w kolekcji dokument\u00f3w nazywamy s\u0142ownikiem (ang. _vocabulary_), nie myli\u0107 ze s\u0142ownikiem jako struktur\u0105 danych w Pythonie (_dictionary_).\n", + "\n", + "$$V = \\bigcup_{i=1}^N \\{t_i^1,\\dots,t_i^{|d_i|}\\}$$\n", + "\n", + "(To zbi\u00f3r, wi\u0119c liczymy bez powt\u00f3rze\u0144!)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "fromList [\"ala\",\"but\",\"chyba\",\"kot\",\"mie\\263\",\"podobn\",\"ty\",\"zgubi\\263\"]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import Data.Set as Set hiding(map)\n", + "\n", + "getVocabulary :: [Text] -> Set Text \n", + "getVocabulary = Set.unions . map (Set.fromList . normalize) \n", + "\n", + "getVocabulary collectionD" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Jak wyszukiwarka mo\u017ce by\u0107 szybka?\n", + "\n", + "_Odwr\u00f3cony indeks_ (ang. _inverted index_) pozwala wyszukiwarce szybko szuka\u0107 w milionach dokument\u00f3w. Odwr\u00f3cony indeks to prostu... indeks, jaki znamy z ksi\u0105\u017cek (mapowanie s\u0142\u00f3w na numery stron/dokument\u00f3w).\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Line 4: Use tuple-section\n", + "Found:\n", + "\\ t -> (t, ix)\n", + "Why not:\n", + "(, ix)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "fromList [(\"chyba\",2),(\"kot\",2),(\"mie\\263\",2),(\"ty\",2)]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "collectionDNormalized = map normalize collectionD\n", + "\n", + "documentToPostings :: ([Text], Int) -> Set (Text, Int)\n", + "documentToPostings (d, ix) = Set.fromList $ map (\\t -> (t, ix)) d\n", + "\n", + "documentToPostings (collectionDNormalized !! 2, 2) \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Line 2: Use zipWith\n", + "Found:\n", + "map documentToPostings $ Prelude.zip coll [0 .. ]\n", + "Why not:\n", + "zipWith (curry documentToPostings) coll [0 .. ]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "fromList [(\"ala\",0),(\"but\",1),(\"but\",3),(\"chyba\",2),(\"chyba\",3),(\"kot\",0),(\"kot\",1),(\"kot\",2),(\"kot\",4),(\"mie\\263\",0),(\"mie\\263\",2),(\"mie\\263\",4),(\"podobn\",1),(\"ty\",2),(\"zgubi\\263\",3)]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "collectionToPostings :: [[Text]] -> Set (Text, Int)\n", + "collectionToPostings coll = Set.unions $ map documentToPostings $ Prelude.zip coll [0..]\n", + "\n", + "collectionToPostings collectionDNormalized" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Line 2: Eta reduce\n", + "Found:\n", + "updateInvertedIndex (t, ix) invIndex\n", + " = insertWith (++) t [ix] invIndex\n", + "Why not:\n", + "updateInvertedIndex (t, ix) = insertWith (++) t [ix]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "fromList [(\"ala\",[0]),(\"but\",[1,3]),(\"chyba\",[2,3]),(\"kot\",[0,1,2,4]),(\"mie\\263\",[0,2,4]),(\"podobn\",[1]),(\"ty\",[2]),(\"zgubi\\263\",[3])]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[0,1,2,4]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "updateInvertedIndex :: (Text, Int) -> Map Text [Int] -> Map Text [Int]\n", + "updateInvertedIndex (t, ix) invIndex = insertWith (++) t [ix] invIndex\n", + "\n", + "getInvertedIndex :: [[Text]] -> Map Text [Int]\n", + "getInvertedIndex = Prelude.foldr updateInvertedIndex Map.empty . Set.toList . collectionToPostings\n", + "\n", + "ind = getInvertedIndex collectionDNormalized\n", + "ind\n", + "ind ! \"kot\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Relewantno\u015b\u0107\n", + "\n", + "Potrafimy szybko przeszukiwa\u0107 znormalizowane dokumenty, ale kt\u00f3re dokumenty s\u0105 wa\u017cne (_relewantne_) wzgl\u0119dem potrzeby informacyjnej u\u017cytkownika?\n", + "\n", + "### Zapytania boole'owskie\n", + "\n", + "* `pizzeria Pozna\u0144 dow\u00f3z` to `pizzeria AND Pozna\u0144 AND dow\u00f3z` czy `pizzeria OR Pozna\u0144 OR dow\u00f3z`\n", + "* `(pizzeria OR pizza OR tratoria) AND Pozna\u0144 AND dow\u00f3z\n", + "* `pizzeria AND Pozna\u0144 AND dow\u00f3z AND NOT golonka`\n", + "\n", + "Jak domy\u015blnie interpretowa\u0107 zapytanie?\n", + "\n", + "* jako zapytanie AND -- by\u0107 mo\u017ce za ma\u0142o dokument\u00f3w\n", + "* rozwi\u0105zanie po\u015brednie?\n", + "* jako zapytanie OR -- by\u0107 mo\u017ce za du\u017co dokument\u00f3w\n", + "\n", + "Mo\u017cemy jakie\u015b miary dopasowania dokumentu do zapytania, \u017ceby m\u00f3c posortowa\u0107 dokumenty...\n", + "\n", + "### Mierzenie dopasowania dokumentu do zapytania\n", + "\n", + "Potrzebujemy jakie\u015b funkcji $\\sigma : Q x D \\rightarrow \\mathbb{R}$. \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Musimy jako\u015b zamieni\u0107 dokumenty na liczby, tj. dokumenty na wektory liczb, a ca\u0142\u0105 kolekcj\u0119 na macierz.\n", + "\n", + "Po pierwsze ponumerujmy wszystkie termy ze s\u0142ownika." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "fromList [(0,\"ala\"),(1,\"but\"),(2,\"chyba\"),(3,\"kot\"),(4,\"mie\\263\"),(5,\"podobn\"),(6,\"ty\"),(7,\"zgubi\\263\")]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "fromList [(\"ala\",0),(\"but\",1),(\"chyba\",2),(\"kot\",3),(\"mie\\263\",4),(\"podobn\",5),(\"ty\",6),(\"zgubi\\263\",7)]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ala" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "2" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "voc = getVocabulary collectionD\n", + "\n", + "vocD :: Map Int Text\n", + "vocD = Map.fromList $ zip [0..] 
$ Set.toList voc\n", + "\n", + "invvocD :: Map Text Int\n", + "invvocD = Map.fromList $ zip (Set.toList voc) [0..]\n", + "\n", + "vocD\n", + "\n", + "invvocD\n", + "\n", + "vocD ! 0\n", + "invvocD ! \"chyba\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Napiszmy funkcj\u0119, kt\u00f3ra _wektoryzuje_ znormalizowany dokument.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Line 2: Redundant $\n", + "Found:\n", + "map (\\ i -> count (v ! i) doc) $ [0 .. (vecSize - 1)]\n", + "Why not:\n", + "map (\\ i -> count (v ! i) doc) [0 .. (vecSize - 1)]Line 9: Redundant bracket\n", + "Found:\n", + "(collectionDNormalized !! 2)\n", + "Why not:\n", + "collectionDNormalized !! 2" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ty" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "chyba" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mie\u0107" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "vectorize :: Int -> Map Int Text -> [Text] -> [Double]\n", + "vectorize vecSize v doc = map (\\i -> count (v ! i) doc) $ [0..(vecSize-1)]\n", + " where count t doc \n", + " | t `elem` doc = 1.0\n", + " | otherwise = 0.0\n", + " \n", + "vocSize = Set.size voc\n", + "\n", + "(collectionDNormalized !! 2)\n", + "vectorize vocSize vocD (collectionDNormalized !! 2)\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " ![image](./macierz.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Jak inaczej uwzgl\u0119dni\u0107 cz\u0119sto\u015b\u0107 wyraz\u00f3w?\n", + "\n", + "
\n", + " $\n", + " \\newcommand{\\idf}{\\mathop{\\rm idf}\\nolimits}\n", + " \\newcommand{\\tf}{\\mathop{\\rm tf}\\nolimits}\n", + " \\newcommand{\\df}{\\mathop{\\rm df}\\nolimits}\n", + " \\newcommand{\\tfidf}{\\mathop{\\rm tfidf}\\nolimits}\n", + " $\n", + "
\n", + "\n", + "* $\\tf_{t,d}$ - term frequency\n", + "\n", + "* $1+\\log(\\tf_{t,d})$\n", + "\n", + "* $0.5 + \\frac{0.5 \\times \\tf_{t,d}}{max_t(\\tf_{t,d})}$" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Line 2: Redundant $\n", + "Found:\n", + "map (\\ i -> count (v ! i) doc) $ [0 .. (vecSize - 1)]\n", + "Why not:\n", + "map (\\ i -> count (v ! i) doc) [0 .. (vecSize - 1)]Line 7: Redundant bracket\n", + "Found:\n", + "(collectionDNormalized !! 4)\n", + "Why not:\n", + "collectionDNormalized !! 4" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mie\u0107" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "vectorizeTf :: Int -> Map Int Text -> [Text] -> [Double]\n", + "vectorizeTf vecSize v doc = map (\\i -> count (v ! i) doc) $ [0..(vecSize-1)]\n", + " where count t doc = fromIntegral $ (Prelude.length . Prelude.filter (== t)) doc\n", + "\n", + "vocSize = Set.size voc\n", + "\n", + "(collectionDNormalized !! 4)\n", + "vectorize vocSize vocD (collectionDNormalized !! 4)\n", + "vectorizeTf vocSize vocD (collectionDNormalized !! 4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + " $\n", + " \\newcommand{\\idf}{\\mathop{\\rm idf}\\nolimits}\n", + " \\newcommand{\\tf}{\\mathop{\\rm tf}\\nolimits}\n", + " \\newcommand{\\df}{\\mathop{\\rm df}\\nolimits}\n", + " \\newcommand{\\tfidf}{\\mathop{\\rm tfidf}\\nolimits}\n", + " $\n", + "
\n", + "\n", + "### Odwrotna cz\u0119sto\u015b\u0107 dokumentowa\n", + "\n", + "Czy wszystkie wyrazy s\u0105 tak samo wa\u017cne?\n", + "\n", + "**NIE.** Wyrazy pojawiaj\u0105ce si\u0119 w wielu dokumentach s\u0105 mniej wa\u017cne.\n", + "\n", + "Aby to uwzgl\u0119dni\u0107, przemna\u017camy frekwencj\u0119 wyrazu przez _odwrotn\u0105\n", + " cz\u0119sto\u015b\u0107 w dokumentach_ (_inverse document frequency_):\n", + "\n", + "$$\\idf_t = \\log \\frac{N}{\\df_t},$$\n", + "\n", + "gdzie:\n", + "\n", + "* $\\idf_t$ - odwrotna cz\u0119sto\u015b\u0107 wyrazu $t$ w dokumentach\n", + "\n", + "* $N$ - liczba dokument\u00f3w w kolekcji\n", + "\n", + "* $\\df_f$ - w ilu dokumentach wyst\u0105pi\u0142 wyraz $t$?\n", + "\n", + "#### Dlaczego idf?\n", + "\n", + "term $t$ wyst\u0105pi\u0142...\n", + "\n", + "* w 1 dokumencie, $\\idf_t = \\log N/1 = \\log N$\n", + "* 2 razy w kolekcji, $\\idf_t = \\log N/2$ lub $\\log N$\n", + "* w po\u0142owie dokument\u00f3w kolekcji, $\\idf_t = \\log N/(N/2) = \\log 2$\n", + "* we wszystkich dokumentach, $\\idf_t = \\log N/N = \\log 1 = 0$\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.22314355131420976" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "idf :: [[Text]] -> Text -> Double\n", + "idf coll t = log (fromIntegral n / fromIntegral df)\n", + " where df = Prelude.length $ Prelude.filter (\\d -> t `elem` d) coll\n", + " n = Prelude.length coll\n", + " \n", + "idf collectionDNormalized \"kot\" " + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9162907318741551" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "idf collectionDNormalized \"chyba\" " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Co z tego wynika?\n", + "\n", + "Zamiast $\\tf_{t,d}$ b\u0119dziemy w wektorach rozpatrywa\u0107 warto\u015bci:\n", + "\n", + "$$\\tfidf_{t,d} = \\tf_{t,d} \\times \\idf_{t}$$\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mie\u0107" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kot" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[0.0,0.0,0.0,0.44628710262841953,0.5108256237659907,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "vectorizeTfIdf :: Int -> [[Text]] -> Map Int Text -> [Text] -> [Double]\n", + "vectorizeTfIdf vecSize coll v doc = map (\\i -> count (v ! i) doc * idf coll (v ! i)) [0..(vecSize-1)]\n", + " where count t doc = fromIntegral $ (Prelude.length . Prelude.filter (== t)) doc\n", + "\n", + "vocSize = Set.size voc\n", + "\n", + "collectionDNormalized !! 4\n", + "vectorize vocSize vocD (collectionDNormalized !! 4)\n", + "vectorizeTf vocSize vocD (collectionDNormalized !! 4)\n", + "vectorizeTfIdf vocSize collectionDNormalized vocD (collectionDNormalized !! 
4)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[[1.6094379124341003,0.0,0.0,0.22314355131420976,0.5108256237659907,0.0,0.0,0.0],[0.0,0.9162907318741551,0.0,0.22314355131420976,0.0,1.6094379124341003,0.0,0.0],[0.0,0.0,0.9162907318741551,0.22314355131420976,0.5108256237659907,0.0,1.6094379124341003,0.0],[0.0,0.9162907318741551,0.9162907318741551,0.0,0.0,0.0,0.0,1.6094379124341003],[0.0,0.0,0.0,0.44628710262841953,0.5108256237659907,0.0,0.0,0.0]]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "map (vectorizeTfIdf vocSize collectionDNormalized vocD) collectionDNormalized" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Teraz zdefiniujemy _overlap score measure_:\n", + "\n", + "$$\\sigma(q,d) = \\sum_{t \\in q} \\tfidf_{t,d}$$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Podobie\u0144stwo kosinusowe\n", + "\n", + "_Overlap score measure_ nie jest jedyn\u0105 mo\u017cliw\u0105 metryk\u0105, za pomoc\u0105 kt\u00f3rej mo\u017cemy mierzy\u0107 dopasowanie dokumentu do zapytania. Mo\u017cemy r\u00f3wnie\u017c si\u0119gn\u0105\u0107 po intuicje geometryczne (skoro mamy do czynienia z wektorami).\n", + "\n", + "**Pytanie**: Ile wymiar\u00f3w maj\u0105 wektory, na kt\u00f3rych operujemy? Jak \"wygl\u0105daj\u0105\" te wektory? Czy mo\u017cemy wykonywa\u0107 na nich standardowe operacje geometryczne czy te, kt\u00f3re znamy z geometrii liniowej?\n", + "\n", + "#### Podobie\u0144stwo mi\u0119dzy dokumentami\n", + "\n", + "Zajmijmy si\u0119 teraz poszukiwaniem miary mierz\u0105cej podobie\u0144stwo mi\u0119dzy dokumentami $d_1$ i $d_2$ (czyli poszukujemy sensownej funkcji $\\sigma : D x D \\rightarrow \\mathbb{R}$).\n", + "\n", + "**Uwaga** Poj\u0119cia \"miary\" u\u017cywamy nieformalnie, nie spe\u0142nia ona za\u0142o\u017ce\u0144 znanych z teorii miary.\n", + "\n", + "Rozpatrzmy zbiorek tekst\u00f3w legend miejskich z .\n", + "\n", + "(To autentyczne teksty z Internentu, z j\u0119zykiem potocznym, wulgarnym itd.)\n", + "\n", + "```\n", + " git clone git://gonito.net/polish-urban-legends\n", + " paste polish-urban-legends/dev-0/expected.tsv polish-urban-legends/dev-0/in.tsv > legendy.txt\n", + "``` " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "na_ak" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Opowie\u015b\u0107 prawdziwa... Olsztyn, akademik, 7 pi\u0119tro, impreza u Mariusza, jak to na polskiej najebce bywa kto\u015b rzuci\u0142 tekstem: \"Mariusz nie zjedziesz na nartach po schodach\". Sprawa ucichla, studencii wrocili do tego co lubia i w sumie umiej\u0105 najbardziej czyli picia, lecz nad ranem kolo godziny 6.00 ludzia przypomnialo sie ze Mariusz mia\u0142 zjecha\u0107 na nartach po schodach. Tu warto wspomnie\u0107 \u017ce Mariusz by\u0142 zapalonym narciarzem st\u0105d w\u0142a\u015bnie w jego pokoju znalezc mo\u017cna bylo narty, bo po ki huj komu\u015b narty w Olsztynie! Tak wracajac do historii nasz bohater odzia\u0142 si\u0119 w sprzet, podszed do schodow i niestety da\u0142 rad\u0119 zjecha\u0107 jedynie w po\u0142owie, gdy\u017c jak to powiedzial \"no kurwa potkn\u0105\u0142em sie\", ale nieustraszoony Mariusz pr\u00f3bowal dalej. 
Nastepny zjazd byl perfekcyjny, jedno pietro zanim, niestety pomiedzy 6 a 5 pietrem Mariusza natrafil na Pania sprz\u0105taczke, kt\u00f3ra potr\u0105ci\u0142 i zwia\u0142 z miejsca wypadku. Ok godziny 10.00 nastopilo przebudzenie Mariusza, ktory zaraz po obudzeniu uslyszal co narobi\u0142, mianowicie o skutkach potracenia, Pani sprzataczka z\u0142amala r\u0119k\u0119 i trafi\u0142a do szpitala. Mog\u0142y powsta\u0107 przez to cie\u017ckie konsekwencje, Mariusz m\u00f3g\u0142 wyleciec z akademika je\u017celi kierownik dowie sie o calym zaj\u015bciu. Wiec koledzy poradzili narcia\u017cowi, aby kupi\u0142 kwiaty i bombonierk\u0119 i poszed\u0142 do szpitala z przeprosinami. Po szybkich zakupach w sasiedniej Biedr\u0105ce, Mariusz byl przygotowany na konfrontacje z Pania sprz\u0105taczka, ale nie mog\u0142o poj\u015b\u0107 pi\u0119knie i g\u0142adko. Po wej\u015bciu do szpitala nasz bohater skierowal swoje kroki do recepcji pytajac si\u0119 o cioci\u0119, kt\u00f3ra mia\u0142a wypadek w akademiku, recepcjonistka skierowa\u0142a go do lekarza, gdzie czeka\u0142 na jego wyj\u015bcie ok 15 minut, gdy lekarz ju\u017c wyszed\u0142 ten odrazu podlecia\u0142 do niego, \u017ceby spyta\u0107 si\u0119 o stan zdrowia Pani sprz\u0105taczki. Wnet uslyszla od lekarz, niestety Pani teraz jest u psychiatry po twierdzi, \u017ce kto\u015b potracil ja zje\u017cdzajac na nartach w akademiku. Po uslyszeniu tej wiadomosci Mariusz odwroci\u0142 si\u0119, wybieg\u0142, kupi\u0142 piecie i szybko pobieg\u0142 do akademika pi\u0107 dalej! Mora\u0142... student potrafi!" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import System.IO\n", + "import Data.List.Split as SP\n", + "\n", + "legendsh <- openFile \"legendy.txt\" ReadMode\n", + "hSetEncoding legendsh utf8\n", + "contents <- hGetContents legendsh\n", + "ls = Prelude.lines contents\n", + "items = map (map pack . 
SP.splitOn \"\\t\") ls\n", + "Prelude.head items" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "87" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "nbOfLegends = Prelude.length items\n", + "nbOfLegends" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "na_ak" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lud" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ba_hy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lap" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ne_dz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "be_wy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mo_zu" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "be_wy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ba_hy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mo_zu" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "be_wy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lud" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ne_dz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ta_ab" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ta_ab" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ta_ab" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lap" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ba_hy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ne_dz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ba_hy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "tr_su" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ne_dz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ba_hy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mo_zu" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "tr_su" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ne_dz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ne_dz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { 
+ "data": { + "text/plain": [ + "w_lud" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ne_dz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ta_ab" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lud" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "na_ak" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lap" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "be_wy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "na_ak" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lap" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "na_ak" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ba_hy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lud" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mo_zu" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ba_hy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "tr_su" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "na_ak" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ba_hy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lud" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lud" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "tr_su" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lud" + ] + }, + "metadata": {}, + "output_type": "display_data" 
+ }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "be_wy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "tr_su" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "na_ak" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ba_hy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ne_dz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ba_hy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "na_ak" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lud" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mo_zu" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mo_zu" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "na_ak" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lap" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ne_dz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ba_hy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mo_zu" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ba_hy" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ne_dz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zw_oz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "tr_su" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ne_dz" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "w_lud" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Ja podejrzewam \u017ce o polowaniu nie by\u0142o mowy, po prostu znalaz\u0142 martwego szczupaka i skorzysta\u0142 z okazji! Mnie mocno zdziwi\u0142a jego si\u0142a \u017ceby taki p\u00f3\u0142 kilogramowy okaz szczupaka przesuwa\u0107 o par\u0119 metr\u00f3w i to w trzcinach! Szacuneczek. Przypomniala mi sie historia kt\u00f3r\u0105 kiedys zaslyszalem o wlascicielce pytona, ktory nagle polozyl sie wzdluz jej \u0142\u00f3\u017cka. Le\u017ca\u0142 tak wyci\u0105gniety jak struna d\u0142u\u017cszy czas jak nie\u017cywy (a by\u0142 d\u0142ugo\u015bci \u0142\u00f3\u017cka), wi\u0119c kobitka zadzonila do weterynarza co ma robi\u0107. 
Us\u0142ysza\u0142a \u017ce ma szybko zamkn\u0105\u0107 si\u0119 w \u0142azience i poczeka\u0107 na niego bo pyton j\u0105 mierzy jako potencjaln\u0105 ofiar\u0119 (czy mu si\u0119 zmie\u015bci w brzuchu...). Wierzy\u0107, nie wierzy\u0107? Kiedy\u015b nie wierzy\u0142em ale od kilku dni mam w\u0105tpliwosci... Pozdrawiam" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "labelsL = map Prelude.head items\n", + "labelsL\n", + "collectionL = map (!!1) items\n", + "items !! 1" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "348" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "collectionLNormalized = map normalize collectionL\n", + "voc' = getVocabulary collectionL\n", + "\n", + "vocLSize = Prelude.length voc'\n", + "\n", + "vocL :: Map Int Text\n", + "vocL = Map.fromList $ zip [0..] $ Set.toList voc'\n", + "\n", + "invvocL :: Map Text Int\n", + "invvocL = Map.fromList $ zip (Set.toList voc') [0..]\n", + "\n", + "vocL ! 0\n", + "invvocL ! \"chyba\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wektoryzujemy ca\u0142\u0105 kolekcj\u0119:" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.38837067474886433,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.752336051950276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0647107369924282,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2078115806331018,0.0,0.0,0.0,0.0,0.0,1.247032293786383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5947071077466928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.268683541318364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2078115806331018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.7578579175523736,0.0,0.0,0.0,0.0,0.0,0.3550342544812725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9395475940384223,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21437689194643514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2878542883066382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2745334443309775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.079613757534693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.330413902725434,0.0,1.247032293786383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.330413902725434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,2.5199979695992702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.6741486494265287,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5199979695992702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.6741486494265287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.079613757534693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.386466576974748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.856470206220483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,1.0319209141694374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,2.340142505300509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.7578579175523736,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5214691394881432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.388148398070203e-2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.9810014688665833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6096847248398047,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.575536360758419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.079613757534693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.1847155011136463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0319209141694374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,2.856470206220483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.079613757534693,0.0,0.0,0.0,0.0,0.0,0
.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.322773392263051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.079613757534693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.163323025660538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.900958761193047,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,3.079613757534693,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.340142505300509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.710068508962545,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.931816237309167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5199979695992702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0319209141694374,0.0,2.163323025660538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26121549926361765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.6741486494265287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.386466576974748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.238841272604079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.330413902725434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.163323025660538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12210269680089991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.068012845856213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.856470206220483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.856470206220483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.079613757534693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.712940412440966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.068012845856213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "lVectorized = map (vectorizeTfIdf vocLSize collectionLNormalized vocL) collectionLNormalized\n", + "lVectorized !! 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Szukamy funkcji $sigma$, kt\u00f3ra da wysok\u0105 warto\u015b\u0107 dla tekst\u00f3w dotycz\u0105cych tego samego w\u0105tku legendowego (np. $d_1$ i $d_2$ m\u00f3wi\u0105 o w\u0119\u017cu przymierzaj\u0105cym si\u0119 do zjedzenia swojej w\u0142a\u015bcicielki) i nisk\u0105 dla tekst\u00f3w z r\u00f3\u017cnych w\u0105tk\u00f3w (np. $d_1$ opowiada o w\u0119\u017cu ludojadzie, $d_2$ - ba\u0142wanku na hydrancie)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Mo\u017ce po prostu odleg\u0142o\u015b\u0107 euklidesowa, skoro to punkty w wielowymiarowej przestrzeni?" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Line 5: Eta reduce\n", + "Found:\n", + "formatNumber x = printf \"% 7.2f\" x\n", + "Why not:\n", + "formatNumber = printf \"% 7.2f\"" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + " 0.00 79.93 78.37 76.57 87.95 81.15 82.77 127.50 124.54 76.42 84.19 78.90 90.90" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import Text.Printf\n", + "import Data.List (take)\n", + "\n", + "formatNumber :: Double -> String\n", + "formatNumber x = printf \"% 7.2f\" x\n", + "\n", + "similarTo :: ([Double] -> [Double] -> Double) -> [[Double]] -> Int -> Text\n", + "similarTo simFun vs ix = pack $ Prelude.unwords $ map (formatNumber . ((vs !! ix) `simFun`)) vs\n", + "\n", + "euclDistance :: [Double] -> [Double] -> Double\n", + "euclDistance v1 v2 = sqrt $ sum $ Prelude.zipWith (\\x1 x2 -> (x1 - x2)**2) v1 v2\n", + "\n", + "limit = 13\n", + "labelsLimited = Data.List.take limit labelsL\n", + "limitedL = Data.List.take limit lVectorized\n", + "\n", + "similarTo euclDistance limitedL 0\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Line 2: Move brackets to avoid $\n", + "Found:\n", + "\"\\n\"\n", + " <>\n", + " (Data.Text.unlines\n", + " $ map (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix)\n", + " $ zip labels [0 .. (Prelude.length vs - 1)])\n", + "Why not:\n", + "\"\\n\"\n", + " <>\n", + " Data.Text.unlines\n", + " (map (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix)\n", + " $ zip labels [0 .. (Prelude.length vs - 1)])Line 2: Use zipWith\n", + "Found:\n", + "map (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix)\n", + " $ zip labels [0 .. (Prelude.length vs - 1)]\n", + "Why not:\n", + "zipWith\n", + " (curry (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix))\n", + " labels [0 .. (Prelude.length vs - 1)]Line 3: Move brackets to avoid $\n", + "Found:\n", + "\" \"\n", + " <> (Data.Text.unwords $ map (\\ l -> pack $ printf \"% 7s\" l) labels)\n", + "Why not:\n", + "\" \"\n", + " <> Data.Text.unwords (map (\\ l -> pack $ printf \"% 7s\" l) labels)Line 3: Avoid lambda\n", + "Found:\n", + "\\ l -> pack $ printf \"% 7s\" l\n", + "Why not:\n", + "pack . printf \"% 7s\"" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + " na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", + "na_ak 0.00 79.93 78.37 76.57 87.95 81.15 82.77 127.50 124.54 76.42 84.19 78.90 90.90\n", + "w_lud 79.93 0.00 38.92 34.35 56.48 44.89 47.21 109.24 104.82 35.33 49.88 39.98 60.20\n", + "ba_hy 78.37 38.92 0.00 30.37 54.23 40.93 43.83 108.15 102.91 27.37 46.95 35.81 58.99\n", + "w_lap 76.57 34.35 30.37 0.00 51.54 37.46 40.86 107.43 103.22 25.22 43.66 32.10 56.53\n", + "ne_dz 87.95 56.48 54.23 51.54 0.00 57.98 60.32 113.66 109.59 50.96 62.17 54.84 70.70\n", + "be_wy 81.15 44.89 40.93 37.46 57.98 0.00 49.55 110.37 100.50 37.77 51.54 37.09 62.92\n", + "zw_oz 82.77 47.21 43.83 40.86 60.32 49.55 0.00 111.11 107.57 41.02 54.07 45.23 64.65\n", + "mo_zu 127.50 109.24 108.15 107.43 113.66 110.37 111.11 0.00 139.57 107.38 109.91 108.20 117.07\n", + "be_wy 124.54 104.82 102.91 103.22 109.59 100.50 107.57 139.57 0.00 102.69 108.32 99.06 113.25\n", + "ba_hy 76.42 35.33 27.37 25.22 50.96 37.77 41.02 107.38 102.69 0.00 43.83 32.08 56.68\n", + "mo_zu 84.19 49.88 46.95 43.66 62.17 51.54 54.07 109.91 108.32 43.83 0.00 47.87 66.40\n", + "be_wy 78.90 39.98 35.81 32.10 54.84 37.09 45.23 108.20 99.06 32.08 47.87 0.00 59.66\n", + "w_lud 90.90 60.20 58.99 56.53 70.70 62.92 64.65 117.07 113.25 56.68 66.40 59.66 0.00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "paintMatrix :: ([Double] -> [Double] -> Double) -> [Text] -> [[Double]] -> Text\n", + "paintMatrix simFun labels vs = header <> \"\\n\" <> (Data.Text.unlines $ map (\\(lab, ix) -> lab <> \" \" <> similarTo simFun vs ix) $ zip labels [0..(Prelude.length vs - 1)])\n", + " where header = \" \" <> (Data.Text.unwords $ map (\\l -> pack $ printf \"% 7s\" l) labels)\n", + " \n", + "paintMatrix euclDistance labelsLimited limitedL" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Problem: za du\u017co zale\u017cy od d\u0142ugo\u015bci tekstu.\n", + "\n", + "Rozwi\u0105zanie: znormalizowa\u0107 wektor $v$ do wektora jednostkowego.\n", + "\n", + "$$ \\vec{1}(v) = \\frac{v}{|v|} $$\n", + "\n", + "Taki wektor ma d\u0142ugo\u015b\u0107 1!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + " na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", + "na_ak 10.00 0.67 0.66 0.66 0.67 0.67 0.67 0.67 0.67 0.67 0.66 0.67 0.67\n", + "w_lud 0.67 10.00 0.67 0.68 0.67 0.66 0.67 0.67 0.68 0.66 0.67 0.67 0.68\n", + "ba_hy 0.66 0.67 10.00 0.66 0.67 0.67 0.67 0.67 0.69 0.74 0.66 0.67 0.66\n", + "w_lap 0.66 0.68 0.66 10.00 0.66 0.66 0.66 0.66 0.67 0.66 0.66 0.66 0.66\n", + "ne_dz 0.67 0.67 0.67 0.66 10.00 0.67 0.67 0.68 0.69 0.68 0.67 0.67 0.68\n", + "be_wy 0.67 0.66 0.67 0.66 0.67 10.00 0.66 0.67 0.74 0.66 0.67 0.76 0.66\n", + "zw_oz 0.67 0.67 0.67 0.66 0.67 0.66 10.00 0.67 0.67 0.66 0.66 0.67 0.67\n", + "mo_zu 0.67 0.67 0.67 0.66 0.68 0.67 0.67 10.00 0.69 0.67 0.69 0.68 0.67\n", + "be_wy 0.67 0.68 0.69 0.67 0.69 0.74 0.67 0.69 10.00 0.68 0.67 0.75 0.67\n", + "ba_hy 0.67 0.66 0.74 0.66 0.68 0.66 0.66 0.67 0.68 10.00 0.66 0.67 0.66\n", + "mo_zu 0.66 0.67 0.66 0.66 0.67 0.67 0.66 0.69 0.67 0.66 10.00 0.67 0.67\n", + "be_wy 0.67 0.67 0.67 0.66 0.67 0.76 0.67 0.68 0.75 0.67 0.67 10.00 0.67\n", + "w_lud 0.67 0.68 0.66 0.66 0.68 0.66 0.67 0.67 0.67 0.66 0.67 0.67 10.00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "vectorNorm :: [Double] -> Double\n", + "vectorNorm vs = sqrt $ sum $ map (\\x -> x * x) vs\n", + "\n", + "toUnitVector :: [Double] -> [Double]\n", + "toUnitVector vs = map (/ n) vs\n", + " where n = vectorNorm vs\n", + "\n", + "vectorNorm (toUnitVector [3.0, 4.0])\n", + "\n", + "euclDistanceNormalized :: [Double] -> [Double] -> Double\n", + "euclDistanceNormalized v1 v2 = toUnitVector v1 `euclDistance` toUnitVector v2\n", + "\n", + "euclSim v1 v2 = 1 / (d + 0.1)\n", + " where d = euclDistanceNormalized v1 v2\n", + "\n", + "paintMatrix euclSim labelsLimited limitedL" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Podobie\u0144stwo kosinusowe\n", + "\n", + "Cz\u0119\u015bciej zamiast odleg\u0142o\u015bci euklidesowej stosuje si\u0119 podobie\u0144stwo kosinusowe, czyli kosinus k\u0105ta mi\u0119dzy wektorami.\n", + "\n", + "Wektor dokumentu ($\\vec{V}(d)$) - wektor, kt\u00f3rego sk\u0142adowe odpowiadaj\u0105 wyrazom.\n", + "\n", + "$$\\sigma(d_1,d_2) = \\cos\\theta(\\vec{V}(d_1),\\vec{V}(d_2)) = \\frac{\\vec{V}(d_1) \\cdot \\vec{V}(d_2)}{|\\vec{V}(d_1)||\\vec{V}(d_2)|} $$\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Zauwa\u017cmy, \u017ce jest to iloczyn skalarny znormalizowanych wektor\u00f3w!\n", + "\n", + "$$\\sigma(d_1,d_2) = \\vec{1}(\\vec{V}(d_1)) \\times \\vec{1}(\\vec{V}(d_2)) $$" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "(\u2715) :: [Double] -> [Double] -> Double\n", + "(\u2715) v1 v2 = sum $ Prelude.zipWith (*) v1 v2\n", + "\n", + "[2, 1, 0] \u2715 [-2, 5, 10]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", + "na_ak 1.00 0.02 0.01 0.01 0.03 0.02 0.02 0.04 0.03 0.02 0.01 0.02 0.03\n", + "w_lud 0.02 1.00 0.02 0.05 0.04 0.01 0.03 0.04 0.06 0.01 0.02 0.03 
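Mały rachunek ilustrujący oba powyższe wzory (liczby przykładowe, spoza naszej kolekcji): dla $\vec{V}(d_1) = [3, 4, 0]$ i $\vec{V}(d_2) = [4, 3, 0]$ iloczyn skalarny wynosi $12 + 12 + 0 = 24$, normy to $|\vec{V}(d_1)| = |\vec{V}(d_2)| = 5$, więc $\cos\theta = 24/25 = 0.96$. Ten sam wynik daje iloczyn skalarny wektorów znormalizowanych: $[0.6, 0.8, 0] \times [0.8, 0.6, 0] = 0.48 + 0.48 = 0.96$.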
0.06\n", + "ba_hy 0.01 0.02 1.00 0.01 0.02 0.03 0.03 0.04 0.08 0.22 0.01 0.04 0.01\n", + "w_lap 0.01 0.05 0.01 1.00 0.01 0.01 0.00 0.01 0.02 0.00 0.00 0.00 0.00\n", + "ne_dz 0.03 0.04 0.02 0.01 1.00 0.04 0.03 0.07 0.08 0.06 0.03 0.03 0.05\n", + "be_wy 0.02 0.01 0.03 0.01 0.04 1.00 0.01 0.03 0.21 0.01 0.02 0.25 0.01\n", + "zw_oz 0.02 0.03 0.03 0.00 0.03 0.01 1.00 0.04 0.03 0.00 0.01 0.02 0.02\n", + "mo_zu 0.04 0.04 0.04 0.01 0.07 0.03 0.04 1.00 0.10 0.02 0.09 0.05 0.04\n", + "be_wy 0.03 0.06 0.08 0.02 0.08 0.21 0.03 0.10 1.00 0.05 0.03 0.24 0.04\n", + "ba_hy 0.02 0.01 0.22 0.00 0.06 0.01 0.00 0.02 0.05 1.00 0.01 0.02 0.00\n", + "mo_zu 0.01 0.02 0.01 0.00 0.03 0.02 0.01 0.09 0.03 0.01 1.00 0.01 0.02\n", + "be_wy 0.02 0.03 0.04 0.00 0.03 0.25 0.02 0.05 0.24 0.02 0.01 1.00 0.02\n", + "w_lud 0.03 0.06 0.01 0.00 0.05 0.01 0.02 0.04 0.04 0.00 0.02 0.02 1.00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "cosineSim v1 v2 = toUnitVector v1 \u2715 toUnitVector v2\n", + "\n", + "paintMatrix cosineSim labelsLimited limitedL" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "na tylnym siedzeniu w autobusie siedzi matka z 7-8 letnim synkiem. naprzeciwko synka siedzi kobieta (zwr\u00f3cona twarz\u0105 do dzieciaka). synek co chwile wymachuje nogami i kopie kobiet\u0119, matka widz\u0105c to nie reaguje na to wog\u00f3le. wreszcie kobieta zwraca uwag\u0119 matce, \u017ceby ta powiedzia\u0142a co\u015b synowi a matka do niej: nie mog\u0119, bo wychowuj\u0119 syna bezstresowo!!! ...ch\u0142opak, kt\u00f3ry sta\u0142 w pobli\u017cu i widzia\u0142 i s\u0142ysza\u0142 ca\u0142e to zaj\u015bcie wyplu\u0142 z ust gum\u0119 do \u017cucia i przyklei\u0142 matce na czo\u0142o i powiedzia\u0142: ja te\u017c by\u0142em bezstresowo wychowywany... autentyczny przypadek w londy\u0144skim autobusie (a tym co przyklei\u0142 matce gum\u0119 na czo\u0142o by\u0142 chyba nawet m\u0142ody Polak)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "collectionL !! 5" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Kr\u00f3tko zwi\u0119\u017ale i na temat. Zastanawia mnie jak ludzie wychowuj\u0105 dzieci. Co prawda sam nie mam potomstwa i nie zamierzam mie\u0107 jak narazie (bo to troch\u0119 g\u0142upie mie\u0107 17-letniego tatusia), ale niestety mam przyjemno\u015b\u0107 ogl\u0105da\u0107 efekty wychowawcze niekt\u00f3rych par (dzi\u0119ki znajomym rodzic\u00f3w w r\u00f3\u017cnym wieku). S\u0105 trzy najbardziej znane mi modele wychowania. Surowe, bezstresowe (w moim znaczeniu) i \"bezstresowe\" w mowie potocznej. Zaczynam od tego pierwszego. Jak nazwa wskazuje, jest to surowe wychowanie, oparte na karach cielesnych lub torturach umys\u0142owych. Nie uwa\u017cam tego za dobre wychowanie, bo dziecko jak b\u0119dzie nieco starsze b\u0119dzie si\u0119 ba\u0142o wszystkiego, bo uzna, \u017c jak zrobi co\u015b \u017cle to spotka je kara. Wi\u0119c bicie za r\u00f3\u017cne rzeczy odpada (no chyba, \u017ce dzieciak na serio nabroi to oczywi\u015bcie). Wychowanie bezstresowe z mojego s\u0142ownika oznacza nienara\u017canie dziecka na stresy, pocieszanie w trudnych sytuacjach, za\u0142atwianie problem\u00f3w przez rozmow\u0119 oraz sta\u0142y kontakt z dzieckiem. I to chyba najlepsze. Sam zosta\u0142em tak wychowany i ciesz\u0119 si\u0119 z tego powodu. I oczywi\u015bcie \"wychowanie bezstresowe\". 
A tu si\u0119 normalnie rozpisz\u0119. Po pierwsze geneza. Wi\u0119c jak dochodzi do takiego wychowania? Odpowied\u017a. Mamusi i tatusiowi si\u0119 zachcia\u0142o bobaska bo to takie malutkie fajniutkie i ooo. Oboje zazdroszcz\u0105 innym parom bo one maj\u0105, a oni nie, wi\u0119c oni te\u017c chc\u0105. No wi\u0119c rodzi im si\u0119 bobasek, chuchaj\u0105 dmuchaj\u0105 na niego p\u00f3ki ma\u0142e. Ale przychodzi ten okres, kiedy dziecko trzeba wychowa\u0107 i kiedy ma si\u0119 na dzieciaka najwi\u0119kszy wp\u0142yw. I tu si\u0119 zaczynaj\u0105 schody. Nagle oboje nie maj\u0105 czasu i m\u00f3wi\u0105 \"Wychowamy go/j\u0105/ich (niepotrzebne skre\u015bli\u0107) bezstresowo.\" Po drugie. Decyzja o sposobie wychowania podj\u0119ta. A wi\u0119c jak to wygl\u0105da? Odpowied\u017a. Totalna olewka! Mama i tata baluj\u0105, a dzieciaka zostawiaj\u0105 samemu sobie, albo pod opiek\u0119 babci, kt\u00f3ra r\u00f3wnie\u017c leje na dziecko ciep\u0142ym moczem. Dzieciak ro\u015bnie i ro\u015bnie, nie wie co dobre a co z\u0142e. Przypomnia\u0142a mi si\u0119 pewna, podobno autentyczna scenka. Ch\u0142opak jedzie ze szwagrem autobusem czy tam tramwajem. Na jednym miejscu siedzi starowinka, a na przeciwko niej siedzi lafirynda z brzd\u0105cem na kolanach. No i sobie dzieciak macha n\u00f3\u017ckami i tu ciach i kopn\u0105\u0142 staruszk\u0119 w nog\u0119. Babcia nic sobie z tego nie zrobi\u0142a, a dzieciak nie widz\u0105c reakcji zacz\u0105\u0142 j\u0105 ju\u017c celowo kopa\u0107. Staruszka: Mo\u017ce pani powiedzie\u0107 co\u015b synkowi \u017ceby mnie nie kopa\u0142. Matka: Nie bo ja go wychowuj\u0119 bezstresowo. Szwagier wyci\u0105ga z ust gum\u0119 do \u017cucia i przykleja mamusi na czo\u0142o m\u00f3wi\u0105c: Moja mama te\u017c mnie wychowa\u0142a bezstresowo. Ciekaw jestem ile w tym prawdy by\u0142o, a je\u017celi 100% to czy mamusi si\u0119 odmieni\u0142y pogl\u0105dy. Kto go wie? Po trzecie. Doros\u0142y wychowany bezstresowo. Jaki on jest? Odpowied\u017a. Zupe\u0142nie inny. My\u015bli, \u017ce jest p\u0119pkiem \u015bwiata i \u017ce wszystko musi by\u0107 pod jego dyktando. Pracuj\u0105c w Szwajcarii przy piel\u0119gnacji winogron, syn polskiego kolegi taty zacz\u0105\u0142 rzuca\u0107 we mnie winogronami. Mia\u0142em ochot\u0119 wbi\u0107 mu no\u017cyczki (kt\u00f3rymi podcina\u0142em li\u015bcie) w oczy. A to by\u0142by ciekawy widok. Dzieciak o bia\u0142ych w\u0142osach, sk\u00f3rze i niebieskich oczach sta\u0142by sie albinosem (bo z niebieskich oczu sta\u0142yby sie czerwone jak u bia\u0142ych szczur\u00f3w i myszek). Ojciec sie co prawda na niego wydziera\u0142, \u017ceby nie przeszkadza\u0142, ale jak wida\u0107 dzieciak mia\u0142 to po prostu w dupie. Wi\u0119c skoro dziecko nie s\u0142ucha si\u0119 nawet rodzica, to jak w szkole pos\u0142ucha nauczyciela? Jak znajdzie prac\u0119, w kt\u00f3rej b\u0119dzie jaki\u015b szef (chyba, \u017ce sam sobie b\u0119dzie szefem)? W ten oto spos\u00f3b jak dowiaduj\u0119 si\u0119 o tym, \u017ce kto\u015b wychowuje dzieciaka bezstresowo, ciary przechodz\u0105 mi po plecach, a tego\u017c rodzica mam ochot\u0119 paln\u0105\u0107 mu w \u0142eb tak \u017ceby si\u0119 przekr\u0119ci\u0142 (zar\u00f3wno \u0142eb jak i pogl\u0105dy). A jak mnie wychowano? By\u0142em cz\u0119sto sam sobie zostawiany. Ale nie oznacza \u017ce to byla wspomniana olewka. Jako, \u017ce rodzice pracowali, a rodze\u0144stwo chodzi\u0142o do szko\u0142y, podrzucali mnie do babci. A wieczorami si\u0119 mn\u0105 opiekowali. 
Gadali jak mia\u0142em problemy i nie bili bo pono\u0107 by\u0142em spokojnym dzieckiem. No i tyle. Do 17 urodzin 2 dni, a szczura chyba nie dostan\u0119. A sam nie kupi\u0119!;(" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "collectionL !! 8" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Z powrotem do wyszukiwarek\n", + "\n", + "Mo\u017cemy potraktowa\u0107 zapytanie jako bardzo kr\u00f3tki dokument, dokona\u0107 jego wektoryzacji i policzy\u0107 cosinus k\u0105ta mi\u0119dzy zapytaniem a dokumentem." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ja za to znam przypadek, \u017ce kole\u017canka mieszkala w bloku par\u0119 lat temu, pewnego razu wchodzi do \u0142azienki w samej bieli\u017anie a tam ogromny w\u0105\u017c na pod\u0142odze i tak si\u0119 wystraszy\u0142a \u017ce wybieg\u0142a z wrzaskiem z mieszkania i wylecia\u0142a przed blok w samej bieli\u017anie i uciek\u0142a do babci swojej, kt\u00f3ra mieszkala gdzie\u015b niedaleko. a potem si\u0119 okaza\u0142o, \u017ce jej s\u0105siad z do\u0142u hodowa\u0142 sobie w\u0119\u017ca i tak w\u0142a\u015bnie swobodnie go \"pasa\u0142\" po mieszkaniu i w\u0105\u017c mu spierdzieli\u0142 przez rur\u0119 w \u0142azience :cool :" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Pewna dziewczyna, wieku mi nieznanego, w mie\u015bcie sto\u0142ecznym - rozwiod\u0142a si\u0119. By\u0142a sama i samotna, wi\u0119c zapragn\u0119\u0142a kupi\u0107 sobie zwierz\u0119, aby sw\u0105 mi\u0142\u0105 obecno\u015bci\u0105 rozja\u015bnia\u0142o jej puste wieczory i takie\u017c poranki. Dziewczyna by\u0142a najwyra\u017aniej ekscentryczk\u0105, bo zamiast rozkosznego, mi\u0119kkiego kociaka z czerwonym k\u0142\u0119buszkiem we\u0142enki lub kud\u0142atego pieska , co sika na parkiet i gryzie skarpetki - kupi\u0142a sobie ... w\u0119\u017ca. W\u0105\u017c zamieszka\u0142 z dziewczyn\u0105, i dobrze im by\u0142o. Gad jad\u0142, spa\u0142 i r\u00f3s\u0142, a po pierwszym okresie oboj\u0119tno\u015bci ( zw\u0142aszcza ze strony w\u0119\u017ca ) nawi\u0105za\u0142a si\u0119 mi\u0119dzy nimi ni\u0107 porozumienia. Przynajmniej dziewczyna odczuwa\u0142a t\u0119 ni\u0107 wyra\u017anie, gdy\u017c w\u0105\u017c reagowa\u0142 na jej obecno\u015b\u0107, a noc\u0105 spa\u0142 zwini\u0119ty w k\u0142\u0119bek w nogach jej \u0142\u00f3\u017cka. Po dw\u00f3ch latach wsp\u00f3lnego bytowania, nie przerywanych \u017cadnym znacz\u0105cym wydarzeniem w ich wzajemnych relacjach, dziewczyna zauwa\u017cy\u0142a, \u017ce w\u0105\u017c sta\u0142 si\u0119 osowia\u0142y. Przesta\u0142 je\u015b\u0107, chowa\u0142 si\u0119 po k\u0105tach, a nocami, zamiast w nogach \u0142\u00f3\u017cka - sypia\u0142 wyci\u0105gni\u0119ty wzd\u0142u\u017c jej boku. Martwi\u0142a si\u0119 o swojego gada i posz\u0142a z nim do weterynarza. Weterynarz zbada\u0142 go, zapisa\u0142 leki na popraw\u0119 apetytu ( ciekawe, jak si\u0119 bada w\u0119\u017ca ? ) i odes\u0142a\u0142 do domu. Zdrowie \u015bliskiego pacjenta nie poprawi\u0142o si\u0119, wi\u0119c troskliwa dziewczyna postanowi\u0142a zasi\u0119gn\u0105\u0107 porady u znawcy gad\u00f3w i gadzich obyczaj\u00f3w. Znawca wys\u0142ucha\u0142 opisu niepokoj\u0105cych objaw\u00f3w, i powiedzia\u0142 : - Prosz\u0119 pani. Ten w\u0105\u017c nie jest chory. On teraz po\u015bci. 
A le\u017cy wzd\u0142u\u017c pani noc\u0105, bo sprawdza, czy pani si\u0119 zmie\u015bci. To prawdziwa historia. Opowiedzia\u0142a nam j\u0105 dzi\u015b klientka. Le\u017c\u0119 na \u0142\u00f3\u017cku, pisze tego posta, i patrz\u0119 na drzemi\u0105c\u0105 obok mnie kotk\u0119. Troch\u0119 ma\u0142a jest. Raczej nie ma szans, \u017cebym sie zmie\u015bci\u0142a, jakby co.." + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Anakonda. Czy to kolejna miejska legenda? Jaki\u015b czas temu kole\u017canka na jednej z imprez towarzyskich opowiedzia\u0142a mro\u017c\u0105c\u0105 krew w \u017cy\u0142ach histori\u0119 o dziewczynie ze swojej pracy, kt\u00f3ra w Warszawie na dyskotece w Dekadzie pozna\u0142a ch\u0142opaka. Spotyka\u0142a si\u0119 z nim na kaw\u0119 i po drugiej randce dosz\u0142o do poca\u0142unk\u00f3w. Um\u00f3wi\u0142a si\u0119 na trzeci\u0105 randk\u0119, ale zanim do niej dosz\u0142o wyskoczy\u0142 jej jaki\u015b pryszcz na twarzy. Posz\u0142a do lekarza, a ten... zawiadomi\u0142 policj\u0119, prokuratur\u0119 itd. , bo rozpozna\u0142 zara\u017cenie... jadem trupim! Rozpocz\u0119to przes\u0142uchanie dziewczyny i po wyja\u015bnieniach trafiono do ch\u0142opaka, z kt\u00f3rym si\u0119 ca\u0142owa\u0142a. W jego domu odkryto rozk\u0142adaj\u0105ce si\u0119 zw\u0142oki dw\u00f3ch dziewczyn. By\u0142am ta histori\u0105 wstrz\u0105\u015bni\u0119ta. Nast\u0119pnego dnia opowiedzia\u0142am j\u0105 w pracy, a kole\u017canka Justyna przyzna\u0142a, \u017ce ju\u017c o tym slysza\u0142a. To mnie utwierdzi\u0142o, \u017ce historia jest prawdziwa, ale... tylko do wieczora. Co\u015b mi nie dawa\u0142o spokoju. Uwaga TVN nic? Interwencja Polsatu - nic? Nasz rodzimy Telekurier nic? Zacz\u0119\u0142am sprawdza\u0107 w internecie co to jest jad trupi, opryszczka od zaka\u017cenia tym\u017ce jadem i tak... trafi\u0142am na miejsk\u0105 legend\u0119. Historia wydarzy\u0142a si\u0119 nie tylko w Warszawie, ale i w Olsztynie, Toruniu, Wroc\u0142awiu i Krakowie, a by\u0107 mo\u017ce w og\u00f3le za granic\u0105. Cho\u0107 prawdopodobne jest, \u017ce nie wydarzy\u0142a si\u0119 nigdy. G\u0142o\u015bno o niej by\u0142o na miejskch forach. Za ka\u017cdym razem ofiara by\u0142a czyj\u0105\u015b znajom\u0105. Po przeczytaniu kolejnej wersji historii zadzwoni\u0142am do kole\u017canki, kt\u00f3ra opowiedzia\u0142a mi t\u0119 histori\u0119 i skl\u0119\u0142am czym \u015bwiat stoi. Dlatego kiedy kilka dni temu inna kole\u017canka opowiedzia\u0142a kolejn\u0105 mro\u017c\u0105c\u0105 krew w \u017cy\u0142ach histori\u0119 - tym razem o anakondzie - rozpocz\u0119\u0142am poszukiwania w internecie czy to nie jest nast\u0119pna miejska legenda. Nic nie znalaz\u0142am. Jednak co\u015b mi nie pasuje, cho\u0107 ta historia mo\u017ce brzmie\u0107 wielce prawdopodobnie. Zw\u0142aszcza, gdy kto\u015b ogl\u0105da\u0142 g\u0142upawy film z J. Lo. Zainteresowa\u0142o mnie to, bo siedz\u0105c nad powie\u015bci\u0105 \"Dzika\" poczyta\u0142am troch\u0119 o w\u0119\u017cach. A o jak\u0105 histori\u0119 mi chodzi? Pewna kobieta (podobno s\u0105siadka tej mojej kole\u017canki z pracy, kt\u00f3ra histori\u0119 opowiada\u0142a) hodowa\u0142a w domu w\u0119\u017ca - anakond\u0119. Hodowa\u0142a j\u0105 pi\u0119\u0107 lat i nie trzyma\u0142a w terrarium. Anakonda chodzi\u0142a (pe\u0142za\u0142a) samopas po domu i co kilka dni dostawa\u0142a chomika, szczura, mysz lub kr\u00f3lika do zjedzenia. 
Pewnego dnia przesta\u0142a je\u015b\u0107 i zacz\u0119\u0142a si\u0119 dziwnie zachowywa\u0107. Ka\u017cdego ranka po przebudzeniu w\u0142a\u015bcicielka znajdowa\u0142a j\u0105 w swoim \u0142\u00f3\u017cku wyprostowan\u0105 jak struna. Po dw\u00f3ch tygodniach takich zachowa\u0144 ze strony anakondy w\u0142a\u015bcicielka zaniepokojona stanem zdrowia ukochanego w\u0119\u017ca posz\u0142a z nim do lekarza. Ten wys\u0142ucha\u0142 objaw\u00f3w \"choroby\" i powiedzia\u0142, \u017ce anakonda g\u0142odzi\u0142a si\u0119, by zje\u015b\u0107... w\u0142ascicielk\u0119. K\u0142adzenie si\u0119 ko\u0142o niej by\u0142o mierzeniem ile jeszcze g\u0142odzi\u0107 si\u0119 trzeba, by w\u0142a\u015bcicielka zmie\u015bci\u0142a si\u0119 w pysku no i badaniem od kt\u00f3rej strony trzeba j\u0105 zaatakowa\u0107. W\u0119\u017cowi chodzi\u0142o bowiem o to, by smakowity i du\u017cy obiad si\u0119 za bardzo nie broni\u0142. Ja domy\u015bli\u0142am si\u0119 od razu do czego zmierza ta historia (lektura artyku\u0142\u00f3w o w\u0119\u017cach zrobi\u0142a swoje), ale dla reszty, kt\u00f3rzy s\u0142uchali by\u0142o to szokiem. Mnie szokuje co innego. Po co trzyma\u0107 w\u0119\u017ca skoro nie ma z nim cz\u0142owiek \u017cadnego kontaktu? To nie pies, kot czy inny ssak. To nie ptak. W\u0105\u017c to w\u0105\u017c! Nie przyjdzie na zawo\u0142anie. Jaby kto\u015b nie wiedzia\u0142 to... W\u0119\u017ce s\u0105 mi\u0119so\u017cerne. Po\u0142ykaj\u0105 ofiary w ca\u0142o\u015bci, mimo \u017ce cz\u0119sto wielokrotnie s\u0105 one wi\u0119ksze od samego w\u0119\u017ca. Po\u0142ykanie polega na nasuwaniu si\u0119 w\u0119\u017ca na swoj\u0105 ofiar\u0119. A anakonda... \u017cyje zwykle w wodzie i na drzewach, \u017cywi\u0105c si\u0119 ssakami (m.in. tapiry, dziki, kapibary, jelenie!, gryzonie, niekiedy nawet jaguary), gadami (kajmany), rybami i ptakami, poluj\u0105c zazwyczaj w nocy. Jest w stanie po\u0142kn\u0105\u0107 ofiar\u0119 znacznie szersz\u0105 od swojego cia\u0142a, co jest mo\u017cliwe dzi\u0119ki rozci\u0105gni\u0119ciu szcz\u0119k. Trawienie jest bardzo powolne - po posi\u0142ku w\u0105\u017c trawi wi\u0119ksz\u0105 ofiar\u0119 przez wiele dni, a potem mo\u017ce po\u015bci\u0107 przez szereg tygodni lub miesi\u0119cy. Zanotowany rekord postu, w przypadku anakondy znajduj\u0105cej si\u0119 w niewoli, wynosi 2 lata. Z historii wynika, \u017ce gdyby nie interwencja u weterynarza mog\u0142aby rodzina przez kilka lat szuka\u0107 w\u0142a\u015bcicielki anakondy. My\u015bleliby, \u017ce jest na wycieczce a ona w brzuszku w postaci obiadku. Jest tylko jedno ale. Nigdzie nie znalaz\u0142am jednak \u015bladu, ani nawet wzmianki o tym, \u017ce anakonda zjad\u0142a cz\u0142owieka. I dlatego ci\u0105gle w sumie mam w\u0105tpliwo\u015bci. ps. Dalszy los anakondy \"s\u0105siadki\" kole\u017canki nie jest mi znany." + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import Data.Ord\n", + "import Data.List\n", + "\n", + "legendVectorizer = vectorizeTfIdf vocLSize collectionLNormalized vocL . normalize\n", + "\n", + "\n", + "query vs vzer q = map ((collectionL !!) . snd) $ Data.List.take 3 $ sortBy (\\a b -> fst b `compare` fst a) $ zip (map (`cosineSim` qvec) vs) [0..] 
\n", + " where qvec = vzer q \n", + "\n", + "query lVectorized legendVectorizer \"w\u0105\u017c przymierza si\u0119 do zjedzenia w\u0142a\u015bcicielki\"\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Haskell", + "language": "haskell", + "name": "haskell" + }, + "language_info": { + "codemirror_mode": "ihaskell", + "file_extension": ".hs", + "mimetype": "text/x-haskell", + "name": "haskell", + "pygments_lexer": "Haskell", + "version": "8.10.4" + }, + "author": "Filip Grali\u0144ski", + "email": "filipg@amu.edu.pl", + "lang": "pl", + "subtitle": "3.Wyszukiwarki \u2014 TF-IDF[wyk\u0142ad]", + "title": "Ekstrakcja informacji", + "year": "2021" + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/wyk/05_Geste_wektory.ipynb b/wyk/05_Geste_wektory.ipynb index 3176042..085ea81 100644 --- a/wyk/05_Geste_wektory.ipynb +++ b/wyk/05_Geste_wektory.ipynb @@ -1,1623 +1,1645 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Zagęszczamy wektory\n", - "\n", - "Podstawowy problem z wektorową reprezentacją typu tf-idf polega na tym, że wektory dokumentów (i macierz całej kolekcji dokumentów) są _rzadkie_, tzn. zawierają dużo zer. W praktyce potrzebujemy bardziej \"gęstej\" czy \"kompaktowej\" reprezentacji numerycznej dokumentów. \n", - "\n", - "## _Hashing trick_\n", - "\n", - "Powierzchownie problem możemy rozwiązać przez użycie tzw. _sztuczki z haszowaniem_ (_hashing trick_). Będziemy potrzebować funkcji mieszającej (haszującej) $H$, która rzutuje napisy na liczby, których reprezentacja binarna składa się z $b$ bitów:\n", - "\n", - "$$H : \\Sigma^{*} \\rightarrow \\{0,\\dots,2^b-1\\}$$\n", - "\n", - "($\\Sigma^{*}$ to zbiór wszystkich napisów.)\n", - "\n", - "**Pytanie:** Czy funkcja $H$ może być różnowartościowa?\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Jako funkcji $H$ możemy np. użyć funkcji MurmurHash2 lub 3." - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Hash64 0x4a80abc136f926e7" - ] - }, - "metadata": {}, - "output_type": "display_data" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n",
+        "Ekstrakcja informacji\n",
+        "5. G\u0119ste reprezentacje wektorowe [wyk\u0142ad]\n",
+        "Filip Grali\u0144ski (2021)\n",
+        "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Zag\u0119szczamy wektory\n", + "\n", + "Podstawowy problem z wektorow\u0105 reprezentacj\u0105 typu tf-idf polega na tym, \u017ce wektory dokument\u00f3w (i macierz ca\u0142ej kolekcji dokument\u00f3w) s\u0105 _rzadkie_, tzn. zawieraj\u0105 du\u017co zer. W praktyce potrzebujemy bardziej \"g\u0119stej\" czy \"kompaktowej\" reprezentacji numerycznej dokument\u00f3w. \n", + "\n", + "## _Hashing trick_\n", + "\n", + "Powierzchownie problem mo\u017cemy rozwi\u0105za\u0107 przez u\u017cycie tzw. _sztuczki z haszowaniem_ (_hashing trick_). B\u0119dziemy potrzebowa\u0107 funkcji mieszaj\u0105cej (haszuj\u0105cej) $H$, kt\u00f3ra rzutuje napisy na liczby, kt\u00f3rych reprezentacja binarna sk\u0142ada si\u0119 z $b$ bit\u00f3w:\n", + "\n", + "$$H : \\Sigma^{*} \\rightarrow \\{0,\\dots,2^b-1\\}$$\n", + "\n", + "($\\Sigma^{*}$ to zbi\u00f3r wszystkich napis\u00f3w.)\n", + "\n", + "**Pytanie:** Czy funkcja $H$ mo\u017ce by\u0107 r\u00f3\u017cnowarto\u015bciowa?\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Jako funkcji $H$ mo\u017cemy np. u\u017cy\u0107 funkcji MurmurHash2 lub 3." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Hash64 0x4a80abc136f926e7" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Hash64 0x6c3a641663470e2c" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Hash64 0x6c3a641663470e2c" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Hash64 0xa714568917576314" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Hash64 0x875d9e7e413747c8" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Hash64 0x13ce831936ebc69e" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Hash64 0xb04ce6229407c882" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "Hash64 0x6ecd7bae29ae0450" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import Data.Digest.Murmur64\n", + "\n", + "hash64 \"Komputer\"\n", + "hash64 \"komputer\"\n", + "hash64 \"komputer\"\n", + "hash64 \"komputerze\"\n", + "hash64 \"komputerek\"\n", + "hash64 \"abrakadabra\"\n", + "hash64 \"\"\n", + "hash64 \" \"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Pytanie:** podobne napisy maj\u0105 zupe\u0142nie r\u00f3\u017cne warto\u015bci funkcji haszuj\u0105cej, czy to dobrze, czy to \u017ale?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Musimy tylko sparametryzowa\u0107 nasz\u0105 funkcj\u0119 rozmiarem \"odcisku\" (parametr $b$)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3628" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "25364" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "2877" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "50846" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "12" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "{-# LANGUAGE OverloadedStrings #-}\n", + "\n", + "import Data.Text\n", + "\n", + "-- pomocnicza funkcja, kt\u00f3ra konwertuje warto\u015b\u0107 specjalnego\n", + "-- typu Hash64 do zwyk\u0142ej liczby ca\u0142kowitej\n", + "hashValueAsInteger :: Hash64 -> Integer\n", + "hashValueAsInteger = toInteger . asWord64\n", + "\n", + "-- unpack to funkcja, kt\u00f3ra warto\u015b\u0107 typu String konwertuje do Text\n", + "hash :: Integer -> Text -> Integer\n", + "hash b t = hashValueAsInteger (hash64 $ unpack t) `mod` (2 ^ b)\n", + "\n", + "hash 16 \"komputer\"\n", + "hash 16 \"komputerze\"\n", + "hash 16 \"komputerem\"\n", + "hash 16 \"abrakadabra\"\n", + "hash 4 \"komputer\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Pytanie:** Jakie warto\u015bci $b$ b\u0119d\u0105 bezsensowne?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sztuczka z haszowaniem polega na tym, \u017ce zamiast numerowa\u0107 s\u0142owa korzystaj\u0105c ze s\u0142ownika, po prostu u\u017cywamy funkcji haszuj\u0105cej. W ten spos\u00f3b wektor b\u0119dzie _zawsze_ rozmiar $2^b$ - bez wzgl\u0119du na rozmiar s\u0142ownika." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Zacznijmy od przywo\u0142ania wszystkich potrzebnych definicji." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "{-# LANGUAGE OverloadedStrings #-}\n", + "{-# LANGUAGE QuasiQuotes #-}\n", + "\n", + "import Data.Text hiding(map, filter, zip)\n", + "import Text.Regex.PCRE.Heavy\n", + "\n", + "isStopWord :: Text -> Bool\n", + "isStopWord \"w\" = True\n", + "isStopWord \"jest\" = True\n", + "isStopWord \"\u017ce\" = True\n", + "isStopWord w = w \u2248 [re|^\\p{P}+$|]\n", + "\n", + "\n", + "removeStopWords :: [Text] -> [Text]\n", + "removeStopWords = filter (not . isStopWord)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "{-# LANGUAGE OverloadedStrings #-}\n", + "{-# LANGUAGE QuasiQuotes #-}\n", + "{-# LANGUAGE FlexibleContexts #-}\n", + "\n", + "import Data.Text hiding(map, filter, zip)\n", + "import Prelude hiding(words, take)\n", + "import Text.Regex.PCRE.Heavy\n", + "import Data.Map as Map hiding(take, map, filter)\n", + "import Data.Set as Set hiding(map)\n", + "\n", + "tokenize :: Text -> [Text]\n", + "tokenize = map fst . 
scan [re|C\\+\\+|[\\p{L}0-9]+|\\p{P}|]\n", + "\n", + "\n", + "mockInflectionDictionary :: Map Text Text\n", + "mockInflectionDictionary = Map.fromList [\n", + " (\"kota\", \"kot\"),\n", + " (\"butach\", \"but\"),\n", + " (\"masz\", \"mie\u0107\"),\n", + " (\"ma\", \"mie\u0107\"),\n", + " (\"buta\", \"but\"),\n", + " (\"zgubi\u0142em\", \"zgubi\u0107\")]\n", + "\n", + "lemmatizeWord :: Map Text Text -> Text -> Text\n", + "lemmatizeWord dict w = findWithDefault w w dict\n", + "\n", + "lemmatize :: Map Text Text -> [Text] -> [Text]\n", + "lemmatize dict = map (lemmatizeWord dict)\n", + "\n", + "\n", + "poorMansStemming = Data.Text.take 6\n", + "\n", + "normalize :: Text -> [Text]\n", + "normalize = map poorMansStemming . removeStopWords . map toLower . lemmatize mockInflectionDictionary . tokenize\n", + "\n", + "getVocabulary :: [Text] -> Set Text \n", + "getVocabulary = Set.unions . map (Set.fromList . normalize) \n", + " \n", + "idf :: [[Text]] -> Text -> Double\n", + "idf coll t = log (fromIntegral n / fromIntegral df)\n", + " where df = Prelude.length $ Prelude.filter (\\d -> t `elem` d) coll\n", + " n = Prelude.length coll\n", + " \n", + "vectorizeTfIdf :: Int -> [[Text]] -> Map Int Text -> [Text] -> [Double]\n", + "vectorizeTfIdf vecSize coll v doc = map (\\i -> count (v ! i) doc * idf coll (v ! i)) [0..(vecSize-1)]\n", + " where count t doc = fromIntegral $ (Prelude.length . Prelude.filter (== t)) doc " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import System.IO\n", + "import Data.List.Split as SP\n", + "\n", + "legendsh <- openFile \"legendy.txt\" ReadMode\n", + "hSetEncoding legendsh utf8\n", + "contents <- hGetContents legendsh\n", + "ls = Prelude.lines contents\n", + "items = map (map pack . SP.splitOn \"\\t\") ls\n", + "\n", + "labelsL = map Prelude.head items\n", + "collectionL = map (!!1) items\n", + "\n", + "collectionLNormalized = map normalize collectionL\n", + "voc' = getVocabulary collectionL\n", + "\n", + "vocLSize = Prelude.length voc'\n", + "\n", + "vocL :: Map Int Text\n", + "vocL = Map.fromList $ zip [0..] $ Set.toList voc'\n", + "\n", + "invvocL :: Map Text Int\n", + "invvocL = Map.fromList $ zip (Set.toList voc') [0..]\n", + "\n", + "lVectorized = map (vectorizeTfIdf vocLSize collectionLNormalized vocL) collectionLNormalized\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + "Line 5: Eta reduce\n", + "Found:\n", + "formatNumber x = printf \"% 7.2f\" x\n", + "Why not:\n", + "formatNumber = printf \"% 7.2f\"Line 11: Use zipWith\n", + "Found:\n", + "map (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix)\n", + " $ zip labels [0 .. (Prelude.length vs - 1)]\n", + "Why not:\n", + "zipWith\n", + " (curry (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix))\n", + " labels [0 .. (Prelude.length vs - 1)]Line 12: Avoid lambda\n", + "Found:\n", + "\\ l -> pack $ printf \"% 7s\" l\n", + "Why not:\n", + "pack . printf \"% 7s\"" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import Text.Printf\n", + "import Data.List (take)\n", + "\n", + "formatNumber :: Double -> String\n", + "formatNumber x = printf \"% 7.2f\" x\n", + "\n", + "similarTo :: ([Double] -> [Double] -> Double) -> [[Double]] -> Int -> Text\n", + "similarTo simFun vs ix = pack $ Prelude.unwords $ map (formatNumber . ((vs !! ix) `simFun`)) vs\n", + "\n", + "paintMatrix :: ([Double] -> [Double] -> Double) -> [Text] -> [[Double]] -> Text\n", + "paintMatrix simFun labels vs = header <> \"\\n\" <> Data.Text.unlines (map (\\(lab, ix) -> lab <> \" \" <> similarTo simFun vs ix) $ zip labels [0..(Prelude.length vs - 1)])\n", + " where header = \" \" <> Data.Text.unwords (map (\\l -> pack $ printf \"% 7s\" l) labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", + "na_ak 1.00 0.02 0.01 0.01 0.03 0.02 0.02 0.04 0.03 0.02 0.01 0.02 0.03\n", + "w_lud 0.02 1.00 0.02 0.05 0.04 0.01 0.03 0.04 0.06 0.01 0.02 0.03 0.06\n", + "ba_hy 0.01 0.02 1.00 0.01 0.02 0.03 0.03 0.04 0.08 0.22 0.01 0.04 0.01\n", + "w_lap 0.01 0.05 0.01 1.00 0.01 0.01 0.00 0.01 0.02 0.00 0.00 0.00 0.00\n", + "ne_dz 0.03 0.04 0.02 0.01 1.00 0.04 0.03 0.07 0.08 0.06 0.03 0.03 0.05\n", + "be_wy 0.02 0.01 0.03 0.01 0.04 1.00 0.01 0.03 0.21 0.01 0.02 0.25 0.01\n", + "zw_oz 0.02 0.03 0.03 0.00 0.03 0.01 1.00 0.04 0.03 0.00 0.01 0.02 0.02\n", + "mo_zu 0.04 0.04 0.04 0.01 0.07 0.03 0.04 1.00 0.10 0.02 0.09 0.05 0.04\n", + "be_wy 0.03 0.06 0.08 0.02 0.08 0.21 0.03 0.10 1.00 0.05 0.03 0.24 0.04\n", + "ba_hy 0.02 0.01 0.22 0.00 0.06 0.01 0.00 0.02 0.05 1.00 0.01 0.02 0.00\n", + "mo_zu 0.01 0.02 0.01 0.00 0.03 0.02 0.01 0.09 0.03 0.01 1.00 0.01 0.02\n", + "be_wy 0.02 0.03 0.04 0.00 0.03 0.25 0.02 0.05 0.24 0.02 0.01 1.00 0.02\n", + "w_lud 0.03 0.06 0.01 0.00 0.05 0.01 0.02 0.04 0.04 0.00 0.02 0.02 1.00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "limit = 13\n", + "labelsLimited = Data.List.take limit labelsL\n", + "limitedL = Data.List.take limit lVectorized\n", + "\n", + "vectorNorm :: [Double] -> Double\n", + "vectorNorm vs = sqrt $ sum $ map (\\x -> x * x) vs\n", + "\n", + "toUnitVector :: [Double] -> [Double]\n", + "toUnitVector vs = map (/ n) vs\n", + " where n = vectorNorm vs\n", + "\n", + "\n", + "(\u2715) :: [Double] -> [Double] -> Double\n", + "(\u2715) v1 v2 = sum $ Prelude.zipWith (*) v1 v2\n", + "\n", + "cosineSim v1 v2 = toUnitVector v1 \u2715 toUnitVector v2\n", + "\n", + "paintMatrix cosineSim labelsLimited limitedL" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Powy\u017csza macierz reprezentuje por\u00f3wnanie przy u\u017cyciu podobie\u0144stwa kosinusowego. 
Spr\u00f3bujmy teraz u\u017cy\u0107 g\u0119stszych wektor\u00f3w przy u\u017cyciu hashing trick. Jako warto\u015b\u0107 $b$ przyjmijmy 6.\n", + "\n", + "Zobaczmy najpierw, w kt\u00f3re \"przegr\u00f3dki\" b\u0119d\u0105 wpada\u0142y poszczeg\u00f3lne wyrazy s\u0142ownika.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(\"0\",32),(\"00\",4),(\"01\",4),(\"07\",40),(\"09\",44),(\"1\",1),(\"10\",61),(\"100\",27),(\"12\",58),(\"13\",51),(\"131\",37),(\"15\",30),(\"16\",21),(\"17\",58),(\"18\",55),(\"19\",35),(\"1997r\",61),(\"2\",62),(\"20\",28),(\"2006\",44),(\"2008\",19),(\"2009\",4),(\"2010\",3),(\"22\",27),(\"23\",34),(\"24\",7),(\"25\",29),(\"26\",35),(\"27\",44),(\"28\",61),(\"29\",30),(\"3\",56),(\"30\",55),(\"300\",38),(\"31\",45),(\"4\",53),(\"40\",39),(\"42\",43),(\"48\",53),(\"49\",13),(\"5\",31),(\"50\",32),(\"56\",38),(\"57\",55),(\"6\",59),(\"7\",27),(\"8\",34),(\"a\",27),(\"aaa\",33),(\"absolu\",11),(\"absurd\",18),(\"aby\",12),(\"adnym\",10),(\"adres\",15),(\"adrese\",62),(\"afroam\",3),(\"afryce\",46),(\"agresy\",57),(\"ah\",37),(\"aha\",42),(\"aig\",56),(\"akadem\",18),(\"akcja\",0),(\"akcje\",21),(\"akompa\",13),(\"aktor\",26),(\"akurat\",7),(\"albino\",27),(\"albo\",44),(\"ale\",7),(\"alfa\",58),(\"alkoho\",56),(\"altern\",38),(\"ameryk\",11),(\"amp\",62),(\"anakon\",34),(\"analiz\",62),(\"andrze\",63),(\"anegdo\",43),(\"ang\",37),(\"anga\\380o\",27),(\"anglii\",33),(\"ani\",22),(\"anonsu\",36),(\"antono\",3),(\"antykr\",41),(\"apetyt\",16),(\"apolit\",39),(\"apropo\",54),(\"apteki\",20),(\"aqua\",59),(\"archit\",61),(\"aromat\",44),(\"artyku\",31),(\"asami\",22),(\"astron\",59),(\"asy\\347ci\",60),(\"atmosf\",37),(\"audycj\",50),(\"auta\",38)]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "map (\\t -> (t, hash 6 t)) $ Data.List.take 100 $ Set.toList voc'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Pytanie:** Czy jakie\u015b dwa termy wpad\u0142y do jednej przegr\u00f3dki?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Stw\u00f3rzmy najpierw funkcj\u0119, kt\u00f3ra b\u0119dzie wektoryzowa\u0142a pojedynczy term $t$. 
Po prostu stworzymy wektor, kt\u00f3re b\u0119dzie mia\u0142 rozmiar $2^b$, wsz\u0119dzie b\u0119dzie mia\u0142 0 z wyj\u0105tkiem pozycji o numerze $H_b(t)$ - tam wpiszmy odwrotn\u0105 cz\u0119sto\u015b\u0107 dokumentow\u0105.\n", + "\n", + "$$\\vec{t} = [0,\\dots,\\idf_c t,\\dots,0]$$\n", + "\n", + "Teraz dla dokumentu $d = (t_1,\\dots,t_n)$ i dla schematu wa\u017cenia tf-idf:\n", + "\n", + "$$\\vec{d} = \\sum \\vec{t_i}$$" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.268683541318364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "wordVector :: Integer -> [[Text]] -> Text -> [Double]\n", + "wordVector b coll term = map selector [0..vecSize]\n", + " where vecSize = 2^b - 1\n", + " wordFingerprint = hash b term\n", + " selector i \n", + " | i == wordFingerprint = idf coll term\n", + " | otherwise = 0.0\n", + "\n", + "wordVector 6 collectionLNormalized \"aromat\"\n", + "wordVector 6 collectionLNormalized \"albo\"\n", + "wordVector 6 collectionLNormalized \"akcja\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Teraz wystarczy zsumowa\u0107 wektory dla poszczeg\u00f3lnych s\u0142\u00f3w, \u017ceby otrzyma\u0107 wektor dokumentu. Najpierw zdefiniujmy sobie sum\u0119 wektorow\u0105." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[1.2,4.0,3.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "(+++) :: [Double] -> [Double] -> [Double]\n", + "(+++) = Prelude.zipWith (+)\n", + "\n", + "[0.2, 0.5, 1.0] +++ [1.0, 3.5, 2.0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Przydatna b\u0119dzie jeszcze funkcja, kt\u00f3ra tworzy wektor z samymi zerami o zadanej d\u0142ugo\u015bci:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "zero :: Int -> [Double]\n", + "zero s = Prelude.replicate s 0.0\n", + "\n", + "zero (2^6)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Eta reduce
Found:
vectorizeWithHashingTrick b coll doc\n", + " = Prelude.foldr ((+++) . wordVector b coll) (zero $ 2 ^ b) doc
Why Not:
vectorizeWithHashingTrick b coll\n", + " = Prelude.foldr ((+++) . wordVector b coll) (zero $ 2 ^ b)
" + ], + "text/plain": [ + "Line 3: Eta reduce\n", + "Found:\n", + "vectorizeWithHashingTrick b coll doc\n", + " = Prelude.foldr ((+++) . wordVector b coll) (zero $ 2 ^ b) doc\n", + "Why not:\n", + "vectorizeWithHashingTrick b coll\n", + " = Prelude.foldr ((+++) . wordVector b coll) (zero $ 2 ^ b)" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[5.242936783195232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.856470206220483,0.0,0.0,1.1700712526502546,0.5947071077466928,0.0,5.712940412440966,3.0708470981669183,0.0,0.0,4.465908118654584,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,4.788681510917635,0.0,3.7727609380946383,0.0,1.575536360758419,0.0,3.079613757534693,0.0,4.465908118654584,0.0,4.588010815455483,4.465908118654584,0.0,1.5214691394881432,0.0,0.0,0.0,0.0,4.465908118654584,2.5199979695992702,0.0,1.5214691394881432,8.388148398070203e-2,0.0,4.465908118654584,0.0,0.0,3.367295829986474,0.0,3.7727609380946383,0.0,1.5214691394881432,0.0,3.7727609380946383,0.0,0.0,0.0,3.367295829986474,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.734591659972947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.734591659972947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.003275201291313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.931816237309167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\n", + "vectorizeWithHashingTrick :: Integer -> [[Text]] -> [Text] -> [Double]\n", + "vectorizeWithHashingTrick b coll doc = Prelude.foldr ((+++) . wordVector b coll) (zero $ 2^b) doc\n", + "\n", + "vectorizeWithHashingTrick 6 collectionLNormalized $ collectionLNormalized !! 3\n", + "vectorizeWithHashingTrick 6 collectionLNormalized [\"aromat\", \"albo\", \"akcja\"]\n", + "vectorizeWithHashingTrick 6 collectionLNormalized [\"akcja\", \"aromat\", \"albo\"]\n", + "vectorizeWithHashingTrick 6 collectionLNormalized [\"akcja\", \"aromat\", \"albo\", \"albo\"]\n", + "vectorizeWithHashingTrick 6 collectionLNormalized [\"akcja\", \"aromat\", \"09\"]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Zobaczmy, jak zag\u0119szczenie wp\u0142ywa na macierz podobie\u0144stwa." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "      na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", + "na_ak 1.00 0.37 0.21 0.28 0.35 0.22 0.32 0.45 0.47 0.21 0.25 0.20 0.39\n", + "w_lud 0.37 1.00 0.28 0.18 0.38 0.15 0.20 0.35 0.36 0.14 0.17 0.19 0.33\n", + "ba_hy 0.21 0.28 1.00 0.08 0.20 0.18 0.24 0.29 0.30 0.27 0.17 0.15 0.24\n", + "w_lap 0.28 0.18 0.08 1.00 0.10 0.11 0.11 0.30 0.17 0.06 0.07 0.13 0.21\n", + "ne_dz 0.35 0.38 0.20 0.10 1.00 0.32 0.30 0.52 0.44 0.27 0.36 0.26 0.41\n", + "be_wy 0.22 0.15 0.18 0.11 0.32 1.00 0.26 0.26 0.39 0.15 0.23 0.43 0.22\n", + "zw_oz 0.32 0.20 0.24 0.11 0.30 0.26 1.00 0.38 0.36 0.06 0.18 0.20 0.29\n", + "mo_zu 0.45 0.35 0.29 0.30 0.52 0.26 0.38 1.00 0.54 0.23 0.39 0.38 0.51\n", + "be_wy 0.47 0.36 0.30 0.17 0.44 0.39 0.36 0.54 1.00 0.26 0.37 0.42 0.48\n", + "ba_hy 0.21 0.14 0.27 0.06 0.27 0.15 0.06 0.23 0.26 1.00 0.24 0.10 0.27\n", + "mo_zu 0.25 0.17 0.17 0.07 0.36 0.23 0.18 0.39 0.37 0.24 1.00 0.20 0.34\n", + "be_wy 0.20 0.19 0.15 0.13 0.26 0.43 0.20 0.38 0.42 0.10 0.20 1.00 0.29\n", + "w_lud 0.39 0.33 0.24 0.21 0.41 0.22 0.29 0.51 0.48 0.27 0.34 0.29 1.00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "lVectorized' = map (vectorizeWithHashingTrick 8 collectionLNormalized) collectionLNormalized\n", + "limitedL' = Data.List.take limit lVectorized'\n", + "\n", + "paintMatrix cosineSim labelsLimited limitedL'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Pytanie:** Co si\u0119 stanie, gdy zwi\u0119kszymy $b$, a co je\u015bli zmniejszymy?\n", + "\n", + "Zalety sztuczki z haszowaniem:\n", + "\n", + "* zagwarantowany sta\u0142y rozmiar wektora\n", + "* szybsze obliczenia\n", + "* w naturalny spos\u00f3b uwzgl\u0119dniamy termy, kt\u00f3rych nie by\u0142o w pocz\u0105tkowej kolekcji (ale uwaga na idf! - zob. szkic ni\u017cej)\n", + "* nie musimy pami\u0119ta\u0107 odwzorowania rzutuj\u0105cego s\u0142owa na ich numery\n", + "\n", + "Wady:\n", + "\n", + "* dwa r\u00f3\u017cne s\u0142owa mog\u0105 wpa\u015b\u0107 do jednej przegr\u00f3dki (szczeg\u00f3lnie cz\u0119ste, je\u015bli $b$ jest za ma\u0142e)\n", + "* je\u015bli $b$ ustawimy za du\u017ce, wektory mog\u0105 by\u0107 nawet wi\u0119ksze ni\u017c w przypadku standardowego podej\u015bcia" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Word2vec\n", + "\n", + "A mo\u017ce istnieje dobra wr\u00f3\u017cka, kt\u00f3ra da\u0142aby nam dobre wektory s\u0142\u00f3w (z kt\u00f3rych b\u0119dziemy sk\u0142adali proste wektory dokument\u00f3w przez sumowanie)?\n", + "\n", + "**Pytanie:** Jakie w\u0142asno\u015bci powinny mie\u0107 dobre wektory s\u0142\u00f3w?\n", + "\n", + "Tak! Istniej\u0105 gotowe \"bazy danych\" wektor\u00f3w. Jedn\u0105 z najpopularniejszych (i najstarszych) metod uzyskiwania takich wektor\u00f3w jest Word2vec. Jak dok\u0142adnie dzia\u0142a Word2vec, dowiemy si\u0119 p\u00f3\u017aniej, na dzisiaj po prostu u\u017cyjmy tych wektor\u00f3w." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Najpierw wprowad\u017amy alternatywn\u0105 normalizacj\u0119 zgodn\u0105 z tym, jak zosta\u0142 wygenerowany model.\n",
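+ "\n", + "Zanim jednak to zrobimy, wr\u00f3\u0107my na chwil\u0119 do zastrze\u017cenia \"uwaga na idf!\" przy sztuczce z haszowaniem. Poni\u017cszy szkic zak\u0142ada tylko definicje idf i vectorizeWithHashingTrick z tego notatnika, a zupelnienowyterm to przyk\u0142adowy term, kt\u00f3rego nie ma w kolekcji:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "-- dla termu spoza kolekcji df = 0, wi\u0119c idf = log (N / 0) = Infinity\n", + "idf collectionLNormalized \"zupelnienowyterm\"\n", + "\n", + "-- Infinity trafia potem do wektora dokumentu i psuje podobie\u0144stwo kosinusowe (NaN)\n", + "vectorizeWithHashingTrick 6 collectionLNormalized [\"zupelnienowyterm\", \"albo\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wr\u00f3\u0107my teraz do Word2vec i zapowiedzianej alternatywnej normalizacji:"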
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ala" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ma" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kota" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "normalize' :: Text -> [Text]\n", + "normalize' = removeStopWords . map toLower . tokenize\n", + "\n", + "normalize' \"Ala ma kota.\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "mam" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kumpla" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ktory" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "zdawal" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "walentynki" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "i" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "polozyl" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "koperte" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "dla" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "laski" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "z" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "kartka" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "na" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "desce" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "rozdzielczej" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "egzaminator" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "wziol" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ta" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "karteke" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "i" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "powiedzial" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ze" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "ma" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "znade" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "wypisal" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "mu" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "papierek" + ] + }, + "metadata": {}, + 
"output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "i" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "po" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "egzaminie" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "hehe" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "filmik" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "dobry" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "collectionLNormalized' = map normalize' collectionL\n", + "collectionLNormalized' !! 3" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[-2.305081844329834,0.3418600857257843,4.44999361038208,0.9008448719978333,-2.1629886627197266,1.0206516981124878,4.157524108886719,2.5060904026031494,-0.17275184392929077,4.085052967071533,2.236677408218384,-2.3315281867980957,0.5224806070327759,0.15804219245910645,-1.5636622905731201,-1.2624900341033936,-0.3161393105983734,-1.971177101135254,1.4859644174575806,-0.1742715835571289,1.209444284439087,4.063786193728447e-2,-0.2808700501918793,-0.5895432233810425,-4.126195430755615,-2.690922260284424,1.4975452423095703,-0.25380706787109375,-4.5767364501953125,-1.7726246118545532,2.938936710357666,-0.7173141837120056,-2.4317402839660645,-4.206724643707275,0.6768773198127747,2.236821413040161,4.1044291108846664e-2,1.6991114616394043,1.2354476377367973e-2,-3.079916000366211,-1.7430219650268555,1.8969229459762573,-0.4897139072418213,1.1981141567230225,2.431124687194824,0.39453181624412537,1.9735784530639648,2.124225378036499,-4.338796138763428,-0.954145610332489,3.3927927017211914,0.8821511268615723,5.120451096445322e-3,2.917816638946533,-2.035374164581299,3.3221969604492188,-4.981880187988281,-1.105080008506775,-4.093905448913574,-1.5998111963272095,0.6372298002243042,-0.7565107345581055,0.4038744270801544,0.685226321220398,2.137610912322998,-0.4390018582344055,1.007287859916687,0.19681350886821747,-2.598611354827881,-1.8872140645980835,1.6989527940750122,1.6458508968353271,-5.091184616088867,1.4902764558792114,-0.4839307367801666,-2.840092420578003,1.0180696249008179,0.7615311741828918,1.8135554790496826,-0.30493396520614624,3.5879104137420654,1.4585649967193604,3.2775094509124756,-1.1610190868377686,-2.3159284591674805,4.1530327796936035,-4.67172384262085,-0.8594478964805603,-0.860812783241272,-0.31788957118988037,0.7260096669197083,0.1879102736711502,-0.15789580345153809,1.9434200525283813,-1.9945732355117798,1.8799400329589844,-0.5253798365592957,-0.2834266722202301,-0.8012301921844482,1.5093021392822266]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "100" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "{-# LANGUAGE OverloadedStrings #-}\n", + "{-# LANGUAGE BangPatterns #-}\n", + "\n", + "import Data.Word2Vec.Model\n", + "import Data.Maybe (catMaybes, fromJust)\n", + "import qualified Data.Vector.Storable as V\n", + "\n", + "model <- readWord2VecModel \"tiny.bin\"\n", + "\n", + "toOurVector :: WVector -> [Double]\n", + "toOurVector (WVector v _) = map realToFrac $ V.toList v\n", + "\n", + "balwanV = toOurVector $ fromJust $ getVector model \"ba\u0142wan\"\n", + "balwanV\n", + "Prelude.length 
balwanV\n", + "\n", + "vectorizeWord2vec model d = Prelude.foldr (+++) (zero 100) $ map toOurVector $ catMaybes $ map (getVector model) d\n", + "\n", + "collectionLVectorized'' = map (vectorizeWord2vec model) collectionLNormalized'" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[-26.834667675197124,2.568521626293659,37.66925026476383,9.381511189043522,-32.04328362643719,-19.734033070504665,55.21128339320421,14.215368987061083,23.60182836651802,38.74189975857735,0.16257449332624674,-47.983866568654776,-36.917382495012134,36.08420217037201,13.996580198407173,-30.473296120762825,21.28328724205494,30.601420499384403,-40.5945385559462,16.043263137340546,-8.694086126983166,-41.90418399870396,-10.448782376945019,-0.21028679609298706,9.586350612342358,-46.172676257789135,46.27567541599274,11.25023115798831,9.00947591662407,-43.525397814810276,22.09978771582246,56.93886440992355,-23.428963833488524,-1.4649565666913986,21.969609811902046,-21.504647210240364,24.955158293247223,-8.328911297023296,-31.118815276771784,0.22846409678459167,12.212224327027798,-28.337586268782616,-24.105730276554823,3.36764569953084,8.270942151546478,33.71851025521755,30.665825616568327,-24.134687054902315,-31.72916578501463,35.20022106170654,71.15121555328369,-15.448215141892433,-41.27439119666815,3.0322337672114372,9.768462024629116,38.911416467279196,-9.848581969738007,-20.030757322907448,6.734442539513111,-84.9070791369304,38.147536396980286,4.3607237339019775,-25.426255017518997,5.240264508873224,-32.71464269608259,2.095752328634262,2.4292337521910667,32.93906496465206,-51.44473773613572,0.5551527962088585,-6.1982685178518295,20.187213011085987,-52.809339098632336,-10.458874322474003,13.979218572378159,-38.16066548228264,27.336308609694242,5.3437707126140594,-32.01269288826734,-38.117460787296295,-9.337415304034948,38.90077601373196,-2.158842660486698,-44.878454223275185,23.69188129901886,-54.10413733869791,-41.30505630373955,-37.28948371112347,-65.8488347530365,32.51569982431829,3.781733974814415,72.77320172637701,6.847739472985268,63.77478001266718,24.26227615773678,7.260737741366029,10.931276574730873,-17.388786104973406,9.978045962750912,5.968699499964714]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "collectionLVectorized'' !! 
3" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", + "na_ak 1.00 0.92 0.85 0.77 0.87 0.90 0.92 0.88 0.87 0.87 0.89 0.89 0.89\n", + "w_lud 0.92 1.00 0.92 0.72 0.93 0.93 0.91 0.94 0.95 0.86 0.94 0.94 0.96\n", + "ba_hy 0.85 0.92 1.00 0.69 0.89 0.91 0.83 0.89 0.95 0.86 0.87 0.94 0.90\n", + "w_lap 0.77 0.72 0.69 1.00 0.60 0.74 0.67 0.65 0.68 0.58 0.68 0.73 0.66\n", + "ne_dz 0.87 0.93 0.89 0.60 1.00 0.90 0.87 0.95 0.94 0.86 0.93 0.90 0.95\n", + "be_wy 0.90 0.93 0.91 0.74 0.90 1.00 0.89 0.89 0.91 0.85 0.91 0.96 0.94\n", + "zw_oz 0.92 0.91 0.83 0.67 0.87 0.89 1.00 0.89 0.86 0.86 0.91 0.85 0.90\n", + "mo_zu 0.88 0.94 0.89 0.65 0.95 0.89 0.89 1.00 0.97 0.85 0.95 0.91 0.96\n", + "be_wy 0.87 0.95 0.95 0.68 0.94 0.91 0.86 0.97 1.00 0.84 0.93 0.95 0.95\n", + "ba_hy 0.87 0.86 0.86 0.58 0.86 0.85 0.86 0.85 0.84 1.00 0.83 0.85 0.84\n", + "mo_zu 0.89 0.94 0.87 0.68 0.93 0.91 0.91 0.95 0.93 0.83 1.00 0.91 0.96\n", + "be_wy 0.89 0.94 0.94 0.73 0.90 0.96 0.85 0.91 0.95 0.85 0.91 1.00 0.94\n", + "w_lud 0.89 0.96 0.90 0.66 0.95 0.94 0.90 0.96 0.95 0.84 0.96 0.94 1.00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "limitedL'' = Data.List.take limit collectionLVectorized''\n", + "\n", + "paintMatrix cosineSim labelsLimited limitedL''" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Mo\u017cemy pr\u00f3bowa\u0107 mno\u017cy\u0107 wektory z modelu Word2vec z idf. Najpierw zdefiniujmy mno\u017cenie przez skalar." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[2.5,0.0,5.0]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "(***) :: Double -> [Double] -> [Double]\n", + "(***) s = map (*s)\n", + "\n", + "2.5 *** [1.0, 0.0, 2.0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Teraz b\u0119dziemy przemna\u017cali wektory Word2vec przez idf (jako skalar)." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Fuse foldr/map
Found:
Prelude.foldr (+++) (zero 100)\n", + " $ map (\\ (t, Just v) -> idf coll t *** toOurVector v)\n", + " $ Prelude.filter (\\ (_, v) -> isJust v)\n", + " $ map (\\ t -> (t, getVector model t)) d
Why Not:
foldr\n", + " ((+++) . (\\ (t, Just v) -> idf coll t *** toOurVector v))\n", + " (zero 100)\n", + " (Prelude.filter (\\ (_, v) -> isJust v)\n", + " $ map (\\ t -> (t, getVector model t)) d)
" + ], + "text/plain": [ + "Line 4: Fuse foldr/map\n", + "Found:\n", + "Prelude.foldr (+++) (zero 100)\n", + " $ map (\\ (t, Just v) -> idf coll t *** toOurVector v)\n", + " $ Prelude.filter (\\ (_, v) -> isJust v)\n", + " $ map (\\ t -> (t, getVector model t)) d\n", + "Why not:\n", + "foldr\n", + " ((+++) . (\\ (t, Just v) -> idf coll t *** toOurVector v))\n", + " (zero 100)\n", + " (Prelude.filter (\\ (_, v) -> isJust v)\n", + " $ map (\\ t -> (t, getVector model t)) d)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import Data.Maybe (isJust)\n", + "\n", + "vectorizeWord2vecIdf model coll d = \n", + " Prelude.foldr (+++) (zero 100) \n", + " $ map (\\(t, Just v) -> idf coll t *** toOurVector v) \n", + " $ Prelude.filter (\\(_, v) -> isJust v)\n", + " $ map (\\t -> (t, getVector model t)) d\n", + "\n", + "collectionLVectorized''' = map (vectorizeWord2vecIdf model collectionLNormalized') collectionLNormalized'" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[-35.63830397762308,32.606312678971506,102.20663646169147,56.00417395285867,-130.56709475346878,-14.916644370325773,55.15817632053957,83.2241937686228,26.432875116296394,48.94350344147367,11.370669191277202,-59.54579267200742,-116.01687192456801,60.53824040579282,39.84659684249884,-34.37377085402866,104.53525319069323,45.53363024094972,-34.25020197907558,-43.9007702604392,35.36538495508536,-59.81737728971619,-1.5823889595648828,-50.211106838043655,14.83789867297237,-109.45917608219175,86.56767915592452,-32.170794763065615,29.559930839016644,-126.81686726526162,-9.918908360030228,47.14965938694648,5.955083439147183,41.24417782948478,3.592410260515919,72.10649687523313,61.374776273461855,60.28687760276824,-28.886499026001676,-8.710633131022206,-68.73464623080284,-37.95272838994007,-26.390548039392165,-14.241950251566944,74.6286124718925,46.21889022510431,72.23999508751568,-19.597547074284556,-20.160749174807382,99.49036127458763,131.98057386978817,-23.842794956628147,-62.381675411749846,-19.366936151725387,1.4839595614144327,60.40520721416763,-7.70311857607342,-31.75784386529525,48.71818084466781,-202.41827342135582,138.5639100010709,12.447619757719652,-39.38375639132277,27.877688543771935,-87.00559882214534,56.45689362090545,37.89098984507379,103.78465196444151,-166.10094891357176,-50.83382060940457,11.574060187412977,74.00519869734406,-97.00170731343235,32.18159534728971,-11.280059681646494,-40.701643971890256,74.64230137346699,0.7613112917269982,-6.103424218278271,-150.47551072570587,-21.714627635239918,91.26690441786137,62.91576955719526,-92.35700140312395,-25.421583980267307,-67.87480813505826,-120.16245846953592,-68.89155479679258,-122.00206448376261,35.263603445401785,6.416282520155956,203.41225708856086,-62.42983953251155,59.36113672119048,40.00275897200196,-62.55633545667429,89.66866371308245,-42.287712072353834,-72.59490110281287,52.23637641217955]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "collectionLVectorized''' !! 
3" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", + "na_ak 1.00 0.83 0.78 0.63 0.78 0.81 0.83 0.76 0.77 0.80 0.77 0.79 0.79\n", + "w_lud 0.83 1.00 0.82 0.60 0.84 0.84 0.84 0.85 0.86 0.74 0.86 0.83 0.90\n", + "ba_hy 0.78 0.82 1.00 0.57 0.78 0.84 0.77 0.79 0.90 0.75 0.74 0.89 0.85\n", + "w_lap 0.63 0.60 0.57 1.00 0.38 0.60 0.50 0.43 0.52 0.45 0.55 0.65 0.47\n", + "ne_dz 0.78 0.84 0.78 0.38 1.00 0.81 0.79 0.90 0.89 0.77 0.81 0.81 0.90\n", + "be_wy 0.81 0.84 0.84 0.60 0.81 1.00 0.82 0.76 0.83 0.74 0.81 0.92 0.88\n", + "zw_oz 0.83 0.84 0.77 0.50 0.79 0.82 1.00 0.77 0.77 0.74 0.82 0.75 0.83\n", + "mo_zu 0.76 0.85 0.79 0.43 0.90 0.76 0.77 1.00 0.93 0.74 0.87 0.80 0.90\n", + "be_wy 0.77 0.86 0.90 0.52 0.89 0.83 0.77 0.93 1.00 0.72 0.81 0.89 0.92\n", + "ba_hy 0.80 0.74 0.75 0.45 0.77 0.74 0.74 0.74 0.72 1.00 0.66 0.73 0.72\n", + "mo_zu 0.77 0.86 0.74 0.55 0.81 0.81 0.82 0.87 0.81 0.66 1.00 0.80 0.88\n", + "be_wy 0.79 0.83 0.89 0.65 0.81 0.92 0.75 0.80 0.89 0.73 0.80 1.00 0.87\n", + "w_lud 0.79 0.90 0.85 0.47 0.90 0.88 0.83 0.90 0.92 0.72 0.88 0.87 1.00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "limitedL''' = Data.List.take limit collectionLVectorized'''\n", + "\n", + "paintMatrix cosineSim labelsLimited limitedL'''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Haskell", + "language": "haskell", + "name": "haskell" + }, + "language_info": { + "codemirror_mode": "ihaskell", + "file_extension": ".hs", + "mimetype": "text/x-haskell", + "name": "haskell", + "pygments_lexer": "Haskell", + "version": "8.10.4" + }, + "author": "Filip Grali\u0144ski", + "email": "filipg@amu.edu.pl", + "lang": "pl", + "subtitle": "5.G\u0119ste reprezentacje wektorowe[wyk\u0142ad]", + "title": "Ekstrakcja informacji", + "year": "2021" }, - { - "data": { - "text/plain": [ - "Hash64 0x6c3a641663470e2c" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Hash64 0x6c3a641663470e2c" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Hash64 0xa714568917576314" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Hash64 0x875d9e7e413747c8" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Hash64 0x13ce831936ebc69e" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Hash64 0xb04ce6229407c882" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "Hash64 0x6ecd7bae29ae0450" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import Data.Digest.Murmur64\n", - "\n", - "hash64 \"Komputer\"\n", - "hash64 \"komputer\"\n", - "hash64 \"komputer\"\n", - "hash64 \"komputerze\"\n", - "hash64 \"komputerek\"\n", - "hash64 \"abrakadabra\"\n", - "hash64 \"\"\n", - "hash64 \" \"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Pytanie:** podobne napisy mają zupełnie różne wartości funkcji haszującej, czy to dobrze, czy to źle?" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Musimy tylko sparametryzować naszą funkcję rozmiarem \"odcisku\" (parametr $b$)." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3628" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "25364" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "2877" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "50846" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "12" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "{-# LANGUAGE OverloadedStrings #-}\n", - "\n", - "import Data.Text\n", - "\n", - "-- pomocnicza funkcja, która konwertuje wartość specjalnego\n", - "-- typu Hash64 do zwykłej liczby całkowitej\n", - "hashValueAsInteger :: Hash64 -> Integer\n", - "hashValueAsInteger = toInteger . asWord64\n", - "\n", - "-- unpack to funkcja, która wartość typu String konwertuje do Text\n", - "hash :: Integer -> Text -> Integer\n", - "hash b t = hashValueAsInteger (hash64 $ unpack t) `mod` (2 ^ b)\n", - "\n", - "hash 16 \"komputer\"\n", - "hash 16 \"komputerze\"\n", - "hash 16 \"komputerem\"\n", - "hash 16 \"abrakadabra\"\n", - "hash 4 \"komputer\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Pytanie:** Jakie wartości $b$ będą bezsensowne?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sztuczka z haszowaniem polega na tym, że zamiast numerować słowa korzystając ze słownika, po prostu używamy funkcji haszującej. W ten sposób wektor będzie _zawsze_ rozmiar $2^b$ - bez względu na rozmiar słownika." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Zacznijmy od przywołania wszystkich potrzebnych definicji." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "{-# LANGUAGE OverloadedStrings #-}\n", - "{-# LANGUAGE QuasiQuotes #-}\n", - "\n", - "import Data.Text hiding(map, filter, zip)\n", - "import Text.Regex.PCRE.Heavy\n", - "\n", - "isStopWord :: Text -> Bool\n", - "isStopWord \"w\" = True\n", - "isStopWord \"jest\" = True\n", - "isStopWord \"że\" = True\n", - "isStopWord w = w ≈ [re|^\\p{P}+$|]\n", - "\n", - "\n", - "removeStopWords :: [Text] -> [Text]\n", - "removeStopWords = filter (not . isStopWord)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "{-# LANGUAGE OverloadedStrings #-}\n", - "{-# LANGUAGE QuasiQuotes #-}\n", - "{-# LANGUAGE FlexibleContexts #-}\n", - "\n", - "import Data.Text hiding(map, filter, zip)\n", - "import Prelude hiding(words, take)\n", - "import Text.Regex.PCRE.Heavy\n", - "import Data.Map as Map hiding(take, map, filter)\n", - "import Data.Set as Set hiding(map)\n", - "\n", - "tokenize :: Text -> [Text]\n", - "tokenize = map fst . 
scan [re|C\\+\\+|[\\p{L}0-9]+|\\p{P}|]\n", - "\n", - "\n", - "mockInflectionDictionary :: Map Text Text\n", - "mockInflectionDictionary = Map.fromList [\n", - " (\"kota\", \"kot\"),\n", - " (\"butach\", \"but\"),\n", - " (\"masz\", \"mieć\"),\n", - " (\"ma\", \"mieć\"),\n", - " (\"buta\", \"but\"),\n", - " (\"zgubiłem\", \"zgubić\")]\n", - "\n", - "lemmatizeWord :: Map Text Text -> Text -> Text\n", - "lemmatizeWord dict w = findWithDefault w w dict\n", - "\n", - "lemmatize :: Map Text Text -> [Text] -> [Text]\n", - "lemmatize dict = map (lemmatizeWord dict)\n", - "\n", - "\n", - "poorMansStemming = Data.Text.take 6\n", - "\n", - "normalize :: Text -> [Text]\n", - "normalize = map poorMansStemming . removeStopWords . map toLower . lemmatize mockInflectionDictionary . tokenize\n", - "\n", - "getVocabulary :: [Text] -> Set Text \n", - "getVocabulary = Set.unions . map (Set.fromList . normalize) \n", - " \n", - "idf :: [[Text]] -> Text -> Double\n", - "idf coll t = log (fromIntegral n / fromIntegral df)\n", - " where df = Prelude.length $ Prelude.filter (\\d -> t `elem` d) coll\n", - " n = Prelude.length coll\n", - " \n", - "vectorizeTfIdf :: Int -> [[Text]] -> Map Int Text -> [Text] -> [Double]\n", - "vectorizeTfIdf vecSize coll v doc = map (\\i -> count (v ! i) doc * idf coll (v ! i)) [0..(vecSize-1)]\n", - " where count t doc = fromIntegral $ (Prelude.length . Prelude.filter (== t)) doc " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import System.IO\n", - "import Data.List.Split as SP\n", - "\n", - "legendsh <- openFile \"legendy.txt\" ReadMode\n", - "hSetEncoding legendsh utf8\n", - "contents <- hGetContents legendsh\n", - "ls = Prelude.lines contents\n", - "items = map (map pack . SP.splitOn \"\\t\") ls\n", - "\n", - "labelsL = map Prelude.head items\n", - "collectionL = map (!!1) items\n", - "\n", - "collectionLNormalized = map normalize collectionL\n", - "voc' = getVocabulary collectionL\n", - "\n", - "vocLSize = Prelude.length voc'\n", - "\n", - "vocL :: Map Int Text\n", - "vocL = Map.fromList $ zip [0..] $ Set.toList voc'\n", - "\n", - "invvocL :: Map Text Int\n", - "invvocL = Map.fromList $ zip (Set.toList voc') [0..]\n", - "\n", - "lVectorized = map (vectorizeTfIdf vocLSize collectionLNormalized vocL) collectionLNormalized\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Eta reduce
Found:
formatNumber x = printf \"% 7.2f\" x
Why Not:
formatNumber = printf \"% 7.2f\"
Use zipWith
Found:
map (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix)\n", - " $ zip labels [0 .. (Prelude.length vs - 1)]
Why Not:
zipWith\n", - " (curry (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix))\n", - " labels [0 .. (Prelude.length vs - 1)]
Avoid lambda
Found:
\\ l -> pack $ printf \"% 7s\" l
Why Not:
pack . printf \"% 7s\"
" - ], - "text/plain": [ - "Line 5: Eta reduce\n", - "Found:\n", - "formatNumber x = printf \"% 7.2f\" x\n", - "Why not:\n", - "formatNumber = printf \"% 7.2f\"Line 11: Use zipWith\n", - "Found:\n", - "map (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix)\n", - " $ zip labels [0 .. (Prelude.length vs - 1)]\n", - "Why not:\n", - "zipWith\n", - " (curry (\\ (lab, ix) -> lab <> \" \" <> similarTo simFun vs ix))\n", - " labels [0 .. (Prelude.length vs - 1)]Line 12: Avoid lambda\n", - "Found:\n", - "\\ l -> pack $ printf \"% 7s\" l\n", - "Why not:\n", - "pack . printf \"% 7s\"" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import Text.Printf\n", - "import Data.List (take)\n", - "\n", - "formatNumber :: Double -> String\n", - "formatNumber x = printf \"% 7.2f\" x\n", - "\n", - "similarTo :: ([Double] -> [Double] -> Double) -> [[Double]] -> Int -> Text\n", - "similarTo simFun vs ix = pack $ Prelude.unwords $ map (formatNumber . ((vs !! ix) `simFun`)) vs\n", - "\n", - "paintMatrix :: ([Double] -> [Double] -> Double) -> [Text] -> [[Double]] -> Text\n", - "paintMatrix simFun labels vs = header <> \"\\n\" <> Data.Text.unlines (map (\\(lab, ix) -> lab <> \" \" <> similarTo simFun vs ix) $ zip labels [0..(Prelude.length vs - 1)])\n", - " where header = \" \" <> Data.Text.unwords (map (\\l -> pack $ printf \"% 7s\" l) labels)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - " na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", - "na_ak 1.00 0.02 0.01 0.01 0.03 0.02 0.02 0.04 0.03 0.02 0.01 0.02 0.03\n", - "w_lud 0.02 1.00 0.02 0.05 0.04 0.01 0.03 0.04 0.06 0.01 0.02 0.03 0.06\n", - "ba_hy 0.01 0.02 1.00 0.01 0.02 0.03 0.03 0.04 0.08 0.22 0.01 0.04 0.01\n", - "w_lap 0.01 0.05 0.01 1.00 0.01 0.01 0.00 0.01 0.02 0.00 0.00 0.00 0.00\n", - "ne_dz 0.03 0.04 0.02 0.01 1.00 0.04 0.03 0.07 0.08 0.06 0.03 0.03 0.05\n", - "be_wy 0.02 0.01 0.03 0.01 0.04 1.00 0.01 0.03 0.21 0.01 0.02 0.25 0.01\n", - "zw_oz 0.02 0.03 0.03 0.00 0.03 0.01 1.00 0.04 0.03 0.00 0.01 0.02 0.02\n", - "mo_zu 0.04 0.04 0.04 0.01 0.07 0.03 0.04 1.00 0.10 0.02 0.09 0.05 0.04\n", - "be_wy 0.03 0.06 0.08 0.02 0.08 0.21 0.03 0.10 1.00 0.05 0.03 0.24 0.04\n", - "ba_hy 0.02 0.01 0.22 0.00 0.06 0.01 0.00 0.02 0.05 1.00 0.01 0.02 0.00\n", - "mo_zu 0.01 0.02 0.01 0.00 0.03 0.02 0.01 0.09 0.03 0.01 1.00 0.01 0.02\n", - "be_wy 0.02 0.03 0.04 0.00 0.03 0.25 0.02 0.05 0.24 0.02 0.01 1.00 0.02\n", - "w_lud 0.03 0.06 0.01 0.00 0.05 0.01 0.02 0.04 0.04 0.00 0.02 0.02 1.00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "limit = 13\n", - "labelsLimited = Data.List.take limit labelsL\n", - "limitedL = Data.List.take limit lVectorized\n", - "\n", - "vectorNorm :: [Double] -> Double\n", - "vectorNorm vs = sqrt $ sum $ map (\\x -> x * x) vs\n", - "\n", - "toUnitVector :: [Double] -> [Double]\n", - "toUnitVector vs = map (/ n) vs\n", - " where n = vectorNorm vs\n", - "\n", - "\n", - "(✕) :: [Double] -> [Double] -> Double\n", - "(✕) v1 v2 = sum $ Prelude.zipWith (*) v1 v2\n", - "\n", - "cosineSim v1 v2 = toUnitVector v1 ✕ toUnitVector v2\n", - "\n", - "paintMatrix cosineSim labelsLimited limitedL" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Powyższa macierz reprezentuje porównanie przy użyciu podobieństwa kosinusowego. Spróbujmy teraz użyć gęstszych wektorów przy użyciu hashing trick. 
Jako wartość $b$ przyjmijmy 6.\n", - "\n", - "Zobaczmy najpierw, w które \"przegródki\" będą wpadały poszczególne wyrazy słownika.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[(\"0\",32),(\"00\",4),(\"01\",4),(\"07\",40),(\"09\",44),(\"1\",1),(\"10\",61),(\"100\",27),(\"12\",58),(\"13\",51),(\"131\",37),(\"15\",30),(\"16\",21),(\"17\",58),(\"18\",55),(\"19\",35),(\"1997r\",61),(\"2\",62),(\"20\",28),(\"2006\",44),(\"2008\",19),(\"2009\",4),(\"2010\",3),(\"22\",27),(\"23\",34),(\"24\",7),(\"25\",29),(\"26\",35),(\"27\",44),(\"28\",61),(\"29\",30),(\"3\",56),(\"30\",55),(\"300\",38),(\"31\",45),(\"4\",53),(\"40\",39),(\"42\",43),(\"48\",53),(\"49\",13),(\"5\",31),(\"50\",32),(\"56\",38),(\"57\",55),(\"6\",59),(\"7\",27),(\"8\",34),(\"a\",27),(\"aaa\",33),(\"absolu\",11),(\"absurd\",18),(\"aby\",12),(\"adnym\",10),(\"adres\",15),(\"adrese\",62),(\"afroam\",3),(\"afryce\",46),(\"agresy\",57),(\"ah\",37),(\"aha\",42),(\"aig\",56),(\"akadem\",18),(\"akcja\",0),(\"akcje\",21),(\"akompa\",13),(\"aktor\",26),(\"akurat\",7),(\"albino\",27),(\"albo\",44),(\"ale\",7),(\"alfa\",58),(\"alkoho\",56),(\"altern\",38),(\"ameryk\",11),(\"amp\",62),(\"anakon\",34),(\"analiz\",62),(\"andrze\",63),(\"anegdo\",43),(\"ang\",37),(\"anga\\380o\",27),(\"anglii\",33),(\"ani\",22),(\"anonsu\",36),(\"antono\",3),(\"antykr\",41),(\"apetyt\",16),(\"apolit\",39),(\"apropo\",54),(\"apteki\",20),(\"aqua\",59),(\"archit\",61),(\"aromat\",44),(\"artyku\",31),(\"asami\",22),(\"astron\",59),(\"asy\\347ci\",60),(\"atmosf\",37),(\"audycj\",50),(\"auta\",38)]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "map (\\t -> (t, hash 6 t)) $ Data.List.take 100 $ Set.toList voc'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Pytanie:** Czy jakieś dwa termy wpadły do jednej przegródki?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Stwórzmy najpierw funkcję, która będzie wektoryzowała pojedynczy term $t$. 
Po prostu stworzymy wektor, które będzie miał rozmiar $2^b$, wszędzie będzie miał 0 z wyjątkiem pozycji o numerze $H_b(t)$ - tam wpiszmy odwrotną częstość dokumentową.\n", - "\n", - "$$\\vec{t} = [0,\\dots,\\idf_c t,\\dots,0]$$\n", - "\n", - "Teraz dla dokumentu $d = (t_1,\\dots,t_n)$ i dla schematu ważenia tf-idf:\n", - "\n", - "$$\\vec{d} = \\sum \\vec{t_i}$$" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.465908118654584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.268683541318364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "wordVector :: Integer -> [[Text]] -> Text -> [Double]\n", - "wordVector b coll term = map selector [0..vecSize]\n", - " where vecSize = 2^b - 1\n", - " wordFingerprint = hash b term\n", - " selector i \n", - " | i == wordFingerprint = idf coll term\n", - " | otherwise = 0.0\n", - "\n", - "wordVector 6 collectionLNormalized \"aromat\"\n", - "wordVector 6 collectionLNormalized \"albo\"\n", - "wordVector 6 collectionLNormalized \"akcja\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Teraz wystarczy zsumować wektory dla poszczególnych słów, żeby otrzymać wektor dokumentu. Najpierw zdefiniujmy sobie sumę wektorową." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[1.2,4.0,3.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "(+++) :: [Double] -> [Double] -> [Double]\n", - "(+++) = Prelude.zipWith (+)\n", - "\n", - "[0.2, 0.5, 1.0] +++ [1.0, 3.5, 2.0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Przydatna będzie jeszcze funkcja, która tworzy wektor z samymi zerami o zadanej długości:" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "zero :: Int -> [Double]\n", - "zero s = Prelude.replicate s 0.0\n", - "\n", - "zero (2^6)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Eta reduce
Found:
vectorizeWithHashingTrick b coll doc\n", - " = Prelude.foldr ((+++) . wordVector b coll) (zero $ 2 ^ b) doc
Why Not:
vectorizeWithHashingTrick b coll\n", - " = Prelude.foldr ((+++) . wordVector b coll) (zero $ 2 ^ b)
" - ], - "text/plain": [ - "Line 3: Eta reduce\n", - "Found:\n", - "vectorizeWithHashingTrick b coll doc\n", - " = Prelude.foldr ((+++) . wordVector b coll) (zero $ 2 ^ b) doc\n", - "Why not:\n", - "vectorizeWithHashingTrick b coll\n", - " = Prelude.foldr ((+++) . wordVector b coll) (zero $ 2 ^ b)" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[5.242936783195232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.856470206220483,0.0,0.0,1.1700712526502546,0.5947071077466928,0.0,5.712940412440966,3.0708470981669183,0.0,0.0,4.465908118654584,0.0,3.7727609380946383,0.0,0.0,0.0,0.0,4.788681510917635,0.0,3.7727609380946383,0.0,1.575536360758419,0.0,3.079613757534693,0.0,4.465908118654584,0.0,4.588010815455483,4.465908118654584,0.0,1.5214691394881432,0.0,0.0,0.0,0.0,4.465908118654584,2.5199979695992702,0.0,1.5214691394881432,8.388148398070203e-2,0.0,4.465908118654584,0.0,0.0,3.367295829986474,0.0,3.7727609380946383,0.0,1.5214691394881432,0.0,3.7727609380946383,0.0,0.0,0.0,3.367295829986474,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.734591659972947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.734591659972947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.003275201291313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "[3.367295829986474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.931816237309167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "\n", - "vectorizeWithHashingTrick :: Integer -> [[Text]] -> [Text] -> [Double]\n", - "vectorizeWithHashingTrick b coll doc = Prelude.foldr ((+++) . wordVector b coll) (zero $ 2^b) doc\n", - "\n", - "vectorizeWithHashingTrick 6 collectionLNormalized $ collectionLNormalized !! 3\n", - "vectorizeWithHashingTrick 6 collectionLNormalized [\"aromat\", \"albo\", \"akcja\"]\n", - "vectorizeWithHashingTrick 6 collectionLNormalized [\"akcja\", \"aromat\", \"albo\"]\n", - "vectorizeWithHashingTrick 6 collectionLNormalized [\"akcja\", \"aromat\", \"albo\", \"albo\"]\n", - "vectorizeWithHashingTrick 6 collectionLNormalized [\"akcja\", \"aromat\", \"09\"]\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Zobaczmy, jak zagęszczenie wpływa na macierz podobieństwa." 
- ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - " na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", - "na_ak 1.00 0.37 0.21 0.28 0.35 0.22 0.32 0.45 0.47 0.21 0.25 0.20 0.39\n", - "w_lud 0.37 1.00 0.28 0.18 0.38 0.15 0.20 0.35 0.36 0.14 0.17 0.19 0.33\n", - "ba_hy 0.21 0.28 1.00 0.08 0.20 0.18 0.24 0.29 0.30 0.27 0.17 0.15 0.24\n", - "w_lap 0.28 0.18 0.08 1.00 0.10 0.11 0.11 0.30 0.17 0.06 0.07 0.13 0.21\n", - "ne_dz 0.35 0.38 0.20 0.10 1.00 0.32 0.30 0.52 0.44 0.27 0.36 0.26 0.41\n", - "be_wy 0.22 0.15 0.18 0.11 0.32 1.00 0.26 0.26 0.39 0.15 0.23 0.43 0.22\n", - "zw_oz 0.32 0.20 0.24 0.11 0.30 0.26 1.00 0.38 0.36 0.06 0.18 0.20 0.29\n", - "mo_zu 0.45 0.35 0.29 0.30 0.52 0.26 0.38 1.00 0.54 0.23 0.39 0.38 0.51\n", - "be_wy 0.47 0.36 0.30 0.17 0.44 0.39 0.36 0.54 1.00 0.26 0.37 0.42 0.48\n", - "ba_hy 0.21 0.14 0.27 0.06 0.27 0.15 0.06 0.23 0.26 1.00 0.24 0.10 0.27\n", - "mo_zu 0.25 0.17 0.17 0.07 0.36 0.23 0.18 0.39 0.37 0.24 1.00 0.20 0.34\n", - "be_wy 0.20 0.19 0.15 0.13 0.26 0.43 0.20 0.38 0.42 0.10 0.20 1.00 0.29\n", - "w_lud 0.39 0.33 0.24 0.21 0.41 0.22 0.29 0.51 0.48 0.27 0.34 0.29 1.00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "lVectorized' = map (vectorizeWithHashingTrick 8 collectionLNormalized) collectionLNormalized\n", - "limitedL' = Data.List.take limit lVectorized'\n", - "\n", - "paintMatrix cosineSim labelsLimited limitedL'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Pytanie:** Co się stanie, gdy zwiększymy $b$, a co jeśli zmniejszymi?\n", - "\n", - "Zalety sztuczki z haszowaniem:\n", - "\n", - "* zagwarantowany stały rozmiar wektora\n", - "* szybsze obliczenia\n", - "* w naturalny sposób uwzględniamy termy, których nie było w początkowej kolekcji (ale uwaga na idf!)\n", - "* nie musimy pamiętać odzworowania rzutującego słowa na ich numery\n", - "\n", - "Wady:\n", - "\n", - "* dwa różne słowa mogą wpaść do jednej przegródki (szczególnie częste, jeśli $b$ jest za małe)\n", - "* jeśli $b$ ustawimy za duże, wektory mogą być nawet większe niż w przypadku standardowego podejścia\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Word2vec\n", - "\n", - "A może istnieje dobra wróżka, która dałaby nam dobre wektory słów (z których będziemy składali proste wektory dokumentów przez sumowanie)?\n", - "\n", - "**Pytanie:** Jakie własności powinny mieć dobre wektory słów?\n", - "\n", - "Tak! Istnieją gotowe \"bazy danych\" wektorów. Jedną z najpopularniejszych (i najstarszych) metod uzyskiwania takich wektorów jest Word2vec. Jak dokładnie Word2vec, dowiemy się później, na dzisiaj po prostu użyjmy tych wektorów.\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Najpierw wprowadźmy alternatywną normalizację zgodną z tym, jak został wygenerowany model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ala" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ma" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kota" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "normalize' :: Text -> [Text]\n", - "normalize' = removeStopWords . map toLower . tokenize\n", - "\n", - "normalize' \"Ala ma kota.\"" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "mam" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kumpla" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ktory" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "zdawal" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "walentynki" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "i" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "polozyl" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "koperte" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "dla" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "laski" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "z" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "kartka" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "na" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "desce" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "rozdzielczej" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "egzaminator" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "wziol" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ta" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "karteke" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "i" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "powiedzial" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ze" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "ma" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "znade" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "wypisal" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "mu" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "papierek" - ] - }, - "metadata": {}, - 
"output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "i" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "po" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "egzaminie" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "hehe" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "filmik" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "dobry" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "collectionLNormalized' = map normalize' collectionL\n", - "collectionLNormalized' !! 3" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[-2.305081844329834,0.3418600857257843,4.44999361038208,0.9008448719978333,-2.1629886627197266,1.0206516981124878,4.157524108886719,2.5060904026031494,-0.17275184392929077,4.085052967071533,2.236677408218384,-2.3315281867980957,0.5224806070327759,0.15804219245910645,-1.5636622905731201,-1.2624900341033936,-0.3161393105983734,-1.971177101135254,1.4859644174575806,-0.1742715835571289,1.209444284439087,4.063786193728447e-2,-0.2808700501918793,-0.5895432233810425,-4.126195430755615,-2.690922260284424,1.4975452423095703,-0.25380706787109375,-4.5767364501953125,-1.7726246118545532,2.938936710357666,-0.7173141837120056,-2.4317402839660645,-4.206724643707275,0.6768773198127747,2.236821413040161,4.1044291108846664e-2,1.6991114616394043,1.2354476377367973e-2,-3.079916000366211,-1.7430219650268555,1.8969229459762573,-0.4897139072418213,1.1981141567230225,2.431124687194824,0.39453181624412537,1.9735784530639648,2.124225378036499,-4.338796138763428,-0.954145610332489,3.3927927017211914,0.8821511268615723,5.120451096445322e-3,2.917816638946533,-2.035374164581299,3.3221969604492188,-4.981880187988281,-1.105080008506775,-4.093905448913574,-1.5998111963272095,0.6372298002243042,-0.7565107345581055,0.4038744270801544,0.685226321220398,2.137610912322998,-0.4390018582344055,1.007287859916687,0.19681350886821747,-2.598611354827881,-1.8872140645980835,1.6989527940750122,1.6458508968353271,-5.091184616088867,1.4902764558792114,-0.4839307367801666,-2.840092420578003,1.0180696249008179,0.7615311741828918,1.8135554790496826,-0.30493396520614624,3.5879104137420654,1.4585649967193604,3.2775094509124756,-1.1610190868377686,-2.3159284591674805,4.1530327796936035,-4.67172384262085,-0.8594478964805603,-0.860812783241272,-0.31788957118988037,0.7260096669197083,0.1879102736711502,-0.15789580345153809,1.9434200525283813,-1.9945732355117798,1.8799400329589844,-0.5253798365592957,-0.2834266722202301,-0.8012301921844482,1.5093021392822266]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "100" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "{-# LANGUAGE OverloadedStrings #-}\n", - "{-# LANGUAGE BangPatterns #-}\n", - "\n", - "import Data.Word2Vec.Model\n", - "import Data.Maybe (catMaybes, fromJust)\n", - "import qualified Data.Vector.Storable as V\n", - "\n", - "model <- readWord2VecModel \"tiny.bin\"\n", - "\n", - "toOurVector :: WVector -> [Double]\n", - "toOurVector (WVector v _) = map realToFrac $ V.toList v\n", - "\n", - "balwanV = toOurVector $ fromJust $ getVector model \"bałwan\"\n", - "balwanV\n", - "Prelude.length balwanV\n", - 
"\n", - "vectorizeWord2vec model d = Prelude.foldr (+++) (zero 100) $ map toOurVector $ catMaybes $ map (getVector model) d\n", - "\n", - "collectionLVectorized'' = map (vectorizeWord2vec model) collectionLNormalized'" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[-26.834667675197124,2.568521626293659,37.66925026476383,9.381511189043522,-32.04328362643719,-19.734033070504665,55.21128339320421,14.215368987061083,23.60182836651802,38.74189975857735,0.16257449332624674,-47.983866568654776,-36.917382495012134,36.08420217037201,13.996580198407173,-30.473296120762825,21.28328724205494,30.601420499384403,-40.5945385559462,16.043263137340546,-8.694086126983166,-41.90418399870396,-10.448782376945019,-0.21028679609298706,9.586350612342358,-46.172676257789135,46.27567541599274,11.25023115798831,9.00947591662407,-43.525397814810276,22.09978771582246,56.93886440992355,-23.428963833488524,-1.4649565666913986,21.969609811902046,-21.504647210240364,24.955158293247223,-8.328911297023296,-31.118815276771784,0.22846409678459167,12.212224327027798,-28.337586268782616,-24.105730276554823,3.36764569953084,8.270942151546478,33.71851025521755,30.665825616568327,-24.134687054902315,-31.72916578501463,35.20022106170654,71.15121555328369,-15.448215141892433,-41.27439119666815,3.0322337672114372,9.768462024629116,38.911416467279196,-9.848581969738007,-20.030757322907448,6.734442539513111,-84.9070791369304,38.147536396980286,4.3607237339019775,-25.426255017518997,5.240264508873224,-32.71464269608259,2.095752328634262,2.4292337521910667,32.93906496465206,-51.44473773613572,0.5551527962088585,-6.1982685178518295,20.187213011085987,-52.809339098632336,-10.458874322474003,13.979218572378159,-38.16066548228264,27.336308609694242,5.3437707126140594,-32.01269288826734,-38.117460787296295,-9.337415304034948,38.90077601373196,-2.158842660486698,-44.878454223275185,23.69188129901886,-54.10413733869791,-41.30505630373955,-37.28948371112347,-65.8488347530365,32.51569982431829,3.781733974814415,72.77320172637701,6.847739472985268,63.77478001266718,24.26227615773678,7.260737741366029,10.931276574730873,-17.388786104973406,9.978045962750912,5.968699499964714]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "collectionLVectorized'' !! 
3" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - " na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", - "na_ak 1.00 0.92 0.85 0.77 0.87 0.90 0.92 0.88 0.87 0.87 0.89 0.89 0.89\n", - "w_lud 0.92 1.00 0.92 0.72 0.93 0.93 0.91 0.94 0.95 0.86 0.94 0.94 0.96\n", - "ba_hy 0.85 0.92 1.00 0.69 0.89 0.91 0.83 0.89 0.95 0.86 0.87 0.94 0.90\n", - "w_lap 0.77 0.72 0.69 1.00 0.60 0.74 0.67 0.65 0.68 0.58 0.68 0.73 0.66\n", - "ne_dz 0.87 0.93 0.89 0.60 1.00 0.90 0.87 0.95 0.94 0.86 0.93 0.90 0.95\n", - "be_wy 0.90 0.93 0.91 0.74 0.90 1.00 0.89 0.89 0.91 0.85 0.91 0.96 0.94\n", - "zw_oz 0.92 0.91 0.83 0.67 0.87 0.89 1.00 0.89 0.86 0.86 0.91 0.85 0.90\n", - "mo_zu 0.88 0.94 0.89 0.65 0.95 0.89 0.89 1.00 0.97 0.85 0.95 0.91 0.96\n", - "be_wy 0.87 0.95 0.95 0.68 0.94 0.91 0.86 0.97 1.00 0.84 0.93 0.95 0.95\n", - "ba_hy 0.87 0.86 0.86 0.58 0.86 0.85 0.86 0.85 0.84 1.00 0.83 0.85 0.84\n", - "mo_zu 0.89 0.94 0.87 0.68 0.93 0.91 0.91 0.95 0.93 0.83 1.00 0.91 0.96\n", - "be_wy 0.89 0.94 0.94 0.73 0.90 0.96 0.85 0.91 0.95 0.85 0.91 1.00 0.94\n", - "w_lud 0.89 0.96 0.90 0.66 0.95 0.94 0.90 0.96 0.95 0.84 0.96 0.94 1.00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "limitedL'' = Data.List.take limit collectionLVectorized''\n", - "\n", - "paintMatrix cosineSim labelsLimited limitedL''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Możemy próbować mnożyć wektory z modelu Word2vec z idf. Najpierw zdefiniujmy mnożenie przez skalar." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[2.5,0.0,5.0]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "(***) :: Double -> [Double] -> [Double]\n", - "(***) s = map (*s)\n", - "\n", - "2.5 *** [1.0, 0.0, 2.0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Teraz będziemy przemnażali wektory Word2vec przez idf (jako skalar)." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Fuse foldr/map
Found:
Prelude.foldr (+++) (zero 100)\n", - " $ map (\\ (t, Just v) -> idf coll t *** toOurVector v)\n", - " $ Prelude.filter (\\ (_, v) -> isJust v)\n", - " $ map (\\ t -> (t, getVector model t)) d
Why Not:
foldr\n", - " ((+++) . (\\ (t, Just v) -> idf coll t *** toOurVector v))\n", - " (zero 100)\n", - " (Prelude.filter (\\ (_, v) -> isJust v)\n", - " $ map (\\ t -> (t, getVector model t)) d)
" - ], - "text/plain": [ - "Line 4: Fuse foldr/map\n", - "Found:\n", - "Prelude.foldr (+++) (zero 100)\n", - " $ map (\\ (t, Just v) -> idf coll t *** toOurVector v)\n", - " $ Prelude.filter (\\ (_, v) -> isJust v)\n", - " $ map (\\ t -> (t, getVector model t)) d\n", - "Why not:\n", - "foldr\n", - " ((+++) . (\\ (t, Just v) -> idf coll t *** toOurVector v))\n", - " (zero 100)\n", - " (Prelude.filter (\\ (_, v) -> isJust v)\n", - " $ map (\\ t -> (t, getVector model t)) d)" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import Data.Maybe (isJust)\n", - "\n", - "vectorizeWord2vecIdf model coll d = \n", - " Prelude.foldr (+++) (zero 100) \n", - " $ map (\\(t, Just v) -> idf coll t *** toOurVector v) \n", - " $ Prelude.filter (\\(_, v) -> isJust v)\n", - " $ map (\\t -> (t, getVector model t)) d\n", - "\n", - "collectionLVectorized''' = map (vectorizeWord2vecIdf model collectionLNormalized') collectionLNormalized'" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[-35.63830397762308,32.606312678971506,102.20663646169147,56.00417395285867,-130.56709475346878,-14.916644370325773,55.15817632053957,83.2241937686228,26.432875116296394,48.94350344147367,11.370669191277202,-59.54579267200742,-116.01687192456801,60.53824040579282,39.84659684249884,-34.37377085402866,104.53525319069323,45.53363024094972,-34.25020197907558,-43.9007702604392,35.36538495508536,-59.81737728971619,-1.5823889595648828,-50.211106838043655,14.83789867297237,-109.45917608219175,86.56767915592452,-32.170794763065615,29.559930839016644,-126.81686726526162,-9.918908360030228,47.14965938694648,5.955083439147183,41.24417782948478,3.592410260515919,72.10649687523313,61.374776273461855,60.28687760276824,-28.886499026001676,-8.710633131022206,-68.73464623080284,-37.95272838994007,-26.390548039392165,-14.241950251566944,74.6286124718925,46.21889022510431,72.23999508751568,-19.597547074284556,-20.160749174807382,99.49036127458763,131.98057386978817,-23.842794956628147,-62.381675411749846,-19.366936151725387,1.4839595614144327,60.40520721416763,-7.70311857607342,-31.75784386529525,48.71818084466781,-202.41827342135582,138.5639100010709,12.447619757719652,-39.38375639132277,27.877688543771935,-87.00559882214534,56.45689362090545,37.89098984507379,103.78465196444151,-166.10094891357176,-50.83382060940457,11.574060187412977,74.00519869734406,-97.00170731343235,32.18159534728971,-11.280059681646494,-40.701643971890256,74.64230137346699,0.7613112917269982,-6.103424218278271,-150.47551072570587,-21.714627635239918,91.26690441786137,62.91576955719526,-92.35700140312395,-25.421583980267307,-67.87480813505826,-120.16245846953592,-68.89155479679258,-122.00206448376261,35.263603445401785,6.416282520155956,203.41225708856086,-62.42983953251155,59.36113672119048,40.00275897200196,-62.55633545667429,89.66866371308245,-42.287712072353834,-72.59490110281287,52.23637641217955]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "collectionLVectorized''' !! 
3" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - " na_ak w_lud ba_hy w_lap ne_dz be_wy zw_oz mo_zu be_wy ba_hy mo_zu be_wy w_lud\n", - "na_ak 1.00 0.83 0.78 0.63 0.78 0.81 0.83 0.76 0.77 0.80 0.77 0.79 0.79\n", - "w_lud 0.83 1.00 0.82 0.60 0.84 0.84 0.84 0.85 0.86 0.74 0.86 0.83 0.90\n", - "ba_hy 0.78 0.82 1.00 0.57 0.78 0.84 0.77 0.79 0.90 0.75 0.74 0.89 0.85\n", - "w_lap 0.63 0.60 0.57 1.00 0.38 0.60 0.50 0.43 0.52 0.45 0.55 0.65 0.47\n", - "ne_dz 0.78 0.84 0.78 0.38 1.00 0.81 0.79 0.90 0.89 0.77 0.81 0.81 0.90\n", - "be_wy 0.81 0.84 0.84 0.60 0.81 1.00 0.82 0.76 0.83 0.74 0.81 0.92 0.88\n", - "zw_oz 0.83 0.84 0.77 0.50 0.79 0.82 1.00 0.77 0.77 0.74 0.82 0.75 0.83\n", - "mo_zu 0.76 0.85 0.79 0.43 0.90 0.76 0.77 1.00 0.93 0.74 0.87 0.80 0.90\n", - "be_wy 0.77 0.86 0.90 0.52 0.89 0.83 0.77 0.93 1.00 0.72 0.81 0.89 0.92\n", - "ba_hy 0.80 0.74 0.75 0.45 0.77 0.74 0.74 0.74 0.72 1.00 0.66 0.73 0.72\n", - "mo_zu 0.77 0.86 0.74 0.55 0.81 0.81 0.82 0.87 0.81 0.66 1.00 0.80 0.88\n", - "be_wy 0.79 0.83 0.89 0.65 0.81 0.92 0.75 0.80 0.89 0.73 0.80 1.00 0.87\n", - "w_lud 0.79 0.90 0.85 0.47 0.90 0.88 0.83 0.90 0.92 0.72 0.88 0.87 1.00" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "limitedL''' = Data.List.take limit collectionLVectorized'''\n", - "\n", - "paintMatrix cosineSim labelsLimited limitedL'''" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Haskell", - "language": "haskell", - "name": "haskell" - }, - "language_info": { - "codemirror_mode": "ihaskell", - "file_extension": ".hs", - "mimetype": "text/x-haskell", - "name": "haskell", - "pygments_lexer": "Haskell", - "version": "8.10.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/wyk/06_Uczenie_maszynowe.ipynb b/wyk/06_Uczenie_maszynowe.ipynb index a08a594..4480153 100644 --- a/wyk/06_Uczenie_maszynowe.ipynb +++ b/wyk/06_Uczenie_maszynowe.ipynb @@ -1,5 +1,19 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

6. Wyzwania uczenia maszynowego [wykład]

\n", + "

Filip Graliński (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -367,11 +381,14 @@ } ], "metadata": { + "author": "Filip Graliński", + "email": "filipg@amu.edu.pl", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, + "lang": "pl", "language_info": { "codemirror_mode": { "name": "ipython", @@ -382,8 +399,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" - } + "version": "3.9.6" + }, + "subtitle": "6.Wyzwania uczenia maszynowego[wykład]", + "title": "Ekstrakcja informacji", + "year": "2021" }, "nbformat": 4, "nbformat_minor": 4 diff --git a/wyk/07_Naiwny_klasyfikator_bayesowski.ipynb b/wyk/07_Naiwny_klasyfikator_bayesowski.ipynb index 5468e43..918ae7f 100644 --- a/wyk/07_Naiwny_klasyfikator_bayesowski.ipynb +++ b/wyk/07_Naiwny_klasyfikator_bayesowski.ipynb @@ -1,5 +1,20 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "45264aad", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

7. Naiwny klasyfikator bayesowski w ekstrakcji informacji [wykład]

\n", + "

Filip Graliński (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, { "cell_type": "markdown", "id": "moderate-array", @@ -347,11 +362,14 @@ } ], "metadata": { + "author": "Filip Graliński", + "email": "filipg@amu.edu.pl", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, + "lang": "pl", "language_info": { "codemirror_mode": { "name": "ipython", @@ -362,8 +380,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" - } + "version": "3.9.6" + }, + "subtitle": "7.Naiwny klasyfikator bayesowski w ekstrakcji informacji[wykład]", + "title": "Ekstrakcja informacji", + "year": "2021" }, "nbformat": 4, "nbformat_minor": 5 diff --git a/wyk/08_Regresja_liniowa.ipynb b/wyk/08_Regresja_liniowa.ipynb index 121951f..3fb53ab 100644 --- a/wyk/08_Regresja_liniowa.ipynb +++ b/wyk/08_Regresja_liniowa.ipynb @@ -1,5 +1,20 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "35c19016", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

8. Regresja liniowa [wykład]

\n", + "

Filip Graliński (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, { "cell_type": "markdown", "id": "cathedral-newark", @@ -141,6 +156,8 @@ "\n", "![Morskie Oko - Krzysztof Dudzik](08_files/morskieoko.jpg)\n", "\n", + "(Źródło: https://pl.wikipedia.org/wiki/Morskie_Oko#/media/Plik:Morskie_Oko_ze_szlaku_przez_%C5%9Awist%C3%B3wk%C4%99.jpg, licencja CC BY 3.0)\n", + "\n", "Schodź wzdłuż lokalnego spadku funkcji błędu.\n", "\n", "Tak więc w praktyce zamiast podstawiać do wzoru lepiej się uczyć iteracyjnie -\n", @@ -279,11 +296,14 @@ } ], "metadata": { + "author": "Filip Graliński", + "email": "filipg@amu.edu.pl", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, + "lang": "pl", "language_info": { "codemirror_mode": { "name": "ipython", @@ -294,8 +314,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" - } + "version": "3.9.6" + }, + "subtitle": "8.Regresja liniowa[wykład]", + "title": "Ekstrakcja informacji", + "year": "2021" }, "nbformat": 4, "nbformat_minor": 5 diff --git a/wyk/09_neurozoo.ipynb b/wyk/09_neurozoo.ipynb index 7d5a737..e93fb77 100644 --- a/wyk/09_neurozoo.ipynb +++ b/wyk/09_neurozoo.ipynb @@ -1,5 +1,19 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

9. Przegląd składowych sieci neuronowych [wykład]

\n", + "

Filip Graliński (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1463,13 +1477,7 @@ "1.938151240348816 1.998972773551941 950 4 tensor([[0.1241, 0.1263, 0.1215, 0.1199, 0.1355, 0.1184, 0.1261, 0.1283]],\n", " grad_fn=) Andrzej Kostyra stworzył \"idealnego polskiego boksera\". Jest dużo cech Tomasza Adamka Andrzej Kostyra, ekspert bokserski, stworzył model \"idealnego polskiego pięściarza\". Wymienił najlepsze cechy poszczególnych bokserów. Najwięcej jest Tomasza Adamka.\n", "1.928910732269287 1.9361062049865723 1000 1 tensor([[0.1222, 0.1443, 0.1320, 0.1216, 0.1117, 0.1137, 0.1200, 0.1346]],\n", - " grad_fn=) Rajd Niemiec: Andreas Mikkelsen i Jari-Matti Latvala najszybsi na shakedown W czwartek kierowcy mieli do pokonania odcinek testowy przed Rajdem Niemiec. Na mecie okazało się, że Andreas Mikkelsen i Jari-Matti Latvala uzyskali identyczny czas.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + " grad_fn=) Rajd Niemiec: Andreas Mikkelsen i Jari-Matti Latvala najszybsi na shakedown W czwartek kierowcy mieli do pokonania odcinek testowy przed Rajdem Niemiec. Na mecie okazało się, że Andreas Mikkelsen i Jari-Matti Latvala uzyskali identyczny czas.\n", "1.9247257709503174 1.9077305793762207 1050 4 tensor([[0.1264, 0.1246, 0.1286, 0.1161, 0.1484, 0.1108, 0.1174, 0.1276]],\n", " grad_fn=) Była rywalka Joanny Jędrzejczyk na dopingu. Czeka ją zawieszenie Była pretendenta to tytułu mistrzyni UFC w wadze słomkowej, Jessica Penne (MMA 12-5) została zawieszona przez Amerykańską Agencję Antydopingową za stosowanie niedozwolonego środka. Amerykankę czeka 1,5-roczne zawieszenie.\n", "1.9094451665878296 1.8653218746185303 1100 2 tensor([[0.1117, 0.1150, 0.1548, 0.1148, 0.1137, 0.1239, 0.1094, 0.1566]],\n", @@ -1511,13 +1519,7 @@ "1.7356246709823608 1.938697099685669 2000 6 tensor([[0.1114, 0.0960, 0.1303, 0.1193, 0.1003, 0.1257, 0.1439, 0.1731]],\n", " grad_fn=) KMŚ 2017: ZAKSA - Sarmayeh Bank Teheran na żywo. Gdzie oglądać transmisję TV i online? We wtorek, ZAKSA Kędzierzyn-Koźle zmierzy się z Sarmayeh Bank Teheran w ramach Klubowych Mistrzostw Świata w siatkówce. Transmisja TV na antenie Polsat Sport. Stream online w Ipla TV. Relacja LIVE w WP SportoweFakty za darmo.\n", "1.7901594638824463 1.9917528629302979 2050 1 tensor([[0.1212, 0.1365, 0.1351, 0.1287, 0.1104, 0.1252, 0.1179, 0.1250]],\n", - " grad_fn=) Wakacyjny freestyle Przygońskiego i Pawlusiaka na pustyni Pędzące po wydmach dakarowe MINI, specjalnie dostosowany snowboard, lina i dwóch utalentowanych sportowców - tak w skrócie można opisać projekt \"Przygoński & Pawlusiak Dune Freestyle\".\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + " grad_fn=) Wakacyjny freestyle Przygońskiego i Pawlusiaka na pustyni Pędzące po wydmach dakarowe MINI, specjalnie dostosowany snowboard, lina i dwóch utalentowanych sportowców - tak w skrócie można opisać projekt \"Przygoński & Pawlusiak Dune Freestyle\".\n", "1.7326788902282715 1.8687950372695923 2100 5 tensor([[0.1091, 0.1428, 0.1050, 0.1267, 0.1092, 0.1543, 0.1100, 0.1429]],\n", " grad_fn=) Martynas Sajus: Sobin jest bardziej doświadczonym graczem, ale w przyszłości będę od niego lepszy Pojedynek Josipa Sobina z Martynasem Sajusem może być jednym ze smaczków piątkowego spotkania Anwilu z Polpharmą. Który ze środkowych da więcej swojej ekipie? 
- On jest bardziej doświadczony, ale w przyszłości to ja będę lepszy - śmieje się Sajus.\n", "1.7521668672561646 1.5104379653930664 2150 2 tensor([[0.0978, 0.1259, 0.2208, 0.1105, 0.1043, 0.1174, 0.1048, 0.1186]],\n", @@ -1557,13 +1559,7 @@ "1.6379656791687012 1.4863052368164062 3000 3 tensor([[0.0881, 0.0816, 0.1089, 0.2262, 0.0698, 0.1202, 0.1658, 0.1394]],\n", " grad_fn=) Liga Mistrzów: Paris Saint-Germain HB kolejnym uczestnikiem Final Four Paris Saint-Germain HB zremisował z MOL-Pickiem Szeged 30:30 w rewanżowym meczu ćwierćfinałowym Ligi Mistrzów 2016/2017, tym samym zdobywając awans do turnieju finałowego w Kolonii.\n", "1.620102047920227 1.955077886581421 3050 5 tensor([[0.0998, 0.1599, 0.1024, 0.1031, 0.1239, 0.1416, 0.1172, 0.1520]],\n", - " grad_fn=) Chewbacca ma nową twarz. Jak koszykarz z Finlandii trafił do \"Gwiezdnych Wojen\" Zbliżający się weekend będzie tym, w którym miliony fanów \"Gwiezdnych Wojen\" zaczną szturmować kina, by obejrzeć 8. część sagi. Wielu z nich nie wie, że za maską Chewbakki od niedawna skrywa się nowa twarz - fińskiego koszykarza, Joonasa Suotamo.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + " grad_fn=) Chewbacca ma nową twarz. Jak koszykarz z Finlandii trafił do \"Gwiezdnych Wojen\" Zbliżający się weekend będzie tym, w którym miliony fanów \"Gwiezdnych Wojen\" zaczną szturmować kina, by obejrzeć 8. część sagi. Wielu z nich nie wie, że za maską Chewbakki od niedawna skrywa się nowa twarz - fińskiego koszykarza, Joonasa Suotamo.\n", "1.6508986949920654 1.7872048616409302 3100 7 tensor([[0.1113, 0.1329, 0.0890, 0.1126, 0.1327, 0.1295, 0.1246, 0.1674]],\n", " grad_fn=) Ireneusz Mamrot liczy na przełamanie. \"Jest sportowa złość, która musi się przełożyć na naszą korzyść\" - Nie ma zdenerwowania, ale jest duża sportowa złość. To musi się przełożyć na naszą korzyść - mówi przed sobotnim pojedynkiem z Koroną Kielce trener Jagiellonii Białystok, Ireneusz Mamrot. - Nie można wiecznie mieć gorszego okresu - dodaje.\n", "1.5091105699539185 1.5536433458328247 3150 2 tensor([[0.1030, 0.1194, 0.2115, 0.1183, 0.1021, 0.1098, 0.1085, 0.1274]],\n", @@ -1603,13 +1599,7 @@ "1.4597876071929932 1.3940199613571167 4000 7 tensor([[0.0933, 0.1557, 0.0803, 0.0930, 0.1256, 0.1070, 0.0970, 0.2481]],\n", " grad_fn=) Grzegorz Krychowiak na zakręcie. Mundial to ostatnia szansa Grzegorz Krychowiak znowu jest na zakręcie i musi szukać nowego klubu. Paris-Saint Germain chce się pozbyć Polaka na dobre. Mundial w Rosji to dla mistrzów Francji ostatnia szansa, żeby sprzedać go za godne pieniądze.\n", "1.4579588174819946 1.5661852359771729 4050 6 tensor([[0.0991, 0.1113, 0.0903, 0.1400, 0.0902, 0.1380, 0.2088, 0.1223]],\n", - " grad_fn=) ZAKSA Kędzierzyn-Koźle trenuje już niemal w komplecie Na początku tygodnia do kędzierzyńskiej drużyny dołączyli zawodnicy, którzy brali udział w mistrzostwach Europy. Wyjątkiem jest francuski rozgrywający Benjamin Toniutti.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + " grad_fn=) ZAKSA Kędzierzyn-Koźle trenuje już niemal w komplecie Na początku tygodnia do kędzierzyńskiej drużyny dołączyli zawodnicy, którzy brali udział w mistrzostwach Europy. Wyjątkiem jest francuski rozgrywający Benjamin Toniutti.\n", "1.524263858795166 1.2569677829742432 4100 1 tensor([[0.0736, 0.2845, 0.0688, 0.0741, 0.1107, 0.1046, 0.1125, 0.1710]],\n", " grad_fn=) Krzysztof Hołowczyc trzyma kciuki za Kubicę. \"Ci, którzy nie chcą jego powrotu, po prostu się go boją\" Trwa walka Roberta Kubicy o powrót do Formuły 1. 
Polak jest jednym z kandydatów do reprezentowania w przyszłym sezonie barw zespołu Williams. Za Kubicę kciuki trzyma Krzysztof Hołowczyc.\n", "1.4493881464004517 1.4371377229690552 4150 1 tensor([[0.1067, 0.2376, 0.1001, 0.0918, 0.1164, 0.1187, 0.1077, 0.1211]],\n", @@ -2226,11 +2216,14 @@ } ], "metadata": { + "author": "Filip Graliński", + "email": "filipg@amu.edu.pl", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, + "lang": "pl", "language_info": { "codemirror_mode": { "name": "ipython", @@ -2241,10 +2234,13 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.9.6" }, - "org": null + "org": null, + "subtitle": "9.Przegląd składowych sieci neuronowych[wykład]", + "title": "Ekstrakcja informacji", + "year": "2021" }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 } diff --git a/wyk/11_rnn.ipynb b/wyk/11_rnn.ipynb index 89ec808..18d26d8 100644 --- a/wyk/11_rnn.ipynb +++ b/wyk/11_rnn.ipynb @@ -1,5 +1,19 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

11. Sieci rekurencyjne [wykład]

\n", + "

Filip Graliński (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -498,11 +512,14 @@ } ], "metadata": { + "author": "Filip Graliński", + "email": "filipg@amu.edu.pl", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, + "lang": "pl", "language_info": { "codemirror_mode": { "name": "ipython", @@ -513,9 +530,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.9.6" }, - "org": null + "org": null, + "subtitle": "11.Sieci rekurencyjne[wykład]", + "title": "Ekstrakcja informacji", + "year": "2021" }, "nbformat": 4, "nbformat_minor": 4 diff --git a/wyk/12_bpe.ipynb b/wyk/12_bpe.ipynb index 0220f04..c29d4f7 100644 --- a/wyk/12_bpe.ipynb +++ b/wyk/12_bpe.ipynb @@ -1,839 +1,861 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Podział na jednostki podwyrazowe\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Słownik nie może być za duży…\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Jeśli używamy wyuczalnych zanurzeń słów (embeddingów), wówczas musimy\n", - "je dopisać do listy parametrów całego modelu — jest to $|V|n$ wag,\n", - "gdzie $n$ to rozmiar embeddingów; w wypadku uczenia dodatkowo musimy\n", - "jeszcze pamiętać związane z embeddingami gradienty. Pamięć RAM karty\n", - "graficznej jest rzecz jasna ograniczona, słownik więc nie może być\n", - "dowolnie duży. Dla danego modelu karty graficznej dość łatwo ustalić\n", - "maksymalny rozmiar słownika — jest „twarde” ograniczenie, które musimy\n", - "spełnić.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Czy rzeczywiście słownik może być taki duży?\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Ile jest różnych form fleksyjnych w języku polskim? Zobaczmy w słowniku PoliMorf…\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "a\n", - "aa\n", - "AA\n", - "Aachen\n", - "Aalborg\n", - "Aalborgiem\n", - "Aalborgowi\n", - "Aalborgu\n", - "AAP\n", - "Aar\n", - "Aarem\n", - "Aarowi\n", - "Aaru\n", - "Aarze\n", - "Aara\n", - "Aarą\n", - "Aarę\n", - "Aaro\n", - "Aary\n", - "Aarze\n", - "uniq: błąd zapisu: Przerwany potok\n" - ] - } - ], - "source": [ - "! wget -q 'http://zil.ipipan.waw.pl/PoliMorf?action=AttachFile&do=get&target=PoliMorf-0.6.7.tab.gz' -O - | zcat | cut -f 1 | uniq | head -n 20" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "3844535\n" - ] - } - ], - "source": [ - "! wget -q 'http://zil.ipipan.waw.pl/PoliMorf?action=AttachFile&do=get&target=PoliMorf-0.6.7.tab.gz' -O - | zcat | cut -f 1 | sort -u | wc -l" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Pytanie** W którym języku europejskim wyrazów będzie jeszcze więcej niż języku polskim?\n", - "\n", - "Tak naprawdę form jest jeszcze więcej, oczywiście PoliMorf nie wyczerpuje zbioru…\n", - "\n", - "**Pytanie** Podaj przykłady „oczywistych” wyrazów, których nie ma w PoliMorfie. 
Jak w sposób systematyczny szukać takich wyrazów?\n", - "\n", - "Z drugiej strony, w PoliMorfie jest dużo dziwnych, „sztucznych” wyrazów.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "niebiałościenną\n", - "nieponadosobowości\n", - "nieknerający\n", - "inspektoratów\n", - "Korytkowskich\n", - "elektrostatyczności\n", - "Okola\n", - "bezsłowny\n", - "indygowcu\n", - "gadany\n", - "nieładowarkowościach\n", - "niepawężnicowate\n", - "Thom\n", - "poradlmy\n", - "olejący\n", - "Ziemianinów\n", - "stenotropizmami\n", - "wigiliowości\n", - "pognanej\n", - "niekinezyterapeutycznym\n" - ] - } - ], - "source": [ - "! wget -q 'http://zil.ipipan.waw.pl/PoliMorf?action=AttachFile&do=get&target=PoliMorf-0.6.7.tab.gz' -O - | zcat | cut -f 1 | shuf -n 20" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Inaczej, zobaczmy, ile różnych wyrazów jest w jakimś rzeczywistym zbiorze tekstów, rozpatrzmy\n", - "teksty zebrane na potrzeby identyfikacji płci autora tekstu:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "# Out[7]:" - ] - } - ], - "source": [ - "! git clone --single-branch --depth 1 git://gonito.net/petite-difference-challenge2" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "! xzcat petite-difference-challenge2/train/in.tsv.xz | perl -C -ne 'print \"$&\\n\" while/\\p{L}+/g;' | sort -u > vocab.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ˆ\n", - "ˇ\n", - "゚\n", - "a\n", - "A\n", - "á\n", - "Á\n", - "à\n", - "À\n", - "ă\n", - "Ă\n", - "â\n", - "Â\n", - "å\n", - "Å\n", - "ä\n", - "Ä\n", - "Ã\n", - "ā\n", - "aa\n", - "aA\n", - "Aa\n", - "AA\n", - "aĂ\n", - "AĂ\n", - "aâ\n", - "aÂ\n", - "Aâ\n", - "aÅ\n", - "aÄ\n", - "ª\n", - "aaa\n", - "aAa\n", - "Aaa\n", - "AaA\n", - "AAa\n", - "AAA\n", - "aaaa\n", - "aAaa\n", - "Aaaa\n", - "AaAa\n", - "AAaa\n", - "AAAa\n", - "AAAA\n", - "aaaaa\n", - "Aaaaa\n", - "AaaaA\n", - "AAaaa\n", - "AAAAA\n", - "aaaaaa\n" - ] - } - ], - "source": [ - "! head -n 50 vocab.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2974556 vocab.txt\n" - ] - } - ], - "source": [ - "! wc -l vocab.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Co gorsza, nawet jak weźmiemy cały taki słownik bez ograniczeń i tak\n", - "nie pokryje on sporej części tekstów przetwarzanych w czasie inferencji.\n", - "Zobaczmy, ilu wyrazów ze zbioru deweloperskiego nie będzie w słowniku.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "81380\n" - ] - } - ], - "source": [ - "! 
cat petite-difference-challenge2/dev-0/in.tsv | perl -C -ne 'print \"$&\\n\" while/\\p{L}+/g;' | sort -u | comm vocab.txt - -13 | wc -l" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Takie wyrazy nazywamy wyrazami **OOV** (*out-of-vocabulary*).\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Obcięcie słownika\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Najprostszy sposób ograniczenia słownika to po prostu obcięcie do $N$ najczęstszych słów.\n", - "\n", - "Spróbujmy zastosować do korpusu „płci”:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "sort: błąd zapisu: 'standardowe wyjście': Przerwany potok\n", - "sort: błąd zapisu\n" - ] - } - ], - "source": [ - "! xzcat petite-difference-challenge2/train/in.tsv.xz | perl -C -ne 'print \"$&\\n\" while/\\p{L}+/g;' | sort | uniq -c | sort -k 1rn | head -n 50000 | sort -k 2 > vocab50000.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Daje to lepszy efekt niż można się spodziewać. Odrzucamy w ten sposób\n", - "tylko bardzo rzadkie słowa (albo takie, które wystąpiły tylko raz w\n", - "korpusie — tzw. *hapax legomena*), choć tych słów jest bardzo dużo.\n", - "\n", - "**Zagadka**: 50000 najczęstszych słów (1,9% **typów**) pokrywa jaki odsetek **wystąpień**?\n", - "\n", - "Rozkład normalny w języku nie jest… normalny — nie spotkamy się z nim\n", - "badając języki. W tekstach dominują „skrzywione” rozkłady z długimi,\n", - "„chudymi” ogonami.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "! xzcat petite-difference-challenge2/train/in.tsv.xz | perl -C -ne 'print \"$&\\n\" while/\\p{L}+/g;' | sort | uniq -c | sort -k 1rn | cut -f 1 > freqs.txt" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'word-distribution.png'" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

12. Kodowanie BPE [wyk\u0142ad]

\n", + "

Filip Grali\u0144ski (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Podzia\u0142 na jednostki podwyrazowe\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### S\u0142ownik nie mo\u017ce by\u0107 za du\u017cy\u2026\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Je\u015bli u\u017cywamy wyuczalnych zanurze\u0144 s\u0142\u00f3w (embedding\u00f3w), w\u00f3wczas musimy\n", + "je dopisa\u0107 do listy parametr\u00f3w ca\u0142ego modelu \u2014 jest to $|V|n$ wag,\n", + "gdzie $n$ to rozmiar embedding\u00f3w; w wypadku uczenia dodatkowo musimy\n", + "jeszcze pami\u0119ta\u0107 zwi\u0105zane z embeddingami gradienty. Pami\u0119\u0107 RAM karty\n", + "graficznej jest rzecz jasna ograniczona, s\u0142ownik wi\u0119c nie mo\u017ce by\u0107\n", + "dowolnie du\u017cy. Dla danego modelu karty graficznej do\u015b\u0107 \u0142atwo ustali\u0107\n", + "maksymalny rozmiar s\u0142ownika \u2014 jest \u201etwarde\u201d ograniczenie, kt\u00f3re musimy\n", + "spe\u0142ni\u0107.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Czy rzeczywi\u015bcie s\u0142ownik mo\u017ce by\u0107 taki du\u017cy?\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ile jest r\u00f3\u017cnych form fleksyjnych w j\u0119zyku polskim? Zobaczmy w s\u0142owniku PoliMorf\u2026\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a\n", + "aa\n", + "AA\n", + "Aachen\n", + "Aalborg\n", + "Aalborgiem\n", + "Aalborgowi\n", + "Aalborgu\n", + "AAP\n", + "Aar\n", + "Aarem\n", + "Aarowi\n", + "Aaru\n", + "Aarze\n", + "Aara\n", + "Aar\u0105\n", + "Aar\u0119\n", + "Aaro\n", + "Aary\n", + "Aarze\n", + "uniq: b\u0142\u0105d zapisu: Przerwany potok\n" + ] + } + ], + "source": [ + "! wget -q 'http://zil.ipipan.waw.pl/PoliMorf?action=AttachFile&do=get&target=PoliMorf-0.6.7.tab.gz' -O - | zcat | cut -f 1 | uniq | head -n 20" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3844535\n" + ] + } + ], + "source": [ + "! wget -q 'http://zil.ipipan.waw.pl/PoliMorf?action=AttachFile&do=get&target=PoliMorf-0.6.7.tab.gz' -O - | zcat | cut -f 1 | sort -u | wc -l" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Pytanie** W kt\u00f3rym j\u0119zyku europejskim wyraz\u00f3w b\u0119dzie jeszcze wi\u0119cej ni\u017c j\u0119zyku polskim?\n", + "\n", + "Tak naprawd\u0119 form jest jeszcze wi\u0119cej, oczywi\u015bcie PoliMorf nie wyczerpuje zbioru\u2026\n", + "\n", + "**Pytanie** Podaj przyk\u0142ady \u201eoczywistych\u201d wyraz\u00f3w, kt\u00f3rych nie ma w PoliMorfie. 
Jak w spos\u00f3b systematyczny szuka\u0107 takich wyraz\u00f3w?\n", + "\n", + "Z drugiej strony, w PoliMorfie jest du\u017co dziwnych, \u201esztucznych\u201d wyraz\u00f3w.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "niebia\u0142o\u015bcienn\u0105\n", + "nieponadosobowo\u015bci\n", + "niekneraj\u0105cy\n", + "inspektorat\u00f3w\n", + "Korytkowskich\n", + "elektrostatyczno\u015bci\n", + "Okola\n", + "bezs\u0142owny\n", + "indygowcu\n", + "gadany\n", + "nie\u0142adowarkowo\u015bciach\n", + "niepaw\u0119\u017cnicowate\n", + "Thom\n", + "poradlmy\n", + "olej\u0105cy\n", + "Ziemianin\u00f3w\n", + "stenotropizmami\n", + "wigiliowo\u015bci\n", + "pognanej\n", + "niekinezyterapeutycznym\n" + ] + } + ], + "source": [ + "! wget -q 'http://zil.ipipan.waw.pl/PoliMorf?action=AttachFile&do=get&target=PoliMorf-0.6.7.tab.gz' -O - | zcat | cut -f 1 | shuf -n 20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Inaczej, zobaczmy, ile r\u00f3\u017cnych wyraz\u00f3w jest w jakim\u015b rzeczywistym zbiorze tekst\u00f3w, rozpatrzmy\n", + "teksty zebrane na potrzeby identyfikacji p\u0142ci autora tekstu:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# Out[7]:" + ] + } + ], + "source": [ + "! git clone --single-branch --depth 1 git://gonito.net/petite-difference-challenge2" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "! xzcat petite-difference-challenge2/train/in.tsv.xz | perl -C -ne 'print \"$&\\n\" while/\\p{L}+/g;' | sort -u > vocab.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u02c6\n", + "\u02c7\n", + "\uff9f\n", + "a\n", + "A\n", + "\u00e1\n", + "\u00c1\n", + "\u00e0\n", + "\u00c0\n", + "\u0103\n", + "\u0102\n", + "\u00e2\n", + "\u00c2\n", + "\u00e5\n", + "\u00c5\n", + "\u00e4\n", + "\u00c4\n", + "\u00c3\n", + "\u0101\n", + "aa\n", + "aA\n", + "Aa\n", + "AA\n", + "a\u0102\n", + "A\u0102\n", + "a\u00e2\n", + "a\u00c2\n", + "A\u00e2\n", + "a\u00c5\n", + "a\u00c4\n", + "\u00c2\u00aa\n", + "aaa\n", + "aAa\n", + "Aaa\n", + "AaA\n", + "AAa\n", + "AAA\n", + "aaaa\n", + "aAaa\n", + "Aaaa\n", + "AaAa\n", + "AAaa\n", + "AAAa\n", + "AAAA\n", + "aaaaa\n", + "Aaaaa\n", + "AaaaA\n", + "AAaaa\n", + "AAAAA\n", + "aaaaaa\n" + ] + } + ], + "source": [ + "! head -n 50 vocab.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2974556 vocab.txt\n" + ] + } + ], + "source": [ + "! wc -l vocab.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Co gorsza, nawet jak we\u017amiemy ca\u0142y taki s\u0142ownik bez ogranicze\u0144 i tak\n", + "nie pokryje on sporej cz\u0119\u015bci tekst\u00f3w przetwarzanych w czasie inferencji.\n", + "Zobaczmy, ilu wyraz\u00f3w ze zbioru deweloperskiego nie b\u0119dzie w s\u0142owniku.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "81380\n" + ] + } + ], + "source": [ + "! 
cat petite-difference-challenge2/dev-0/in.tsv | perl -C -ne 'print \"$&\\n\" while/\\p{L}+/g;' | sort -u | comm vocab.txt - -13 | wc -l" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Takie wyrazy nazywamy wyrazami **OOV** (*out-of-vocabulary*).\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Obci\u0119cie s\u0142ownika\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Najprostszy spos\u00f3b ograniczenia s\u0142ownika to po prostu obci\u0119cie do $N$ najcz\u0119stszych s\u0142\u00f3w.\n", + "\n", + "Spr\u00f3bujmy zastosowa\u0107 do korpusu \u201ep\u0142ci\u201d:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sort: b\u0142\u0105d zapisu: 'standardowe wyj\u015bcie': Przerwany potok\n", + "sort: b\u0142\u0105d zapisu\n" + ] + } + ], + "source": [ + "! xzcat petite-difference-challenge2/train/in.tsv.xz | perl -C -ne 'print \"$&\\n\" while/\\p{L}+/g;' | sort | uniq -c | sort -k 1rn | head -n 50000 | sort -k 2 > vocab50000.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Daje to lepszy efekt ni\u017c mo\u017cna si\u0119 spodziewa\u0107. Odrzucamy w ten spos\u00f3b\n", + "tylko bardzo rzadkie s\u0142owa (albo takie, kt\u00f3re wyst\u0105pi\u0142y tylko raz w\n", + "korpusie \u2014 tzw. *hapax legomena*), cho\u0107 tych s\u0142\u00f3w jest bardzo du\u017co.\n", + "\n", + "**Zagadka**: 50000 najcz\u0119stszych s\u0142\u00f3w (1,9% **typ\u00f3w**) pokrywa jaki odsetek **wyst\u0105pie\u0144**?\n", + "\n", + "Rozk\u0142ad normalny w j\u0119zyku nie jest\u2026 normalny \u2014 nie spotkamy si\u0119 z nim\n", + "badaj\u0105c j\u0119zyki. W tekstach dominuj\u0105 \u201eskrzywione\u201d rozk\u0142ady z d\u0142ugimi,\n", + "\u201echudymi\u201d ogonami.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "! 
xzcat petite-difference-challenge2/train/in.tsv.xz | perl -C -ne 'print \"$&\\n\" while/\\p{L}+/g;' | sort | uniq -c | sort -k 1rn | cut -f 1 > freqs.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'word-distribution.png'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAEFCAYAAAD69rxNAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAXf0lEQVR4nO3deZSdVZnv8e9TQ+YRUiEhIYQhBDDMhSB4kUFZkaalrwtYoNJ2yzUXb+ttbg8qehVv32UvV+ui2742l5tWBG0aW5S2kSYMjYzKVAlTSJgkAoEMBSEhIWMl+/5xTmWoU5U6qXNOndpV389aWanznvec93mz9ceu/b7v3pFSQpKUn4Z6FyBJ6hsDXJIyZYBLUqYMcEnKlAEuSZlq6s+DTZo0Kc2cObM/DylJ2Vu4cOFbKaWWrtv7NcBnzpxJW1tbfx5SkrIXEa92t90hFEnKlAEuSZkywCUpUwa4JGXKAJekTBngkpQpA1ySMpVFgN+7dBXX3v9yvcuQpAEliwC//4V2vv/QsnqXIUkDShYBDuDCE5K0pywCPKLeFUjSwJNFgAPY/5akPWUR4HbAJalUFgEuSSqVTYB7DVOS9pRFgIdXMSWpRBYBDt5GKEldZRPgkqQ9ZRPg9r8laU9ZBLhD4JJUKosAB+yCS1IXvQZ4RFwfEasjYnE37/1FRKSImFSb8orH8VEeSSpRTg/8BmBu140RcRDwEeC1KtfULTvgkrSnXgM8pfQgsKabt/4W+CL9kK2OgUtSqT6NgUfEx4A3UkpPl7HvvIhoi4i29vb2vhxOktSNfQ7wiBgFfBX4ejn7p5Tmp5RaU0qtLS0t+3q43b+nz5+VpMGoLz3ww4BDgKcj4nfAdGBRREypZmG7cwRFkko17esHUkrPApM7XxdDvDWl9FYV6yo9bi2/XJIyVM5thDcDjwCzI2J5RFxe+7K61tDfR5Skga/XHnhK6dJe3p9ZtWr2epz+OIok5SOLJzGdTlaSSmUR4ADJUXBJ2kMWAW7/W5JKZRHg4Bi4JHWVR4DbBZekEnkEON4HLkldZRHgTicrSaWyCHBJUql8AtwxFEnaQxYB7nM8klQqiwAHH+SRpK6yCHA74JJUKosABx/kkaSusghwx8AlqVQWAQ7ehCJJXWUR4D7II0mlsghwcFFjSeqqnCXVro+I1RGxeLdt346I5yPimYj414iYUMsiHQOXpFLl9MBvAOZ22XYPMCeldCzwInBVleuSJPWi1wBPKT0IrOmy7e6UUkfx5aPA9BrUtmcdtT6AJGWmGmPgnwEW9PRmRMyLiLaIaGtvb+/TARxBkaRSFQV4RHwV6ABu6mmflNL8lFJrSqm1paWlz8fyGqYk7amprx+MiE8D5wPnpFrfIuJVTEkq0acAj4i5wJeAD6WUNla3JElSOcq5jfBm4BFgdkQsj4jLge8BY4F7IuKpiLiulkXa/5akUr32wFNKl3az+Qc1qKVXKSXC4RRJAjJ5EtPMlqRSWQR4J+9EkaRdsghwJ7OSpFJZBLgkqVRWAe4IiiTtkkWAexFTkkplEeCdnBNcknbJIsDtgEtSqSwCvJP9b0naJYsAdwxckkplEeCdHAKXpF2yCHDnP5GkUlkEeKfkKLgk7ZRVgEuSdskqwB0Dl6Rdsghwh8AlqVQWAS5JKlXOkmrXR8TqiFi827b9IuKeiHip+PfE2pYpSeqqnB74DcDcLtu+DNybUpoF3Ft8XTPOBy5JpXoN8JTSg8CaLpsvAG4s/nwj8AfVLaunWvrjKJKUh76OgR+QUloBUPx7ck87RsS8iGiLiLb29vY+HcyLmJJUquYXMVNK81NKrSml1paWlsq+ywd5JGmnvgb4qoiYClD8e3X1SiplB1ySSvU1wG8DPl38+dPAv1WnnL1zDFySdinnNsKbgUeA2RGxPCIuB74FfCQiXgI+UnxdM46BS1Kppt52SCld2sNb51S5ll7ZAZekXbJ4EtP7wCWpVBYBLkkqlUWAd46Bb9/hIIokdcoiwJsbC2V2bN9R50okaeDIK8DtgUvSTlkEeFNjYQxla4c9cEnqlEWANxcD3B64JO2SRYA3NTgGLkldZRHgnWPgWw1wSdopkwAvDqFsdwhFkjplEeBNO+9CsQcuSZ2yCPDmhkIPfJs9cEnaKY8AbyqUuc0xcEnaKYsAb2pwDFySusoiwDvvQrEHLkm7ZBHgTT7II0klsghwe+CSVKqiAI+I/xERz0XE4oi4OSJGVKuw3Q3rfJDHuVAkaac+B3hETAP+O9CaUpoDNAKXVKuw3Y1obgRgswEuSTtVOoTSBIyMiCZgFPBm5SWVGjmsGOBbt9fi6yUpS30O8JTSG8B3gNeAFcC6lNLdXfeLiHkR0RYRbe3t7X061shiD3yjAS5JO1UyhDIRuAA4BDgQGB0Rn+q6X0ppfkqpNaXU2tLS0qdjNTYEw5oa2Lito6/lStKgU8kQyoeBZSml9pTSNuBW4LTqlFVq9LBGNm6xBy5JnSoJ8NeAUyNiVEQEcA6wtDpllRo9vIkNW+yBS1KnSsbAHwN+BiwCni1+1/wq1VVizPAm1m82wCWpU1MlH04pXQ1cXaVa9mrsiCY2bNnWH4eSpCxk8SQmwLgRzfbAJWk3+QT4yGbWbbIHLkmdsgnw8SObWbfRAJekTtkE+MRRw1i/pcP5UCSpKJsA3290MwBrN22tcyWSNDBkE+ATRw8DYM17BrgkQUYB3jJmOADt67fUuRJJGhiyCfDJ4wpTja9+1wCXJMgowA8YV+iBr3x3c50rkaSBIZsAHzWsiXEjmli5zgCXJMgowAGmTRzFm2s31bsMSRoQsgrw6RNH8vo7G+tdhiQNCFkF+Iz9RvHamo2klOpdiiTVXVYBPnP/UWzetoPV3kooSZkF+KTRALzS/l6dK5Gk+ssqwA9rGQPAy6vX17kSSaq/rAJ86vgRjB3exIurNtS7FEmqu4oCPCImRMTPIuL5iFgaER+oVmE9HI/ZU8aydMW7tTyMJGWh0h74d4E7U0pHAsdRw0WNOx194DiWrniXHTu8E0XS0NbnAI+IccAZwA8AUkpbU0prq1RXj+ZMG897W7ez7G0vZEoa2irpgR8KtAM/jIgnI+L7ETG6604RMS8i2iKirb2
9vYLDFRw7fTwAT7++tuLvkqScVRLgTcCJwP9NKZ0AvAd8uetOKaX5KaXWlFJrS0tLBYcrmDV5LGOGN7HotXcq/i5JylklAb4cWJ5Seqz4+mcUAr2mGhuCE2ZM4IllBrikoa3PAZ5SWgm8HhGzi5vOAZZUpapenHro/rywaj1vbfCJTElDV6V3oXwBuCkingGOB/664orK8MHDJwHw65ff6o/DSdKA1FTJh1NKTwGt1SmlfHOmjWfCqGYeeLGdC46f1t+Hl6QBIasnMTs1NgRnHtHCfc+vpmP7jnqXI0l1kWWAA8ydM4V3Nm7jsWVr6l2KJNVFtgH+oSMmM3pYI7948o16lyJJdZFtgI8c1sjcOVO5c/FKNm/bXu9yJKnfZRvgABccfyDrt3Rw79LV9S5Fkvpd1gF+2mH7M2XcCG5Z+Hq9S5Gkfpd1gDc1NnBR63QeeLGdZW85uZWkoSXrAAe47NSDaWoIbvzN7+pdiiT1q+wDfPK4Efz+sQfyL0+8zpr3tta7HEnqN9kHOMBnzziUTdu28/V/W1zvUiSp3wyKAD9q6jj+06xJ3P7MCt5Yu6ne5UhSvxgUAQ7wJ2cdDsCVP3myzpVIUv8YNAF+6qH7M23CSJ743Ts89+a6epcjSTU3aAIc4HufOAGAy29oq3MlklR7gyrAT5gxkfcdOI6V727m5wuX17scSaqpQRXgAD/8o5MB+PNbnmabU81KGsQGXYBPHjeCz5x+CADfufuFOlcjSbVTcYBHRGNEPBkRt1ejoGr42vlHcdbsFv7fA69w3wtOdCVpcKpGD/xPgaVV+J6qiQiuufh4xg5v4o9/+ISLH0salCoK8IiYDvwe8P3qlFM9E0cP44sfPRKAs75zPymlOlckSdVVaQ/874AvAj1eLYyIeRHRFhFt7e3tFR5u31x26sGcOGMC6zd38IfXP96vx5akWutzgEfE+cDqlNLCve2XUpqfUmpNKbW2tLT09XB9dssVpwHw0Etv8a0Fz/f78SWpVirpgZ8OfCwifgf8BDg7Iv6pKlVVUWND8PhXzwHgugd+y0+fcPEHSYNDnwM8pXRVSml6SmkmcAnwq5TSp6pWWRVNHjuCX37+gwB88efPsODZFXWuSJIqN+juA+/JMdPH73zI53M3LeLu51bWuSJJqkxVAjyldH9K6fxqfFctnXXkZK795IkAzPvxQu6wJy4pY0OmB97pvGOm8veXFia9+m83LeLWRc6ZIilPQy7AAT523IFc96mTAPiznz7NdQ/8ts4VSdK+G5IBDjB3zhR+Mu9UAL614Hn+7F+e8mEfSVkZsgEOhUUgfvXnHwLg1iff4JxrHmBLx/Y6VyVJ5RnSAQ5waMsYnvzaR2hsCF5pf4/Z//NOXn37vXqXJUm9GvIBDoV5U17433P54OGTAPjQt+/nhl8vq3NVkrR3BnhRU2MD//RfTuGb/3kOAN/45RKu+PFCtu9wXFzSwGSAd/HJUw7mts+fDsCdz63ksK/cwaLX3qlzVZJUygDvxrHTJ/DMN86l9eCJAHz82t/wV79cQodLtEkaQAzwHowb0cwtV3yAay4+DoDrf72M1m/+B4vfWFfnyiSpwADfi4jg4ydO55lvnMt5x0xh7cZtnP9/Hua//riNDVs66l2epCHOAC/DuBHNXPvJk/j2hccCcNdzq5hz9V0seHaFQS6pbgzwfXBR60E8/KWz+NyZhwGFWQ1P/9avWL1+sw8ASep30Z+Pj7e2tqa2trZ+O14t3ff8au5espKbHy8sEDFtwkge/tJZRESdK5M02ETEwpRSa9ftTfUoZjA468jJvP+Q/Thm2gQeeqmdBYtXMufquxg/spnbvvBBJo0ZXu8SJQ1yBngFRg9v4hOnzOCcoyYzY/9RLH9nE//+zAo++6M2xo9s5tyjp/CJU2bUu0xJg5QBXgUHjBvBVR89inWbtrFu4zbWb97Gk6+t5aVVGxgzovBPfPTUsRw+eWydK5U0mPR5DDwiDgJ+BEwBdgDzU0rf3dtnBtMYeG+++e9L+MeHds2ncuSUsdx55Rl1rEhSrnoaA68kwKcCU1NKiyJiLLAQ+IOU0pKePjOUArxj+w5eXbORlOC7977EgmdXMOuAQg88gCs/PItz3zelvkVKykLVL2KmlFYAK4o/r4+IpcA0oMcAH0qaGhs4rGUMAJ/+wMFs2badzv9UPvhiO3c+t5KTio/qA4wc1sioYY5oSSpfVW4jjIiZwIPAnJTSu13emwfMA5gxY8ZJr776asXHy925f/sAL67asMe2kc2NPHLV2UwYNaxOVUkaqGp2G2FEjAF+DlzZNbwBUkrzgflQGEKp9HiDwbcvPI6nl6/d+Xrpine5+fHXeX7lemZNHrPHvqOHNzGiubGfK5SUg4oCPCKaKYT3TSmlW6tT0uB33EETOO6gCTtf/+a3b3Hz469zyfxHS/adMm4Ej1x1tg8ISSrR5wCPQqL8AFiaUrqmeiUNPSfP3I/vXHQcG7fuOa/Kwy+9xd1LVrGlY4e9cEklKumBnw5cBjwbEU8Vt30lpXRHxVUNMc2NDVx40vRu37t7ySouv/EJmhq6n7bm/GOnclHrQbUsT9IAVcldKA9TuCNONfL+Q/bj5JkT2bBlO1A6WdYrqzfw3pYOA1waorxvbQA7cso4brnitB7f/8wNT7B6/eZ+rEjSQGKAZ2xkcyMvrtrA3L97sKz9p08cxfzLTqKhwV+cpMHAAM/YRa3T6dhR3jqdr769kf9Yuor1WzoYP7K5xpVJ6g8GeMbOnD2ZM2dPLmvfHz/6Kl/7xeLiwhMGuDQYGOBDxPDGwl0sq9ZtISq89tzUEEwc7ROjUr0Z4EPE6OGFpv797z1cle/73idO4PxjD6zKd0nqGwN8iDjnqMn8zYXHsqWjvDHznmzr2MFf3b6EN9duqlJlkvrKAB8iRjQ3cnEV7hffWgzwbdud1kaqN1el1z5pbiyMn2+tsCcvqXIGuPZJRNDUEGXfviipdhxC0T5ragz++bHXuGfJqnqX0q0Jo4bxwz86eeeFW2mw8n/h2mdfOHsWi99YV+8yurVi3WYeX7aGN9Zu4ogDXERag5sBrn32J2cdXu8SerTg2RV87qZFbN/hRVYNfo6Ba1DpnOfFANdQYIBrUGksrly0owprvUoDnQGuQaXRHriGEANcg4pDKBpKKgrwiJgbES9ExMsR8eVqFSX1VecQigGuoaDPAR4RjcA/AB8FjgYujYijq1WY1Bc7h1AcA9cQUMlthO8HXk4pvQIQET8BLgCWVKMwqS86A/wvb3mGUcMa61yNtMtff/wYTp65X1W/s5IAnwa8vtvr5cApXXeKiHnAPIAZM2ZUcDipd+87cBwXt05nw5aOepci7WFkc/U7FJUEeHerApT83ppSmg/MB2htbfX3WtXU6OFN/M2Fx9W7DKlfVHIRczmw+/yk04E3KytHklSuSgL8CWBWRBwSEcOAS4DbqlOWJKk3fR5CSSl1RMTngbuARuD6lNJzVatMkrRXFU1mlVK6A7ijSrVIkvaBT2JKUqYMcEnKlAEuSZkywCUpU5H6cc6IiG
gHXu3jxycBb1WxnHryXAauwXQ+nsvA1JdzOTil1NJ1Y78GeCUioi2l1FrvOqrBcxm4BtP5eC4DUzXPxSEUScqUAS5JmcopwOfXu4Aq8lwGrsF0Pp7LwFS1c8lmDFyStKeceuCSpN0Y4JKUqQEX4L0tlBwFf198/5mIOLEedZajjHM5MyLWRcRTxT9fr0ed5YiI6yNidUQs7uH9nNqlt3PJqV0Oioj7ImJpRDwXEX/azT5ZtE2Z55JF20TEiIh4PCKeLp7L/+pmn8rbJaU0YP5QmJb2t8ChwDDgaeDoLvucByygsCLQqcBj9a67gnM5E7i93rWWeT5nACcCi3t4P4t2KfNccmqXqcCJxZ/HAi9m/P+Zcs4li7Yp/luPKf7cDDwGnFrtdhloPfCdCyWnlLYCnQsl7+4C4Eep4FFgQkRM7e9Cy1DOuWQjpfQgsGYvu+TSLuWcSzZSSitSSouKP68HllJYr3Z3WbRNmeeSheK/9Ybiy+bin653jFTcLgMtwLtbKLlrA5azz0BQbp0fKP6atSAi3tc/pdVELu1SruzaJSJmAidQ6O3tLru22cu5QCZtExGNEfEUsBq4J6VU9XapaEGHGihnoeSyFlMeAMqpcxGFOQ42RMR5wC+AWbUurEZyaZdyZNcuETEG+DlwZUrp3a5vd/ORAds2vZxLNm2TUtoOHB8RE4B/jYg5KaXdr7tU3C4DrQdezkLJuSym3GudKaV3O3/NSoXVjZojYlL/lVhVubRLr3Jrl4hophB4N6WUbu1ml2zaprdzya1tAFJKa4H7gbld3qq4XQZagJezUPJtwB8Wr+CeCqxLKa3o70LL0Ou5RMSUiIjiz++n0B5v93ul1ZFLu/Qqp3Yp1vkDYGlK6Zoedsuibco5l1zaJiJaij1vImIk8GHg+S67VdwuA2oIJfWwUHJEXFF8/zoKa3CeB7wMbAT+uF717k2Z53Ih8LmI6AA2AZek4uXpgSYibqZwB8CkiFgOXE3hwkxW7QJlnUs27QKcDlwGPFscbwX4CjADsmubcs4ll7aZCtwYEY0U/iPz05TS7dXOMh+ll6RMDbQhFElSmQxwScqUAS5JmTLAJSlTBrgk1Uj0MnFaN/tfHBFLihNg/XOv+3sXiiTVRkScAWygMOfJnF72nQX8FDg7pfRORExOKa3e22fsgUtSjXQ3cVpEHBYRd0bEwoh4KCKOLL71WeAfUkrvFD+71/AGA1yS+tt84AsppZOAvwCuLW4/AjgiIn4dEY9GRNdH70sMqCcxJWkwK07UdRpwS3FGAIDhxb+bKEzMdSaFeVEeKk6Atban7zPAJan/NABrU0rHd/PecuDRlNI2YFlEvEAh0J/Y25dJkvpBcXrcZRFxEexcVu244tu/AM4qbp9EYUjllb19nwEuSTVSnDjtEWB2RCyPiMuBTwKXR8TTwHPsWqnrLuDtiFgC3Af8ZUpprzMtehuhJGXKHrgkZcoAl6RMGeCSlCkDXJIyZYBLUqYMcEnKlAEuSZn6/0eQrxlOiW/HAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "import re\n", + "from math import log\n", + "\n", + "freqs = []\n", + "\n", + "with open('freqs.txt', 'r') as fh:\n", + " for line in fh:\n", + " m = re.match(r'\\s*(\\d+)', line)\n", + " if m:\n", + " freqs.append(log(float(m.group(1))))\n", + "\n", + "plt.plot(range(len(freqs)), freqs)\n", + "fname = 'word-distribution.png'\n", + "plt.savefig(fname)\n", + "fname" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[[file:# Out[25]:\n", + "\n", + " 'word-distribution.png'\n", + "\n", + "![img](./obipy-resources/c0TrCn.png)]]\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Lematyzacja\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Lematyzacja wydaje si\u0119 dobrym pomys\u0142em, zw\u0142aszcza dla j\u0119zyk\u00f3w dla bogatej fleksji:\n", + "\n", + "- znacznie redukujemy s\u0142ownik,\n", + "- formy fleksyjne tego samego wyrazu s\u0105 traktowane tak samo (co wydaje si\u0119 s\u0142uszne).\n", + "\n", + "W praktyce wsp\u00f3\u0142cze\u015bnie **nie** stosuje si\u0119 lematyzacji (w po\u0142\u0105czeniu z\n", + "metodami opartymi na sieciach neuronowych):\n", + "\n", + "- lematyzacja wymaga wiedzy j\u0119zykowej (regu\u0142 lub s\u0142ownika),\n", + " wytworzenie takiej wiedzy mo\u017ce by\u0107 kosztowne, obecnie preferowane\n", + " s\u0105 metody niezale\u017cne od j\u0119zyka;\n", + "- tracimy pewn\u0105 informacj\u0119 niesion\u0105 przez form\u0119 fleksyjn\u0105 (co w szczeg\u00f3lnych\n", + " przypadkach mo\u017ce by\u0107 niefortunne, np. *aspiracja* i *aspiracje*);\n", + "- lematyzacja nie jest trywialnym problemem ze wzgl\u0119du na niejednoznaczno\u015bci\n", + " (*Lekarzu, lecz si\u0119 sam*);\n", + "- niekt\u00f3re niejednoznaczno\u015bci s\u0105 seryjne, wyb\u00f3r lematu mo\u017ce by\u0107 arbitralny,\n", + " np. 
czy *posiadanie*, *gotowanie*, *skakanie* to rzeczowniki czy czasowniki?\n", + " a *urz\u0105dzenie*, *mieszkanie*?\n", + "- zazwyczaj sieci neuronowe (czy nawet prostsze modele typu Word2vec)\n", + " s\u0105 w stanie nauczy\u0107 si\u0119 rekonstruowania zale\u017cno\u015bci mi\u0119dzy formami fleksyjnymi\n", + " (i wi\u0119cej: b\u0142\u0119dnych form, b\u0142\u0119d\u00f3w ortograficznych, form archaicznych itd.)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Zej\u015bcie na poziom znak\u00f3w\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Skoro s\u0142ownik wyraz\u00f3w jest zbyt du\u017cy, to mo\u017ce zej\u015b\u0107 na poziom znak\u00f3w?\n", + "\n", + "- pojedynczy znak alfabetu wprawdzie nic nie znaczy (co znaczy *h*?)\n", + "\n", + "- \u2026 ale rozmiar wej\u015bcia przy kodowaniu gor\u0105c\u0105 jedynk\u0105\n", + " dramatycznie si\u0119 zmniejsza\n", + "\n", + "- mo\u017ce dzia\u0142a\u0107, je\u015bli doda\u0107 wielowarstwow\u0105 sie\u0107\n", + " neuronow\u0105\n", + "\n", + "- \u2026 ale mo\u017ce by\u0107 bardzo kosztowne obliczeniowo\n", + "\n", + "A mo\u017ce co\u015b po\u015bredniego mi\u0119dzy znakami a wyrazami?\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BPE\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ani znaki, ani wyrazy \u2014 co\u015b pomi\u0119dzy: jednostki podwyrazowe (*subword\n", + "units*). Mogliby\u015bmy np. dzieli\u0107 wyraz *superkomputera* na dwie\n", + "jednostki *super/+/komputera*, a mo\u017ce nawet trzy: *super/+/komputer/+/a*?\n", + "\n", + "Najpopularniejszy algorytm podzia\u0142u na jednostki podwyrazowe to BPE\n", + "(*byte-pair encoding*), zainspirowany algorytmami kompresji danych.\n", + "Lista jednostek jest automatycznie indukowana na podstawie tekstu (nie\n", + "potrzeba \u017cadnej wiedzy o j\u0119zyku!). Ich liczba musi by\u0107 natomiast z g\u00f3ry\n", + "okre\u015blona.\n", + "\n", + "W kroku pocz\u0105tkowym zaznaczamy ko\u0144ce wyraz\u00f3w (token\u00f3w), robimy to po\n", + "to, \u017ceby jednostki podwyrazowe nie przekracza\u0142y granic wyraz\u00f3w.\n", + "\n", + "Nast\u0119pnie wykonujemy tyle krok\u00f3w iteracji, ile wynosi rozmiar zadanego\n", + "s\u0142ownika. 
W ka\u017cdym kroku szukamy najcz\u0119stszego bigramu, od tego\n", + "momentu traktujemy go jako ca\u0142ostk\u0119 (wk\u0142adamy go do \u201epude\u0142ka\u201d).\n", + "\n", + "![img](./bpe.png)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Implementacja w Pythonie\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['e$', 'to', 'to$', 'be$', 't$', 'th', 'or', 'or$', 'no', 'not$']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from collections import Counter\n", + "\n", + "def replace_bigram(l, b, r):\n", + " i = 0\n", + " while i < len(l) - 1:\n", + " if (l[i], l[i+1]) == b:\n", + " l[i:i+2] = [r]\n", + " i += 1\n", + " return l\n", + "\n", + "def learn_bpe_vocab(d, max_vocab_size):\n", + " d = list(d.replace(' ', '$') + '$')\n", + "\n", + " vocab = []\n", + "\n", + " for ix in range(0, max_vocab_size):\n", + " bigrams = [(d[i], d[i+1]) for i in range(0, len(d) - 1) if d[i][-1] != '$']\n", + " selected_bigram = Counter(bigrams).most_common(1)[0][0]\n", + "\n", + " new_subword = selected_bigram[0] + selected_bigram[1]\n", + " d = replace_bigram(d, selected_bigram, new_subword)\n", + "\n", + " vocab.append(new_subword)\n", + "\n", + " return vocab\n", + "\n", + "vocab1 = learn_bpe_vocab('to be or not to be that is the question', 10)\n", + "vocab1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "S\u0142ownik jednostek podwyrazowych mo\u017cemy zastosowa\u0107 do dowolnego tekstu, np. do tekstu,\n", + "na kt\u00f3rym s\u0142ownik by\u0142 wyuczony:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'to$ be$ or$ not$ to$ be$ th a t$ i s $ th e$ q u e s t i o n $'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def apply_bpe_vocab(vocab, d):\n", + " d = list(d.replace(' ', '$') + '$')\n", + " vocab_set = set(vocab)\n", + "\n", + " modified = True\n", + " while modified:\n", + " ix = 0\n", + " modified = False\n", + " while ix < len(d) - 1:\n", + " bigram = d[ix] + d[ix+1]\n", + " if bigram in vocab_set:\n", + " d[ix:ix+2] = [bigram]\n", + " modified = True\n", + " else:\n", + " ix += 1\n", + "\n", + " return d\n", + "\n", + "' '.join(apply_bpe_vocab(vocab1, 'to be or not to be that is the question'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Zauwa\u017cmy, \u017ce opr\u00f3cz jednostek podwyrazowych zosta\u0142y izolowane litery,\n", + "zazwyczaj dodajemy je do s\u0142ownika. (I zazwyczaj, s\u0142ownik jest troch\u0119\n", + "wi\u0119kszy ni\u017c warto\u015b\u0107 podana jako parametr przy uczeniu BPE \u2014 jest\n", + "wi\u0119kszy o znaki i specjalne tokeny typu `UNK`, `BOS`, `EOS`, `PAD`.)\n", + "\n", + "**Pytanie**: Jaki problem mo\u017ce pojawi\u0107 przy zastosowaniu BPE dla tekstu,\n", + "gdzie pojawiaj\u0105 si\u0119 chi\u0144skie znaki? 
Jak mo\u017cna sobie z nim poradzi\u0107?\n", + "\n", + "S\u0142ownik jednostek podwyrazowych mo\u017cna stosowa\u0107 dla dowolnego tekstu:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'to m $ w i l l $ be$ th e$ b e s t$'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "' '.join(apply_bpe_vocab(vocab1, 'tom will be the best'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Jak mo\u017cna zauwa\u017cy\u0107 algorytm BPE daje dwa rodzaje jednostek podwyrazowych:\n", + "\n", + "- jednostki, kt\u00f3re mog\u0105 doklejane na pocz\u0105tku wyrazu;\n", + "- jednostki, kt\u00f3re stanowi\u0105 koniec wyrazu, w szczeg\u00f3lno\u015bci s\u0105 ca\u0142ym wyrazem.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Gotowa implementacja\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Po raz pierwszy BPE u\u017cyto do neuronowego t\u0142umaczenia maszynowego.\n", + "U\u017cyjmy modu\u0142u autorstwa Rica Sennricha ([https://github.com/rsennrich/subword-nmt](https://github.com/rsennrich/subword-nmt)).\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install subword-nmt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Wyindukujmy s\u0142ownik dla zbioru ucz\u0105cego zadania identyfikacji p\u0142ci\n", + "autora tekstu:\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "! xzcat petite-difference-challenge2/train/in.tsv.xz | perl -C -ne 'print \"$&\\n\" while/\\p{L}+/g;' | python -m subword_nmt.learn_bpe -s 50000 -v > bpe_vocab.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Procedura trwa kilka minut, trzeba uzbroi\u0107 si\u0119 w cierpliwo\u015b\u0107 (ale wypisywanie bigram\u00f3w przyspieszy!).\n", + "\n", + " pair 0: n i -> ni (frequency 17625075)\n", + " pair 1: i e -> ie (frequency 11471590)\n", + " pair 2: c z -> cz (frequency 9143490)\n", + " pair 3: ni e -> nie (frequency 7901783)\n", + " pair 4: p o -> po (frequency 7790826)\n", + " pair 5: r z -> rz (frequency 7542046)\n", + " pair 6: s t -> st (frequency 7269069)\n", + " pair 7: e m -> em (frequency 7207280)\n", + " pair 8: d z -> dz (frequency 6860931)\n", + " pair 9: s z -> sz (frequency 6609907)\n", + " pair 10: r a -> ra (frequency 6601618)\n", + " pair 11: o w -> ow (frequency 6395963)\n", + " pair 12: i e -> ie (frequency 5906869)\n", + " pair 13: n a -> na (frequency 5300380)\n", + " pair 14: r o -> ro (frequency 5181363)\n", + " pair 15: n a -> na (frequency 5125807)\n", + " pair 16: a \u0142 -> a\u0142 (frequency 4786696)\n", + " pair 17: j e -> je (frequency 4599579)\n", + " pair 18: s i -> si (frequency 4300984)\n", + " pair 19: a l -> al (frequency 4276823)\n", + " pair 20: t e -> te (frequency 4033344)\n", + " pair 21: w i -> wi (frequency 3939063)\n", + " pair 22: c h -> ch (frequency 3919410)\n", + " pair 23: c h -> ch (frequency 3661410)\n", + " pair 24: k o -> ko (frequency 3629840)\n", + " pair 25: z a -> za (frequency 3625424)\n", + " pair 26: t a -> ta (frequency 3570094)\n", + " pair 27: p rz -> prz (frequency 3494551)\n", + " pair 28: g o -> go (frequency 3279997)\n", + " pair 29: a r -> ar (frequency 3081492)\n", + " pair 30: si \u0119 -> si\u0119 
(frequency 2973681)\n", + " ...\n", + " pair 49970: brz mieniu -> brzmieniu (frequency 483)\n", + " pair 49971: bie\u017c\u0105 cych -> bie\u017c\u0105cych (frequency 483)\n", + " pair 49972: biegu nk\u0119 -> biegunk\u0119 (frequency 483)\n", + " pair 49973: ban kowo\u015bci -> bankowo\u015bci (frequency 483)\n", + " pair 49974: ba ku -> baku (frequency 483)\n", + " pair 49975: ba cznie -> bacznie (frequency 483)\n", + " pair 49976: Przypad kowo -> Przypadkowo (frequency 483)\n", + " pair 49977: MA \u0141 -> MA\u0141 (frequency 483)\n", + " pair 49978: Lep pera -> Leppera (frequency 483)\n", + " pair 49979: Ko za -> Koza (frequency 483)\n", + " pair 49980: Jak by\u015b -> Jakby\u015b (frequency 483)\n", + " pair 49981: Geni alne -> Genialne (frequency 483)\n", + " pair 49982: \u017be nada -> \u017benada (frequency 482)\n", + " pair 49983: \u0144 czykiem -> \u0144czykiem (frequency 482)\n", + " pair 49984: zwie \u0144 -> zwie\u0144 (frequency 482)\n", + " pair 49985: zost a\u0142a\u015b -> zosta\u0142a\u015b (frequency 482)\n", + " pair 49986: zni szczona -> zniszczona (frequency 482)\n", + " pair 49987: ze stawi -> zestawi (frequency 482)\n", + " pair 49988: za s\u00f3b -> zas\u00f3b (frequency 482)\n", + " pair 49989: w\u0119d r\u00f3wk\u0119 -> w\u0119dr\u00f3wk\u0119 (frequency 482)\n", + " pair 49990: wysko czy\u0142a -> wyskoczy\u0142a (frequency 482)\n", + " pair 49991: wyle czenia -> wyleczenia (frequency 482)\n", + " pair 49992: wychowaw cze -> wychowawcze (frequency 482)\n", + " pair 49993: w t -> wt (frequency 482)\n", + " pair 49994: un da -> unda (frequency 482)\n", + " pair 49995: udzie la\u0142em -> udziela\u0142em (frequency 482)\n", + " pair 49996: t\u0119 czy -> t\u0119czy (frequency 482)\n", + " pair 49997: tro sce -> trosce (frequency 482)\n", + " pair 49998: s\u0142usz no\u015bci -> s\u0142uszno\u015bci (frequency 482)\n", + " pair 49999: su me -> sume (frequency 482\n", + "\n", + "Zastosujmy teraz wyindukowany s\u0142ownik BPE dla jakiego\u015b rzeczywistego tekstu.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cier@@ pia\u0142em na straszne la@@ gi kilkana\u015bcie sekund lub d\u0142u\u017cej czarnego ekranu przy pr\u00f3bie prze\u0142\u0105@@ czenia si\u0119 uruchomienia prawie ka\u017cdej aplikacji Dodatkowo telefon mi si\u0119 wy\u0142\u0105@@ cza\u0142 czasem bez powodu sam z siebie albo rese@@ towa\u0142 Ostatnio nawet przegl\u0105darka zacz\u0119\u0142a si\u0119 cz\u0119sto zawie@@ sza\u0107 i Android proponowa\u0142 wymu@@ szone zamkni\u0119cie Do tego te problemy z po\u0142\u0105czeniem do komputera przez USB " + ] + } + ], + "source": [ + "! echo 'Cierpia\u0142em na straszne lagi \u2013 kilkana\u015bcie sekund lub d\u0142u\u017cej czarnego ekranu przy pr\u00f3bie prze\u0142\u0105czenia si\u0119 / uruchomienia prawie ka\u017cdej aplikacji. Dodatkowo telefon mi si\u0119 wy\u0142\u0105cza\u0142 czasem bez powodu \u2013 sam z siebie, albo resetowa\u0142. Ostatnio nawet przegl\u0105darka zacz\u0119\u0142a si\u0119 cz\u0119sto zawiesza\u0107 i Android proponowa\u0142 wymuszone zamkni\u0119cie. Do tego te problemy z po\u0142\u0105czeniem do komputera przez USB.' 
| perl -C -ne 'print \"$& \" while/\\p{L}+/g;' | python -m subword_nmt.apply_bpe -c bpe_vocab.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Ta konkretna implementacja zaznacza za pomoc\u0105 sekwencji ~@@ ~ koniec jednostki podwyrazowej.\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + }, + "org": null, + "author": "Filip Grali\u0144ski", + "email": "filipg@amu.edu.pl", + "lang": "pl", + "subtitle": "12.Kodowanie BPE[wyk\u0142ad]", + "title": "Ekstrakcja informacji", + "year": "2021" }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAEFCAYAAAD69rxNAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAXf0lEQVR4nO3deZSdVZnv8e9TQ+YRUiEhIYQhBDDMhSB4kUFZkaalrwtYoNJ2yzUXb+ttbg8qehVv32UvV+ui2742l5tWBG0aW5S2kSYMjYzKVAlTSJgkAoEMBSEhIWMl+/5xTmWoU5U6qXNOndpV389aWanznvec93mz9ceu/b7v3pFSQpKUn4Z6FyBJ6hsDXJIyZYBLUqYMcEnKlAEuSZlq6s+DTZo0Kc2cObM/DylJ2Vu4cOFbKaWWrtv7NcBnzpxJW1tbfx5SkrIXEa92t90hFEnKlAEuSZkywCUpUwa4JGXKAJekTBngkpQpA1ySMpVFgN+7dBXX3v9yvcuQpAEliwC//4V2vv/QsnqXIUkDShYBDuDCE5K0pywCPKLeFUjSwJNFgAPY/5akPWUR4HbAJalUFgEuSSqVTYB7DVOS9pRFgIdXMSWpRBYBDt5GKEldZRPgkqQ9ZRPg9r8laU9ZBLhD4JJUKosAB+yCS1IXvQZ4RFwfEasjYnE37/1FRKSImFSb8orH8VEeSSpRTg/8BmBu140RcRDwEeC1KtfULTvgkrSnXgM8pfQgsKabt/4W+CL9kK2OgUtSqT6NgUfEx4A3UkpPl7HvvIhoi4i29vb2vhxOktSNfQ7wiBgFfBX4ejn7p5Tmp5RaU0qtLS0t+3q43b+nz5+VpMGoLz3ww4BDgKcj4nfAdGBRREypZmG7cwRFkko17esHUkrPApM7XxdDvDWl9FYV6yo9bi2/XJIyVM5thDcDjwCzI2J5RFxe+7K61tDfR5Skga/XHnhK6dJe3p9ZtWr2epz+OIok5SOLJzGdTlaSSmUR4ADJUXBJ2kMWAW7/W5JKZRHg4Bi4JHWVR4DbBZekEnkEON4HLkldZRHgTicrSaWyCHBJUql8AtwxFEnaQxYB7nM8klQqiwAHH+SRpK6yCHA74JJUKosABx/kkaSusghwx8AlqVQWAQ7ehCJJXWUR4D7II0mlsghwcFFjSeqqnCXVro+I1RGxeLdt346I5yPimYj414iYUMsiHQOXpFLl9MBvAOZ22XYPMCeldCzwInBVleuSJPWi1wBPKT0IrOmy7e6UUkfx5aPA9BrUtmcdtT6AJGWmGmPgnwEW9PRmRMyLiLaIaGtvb+/TARxBkaRSFQV4RHwV6ABu6mmflNL8lFJrSqm1paWlz8fyGqYk7amprx+MiE8D5wPnpFrfIuJVTEkq0acAj4i5wJeAD6WUNla3JElSOcq5jfBm4BFgdkQsj4jLge8BY4F7IuKpiLiulkXa/5akUr32wFNKl3az+Qc1qKVXKSXC4RRJAjJ5EtPMlqRSWQR4J+9EkaRdsghwJ7OSpFJZBLgkqVRWAe4IiiTtkkWAexFTkkplEeCdnBNcknbJIsDtgEtSqSwCvJP9b0naJYsAdwxckkplEeCdHAKXpF2yCHDnP5GkUlkEeKfkKLgk7ZRVgEuSdskqwB0Dl6Rdsghwh8AlqVQWAS5JKlXOkmrXR8TqiFi827b9IuKeiHip+PfE2pYpSeqqnB74DcDcLtu+DNybUpoF3Ft8XTPOBy5JpXoN8JTSg8CaLpsvAG4s/nwj8AfVLaunWvrjKJKUh76OgR+QUloBUPx7ck87RsS8iGiLiLb29vY+HcyLmJJUquYXMVNK81NKrSml1paWlsq+ywd5JGmnvgb4qoiYClD8e3X1SiplB1ySSvU1wG8DPl38+dPAv1WnnL1zDFySdinnNsKbgUeA2RGxPCIuB74FfCQiXgI+UnxdM46BS1Kppt52SCld2sNb51S5ll7ZAZekXbJ4EtP7wCWpVBYBLkkqlUWAd46Bb9/hIIokdcoiwJsbC2V2bN9R50okaeDIK8DtgUvSTlkEeFNjYQxla4c9cEnqlEWANxcD3B64JO2SRYA3NTgGLkldZRHgnWPgWw1wSdopkwAvDqFsdwhFkjplEeBNO+9CsQcuSZ2yCPDmhkIPfJs9cEnaKY8AbyqUuc0xcEnaKYsAb2pwDFySusoiwDvvQrEHLkm7ZBHgTT7II0klsghwe+CSVKqiAI+I/xERz0XE4oi4OSJGVKuw3Q3rfJDHuVAkaac+B3hETAP+O9CaUpoDNAKXVKuw3Y1obgRgswEuSTtVOoTSBIyMiCZgFPBm5SWVGjmsGOBbt9fi6yUpS30O8JTSG8B3gNeAFcC6lNLdXfeLiHkR0RYRbe3t7X061shiD3yjAS5JO1UyhDIRuAA4BDgQGB0Rn+q6X0ppfkqpNaXU2tLS0qdjNTYEw5oa2Lito6/lStKgU8kQyoeBZSml9pTSNuBW4LTqlFVq9LBGNm6xBy5JnSoJ8NeAUyNiVEQEcA6wtDpllRo9vIkNW+yBS1KnSsbAHwN+BiwCni1+1/wq1VVizPAm1m82wCWpU1MlH04pXQ1cXaVa9mrsiCY2bNnWH4eSpCxk8SQmwLgRzfbAJWk3+QT4yGbWbbIHLkmdsgnw8SObWbfRAJekTtkE+MRRw
1i/pcP5UCSpKJsA3290MwBrN22tcyWSNDBkE+ATRw8DYM17BrgkQUYB3jJmOADt67fUuRJJGhiyCfDJ4wpTja9+1wCXJMgowA8YV+iBr3x3c50rkaSBIZsAHzWsiXEjmli5zgCXJMgowAGmTRzFm2s31bsMSRoQsgrw6RNH8vo7G+tdhiQNCFkF+Iz9RvHamo2klOpdiiTVXVYBPnP/UWzetoPV3kooSZkF+KTRALzS/l6dK5Gk+ssqwA9rGQPAy6vX17kSSaq/rAJ86vgRjB3exIurNtS7FEmqu4oCPCImRMTPIuL5iFgaER+oVmE9HI/ZU8aydMW7tTyMJGWh0h74d4E7U0pHAsdRw0WNOx194DiWrniXHTu8E0XS0NbnAI+IccAZwA8AUkpbU0prq1RXj+ZMG897W7ez7G0vZEoa2irpgR8KtAM/jIgnI+L7ETG6604RMS8i2iKirb29vYLDFRw7fTwAT7++tuLvkqScVRLgTcCJwP9NKZ0AvAd8uetOKaX5KaXWlFJrS0tLBYcrmDV5LGOGN7HotXcq/i5JylklAb4cWJ5Seqz4+mcUAr2mGhuCE2ZM4IllBrikoa3PAZ5SWgm8HhGzi5vOAZZUpapenHro/rywaj1vbfCJTElDV6V3oXwBuCkingGOB/664orK8MHDJwHw65ff6o/DSdKA1FTJh1NKTwGt1SmlfHOmjWfCqGYeeLGdC46f1t+Hl6QBIasnMTs1NgRnHtHCfc+vpmP7jnqXI0l1kWWAA8ydM4V3Nm7jsWVr6l2KJNVFtgH+oSMmM3pYI7948o16lyJJdZFtgI8c1sjcOVO5c/FKNm/bXu9yJKnfZRvgABccfyDrt3Rw79LV9S5Fkvpd1gF+2mH7M2XcCG5Z+Hq9S5Gkfpd1gDc1NnBR63QeeLGdZW85uZWkoSXrAAe47NSDaWoIbvzN7+pdiiT1q+wDfPK4Efz+sQfyL0+8zpr3tta7HEnqN9kHOMBnzziUTdu28/V/W1zvUiSp3wyKAD9q6jj+06xJ3P7MCt5Yu6ne5UhSvxgUAQ7wJ2cdDsCVP3myzpVIUv8YNAF+6qH7M23CSJ743Ts89+a6epcjSTU3aAIc4HufOAGAy29oq3MlklR7gyrAT5gxkfcdOI6V727m5wuX17scSaqpQRXgAD/8o5MB+PNbnmabU81KGsQGXYBPHjeCz5x+CADfufuFOlcjSbVTcYBHRGNEPBkRt1ejoGr42vlHcdbsFv7fA69w3wtOdCVpcKpGD/xPgaVV+J6qiQiuufh4xg5v4o9/+ISLH0salCoK8IiYDvwe8P3qlFM9E0cP44sfPRKAs75zPymlOlckSdVVaQ/874AvAj1eLYyIeRHRFhFt7e3tFR5u31x26sGcOGMC6zd38IfXP96vx5akWutzgEfE+cDqlNLCve2XUpqfUmpNKbW2tLT09XB9dssVpwHw0Etv8a0Fz/f78SWpVirpgZ8OfCwifgf8BDg7Iv6pKlVVUWND8PhXzwHgugd+y0+fcPEHSYNDnwM8pXRVSml6SmkmcAnwq5TSp6pWWRVNHjuCX37+gwB88efPsODZFXWuSJIqN+juA+/JMdPH73zI53M3LeLu51bWuSJJqkxVAjyldH9K6fxqfFctnXXkZK795IkAzPvxQu6wJy4pY0OmB97pvGOm8veXFia9+m83LeLWRc6ZIilPQy7AAT523IFc96mTAPiznz7NdQ/8ts4VSdK+G5IBDjB3zhR+Mu9UAL614Hn+7F+e8mEfSVkZsgEOhUUgfvXnHwLg1iff4JxrHmBLx/Y6VyVJ5RnSAQ5waMsYnvzaR2hsCF5pf4/Z//NOXn37vXqXJUm9GvIBDoV5U17433P54OGTAPjQt+/nhl8vq3NVkrR3BnhRU2MD//RfTuGb/3kOAN/45RKu+PFCtu9wXFzSwGSAd/HJUw7mts+fDsCdz63ksK/cwaLX3qlzVZJUygDvxrHTJ/DMN86l9eCJAHz82t/wV79cQodLtEkaQAzwHowb0cwtV3yAay4+DoDrf72M1m/+B4vfWFfnyiSpwADfi4jg4ydO55lvnMt5x0xh7cZtnP9/Hua//riNDVs66l2epCHOAC/DuBHNXPvJk/j2hccCcNdzq5hz9V0seHaFQS6pbgzwfXBR60E8/KWz+NyZhwGFWQ1P/9avWL1+sw8ASep30Z+Pj7e2tqa2trZ+O14t3ff8au5espKbHy8sEDFtwkge/tJZRESdK5M02ETEwpRSa9ftTfUoZjA468jJvP+Q/Thm2gQeeqmdBYtXMufquxg/spnbvvBBJo0ZXu8SJQ1yBngFRg9v4hOnzOCcoyYzY/9RLH9nE//+zAo++6M2xo9s5tyjp/CJU2bUu0xJg5QBXgUHjBvBVR89inWbtrFu4zbWb97Gk6+t5aVVGxgzovBPfPTUsRw+eWydK5U0mPR5DDwiDgJ+BEwBdgDzU0rf3dtnBtMYeG+++e9L+MeHds2ncuSUsdx55Rl1rEhSrnoaA68kwKcCU1NKiyJiLLAQ+IOU0pKePjOUArxj+w5eXbORlOC7977EgmdXMOuAQg88gCs/PItz3zelvkVKykLVL2KmlFYAK4o/r4+IpcA0oMcAH0qaGhs4rGUMAJ/+wMFs2badzv9UPvhiO3c+t5KTio/qA4wc1sioYY5oSSpfVW4jjIiZwIPAnJTSu13emwfMA5gxY8ZJr776asXHy925f/sAL67asMe2kc2NPHLV2UwYNaxOVUkaqGp2G2FEjAF+DlzZNbwBUkrzgflQGEKp9HiDwbcvPI6nl6/d+Xrpine5+fHXeX7lemZNHrPHvqOHNzGiubGfK5SUg4oCPCKaKYT3TSmlW6tT0uB33EETOO6gCTtf/+a3b3Hz469zyfxHS/adMm4Ej1x1tg8ISSrR5wCPQqL8AFiaUrqmeiUNPSfP3I/vXHQcG7fuOa/Kwy+9xd1LVrGlY4e9cEklKumBnw5cBjwbEU8Vt30lpXRHxVUNMc2NDVx40vRu37t7ySouv/EJmhq6n7bm/GOnclHrQbUsT9IAVcldKA9TuCNONfL+Q/bj5JkT2bBlO1A6WdYrqzfw3pYOA1waorxvbQA7cso4brnitB7f/8wNT7B6/eZ+rEjSQGKAZ2xkcyMvrtrA3L97sKz9p08cxfzLTqKhwV+cpMHAAM/YRa3T6dhR3jqdr769kf9Yuor1WzoYP7K5xpVJ6g8GeMbOnD2ZM2dPLmvfHz/6Kl/7xeLiwhMGuDQYGOBDxPDGwl0sq9ZtISq89tzUEEwc7ROjUr0Z4EPE6OGFpv797z1cle/73idO4PxjD6zKd0nqGwN8iDjnqMn8zYXHsqWjvDHznmzr2MFf3b6EN9duqlJlkvrKAB8iRjQ3cnEV7hffWgzwbdud1kaqN1el1z5pbiyMn2+tsCcvqXIGuPZJRNDUEGXfviipdhxC0T5ragz++bHXuGfJqnqX0q0Jo4bxwz86eeeFW2mw8n/h2mdfOHsWi99YV+8yurVi3WYeX7aGN9Zu4ogDXERag5sBrn32J2cdXu8S
erTg2RV87qZFbN/hRVYNfo6Ba1DpnOfFANdQYIBrUGksrly0owprvUoDnQGuQaXRHriGEANcg4pDKBpKKgrwiJgbES9ExMsR8eVqFSX1VecQigGuoaDPAR4RjcA/AB8FjgYujYijq1WY1Bc7h1AcA9cQUMlthO8HXk4pvQIQET8BLgCWVKMwqS86A/wvb3mGUcMa61yNtMtff/wYTp65X1W/s5IAnwa8vtvr5cApXXeKiHnAPIAZM2ZUcDipd+87cBwXt05nw5aOepci7WFkc/U7FJUEeHerApT83ppSmg/MB2htbfX3WtXU6OFN/M2Fx9W7DKlfVHIRczmw+/yk04E3KytHklSuSgL8CWBWRBwSEcOAS4DbqlOWJKk3fR5CSSl1RMTngbuARuD6lNJzVatMkrRXFU1mlVK6A7ijSrVIkvaBT2JKUqYMcEnKlAEuSZkywCUpU5H6cc6IiGgHXu3jxycBb1WxnHryXAauwXQ+nsvA1JdzOTil1NJ1Y78GeCUioi2l1FrvOqrBcxm4BtP5eC4DUzXPxSEUScqUAS5JmcopwOfXu4Aq8lwGrsF0Pp7LwFS1c8lmDFyStKeceuCSpN0Y4JKUqQEX4L0tlBwFf198/5mIOLEedZajjHM5MyLWRcRTxT9fr0ed5YiI6yNidUQs7uH9nNqlt3PJqV0Oioj7ImJpRDwXEX/azT5ZtE2Z55JF20TEiIh4PCKeLp7L/+pmn8rbJaU0YP5QmJb2t8ChwDDgaeDoLvucByygsCLQqcBj9a67gnM5E7i93rWWeT5nACcCi3t4P4t2KfNccmqXqcCJxZ/HAi9m/P+Zcs4li7Yp/luPKf7cDDwGnFrtdhloPfCdCyWnlLYCnQsl7+4C4Eep4FFgQkRM7e9Cy1DOuWQjpfQgsGYvu+TSLuWcSzZSSitSSouKP68HllJYr3Z3WbRNmeeSheK/9Ybiy+bin653jFTcLgMtwLtbKLlrA5azz0BQbp0fKP6atSAi3tc/pdVELu1SruzaJSJmAidQ6O3tLru22cu5QCZtExGNEfEUsBq4J6VU9XapaEGHGihnoeSyFlMeAMqpcxGFOQ42RMR5wC+AWbUurEZyaZdyZNcuETEG+DlwZUrp3a5vd/ORAds2vZxLNm2TUtoOHB8RE4B/jYg5KaXdr7tU3C4DrQdezkLJuSym3GudKaV3O3/NSoXVjZojYlL/lVhVubRLr3Jrl4hophB4N6WUbu1ml2zaprdzya1tAFJKa4H7gbld3qq4XQZagJezUPJtwB8Wr+CeCqxLKa3o70LL0Ou5RMSUiIjiz++n0B5v93ul1ZFLu/Qqp3Yp1vkDYGlK6Zoedsuibco5l1zaJiJaij1vImIk8GHg+S67VdwuA2oIJfWwUHJEXFF8/zoKa3CeB7wMbAT+uF717k2Z53Ih8LmI6AA2AZek4uXpgSYibqZwB8CkiFgOXE3hwkxW7QJlnUs27QKcDlwGPFscbwX4CjADsmubcs4ll7aZCtwYEY0U/iPz05TS7dXOMh+ll6RMDbQhFElSmQxwScqUAS5JmTLAJSlTBrgk1Uj0MnFaN/tfHBFLihNg/XOv+3sXiiTVRkScAWygMOfJnF72nQX8FDg7pfRORExOKa3e22fsgUtSjXQ3cVpEHBYRd0bEwoh4KCKOLL71WeAfUkrvFD+71/AGA1yS+tt84AsppZOAvwCuLW4/AjgiIn4dEY9GRNdH70sMqCcxJWkwK07UdRpwS3FGAIDhxb+bKEzMdSaFeVEeKk6Atban7zPAJan/NABrU0rHd/PecuDRlNI2YFlEvEAh0J/Y25dJkvpBcXrcZRFxEexcVu244tu/AM4qbp9EYUjllb19nwEuSTVSnDjtEWB2RCyPiMuBTwKXR8TTwHPsWqnrLuDtiFgC3Af8ZUpprzMtehuhJGXKHrgkZcoAl6RMGeCSlCkDXJIyZYBLUqYMcEnKlAEuSZn6/0eQrxlOiW/HAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "%matplotlib inline\n", - "import matplotlib.pyplot as plt\n", - "import re\n", - "from math import log\n", - "\n", - "freqs = []\n", - "\n", - "with open('freqs.txt', 'r') as fh:\n", - " for line in fh:\n", - " m = re.match(r'\\s*(\\d+)', line)\n", - " if m:\n", - " freqs.append(log(float(m.group(1))))\n", - "\n", - "plt.plot(range(len(freqs)), freqs)\n", - "fname = 'word-distribution.png'\n", - "plt.savefig(fname)\n", - "fname" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[[file:# Out[25]:\n", - "\n", - " 'word-distribution.png'\n", - "\n", - "![img](./obipy-resources/c0TrCn.png)]]\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Lematyzacja\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Lematyzacja wydaje się dobrym pomysłem, zwłaszcza dla języków dla bogatej fleksji:\n", - "\n", - "- znacznie redukujemy słownik,\n", - "- formy fleksyjne tego samego wyrazu są traktowane tak samo (co wydaje się słuszne).\n", - "\n", - "W praktyce współcześnie **nie** stosuje się lematyzacji (w połączeniu z\n", - "metodami opartymi na sieciach neuronowych):\n", - "\n", - "- lematyzacja wymaga wiedzy językowej (reguł lub słownika),\n", - " wytworzenie takiej wiedzy może być kosztowne, obecnie preferowane\n", - " są metody niezależne od języka;\n", - "- tracimy pewną informację niesioną przez formę fleksyjną (co w szczególnych\n", - " przypadkach może być niefortunne, np. *aspiracja* i *aspiracje*);\n", - "- lematyzacja nie jest trywialnym problemem ze względu na niejednoznaczności\n", - " (*Lekarzu, lecz się sam*);\n", - "- niektóre niejednoznaczności są seryjne, wybór lematu może być arbitralny,\n", - " np. czy *posiadanie*, *gotowanie*, *skakanie* to rzeczowniki czy czasowniki?\n", - " a *urządzenie*, *mieszkanie*?\n", - "- zazwyczaj sieci neuronowe (czy nawet prostsze modele typu Word2vec)\n", - " są w stanie nauczyć się rekonstruowania zależności między formami fleksyjnymi\n", - " (i więcej: błędnych form, błędów ortograficznych, form archaicznych itd.)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Zejście na poziom znaków\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Skoro słownik wyrazów jest zbyt duży, to może zejść na poziom znaków?\n", - "\n", - "- pojedynczy znak alfabetu wprawdzie nic nie znaczy (co znaczy *h*?)\n", - "\n", - "- … ale rozmiar wejścia przy kodowaniu gorącą jedynką\n", - " dramatycznie się zmniejsza\n", - "\n", - "- może działać, jeśli dodać wielowarstwową sieć\n", - " neuronową\n", - "\n", - "- … ale może być bardzo kosztowne obliczeniowo\n", - "\n", - "A może coś pośredniego między znakami a wyrazami?\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### BPE\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Ani znaki, ani wyrazy — coś pomiędzy: jednostki podwyrazowe (*subword\n", - "units*). Moglibyśmy np. dzielić wyraz *superkomputera* na dwie\n", - "jednostki *super/+/komputera*, a może nawet trzy: *super/+/komputer/+/a*?\n", - "\n", - "Najpopularniejszy algorytm podziału na jednostki podwyrazowe to BPE\n", - "(*byte-pair encoding*), zainspirowany algorytmami kompresji danych.\n", - "Lista jednostek jest automatycznie indukowana na podstawie tekstu (nie\n", - "potrzeba żadnej wiedzy o języku!). 
Ich liczba musi być natomiast z góry\n", - "określona.\n", - "\n", - "W kroku początkowym zaznaczamy końce wyrazów (tokenów), robimy to po\n", - "to, żeby jednostki podwyrazowe nie przekraczały granic wyrazów.\n", - "\n", - "Następnie wykonujemy tyle kroków iteracji, ile wynosi rozmiar zadanego\n", - "słownika. W każdym kroku szukamy najczęstszego bigramu, od tego\n", - "momentu traktujemy go jako całostkę (wkładamy go do „pudełka”).\n", - "\n", - "![img](./bpe.png)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Implementacja w Pythonie\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['e$', 'to', 'to$', 'be$', 't$', 'th', 'or', 'or$', 'no', 'not$']" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from collections import Counter\n", - "\n", - "def replace_bigram(l, b, r):\n", - " i = 0\n", - " while i < len(l) - 1:\n", - " if (l[i], l[i+1]) == b:\n", - " l[i:i+2] = [r]\n", - " i += 1\n", - " return l\n", - "\n", - "def learn_bpe_vocab(d, max_vocab_size):\n", - " d = list(d.replace(' ', '$') + '$')\n", - "\n", - " vocab = []\n", - "\n", - " for ix in range(0, max_vocab_size):\n", - " bigrams = [(d[i], d[i+1]) for i in range(0, len(d) - 1) if d[i][-1] != '$']\n", - " selected_bigram = Counter(bigrams).most_common(1)[0][0]\n", - "\n", - " new_subword = selected_bigram[0] + selected_bigram[1]\n", - " d = replace_bigram(d, selected_bigram, new_subword)\n", - "\n", - " vocab.append(new_subword)\n", - "\n", - " return vocab\n", - "\n", - "vocab1 = learn_bpe_vocab('to be or not to be that is the question', 10)\n", - "vocab1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Słownik jednostek podwyrazowych możemy zastosować do dowolnego tekstu, np. do tekstu,\n", - "na którym słownik był wyuczony:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'to$ be$ or$ not$ to$ be$ th a t$ i s $ th e$ q u e s t i o n $'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "def apply_bpe_vocab(vocab, d):\n", - " d = list(d.replace(' ', '$') + '$')\n", - " vocab_set = set(vocab)\n", - "\n", - " modified = True\n", - " while modified:\n", - " ix = 0\n", - " modified = False\n", - " while ix < len(d) - 1:\n", - " bigram = d[ix] + d[ix+1]\n", - " if bigram in vocab_set:\n", - " d[ix:ix+2] = [bigram]\n", - " modified = True\n", - " else:\n", - " ix += 1\n", - "\n", - " return d\n", - "\n", - "' '.join(apply_bpe_vocab(vocab1, 'to be or not to be that is the question'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Zauważmy, że oprócz jednostek podwyrazowych zostały izolowane litery,\n", - "zazwyczaj dodajemy je do słownika. (I zazwyczaj, słownik jest trochę\n", - "większy niż wartość podana jako parametr przy uczeniu BPE — jest\n", - "większy o znaki i specjalne tokeny typu `UNK`, `BOS`, `EOS`, `PAD`.)\n", - "\n", - "**Pytanie**: Jaki problem może pojawić przy zastosowaniu BPE dla tekstu,\n", - "gdzie pojawiają się chińskie znaki? 
Jak można sobie z nim poradzić?\n", - "\n", - "Słownik jednostek podwyrazowych można stosować dla dowolnego tekstu:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'to m $ w i l l $ be$ th e$ b e s t$'" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "' '.join(apply_bpe_vocab(vocab1, 'tom will be the best'))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Jak można zauważyć algorytm BPE daje dwa rodzaje jednostek podwyrazowych:\n", - "\n", - "- jednostki, które mogą doklejane na początku wyrazu;\n", - "- jednostki, które stanowią koniec wyrazu, w szczególności są całym wyrazem.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Gotowa implementacja\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Po raz pierwszy BPE użyto do neuronowego tłumaczenia maszynowego.\n", - "Użyjmy modułu autorstwa Rica Sennricha ([https://github.com/rsennrich/subword-nmt](https://github.com/rsennrich/subword-nmt)).\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "! pip install subword-nmt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Wyindukujmy słownik dla zbioru uczącego zadania identyfikacji płci\n", - "autora tekstu:\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "! xzcat petite-difference-challenge2/train/in.tsv.xz | perl -C -ne 'print \"$&\\n\" while/\\p{L}+/g;' | python -m subword_nmt.learn_bpe -s 50000 -v > bpe_vocab.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Procedura trwa kilka minut, trzeba uzbroić się w cierpliwość (ale wypisywanie bigramów przyspieszy!).\n", - "\n", - " pair 0: n i -> ni (frequency 17625075)\n", - " pair 1: i e -> ie (frequency 11471590)\n", - " pair 2: c z -> cz (frequency 9143490)\n", - " pair 3: ni e -> nie (frequency 7901783)\n", - " pair 4: p o -> po (frequency 7790826)\n", - " pair 5: r z -> rz (frequency 7542046)\n", - " pair 6: s t -> st (frequency 7269069)\n", - " pair 7: e m -> em (frequency 7207280)\n", - " pair 8: d z -> dz (frequency 6860931)\n", - " pair 9: s z -> sz (frequency 6609907)\n", - " pair 10: r a -> ra (frequency 6601618)\n", - " pair 11: o w -> ow (frequency 6395963)\n", - " pair 12: i e -> ie (frequency 5906869)\n", - " pair 13: n a -> na (frequency 5300380)\n", - " pair 14: r o -> ro (frequency 5181363)\n", - " pair 15: n a -> na (frequency 5125807)\n", - " pair 16: a ł -> ał (frequency 4786696)\n", - " pair 17: j e -> je (frequency 4599579)\n", - " pair 18: s i -> si (frequency 4300984)\n", - " pair 19: a l -> al (frequency 4276823)\n", - " pair 20: t e -> te (frequency 4033344)\n", - " pair 21: w i -> wi (frequency 3939063)\n", - " pair 22: c h -> ch (frequency 3919410)\n", - " pair 23: c h -> ch (frequency 3661410)\n", - " pair 24: k o -> ko (frequency 3629840)\n", - " pair 25: z a -> za (frequency 3625424)\n", - " pair 26: t a -> ta (frequency 3570094)\n", - " pair 27: p rz -> prz (frequency 3494551)\n", - " pair 28: g o -> go (frequency 3279997)\n", - " pair 29: a r -> ar (frequency 3081492)\n", - " pair 30: si ę -> się (frequency 2973681)\n", - " ...\n", - " pair 49970: brz mieniu -> brzmieniu (frequency 483)\n", - " pair 49971: bieżą cych -> bieżących (frequency 483)\n", - " pair 
49972: biegu nkę -> biegunkę (frequency 483)\n", - " pair 49973: ban kowości -> bankowości (frequency 483)\n", - " pair 49974: ba ku -> baku (frequency 483)\n", - " pair 49975: ba cznie -> bacznie (frequency 483)\n", - " pair 49976: Przypad kowo -> Przypadkowo (frequency 483)\n", - " pair 49977: MA Ł -> MAŁ (frequency 483)\n", - " pair 49978: Lep pera -> Leppera (frequency 483)\n", - " pair 49979: Ko za -> Koza (frequency 483)\n", - " pair 49980: Jak byś -> Jakbyś (frequency 483)\n", - " pair 49981: Geni alne -> Genialne (frequency 483)\n", - " pair 49982: Że nada -> Żenada (frequency 482)\n", - " pair 49983: ń czykiem -> ńczykiem (frequency 482)\n", - " pair 49984: zwie ń -> zwień (frequency 482)\n", - " pair 49985: zost ałaś -> zostałaś (frequency 482)\n", - " pair 49986: zni szczona -> zniszczona (frequency 482)\n", - " pair 49987: ze stawi -> zestawi (frequency 482)\n", - " pair 49988: za sób -> zasób (frequency 482)\n", - " pair 49989: węd rówkę -> wędrówkę (frequency 482)\n", - " pair 49990: wysko czyła -> wyskoczyła (frequency 482)\n", - " pair 49991: wyle czenia -> wyleczenia (frequency 482)\n", - " pair 49992: wychowaw cze -> wychowawcze (frequency 482)\n", - " pair 49993: w t -> wt (frequency 482)\n", - " pair 49994: un da -> unda (frequency 482)\n", - " pair 49995: udzie lałem -> udzielałem (frequency 482)\n", - " pair 49996: tę czy -> tęczy (frequency 482)\n", - " pair 49997: tro sce -> trosce (frequency 482)\n", - " pair 49998: słusz ności -> słuszności (frequency 482)\n", - " pair 49999: su me -> sume (frequency 482\n", - "\n", - "Zastosujmy teraz wyindukowany słownik BPE dla jakiegoś rzeczywistego tekstu.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cier@@ piałem na straszne la@@ gi kilkanaście sekund lub dłużej czarnego ekranu przy próbie przełą@@ czenia się uruchomienia prawie każdej aplikacji Dodatkowo telefon mi się wyłą@@ czał czasem bez powodu sam z siebie albo rese@@ tował Ostatnio nawet przeglądarka zaczęła się często zawie@@ szać i Android proponował wymu@@ szone zamknięcie Do tego te problemy z połączeniem do komputera przez USB " - ] - } - ], - "source": [ - "! echo 'Cierpiałem na straszne lagi – kilkanaście sekund lub dłużej czarnego ekranu przy próbie przełączenia się / uruchomienia prawie każdej aplikacji. Dodatkowo telefon mi się wyłączał czasem bez powodu – sam z siebie, albo resetował. Ostatnio nawet przeglądarka zaczęła się często zawieszać i Android proponował wymuszone zamknięcie. Do tego te problemy z połączeniem do komputera przez USB.' 
| perl -C -ne 'print \"$& \" while/\\p{L}+/g;' | python -m subword_nmt.apply_bpe -c bpe_vocab.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Ta konkretna implementacja zaznacza za pomocą sekwencji ~@@ ~ koniec jednostki podwyrazowej.\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.2" - }, - "org": null - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/wyk/13_generative_approach.ipynb b/wyk/13_generative_approach.ipynb index 8234d7a..32dae7b 100644 --- a/wyk/13_generative_approach.ipynb +++ b/wyk/13_generative_approach.ipynb @@ -1,113 +1,135 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Ekstrakcja informacji a podejście generatywne\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Podejście generatywne\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Do tej pory zadanie ekstrakcji informacji traktowaliśmy jako zadanie etykietowania sekwencji, tzn. uczyliśmy system zaznaczać tokeny składające się na ekstrahowane informacje.\n", - "\n", - "![img](./ie-seqlab.png)\n", - "\n", - "Możliwe jest inne podeście, **generatywne**, w którym podchodzimy do problemu ekstrakcji informacji jak do swego rodzaju **tłumaczenia maszynowego** — „tłumaczymy” tekst (wraz z pytaniem lub etykietą) na informację.\n", - "\n", - "![img](./ie-gener.png)\n", - "\n", - "To podejście może się wydawać trudniejsze niż etykietowanie sekwencji, ale wystarczająco zaawansowanej architekturze sieci, jest wykonalne.\n", - "\n", - "Zalety:\n", - "\n", - "- informacja nie musi być dosłownie zapisana w tekście, ekstraktor może nauczyć się również normalizacji czy parafrazowania,\n", - "- nie wprowadzamy wielu kroków przetwarzania (gdzie błędy mogą się\n", - " namnażać), system działa na zasadzie *end-to-end*.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Atencja\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Pierwsze systemu neuronowego tłumaczenia maszynowego używały siecie LSTM. Dopiero jednak dodanie tzw. atencji (*attention*) umożliwiło duży przeskok jakościowy. 
Najpierw atencję dodano do sieci rekurencyjnych, później powstały sieci oparte *wyłącznie* na atencji — modele Transformer.\n", - "\n", - "Idea atencji polega na tym, że sieć może kierować selektywnie „snop” uwagi na wyrazy na wejściu lub do tej pory wygenerowane wyrazy.\n", - "\n", - "Mechanizm atencji korzysta z:\n", - "\n", - "- z poprzedniego stanu sieci $\\vec{s^{k-1}}$ (to jest „miejsce”, z którego „kierujemy” atencję),\n", - "- z wektora reprezentującego słowo $\\vec{v}(t_i)$ (to jest „miejsce”, na które kierujemy atencję), gdzie\n", - " $\\vec{v}(t_i)$ to reprezentacja wektorowa wyrazu $t_i$ (statyczny embedding lub reprezentacja wektorowa\n", - " z poprzedniej warstwy dla sieci wielowarstwowej),\n", - "\n", - "aby wytworzyć wektor kontekstu $\\vec{\\xi^k}$ (który z kolei będzie w jakiś sposób wnosił wkład do wyliczenia nowej wartości stanu $\\vec{s^k}$ lub wyjścia $y^k$.\n", - "\n", - "Najpierw wyliczymy skalarne wartości atencji, tzn. liczby, które będą sygnalizowały, jak bardzo wektor $\\vec{v}(t_i)$ „pasuje” do $\\vec{s^{k-1}}$, w najprostszej wersji można po prostu skorzystać z iloczynu skalarnego (o ile $n=m$),\n", - "\n", - "$$a(\\vec{s^{k-1}}, \\vec{v}(t_i)) = \\vec{s^{k-1}}\\vec{v}(t_i).$$\n", - "\n", - "**Pytanie**: co jeśli $n$ nie jest równe $m$, tzn. rozmiar embeddingu nie jest równy rozmiarowi wektora stanu?\n", - "\n", - "W przypadku sieci LSTM korzysta się częściej z bardziej skomplikowanego wzoru zawierającego dodatkowe wyuczalne wagi:\n", - "\n", - "$$a(\\vec{s^{k-1}}, \\vec{v}(t_i)) = \\vec{w_a}\\operatorname{tanh}(W_a\\vec{s^{k-1}} + U_a\\vec{v}(t_i))$$\n", - "\n", - "**Pytanie**: jakie rozmiary mają macierze $W_a$, $U_a$ i wektor $w_a$?\n", - "\n", - "Powtórzmy, że wartości $a$ są wartościami skalarnymi, natomiast nie są one znormalizowane (nie sumują się do jedynki), normalizujemy je używając schematu podobnego do softmaxa:\n", - "\n", - "$$\\alpha_{i} = \\frac{e^{a(\\vec{s^{k-1}}, \\vec{v}(t_i))}}{\\sum_j e^{a(\\vec{s^{k-1}}, \\vec{v}(t_j))}}$$\n", - "\n", - "Wektor kontekstu $\\vec{\\xi^k}$ będzie po prostu średnią ważoną wektorowych reprezentacji słów:\n", - "\n", - "$$\\vec{\\xi^k} = \\sum_i \\alpha_i\\vec{v}(t_i)$$\n", - "\n", - "**Pytanie**: zasadniczo atencja jest środkiem do celu (żeby sieć się sprawniej uczyła), czy można atencja sama w sobie może być do czegoś przydatna?\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.2" - }, - "org": null - }, - "nbformat": 4, - "nbformat_minor": 1 -} + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

13. Podej\u015bcie generatywne w ekstrakcji informacji [wyk\u0142ad]

\n", + "

Filip Grali\u0144ski (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Ekstrakcja informacji a podej\u015bcie generatywne\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Podej\u015bcie generatywne\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Do tej pory zadanie ekstrakcji informacji traktowali\u015bmy jako zadanie etykietowania sekwencji, tzn. uczyli\u015bmy system zaznacza\u0107 tokeny sk\u0142adaj\u0105ce si\u0119 na ekstrahowane informacje.\n", + "\n", + "![img](./ie-seqlab.png)\n", + "\n", + "Mo\u017cliwe jest inne pode\u015bcie, **generatywne**, w kt\u00f3rym podchodzimy do problemu ekstrakcji informacji jak do swego rodzaju **t\u0142umaczenia maszynowego** \u2014 \u201et\u0142umaczymy\u201d tekst (wraz z pytaniem lub etykiet\u0105) na informacj\u0119.\n", + "\n", + "![img](./ie-gener.png)\n", + "\n", + "To podej\u015bcie mo\u017ce si\u0119 wydawa\u0107 trudniejsze ni\u017c etykietowanie sekwencji, ale wystarczaj\u0105co zaawansowanej architekturze sieci, jest wykonalne.\n", + "\n", + "Zalety:\n", + "\n", + "- informacja nie musi by\u0107 dos\u0142ownie zapisana w tek\u015bcie, ekstraktor mo\u017ce nauczy\u0107 si\u0119 r\u00f3wnie\u017c normalizacji czy parafrazowania,\n", + "- nie wprowadzamy wielu krok\u00f3w przetwarzania (gdzie b\u0142\u0119dy mog\u0105 si\u0119\n", + " namna\u017ca\u0107), system dzia\u0142a na zasadzie *end-to-end*.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Atencja\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pierwsze systemu neuronowego t\u0142umaczenia maszynowego u\u017cywa\u0142y siecie LSTM. Dopiero jednak dodanie tzw. atencji (*attention*) umo\u017cliwi\u0142o du\u017cy przeskok jako\u015bciowy. Najpierw atencj\u0119 dodano do sieci rekurencyjnych, p\u00f3\u017aniej powsta\u0142y sieci oparte *wy\u0142\u0105cznie* na atencji \u2014 modele Transformer.\n", + "\n", + "Idea atencji polega na tym, \u017ce sie\u0107 mo\u017ce kierowa\u0107 selektywnie \u201esnop\u201d uwagi na wyrazy na wej\u015bciu lub do tej pory wygenerowane wyrazy.\n", + "\n", + "Mechanizm atencji korzysta z:\n", + "\n", + "- z poprzedniego stanu sieci $\\vec{s^{k-1}}$ (to jest \u201emiejsce\u201d, z kt\u00f3rego \u201ekierujemy\u201d atencj\u0119),\n", + "- z wektora reprezentuj\u0105cego s\u0142owo $\\vec{v}(t_i)$ (to jest \u201emiejsce\u201d, na kt\u00f3re kierujemy atencj\u0119), gdzie\n", + " $\\vec{v}(t_i)$ to reprezentacja wektorowa wyrazu $t_i$ (statyczny embedding lub reprezentacja wektorowa\n", + " z poprzedniej warstwy dla sieci wielowarstwowej),\n", + "\n", + "aby wytworzy\u0107 wektor kontekstu $\\vec{\\xi^k}$ (kt\u00f3ry z kolei b\u0119dzie w jaki\u015b spos\u00f3b wnosi\u0142 wk\u0142ad do wyliczenia nowej warto\u015bci stanu $\\vec{s^k}$ lub wyj\u015bcia $y^k$.\n", + "\n", + "Najpierw wyliczymy skalarne warto\u015bci atencji, tzn. liczby, kt\u00f3re b\u0119d\u0105 sygnalizowa\u0142y, jak bardzo wektor $\\vec{v}(t_i)$ \u201epasuje\u201d do $\\vec{s^{k-1}}$, w najprostszej wersji mo\u017cna po prostu skorzysta\u0107 z iloczynu skalarnego (o ile $n=m$),\n", + "\n", + "$$a(\\vec{s^{k-1}}, \\vec{v}(t_i)) = \\vec{s^{k-1}}\\vec{v}(t_i).$$\n", + "\n", + "**Pytanie**: co je\u015bli $n$ nie jest r\u00f3wne $m$, tzn. 
rozmiar embeddingu nie jest r\u00f3wny rozmiarowi wektora stanu?\n", + "\n", + "W przypadku sieci LSTM korzysta si\u0119 cz\u0119\u015bciej z bardziej skomplikowanego wzoru zawieraj\u0105cego dodatkowe wyuczalne wagi:\n", + "\n", + "$$a(\\vec{s^{k-1}}, \\vec{v}(t_i)) = \\vec{w_a}\\operatorname{tanh}(W_a\\vec{s^{k-1}} + U_a\\vec{v}(t_i))$$\n", + "\n", + "**Pytanie**: jakie rozmiary maj\u0105 macierze $W_a$, $U_a$ i wektor $w_a$?\n", + "\n", + "Powt\u00f3rzmy, \u017ce warto\u015bci $a$ s\u0105 warto\u015bciami skalarnymi, natomiast nie s\u0105 one znormalizowane (nie sumuj\u0105 si\u0119 do jedynki), normalizujemy je u\u017cywaj\u0105c schematu podobnego do softmaxa:\n", + "\n", + "$$\\alpha_{i} = \\frac{e^{a(\\vec{s^{k-1}}, \\vec{v}(t_i))}}{\\sum_j e^{a(\\vec{s^{k-1}}, \\vec{v}(t_j))}}$$\n", + "\n", + "Wektor kontekstu $\\vec{\\xi^k}$ b\u0119dzie po prostu \u015bredni\u0105 wa\u017con\u0105 wektorowych reprezentacji s\u0142\u00f3w:\n", + "\n", + "$$\\vec{\\xi^k} = \\sum_i \\alpha_i\\vec{v}(t_i)$$\n", + "\n", + "**Pytanie**: zasadniczo atencja jest \u015brodkiem do celu (\u017ceby sie\u0107 si\u0119 sprawniej uczy\u0142a), czy mo\u017cna atencja sama w sobie mo\u017ce by\u0107 do czego\u015b przydatna?\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + }, + "org": null, + "author": "Filip Grali\u0144ski", + "email": "filipg@amu.edu.pl", + "lang": "pl", + "subtitle": "13.Podej\u015bcie generatywne w ekstrakcji informacji[wyk\u0142ad]", + "title": "Ekstrakcja informacji", + "year": "2021" + }, + "nbformat": 4, + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/wyk/14_pretrenowanie.ipynb b/wyk/14_pretrenowanie.ipynb index ad081d1..860a4b0 100644 --- a/wyk/14_pretrenowanie.ipynb +++ b/wyk/14_pretrenowanie.ipynb @@ -1,369 +1,391 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pretrenowanie modeli\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "System AlphaZero uczy się grając sam ze sobą — wystarczy 24 godziny,\n", - "by system nauczył się grać w szachy lub go na nadludzkim poziomie.\n", - "\n", - "**Pytanie**: Dlaczego granie samemu ze sobą nie jest dobrym sposobem\n", - " nauczenia się grania w szachy dla człowieka, a dla maszyny jest?\n", - "\n", - "Co jest odpowiednikiem grania samemu ze sobą w świecie przetwarzania tekstu?\n", - "Tzn. **pretrenowanie** (*pretraining*) na dużym korpusie tekstu. 
(Tekst jest tani!)\n", - "\n", - "Jest kilka sposobów na pretrenowanie modelu, w każdym razie sprowadza\n", - "się do odgadywania następnego bądź zamaskowanego słowa.\n", - "W każdym razie zawsze stosujemy softmax (być może ze „sztuczkami” takimi jak\n", - "negatywne próbkowanie albo hierarchiczny softmax) na pewnej **reprezentacji kontekstowej**:\n", - "\n", - "$$\\vec{p} = \\operatorname{softmax}(f(\\vec{c})).$$\n", - "\n", - "Model jest karany używając funkcji log loss:\n", - "\n", - "$$-\\log(p_j),$$\n", - "\n", - "gdzie $w_j$ jest wyrazem, który pojawił się rzeczywiście w korpusie.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Przewidywanie słowa (GPT-2)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Jeden ze sposobów pretrenowania modelu to po prostu przewidywanie\n", - "następnego słowa.\n", - "\n", - "Zainstalujmy najpierw bibliotekę transformers.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "! pip install transformers" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50257\n" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

14. Pretrenowane modele j\u0119zyka [wyk\u0142ad]

\n", + "

Filip Grali\u0144ski (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pretrenowanie modeli\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "System AlphaZero uczy si\u0119 graj\u0105c sam ze sob\u0105 \u2014 wystarczy 24 godziny,\n", + "by system nauczy\u0142 si\u0119 gra\u0107 w szachy lub go na nadludzkim poziomie.\n", + "\n", + "**Pytanie**: Dlaczego granie samemu ze sob\u0105 nie jest dobrym sposobem\n", + " nauczenia si\u0119 grania w szachy dla cz\u0142owieka, a dla maszyny jest?\n", + "\n", + "Co jest odpowiednikiem grania samemu ze sob\u0105 w \u015bwiecie przetwarzania tekstu?\n", + "Tzn. **pretrenowanie** (*pretraining*) na du\u017cym korpusie tekstu. (Tekst jest tani!)\n", + "\n", + "Jest kilka sposob\u00f3w na pretrenowanie modelu, w ka\u017cdym razie sprowadza\n", + "si\u0119 do odgadywania nast\u0119pnego b\u0105d\u017a zamaskowanego s\u0142owa.\n", + "W ka\u017cdym razie zawsze stosujemy softmax (by\u0107 mo\u017ce ze \u201esztuczkami\u201d takimi jak\n", + "negatywne pr\u00f3bkowanie albo hierarchiczny softmax) na pewnej **reprezentacji kontekstowej**:\n", + "\n", + "$$\\vec{p} = \\operatorname{softmax}(f(\\vec{c})).$$\n", + "\n", + "Model jest karany u\u017cywaj\u0105c funkcji log loss:\n", + "\n", + "$$-\\log(p_j),$$\n", + "\n", + "gdzie $w_j$ jest wyrazem, kt\u00f3ry pojawi\u0142 si\u0119 rzeczywi\u015bcie w korpusie.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Przewidywanie s\u0142owa (GPT-2)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Jeden ze sposob\u00f3w pretrenowania modelu to po prostu przewidywanie\n", + "nast\u0119pnego s\u0142owa.\n", + "\n", + "Zainstalujmy najpierw bibliotek\u0119 transformers.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "! 
pip install transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "50257\n" + ] + }, + { + "data": { + "text/plain": [ + "[('\u00c2\u0142', 0.6182783842086792),\n", + " ('\u00c8', 0.1154019758105278),\n", + " ('\u00d1\u0123', 0.026960616931319237),\n", + " ('_____', 0.024418892338871956),\n", + " ('________', 0.014962316490709782),\n", + " ('\u00c3\u0124', 0.010653386823832989),\n", + " ('\u00e4\u00b8\u0143', 0.008340531960129738),\n", + " ('\u00d1', 0.007557711564004421),\n", + " ('\u00ca', 0.007046067621558905),\n", + " ('\u00e3\u0122', 0.006875576451420784),\n", + " ('ile', 0.006685272324830294),\n", + " ('____', 0.006307446397840977),\n", + " ('\u00e2\u0122\u012d', 0.006306538358330727),\n", + " ('\u00d1\u0122', 0.006197483278810978),\n", + " ('\u0120Belarus', 0.006108700763434172),\n", + " ('\u00c6', 0.005720408633351326),\n", + " ('\u0120Poland', 0.0053678699769079685),\n", + " ('\u00e1\u00b9', 0.004606408067047596),\n", + " ('\u00ee\u0122', 0.004161055199801922),\n", + " ('????', 0.004056799225509167),\n", + " ('_______', 0.0038176667876541615),\n", + " ('\u00e4\u00b8', 0.0036082742735743523),\n", + " ('\u00cc', 0.003221835708245635),\n", + " ('urs', 0.003080119378864765),\n", + " ('________________', 0.0027312245219945908),\n", + " ('\u0120Lithuania', 0.0023860156070441008),\n", + " ('ich', 0.0021211160346865654),\n", + " ('iz', 0.002069818088784814),\n", + " ('vern', 0.002001357264816761),\n", + " ('\u00c5\u0124', 0.001717406208626926)]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import torch\n", + "from transformers import GPT2Tokenizer, GPT2LMHeadModel\n", + "tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')\n", + "model = GPT2LMHeadModel.from_pretrained('gpt2-large')\n", + "text = 'Warsaw is the capital city of'\n", + "encoded_input = tokenizer(text, return_tensors='pt')\n", + "output = model(**encoded_input)\n", + "next_token_probs = torch.softmax(output[0][:, -1, :][0], dim=0)\n", + "\n", + "nb_of_tokens = next_token_probs.size()[0]\n", + "print(nb_of_tokens)\n", + "\n", + "_, top_k_indices = torch.topk(next_token_probs, 30, sorted=True)\n", + "\n", + "words = tokenizer.convert_ids_to_tokens(top_k_indices)\n", + "\n", + "top_probs = []\n", + "\n", + "for ix in range(len(top_k_indices)):\n", + " top_probs.append((words[ix], next_token_probs[top_k_indices[ix]].item()))\n", + "\n", + "top_probs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Zalety tego podej\u015bcia:\n", + "\n", + "- prostota,\n", + "- dobra podstawa do strojenia system\u00f3w generowania tekstu zw\u0142aszcza\n", + " \u201eotwartego\u201d (systemy dialogowe, generowanie (fake) news\u00f3w, streszczanie tekstu),\n", + " ale niekoniecznie t\u0142umaczenia maszynowego,\n", + "- zaskakuj\u0105ca skuteczno\u015b\u0107 przy uczeniu *few-shot* i *zero-shot*.\n", + "\n", + "Wady:\n", + "\n", + "- asymetryczno\u015b\u0107, przetwarzanie tylko z lewej do prawej, preferencja\n", + " dla lewego kontekstu,\n", + "- mniejsza skuteczno\u015b\u0107 przy dostrajaniu do zada\u0144 klasyfikacji i innych zada\u0144\n", + " niepolegaj\u0105cych na prostym generowaniu.\n", + "\n", + "Przyk\u0142ady modeli: GPT, GPT-2, GPT-3, DialoGPT.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Maskowanie s\u0142\u00f3w (BERT)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "Inn\u0105 metod\u0105 jest maskowanie s\u0142\u00f3w (*Masked Language Modeling*, *MLM*).\n", + "\n", + "W tym podej\u015bciu losowe wybrane zast\u0119pujemy losowe s\u0142owa specjalnym\n", + "tokenem (`[MASK]`) i ka\u017cemy modelowi odgadywa\u0107 w ten spos\u00f3b\n", + "zamaskowane s\u0142owa (z uwzgl\u0119dnieniem r\u00f3wnie\u017c prawego kontekstu!).\n", + "\n", + "M\u00f3ci\u0105c \u015bci\u015ble, w jednym z pierwszych modeli tego typu (BERT)\n", + "zastosowano schemat, w kt\u00f3rym r\u00f3wnie\u017c niezamaskowane s\u0142owa s\u0105 odgadywane (!):\n", + "\n", + "- wybieramy losowe 15% wyraz\u00f3w do odgadni\u0119cia\n", + "- 80% z nich zast\u0119pujemy tokenem `[MASK]`,\n", + "- 10% zast\u0119pujemy innym losowym wyrazem,\n", + "- 10% pozostawiamy bez zmian.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/filipg/.local/lib/python3.9/site-packages/transformers/models/auto/modeling_auto.py:806: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "W kt\u00f3rym pa\u0144stwie le\u017cy Bombaj? W USA. (score: 0.16715531051158905)\n", + "W kt\u00f3rym pa\u0144stwie le\u017cy Bombaj? W India. (score: 0.09912960231304169)\n", + "W kt\u00f3rym pa\u0144stwie le\u017cy Bombaj? W Indian. (score: 0.039642028510570526)\n", + "W kt\u00f3rym pa\u0144stwie le\u017cy Bombaj? W Nepal. (score: 0.027137665078043938)\n", + "W kt\u00f3rym pa\u0144stwie le\u017cy Bombaj? W Pakistan. (score: 0.027065709233283997)\n", + "W kt\u00f3rym pa\u0144stwie le\u017cy Bombaj? W Polsce. (score: 0.023737527430057526)\n", + "W kt\u00f3rym pa\u0144stwie le\u017cy Bombaj? W .... (score: 0.02306722290813923)\n", + "W kt\u00f3rym pa\u0144stwie le\u017cy Bombaj? W Bangladesh. (score: 0.022106658667325974)\n", + "W kt\u00f3rym pa\u0144stwie le\u017cy Bombaj? W .... (score: 0.01628892682492733)\n", + "W kt\u00f3rym pa\u0144stwie le\u017cy Bombaj? W Niemczech. (score: 0.014501162804663181)\n" + ] + } + ], + "source": [ + "from transformers import AutoModelWithLMHead, AutoTokenizer\n", + "import torch\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"xlm-roberta-large\")\n", + "model = AutoModelWithLMHead.from_pretrained(\"xlm-roberta-large\")\n", + "\n", + "sequence = f'W kt\u00f3rym pa\u0144stwie le\u017cy Bombaj? 
W {tokenizer.mask_token}.'\n", + "\n", + "input_ids = tokenizer.encode(sequence, return_tensors=\"pt\")\n", + "mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]\n", + "\n", + "token_logits = model(input_ids)[0]\n", + "mask_token_logits = token_logits[0, mask_token_index, :]\n", + "mask_token_logits = torch.softmax(mask_token_logits, dim=1)\n", + "\n", + "top_10 = torch.topk(mask_token_logits, 10, dim=1)\n", + "top_10_tokens = zip(top_10.indices[0].tolist(), top_10.values[0].tolist())\n", + "\n", + "for token, score in top_10_tokens:\n", + " print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])), f\"(score: {score})\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Przyk\u0142ady: BERT, RoBERTa (r\u00f3wnie\u017c Polish RoBERTa).\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Podej\u015bcie generatywne (koder-dekoder).\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "System ma wygenerowa\u0107 odpowied\u017a na r\u00f3\u017cne pytania (r\u00f3wnie\u017c\n", + "odpowiadaj\u0105ce zadaniu MLM), np.:\n", + "\n", + "- \"translate English to German: That is good.\" => \"Das ist gut.\"\n", + "- \"cola sentence: The course is jumping well.\" => \"not acceptable\"\n", + "- \"summarize: state authorities dispatched emergency crews tuesday to survey the damage after an onslaught of severe weather in mississippi…\"\n", + " => \"six people hospitalized after a storm in attala county\"\n", + "- \"Thank you for me to your party week.\" => for inviting last \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['World War II ended in World War II.',\n", + " 'World War II ended in 1945..',\n", + " 'World War II ended in 1945.',\n", + " 'World War II ended in 1945.',\n", + " 'World War II ended in 1945.']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration\n", + "\n", + "T5_PATH = 't5-base'\n", + "\n", + "t5_tokenizer = T5Tokenizer.from_pretrained(T5_PATH)\n", + "t5_config = T5Config.from_pretrained(T5_PATH)\n", + "t5_mlm = T5ForConditionalGeneration.from_pretrained(T5_PATH, config=t5_config)\n", + "\n", + "slot = ''\n", + "\n", + "text = f'World War II ended in {slot}.'\n", + "\n", + "encoded = t5_tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')\n", + "input_ids = encoded['input_ids']\n", + "\n", + "outputs = t5_mlm.generate(input_ids=input_ids,\n", + " num_beams=200, num_return_sequences=5,\n", + " max_length=5)\n", + "\n", + "_0_index = text.index(slot)\n", + "_result_prefix = text[:_0_index]\n", + "_result_suffix = text[_0_index+len(slot):]\n", + "\n", + "def _filter(output, end_token=''):\n", + " _txt = t5_tokenizer.decode(output[2:], skip_special_tokens=False, clean_up_tokenization_spaces=False)\n", + " if end_token in _txt:\n", + " _end_token_index = _txt.index(end_token)\n", + " return _result_prefix + _txt[:_end_token_index] + _result_suffix\n", + " else:\n", + " return _result_prefix + _txt + _result_suffix\n", + "\n", + "\n", + "results = [_filter(out) for out in outputs]\n", + "results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "(Zob. 
[https://arxiv.org/pdf/1910.10683.pdf](https://arxiv.org/pdf/1910.10683.pdf))\n", + "\n", + "Przyk\u0142ad: T5, mT5\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "org": null, + "author": "Filip Grali\u0144ski", + "email": "filipg@amu.edu.pl", + "lang": "pl", + "subtitle": "14.Pretrenowane modele j\u0119zyka[wyk\u0142ad]", + "title": "Ekstrakcja informacji", + "year": "2021" }, - { - "data": { - "text/plain": [ - "[('Âł', 0.6182783842086792),\n", - " ('È', 0.1154019758105278),\n", - " ('Ñģ', 0.026960616931319237),\n", - " ('_____', 0.024418892338871956),\n", - " ('________', 0.014962316490709782),\n", - " ('ÃĤ', 0.010653386823832989),\n", - " ('ä¸Ń', 0.008340531960129738),\n", - " ('Ñ', 0.007557711564004421),\n", - " ('Ê', 0.007046067621558905),\n", - " ('ãĢ', 0.006875576451420784),\n", - " ('ile', 0.006685272324830294),\n", - " ('____', 0.006307446397840977),\n", - " ('âĢĭ', 0.006306538358330727),\n", - " ('ÑĢ', 0.006197483278810978),\n", - " ('ĠBelarus', 0.006108700763434172),\n", - " ('Æ', 0.005720408633351326),\n", - " ('ĠPoland', 0.0053678699769079685),\n", - " ('á¹', 0.004606408067047596),\n", - " ('îĢ', 0.004161055199801922),\n", - " ('????', 0.004056799225509167),\n", - " ('_______', 0.0038176667876541615),\n", - " ('ä¸', 0.0036082742735743523),\n", - " ('Ì', 0.003221835708245635),\n", - " ('urs', 0.003080119378864765),\n", - " ('________________', 0.0027312245219945908),\n", - " ('ĠLithuania', 0.0023860156070441008),\n", - " ('ich', 0.0021211160346865654),\n", - " ('iz', 0.002069818088784814),\n", - " ('vern', 0.002001357264816761),\n", - " ('ÅĤ', 0.001717406208626926)]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import torch\n", - "from transformers import GPT2Tokenizer, GPT2LMHeadModel\n", - "tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')\n", - "model = GPT2LMHeadModel.from_pretrained('gpt2-large')\n", - "text = 'Warsaw is the capital city of'\n", - "encoded_input = tokenizer(text, return_tensors='pt')\n", - "output = model(**encoded_input)\n", - "next_token_probs = torch.softmax(output[0][:, -1, :][0], dim=0)\n", - "\n", - "nb_of_tokens = next_token_probs.size()[0]\n", - "print(nb_of_tokens)\n", - "\n", - "_, top_k_indices = torch.topk(next_token_probs, 30, sorted=True)\n", - "\n", - "words = tokenizer.convert_ids_to_tokens(top_k_indices)\n", - "\n", - "top_probs = []\n", - "\n", - "for ix in range(len(top_k_indices)):\n", - " top_probs.append((words[ix], next_token_probs[top_k_indices[ix]].item()))\n", - "\n", - "top_probs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Zalety tego podejścia:\n", - "\n", - "- prostota,\n", - "- dobra podstawa do strojenia systemów generowania tekstu zwłaszcza\n", - " „otwartego” (systemy dialogowe, generowanie (fake) newsów, streszczanie tekstu),\n", - " ale niekoniecznie tłumaczenia maszynowego,\n", - "- zaskakująca skuteczność przy uczeniu *few-shot* i *zero-shot*.\n", - "\n", - "Wady:\n", - "\n", - "- asymetryczność, przetwarzanie tylko z lewej do prawej, preferencja\n", - " dla lewego kontekstu,\n", - "- mniejsza skuteczność przy dostrajaniu do zadań klasyfikacji i innych zadań\n", - 
" niepolegających na prostym generowaniu.\n", - "\n", - "Przykłady modeli: GPT, GPT-2, GPT-3, DialoGPT.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Maskowanie słów (BERT)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Inną metodą jest maskowanie słów (*Masked Language Modeling*, *MLM*).\n", - "\n", - "W tym podejściu losowe wybrane zastępujemy losowe słowa specjalnym\n", - "tokenem (`[MASK]`) i każemy modelowi odgadywać w ten sposób\n", - "zamaskowane słowa (z uwzględnieniem również prawego kontekstu!).\n", - "\n", - "Móciąc ściśle, w jednym z pierwszych modeli tego typu (BERT)\n", - "zastosowano schemat, w którym również niezamaskowane słowa są odgadywane (!):\n", - "\n", - "- wybieramy losowe 15% wyrazów do odgadnięcia\n", - "- 80% z nich zastępujemy tokenem `[MASK]`,\n", - "- 10% zastępujemy innym losowym wyrazem,\n", - "- 10% pozostawiamy bez zmian.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/filipg/.local/lib/python3.9/site-packages/transformers/models/auto/modeling_auto.py:806: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "W którym państwie leży Bombaj? W USA. (score: 0.16715531051158905)\n", - "W którym państwie leży Bombaj? W India. (score: 0.09912960231304169)\n", - "W którym państwie leży Bombaj? W Indian. (score: 0.039642028510570526)\n", - "W którym państwie leży Bombaj? W Nepal. (score: 0.027137665078043938)\n", - "W którym państwie leży Bombaj? W Pakistan. (score: 0.027065709233283997)\n", - "W którym państwie leży Bombaj? W Polsce. (score: 0.023737527430057526)\n", - "W którym państwie leży Bombaj? W .... (score: 0.02306722290813923)\n", - "W którym państwie leży Bombaj? W Bangladesh. (score: 0.022106658667325974)\n", - "W którym państwie leży Bombaj? W .... (score: 0.01628892682492733)\n", - "W którym państwie leży Bombaj? W Niemczech. (score: 0.014501162804663181)\n" - ] - } - ], - "source": [ - "from transformers import AutoModelWithLMHead, AutoTokenizer\n", - "import torch\n", - "\n", - "tokenizer = AutoTokenizer.from_pretrained(\"xlm-roberta-large\")\n", - "model = AutoModelWithLMHead.from_pretrained(\"xlm-roberta-large\")\n", - "\n", - "sequence = f'W którym państwie leży Bombaj? 
W {tokenizer.mask_token}.'\n",
-        "\n",
-        "input_ids = tokenizer.encode(sequence, return_tensors=\"pt\")\n",
-        "mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]\n",
-        "\n",
-        "token_logits = model(input_ids)[0]\n",
-        "mask_token_logits = token_logits[0, mask_token_index, :]\n",
-        "mask_token_logits = torch.softmax(mask_token_logits, dim=1)\n",
-        "\n",
-        "top_10 = torch.topk(mask_token_logits, 10, dim=1)\n",
-        "top_10_tokens = zip(top_10.indices[0].tolist(), top_10.values[0].tolist())\n",
-        "\n",
-        "for token, score in top_10_tokens:\n",
-        "    print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])), f\"(score: {score})\")"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "Przykłady: BERT, RoBERTa (również Polish RoBERTa).\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "### Podejście generatywne (koder-dekoder)\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "System ma wygenerować odpowiedź na różne pytania (również\n",
-        "odpowiadające zadaniu MLM), np.:\n",
-        "\n",
-        "- \"translate English to German: That is good.\" => \"Das ist gut.\"\n",
-        "- \"cola sentence: The course is jumping well.\" => \"not acceptable\"\n",
-        "- \"summarize: state authorities dispatched emergency crews tuesday to survey the damage after an onslaught of severe weather in mississippi…\"\n",
-        "  => \"six people hospitalized after a storm in attala county\"\n",
-        "- \"Thank you for <X> me to your party <Y> week.\" => <X> for inviting <Y> last <Z>\n",
-        "\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": 2,
-      "metadata": {},
-      "outputs": [
-        {
-          "data": {
-            "text/plain": [
-              "['World War II ended in World War II.',\n",
-              " 'World War II ended in 1945..',\n",
-              " 'World War II ended in 1945.',\n",
-              " 'World War II ended in 1945.',\n",
-              " 'World War II ended in 1945.']"
-            ]
-          },
-          "execution_count": 2,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
-      "source": [
-        "from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration\n",
-        "\n",
-        "T5_PATH = 't5-base'\n",
-        "\n",
-        "t5_tokenizer = T5Tokenizer.from_pretrained(T5_PATH)\n",
-        "t5_config = T5Config.from_pretrained(T5_PATH)\n",
-        "t5_mlm = T5ForConditionalGeneration.from_pretrained(T5_PATH, config=t5_config)\n",
-        "\n",
-        "slot = '<extra_id_0>'\n",
-        "\n",
-        "text = f'World War II ended in {slot}.'\n",
-        "\n",
-        "encoded = t5_tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')\n",
-        "input_ids = encoded['input_ids']\n",
-        "\n",
-        "outputs = t5_mlm.generate(input_ids=input_ids,\n",
-        "                          num_beams=200, num_return_sequences=5,\n",
-        "                          max_length=5)\n",
-        "\n",
-        "_0_index = text.index(slot)\n",
-        "_result_prefix = text[:_0_index]\n",
-        "_result_suffix = text[_0_index+len(slot):]\n",
-        "\n",
-        "def _filter(output, end_token='<extra_id_1>'):\n",
-        "    _txt = t5_tokenizer.decode(output[2:], skip_special_tokens=False, clean_up_tokenization_spaces=False)\n",
-        "    if end_token in _txt:\n",
-        "        _end_token_index = _txt.index(end_token)\n",
-        "        return _result_prefix + _txt[:_end_token_index] + _result_suffix\n",
-        "    else:\n",
-        "        return _result_prefix + _txt + _result_suffix\n",
-        "\n",
-        "\n",
-        "results = [_filter(out) for out in outputs]\n",
-        "results"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "(Zob. 
[https://arxiv.org/pdf/1910.10683.pdf](https://arxiv.org/pdf/1910.10683.pdf))\n", - "\n", - "Przykład: T5, mT5\n", - "\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" - }, - "org": null - }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/wyk/15_transformer.ipynb b/wyk/15_transformer.ipynb index 9131627..19df858 100644 --- a/wyk/15_transformer.ipynb +++ b/wyk/15_transformer.ipynb @@ -1,5 +1,19 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n", + "
\n", + "

Ekstrakcja informacji

\n", + "

15. Sieci Transformer i ich zastosowanie w ekstrakcji informacji [wykład]

\n", + "

Filip Graliński (2021)

\n", + "
\n", + "\n", + "![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -226,11 +240,14 @@ } ], "metadata": { + "author": "Filip Graliński", + "email": "filipg@amu.edu.pl", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, + "lang": "pl", "language_info": { "codemirror_mode": { "name": "ipython", @@ -241,10 +258,13 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.9.6" }, - "org": null + "org": null, + "subtitle": "15.Sieci Transformer i ich zastosowanie w ekstrakcji informacji[wykład]", + "title": "Ekstrakcja informacji", + "year": "2021" }, "nbformat": 4, - "nbformat_minor": 1 + "nbformat_minor": 4 }