Compare commits
83 Commits
Author | SHA1 | Date | |
---|---|---|---|
72db2aa687 | |||
|
f804311da1 | ||
|
3c0223d434 | ||
|
0f34dcdeb4 | ||
|
5acffc0265 | ||
|
4437a7f71b | ||
|
ad34aaeae0 | ||
1836dc18c1 | |||
fedffd5456 | |||
9a61b2c06c | |||
22f1e74aef | |||
72c6fbcbf6 | |||
a45fd570e5 | |||
|
cc675a8591 | ||
|
93ea351350 | ||
|
2cd5bef0a0 | ||
|
d3c996511c | ||
|
f3db74bfd3 | ||
|
691dd36092 | ||
|
2a031bc8d8 | ||
077c2b6f90 | |||
97d92d38e8 | |||
|
9d96c9ec4f | ||
|
f1bc633468 | ||
78fb510cba | |||
edf3811cd7 | |||
89155edea0 | |||
|
0ea752f091 | ||
0d10bc2fca | |||
429caef49c | |||
aebba6c18b | |||
98012914c4 | |||
|
af907e23af | ||
|
15854bb61f | ||
a004360ec5 | |||
1071d5ba44 | |||
86a5fbe20c | |||
87e0faf1a2 | |||
a6a4106844 | |||
|
d76157a5a5 | ||
|
6099566d29 | ||
340197a94c | |||
54f52bbc6a | |||
10981fc2bc | |||
|
22c32c8a43 | ||
|
da334a57fe | ||
90b0947029 | |||
ebfd32b60d | |||
b40e011c66 | |||
ce461797fb | |||
a50afdf750 | |||
65e28bbbb9 | |||
|
db2fad735e | ||
|
d2bf465d4a | ||
|
81e0329199 | ||
|
a8fd576d16 | ||
|
4fa73c353a | ||
|
a2d7d975ff | ||
|
36155ad5b4 | ||
|
c9825d8d60 | ||
4dd0ace030 | |||
517fb0dae8 | |||
91c4d13617 | |||
|
a0ca3e657d | ||
|
54ab26b5f9 | ||
9866eb875e | |||
3d70d8a7ec | |||
|
550a399bff | ||
|
8f531a680c | ||
b5d6d177af | |||
c5864ab9ad | |||
724be2f486 | |||
|
dfd6873823 | ||
|
d5a60b064e | ||
|
57dc700d51 | ||
|
84426fb38b | ||
|
1c6482b8a1 | ||
|
f351417476 | ||
|
b7ebc44cc2 | ||
2687aaa426 | |||
094f2713ba | |||
|
8ac2210b6f | ||
|
56434f096b |
49
add-metadata.py
Executable file
@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Procedure for overwriting ipynb files (generates the header cell and metadata)
|
||||
import json
|
||||
import sys
|
||||
import re
|
||||
|
||||
def modjup(filen,numer,tytul,typ,author,email,lang,title,year):
    """Add or refresh the AITech header cell and metadata of a notebook in place.

    The notebook at ``filen`` is loaded as JSON, its ``metadata`` gets the
    ``author``, ``email``, ``lang``, ``subtitle``, ``title`` and ``year``
    keys set, and a markdown header cell is placed at index 0. If the first
    cell already starts with the "Logo 1" line it is replaced, otherwise the
    header is inserted before it. The result is written back to the same file.

    Args:
        filen: path of the ``.ipynb`` file to modify.
        numer: unit number shown in the header and in the subtitle.
        tytul: unit title shown in the header and in the subtitle.
        typ: unit type shown in square brackets.
        author: author name (header and metadata).
        email: author e-mail (metadata only).
        lang: language code (metadata only).
        title: course title (header and metadata).
        year: year shown next to the author (header and metadata).
    """
    zerocell = [
        '![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n',
        '<div class="alert alert-block alert-info">\n',
        '<h1> %s </h1>\n' % (title),
        '<h2> %s. <i>%s</i> [%s]</h2> \n' % (numer, tytul, typ),
        '<h3> %s (%s)</h3>\n' % (author, year),
        '</div>\n',
        '\n',
        '![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)',
    ]
    zerodict = {'cell_type': 'markdown', 'metadata': {'collapsed': False}, 'source': zerocell}

    with open(filen, 'r+', encoding='utf-8') as f:
        ll = json.load(f)

        metadata = ll.setdefault("metadata", {})
        metadata["author"] = author
        metadata["email"] = email
        metadata["lang"] = lang
        metadata["subtitle"] = "%s.%s[%s]" % (numer, tytul, typ)
        metadata["title"] = title
        metadata["year"] = year

        # A notebook may have no cells, or a first cell with an empty source;
        # in both cases there is no existing header to replace.
        cells = ll.setdefault('cells', [])
        first_source = cells[0].get('source') if cells else None
        if isinstance(first_source, str):
            # Cell source stored as one string instead of a list of lines.
            has_header = first_source.startswith(zerocell[0])
        else:
            has_header = bool(first_source) and first_source[0] == zerocell[0]

        if has_header:
            cells[0] = zerodict
        else:
            cells.insert(0, zerodict)

        f.seek(0)
        json.dump(ll, f, indent=4)
        # The new serialization can be shorter than the old file content;
        # cut off the leftover tail so the file stays valid JSON.
        f.truncate()
|
||||
|
||||
# Edit these values to match the course.
typ="wykład"

author="Filip Graliński"
email="filipg@amu.edu.pl"
lang= "pl"
title="Ekstrakcja informacji"
year="2021"


def main():
    """Read the notebook path and unit title from the command line and run modjup.

    The unit number is taken from the leading digits of the file name
    (optionally preceded by a digit-free directory prefix), with leading
    zeros dropped. Exits with an error message when the arguments are
    missing or the path carries no such number.
    """
    if len(sys.argv) < 3:
        sys.exit("usage: add-metadata.py NOTEBOOK.ipynb TITLE")

    filen=sys.argv[1]
    tytul=sys.argv[2]

    found=re.match(r'^(?:\D+/)?0*(\d+)', filen)
    if found is None:
        # Without this check a bare AttributeError would be raised on .group().
        sys.exit("cannot extract a unit number from file name: %s" % filen)
    numer=found.group(1)

    # Run the procedure.
    modjup(filen,numer,tytul,typ,author,email,lang,title,year)


if __name__ == "__main__":
    main()
|
7
convert_ipynb_to_md.sh
Normal file
@ -0,0 +1,7 @@
|
||||
# Convert a notebook to Markdown: export it as a script with nbconvert, then
# pass that script to convert_python_to_markdown.py.
# Usage: convert_ipynb_to_md.sh NOTEBOOK.ipynb
set -ex

# Abort with a usage message when no notebook path is given.
FILEIPYNB=${1:?usage: convert_ipynb_to_md.sh NOTEBOOK.ipynb}

# Quote every expansion so paths containing spaces survive word splitting.
jupyter nbconvert --to script "$FILEIPYNB"

# Derive sibling file names by replacing the literal ".ipynb" suffix.
# The exported script is expected at NAME.py.
FILEPY="${FILEIPYNB%.ipynb}.py"
FILEMD="${FILEIPYNB%.ipynb}.md"

python convert_python_to_markdown.py "$FILEPY" "$FILEMD"
|
13
convert_python_to_markdown.py
Normal file
@ -0,0 +1,13 @@
|
||||
import sys
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from markdown import markdown
|
||||
|
||||
# Copy the comment lines of an exported notebook script (argv[1]) into a
# Markdown file (argv[2]), dropping the leading "# " marker. Lines that do not
# start with "# " and the "# In[...]" cell markers are not written.
# Both files are handled as UTF-8 so the result does not depend on the locale.
with open(sys.argv[1], encoding='utf-8') as f_in, open(sys.argv[2], 'w', encoding='utf-8') as f_out:
    for i, line in enumerate(f_in):
        # Skip the 2nd and 3rd input lines.
        # NOTE(review): presumably the encoding header emitted by nbconvert - confirm.
        if i in (1, 2):
            continue
        if line.startswith("# ") and not line.startswith("# In["):
            f_out.write(line[2:])
|
||||
|
@ -1,5 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 0. <i>Informacje na temat przedmiotu</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@ -57,21 +71,17 @@
|
||||
"\n",
|
||||
"**Żeby zaliczyć przedmiot należy pojawiać się na laboratoriach. Maksymalna liczba nieobecności to 3. Obecność będę sprawdzał poprzez panel MS TEAMS, czyli będę sprawdzał czy ktoś jest wdzwoniony na ćwiczenia. Jeżeli kogoś nie będzie więcej niż 3 razy, to nie będzie miał zaliczonego przedmiotu** \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
@ -83,7 +93,10 @@
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
}
|
||||
},
|
||||
"subtitle": "0.Informacje na temat przedmiotu[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
|
@ -1,181 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Opracować w języku Haskell wyspecjalizowanego robota pobierającego dane z konkretnego serwisu.\n",
|
||||
"\n",
|
||||
"Punkty: 80 (domyślnie - niektóre zadania są trudniejsze, wówczas podaję osobno liczbę punktów)\n",
|
||||
"\n",
|
||||
"Ogólne zasady:\n",
|
||||
"\n",
|
||||
"* pobieramy informacje (metadane) o plikach PDF, DjVU, JPG itp, ale nie same pliki,\n",
|
||||
"* nie pobierajmy całego serwisu, tylko tyle, ile trzeba, by pobrać metadane o interesujących nas zasobach,\n",
|
||||
"* interesują nas tylko teksty polskie, jeśli nie jest to trudne, należy odfiltrować publikacje obcojęzyczne,\n",
|
||||
"* staramy się ustalać datę z możliwie dużą dokładnością.\n",
|
||||
"\n",
|
||||
"Sposób pracy:\n",
|
||||
"\n",
|
||||
"0. Pobrać Haskell Stack\n",
|
||||
"\n",
|
||||
"~~~\n",
|
||||
"curl -sSL https://get.haskellstack.org/ | sh -s - -d ~/bin\n",
|
||||
"~~~\n",
|
||||
"\n",
|
||||
"Na fizycznych komputerach wydziałowych są błędnie ustawione prawa dostępu na dyskach sieciowych, Haskell Stack musi działać na fizycznym dysku:\n",
|
||||
"\n",
|
||||
"~~~\n",
|
||||
"rm -rf /mnt/poligon/.stack\n",
|
||||
"mkdir /mnt/poligon/.stack\n",
|
||||
"mv ~/.stack ~/.stack-bak # gdyby już był... proszę się nie przejmować błędem\n",
|
||||
"ln -s /mnt/poligon/.stack ~/.stack\n",
|
||||
"~~~\n",
|
||||
"\n",
|
||||
"1. Pobrać repozytorium:\n",
|
||||
"\n",
|
||||
"~~~\n",
|
||||
"git clone https://git.wmi.amu.edu.pl/filipg/twilight-library.git\n",
|
||||
"~~~\n",
|
||||
"\n",
|
||||
"2. Wypchnąć na początek do swojego repozytorium (trzeba sobie najpierw założyć to repozytorium na <https://git.wmi.amu.edu.pl>)\n",
|
||||
"\n",
|
||||
"~~~\n",
|
||||
"cd twilight-library\n",
|
||||
"git remote set-url origin git@git.wmi.amu.edu.pl:YOURID/twilight-library\n",
|
||||
"git push origin master\n",
|
||||
"git remote add mother git://gonito.net/twilight-library\n",
|
||||
"~~~\n",
|
||||
"\n",
|
||||
"3. Zobacz, czy przykładowy robot dla strony z „Almanachem Muszyny” działa:\n",
|
||||
"\n",
|
||||
"~~~\n",
|
||||
"~/bin/stack install # może trwać długo za pierwszym razem\n",
|
||||
"~/bin/stack exec almanachmuszyny\n",
|
||||
"~~~\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"W razie problemów z instalacją:\n",
|
||||
"\n",
|
||||
"~~~\n",
|
||||
"sudo apt install libpcre3 libpcre3-dev\n",
|
||||
"~~~\n",
|
||||
"\n",
|
||||
"3. Opracuj swojego robota wzorując się na pliku `almanachmuszyny.hs`.\n",
|
||||
" (Ale dodaj swój plik, nie zmieniaj `almanachmuszyny.hs`!)\n",
|
||||
"\n",
|
||||
"4. Dopisz specyfikację swojego robota do `shadow-library.cabal`.\n",
|
||||
"\n",
|
||||
"5. Pracuj nad swoim robotem, uruchamiaj go w następujący sposób:\n",
|
||||
"\n",
|
||||
"~~~\n",
|
||||
"~/bin/stack install\n",
|
||||
"~/bin/stack exec mojrobot\n",
|
||||
"~~~\n",
|
||||
"\n",
|
||||
"(Tzn. nie nazywaj go „mojrobot”, tylko użyj jakiejś sensownej nazwy.)\n",
|
||||
"\n",
|
||||
"6. Jeśli publikacja (np. pojedynczy numer gazety) składa się z wielu plików, powinien zostać wygenerowany jeden\n",
|
||||
"rekord, w `finalUrl` powinny znaleźć się URL do poszczególnych stron (np. plików JPR) oddzielone ` // `.\n",
|
||||
"\n",
|
||||
"7. Po zakończeniu prac prześlij mejla do prowadzącego zajęcia z URL-em do swojego repozytorium.\n",
|
||||
"\n",
|
||||
"Lista serwisów do wyboru (na każdy serwis 1 osoba):\n",
|
||||
"\n",
|
||||
"1. [Teksty Drugie](http://tekstydrugie.pl)\n",
|
||||
"2. [Archiwum Inspektora Pracy](https://www.pip.gov.pl/pl/inspektor-pracy/66546,archiwum-inspektora-pracy-.html)\n",
|
||||
"3. [Medycyna Weterynaryjna](http://www.medycynawet.edu.pl/archives) — również historyczne zasoby od 1945 roku, **120 punktów**\n",
|
||||
"4. [Polskie Towarzystwo Botaniczne](https://pbsociety.org.pl/default/dzialalnosc-wydawnicza/) — wszystkie dostępne zdigitalizowane publikacje!, **130 punktow**\n",
|
||||
"5. [Wieści Pepowa](http://archiwum2019.pepowo.pl/news/c-10/gazeta) — nie pominąć strony nr 2 z wynikami, **110 punktów**\n",
|
||||
"6. [Czasopismo Kosmos](http://kosmos.icm.edu.pl/)\n",
|
||||
"7. [Czasopismo Wszechświat](http://www.ptpk.org/archiwum.html)\n",
|
||||
"8. [Czasopisma polonijne we Francji](https://argonnaute.parisnanterre.fr/ark:/14707/a011403267917yQQFAS) — najlepiej w postaci PDF-ów, jak np. [https://argonnaute.parisnanterre.fr/medias/customer_3/periodique/immi_pol_lotmz1_pdf/BDIC_GFP_2929_1945_039.pdf](), **220 punktów**\n",
|
||||
"9. [Muzeum Sztuki — czasopisma](https://zasoby.msl.org.pl/mobjects/show), **220 punktów**, publikacje, teksty, czasopisma, wycinki\n",
|
||||
"10. [Wiadomości Urzędu Patentowego](https://grab.uprp.pl/sites/Wydawnictwa/WydawnictwaArchiwum/WydawnictwaArchiwum/Forms/AllItems.aspx)\n",
|
||||
"11. [Czas, czasopismo polonijne](https://digitalcollections.lib.umanitoba.ca/islandora/object/uofm:2222545), **140 punktów** S.G.\n",
|
||||
"12. [Stenogramy Okrągłego Stołu](http://okragly-stol.pl/stenogramy/), **110 punktów**\n",
|
||||
"13. [Nasze Popowice](https://smpopowice.pl/index.php/numery-archiwalne)\n",
|
||||
"14. [Czasopisma entomologiczne](http://pte.au.poznan.pl/)\n",
|
||||
"15. [Wiadomości matematyczne](https://wydawnictwa.ptm.org.pl/index.php/wiadomosci-matematyczne/issue/archive?issuesPage=2), **120 punktow**\n",
|
||||
"16. [Alkoholizm i Narkomania](http://www.ain.ipin.edu.pl/archiwum-starsze.html)\n",
|
||||
"17. [Czasopismo Etyka](https://etyka.uw.edu.pl/tag/etyka-562018/), O.K.\n",
|
||||
"18. [Skup makulatury](https://chomikuj.pl/skup.makulatury.prl), **250 punktów**\n",
|
||||
"19. [Hermes](https://chomikuj.pl/hermes50-1) i https://chomikuj.pl/hermes50-2, **250 punktów**\n",
|
||||
"20. [E-dziennik Województwa Mazowieckiego](https://edziennik.mazowieckie.pl/actbymonths) **150 punktów**\n",
|
||||
"21. [Czasopismo Węgiel Brunatny](http://www.ppwb.org.pl/wegiel_brunatny)\n",
|
||||
"22. [Gazeta GUM](https://gazeta.gumed.edu.pl/61323.html)\n",
|
||||
"23. [Nowiny Andrychowskie](https://radioandrychow.pl/nowiny/)\n",
|
||||
"24. [Kawęczyniak](http://bip.kaweczyn.pl/kaweczyn/pl/dla-mieszkanca/publikacje/archiwalne-numery-kaweczyniaka-rok-1995-2005/kaweczyniaki-rok-1997.html)\n",
|
||||
"25. [Zbór Chrześcijański w Bielawie](http://zborbielawa.pl/archiwum/)\n",
|
||||
"26. [Gazeta Rytwiańska](http://www.rytwiany.com.pl/index.php?sid=5)\n",
|
||||
"27. [Nasze Popowice](https://smpopowice.pl/gazeta/2005_12_nasze-popowice-nr_01.pdf)\n",
|
||||
"28. [Echo Chełmka](http://moksir.chelmek.pl/o-nas/echo-chelmka)\n",
|
||||
"29. [Głos Świdnika](http://s.bibliotekaswidnik.pl/index.php/archwium/116-glos-swidnika) **100 punktów**\n",
|
||||
"30. [Aneks](https://aneks.kulturaliberalna.pl/archiwum-aneksu/) **90 punktów**\n",
|
||||
"31. [Teatr Lalek](http://polunima.pl/teatr-lalek)\n",
|
||||
"32. [Biuletyn Bezpieczna Chemia](https://www.pipc.org.pl/publikacje/biuletyn-bezpieczna-chemia)\n",
|
||||
"33. [Głos Maszynisty](https://zzm.org.pl/glos-maszynisty/)\n",
|
||||
"34. [Kultura Paryska](https://www.kulturaparyska.com/pl/index), całe archiwum z książkami i innymi czasopismami, **180 punktów**\n",
|
||||
"35. [Gazeta Fabryczna - Kraśnik](https://80lat.flt.krasnik.pl/index.php/gazeta-fabryczna/) **120 punktów**\n",
|
||||
"36. [Artykuły o Jujutsu](http://www.kobudo.pl/artykuly_jujutsu.html)\n",
|
||||
"37. [Wycinki o Taekwon-Do](https://www2.pztkd.lublin.pl/archpras.html#z1996)\n",
|
||||
"38. [Materiały o kolejnictwie](https://enkol.pl/Strona_g%C5%82%C3%B3wna) **180 punktów**\n",
|
||||
"39. [Centralny Instytut Ochrony Pracy](http://archiwum.ciop.pl/), znaleźć wszystkie publikacje typu <http://archiwum.ciop.pl/44938>, wymaga trochę sprytu **130 punktów**\n",
|
||||
"40. [Biblioteka Sejmowa - Zasoby Cyfrowe](https://biblioteka.sejm.gov.pl/zasoby_cyfrowe/), **200 punktów**\n",
|
||||
"41. [Elektronika Praktyczna](https://ep.com.pl/archiwum), te numery, które dostępne w otwarty sposób, np. rok 1993\n",
|
||||
"42. [Litewska Akademia Nauk](http://www.mab.lt/), tylko materiały w jęz. polskim, takie jak np.\n",
|
||||
" <https://elibrary.mab.lt/handle/1/840>, **170 punktów**\n",
|
||||
"43. [Litewska Biblioteka Cyfrowa](https://www.epaveldas.lt), wyłuskać tylko materiały w jęz. polskim, **190 punktów**\n",
|
||||
"44. [Czasopisma Geologiczne](https://geojournals.pgi.gov.pl), **120 punktów**\n",
|
||||
"45. [Czasopisma PTTK](https://www.czasopisma.centralnabibliotekapttk.pl/index.php?i3), **120 punktów**\n",
|
||||
"46. [Czasopisma Polskiego Towarzystwa Dendrologicznego](https://www.ptd.pl/?page_id=7), **100 punktów**\n",
|
||||
"47. [Kilka przedwojennych książek](https://dziemiela.com/documents.htm)\n",
|
||||
"48. [Historia polskiej informatyki](http://klio.spit.iq.pl/a4-wyroby-polskiej-informatyki/a4-2-sprzet/) - wyjątkowo bez datowania\n",
|
||||
"49. [Zeszyty Formacyjne Katolickiego Stowarzyszenia „Civitas Christania”](http://podkarpacki.civitaschristiana.pl/formacja/zeszyty-formacyjne/), tylko niektóre pliki można zdatować\n",
|
||||
"50. [Józef Piłsudski Institute of America](https://archiwa.pilsudski.org/) - **220 punktów**\n",
|
||||
"51. [Prasa podziemna — Częstochowa](http://www.podziemie.com.pl), również ulotki i inne materiały skanowane - **180 punktów**\n",
|
||||
"52. [Tajemnica Atari](http://krap.pl/mirrorz/atari/horror.mirage.com.pl/pixel/), plik ZIP z DjVu\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"### F.A.Q.\n",
|
||||
"\n",
|
||||
"**P: Nie działają strony z protokołem https, co zrobić?**\n",
|
||||
"\n",
|
||||
"O: Trzeba użyć modułu opartego na bibliotece curl. Paczka Ubuntu została zainstalowana na komputerach wydziałowych. Na\n",
|
||||
"swoim komputerze możemy zainstalować paczkę libcurl4-openssl-dev, a\n",
|
||||
"następnie można sobie ściągnąć wersję twilight-library opartą na libcurl:\n",
|
||||
"\n",
|
||||
" git fetch git://gonito.net/twilight-library withcurl\n",
|
||||
" git merge FETCH_HEAD\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
@ -1,5 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 1. <i>Wyszukiwarki wprowadzenie</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@ -234,11 +248,14 @@
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
@ -249,8 +266,11 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.5"
|
||||
}
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "1.Wyszukiwarki wprowadzenie[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
|
@ -1,5 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 2. <i>Wyszukiwarki roboty</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@ -262,21 +276,17 @@
|
||||
"67. [Instytut Techniki Górniczej - wycinki](http://www.komag.gliwice.pl/archiwum/historia-komag)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
@ -287,8 +297,11 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "2.Wyszukiwarki roboty[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
|
@ -1,5 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 3. <i>tfidf (1)</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@ -1065,21 +1079,17 @@
|
||||
"$|D|$ - ilość dokumentów w korpusie\n",
|
||||
"$|\\{d : t_i \\in d \\}|$ - ilość dokumentów w korpusie, gdzie dany term występuje chociaż jeden raz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
@ -1091,7 +1101,10 @@
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
}
|
||||
},
|
||||
"subtitle": "3.tfidf (1)[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
@ -1,5 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 3. <i>tfidf (1)</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
@ -46,11 +60,14 @@
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
@ -62,7 +79,10 @@
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
}
|
||||
},
|
||||
"subtitle": "3.tfidf (1)[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
@ -1,5 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 3. <i>tfidf (2)</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@ -434,217 +448,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"From: ray@netcom.com (Ray Fischer)\n",
|
||||
"Subject: Re: x86 ~= 680x0 ?? (How do they compare?)\n",
|
||||
"Organization: Netcom. San Jose, California\n",
|
||||
"Distribution: usa\n",
|
||||
"Lines: 36\n",
|
||||
"\n",
|
||||
"dhk@ubbpc.uucp (Dave Kitabjian) writes ...\n",
|
||||
">I'm sure Intel and Motorola are competing neck-and-neck for \n",
|
||||
">crunch-power, but for a given clock speed, how do we rank the\n",
|
||||
">following (from 1st to 6th):\n",
|
||||
"> 486\t\t68040\n",
|
||||
"> 386\t\t68030\n",
|
||||
"> 286\t\t68020\n",
|
||||
"\n",
|
||||
"040 486 030 386 020 286\n",
|
||||
"\n",
|
||||
">While you're at it, where will the following fit into the list:\n",
|
||||
"> 68060\n",
|
||||
"> Pentium\n",
|
||||
"> PowerPC\n",
|
||||
"\n",
|
||||
"060 fastest, then Pentium, with the first versions of the PowerPC\n",
|
||||
"somewhere in the vicinity.\n",
|
||||
"\n",
|
||||
">And about clock speed: Does doubling the clock speed double the\n",
|
||||
">overall processor speed? And fill in the __'s below:\n",
|
||||
"> 68030 @ __ MHz = 68040 @ __ MHz\n",
|
||||
"\n",
|
||||
"No. Computer speed is only partly dependent of processor/clock speed.\n",
|
||||
"Memory system speed play a large role as does video system speed and\n",
|
||||
"I/O speed. As processor clock rates go up, the speed of the memory\n",
|
||||
"system becomes the greatest factor in the overall system speed. If\n",
|
||||
"you have a 50MHz processor, it can be reading another word from memory\n",
|
||||
"every 20ns. Sure, you can put all 20ns memory in your computer, but\n",
|
||||
"it will cost 10 times as much as the slower 80ns SIMMs.\n",
|
||||
"\n",
|
||||
"And roughly, the 68040 is twice as fast at a given clock\n",
|
||||
"speed as is the 68030.\n",
|
||||
"\n",
|
||||
"-- \n",
|
||||
"Ray Fischer \"Convictions are more dangerous enemies of truth\n",
|
||||
"ray@netcom.com than lies.\" -- Friedrich Nietzsche\n",
|
||||
"\n",
|
||||
"0.4778416465020907\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"From: rvenkate@ux4.cso.uiuc.edu (Ravikuma Venkateswar)\n",
|
||||
"Subject: Re: x86 ~= 680x0 ?? (How do they compare?)\n",
|
||||
"Distribution: usa\n",
|
||||
"Organization: University of Illinois at Urbana\n",
|
||||
"Lines: 59\n",
|
||||
"\n",
|
||||
"ray@netcom.com (Ray Fischer) writes:\n",
|
||||
"\n",
|
||||
">dhk@ubbpc.uucp (Dave Kitabjian) writes ...\n",
|
||||
">>I'm sure Intel and Motorola are competing neck-and-neck for \n",
|
||||
">>crunch-power, but for a given clock speed, how do we rank the\n",
|
||||
">>following (from 1st to 6th):\n",
|
||||
">> 486\t\t68040\n",
|
||||
">> 386\t\t68030\n",
|
||||
">> 286\t\t68020\n",
|
||||
"\n",
|
||||
">040 486 030 386 020 286\n",
|
||||
"\n",
|
||||
"How about some numbers here? Some kind of benchmark?\n",
|
||||
"If you want, let me start it - 486DX2-66 - 32 SPECint92, 16 SPECfp92 .\n",
|
||||
"\n",
|
||||
">>While you're at it, where will the following fit into the list:\n",
|
||||
">> 68060\n",
|
||||
">> Pentium\n",
|
||||
">> PowerPC\n",
|
||||
"\n",
|
||||
">060 fastest, then Pentium, with the first versions of the PowerPC\n",
|
||||
">somewhere in the vicinity.\n",
|
||||
"\n",
|
||||
"Numbers? Pentium @66MHz - 65 SPECint92, 57 SPECfp92 .\n",
|
||||
"\t PowerPC @66MHz - 50 SPECint92, 80 SPECfp92 . (Note this is the 601)\n",
|
||||
" (Alpha @150MHz - 74 SPECint92,126 SPECfp92 - just for comparison)\n",
|
||||
"\n",
|
||||
">>And about clock speed: Does doubling the clock speed double the\n",
|
||||
">>overall processor speed? And fill in the __'s below:\n",
|
||||
">> 68030 @ __ MHz = 68040 @ __ MHz\n",
|
||||
"\n",
|
||||
">No. Computer speed is only partly dependent of processor/clock speed.\n",
|
||||
">Memory system speed play a large role as does video system speed and\n",
|
||||
">I/O speed. As processor clock rates go up, the speed of the memory\n",
|
||||
">system becomes the greatest factor in the overall system speed. If\n",
|
||||
">you have a 50MHz processor, it can be reading another word from memory\n",
|
||||
">every 20ns. Sure, you can put all 20ns memory in your computer, but\n",
|
||||
">it will cost 10 times as much as the slower 80ns SIMMs.\n",
|
||||
"\n",
|
||||
"Not in a clock-doubled system. There isn't a doubling in performance, but\n",
|
||||
"it _is_ quite significant. Maybe about a 70% increase in performance.\n",
|
||||
"\n",
|
||||
"Besides, for 0 wait state performance, you'd need a cache anyway. I mean,\n",
|
||||
"who uses a processor that runs at the speed of 80ns SIMMs? Note that this\n",
|
||||
"memory speed corresponds to a clock speed of 12.5 MHz.\n",
|
||||
"\n",
|
||||
">And roughly, the 68040 is twice as fast at a given clock\n",
|
||||
">speed as is the 68030.\n",
|
||||
"\n",
|
||||
"Numbers?\n",
|
||||
"\n",
|
||||
">-- \n",
|
||||
">Ray Fischer \"Convictions are more dangerous enemies of truth\n",
|
||||
">ray@netcom.com than lies.\" -- Friedrich Nietzsche\n",
|
||||
"-- \n",
|
||||
"Ravikumar Venkateswar\n",
|
||||
"rvenkate@uiuc.edu\n",
|
||||
"\n",
|
||||
"A pun is a no' blessed form of whit.\n",
|
||||
"\n",
|
||||
"0.44292082969477664\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"From: ray@netcom.com (Ray Fischer)\n",
|
||||
"Subject: Re: x86 ~= 680x0 ?? (How do they compare?)\n",
|
||||
"Organization: Netcom. San Jose, California\n",
|
||||
"Distribution: usa\n",
|
||||
"Lines: 30\n",
|
||||
"\n",
|
||||
"rvenkate@ux4.cso.uiuc.edu (Ravikuma Venkateswar) writes ...\n",
|
||||
">ray@netcom.com (Ray Fischer) writes:\n",
|
||||
">>040 486 030 386 020 286\n",
|
||||
">\n",
|
||||
">How about some numbers here? Some kind of benchmark?\n",
|
||||
"\n",
|
||||
"Benchmarks are for marketing dweebs and CPU envy. OK, if it will make\n",
|
||||
"you happy, the 486 is faster than the 040. BFD. Both architectures\n",
|
||||
"are nearing then end of their lifetimes. And especially with the x86\n",
|
||||
"architecture: good riddance.\n",
|
||||
"\n",
|
||||
">Besides, for 0 wait state performance, you'd need a cache anyway. I mean,\n",
|
||||
">who uses a processor that runs at the speed of 80ns SIMMs? Note that this\n",
|
||||
">memory speed corresponds to a clock speed of 12.5 MHz.\n",
|
||||
"\n",
|
||||
"The point being the processor speed is only one of many aspects of a\n",
|
||||
"computers performance. Clock speed, processor, memory speed, CPU\n",
|
||||
"architecture, I/O systems, even the application program all contribute \n",
|
||||
"to the overall system performance.\n",
|
||||
"\n",
|
||||
">>And roughly, the 68040 is twice as fast at a given clock\n",
|
||||
">>speed as is the 68030.\n",
|
||||
">\n",
|
||||
">Numbers?\n",
|
||||
"\n",
|
||||
"Look them up yourself.\n",
|
||||
"\n",
|
||||
"-- \n",
|
||||
"Ray Fischer \"Convictions are more dangerous enemies of truth\n",
|
||||
"ray@netcom.com than lies.\" -- Friedrich Nietzsche\n",
|
||||
"\n",
|
||||
"0.3491800997095306\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"From: mb4008@cehp11 (Morgan J Bullard)\n",
|
||||
"Subject: Re: speeding up windows\n",
|
||||
"Keywords: speed\n",
|
||||
"Organization: University of Illinois at Urbana\n",
|
||||
"Lines: 30\n",
|
||||
"\n",
|
||||
"djserian@flash.LakeheadU.Ca (Reincarnation of Elvis) writes:\n",
|
||||
"\n",
|
||||
">I have a 386/33 with 8 megs of memory\n",
|
||||
"\n",
|
||||
">I have noticed that lately when I use programs like WpfW or Corel Draw\n",
|
||||
">my computer \"boggs\" down and becomes really sluggish!\n",
|
||||
"\n",
|
||||
">What can I do to increase performance? What should I turn on or off\n",
|
||||
"\n",
|
||||
">Will not loading wallpapers or stuff like that help when it comes to\n",
|
||||
">the running speed of windows and the programs that run under it?\n",
|
||||
"\n",
|
||||
">Thanx in advance\n",
|
||||
"\n",
|
||||
">Derek\n",
|
||||
"\n",
|
||||
"1) make sure your hard drive is defragmented. This will speed up more than \n",
|
||||
" just windows BTW. Use something like Norton's or PC Tools.\n",
|
||||
"2) I _think_ that leaving the wall paper out will use less RAM and therefore\n",
|
||||
" will speed up your machine but I could very will be wrong on this.\n",
|
||||
"There's a good chance you've already done this but if not it may speed things\n",
|
||||
"up. good luck\n",
|
||||
"\t\t\t\tMorgan Bullard mb4008@coewl.cen.uiuc.edu\n",
|
||||
"\t\t\t\t\t or mjbb@uxa.cso.uiuc.edu\n",
|
||||
"\n",
|
||||
">--\n",
|
||||
">$_ /|$Derek J.P. Serianni $ E-Mail : djserian@flash.lakeheadu.ca $ \n",
|
||||
">$\\'o.O' $Sociologist $ It's 106 miles to Chicago,we've got a full tank$\n",
|
||||
">$=(___)=$Lakehead University $ of gas, half a pack of cigarettes,it's dark,and$\n",
|
||||
">$ U $Thunder Bay, Ontario$ we're wearing sunglasses. -Elwood Blues $ \n",
|
||||
"\n",
|
||||
"0.26949927393886913\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"----------------------------------------------------------------------------------------------------\n",
|
||||
"----------------------------------------------------------------------------------------------------\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i in range (1,5):\n",
|
||||
" print(newsgroups[similarities.argsort()[0][-i]])\n",
|
||||
@ -685,11 +493,14 @@
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
@ -701,7 +512,10 @@
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
}
|
||||
},
|
||||
"subtitle": "3.tfidf (2)[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
|
@ -1,5 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 4. <i>Wyszukiwarki</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@ -71,21 +85,17 @@
|
||||
" * proszę zaznaczyć w MS TEAMS, że Państwo zrobili zadanie w assignments\n",
|
||||
" * zdawanie zadania będzie na zajęciach. Proszę przygotować prezentację do 5 minut"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
@ -96,8 +106,11 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "4.wyszukiwarki[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
|
@ -1,5 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 5. <i>Ekstrakcja informacji z dokumentów</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@ -182,8 +196,8 @@
|
||||
"- stworzyć regułowy ekstraktor informacji (np. na podstawie wyrażeń regularnych)\n",
|
||||
"- wygenerować pliki `train/out.tsv`, `dev-0/out.tsv`, `test-A/out.tsv`\n",
|
||||
"- dodać do commita w swoim forku powyższe pliki wraz ze skryptem (skryptami), który pomógł je wygenerować. Skrypty powinny być w formie tekstowej (jeżeli to jupyter, to proszę przekonwertować do zwykłego pliku .py, np. jupyter nbconvert --to script a.ipynb)\n",
|
||||
"- wynik zaliczający zadanie to więcej niż 0.3 wg metryki F1 dla zbioru dev-0\n",
|
||||
"- punkty za zadanie: 40, a dla 5 osób które osiągną najwyższy wyniki dostaną 60 punktów zamiast 40\n",
|
||||
"- wynik zaliczający zadanie to więcej niż 0.1 wg metryki F1 dla zbioru test-A\n",
|
||||
"- punkty za zadanie: 40, a 5 osób, które osiągną najwyższe wyniki, dostanie 70 punktów zamiast 40\n",
|
||||
"- zadanie oddajemy do 27 kwietnia w MS TEAMS podając link do repozytorium. Proszę albo nadać użytkownikowi kubapok uprawnienia do przeglądania repozytorium, albo zrobić je publiczne"
|
||||
]
|
||||
},
|
||||
@ -213,11 +227,14 @@
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
@ -229,7 +246,10 @@
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
}
|
||||
},
|
||||
"subtitle": "5.Ekstrakcja informacji z dokumentów[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
|
892
cw/06_klasyfikacja.ipynb
Normal file
@ -0,0 +1,892 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 6. <i>Klasyfikacja</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Zajęcia klasyfikacja"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Zbiór kleister"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pathlib\n",
|
||||
"from collections import Counter\n",
|
||||
"from sklearn.metrics import *"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"KLEISTER_PATH = pathlib.Path('/home/kuba/Syncthing/przedmioty/2020-02/IE/applica/kleister-nda')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Pytanie\n",
|
||||
"\n",
|
||||
"Czy jurysdykcja musi być zapisana explicite w umowie?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_expected_jurisdiction(filepath):\n",
|
||||
" dataset_expected_jurisdiction = []\n",
|
||||
" with open(filepath,'r') as train_expected_file:\n",
|
||||
" for line in train_expected_file:\n",
|
||||
" key_values = line.rstrip('\\n').split(' ')\n",
|
||||
" jurisdiction = None\n",
|
||||
" for key_value in key_values:\n",
|
||||
" key, value = key_value.split('=')\n",
|
||||
" if key == 'jurisdiction':\n",
|
||||
" jurisdiction = value\n",
|
||||
" if jurisdiction is None:\n",
|
||||
" jurisdiction = 'NONE'\n",
|
||||
" dataset_expected_jurisdiction.append(jurisdiction)\n",
|
||||
" return dataset_expected_jurisdiction"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_expected_jurisdiction = get_expected_jurisdiction(KLEISTER_PATH/'train'/'expected.tsv')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dev_expected_jurisdiction = get_expected_jurisdiction(KLEISTER_PATH/'dev-0'/'expected.tsv')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"254"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(train_expected_jurisdiction)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"False"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"'NONE' in train_expected_jurisdiction"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"31"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(set(train_expected_jurisdiction))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Czy wszystkie stany muszą występować w zbiorze trenującym w zbiorze kleister?\n",
|
||||
"\n",
|
||||
"https://en.wikipedia.org/wiki/U.S._state\n",
|
||||
"\n",
|
||||
"### Jaki jest baseline?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_counter = Counter(train_expected_jurisdiction)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('New_York', 43),\n",
|
||||
" ('Delaware', 39),\n",
|
||||
" ('California', 32),\n",
|
||||
" ('Massachusetts', 15),\n",
|
||||
" ('Texas', 13),\n",
|
||||
" ('Illinois', 10),\n",
|
||||
" ('Oregon', 9),\n",
|
||||
" ('Florida', 9),\n",
|
||||
" ('Pennsylvania', 9),\n",
|
||||
" ('Missouri', 9),\n",
|
||||
" ('Ohio', 8),\n",
|
||||
" ('New_Jersey', 7),\n",
|
||||
" ('Georgia', 6),\n",
|
||||
" ('Indiana', 5),\n",
|
||||
" ('Nevada', 5),\n",
|
||||
" ('Colorado', 4),\n",
|
||||
" ('Virginia', 4),\n",
|
||||
" ('Washington', 4),\n",
|
||||
" ('Michigan', 3),\n",
|
||||
" ('Minnesota', 3),\n",
|
||||
" ('Connecticut', 2),\n",
|
||||
" ('Wisconsin', 2),\n",
|
||||
" ('Maine', 2),\n",
|
||||
" ('North_Carolina', 2),\n",
|
||||
" ('Kansas', 2),\n",
|
||||
" ('Utah', 2),\n",
|
||||
" ('Iowa', 1),\n",
|
||||
" ('Idaho', 1),\n",
|
||||
" ('South_Dakota', 1),\n",
|
||||
" ('South_Carolina', 1),\n",
|
||||
" ('Rhode_Island', 1)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train_counter.most_common(100)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"most_common_answer = train_counter.most_common(100)[0][0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'New_York'"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"most_common_answer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dev_predictions_jurisdiction = [most_common_answer] * len(dev_expected_jurisdiction)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dev_expected_jurisdiction"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"accuracy: 0.14457831325301204\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"counter = 0 \n",
|
||||
"for pred, exp in zip(dev_predictions_jurisdiction, dev_expected_jurisdiction):\n",
|
||||
" if pred == exp:\n",
|
||||
" counter +=1\n",
|
||||
"print('accuracy: ', counter/len(dev_predictions_jurisdiction))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.14457831325301204"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"accuracy_score(dev_predictions_jurisdiction, dev_expected_jurisdiction)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Co jeżeli nazwy klas nie występują explicite w zbiorach?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public\n",
|
||||
" \n",
|
||||
"https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"SPORT_PATH='/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia6_klasyfikacja/repos/sport-text-classification-ball'\n",
|
||||
"\n",
|
||||
"SPORT_TRAIN=$SPORT_PATH/train/train.tsv.gz\n",
|
||||
" \n",
|
||||
"SPORT_DEV_EXP=$SPORT_PATH/dev-0/expected.tsv"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### jaki jest baseline dla sport classification ball?\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"zcat $SPORT_TRAIN | awk '{print $1}' | wc -l"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"zcat $SPORT_TRAIN | awk '{print $1}' | grep 1 | wc -l"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"cat $SPORT_DEV_EXP | wc -l\n",
|
||||
"\n",
|
||||
"grep 1 $SPORT_DEV_EXP | wc -l"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Sprytne podejście do klasyfikacji tekstu? Naiwny Bayes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/kuba/anaconda3/lib/python3.8/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.\n",
|
||||
" warnings.warn(msg)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.datasets import fetch_20newsgroups\n",
|
||||
"# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
|
||||
"\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"import numpy as np\n",
|
||||
"import sklearn.metrics\n",
|
||||
"import gensim"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups = fetch_20newsgroups()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups_text = newsgroups['data']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups_text_tokenized = [list(set(gensim.utils.tokenize(x, lowercase = True))) for x in newsgroups_text]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"From: lerxst@wam.umd.edu (where's my thing)\n",
|
||||
"Subject: WHAT car is this!?\n",
|
||||
"Nntp-Posting-Host: rac3.wam.umd.edu\n",
|
||||
"Organization: University of Maryland, College Park\n",
|
||||
"Lines: 15\n",
|
||||
"\n",
|
||||
" I was wondering if anyone out there could enlighten me on this car I saw\n",
|
||||
"the other day. It was a 2-door sports car, looked to be from the late 60s/\n",
|
||||
"early 70s. It was called a Bricklin. The doors were really small. In addition,\n",
|
||||
"the front bumper was separate from the rest of the body. This is \n",
|
||||
"all I know. If anyone can tellme a model name, engine specs, years\n",
|
||||
"of production, where this car is made, history, or whatever info you\n",
|
||||
"have on this funky looking car, please e-mail.\n",
|
||||
"\n",
|
||||
"Thanks,\n",
|
||||
"- IL\n",
|
||||
" ---- brought to you by your neighborhood Lerxst ----\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(newsgroups_text[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['where', 'name', 'looked', 'to', 'have', 'out', 'on', 'by', 'park', 'what', 'from', 'host', 'doors', 'day', 'be', 'organization', 'e', 'front', 'in', 'it', 'history', 'brought', 'know', 'addition', 'il', 'of', 'lines', 'i', 'your', 'bumper', 'there', 'please', 'me', 'separate', 'is', 'tellme', 'can', 'could', 'called', 'specs', 'college', 'this', 'thanks', 'looking', 'if', 'production', 'sports', 'lerxst', 'whatever', 'anyone', 'enlighten', 'saw', 'all', 'small', 'you', 'wam', 'mail', 'rest', 's', 'late', 'rac', 'funky', 'edu', 'info', 'the', 'wondering', 'years', 'door', 'posting', 'car', 'made', 'or', 'maryland', 'subject', 'bricklin', 'was', 'model', 'thing', 'university', 'engine', 'nntp', 'other', 'really', 'neighborhood', 'early', 'a', 'umd', 'my', 'body', 'were']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(newsgroups_text_tokenized[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y = newsgroups['target']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([7, 4, 4, ..., 3, 1, 8])"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"Y"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y_names = newsgroups['target_names']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['alt.atheism',\n",
|
||||
" 'comp.graphics',\n",
|
||||
" 'comp.os.ms-windows.misc',\n",
|
||||
" 'comp.sys.ibm.pc.hardware',\n",
|
||||
" 'comp.sys.mac.hardware',\n",
|
||||
" 'comp.windows.x',\n",
|
||||
" 'misc.forsale',\n",
|
||||
" 'rec.autos',\n",
|
||||
" 'rec.motorcycles',\n",
|
||||
" 'rec.sport.baseball',\n",
|
||||
" 'rec.sport.hockey',\n",
|
||||
" 'sci.crypt',\n",
|
||||
" 'sci.electronics',\n",
|
||||
" 'sci.med',\n",
|
||||
" 'sci.space',\n",
|
||||
" 'soc.religion.christian',\n",
|
||||
" 'talk.politics.guns',\n",
|
||||
" 'talk.politics.mideast',\n",
|
||||
" 'talk.politics.misc',\n",
|
||||
" 'talk.religion.misc']"
|
||||
]
|
||||
},
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"Y_names"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'talk.politics.guns'"
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"Y_names[16]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"$P('talk.politics.guns' | 'gun')= ?$ \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"$P(A|B) * P(B) = P(B|A) * P(A)$\n",
|
||||
"\n",
|
||||
"$P(A|B) = \\frac{P(B|A) * P(A)}{P(B)}$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"$P('talk.politics.guns' | 'gun') * P('gun') = P('gun'|'talk.politics.guns') * P('talk.politics.guns')$\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"$P('talk.politics.guns' | 'gun') = \\frac{P('gun'|'talk.politics.guns') * P('talk.politics.guns')}{P('gun')}$\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"$p1 = P('gun'|'talk.politics.guns')$\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"$p2 = P('talk.politics.guns')$\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"$p3 = P('gun')$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## obliczanie $p1 = P('gun'|'talk.politics.guns')$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# samodzielne wykonanie"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## obliczanie $p2 = P('talk.politics.guns')$\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# samodzielne wykonanie"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## obliczanie $p3 = P('gun')$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# samodzielne wykonanie"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## ostatecznie"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'p1' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"\u001b[0;32m<ipython-input-31-447f586cc09f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0mp1\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mp2\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mp3\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m: name 'p1' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"(p1 * p2) / p3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_prob(index ):\n",
|
||||
" talks_topic = [x for x,y in zip(newsgroups_text_tokenized,Y) if y == index]\n",
|
||||
"\n",
|
||||
" len([x for x in talks_topic if 'gun' in x])\n",
|
||||
"\n",
|
||||
" if len(talks_topic) == 0:\n",
|
||||
" return 0.0\n",
|
||||
" p1 = len([x for x in talks_topic if 'gun' in x]) / len(talks_topic)\n",
|
||||
" p2 = len(talks_topic) / len(Y)\n",
|
||||
" p3 = len([x for x in newsgroups_text_tokenized if 'gun' in x]) / len(Y)\n",
|
||||
"\n",
|
||||
" if p3 == 0:\n",
|
||||
" return 0.0\n",
|
||||
" else: \n",
|
||||
" return (p1 * p2)/ p3\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0.01622 \t\t alt.atheism\n",
|
||||
"0.00000 \t\t comp.graphics\n",
|
||||
"0.00541 \t\t comp.os.ms-windows.misc\n",
|
||||
"0.01892 \t\t comp.sys.ibm.pc.hardware\n",
|
||||
"0.00270 \t\t comp.sys.mac.hardware\n",
|
||||
"0.00000 \t\t comp.windows.x\n",
|
||||
"0.01351 \t\t misc.forsale\n",
|
||||
"0.04054 \t\t rec.autos\n",
|
||||
"0.01892 \t\t rec.motorcycles\n",
|
||||
"0.00270 \t\t rec.sport.baseball\n",
|
||||
"0.00541 \t\t rec.sport.hockey\n",
|
||||
"0.03784 \t\t sci.crypt\n",
|
||||
"0.02973 \t\t sci.electronics\n",
|
||||
"0.00541 \t\t sci.med\n",
|
||||
"0.01622 \t\t sci.space\n",
|
||||
"0.00270 \t\t soc.religion.christian\n",
|
||||
"0.68378 \t\t talk.politics.guns\n",
|
||||
"0.04595 \t\t talk.politics.mideast\n",
|
||||
"0.03784 \t\t talk.politics.misc\n",
|
||||
"0.01622 \t\t talk.religion.misc\n",
|
||||
"1.00000 \t\tsuma\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"probs = []\n",
|
||||
"for i in range(len(Y_names)):\n",
|
||||
" probs.append(get_prob(i))\n",
|
||||
" print(\"%.5f\" % get_prob(i),'\\t\\t', Y_names[i])\n",
|
||||
" \n",
|
||||
"print(\"%.5f\" % sum(probs), '\\t\\tsuma',)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### zadanie samodzielne"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_prob2(index, word ):\n",
|
||||
" pass"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# listing dla get_prob2, słowo 'god'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## założenie naiwnego bayesa"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"$P(class | word1, word2, word3) = \\frac{P(word1, word2, word3|class) * P(class)}{P(word1, word2, word3)}$\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**przy założeniu o niezależności zmiennych losowych $word1$, $word2$, $word3$**:\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"$P(word1, word2, word3|class) = P(word1|class)* P(word2|class) * P(word3|class)$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**ostatecznie:**\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"$P(class | word1, word2, word3) = \\frac{P(word1|class)* P(word2|class) * P(word3|class) * P(class)}{\\sum_k{P(word1|class_k)* P(word2|class_k) * P(word3|class_k) * P(class_k)}}$\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## zadania domowe naiwny bayes1 ręcznie"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"- analogicznie zaimplementować funkcję get_prob3(index, document_tokenized), argument document_tokenized ma być zbiorem słów dokumentu. funkcja ma być naiwnym klasyfikatorem bayesowskim (w przypadku wielu słów)\n",
|
||||
"- odpalić powyższy listing prawdopodobieństw z funkcją get_prob3 dla dokumentów: {'i','love','guns'} oraz {'is','there','life','after'\n",
|
||||
",'death'}\n",
|
||||
"- zadanie proszę zrobić w jupyterze, wygenerować pdf (kod + wyniki odpalenia) i umieścić go jako zadanie w teams\n",
|
||||
"- termin 12.05, punktów: 40\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"## zadania domowe naiwny bayes2 gotowa biblioteka"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"- wybrać jedno z poniższych repozytoriów i je sforkować:\n",
|
||||
" - https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public\n",
|
||||
" - https://git.wmi.amu.edu.pl/kubapok/sport-text-classification-ball-ISI-public\n",
|
||||
"- stworzyć klasyfikator bazujący na naiwnym Bayesie (może być gotowa biblioteka), może też korzystać z gotowych implementacji tfidf\n",
|
||||
"- stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv\n",
|
||||
"- wynik accuracy sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.67\n",
|
||||
"- proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo\n",
|
||||
"- termin 12.05, 40 punktów\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "6.Klasyfikacja[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
1029
cw/06_klasyfikacja_ODPOWIEDZI.ipynb
Normal file
1089
cw/07_regresja_liniowa.ipynb
Normal file
1397
cw/07_regresja_liniowa_ODPOWIEDZI.ipynb
Normal file
1058
cw/08_regresja_logistyczna.ipynb
Normal file
1250
cw/08_regresja_logistyczna_ODPOWIEDZI.ipynb
Normal file
850
cw/09_sequence_labeling.ipynb
Normal file
@ -0,0 +1,850 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 9. <i>Sequence labeling</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Klasyfikacja wieloklasowa i sequence labelling"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import gensim\n",
|
||||
"import torch\n",
|
||||
"import pandas as pd\n",
|
||||
"import seaborn as sns\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"from torchtext.vocab import Vocab\n",
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"from sklearn.datasets import fetch_20newsgroups\n",
|
||||
"# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
|
||||
"\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"from sklearn.metrics import accuracy_score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Klasyfikacja"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Klasfikacja binarna- 2 klasy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"CATEGORIES = ['soc.religion.christian', 'alt.atheism']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups = fetch_20newsgroups(categories=CATEGORIES)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X = newsgroups['data']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y = newsgroups['target']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y_names = newsgroups['target_names']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X[0:1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y_names"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"del CATEGORIES, newsgroups, X, Y, Y_names"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### klasyfikacja wieloklasowa"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups_train_dev = fetch_20newsgroups(subset = 'train')\n",
|
||||
"newsgroups_test = fetch_20newsgroups(subset = 'test')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups_train_dev_text = newsgroups_train_dev['data']\n",
|
||||
"newsgroups_test_text = newsgroups_test['data']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y_train_dev = newsgroups_train_dev['target']\n",
|
||||
"Y_test = newsgroups_test['target']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups_train_text, newsgroups_dev_text, Y_train, Y_dev = train_test_split(newsgroups_train_dev_text, Y_train_dev, random_state=42)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y_names = newsgroups_train_dev['target_names']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y_train_dev"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y_names"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Jaki baseline?**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.value_counts(Y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"accuracy_score(Y_test, np.ones_like(Y_test) * 10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"**Pytanie** - w jaki sposób stworzyć taki klasyfikator na podstawie tylko wiedzy z poprzednich ćwiczeń?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Zadanie - stworzyć klasyfikator regresji logistycznej one vs rest na podstawie tfdif. TFIDF powinien mieć słownik o wielkości 10000\n",
|
||||
"\n",
|
||||
"https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html\n",
|
||||
"https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n",
|
||||
"https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.multiclass import OneVsRestClassifier\n",
|
||||
"from sklearn.linear_model import LogisticRegression\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Podejście softmax na tfidif"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Zadanie** Na podstawie poprzednich zajęć stworzyć sieć w pytorch bez warstw ukrytych, z jedną warstwą *output* z funkcją softmax (bez trenowania i ewaluacji sieci)\n",
|
||||
"\n",
|
||||
"Użyć https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html?highlight=softmax"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_train"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class NeuralNetworkModel(torch.nn.Module):\n",
|
||||
"\n",
|
||||
" pass"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"OUTPUT_SIZE = len(Y_names)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"nn_model = NeuralNetworkModel(FEAUTERES, OUTPUT_SIZE)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"nn_model(torch.Tensor(X_train[0:3].astype(np.float32).todense()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"BATCH_SIZE = 5"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"criterion = torch.nn.NLLLoss()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"optimizer = torch.optim.SGD(nn_model.parameters(), lr = 0.2)\n",
|
||||
"#optimizer = torch.optim.Adam(nn_model.parameters())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_loss_acc(model, X_dataset, Y_dataset):\n",
|
||||
" loss_score = 0\n",
|
||||
" acc_score = 0\n",
|
||||
" items_total = 0\n",
|
||||
" model.eval()\n",
|
||||
" for i in range(0, Y_dataset.shape[0], BATCH_SIZE):\n",
|
||||
" X = X_dataset[i:i+BATCH_SIZE]\n",
|
||||
" X = torch.tensor(X.astype(np.float32).todense())\n",
|
||||
" Y = Y_dataset[i:i+BATCH_SIZE]\n",
|
||||
" Y = torch.tensor(Y)\n",
|
||||
" Y_predictions = model(X)\n",
|
||||
" acc_score += torch.sum(torch.argmax(Y_predictions,dim=1) == Y).item()\n",
|
||||
" items_total += Y.shape[0] \n",
|
||||
"\n",
|
||||
" loss = criterion(Y_predictions, Y)\n",
|
||||
"\n",
|
||||
" loss_score += loss.item() * Y.shape[0] \n",
|
||||
" return (loss_score / items_total), (acc_score / items_total)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for epoch in range(5):\n",
|
||||
" loss_score = 0\n",
|
||||
" acc_score = 0\n",
|
||||
" items_total = 0\n",
|
||||
" nn_model.train()\n",
|
||||
" for i in range(0, Y_train.shape[0], BATCH_SIZE):\n",
|
||||
" X = X_train[i:i+BATCH_SIZE]\n",
|
||||
" X = torch.tensor(X.astype(np.float32).todense())\n",
|
||||
" Y = Y_train[i:i+BATCH_SIZE]\n",
|
||||
"\n",
|
||||
" Y = torch.tensor(Y)\n",
|
||||
" Y_predictions = nn_model(X)\n",
|
||||
" acc_score += torch.sum(torch.argmax(Y_predictions,dim=1) == Y).item()\n",
|
||||
" items_total += Y.shape[0] \n",
|
||||
"\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss = criterion(Y_predictions, Y)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" loss_score += loss.item() * Y.shape[0]\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" display(epoch)\n",
|
||||
" display(get_loss_acc(nn_model, X_train, Y_train))\n",
|
||||
" display(get_loss_acc(nn_model, X_dev, Y_dev))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups_train_text"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Podejście softmax z embeddingami na przykładzie NER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# !pip install torchtext\n",
|
||||
"# !pip install datasets"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://www.aclweb.org/anthology/W03-0419.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset = load_dataset(\"conll2003\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def build_vocab(dataset):\n",
|
||||
" counter = Counter()\n",
|
||||
" for document in dataset:\n",
|
||||
" counter.update(document)\n",
|
||||
" return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vocab = build_vocab(dataset['train']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset['train']['tokens']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(vocab.itos)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vocab['on']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def data_process(dt):\n",
|
||||
" return [ torch.tensor([vocab['<bos>']] +[vocab[token] for token in document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def labels_process(dt):\n",
|
||||
" return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_tokens_ids = data_process(dataset['train']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_tokens_ids = data_process(dataset['test']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_labels = labels_process(dataset['train']['ner_tags'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_labels = labels_process(dataset['test']['ner_tags'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_tokens_ids[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"max([max(x) for x in dataset['train']['ner_tags'] ])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class NERModel(torch.nn.Module):\n",
|
||||
"\n",
|
||||
" def __init__(self,):\n",
|
||||
" super(NERModel, self).__init__()\n",
|
||||
" self.emb = torch.nn.Embedding(23627,200)\n",
|
||||
" self.fc1 = torch.nn.Linear(600,9)\n",
|
||||
" #self.softmax = torch.nn.Softmax(dim=0)\n",
|
||||
" # nie trzeba, bo używamy https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html\n",
|
||||
" # jako kryterium\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" x = self.emb(x)\n",
|
||||
" x = x.reshape(600) \n",
|
||||
" x = self.fc1(x)\n",
|
||||
" #x = self.softmax(x)\n",
|
||||
" return x"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_tokens_ids[0][1:4]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ner_model = NERModel()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ner_model(train_tokens_ids[0][1:4])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"criterion = torch.nn.CrossEntropyLoss()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"optimizer = torch.optim.Adam(ner_model.parameters())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(train_labels)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for epoch in range(2):\n",
|
||||
" loss_score = 0\n",
|
||||
" acc_score = 0\n",
|
||||
" prec_score = 0\n",
|
||||
" selected_items = 0\n",
|
||||
" recall_score = 0\n",
|
||||
" relevant_items = 0\n",
|
||||
" items_total = 0\n",
|
||||
" nn_model.train()\n",
|
||||
" #for i in range(len(train_labels)):\n",
|
||||
" for i in range(100):\n",
|
||||
" for j in range(1, len(train_labels[i]) - 1):\n",
|
||||
" \n",
|
||||
" X = train_tokens_ids[i][j-1: j+2]\n",
|
||||
" Y = train_labels[i][j: j+1]\n",
|
||||
"\n",
|
||||
" Y_predictions = ner_model(X)\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" acc_score += int(torch.argmax(Y_predictions) == Y)\n",
|
||||
" \n",
|
||||
" if torch.argmax(Y_predictions) != 0:\n",
|
||||
" selected_items +=1\n",
|
||||
" if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():\n",
|
||||
" prec_score += 1\n",
|
||||
" \n",
|
||||
" if Y.item() != 0:\n",
|
||||
" relevant_items +=1\n",
|
||||
" if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():\n",
|
||||
" recall_score += 1\n",
|
||||
" \n",
|
||||
" items_total += 1\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss = criterion(Y_predictions.unsqueeze(0), Y)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" loss_score += loss.item() \n",
|
||||
" \n",
|
||||
" precision = prec_score / selected_items\n",
|
||||
" recall = recall_score / relevant_items\n",
|
||||
" f1_score = (2*precision * recall) / (precision + recall)\n",
|
||||
" display('epoch: ', epoch)\n",
|
||||
" display('loss: ', loss_score / items_total)\n",
|
||||
" display('acc: ', acc_score / items_total)\n",
|
||||
" display('prec: ', precision)\n",
|
||||
" display('recall: : ', recall)\n",
|
||||
" display('f1: ', f1_score)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loss_score = 0\n",
|
||||
"acc_score = 0\n",
|
||||
"prec_score = 0\n",
|
||||
"selected_items = 0\n",
|
||||
"recall_score = 0\n",
|
||||
"relevant_items = 0\n",
|
||||
"items_total = 0\n",
|
||||
"nn_model.eval()\n",
|
||||
"for i in range(100):\n",
|
||||
"#for i in range(len(test_labels)):\n",
|
||||
" for j in range(1, len(test_labels[i]) - 1):\n",
|
||||
"\n",
|
||||
" X = test_tokens_ids[i][j-1: j+2]\n",
|
||||
" Y = test_labels[i][j: j+1]\n",
|
||||
"\n",
|
||||
" Y_predictions = ner_model(X)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" acc_score += int(torch.argmax(Y_predictions) == Y)\n",
|
||||
"\n",
|
||||
" if torch.argmax(Y_predictions) != 0:\n",
|
||||
" selected_items +=1\n",
|
||||
" if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():\n",
|
||||
" prec_score += 1\n",
|
||||
"\n",
|
||||
" if Y.item() != 0:\n",
|
||||
" relevant_items +=1\n",
|
||||
" if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():\n",
|
||||
" recall_score += 1\n",
|
||||
"\n",
|
||||
" items_total += 1\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" loss = criterion(Y_predictions.unsqueeze(0), Y)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" loss_score += loss.item() \n",
|
||||
"\n",
|
||||
"precision = prec_score / selected_items\n",
|
||||
"recall = recall_score / relevant_items\n",
|
||||
"f1_score = (2*precision * recall) / (precision + recall)\n",
|
||||
"display('loss: ', loss_score / items_total)\n",
|
||||
"display('acc: ', acc_score / items_total)\n",
|
||||
"display('prec: ', precision)\n",
|
||||
"display('recall: : ', recall)\n",
|
||||
"display('f1: ', f1_score)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Zadanie domowe\n",
|
||||
"\n",
|
||||
"- sklonować repozytorium https://git.wmi.amu.edu.pl/kubapok/en-ner-conll-2003\n",
|
||||
"- stworzyć klasyfikator bazujący na sieci neuronowej feed forward w pytorchu (można bazować na tym jupyterze lub nie).\n",
|
||||
"- klasyfikator powinien obejmować dodatkowe cechy (np. długość wyrazu, czy wyraz zaczyna się od wielkiej litery, stemmming słowa, czy zawiera cyfrę)\n",
|
||||
"- stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv\n",
|
||||
"- wynik fscore sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić conajmniej 0.60\n",
|
||||
"- proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo\n",
|
||||
"termin 08.06, 80 punktów\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "9.Sequence labeling[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
951
cw/09_sequence_labeling_ODPOWIEDZI.ipynb
Normal file
@ -0,0 +1,951 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 9. <i>Sequence labeling</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Klasyfikacja wieloklasowa i sequence labelling"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import gensim\n",
|
||||
"import torch\n",
|
||||
"import pandas as pd\n",
|
||||
"import seaborn as sns\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"from torchtext.vocab import Vocab\n",
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"from sklearn.datasets import fetch_20newsgroups\n",
|
||||
"# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
|
||||
"\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"from sklearn.metrics import accuracy_score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Klasyfikacja"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Klasfikacja binarna- 2 klasy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"CATEGORIES = ['soc.religion.christian', 'alt.atheism']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups = fetch_20newsgroups(categories=CATEGORIES)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X = newsgroups['data']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y = newsgroups['target']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y_names = newsgroups['target_names']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X[0:1]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y_names"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"del CATEGORIES, newsgroups, X, Y, Y_names"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### klasyfikacja wieloklasowa"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups_train_dev = fetch_20newsgroups(subset = 'train')\n",
|
||||
"newsgroups_test = fetch_20newsgroups(subset = 'test')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups_train_dev_text = newsgroups_train_dev['data']\n",
|
||||
"newsgroups_test_text = newsgroups_test['data']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y_train_dev = newsgroups_train_dev['target']\n",
|
||||
"Y_test = newsgroups_test['target']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups_train_text, newsgroups_dev_text, Y_train, Y_dev = train_test_split(newsgroups_train_dev_text, Y_train_dev, random_state=42)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y_names = newsgroups_train_dev['target_names']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y_train_dev"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"Y_names"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Jaki baseline?**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"pd.value_counts(Y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"accuracy_score(Y_test, np.ones_like(Y_test) * 10)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"**Pytanie** - w jaki sposób stworzyć taki klasyfikator na podstawie tylko wiedzy z poprzednich ćwiczeń?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Zadanie - stworzyć klasyfikator regresji logistycznej one vs rest na podstawie tfdif. TFIDF powinien mieć słownik o wielkości 10000\n",
|
||||
"\n",
|
||||
"https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html\n",
|
||||
"https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n",
|
||||
"https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.multiclass import OneVsRestClassifier\n",
|
||||
"from sklearn.linear_model import LogisticRegression\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"FEAUTERES = 10_000"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vectorizer = TfidfVectorizer(max_features=FEAUTERES)\n",
|
||||
"X_train = vectorizer.fit_transform(newsgroups_train_text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_dev = vectorizer.transform(newsgroups_dev_text)\n",
|
||||
"X_test = vectorizer.transform(newsgroups_test_text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"clf = OneVsRestClassifier(LogisticRegression()).fit(X_train, Y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"clf.predict(X_train[0:1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"clf.predict_proba(X_train[0:1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"np.max(clf.predict_proba(X_train[0]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"accuracy_score(clf.predict(X_train), Y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"accuracy_score(clf.predict(X_dev), Y_dev)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"accuracy_score(clf.predict(X_test), Y_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Podejście softmax na tfidf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Zadanie** Na podstawie poprzednich zajęć stworzyć sieć w pytorch bez warstw ukrytych, z jedną warstwą *output* z funkcją softmax (bez trenowania i ewaluacji sieci)\n",
|
||||
"\n",
|
||||
"Użyć https://pytorch.org/docs/stable/generated/torch.nn.Softmax.html?highlight=softmax"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_train"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class NeuralNetworkModel(torch.nn.Module):\n",
|
||||
"\n",
|
||||
" def __init__(self,FEAUTERES, output_size):\n",
|
||||
" super(NeuralNetworkModel, self).__init__()\n",
|
||||
" self.fc1 = torch.nn.Linear(FEAUTERES,OUTPUT_SIZE)\n",
|
||||
" self.softmax = torch.nn.Softmax(dim=0)\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" x = self.fc1(x)\n",
|
||||
" x = self.softmax(x)\n",
|
||||
" return x"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"OUTPUT_SIZE = len(Y_names)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"nn_model = NeuralNetworkModel(FEAUTERES, OUTPUT_SIZE)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"nn_model(torch.Tensor(X_train[0:3].astype(np.float32).todense()))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"BATCH_SIZE = 5"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"criterion = torch.nn.NLLLoss()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"optimizer = torch.optim.SGD(nn_model.parameters(), lr = 0.2)\n",
|
||||
"#optimizer = torch.optim.Adam(nn_model.parameters())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_loss_acc(model, X_dataset, Y_dataset):\n",
|
||||
" loss_score = 0\n",
|
||||
" acc_score = 0\n",
|
||||
" items_total = 0\n",
|
||||
" model.eval()\n",
|
||||
" for i in range(0, Y_dataset.shape[0], BATCH_SIZE):\n",
|
||||
" X = X_dataset[i:i+BATCH_SIZE]\n",
|
||||
" X = torch.tensor(X.astype(np.float32).todense())\n",
|
||||
" Y = Y_dataset[i:i+BATCH_SIZE]\n",
|
||||
" Y = torch.tensor(Y)\n",
|
||||
" Y_predictions = model(X)\n",
|
||||
" acc_score += torch.sum(torch.argmax(Y_predictions,dim=1) == Y).item()\n",
|
||||
" items_total += Y.shape[0] \n",
|
||||
"\n",
|
||||
" loss = criterion(Y_predictions, Y)\n",
|
||||
"\n",
|
||||
" loss_score += loss.item() * Y.shape[0] \n",
|
||||
" return (loss_score / items_total), (acc_score / items_total)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for epoch in range(5):\n",
|
||||
" loss_score = 0\n",
|
||||
" acc_score = 0\n",
|
||||
" items_total = 0\n",
|
||||
" nn_model.train()\n",
|
||||
" for i in range(0, Y_train.shape[0], BATCH_SIZE):\n",
|
||||
" X = X_train[i:i+BATCH_SIZE]\n",
|
||||
" X = torch.tensor(X.astype(np.float32).todense())\n",
|
||||
" Y = Y_train[i:i+BATCH_SIZE]\n",
|
||||
"\n",
|
||||
" Y = torch.tensor(Y)\n",
|
||||
" Y_predictions = nn_model(X)\n",
|
||||
" acc_score += torch.sum(torch.argmax(Y_predictions,dim=1) == Y).item()\n",
|
||||
" items_total += Y.shape[0] \n",
|
||||
"\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss = criterion(Y_predictions, Y)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" loss_score += loss.item() * Y.shape[0]\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" display(epoch)\n",
|
||||
" display(get_loss_acc(nn_model, X_train, Y_train))\n",
|
||||
" display(get_loss_acc(nn_model, X_dev, Y_dev))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"newsgroups_train_text"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Podejście softmax z embeddingami na przykładzie NER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# !pip install torchtext\n",
|
||||
"# !pip install datasets"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://www.aclweb.org/anthology/W03-0419.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset = load_dataset(\"conll2003\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def build_vocab(dataset):\n",
|
||||
" counter = Counter()\n",
|
||||
" for document in dataset:\n",
|
||||
" counter.update(document)\n",
|
||||
" return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vocab = build_vocab(dataset['train']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset['train']['tokens']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(vocab.itos)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vocab['on']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def data_process(dt):\n",
|
||||
" return [ torch.tensor([vocab['<bos>']] +[vocab[token] for token in document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def labels_process(dt):\n",
|
||||
" return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_tokens_ids = data_process(dataset['train']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_tokens_ids = data_process(dataset['test']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_labels = labels_process(dataset['train']['ner_tags'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_labels = labels_process(dataset['test']['ner_tags'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_tokens_ids[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"max([max(x) for x in dataset['train']['ner_tags'] ])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class NERModel(torch.nn.Module):\n",
|
||||
"\n",
|
||||
" def __init__(self,):\n",
|
||||
" super(NERModel, self).__init__()\n",
|
||||
" self.emb = torch.nn.Embedding(23627,200)\n",
|
||||
" self.fc1 = torch.nn.Linear(600,9)\n",
|
||||
" #self.softmax = torch.nn.Softmax(dim=0)\n",
|
||||
" # nie trzeba, bo używamy https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html\n",
|
||||
" # jako kryterium\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" x = self.emb(x)\n",
|
||||
" x = x.reshape(600) \n",
|
||||
" x = self.fc1(x)\n",
|
||||
" #x = self.softmax(x)\n",
|
||||
" return x"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_tokens_ids[0][1:4]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ner_model = NERModel()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ner_model(train_tokens_ids[0][1:4])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"criterion = torch.nn.CrossEntropyLoss()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"optimizer = torch.optim.Adam(ner_model.parameters())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(train_labels)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for epoch in range(2):\n",
|
||||
" loss_score = 0\n",
|
||||
" acc_score = 0\n",
|
||||
" prec_score = 0\n",
|
||||
" selected_items = 0\n",
|
||||
" recall_score = 0\n",
|
||||
" relevant_items = 0\n",
|
||||
" items_total = 0\n",
|
||||
" nn_model.train()\n",
|
||||
" #for i in range(len(train_labels)):\n",
|
||||
" for i in range(100):\n",
|
||||
" for j in range(1, len(train_labels[i]) - 1):\n",
|
||||
" \n",
|
||||
" X = train_tokens_ids[i][j-1: j+2]\n",
|
||||
" Y = train_labels[i][j: j+1]\n",
|
||||
"\n",
|
||||
" Y_predictions = ner_model(X)\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" acc_score += int(torch.argmax(Y_predictions) == Y)\n",
|
||||
" \n",
|
||||
" if torch.argmax(Y_predictions) != 0:\n",
|
||||
" selected_items +=1\n",
|
||||
" if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():\n",
|
||||
" prec_score += 1\n",
|
||||
" \n",
|
||||
" if Y.item() != 0:\n",
|
||||
" relevant_items +=1\n",
|
||||
" if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():\n",
|
||||
" recall_score += 1\n",
|
||||
" \n",
|
||||
" items_total += 1\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss = criterion(Y_predictions.unsqueeze(0), Y)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" loss_score += loss.item() \n",
|
||||
" \n",
|
||||
" precision = prec_score / selected_items\n",
|
||||
" recall = recall_score / relevant_items\n",
|
||||
" f1_score = (2*precision * recall) / (precision + recall)\n",
|
||||
" display('epoch: ', epoch)\n",
|
||||
" display('loss: ', loss_score / items_total)\n",
|
||||
" display('acc: ', acc_score / items_total)\n",
|
||||
" display('prec: ', precision)\n",
|
||||
" display('recall: : ', recall)\n",
|
||||
" display('f1: ', f1_score)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loss_score = 0\n",
|
||||
"acc_score = 0\n",
|
||||
"prec_score = 0\n",
|
||||
"selected_items = 0\n",
|
||||
"recall_score = 0\n",
|
||||
"relevant_items = 0\n",
|
||||
"items_total = 0\n",
|
||||
"nn_model.eval()\n",
|
||||
"for i in range(100):\n",
|
||||
"#for i in range(len(test_labels)):\n",
|
||||
" for j in range(1, len(test_labels[i]) - 1):\n",
|
||||
"\n",
|
||||
" X = test_tokens_ids[i][j-1: j+2]\n",
|
||||
" Y = test_labels[i][j: j+1]\n",
|
||||
"\n",
|
||||
" Y_predictions = ner_model(X)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" acc_score += int(torch.argmax(Y_predictions) == Y)\n",
|
||||
"\n",
|
||||
" if torch.argmax(Y_predictions) != 0:\n",
|
||||
" selected_items +=1\n",
|
||||
" if torch.argmax(Y_predictions) != 0 and torch.argmax(Y_predictions) == Y.item():\n",
|
||||
" prec_score += 1\n",
|
||||
"\n",
|
||||
" if Y.item() != 0:\n",
|
||||
" relevant_items +=1\n",
|
||||
" if Y.item() != 0 and torch.argmax(Y_predictions) == Y.item():\n",
|
||||
" recall_score += 1\n",
|
||||
"\n",
|
||||
" items_total += 1\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" loss = criterion(Y_predictions.unsqueeze(0), Y)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" loss_score += loss.item() \n",
|
||||
"\n",
|
||||
"precision = prec_score / selected_items\n",
|
||||
"recall = recall_score / relevant_items\n",
|
||||
"f1_score = (2*precision * recall) / (precision + recall)\n",
|
||||
"display('loss: ', loss_score / items_total)\n",
|
||||
"display('acc: ', acc_score / items_total)\n",
|
||||
"display('prec: ', precision)\n",
|
||||
"display('recall: : ', recall)\n",
|
||||
"display('f1: ', f1_score)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Zadanie domowe\n",
|
||||
"\n",
|
||||
"- sklonować repozytorium https://git.wmi.amu.edu.pl/kubapok/en-ner-conll-2003\n",
|
||||
"- stworzyć klasyfikator bazujący na sieci neuronowej feed forward w pytorchu (można bazować na tym jupyterze lub nie).\n",
|
||||
"- klasyfikator powinien obejmować dodatkowe cechy (np. długość wyrazu, czy wyraz zaczyna się od wielkiej litery, stemming słowa, czy zawiera cyfrę)\n",
|
||||
"- stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv\n",
|
||||
"- wynik fscore sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.60\n",
|
||||
"- proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo\n",
|
||||
"\ntermin 08.06, 80 punktów\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "9.Sequence labeling[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
447
cw/10_CRF.ipynb
Normal file
@ -0,0 +1,447 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 10. <i>CRF</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Podejście softmax z embeddingami na przykładzie NER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"source": [
|
||||
"https://pytorch-crf.readthedocs.io/en/stable/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://www.aclweb.org/anthology/W03-0419.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import gensim\n",
|
||||
"import torch\n",
|
||||
"import pandas as pd\n",
|
||||
"import seaborn as sns\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"from torchtext.vocab import Vocab\n",
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"from sklearn.datasets import fetch_20newsgroups\n",
|
||||
"# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
|
||||
"\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"from sklearn.metrics import accuracy_score\n",
|
||||
"\n",
|
||||
"from tqdm.notebook import tqdm\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"from torchcrf import CRF"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset = load_dataset(\"conll2003\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def build_vocab(dataset):\n",
|
||||
" counter = Counter()\n",
|
||||
" for document in dataset:\n",
|
||||
" counter.update(document)\n",
|
||||
" return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vocab = build_vocab(dataset['train']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(vocab.itos)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vocab['on']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def data_process(dt):\n",
|
||||
" return [ torch.tensor([vocab['<bos>']] +[vocab[token] for token in document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def labels_process(dt):\n",
|
||||
" return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_tokens_ids = data_process(dataset['train']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_tokens_ids = data_process(dataset['test']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"validation_tokens_ids = data_process(dataset['validation']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_labels = labels_process(dataset['train']['ner_tags'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"validation_labels = labels_process(dataset['validation']['ner_tags'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_labels = labels_process(dataset['test']['ner_tags'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_tokens_ids[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_scores(y_true, y_pred):\n",
|
||||
" acc_score = 0\n",
|
||||
" tp = 0\n",
|
||||
" fp = 0\n",
|
||||
" selected_items = 0\n",
|
||||
" relevant_items = 0 \n",
|
||||
"\n",
|
||||
" for p,t in zip(y_pred, y_true):\n",
|
||||
" if p == t:\n",
|
||||
" acc_score +=1\n",
|
||||
"\n",
|
||||
" if p > 0 and p == t:\n",
|
||||
" tp +=1\n",
|
||||
"\n",
|
||||
" if p > 0:\n",
|
||||
" selected_items += 1\n",
|
||||
"\n",
|
||||
" if t > 0 :\n",
|
||||
" relevant_items +=1\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" if selected_items == 0:\n",
|
||||
" precision = 1.0\n",
|
||||
" else:\n",
|
||||
" precision = tp / selected_items\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" if relevant_items == 0:\n",
|
||||
" recall = 1.0\n",
|
||||
" else:\n",
|
||||
" recall = tp / relevant_items\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" if precision + recall == 0.0 :\n",
|
||||
" f1 = 0.0\n",
|
||||
" else:\n",
|
||||
" f1 = 2* precision * recall / (precision + recall)\n",
|
||||
"\n",
|
||||
" return precision, recall, f1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"num_tags = max([max(x) for x in dataset['train']['ner_tags'] ]) + 1 "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class FF(torch.nn.Module):\n",
|
||||
"\n",
|
||||
" def __init__(self,):\n",
|
||||
" super(FF, self).__init__()\n",
|
||||
" self.emb = torch.nn.Embedding(23627,200)\n",
|
||||
" self.fc1 = torch.nn.Linear(200,num_tags)\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" x = self.emb(x)\n",
|
||||
" x = self.fc1(x)\n",
|
||||
" return x"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ff = FF()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"crf = CRF(num_tags)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"params = list(ff.parameters()) + list(crf.parameters())\n",
|
||||
"\n",
|
||||
"optimizer = torch.optim.Adam(params)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def eval_model(dataset_tokens, dataset_labels):\n",
|
||||
" Y_true = []\n",
|
||||
" Y_pred = []\n",
|
||||
" ff.eval()\n",
|
||||
" crf.eval()\n",
|
||||
" for i in tqdm(range(len(dataset_labels))):\n",
|
||||
" batch_tokens = dataset_tokens[i]\n",
|
||||
" tags = list(dataset_labels[i].numpy())\n",
|
||||
" emissions = ff(batch_tokens).unsqueeze(1)\n",
|
||||
" Y_pred += crf.decode(emissions)[0]\n",
|
||||
" Y_true += tags\n",
|
||||
"\n",
|
||||
" return get_scores(Y_true, Y_pred)\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"NUM_EPOCHS = 4"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i in range(NUM_EPOCHS):\n",
|
||||
" ff.train()\n",
|
||||
" crf.train()\n",
|
||||
" for i in tqdm(range(len(train_labels))):\n",
|
||||
" batch_tokens = train_tokens_ids[i]\n",
|
||||
" tags = train_labels[i].unsqueeze(1)\n",
|
||||
" emissions = ff(batch_tokens).unsqueeze(1)\n",
|
||||
"\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss = -crf(emissions,tags)\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" \n",
|
||||
" ff.eval()\n",
|
||||
" crf.eval()\n",
|
||||
" print(eval_model(validation_tokens_ids, validation_labels))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"eval_model(validation_tokens_ids, validation_labels)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"eval_model(test_tokens_ids, test_labels)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(train_tokens_ids)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Zadanie domowe\n",
|
||||
"\n",
|
||||
"- sklonować repozytorium https://git.wmi.amu.edu.pl/kubapok/en-ner-conll-2003\n",
|
||||
"- stworzyć klasyfikator bazujący na sieci neuronowej feed forward w pytorchu + CRF (można bazować na tym jupyterze lub nie).\n",
|
||||
"- sieć feedforward powinna obejmować aktualne słowo, poprzednie i następne + dodatkowe cechy (np. długość wyrazu, czy wyraz zaczyna się od wielkiej litery, stemming słowa, czy zawiera cyfrę)\n",
|
||||
"- stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv\n",
|
||||
"- wynik fscore sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.65\n",
|
||||
"- proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo\n",
|
||||
"\ntermin 15.06, 60 punktów, za najlepszy wynik – 100 punktów\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "10.CRF[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
615
cw/11_NER_RNN.ipynb
Normal file
@ -0,0 +1,615 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 11. <i>NER RNN</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Podejście softmax z embeddingami na przykładzie NER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import gensim\n",
|
||||
"import torch\n",
|
||||
"import pandas as pd\n",
|
||||
"import seaborn as sns\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"from torchtext.vocab import Vocab\n",
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"from sklearn.datasets import fetch_20newsgroups\n",
|
||||
"# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
|
||||
"\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"from sklearn.metrics import accuracy_score\n",
|
||||
"\n",
|
||||
"from tqdm.notebook import tqdm\n",
|
||||
"\n",
|
||||
"import torch"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset = load_dataset(\"conll2003\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def build_vocab(dataset):\n",
|
||||
" counter = Counter()\n",
|
||||
" for document in dataset:\n",
|
||||
" counter.update(document)\n",
|
||||
" return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vocab = build_vocab(dataset['train']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"23627"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(vocab.itos)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"15"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"vocab['on']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def data_process(dt):\n",
|
||||
" return [ torch.tensor([vocab['<bos>']] +[vocab[token] for token in document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def labels_process(dt):\n",
|
||||
" return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_tokens_ids = data_process(dataset['train']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_tokens_ids = data_process(dataset['test']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"validation_tokens_ids = data_process(dataset['validation']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_labels = labels_process(dataset['train']['ner_tags'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"validation_labels = labels_process(dataset['validation']['ner_tags'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_labels = labels_process(dataset['test']['ner_tags'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"tensor([ 2, 966, 22409, 238, 773, 9, 4588, 212, 7686, 4,\n",
|
||||
" 3])"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train_tokens_ids[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],\n",
|
||||
" 'id': '0',\n",
|
||||
" 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0],\n",
|
||||
" 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],\n",
|
||||
" 'tokens': ['EU',\n",
|
||||
" 'rejects',\n",
|
||||
" 'German',\n",
|
||||
" 'call',\n",
|
||||
" 'to',\n",
|
||||
" 'boycott',\n",
|
||||
" 'British',\n",
|
||||
" 'lamb',\n",
|
||||
" '.']}"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dataset['train'][0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"tensor([0, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0])"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"train_labels[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_scores(y_true, y_pred):\n",
|
||||
" acc_score = 0\n",
|
||||
" tp = 0\n",
|
||||
" fp = 0\n",
|
||||
" selected_items = 0\n",
|
||||
" relevant_items = 0 \n",
|
||||
"\n",
|
||||
" for p,t in zip(y_pred, y_true):\n",
|
||||
" if p == t:\n",
|
||||
" acc_score +=1\n",
|
||||
"\n",
|
||||
" if p > 0 and p == t:\n",
|
||||
" tp +=1\n",
|
||||
"\n",
|
||||
" if p > 0:\n",
|
||||
" selected_items += 1\n",
|
||||
"\n",
|
||||
" if t > 0 :\n",
|
||||
" relevant_items +=1\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" if selected_items == 0:\n",
|
||||
" precision = 1.0\n",
|
||||
" else:\n",
|
||||
" precision = tp / selected_items\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" if relevant_items == 0:\n",
|
||||
" recall = 1.0\n",
|
||||
" else:\n",
|
||||
" recall = tp / relevant_items\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" if precision + recall == 0.0 :\n",
|
||||
" f1 = 0.0\n",
|
||||
" else:\n",
|
||||
" f1 = 2* precision * recall / (precision + recall)\n",
|
||||
"\n",
|
||||
" return precision, recall, f1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"num_tags = max([max(x) for x in dataset['train']['ner_tags'] ]) + 1 "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class LSTM(torch.nn.Module):\n",
|
||||
"\n",
|
||||
" def __init__(self):\n",
|
||||
" super(LSTM, self).__init__()\n",
|
||||
" self.emb = torch.nn.Embedding(len(vocab.itos),100)\n",
|
||||
" self.rec = torch.nn.LSTM(100, 256, 1, batch_first = True)\n",
|
||||
" self.fc1 = torch.nn.Linear( 256 , 9)\n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" emb = torch.relu(self.emb(x))\n",
|
||||
" \n",
|
||||
" lstm_output, (h_n, c_n) = self.rec(emb)\n",
|
||||
" \n",
|
||||
" out_weights = self.fc1(lstm_output)\n",
|
||||
"\n",
|
||||
" return out_weights"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"lstm = LSTM()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"criterion = torch.nn.CrossEntropyLoss()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"optimizer = torch.optim.Adam(lstm.parameters())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def eval_model(dataset_tokens, dataset_labels, model):\n",
|
||||
" Y_true = []\n",
|
||||
" Y_pred = []\n",
|
||||
" for i in tqdm(range(len(dataset_labels))):\n",
|
||||
" batch_tokens = dataset_tokens[i].unsqueeze(0)\n",
|
||||
" tags = list(dataset_labels[i].numpy())\n",
|
||||
" Y_true += tags\n",
|
||||
" \n",
|
||||
" Y_batch_pred_weights = model(batch_tokens).squeeze(0)\n",
|
||||
" Y_batch_pred = torch.argmax(Y_batch_pred_weights,1)\n",
|
||||
" Y_pred += list(Y_batch_pred.numpy())\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" return get_scores(Y_true, Y_pred)\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"NUM_EPOCHS = 5"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i in range(NUM_EPOCHS):\n",
|
||||
" lstm.train()\n",
|
||||
" #for i in tqdm(range(500)):\n",
|
||||
" for i in tqdm(range(len(train_labels))):\n",
|
||||
" batch_tokens = train_tokens_ids[i].unsqueeze(0)\n",
|
||||
" tags = train_labels[i].unsqueeze(1)\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" predicted_tags = lstm(batch_tokens)\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss = criterion(predicted_tags.squeeze(0),tags.squeeze(1))\n",
|
||||
" \n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" \n",
|
||||
" lstm.eval()\n",
|
||||
" print(eval_model(validation_tokens_ids, validation_labels, lstm))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "5159f7a61c3a439bab45573f15ea55b2",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"HBox(children=(FloatProgress(value=0.0, max=3250.0), HTML(value='')))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(0.7963248522230789, 0.7203301174009067, 0.7564235581324383)"
|
||||
]
|
||||
},
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"eval_model(validation_tokens_ids, validation_labels, lstm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "4b604bbb796f4d4cb99528fad98cfdff",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"HBox(children=(FloatProgress(value=0.0, max=3453.0), HTML(value='')))"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(0.7450810185185185, 0.6348619329388561, 0.685569755058573)"
|
||||
]
|
||||
},
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"eval_model(test_tokens_ids, test_labels, lstm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"14041"
|
||||
]
|
||||
},
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(train_tokens_ids)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## pytania\n",
|
||||
"\n",
|
||||
"- co zrobić z trenowaniem na batchach > 1 ?\n",
|
||||
"- co zrobić, żeby sieć uwzględniała następne tokeny, a nie tylko poprzednie?\n",
|
||||
"- w jaki sposób wykorzystać taką sieć do zadania zwykłej klasyfikacji?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Zadanie na zajęcia ( 20 minut)\n",
|
||||
"\n",
|
||||
"zmodyfikować sieć tak, żeby była używała dwuwarstwowej, dwukierunkowej warstwy GRU oraz dropoutu. Dropout ma nałożony na embeddingi.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Zadanie domowe\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"- sklonować repozytorium https://git.wmi.amu.edu.pl/kubapok/en-ner-conll-2003\n",
|
||||
"- stworzyć model seq labelling bazujący na sieci neuronowej opisanej w punkcie niżej (można bazować na tym jupyterze lub nie).\n",
|
||||
"- model sieci to GRU (o dowolnych parametrach) + CRF w pytorchu korzystając z modułu CRF z poprzednich zajęć- - stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv\n",
|
||||
"- wynik fscore sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić conajmniej 0.65\n",
|
||||
"- proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo\n",
|
||||
"termin 22.06, 60 punktów, za najlepszy wynik- 100 punktów\n",
|
||||
" "
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "11.NER RNN[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
565
cw/11_NER_RNN_ODPOWIEDZI.ipynb
Normal file
@ -0,0 +1,565 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 11. <i>NER RNN</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Podejście softmax z embeddingami na przykładzie NER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import gensim\n",
|
||||
"import torch\n",
|
||||
"import pandas as pd\n",
|
||||
"import seaborn as sns\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"from datasets import load_dataset\n",
|
||||
"from torchtext.vocab import Vocab\n",
|
||||
"from collections import Counter\n",
|
||||
"\n",
|
||||
"from sklearn.datasets import fetch_20newsgroups\n",
|
||||
"# https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html\n",
|
||||
"\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"from sklearn.metrics import accuracy_score\n",
|
||||
"\n",
|
||||
"from tqdm.notebook import tqdm\n",
|
||||
"\n",
|
||||
"import torch"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset = load_dataset(\"conll2003\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def build_vocab(dataset):\n",
|
||||
" counter = Counter()\n",
|
||||
" for document in dataset:\n",
|
||||
" counter.update(document)\n",
|
||||
" return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vocab = build_vocab(dataset['train']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(vocab.itos)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vocab['on']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def data_process(dt):\n",
|
||||
" return [ torch.tensor([vocab['<bos>']] +[vocab[token] for token in document ] + [vocab['<eos>']], dtype = torch.long) for document in dt]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def labels_process(dt):\n",
|
||||
" return [ torch.tensor([0] + document + [0], dtype = torch.long) for document in dt]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_tokens_ids = data_process(dataset['train']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_tokens_ids = data_process(dataset['test']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"validation_tokens_ids = data_process(dataset['validation']['tokens'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_labels = labels_process(dataset['train']['ner_tags'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"validation_labels = labels_process(dataset['validation']['ner_tags'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_labels = labels_process(dataset['test']['ner_tags'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_tokens_ids[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dataset['train'][0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_labels[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_scores(y_true, y_pred):\n",
|
||||
" acc_score = 0\n",
|
||||
" tp = 0\n",
|
||||
" fp = 0\n",
|
||||
" selected_items = 0\n",
|
||||
" relevant_items = 0 \n",
|
||||
"\n",
|
||||
" for p,t in zip(y_pred, y_true):\n",
|
||||
" if p == t:\n",
|
||||
" acc_score +=1\n",
|
||||
"\n",
|
||||
" if p > 0 and p == t:\n",
|
||||
" tp +=1\n",
|
||||
"\n",
|
||||
" if p > 0:\n",
|
||||
" selected_items += 1\n",
|
||||
"\n",
|
||||
" if t > 0 :\n",
|
||||
" relevant_items +=1\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" if selected_items == 0:\n",
|
||||
" precision = 1.0\n",
|
||||
" else:\n",
|
||||
" precision = tp / selected_items\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" if relevant_items == 0:\n",
|
||||
" recall = 1.0\n",
|
||||
" else:\n",
|
||||
" recall = tp / relevant_items\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" if precision + recall == 0.0 :\n",
|
||||
" f1 = 0.0\n",
|
||||
" else:\n",
|
||||
" f1 = 2* precision * recall / (precision + recall)\n",
|
||||
"\n",
|
||||
" return precision, recall, f1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"num_tags = max([max(x) for x in dataset['train']['ner_tags'] ]) + 1 "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class LSTM(torch.nn.Module):\n",
|
||||
"\n",
|
||||
" def __init__(self):\n",
|
||||
" super(LSTM, self).__init__()\n",
|
||||
" self.emb = torch.nn.Embedding(len(vocab.itos),100)\n",
|
||||
" self.rec = torch.nn.LSTM(100, 256, 1, batch_first = True)\n",
|
||||
" self.fc1 = torch.nn.Linear( 256 , 9)\n",
|
||||
"\n",
|
||||
" def forward(self, x):\n",
|
||||
" emb = torch.relu(self.emb(x))\n",
|
||||
" \n",
|
||||
" lstm_output, (h_n, c_n) = self.rec(emb)\n",
|
||||
" \n",
|
||||
" out_weights = self.fc1(lstm_output)\n",
|
||||
"\n",
|
||||
" return out_weights"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"lstm = LSTM()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"criterion = torch.nn.CrossEntropyLoss()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"optimizer = torch.optim.Adam(lstm.parameters())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def eval_model(dataset_tokens, dataset_labels, model):\n",
|
||||
" Y_true = []\n",
|
||||
" Y_pred = []\n",
|
||||
" for i in tqdm(range(len(dataset_labels))):\n",
|
||||
" batch_tokens = dataset_tokens[i].unsqueeze(0)\n",
|
||||
" tags = list(dataset_labels[i].numpy())\n",
|
||||
" Y_true += tags\n",
|
||||
" \n",
|
||||
" Y_batch_pred_weights = model(batch_tokens).squeeze(0)\n",
|
||||
" Y_batch_pred = torch.argmax(Y_batch_pred_weights,1)\n",
|
||||
" Y_pred += list(Y_batch_pred.numpy())\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" return get_scores(Y_true, Y_pred)\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"NUM_EPOCHS = 5"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i in range(NUM_EPOCHS):\n",
|
||||
" lstm.train()\n",
|
||||
" #for i in tqdm(range(500)):\n",
|
||||
" for i in tqdm(range(len(train_labels))):\n",
|
||||
" batch_tokens = train_tokens_ids[i].unsqueeze(0)\n",
|
||||
" tags = train_labels[i].unsqueeze(1)\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" predicted_tags = lstm(batch_tokens)\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss = criterion(predicted_tags.squeeze(0),tags.squeeze(1))\n",
|
||||
" \n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" \n",
|
||||
" lstm.eval()\n",
|
||||
" print(eval_model(validation_tokens_ids, validation_labels, lstm))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"eval_model(validation_tokens_ids, validation_labels, lstm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"eval_model(test_tokens_ids, test_labels, lstm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(train_tokens_ids)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## pytania\n",
|
||||
"\n",
|
||||
"- co zrobić z trenowaniem na batchach > 1 ?\n",
|
||||
"- co zrobić, żeby sieć uwzględniała następne tokeny, a nie tylko poprzednie?\n",
|
||||
"- w jaki sposób wykorzystać taką sieć do zadania zwykłej klasyfikacji?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Zadanie na zajęcia ( 20 minut)\n",
|
||||
"\n",
|
||||
"zmodyfikować sieć tak, żeby była używała dwuwarstwowej, dwukierunkowej warstwy GRU oraz dropoutu. Dropout ma nałożony na embeddingi.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"class GRU(torch.nn.Module):\n",
|
||||
"\n",
|
||||
" def __init__(self):\n",
|
||||
" super(GRU, self).__init__()\n",
|
||||
" self.emb = torch.nn.Embedding(len(vocab.itos),100)\n",
|
||||
" self.dropout = torch.nn.Dropout(0.2)\n",
|
||||
" self.rec = torch.nn.GRU(100, 256, 2, batch_first = True, bidirectional = True)\n",
|
||||
" self.fc1 = torch.nn.Linear(2* 256 , 9)\n",
|
||||
" \n",
|
||||
" def forward(self, x):\n",
|
||||
" emb = torch.relu(self.emb(x))\n",
|
||||
" emb = self.dropout(emb)\n",
|
||||
" \n",
|
||||
" gru_output, h_n = self.rec(emb)\n",
|
||||
" \n",
|
||||
" out_weights = self.fc1(gru_output)\n",
|
||||
"\n",
|
||||
" return out_weights"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"gru = GRU()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"criterion = torch.nn.CrossEntropyLoss()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"optimizer = torch.optim.Adam(gru.parameters())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"NUM_EPOCHS = 5"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i in range(NUM_EPOCHS):\n",
|
||||
" gru.train()\n",
|
||||
" #for i in tqdm(range(50)):\n",
|
||||
" for i in tqdm(range(len(train_labels))):\n",
|
||||
" batch_tokens = train_tokens_ids[i].unsqueeze(0)\n",
|
||||
" tags = train_labels[i].unsqueeze(1)\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" predicted_tags = gru(batch_tokens)\n",
|
||||
"\n",
|
||||
" \n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" loss = criterion(predicted_tags.squeeze(0),tags.squeeze(1))\n",
|
||||
" \n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" \n",
|
||||
" \n",
|
||||
" gru.eval()\n",
|
||||
" print(eval_model(validation_tokens_ids, validation_labels, gru))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Zadanie domowe\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"- sklonować repozytorium https://git.wmi.amu.edu.pl/kubapok/en-ner-conll-2003\n",
|
||||
"- stworzyć model seq labelling bazujący na sieci neuronowej opisanej w punkcie niżej (można bazować na tym jupyterze lub nie).\n",
|
||||
"- model sieci to GRU (o dowolnych parametrach) + CRF w pytorchu korzystając z modułu CRF z poprzednich zajęć- - stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv\n",
|
||||
"- wynik fscore sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić conajmniej 0.65\n",
|
||||
"- proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo\n",
|
||||
"termin 22.06, 60 punktów, za najlepszy wynik- 100 punktów\n",
|
||||
" "
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "11.NER RNN[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
690
cw/12_transformery.ipynb
Normal file
@ -0,0 +1,690 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 12. <i>Transformery</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# bpe"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"pip install tokenizers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://github.com/huggingface/tokenizers/tree/master/bindings/python"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from tokenizers import Tokenizer, models, trainers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from tokenizers.trainers import BpeTrainer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer = Tokenizer(models.BPE())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"trainer = trainers.BpeTrainer(vocab_size=20000, min_frequency=2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://wolnelektury.pl/media/book/txt/pan-tadeusz.txt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer.train(files = ['/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia9_ngramowy_model_jDDezykowy/pan-tadeusz-train.txt'], trainer = trainer)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output = tokenizer.encode(\"Nie śpiewają piosenek: pracują leniwo,\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output.ids"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output.tokens"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer.save(\"./my-bpe.tokenizer.json\", pretty=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## ZADANIE\n",
|
||||
"stworzyć BPE tokenizer na podstawie https://git.wmi.amu.edu.pl/kubapok/lalka-lm/src/branch/master/train/train.tsv\n",
|
||||
"i stworzyć stokenizowaną listę: \n",
|
||||
"https://git.wmi.amu.edu.pl/kubapok/lalka-lm/src/branch/master/test-A/in.tsv\n",
|
||||
"\n",
|
||||
"wybrać vocab_size = 8k, uwzględnić dodatkowe tokeny: BOS oraz EOS i wpleść je do zbioru testowego"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# transformery"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# pip install transformers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"przykłady pochodzą częściowo z: https://huggingface.co/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import pipeline, set_seed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import RobertaTokenizer, RobertaModel"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer = RobertaTokenizer.from_pretrained('roberta-base')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = RobertaModel.from_pretrained('roberta-base')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = \"Replace me by any text you'd like. Bla Bla\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"encoded_input = tokenizer(text, return_tensors='pt')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"encoded_input['input_ids']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"encoded_input['input_ids']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer.decode([162])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output = model(**encoded_input)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://huggingface.co/transformers/main_classes/output.html#basemodeloutputwithpoolingandcrossattentions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://arxiv.org/pdf/1907.11692.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[0].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"output[1].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output = model(**encoded_input, output_hidden_states=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(output[2])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[2][0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[2][0].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[2][1].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[2][12].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output = model(**encoded_input, output_attentions=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(output[2])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[2][0].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[2][2]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## gotowe api"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### generowanie tekstu"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = pipeline('text-generation', model='gpt2')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model(\"Hello, I'm a computer science student\", max_length=30, num_return_sequences=5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model(\"I want to contribute to Google's Computer Vision Program, which is doing extensive work on big\", max_length=30, num_return_sequences=5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### sentiment analysis"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import pipeline\n",
|
||||
"\n",
|
||||
"model = pipeline(\"sentiment-analysis\", model='distilbert-base-uncased-finetuned-sst-2-english')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model(\"I'm very happy. Today is the beatifull weather\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model(\"It's raining. What a terrible day...\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## NER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = pipeline(\"sentiment-analysis\", model='distilbert-base-uncased-finetuned-sst-2-english')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import pipeline\n",
|
||||
"model = pipeline(\"ner\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = \"George Washington went to Washington\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### masked language modelling"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### ZADANIE (10 minut)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# przewidzieć <mask> token w \"The world <MASK> II started in 1939\" wg dowolnego anglojęzycznego modelu"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### text summarization"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summarizer = pipeline(\"summarization\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ARTICLE = \"\"\" New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.\n",
|
||||
"A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.\n",
|
||||
"Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared \"I do\" five more times, sometimes only within two weeks of each other.\n",
|
||||
"In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her \"first and only\" marriage.\n",
|
||||
"Barrientos, now 39, is facing two criminal counts of \"offering a false instrument for filing in the first degree,\" referring to her false statements on the\n",
|
||||
"2010 marriage license application, according to court documents.\n",
|
||||
"Prosecutors said the marriages were part of an immigration scam.\n",
|
||||
"On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.\n",
|
||||
"After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective\n",
|
||||
"Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.\n",
|
||||
"All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.\n",
|
||||
"Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.\n",
|
||||
"Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.\n",
|
||||
"The case was referred to the Bronx District Attorney\\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\\'s\n",
|
||||
"Investigation Division. Seven of the men are from so-called \"red-flagged\" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.\n",
|
||||
"Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.\n",
|
||||
"If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18.\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### ZADANIE DOMOWE"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"- sforkować repozytorium: https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public\n",
|
||||
"- finetunować klasyfikator bazujący na jakiejś pretrenowanej sieci typu transformer (np. BERT, Roberta). Można użyć dowolnej biblioteki\n",
|
||||
" (np hugging face, fairseq)\n",
|
||||
"- stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv\n",
|
||||
"- wynik accuracy sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.67\n",
|
||||
"- proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo\n",
|
||||
"termin 22.06, 60 punktów\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "12.Transformery[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
802
cw/12_transformery_ODPOWIEDZI.ipynb
Normal file
@ -0,0 +1,802 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 12. <i>Transformery</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# bpe"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"pip install tokenizers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://github.com/huggingface/tokenizers/tree/master/bindings/python"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from tokenizers import Tokenizer, models, trainers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from tokenizers.trainers import BpeTrainer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer = Tokenizer(models.BPE())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"trainer = trainers.BpeTrainer(vocab_size=20000, min_frequency=2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://wolnelektury.pl/media/book/txt/pan-tadeusz.txt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer.train(files = ['/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia9_ngramowy_model_jDDezykowy/pan-tadeusz-train.txt'], trainer = trainer)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output = tokenizer.encode(\"Nie śpiewają piosenek: pracują leniwo,\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output.ids"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output.tokens"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer.save(\"./my-bpe.tokenizer.json\", pretty=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## ZADANIE\n",
|
||||
"stworzyć BPE tokenizer na podstawie https://git.wmi.amu.edu.pl/kubapok/lalka-lm/src/branch/master/train/train.tsv\n",
|
||||
"i stworzyć stokenizowaną listę: \n",
|
||||
"https://git.wmi.amu.edu.pl/kubapok/lalka-lm/src/branch/master/test-A/in.tsv\n",
|
||||
"\n",
|
||||
"wybrać vocab_size = 8k, uwzględnić dodatkowe tokeny: BOS oraz EOS i wpleść je do zbioru testowego"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer = Tokenizer(models.BPE())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"trainer = trainers.BpeTrainer(vocab_size=8000, special_tokens=[\"[BOS]\", \"[EOS]\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer.train(files = ['/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia12_transformers/lalka-lm/train/train_with_special_tokens'], trainer = trainer)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_path = '/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia12_transformers/lalka-lm/train/train.tsv'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"with open(test_path, 'r') as test_in:\n",
|
||||
" test_lines = ['[BOS] ' + x.rstrip('\\n') + ' [EOS]' for x in test_in.readlines()]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"test_lines"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenized = [tokenizer.encode(x) for x in test_lines]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenized"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenized[0].tokens"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenized[0].tokens"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# transformery"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# pip install transformers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"przykłady pochodzą częściowo z: https://huggingface.co/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import pipeline, set_seed"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import RobertaTokenizer, RobertaModel"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer = RobertaTokenizer.from_pretrained('roberta-base')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = RobertaModel.from_pretrained('roberta-base')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = \"Replace me by any text you'd like. Bla Bla\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"encoded_input = tokenizer(text, return_tensors='pt')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"encoded_input['input_ids']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"encoded_input['input_ids']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer.decode([162])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output = model(**encoded_input)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://huggingface.co/transformers/main_classes/output.html#basemodeloutputwithpoolingandcrossattentions"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://arxiv.org/pdf/1907.11692.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[0].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"\n",
|
||||
"output[1].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output = model(**encoded_input, output_hidden_states=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(output[2])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[2][0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[2][0].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[2][1].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[2][12].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output = model(**encoded_input, output_attentions=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(output)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"len(output[2])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[2][0].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"output[2][2]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## gotowe api"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### generowanie tekstu"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = pipeline('text-generation', model='gpt2')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model(\"Hello, I'm a computer science student\", max_length=30, num_return_sequences=5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model(\"I want to contribute to Google's Computer Vision Program, which is doing extensive work on big\", max_length=30, num_return_sequences=5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### sentiment analysis"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import pipeline\n",
|
||||
"\n",
|
||||
"model = pipeline(\"sentiment-analysis\", model='distilbert-base-uncased-finetuned-sst-2-english')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model(\"I'm very happy. Today is the beatifull weather\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model(\"It's raining. What a terrible day...\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### NER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = pipeline(\"sentiment-analysis\", model='distilbert-base-uncased-finetuned-sst-2-english')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import pipeline\n",
|
||||
"model = pipeline(\"ner\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = \"George Washington went to Washington\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### masked language modelling"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"### ZADANIE (10 minut)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"przewidzieć token `<mask>` w zdaniu \"The world `<MASK>` II started in 1939\" wg dowolnego anglojęzycznego modelu"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = pipeline(\"fill-mask\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model(f\"The world {model.tokenizer.mask_token} II started in 1939\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### text summarization"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"summarizer = pipeline(\"summarization\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ARTICLE = \"\"\" New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.\n",
|
||||
"A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.\n",
|
||||
"Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared \"I do\" five more times, sometimes only within two weeks of each other.\n",
|
||||
"In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her \"first and only\" marriage.\n",
|
||||
"Barrientos, now 39, is facing two criminal counts of \"offering a false instrument for filing in the first degree,\" referring to her false statements on the\n",
|
||||
"2010 marriage license application, according to court documents.\n",
|
||||
"Prosecutors said the marriages were part of an immigration scam.\n",
|
||||
"On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.\n",
|
||||
"After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective\n",
|
||||
"Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.\n",
|
||||
"All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.\n",
|
||||
"Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.\n",
|
||||
"Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.\n",
|
||||
"The case was referred to the Bronx District Attorney\\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\\'s\n",
|
||||
"Investigation Division. Seven of the men are from so-called \"red-flagged\" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.\n",
|
||||
"Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.\n",
|
||||
"If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18.\n",
|
||||
"\"\"\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### ZADANIE DOMOWE"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"- sforkować repozytorium: https://git.wmi.amu.edu.pl/kubapok/paranormal-or-skeptic-ISI-public\n",
|
||||
"- finetunować klasyfikator bazujący na jakiejś pretrenowanej sieci typu transformer (np. BERT, RoBERTa). Można użyć dowolnej biblioteki\n",
|
||||
" (np hugging face, fairseq)\n",
|
||||
"- stworzyć predykcje w plikach dev-0/out.tsv oraz test-A/out.tsv\n",
|
||||
"- wynik accuracy sprawdzony za pomocą narzędzia geval (patrz poprzednie zadanie) powinien wynosić co najmniej 0.67\n",
|
||||
"- proszę umieścić predykcję oraz skrypty generujące (w postaci tekstowej a nie jupyter) w repo, a w MS TEAMS umieścić link do swojego repo\n",
|
||||
"- termin: 22.06, 60 punktów\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "12.Transformery[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
243
cw/13_transformery2.ipynb
Normal file
@ -0,0 +1,243 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 13. <i>Transformery 2</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Wizualizacja atencji\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://github.com/jessevig/bertviz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install bertviz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import AutoTokenizer, AutoModel\n",
|
||||
"from bertviz import model_view, head_view"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TEXT = \"This is a sample input sentence for a transformer model\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL = \"distilbert-base-uncased\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
|
||||
"model = AutoModel.from_pretrained(MODEL, output_attentions=True)\n",
|
||||
"inputs = tokenizer.encode(TEXT, return_tensors='pt')\n",
|
||||
"outputs = model(inputs)\n",
|
||||
"attention = outputs[-1]\n",
|
||||
"tokens = tokenizer.convert_ids_to_tokens(inputs[0]) \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### SELF ATTENTION MODELS"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"head_view(attention, tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_view(attention, tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### ENCODER-DECODER MODELS"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL = \"Helsinki-NLP/opus-mt-en-de\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TEXT_ENCODER = \"She sees the small elephant.\"\n",
|
||||
"TEXT_DECODER = \"Sie sieht den kleinen Elefanten.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
|
||||
"model = AutoModel.from_pretrained(MODEL, output_attentions=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"encoder_input_ids = tokenizer(TEXT_ENCODER, return_tensors=\"pt\", add_special_tokens=True).input_ids\n",
|
||||
"decoder_input_ids = tokenizer(TEXT_DECODER, return_tensors=\"pt\", add_special_tokens=True).input_ids\n",
|
||||
"\n",
|
||||
"outputs = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids)\n",
|
||||
"\n",
|
||||
"encoder_text = tokenizer.convert_ids_to_tokens(encoder_input_ids[0])\n",
|
||||
"decoder_text = tokenizer.convert_ids_to_tokens(decoder_input_ids[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"head_view(\n",
|
||||
" encoder_attention=outputs.encoder_attentions,\n",
|
||||
" decoder_attention=outputs.decoder_attentions,\n",
|
||||
" cross_attention=outputs.cross_attentions,\n",
|
||||
" encoder_tokens= encoder_text,\n",
|
||||
" decoder_tokens = decoder_text\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_view(\n",
|
||||
" encoder_attention=outputs.encoder_attentions,\n",
|
||||
" decoder_attention=outputs.decoder_attentions,\n",
|
||||
" cross_attention=outputs.cross_attentions,\n",
|
||||
" encoder_tokens= encoder_text,\n",
|
||||
" decoder_tokens = decoder_text\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Zadanie (10 minut)\n",
|
||||
"\n",
|
||||
"Za pomocą modelu en-fr przetłumacz dowolne zdanie z angielskiego na język francuski i sprawdź wagi atencji dla tego tłumaczenia"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### PRZYKŁAD: GPT3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### ZADANIE DOMOWE - POLEVAL"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "13.Transformery 2[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
336
cw/13_transformery2_ODPOWIEDZI.ipynb
Normal file
@ -0,0 +1,336 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 13. <i>Transformery 2</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Wizualizacja atencji\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://github.com/jessevig/bertviz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install bertviz"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import AutoTokenizer, AutoModel\n",
|
||||
"from bertviz import model_view, head_view"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TEXT = \"This is a sample input sentence for a transformer model\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL = \"distilbert-base-uncased\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
|
||||
"model = AutoModel.from_pretrained(MODEL, output_attentions=True)\n",
|
||||
"inputs = tokenizer.encode(TEXT, return_tensors='pt')\n",
|
||||
"outputs = model(inputs)\n",
|
||||
"attention = outputs[-1]\n",
|
||||
"tokens = tokenizer.convert_ids_to_tokens(inputs[0]) \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### SELF ATTENTION MODELS"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"head_view(attention, tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_view(attention, tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### ENCODER-DECODER MODELS"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL = \"Helsinki-NLP/opus-mt-en-de\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TEXT_ENCODER = \"She sees the small elephant.\"\n",
|
||||
"TEXT_DECODER = \"Sie sieht den kleinen Elefanten.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
|
||||
"model = AutoModel.from_pretrained(MODEL, output_attentions=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"encoder_input_ids = tokenizer(TEXT_ENCODER, return_tensors=\"pt\", add_special_tokens=True).input_ids\n",
|
||||
"decoder_input_ids = tokenizer(TEXT_DECODER, return_tensors=\"pt\", add_special_tokens=True).input_ids\n",
|
||||
"\n",
|
||||
"outputs = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids)\n",
|
||||
"\n",
|
||||
"encoder_text = tokenizer.convert_ids_to_tokens(encoder_input_ids[0])\n",
|
||||
"decoder_text = tokenizer.convert_ids_to_tokens(decoder_input_ids[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"head_view(\n",
|
||||
" encoder_attention=outputs.encoder_attentions,\n",
|
||||
" decoder_attention=outputs.decoder_attentions,\n",
|
||||
" cross_attention=outputs.cross_attentions,\n",
|
||||
" encoder_tokens= encoder_text,\n",
|
||||
" decoder_tokens = decoder_text\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model_view(\n",
|
||||
" encoder_attention=outputs.encoder_attentions,\n",
|
||||
" decoder_attention=outputs.decoder_attentions,\n",
|
||||
" cross_attention=outputs.cross_attentions,\n",
|
||||
" encoder_tokens= encoder_text,\n",
|
||||
" decoder_tokens = decoder_text\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Zadanie (10 minut)\n",
|
||||
"\n",
|
||||
"Za pomocą modelu en-fr przetłumacz dowolne zdanie z angielskiego na język francuski i sprawdź wagi atencji dla tego tłumaczenia"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"MODEL = \"Helsinki-NLP/opus-mt-en-fr\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TEXT_ENCODER = \"Although I still have fresh memories of my brother the elder Hamlet’s death, and though it was proper to mourn him throughout our kingdom, life still goes on—I think it’s wise to mourn him while also thinking about my own well being.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import AutoModelWithLMHead, AutoTokenizer\n",
|
||||
"\n",
|
||||
"model = AutoModelWithLMHead.from_pretrained(MODEL)\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
|
||||
"\n",
|
||||
"inputs = tokenizer.encode(TEXT_ENCODER, return_tensors=\"pt\")\n",
|
||||
"outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TEXT_DECODER = tokenizer.decode(outputs[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"TEXT_DECODER"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tokenizer = AutoTokenizer.from_pretrained(MODEL)\n",
|
||||
"model = AutoModel.from_pretrained(MODEL, output_attentions=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"encoder_input_ids = tokenizer(TEXT_ENCODER, return_tensors=\"pt\", add_special_tokens=True).input_ids\n",
|
||||
"decoder_input_ids = tokenizer(TEXT_DECODER, return_tensors=\"pt\", add_special_tokens=True).input_ids\n",
|
||||
"\n",
|
||||
"outputs = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids)\n",
|
||||
"\n",
|
||||
"encoder_text = tokenizer.convert_ids_to_tokens(encoder_input_ids[0])\n",
|
||||
"decoder_text = tokenizer.convert_ids_to_tokens(decoder_input_ids[0])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"head_view(\n",
|
||||
" encoder_attention=outputs.encoder_attentions,\n",
|
||||
" decoder_attention=outputs.decoder_attentions,\n",
|
||||
" cross_attention=outputs.cross_attentions,\n",
|
||||
" encoder_tokens= encoder_text,\n",
|
||||
" decoder_tokens = decoder_text\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### PRZYKŁAD: GPT3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### ZADANIE DOMOWE - POLEVAL"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "13.Transformery 2[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
212
cw/14_ekstrakcja_informacji_seq_2_seq.ipynb
Normal file
@ -0,0 +1,212 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 14. <i>Ekstrakcja informacji seq2seq</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### SIMILARITY SEARCH\n",
|
||||
"1. zainstaluj faiss i zrób tutorial: https://github.com/facebookresearch/faiss\n",
|
||||
"2. wczytaj treści artykułów z BBC News Train.csv\n",
|
||||
"3. Użyj któregoś z transformerów (możesz użyć biblioteki sentence-transformers) do stworzenia embeddingów dokumentów\n",
|
||||
"4. wczytaj embeddingi do bazy danych faiss\n",
|
||||
"5. wyszukaj query 'consumer electronics market'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://www.kaggle.com/avishi/bbc-news-train-data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import pickle\n",
|
||||
"import numpy as np\n",
|
||||
"import faiss\n",
|
||||
"from sklearn.metrics import ndcg_score, dcg_score, average_precision_score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install sentence-transformers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sentence_transformers import SentenceTransformer\n",
|
||||
"sentences = [\"Hello World\", \"Hallo Welt\"]\n",
|
||||
"\n",
|
||||
"model = SentenceTransformer('LaBSE')\n",
|
||||
"embeddings = model.encode(sentences)\n",
|
||||
"print(embeddings)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"r = pd.read_csv('BBC News Train.csv')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DOCUMENTS = list(r.Text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"embeddings = model.encode(DOCUMENTS)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"embeddings = model.encode(list(r.Text))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"QUERY_STR = 'consumer electronics market'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = model.encode([QUERY_STR])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"index = faiss.IndexFlatL2(embeddings.shape[1]) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"index.add(np.ascontiguousarray(embeddings))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"D, I = index.search(query, 5) "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"I"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"D"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DOCUMENTS[1363]"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "14.Ekstrakcja informacji seq2seq[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
369
cw/15_similarity_search.ipynb
Normal file
@ -0,0 +1,369 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 15. <i>Similarity search</i> [ćwiczenia]</h2> \n",
|
||||
"<h3> Jakub Pokrywka (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://arxiv.org/pdf/1910.10683.pdf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"https://github.com/applicaai/kleister-nda"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import T5Tokenizer, T5ForConditionalGeneration"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = \"translate English to French: My name is Azeem and I live in India\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = \"summarize: Machine learning involves computers discovering how they can perform tasks without being explicitly programmed to do so. It involves computers learning from data provided so that they carry out certain tasks. For simple tasks assigned to computers, it is possible to program algorithms telling the machine how to execute all steps required to solve the problem at hand; on the computer's part, no learning is needed. For more advanced tasks, it can be challenging for a human to manually create the needed algorithms. In practice, it can turn out to be more effective to help the machine develop its own algorithm, rather than having human programmers specify every needed step.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import T5Tokenizer, T5ForConditionalGeneration\n",
|
||||
"\n",
|
||||
"tokenizer = T5Tokenizer.from_pretrained('t5-small')\n",
|
||||
"\n",
|
||||
"model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True,).to('cuda')\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# You can also use \"translate English to French\" and \"translate English to Romanian\"\n",
|
||||
"input_ids = tokenizer(text, return_tensors=\"pt\").input_ids.to('cuda') # Batch size 1\n",
|
||||
"\n",
|
||||
"outputs = model.generate(input_ids)\n",
|
||||
"\n",
|
||||
"decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
|
||||
"\n",
|
||||
"print(decoded)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"KLEISTER_PATH = '/media/kuba/ssdsam/Syncthing/Syncthing/przedmioty/2020-02/IE/applica/kleister-nda/'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_exp_f = open(KLEISTER_PATH + 'train/expected.tsv')\n",
|
||||
"train_exp = []\n",
|
||||
"for line in train_exp_f:\n",
|
||||
" line_splitted = line.strip('\\n').split(' ')\n",
|
||||
" found = False\n",
|
||||
" for elem in line_splitted:\n",
|
||||
" if 'jurisdiction=' in elem:\n",
|
||||
" train_exp.append('jurisdiction: ' + elem.split('=')[1])\n",
|
||||
" found = True\n",
|
||||
" break\n",
|
||||
" if not found:\n",
|
||||
" train_exp.append('jurisdiction: NONE')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dev_exp_f = open(KLEISTER_PATH + 'dev-0/expected.tsv')\n",
|
||||
"dev_exp = []\n",
|
||||
"for line in dev_exp_f:\n",
|
||||
" line_splitted = line.strip('\\n').split(' ')\n",
|
||||
" found = False\n",
|
||||
" for elem in line_splitted:\n",
|
||||
" if 'jurisdiction=' in elem:\n",
|
||||
" dev_exp.append('jurisdiction: ' + elem.split('=')[1])\n",
|
||||
" found = True\n",
|
||||
" break\n",
|
||||
" if not found:\n",
|
||||
" dev_exp.append('jurisdiction: NONE')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_exp"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_in_f = open(KLEISTER_PATH + 'train/in.tsv')\n",
|
||||
"train_in = []\n",
|
||||
"for line in train_in_f:\n",
|
||||
" line = line.rstrip('\\n')\n",
|
||||
" train_in.append(line)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dev_in_f = open(KLEISTER_PATH + 'dev-0/in.tsv')\n",
|
||||
"dev_in = []\n",
|
||||
"for line in dev_in_f:\n",
|
||||
" line = line.rstrip('\\n')\n",
|
||||
" dev_in.append(line)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_in[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.device"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"input = train_in[0]\n",
|
||||
"\n",
|
||||
"# You can also use \"translate English to French\" and \"translate English to Romanian\"\n",
|
||||
"input_ids = tokenizer(input, return_tensors=\"pt\").input_ids[:,:512].to('cuda') # Batch size 1\n",
|
||||
"\n",
|
||||
"outputs = model.generate(input_ids)\n",
|
||||
"\n",
|
||||
"decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
|
||||
"\n",
|
||||
"print(decoded)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids.to('cuda')\n",
|
||||
"labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids.to('cuda')\n",
|
||||
"# the forward function automatically creates the correct decoder_input_ids\n",
|
||||
"loss = model(input_ids=input_ids, labels=labels).loss"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loss"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from transformers import AdamW\n",
|
||||
"\n",
|
||||
"optimizer = AdamW(model.parameters(), lr=5e-5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.train()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for line_in, line_exp in zip(train_in, train_exp):\n",
|
||||
" input_ids = tokenizer(line_in, return_tensors='pt').input_ids[:,:512].to('cuda')\n",
|
||||
" labels = tokenizer(line_exp, return_tensors='pt').input_ids.to('cuda')\n",
|
||||
" # the forward function automatically creates the correct decoder_input_ids\n",
|
||||
" loss = model(input_ids=input_ids, labels=labels).loss\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" print(loss.item())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model.eval()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"input = dev_in[0]\n",
|
||||
"\n",
|
||||
"input_ids = tokenizer(input, return_tensors=\"pt\").input_ids[:,:512].to('cuda') # Batch size 1\n",
|
||||
"\n",
|
||||
"outputs = model.generate(input_ids)\n",
|
||||
"\n",
|
||||
"decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
|
||||
"\n",
|
||||
"print(decoded)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dev_exp[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"input = dev_in[2]\n",
|
||||
"\n",
|
||||
"input_ids = tokenizer(input, return_tensors=\"pt\").input_ids[:,:512].to('cuda') # Batch size 1\n",
|
||||
"\n",
|
||||
"outputs = model.generate(input_ids)\n",
|
||||
"\n",
|
||||
"decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
|
||||
"\n",
|
||||
"print(decoded)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dev_exp[2]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## pytanie:\n",
|
||||
"- co można poprawić w istniejącym rozwiązaniu?"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Jakub Pokrywka",
|
||||
"email": "kubapok@wmi.amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.3"
|
||||
},
|
||||
"subtitle": "15.Similarity search[ćwiczenia]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
1491
cw/BBC News Train.csv
Normal file
BIN
cw/obrazki/1.png
Normal file
After Width: | Height: | Size: 86 KiB |
266
cw/obrazki/1.svg
Normal file
@ -0,0 +1,266 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<!-- Created with Inkscape (http://www.inkscape.org/) -->
|
||||
|
||||
<svg
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:cc="http://creativecommons.org/ns#"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:svg="http://www.w3.org/2000/svg"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
width="800mm"
|
||||
height="800mm"
|
||||
viewBox="0 0 800 800"
|
||||
version="1.1"
|
||||
id="svg16"
|
||||
sodipodi:docname="1.svg"
|
||||
inkscape:export-filename="/home/kuba/Syncthing/przedmioty/2020-02/ISI/zajecia7_regresja_liniowa/obrazki/6.png"
|
||||
inkscape:export-xdpi="96"
|
||||
inkscape:export-ydpi="96"
|
||||
inkscape:version="0.92.5 (2060ec1f9f, 2020-04-08)">
|
||||
<defs
|
||||
id="defs10" />
|
||||
<sodipodi:namedview
|
||||
id="base"
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1.0"
|
||||
inkscape:pageopacity="0.0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:zoom="0.35"
|
||||
inkscape:cx="1485.1537"
|
||||
inkscape:cy="1417.9979"
|
||||
inkscape:document-units="mm"
|
||||
inkscape:current-layer="layer1"
|
||||
showgrid="false"
|
||||
width="800mm"
|
||||
inkscape:window-width="2560"
|
||||
inkscape:window-height="1389"
|
||||
inkscape:window-x="0"
|
||||
inkscape:window-y="0"
|
||||
inkscape:window-maximized="1">
|
||||
<inkscape:grid
|
||||
type="xygrid"
|
||||
id="grid253" />
|
||||
</sodipodi:namedview>
|
||||
<metadata
|
||||
id="metadata13">
|
||||
<rdf:RDF>
|
||||
<cc:Work
|
||||
rdf:about="">
|
||||
<dc:format>image/svg+xml</dc:format>
|
||||
<dc:type
|
||||
rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
|
||||
<dc:title></dc:title>
|
||||
</cc:Work>
|
||||
</rdf:RDF>
|
||||
</metadata>
|
||||
<g
|
||||
inkscape:label="Layer 1"
|
||||
inkscape:groupmode="layer"
|
||||
id="layer1"
|
||||
transform="translate(0,503)">
|
||||
<rect
|
||||
id="rect18"
|
||||
width="700.24615"
|
||||
height="11.759859"
|
||||
x="62.006527"
|
||||
y="148.39815"
|
||||
style="stroke-width:0.26458332" />
|
||||
<rect
|
||||
id="rect18-3"
|
||||
width="700.24615"
|
||||
height="11.759859"
|
||||
x="-475.47943"
|
||||
y="-99.864838"
|
||||
style="stroke-width:0.26458332"
|
||||
transform="rotate(90.042959)" />
|
||||
<circle
|
||||
id="path37"
|
||||
cx="138.44562"
|
||||
cy="-13.583364"
|
||||
r="11.22532"
|
||||
style="stroke-width:0.26458332" />
|
||||
<circle
|
||||
id="path37-9"
|
||||
cx="298.2728"
|
||||
cy="-3.4271142"
|
||||
r="11.22532"
|
||||
style="stroke-width:0.26458332" />
|
||||
<circle
|
||||
id="path37-7"
|
||||
cx="293.99649"
|
||||
cy="-161.65015"
|
||||
r="11.22532"
|
||||
style="stroke-width:0.26458332" />
|
||||
<circle
|
||||
id="path37-92"
|
||||
cx="349.58853"
|
||||
cy="-91.091507"
|
||||
r="11.22532"
|
||||
style="stroke-width:0.26458332" />
|
||||
<circle
|
||||
id="path37-0"
|
||||
cx="551.64429"
|
||||
cy="-123.16381"
|
||||
r="11.22532"
|
||||
style="stroke-width:0.26458332" />
|
||||
<circle
|
||||
id="path37-2"
|
||||
cx="505.67395"
|
||||
cy="-385.08951"
|
||||
r="11.22532"
|
||||
style="stroke-width:0.26458332" />
|
||||
<circle
|
||||
id="path37-3"
|
||||
cx="709.86786"
|
||||
cy="-417.16187"
|
||||
r="11.22532"
|
||||
style="stroke-width:0.26458332" />
|
||||
<circle
|
||||
id="path37-75"
|
||||
cx="450.08188"
|
||||
cy="-214.03429"
|
||||
r="11.22532"
|
||||
style="stroke-width:0.26458332" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
style="font-style:normal;font-weight:normal;font-size:42.33333333px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332;"
|
||||
x="655.34485"
|
||||
y="192.23036"
|
||||
id="text215"><tspan
|
||||
sodipodi:role="line"
|
||||
id="tspan213"
|
||||
x="655.34485"
|
||||
y="192.23036"
|
||||
style="stroke-width:0.26458332;font-size:42.33333333px;">x</tspan></text>
|
||||
<text
|
||||
xml:space="preserve"
|
||||
style="font-style:normal;font-weight:normal;font-size:42.33333206px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
|
||||
x="36.73391"
|
||||
y="-383.11801"
|
||||
id="text215-8"><tspan
|
||||
sodipodi:role="line"
|
||||
id="tspan213-9"
|
||||
x="36.73391"
|
||||
y="-345.66293"
|
||||
style="font-size:42.33333206px;stroke-width:0.26458332" /></text>
|
||||
<rect
|
||||
style="fill:#000000;stroke-width:0.26458332"
|
||||
id="rect263"
|
||||
width="6.8035712"
|
||||
height="38.55357"
|
||||
x="-218.69528"
|
||||
y="-431.2952"
|
||||
transform="rotate(37.42867)" />
|
||||
<rect
|
||||
style="fill:#000000;stroke-width:0.26458332"
|
||||
id="rect263-7"
|
||||
width="6.8035712"
|
||||
height="38.55357"
|
||||
x="-386.60941"
|
||||
y="255.82913"
|
||||
transform="rotate(139.04298)"
|
||||
inkscape:transform-center-x="-20.410714"
|
||||
inkscape:transform-center-y="6.8035653" />
|
||||
<rect
|
||||
style="fill:#000000;stroke-width:0.26458332"
|
||||
id="rect263-3"
|
||||
width="6.8035712"
|
||||
height="38.55357"
|
||||
x="-371.74628"
|
||||
y="-681.80341"
|
||||
transform="rotate(129.61772)" />
|
||||
<rect
|
||||
style="fill:#000000;stroke-width:0.26458332"
|
||||
id="rect263-7-6"
|
||||
width="6.8035712"
|
||||
height="38.55357"
|
||||
x="-601.17584"
|
||||
y="456.17935"
|
||||
transform="rotate(-128.76797)"
|
||||
inkscape:transform-center-x="7.5782166"
|
||||
inkscape:transform-center-y="20.135944" />
|
||||
<text
|
||||
xml:space="preserve"
|
||||
style="font-style:normal;font-weight:normal;font-size:42.33333206px;line-height:1.25;font-family:sans-serif;letter-spacing:0px;word-spacing:0px;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.26458332"
|
||||
x="48.032505"
|
||||
y="-377.82925"
|
||||
id="text215-1"><tspan
|
||||
sodipodi:role="line"
|
||||
id="tspan213-2"
|
||||
x="48.032505"
|
||||
y="-377.82925"
|
||||
style="font-size:42.33333206px;stroke-width:0.26458332">y</tspan><tspan
|
||||
sodipodi:role="line"
|
||||
x="48.032505"
|
||||
y="-324.9126"
|
||||
style="font-size:42.33333206px;stroke-width:0.26458332"
|
||||
id="tspan334" /></text>
|
||||
<rect
|
||||
id="rect18-9"
|
||||
width="670.43402"
|
||||
height="13.544262"
|
||||
x="114.69541"
|
||||
y="-151.7952"
|
||||
style="fill:#ff0000;stroke-width:0.27783805"
|
||||
transform="matrix(0.99999973,7.380958e-4,0.11550968,0.99330635,0,0)" />
|
||||
<rect
|
||||
style="fill:#00ff00;stroke-width:0.26458332"
|
||||
id="rect390"
|
||||
width="5.2916665"
|
||||
height="134.55952"
|
||||
x="136.07143"
|
||||
y="-146.74403" />
|
||||
<rect
|
||||
style="fill:#00ff00;stroke-width:0.26458332"
|
||||
id="rect392"
|
||||
width="5.2916665"
|
||||
height="20.410715"
|
||||
x="290.28571"
|
||||
y="-164.13097" />
|
||||
<rect
|
||||
style="fill:#00ff00;stroke-width:0.26458332"
|
||||
id="rect396"
|
||||
width="6.0476379"
|
||||
height="143.63097"
|
||||
x="295.57736"
|
||||
y="-143.72026" />
|
||||
<rect
|
||||
style="fill:#00ff00;stroke-width:0.26458332"
|
||||
id="rect398"
|
||||
width="4.5357141"
|
||||
height="55.184521"
|
||||
x="346.98215"
|
||||
y="-143.72023" />
|
||||
<rect
|
||||
style="fill:#00ff00;stroke-width:0.26458332"
|
||||
id="rect400"
|
||||
width="5.2916665"
|
||||
height="73.327377"
|
||||
x="448.27979"
|
||||
y="-215.53571" />
|
||||
<rect
|
||||
style="fill:#00ff00;stroke-width:0.26458332"
|
||||
id="rect402"
|
||||
width="3.7797618"
|
||||
height="243.41666"
|
||||
x="503.46429"
|
||||
y="-386.38095" />
|
||||
<rect
|
||||
style="fill:#00ff00;stroke-width:0.22913587"
|
||||
id="rect404"
|
||||
width="4.5357146"
|
||||
height="27.970238"
|
||||
x="547.30951"
|
||||
y="-145.9881" />
|
||||
<rect
|
||||
style="fill:#00ff00;stroke-width:0.26458332"
|
||||
id="rect406"
|
||||
width="4.5357141"
|
||||
height="276.67856"
|
||||
x="707.57141"
|
||||
y="-419.64285" />
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 7.6 KiB |
BIN
cw/obrazki/10.png
Normal file
After Width: | Height: | Size: 129 KiB |
BIN
cw/obrazki/2.png
Normal file
After Width: | Height: | Size: 92 KiB |
BIN
cw/obrazki/3.png
Normal file
After Width: | Height: | Size: 68 KiB |
BIN
cw/obrazki/4.png
Normal file
After Width: | Height: | Size: 102 KiB |
BIN
cw/obrazki/5.png
Normal file
After Width: | Height: | Size: 96 KiB |
BIN
cw/obrazki/6.png
Normal file
After Width: | Height: | Size: 69 KiB |
BIN
cw/obrazki/7.png
Normal file
After Width: | Height: | Size: 132 KiB |
BIN
cw/obrazki/8.png
Normal file
After Width: | Height: | Size: 137 KiB |
BIN
cw/obrazki/9.png
Normal file
After Width: | Height: | Size: 86 KiB |
5
run_conversion.sh
Normal file
@ -0,0 +1,5 @@
|
||||
for i in {cw,wyk}/*.ipynb;
|
||||
do
|
||||
bash convert_ipynb_to_md.sh $i
|
||||
echo $i done
|
||||
done
|
@ -1,5 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 1. <i>Wyszukiwarki — wprowadzenie</i> [wykład]</h2> \n",
|
||||
"<h3> Filip Graliński (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@ -8,16 +22,19 @@
|
||||
"\n",
|
||||
"## Systemy wyszukiwania informacji (information retrieval systems)\n",
|
||||
"\n",
|
||||
"![System wyszukiwania informacji](system-wyszukiwania-informacji.png)"
|
||||
"![Schemat systemu wyszukiwania informacji](system-wyszukiwania-informacji.png)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true,
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"## Wyszukiwarki\n",
|
||||
"\n",
|
||||
"![Wyszukiwarki](wyszukiwarka-internetowa.png)"
|
||||
"![Schemat wyszukiwarki internetowej](wyszukiwarka-internetowa.png)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -89,7 +106,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Dostępne są też \"ekstrakty\" czystego tekstu - zob. http://data.statmt.org/ngrams/raw/, np. 59 GB czystego tekstu po polsku z 2012 roku."
|
||||
"Dostępne są też „ekstrakty” czystego tekstu — zob. http://data.statmt.org/ngrams/raw/, np. 59 GB czystego tekstu po polsku z 2012 roku."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -284,7 +301,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Odpytywać \"pasożytniczo\" inną wyszukiwarkę"
|
||||
"### Odpytywać „pasożytniczo” inną wyszukiwarkę"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -293,7 +310,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# see https://hackernoon.com/how-to-scrape-google-with-python-bo7d2tal\n",
|
||||
"# zob. https://hackernoon.com/how-to-scrape-google-with-python-bo7d2tal\n",
|
||||
"\n",
|
||||
"import urllib\n",
|
||||
"import requests\n",
|
||||
@ -817,823 +834,7 @@
|
||||
"User-agent: *\n",
|
||||
"Disallow: /*/wyszukaj/\n",
|
||||
"Disallow: /*servlet\n",
|
||||
"Disallow: /reloadwww?\n",
|
||||
"Disallow: /dfptools/adview/\n",
|
||||
"Disallow: /pub/ips/*\n",
|
||||
"Disallow: /ods?\n",
|
||||
"Disallow: /getFile.servlet*\n",
|
||||
"Disallow: /aliasy/blad.jsp\n",
|
||||
"Disallow: /znajdz.do\n",
|
||||
"Disallow: /portalSearch.do\n",
|
||||
"Disallow: /im/ab/b4/10/z17515435Q.jpg\n",
|
||||
"Disallow: /75224259/\n",
|
||||
"\n",
|
||||
"User-agent: Googlebot-News\n",
|
||||
"Disallow: /nowy/\n",
|
||||
"Disallow: /mapa_strony\n",
|
||||
"Disallow: /*/wyszukaj/\n",
|
||||
"Disallow: /*/51,\n",
|
||||
"Disallow: /*/55,\n",
|
||||
"Disallow: /*/2,\n",
|
||||
"Disallow: /*order=\n",
|
||||
"Disallow: /*obxx=\n",
|
||||
"Disallow: /*tag=\n",
|
||||
"Disallow: /reloadwww?\n",
|
||||
"Disallow: /ods?\n",
|
||||
"Disallow: /*servlet\n",
|
||||
"Disallow: /dfptools/adview/\n",
|
||||
"\n",
|
||||
"User-agent: Yandex\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-Agent: bingbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: 008\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: 010\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: 360Spider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: 80legs\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Aboundex\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: accelobot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Add\\ Catalog\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: AhrefsBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: aiHitBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Alexibot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Aqua_Products\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: AskJeeves\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: asterias\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: awcheckBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: b2w/0.1\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BackDoorBot/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BacklinkCrawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Baiduspider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BecomeBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BLEXBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BlowFish/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Bookmark search tool\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BotALot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: brandwatch.net\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BuiltBotTough\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Bullseye/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: BunnySlippers\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Butterfly\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CatchBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Charlotte\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CheeseBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CherryPicker\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CherryPickerElite/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CherryPickerSE/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CLIPish\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Cliqzbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: COMODO\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Comodo-Certificates-Spider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CompSpyBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Copernic\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: CopyRightCheck\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: cosmos\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: crawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Crescent\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Crescent Internet ToolPak HTTP OLE Control v.1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Curious\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: curl\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: dataprovider\\.com\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: DinoPing\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: discoverybot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: DittoSpyder\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: DomainCrawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: DomainCrawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: dotbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: dotnetdotcom\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Dow\\ Jones\\ Searchbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: dumbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: EasouSpider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: EmailCollector\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: EmailSiphon\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: EmailWolf\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Enterprise_Search\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Enterprise_Search/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: EroCrawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: es\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Exabot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ExtractorPro\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: EzineArticlesLinkScanner\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Ezooms\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: FairAd Client\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Flaming AttackBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Foobot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: FreeFind\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: FTRF\\:\\ Friendly\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Gaisbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: GetRight/4.2\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: gigabot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: grub\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: grub-client\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Harvest/1.5\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Hatena Antenna\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: hloader\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: http://www.SearchEngineWorld.com bot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: http://www.WebmasterWorld.com bot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: HTTP_Request\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: HTTP_Request2\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: httplib\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: humanlinks\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ia_archiver\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ia_archiver\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ia_archiver/1.6\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Indy\\ Library\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: InfoNaviRobot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ip\\-web\\-crawler\\.com\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Iron33/1.0.2\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Jakarta\\ Commons-HttpClient\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Jeeves\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: JennyBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Jetbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Jetbot/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: JikeSpider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Kenjin Spider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Keyword Density/0.9\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: larbin\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: LexiBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: libWeb/clsHTTP\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: libwww-perl\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: lindex\\.com\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: linkdex\\.com\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: linkdexbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: LinkextractorPro\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: LinkScan/8.1a Unix\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: LinkWalker\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: lipperhey\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: LNSpiderguy\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: looksmart\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ltbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: lwp-trivial\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: lwp-trivial/1.34\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Lynx\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: magpie\\-crawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Mata Hari\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Microsoft URL Control\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Microsoft URL Control - 5.01.4511\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Microsoft URL Control - 6.00.8169\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: MIIxpc\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: MIIxpc/4.2\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Mister PiX\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: MJ12bot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: moget\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: moget/2.1\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Mozilla/4.0 (compatible; BullsEye; Windows 95)\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: MSIE\\ or\\ Firefox\\ mutant\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: MSIECrawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: naver\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: NCBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: NetAnts\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: NetcraftSurveyAgent\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: netEstate\\ NE\\ Crawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: NetMechanic\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Netseer\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: NextGenSearchBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: NICErsPRO\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Nutch\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Nutch\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Ocelli\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Offline Explorer\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: OmniExplorer_Bot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Openbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Openfind\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Openfind\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Openfind data gathere\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: OpenWebIndex\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Oracle Ultra Search\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: PagesInventory\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: PEAR\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: PeoplePal\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: PerMan\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ProCogSEOBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ProPowerBot/2.14\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ProWebWalker\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: proximic\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: psbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: purebot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: QueryN Metasearch\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: QuerySeekerSpider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Radiation Retriever 1.1\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: RepoMonkey\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: RepoMonkey Bait & Tackle/v1.01\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Riddler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: RMA\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: rojerbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: RyteBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: scooter\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ScoutJet\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Scrapy\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ScreenerBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: searchmetrics\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: searchpreview\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SemrushBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: sentibot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SEO-CRAWLING\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SEOENGWorldBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SEOkicks-Robot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ShopWiki\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: sistrix\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: sitebot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SiteSnagger\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Snoopy\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SocialSearcher\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Sogou\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SolomonoBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: sootle\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Sosospider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SpankBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: spanner\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: spbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Speedy\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Stanford\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Stanford Comp Sci\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: SurveyBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: suzuran\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Szukacz/1.4\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Szukacz/1.4\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Teleport\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: TeleportPro\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Telesoft\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Teoma\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: The Intraformant\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: The\\ Incutio\\ XML-RPC\\ PHP\\ Library\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: TheNomad\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: toCrawl/UrlDispatcher\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: True_Robot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: True_Robot/1.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: turingos\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: TurnitinBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: uCrawler\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: URL Control\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: URL_Spider_Pro\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: URLy Warning\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: VCI\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: VCI WebViewer VCI WebViewer Win32\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: visaduhoc\\.info\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WBSearchBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Web Image Collector\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebAuto\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebBandit\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebBandit/3.50\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebCapture\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebCopier\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebEnhancer\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebInDetail\\.com\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebmasterWorld Extractor\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebmasterWorldForumBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebSauger\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Website Quester\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WEBSITEtheWEB\\.COM\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Webster Pro\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebStripper\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebVac\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebZip\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WebZip/4.0\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Wget\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Wget/1.5.3\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Wget/1.6\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Wotbot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: www\\.integromedb\\.org\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: WWW-Collector-E\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Xenu's\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Xenu's Link Sleuth 1.1c\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: xpymep\\.exe\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: YamanaLab-Robot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: YisouSpider\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: YodaoBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: YoudaoBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Zend_Http_Client\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Zeus\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Zeus 32297 Webster Pro V2.9 Win32\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Zeus Link Scout\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ZmEu\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: ZumBot\n",
|
||||
"Disallow: /\n",
|
||||
"\n",
|
||||
"User-agent: Linguee\n",
|
||||
"Disallow: /\n",
|
||||
"...\n",
|
||||
"\n",
|
||||
"User-agent: sogou\n",
|
||||
"Disallow: /\n"
|
||||
@ -1675,11 +876,14 @@
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Filip Graliński",
|
||||
"email": "filipg@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
@ -1690,8 +894,11 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
"version": "3.9.6"
|
||||
},
|
||||
"subtitle": "2.Wyszukiwarki — wprowadzenie[wykład]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
|
@ -1,5 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 2. <i>Wyszukiwarki — roboty</i> [wykład]</h2> \n",
|
||||
"<h3> Filip Graliński (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@ -314,7 +328,7 @@
|
||||
"\n",
|
||||
"* urllib\n",
|
||||
"* request\n",
|
||||
"* Beautiful Soup (do parsowania HTML-a)"
|
||||
"* Beautiful Soup (do parsowania dokumentów HTML)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -494,11 +508,14 @@
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Filip Graliński",
|
||||
"email": "filipg@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
@ -509,8 +526,11 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
}
|
||||
"version": "3.9.6"
|
||||
},
|
||||
"subtitle": "2.Wyszukiwarki — roboty[wykład]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
|
9120
wyk/03_Tfidf.ipynb
@ -1,5 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 5. <i>Gęste reprezentacje wektorowe</i> [wykład]</h2> \n",
|
||||
"<h3> Filip Graliński (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@ -129,7 +143,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Musimy tylko sparametryzować naszą funkcję rozmiarem \"odcisku\" (parametr $b$)."
|
||||
"Musimy tylko sparametryzować naszą funkcję rozmiarem „odcisku” (parametr $b$)."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -1604,11 +1618,14 @@
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Filip Graliński",
|
||||
"email": "filipg@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Haskell",
|
||||
"language": "haskell",
|
||||
"name": "haskell"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": "ihaskell",
|
||||
"file_extension": ".hs",
|
||||
@ -1616,7 +1633,10 @@
|
||||
"name": "haskell",
|
||||
"pygments_lexer": "Haskell",
|
||||
"version": "8.10.4"
|
||||
}
|
||||
},
|
||||
"subtitle": "5.Gęste reprezentacje wektorowe[wykład]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
|
@ -1,5 +1,19 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 6. <i>Wyzwania uczenia maszynowego</i> [wykład]</h2> \n",
|
||||
"<h3> Filip Graliński (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@ -367,11 +381,14 @@
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Filip Graliński",
|
||||
"email": "filipg@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
@ -382,8 +399,11 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.2"
|
||||
}
|
||||
"version": "3.9.6"
|
||||
},
|
||||
"subtitle": "6.Wyzwania uczenia maszynowego[wykład]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
|
391
wyk/07_Naiwny_klasyfikator_bayesowski.ipynb
Normal file
@ -0,0 +1,391 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "45264aad",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 7. <i>Naiwny klasyfikator bayesowski w ekstrakcji informacji</i> [wykład]</h2> \n",
|
||||
"<h3> Filip Graliński (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "moderate-array",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Klasyfikacja binarna dla tekstu\n",
|
||||
"\n",
|
||||
"Zakładamy, że mamy dwie klasy: $c$ i jej dopełnienie ($\\bar{c}$).\n",
|
||||
"\n",
|
||||
"Typowym przykładem jest zadanie klasyfikacji mejla, czy należy do spamu, czy nie (_spam_ vs _ham_), czyli, innymi słowy, filtr antyspamowy."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "correct-victory",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Pytanie**: Czy można wyobrazić sobie zadanie klasyfikacji mejli, niebędące zadaniem klasyfikacji binarnej?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "spiritual-diploma",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Zakładamy paradygmat uczenia nadzorowanego, tzn. dysponujemy zbiorem uczącym.\n",
|
||||
"\n",
|
||||
"**Pytanie**: Czym jest i w jaki sposób powstaje zbiór uczący dla filtru antyspamowego?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "secure-performance",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Klasyfikacja regułowa\n",
|
||||
"\n",
|
||||
"Filtry antyspamowe _można_ zrealizować za pomocą metod innych niż oparte na uczeniu maszynowym. Można np. tworzyć reguły (np. wyrażenia regularne). Przykładem są (barokowe...) reguły w programie SpamAssassin, zob. fragment [pliku reguł](https://github.com/apache/spamassassin/blob/trunk/rules/20_advance_fee.cf):\n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"header __FRAUD_VQE\tSubject =~ /^(?:Re:|\\[.{1,10}\\])?\\s*(?:very )?urgent\\s+(?:(?:and|&)\\s+)?(?:confidential|assistance|business|attention|reply|response|help)\\b/i\n",
|
||||
"\n",
|
||||
"body __FRAUD_DBI\t/(?:\\bdollars?\\b|\\busd(?:ollars)?(?:[0-9]|\\b)|\\bus\\$|\\$[0-9,.]{6,}|\\$[0-9].{0,8}[mb]illion|\\$[0-9.,]{2,10} ?m|\\beuros?\\b|u[.]?s[.]? [0-9.]+ m)/i\n",
|
||||
"body __FRAUD_KJV\t/(?:claim|concerning) (?:the|this) money/i\n",
|
||||
"body __FRAUD_IRJ\t/(?:finance|holding|securit(?:ies|y)) (?:company|firm|storage house)/i\n",
|
||||
"body __FRAUD_NEB\t/(?:government|bank) of nigeria/i\n",
|
||||
"body __FRAUD_XJR\t/(?:who was a|as a|an? honest|you being a|to any) foreigner/i\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"**Pytanie:** Jakie są wady i zalety regułowych filtrów antyspamowych?\n",
|
||||
"\n",
|
||||
"Współcześnie zdecydowanie dominuje użycie metod statystycznych (opartych na nadzorowanym uczeniu maszynowym). Do popularności tych metod przyczynił się artykuł [Plan for spam](http://www.paulgraham.com/spam.html) autorstwa Paula Grahama."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "indoor-ending",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Podejście generatywne i dyskryminatywne\n",
|
||||
"\n",
|
||||
"W klasyfikacji (i w ogóle w uczeniu nadzorowanym) można wskazać dwa podejścia:\n",
|
||||
"\n",
|
||||
"* generatywne — wymyślamy pewną „historyjkę”, w jaki sposób powstaje tekst, „historyjka” powinna mieć miejsca do wypełnienia (parametry), np. częstości wyrazów, na podstawie zbioru uczącego dobieramy wartości parametrów (przez rachunki wprost); „historyjka” nie musi być prawdziwa, wystarczy, że jakoś przybliża rzeczywistość\n",
|
||||
"\n",
|
||||
"* dyskryminatywne — nie zastanawiamy się, w jaki sposób powstają teksty, po prostu „na siłę” dobieramy wartości parametrów (wag) modelu, tak aby uzyskać jak najmniejszą wartość funkcji kosztu na zbiorze uczącym; zwykle odbywa się to w iteracyjnym procesie (tak jak przedstawiono na schemacie na poprzednim wykładzie).\n",
|
||||
"\n",
|
||||
"**Pytanie**: Jakie są wady i zalety obu podejść?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "pleased-clinic",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Nasz \"dyżurny\" przykład\n",
|
||||
"\n",
|
||||
"Zakładamy, że nasz zbiór uczący ($X$) składa się z 4 dokumentów:\n",
|
||||
"\n",
|
||||
"* $x_1=\\mathit{kup\\ pan\\ Viagrę}$\n",
|
||||
"* $x_2=\\mathit{tanie\\ miejsce\\ dla\\ pana}$\n",
|
||||
"* $x_3=\\mathit{viagra\\ viagra\\ viagra}$\n",
|
||||
"* $x_4=\\mathit{kup\\ tanie\\ cartridge'e}$\n",
|
||||
"\n",
|
||||
"z następującymi etykietami:\n",
|
||||
"\n",
|
||||
"* $y_1=c$ (spam)\n",
|
||||
"* $y_2=\\bar{c}$ (nie-spam)\n",
|
||||
"* $y_3=c$\n",
|
||||
"* $y_4=c$\n",
|
||||
"\n",
|
||||
"Zakładamy, że dokumenty podlegają lematyzacji i sprowadzeniu do małych liter, więc ostatecznie będziemy mieli następujące ciągi termów:\n",
|
||||
"\n",
|
||||
"* $x_1=(\\mathit{kupić}, \\mathit{pan}, \\mathit{viagra})$\n",
|
||||
"* $x_2=(\\mathit{tani}, \\mathit{miejsce}, \\mathit{dla}, \\mathit{pan})$\n",
|
||||
"* $x_3=(\\mathit{viagra}, \\mathit{viagra}, \\mathit{viagra})$\n",
|
||||
"* $x_4=(\\mathit{kupić}, \\mathit{tani}, \\mathit{cartridge})$\n",
|
||||
"\n",
|
||||
"$P(tani|c) = (1+1)/(9+7) = 2/16 = 0.125$\n",
|
||||
"$P(viagra|c) = \\frac{4+1}{9 + 7} = 5/16 = 0.3125 $\n",
|
||||
"$P(dla|c) = \\frac{0+1}{9+7} = 1/16 = 0.0625$\n",
|
||||
"$P(pan|c) = (1+1)/(9+7) = 2/16 = 0.125 $\n",
|
||||
"$P(c) = 0.75$\n",
|
||||
"\n",
|
||||
"w wersji wielomianowej: $P(c)P(tani|c)P(tani|c)P(viagra|c)P(dla|c)P(pan|c) = 0.75 * 0.125 * 0.125 * 0.3125 * 0.0625 * 0.125 = 0.00002861$\n",
|
||||
"\n",
|
||||
"w wersji Bernoulliego: $P(c)P(U_{dla}=1|c)P(U_{cartridge}=0|c)P(U_{viagra}=1|c)P(U_{pan}=1|c)P(U_{tani}=1|c)P(U_{miejsce}=0|c)P(U_{kup}=0|c)$\n",
|
||||
"\n",
|
||||
"$P(tani|\\bar{c}) = (1+1)/(4+7) = 2/11 =0.182 $\n",
|
||||
"$P(viagra|\\bar{c}) = 1/11 = 0.091 $\n",
|
||||
"$P(dla|\\bar{c}) = 2/11 = 0.182 $\n",
|
||||
"$P(pan|\\bar{c}) = 2/11 = 0.182 $\n",
|
||||
"$P(\\bar{c}) = 0.25$\n",
|
||||
"\n",
|
||||
"$P(\\bar{c})P(tani|\\bar{c})P(tani|\\bar{c})P(viagra|\\bar{c})P(dla|\\bar{c})P(pan|\\bar{c}) = 0.25 * 0.182 * 0.182 * 0.091 * 0.182 * 0.182 = 0.00002496$\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Uczymy na tym zbiorze klasyfikator, który będziemy testować na dokumencie $d=\\mathit{tania\\ tania\\ viagra\\ dla\\ pana}$, tj. po normalizacji\n",
|
||||
"$d=(\\mathit{tani}, \\mathit{tani}, \\mathit{viagra}, \\mathit{dla}, \\mathit{pan})$.\n",
|
||||
"\n",
|
||||
"**Uwaga:** Przykład jest oczywiście nierealistyczny i trudno będzie nam ocenić sensowność odpowiedzi. Za to będziemy w stanie policzyć ręcznie wynik.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "partial-military",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Naiwny klasyfikator bayesowski\n",
|
||||
"\n",
|
||||
"* _naiwny_ — niekoniecznie oznacza, że to „głupi”, bezużyteczny klasyfikator\n",
|
||||
"* _klasyfikator_ \n",
|
||||
"* _bayesowski_ — będzie odwoływać się do wzoru Bayesa.\n",
|
||||
"\n",
|
||||
"Naiwny klasyfikator bayesowski raczej nie powinien być stosowany „produkcyjnie” (są lepsze metody). Natomiast jest to metoda bardzo prosta w implementacji dająca przyzwoity _baseline_.\n",
|
||||
"\n",
|
||||
"Naiwny klasyfikator bayesowski ma dwie odmiany:\n",
|
||||
"\n",
|
||||
"* wielomianową,\n",
|
||||
"* Bernoulliego.\n",
|
||||
"\n",
|
||||
"Wielomianowy naiwny klasyfikator bayesowski jest częściej spotykany i od niego zaczniemy."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "colonial-creature",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Mamy dokument $d$ i dwie klasy $c$ i $\\bar{c}$. Policzymy prawdopodobieństwa $P(c|d)$ (mamy dokument $d$, jakie jest prawdopodobieństwo, że to klasa $c$) i $P(\\bar{c}|d)$. A właściwie będziemy te prawdopodobieństwa porównywać.\n",
|
||||
"\n",
|
||||
"**Uwaga**: nasz zapis to skrót notacyjny, właściwie powinniśmy podać zmienne losowe $P(C=c|D=d)$, ale zazwyczaj będziemy je pomijać. \n",
|
||||
"\n",
|
||||
"**Pytanie**: kiedy ostatecznie nasz klasyfikator zwróci informację, że klasa $c$, a kiedy że $\\bar{c}$? czy użytkownika interesują prawdopodobieństwa $P(c|d)$ i $P(\\bar{c}|d)$?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "governing-fiction",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Zastosujmy najpierw wzór Bayesa.\n",
|
||||
"\n",
|
||||
"$P(c|d) = \\frac{P(d|c) P(c)}{P(d)}$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "northern-spine",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"$P(\\bar{c}|d) = \\frac{P(d|\\bar{c}) P(\\bar{c})}{P(d)}$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "utility-induction",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"(Oczywiście skądinąd $P(\\bar{c}|d) = 1 - P(c|d)$, ale nie będziemy teraz tego wykorzystywali.)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "timely-force",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Co możemy pominąć, jeśli tylko porównujemy $P(c|d)$ i $P(\\bar{c}|d)$?\n",
|
||||
"\n",
|
||||
"Użyjmy znaku proporcjonalności $\\propto$:\n",
|
||||
"\n",
|
||||
"$P(c|d) = \\frac{P(d|c) P(c)}{P(d)} \\propto P(d|c) P(c)$\n",
|
||||
"\n",
|
||||
"$P(\\bar{c}|d) = \\frac{P(d|\\bar{c}) P(\\bar{c})}{P(d)} \\propto P(d|\\bar{c}) P(\\bar{c})$\n",
|
||||
"\n",
|
||||
"**Pytanie:** czy iloczyn $P(d|c)P(c)$ można interpretować jako prawdopodobieństwo?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "embedded-involvement",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Prawdopodobieństwo _a priori_\n",
|
||||
"\n",
|
||||
"$P(c)$ — prawdopodobieństwo a priori klasy $c$\n",
|
||||
"\n",
|
||||
"$\\hat{P}(c) = \\frac{N_c}{N}$\n",
|
||||
"\n",
|
||||
"gdzie\n",
|
||||
"\n",
|
||||
"* $N$ — liczba wszystkich dokumentów w zbiorze uczącym\n",
|
||||
"* $N_c$ — liczba dokumentów w zbiorze uczącym z klasą $c$\n",
|
||||
"\n",
|
||||
"$\\hat{P}(c) = 0,75$\n",
|
||||
"\n",
|
||||
"$\\hat{P}(\\bar{c}) = 0,25$\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "virgin-premiere",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Prawdopodobieństwo _a posteriori_\n",
|
||||
"\n",
|
||||
"Jak interpretować $P(d|c)$?\n",
|
||||
"\n",
|
||||
"Wymyślmy sobie model generatywny, $P(d|c)$ będzie prawdopodobieństwem, że spamer (jeśli $c$ to spam) wygeneruje tekst.\n",
|
||||
"\n",
|
||||
"Załóżmy, że dokument $d$ to ciąg $n$ termów, $d = (t_1\\dots t_n)$. Na razie niewiele z tego wynika."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "acting-zimbabwe",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"$P(d|c) = P(t_1\\dots t_n|c)$\n",
|
||||
"\n",
|
||||
"Aby pójść dalej, musimy doszczegółowić nasz model generatywny. Przyjmijmy bardzo naiwny i niezgodny z rzeczywistością model spamera (i nie-spamera): spamer wyciąga wyrazy z worka i wrzuca je z powrotem (losowanie ze zwracaniem). Jedyne co odróżnia spamera i nie-spamera, to **prawdopodobieństwo wylosowania wyrazu** (np. spamer wylosuje słowo _Viagra_ z dość dużym prawdopodobieństwem, nie-spamer — z bardzo niskim).\n",
|
||||
"\n",
|
||||
"**Pytanie:** Ile może wynosić $P(\\mathit{Viagra}|c)$?\n",
|
||||
"\n",
|
||||
"Po przyjęciu takich „naiwnych założeń”:\n",
|
||||
"\n",
|
||||
"$$P(d|c) = P(t_1\\dots t_n|c) \\approx P(t_1|c)\\dots P(t_n|c) = \\prod_i^n P(t_i|c)$$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "adjustable-disney",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Jak oszacować $\\hat{P}(t|c)$?\n",
|
||||
"\n",
|
||||
"$$\\hat{P}(t|c) = \\frac{\\#(t,c)}{\\sum_i^{|V|} \\#(t_i,c)} = \\frac{\\mathit{ile\\ razy\\ term\\ t\\ pojawił\\ się\\ w\\ dokumentach\\ klasy\\ c}}{liczba\\ wyrazów\\ w\\ klasie\\ c}$$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "associate-variance",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Wygładzanie\n",
|
||||
"\n",
|
||||
"Mamy problem z zerowymi prawdopodobieństwami.\n",
|
||||
"\n",
|
||||
"Czy jeśli w naszym zbiorze uczącym spamerzy ani razu nie użyli słowa _wykładzina_, to $P(\\mathit{wykładzina}|c) = 0$?\n",
|
||||
"\n",
|
||||
"Musimy zastosować wygładzanie (_smoothing_). Spróbujmy wymyślić wygładzanie wychodząc od zdroworozsądkowych aksjomatów.\n",
|
||||
"\n",
|
||||
"#### Aksjomaty wygładzania.\n",
|
||||
"\n",
|
||||
"Załóżmy, że mamy dyskretną przestrzeń probabilistyczną $\\Omega = \\{\\omega_1,\\dots,\\omega_m\\}$, zdarzenie $\\omega_i$ w naszym zbiorze uczącym wystąpiło $k_i$ razy. Wprost prawdopodobieństwa byśmy oszacowali tak: $P(\\omega_i) = \\frac{k_i}{\\sum_j^m k_j}$.\n",
|
||||
"Szukamy zamiast tego funkcji wygładzającej $f(m, k, T)$ (innej niż $f(m, k, T) = \\frac{k}{T}$), która spełniałaby następujące aksjomaty:\n",
|
||||
"\n",
|
||||
"1. $f(m, k, T) \\in [0, 1]$\n",
|
||||
"2. $f(m, k, T) \\in (0, 1)$ jeśli $m > 1$\n",
|
||||
"3. $\\sum_i f(m, k_i, T) = 1$, jeśli $\\sum_i k_i = T$\n",
|
||||
"4. $f(m, 0, 0) = \\frac{1}{m}$\n",
|
||||
"5. $\\lim_{T \\to \\infty} f(m, k, T) = \\frac{k}{T}$\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"m=2, k1=2, k2=4, T=6, 2/6 => f(2, 2, 6) > 0.333, f(2, 4, 6) < 0.666 \n",
|
||||
"\n",
|
||||
"Jaka funkcja spełnia te aksjomaty?\n",
|
||||
"\n",
|
||||
"$$f(m, k, T) = \\frac{k+1}{T+m}$$\n",
|
||||
"\n",
|
||||
"Jest to wygładzanie +1, inaczej wygładzanie Laplace'a.\n",
|
||||
"\n",
|
||||
"**Pytanie:** Wymyślić jakiś inny przykład funkcji, która będzie spełniała aksjomaty.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "complimentary-airplane",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Po zastosowaniu do naszego naiwnego klasyfikatora otrzymamy:\n",
|
||||
" \n",
|
||||
"$$\\hat{P}(t|c) = \\frac{\\#(t,c) + 1}{\\sum_i^{|V|} \\#(t_i,c) + |V|}$$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "comprehensive-junior",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Metoda Bernoulliego"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "vocational-spanish",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"$$P(d|c) \\approx P(U_1=u_1|c)\\dots P(U_{|V|}=u_{|V|}|c)$$, gdzie $u_i = 1$, jeśli $t_i$ pojawił się w dokumencie $d$, 0 — w przeciwnym razie\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "enabling-manitoba",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"$\\hat{P}(U_{viagra}=1|c) = \\frac{\\#(viagra,N_c) + 1}{N_c + 2}$"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "bearing-execution",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Filip Graliński",
|
||||
"email": "filipg@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
},
|
||||
"subtitle": "7.Naiwny klasyfikator bayesowski w ekstrakcji informacji[wykład]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
326
wyk/08_Regresja_liniowa.ipynb
Normal file
@ -0,0 +1,326 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "35c19016",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 8. <i>Regresja liniowa</i> [wykład]</h2> \n",
|
||||
"<h3> Filip Graliński (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cathedral-newark",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Regresja liniowa\n",
|
||||
"\n",
|
||||
"Regresja liniowa jest prosta...\n",
|
||||
"\n",
|
||||
"![Ceny mieszkań](./08_files/linregr1.png)\n",
|
||||
"\n",
|
||||
"... dosłownie — dopasuj prostą $y = ax + b$ do punktów\n",
|
||||
"\n",
|
||||
"Należy odgadnąć $a$ i $b$ tak, aby zminimalizować błąd\n",
|
||||
"kwadratowy, tj. wartość:\n",
|
||||
"\n",
|
||||
"$$\\sum_{i=1}^n (y_i - (ax_i + b))^2$$\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "heard-clinton",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Regresja liniowa (jednej zmiennej) jest łatwa do rozwiązania — wystarczy podstawić do wzoru!\n",
|
||||
"\n",
|
||||
"$$\\hat{a} = \\frac{ \\sum_{i=1}^{n}{x_i y_i} - \\frac{1}{n} \\sum_{i=1}^n x_i\n",
|
||||
" \\sum_{j=1}^n y_j}{ \\sum_{i=1}^n {x_i^2} - \\frac{1}{n} (\\sum_{i=1}^n\n",
|
||||
" x_i)^2 }$$\n",
|
||||
"\n",
|
||||
"$$\\hat{b} = \\bar{y} - \\hat{a}\\,\\bar{x}$$\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Na przykład dla mieszkań: $b =$ -30809.203 zł, $a =$ 5733.693 zł/m$^2$.\n",
|
||||
"\n",
|
||||
"![Ceny mieszkań](./08_files/linregr2.png)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "preceding-impression",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Regresja wielu zmiennych\n",
|
||||
"\n",
|
||||
"W praktyce mamy do czynienia z **wielowymiarową** regresją\n",
|
||||
"liniową.\n",
|
||||
"\n",
|
||||
"Cena mieszkań może być prognozowana na podstawie:\n",
|
||||
"\n",
|
||||
"* powierzchni w m$^2$ ($x_1 = 32.3$) $w_1 = 7000$\n",
|
||||
"\n",
|
||||
"* liczby pokoi ($x_2 = 3$) $w_2 = -30000$\n",
|
||||
" \n",
|
||||
"* nr piętra ($x_3 = 4$) \n",
|
||||
"\n",
|
||||
"* wieku ($x_4 = 13$) $w_4 = -1000$\n",
|
||||
"\n",
|
||||
"* odległości od Dworca Centralnego w Warszawie ($x_5 = 371.3$)\n",
|
||||
"\n",
|
||||
"* wielkości miasta\n",
|
||||
"\n",
|
||||
"* gęstości zaludnienia\n",
|
||||
"\n",
|
||||
"* cech zerojedynkowych:\n",
|
||||
"\n",
|
||||
" * czy wielka płyta? ($x_6 = 0$)\n",
|
||||
"\n",
|
||||
" * czy jest jacuzzi? ($x_7 = 1$) $w_7 = 5000$\n",
|
||||
"\n",
|
||||
" * czy jest grzyb? ($x_8 = 0$) $w_8 = -40000$\n",
|
||||
" \n",
|
||||
" * czy to Kielce? ($x_9 = 1$)\n",
|
||||
" \n",
|
||||
" * czy to Kraków ($x_{10} = 0$)\n",
|
||||
" \n",
|
||||
" * czy to Katowice ($x_{11} = 0$)\n",
|
||||
" \n",
|
||||
" * czy obok budynku jest parking \n",
|
||||
" \n",
|
||||
" * czy w budynku jest parking\n",
|
||||
"\n",
|
||||
"* zakodowany opis \n",
|
||||
"\n",
|
||||
" * $(x_{12}, \\dots, x_{|V|+11})$ - wektor tf-idf \n",
|
||||
"\n",
|
||||
"... więc uogólniamy na wiele ($k$) wymiarów:\n",
|
||||
"\n",
|
||||
"$$ y = w_0 + w_1x_1 + \\ldots + w_kx_k = w_0 + \\sum_{j=1}^{k} w_jx_j = w_0 + \\vec{w}\\vec{x}$$\n",
|
||||
"\n",
|
||||
"gdzie:\n",
|
||||
"\n",
|
||||
"* $x_1,\\dots,x_k$ -- zmienne, na podstawie których zgadujemy\n",
|
||||
"\n",
|
||||
"* $w_0, w_1,\\dots,w_k$ -- wagi modelu (do wymyślenia na\n",
|
||||
" podstawie przykładów)\n",
|
||||
"\n",
|
||||
"* $y$ -- odgadywana wartość\n",
|
||||
"\n",
|
||||
"Też istnieje ładny wzór na wyliczenie wektora wag!\n",
|
||||
"\n",
|
||||
"$$\\mathbf{w} = (\\mathbf{X}^{\\rm T}\\mathbf{X})^{-1} \\mathbf{X}^{\\rm T}\\mathbf{y}$$\n",
|
||||
"\n",
|
||||
"... niestety odwracanie macierzy nie jest tanie :("
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "confused-increase",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Kilka spostrzeżeń\n",
|
||||
"\n",
|
||||
"Regresja liniowa to najprostszy możliwy model:\n",
|
||||
"\n",
|
||||
"* im czegoś więcej na wejściu, tym proporcjonalnie (troszkę) więcej/mniej na wyjściu\n",
|
||||
"\n",
|
||||
"* nic prostszego nie da się wymyślić (funkcja stała??)\n",
|
||||
"\n",
|
||||
"* niestety model liniowy czasami kompletnie nie ma sensu (np. wzrost człowieka w\n",
|
||||
" stosunku do wieku)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "freelance-controversy",
|
||||
"metadata": {
|
||||
"jp-MarkdownHeadingCollapsed": true,
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"## Uczenie\n",
|
||||
"\n",
|
||||
"A jak nauczyć się wag z przykładów?\n",
|
||||
"\n",
|
||||
"* wzór (z odwracaniem macierzy) — problematyczny\n",
|
||||
"\n",
|
||||
"### Metoda gradientu prostego\n",
|
||||
"\n",
|
||||
"![Morskie oko; Autor:Krzysztof Dudzik; Źródło: [https://pl.wikipedia.org/wiki/Morskie_Oko#/media/Plik:Morskie_Oko_ze_szlaku_przez_%C5%9Awist%C3%B3wk%C4%99.jpg](https://pl.wikipedia.org/wiki/Morskie_Oko#/media/Plik:Morskie_Oko_ze_szlaku_przez_%C5%9Awist%C3%B3wk%C4%99.jpg); Licencja: CC-BY 3.0](08_files/morskieoko.jpg)\n",
|
||||
"\n",
|
||||
"Schodź wzdłuż lokalnego spadku funkcji błędu.\n",
|
||||
"\n",
|
||||
"Tak więc w praktyce zamiast podstawiać do wzoru lepiej się uczyć iteracyjnie —\n",
|
||||
" metodą **gradientu prostego** (ang. _gradient descent_).\n",
|
||||
"\n",
|
||||
"1. Zacznij od byle jakich wag $w_i$ (np. wylosuj)\n",
|
||||
"2. Weź losowy przykład uczący $x_1,\\dots,x_n$, $y$.\n",
|
||||
"3. Oblicz wyjście $\\hat{y}$ na podstawie $x_1,\\dots,x_n$.\n",
|
||||
"4. Oblicz funkcję błędu między $y$ a $\\hat{y}$.\n",
|
||||
"5. Zmodyfikuj lekko wagi $(w_i)$ w kierunku spadku funkcji błędu.\n",
|
||||
"6. Jeśli błąd jest duży, idź do 2.\n",
|
||||
"\n",
|
||||
"Modyfikacja wag:\n",
|
||||
"\n",
|
||||
"$$w_i := w_i - x_i (\\hat{y} - y) \\eta$$\n",
|
||||
"\n",
|
||||
"gdzie $\\eta$ to **współczynnik uczenia** (ang. _learning rate_).\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "divine-medium",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Ewaluacja regresji\n",
|
||||
"\n",
|
||||
"To miary błędu (im mniej, tym lepiej!)\n",
|
||||
"\n",
|
||||
"### Błąd bezwzględny (Mean Absolute Error, MAE)\n",
|
||||
"\n",
|
||||
"$$\\frac{1}{n}\\sum_{i=1}^n |\\hat{y}_i - y_i| $$\n",
|
||||
"\n",
|
||||
"### Mean Squared Error (MSE)\n",
|
||||
"\n",
|
||||
"$$\\frac{1}{n}\\sum_{i=1}^n (\\hat{y}_i - y_i)^2$$\n",
|
||||
"\n",
|
||||
"### Root Mean Squared Error (RMSE)\n",
|
||||
"\n",
|
||||
"$$\\sqrt{\\frac{1}{n}\\sum_{i=1}^n (\\hat{y}_i - y_i)^2}$$\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "supreme-tennessee",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Regresja liniowa dla tekstu\n",
|
||||
"\n",
|
||||
"Czym jest wektor $\\vec{x} = (x_1,\\dots,x_n)$? Wiemy, np. reprezentacja tf-idf (być może z trikiem z haszowaniem, Word2vec etc.).\n",
|
||||
"\n",
|
||||
"![Schemat regresji liniowej tekstu](08_files/regresja-liniowa-tekst.png)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "seasonal-syndication",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Przykład \n",
|
||||
"\n",
|
||||
"Wyzwanie RetroC2 - odgadywanie roku dla krótkiego tekstu (1814-2013), <https://gonito.net/challenge/retroc2>.\n",
|
||||
" \n",
|
||||
"Lista słów (obciętych do 7 znaków) z największymi/najmniejszymi wagami. \n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"wzbudze -0.08071490\n",
|
||||
"paczka -0.08000180\n",
|
||||
"szarpi -0.05906200\n",
|
||||
"spadoch -0.05784140\n",
|
||||
"rzymsko -0.05466660\n",
|
||||
"sosnowy -0.05162170\n",
|
||||
"dębowyc -0.04778910\n",
|
||||
"nawinię -0.04649400\n",
|
||||
"odmówie -0.04522140\n",
|
||||
"zacisko -0.04480620\n",
|
||||
"funkcją -0.04479500\n",
|
||||
"werben -0.04423350\n",
|
||||
"nieumyś -0.04415200\n",
|
||||
"wodomie -0.04351570\n",
|
||||
"szczote -0.04313390\n",
|
||||
"exekucy -0.04297940\n",
|
||||
"listew -0.04214090\n",
|
||||
"daley -0.04145400\n",
|
||||
"metro -0.04080110\n",
|
||||
"wyjąwsz -0.04078060\n",
|
||||
"salda -0.04042050\n",
|
||||
"tkach -0.04020180\n",
|
||||
"cetnar -0.03999050\n",
|
||||
"zgóry -0.03855980\n",
|
||||
"belek -0.03833100\n",
|
||||
"formier -0.03805890\n",
|
||||
"wekslu -0.03796510\n",
|
||||
"odmową -0.03753760\n",
|
||||
"\n",
|
||||
"odwadni 0.04662140\n",
|
||||
"dozując 0.04672770\n",
|
||||
"wyników 0.04744650\n",
|
||||
"sprawst 0.04746330\n",
|
||||
"jakub 0.04750710\n",
|
||||
"ścieran 0.04791070\n",
|
||||
"wrodzon 0.04799800\n",
|
||||
"koryguj 0.04843560\n",
|
||||
"odnotow 0.04854360\n",
|
||||
"tłumiąc 0.04917320\n",
|
||||
"leasing 0.04963200\n",
|
||||
"ecznej 0.04994810\n",
|
||||
"2013r 0.05009500\n",
|
||||
"kompens 0.05049060\n",
|
||||
"comarch 0.05058620\n",
|
||||
"pojazde 0.05078540\n",
|
||||
"badanyc 0.05340480\n",
|
||||
"kontakc 0.05377990\n",
|
||||
"sygnali 0.05601120\n",
|
||||
"piasta 0.05658670\n",
|
||||
"2000r 0.05716820\n",
|
||||
"stropni 0.06123470\n",
|
||||
"oszone 0.06124600\n",
|
||||
"zamonto 0.06424310\n",
|
||||
"……….. 0.06498500\n",
|
||||
"kumulat 0.06596770\n",
|
||||
"faktura 0.07313080\n",
|
||||
"wielost 0.09677770\n",
|
||||
"wielomi 0.12307300\n",
|
||||
"```\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "encouraging-martial",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Filip Graliński",
|
||||
"email": "filipg@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
},
|
||||
"subtitle": "8.Regresja liniowa[wykład]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
BIN
wyk/08_files/linregr1.pdf
Normal file
BIN
wyk/08_files/linregr1.png
Normal file
After Width: | Height: | Size: 101 KiB |
8
wyk/08_files/linregr1.r
Normal file
@ -0,0 +1,8 @@
|
||||
library(ggplot2)
|
||||
|
||||
prices = read.csv("mieszkania.tsv", sep="\t", header=TRUE)
|
||||
prices$area = prices$powierzchnia
|
||||
prices$price = prices$cena
|
||||
pdf("linregr1.pdf")
|
||||
ggplot(prices, aes(x=area, y=price)) + geom_point()
|
||||
dev.off()
|
BIN
wyk/08_files/linregr2.pdf
Normal file
BIN
wyk/08_files/linregr2.png
Normal file
After Width: | Height: | Size: 125 KiB |
8
wyk/08_files/linregr2.r
Normal file
@ -0,0 +1,8 @@
|
||||
library(ggplot2)
|
||||
|
||||
prices = read.csv("mieszkania.tsv", sep="\t", header=TRUE)
|
||||
prices$area = prices$powierzchnia
|
||||
prices$price = prices$cena
|
||||
pdf("linregr2.pdf")
|
||||
ggplot(prices, aes(x=area, y=price)) + geom_point() + stat_smooth(method=lm, se=FALSE)
|
||||
dev.off()
|
121
wyk/08_files/mieszkania.tsv
Normal file
@ -0,0 +1,121 @@
|
||||
powierzchnia cena
|
||||
53 215000
|
||||
60.01 219990
|
||||
54 285000
|
||||
60 330000
|
||||
63 212000
|
||||
39 219000
|
||||
76.11 399000
|
||||
48 119000
|
||||
42.19 260000
|
||||
53.41 323000
|
||||
65.65 555000
|
||||
65 185000
|
||||
55 247000
|
||||
100 280000
|
||||
56 224000
|
||||
39 230000
|
||||
42.3 179000
|
||||
49.65 305000
|
||||
68 345000
|
||||
37 145000
|
||||
103 529000
|
||||
62.3 209000
|
||||
17.65 42000
|
||||
45 500000
|
||||
36.15 140000
|
||||
45 159000
|
||||
50 130000
|
||||
48 84000
|
||||
36 359000
|
||||
39.3 116400
|
||||
49.48 136950
|
||||
26 85000
|
||||
72 469000
|
||||
64 239000
|
||||
55 435000
|
||||
90 175903
|
||||
90 175903
|
||||
90 175903
|
||||
127.88 1710000
|
||||
59 649000
|
||||
48.7 240000
|
||||
73 259000
|
||||
32.9 275000
|
||||
64 170000
|
||||
44.72 174408
|
||||
68 275000
|
||||
38 323000
|
||||
35 110000
|
||||
63 165000
|
||||
25 69000
|
||||
50 290000
|
||||
76.312 572325
|
||||
65 429000
|
||||
52.5 499000
|
||||
58 145000
|
||||
34 95000
|
||||
46 280000
|
||||
38 120000
|
||||
52 269000
|
||||
47 105000
|
||||
63 266000
|
||||
67.79 275000
|
||||
60 550000
|
||||
107 1230000
|
||||
53 228000
|
||||
48.65 148000
|
||||
39 140000
|
||||
23 170000
|
||||
35 195000
|
||||
71.19 245000
|
||||
75 329000
|
||||
53 185000
|
||||
51 135000
|
||||
42 133000
|
||||
38 142000
|
||||
45.6 470000
|
||||
50 194000
|
||||
29 158999
|
||||
28.8 199000
|
||||
36 199000
|
||||
57.43 385621
|
||||
57.71 402305
|
||||
60.12 395000
|
||||
38 210000
|
||||
56.28 419000
|
||||
60 346800
|
||||
41 295000
|
||||
28.7 219000
|
||||
39 275000
|
||||
37 105000
|
||||
47 330000
|
||||
64 435000
|
||||
96 151200
|
||||
35.34 87000
|
||||
101 489000
|
||||
50 129000
|
||||
49.5 315000
|
||||
14 2000
|
||||
31 110000
|
||||
50.9 265000
|
||||
117 129000
|
||||
52.2 250000
|
||||
28 140000
|
||||
15 5000
|
||||
41.7 249000
|
||||
56.4 490000
|
||||
30.9 161000
|
||||
42.3 229000
|
||||
53 270000
|
||||
72.4 409000
|
||||
52.9 370000
|
||||
37.77 135000
|
||||
82 260000
|
||||
32 195000
|
||||
59 590000
|
||||
62.01 205000
|
||||
52.5 543000
|
||||
56 170000
|
||||
67.61 285000
|
||||
51 494000
|
|
BIN
wyk/08_files/morskieoko.jpg
Normal file
After Width: | Height: | Size: 291 KiB |
BIN
wyk/08_files/regresja-liniowa-tekst.png
Normal file
After Width: | Height: | Size: 61 KiB |
2048
wyk/09_neurozoo.ipynb
Normal file
1057
wyk/09_neurozoo.org
Normal file
543
wyk/11_rnn.ipynb
Normal file
@ -0,0 +1,543 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 11. <i>Sieci rekurencyjne</i> [wykład]</h2> \n",
|
||||
"<h3> Filip Graliński (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Rekurencyjne sieci neuronowe\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Inne spojrzenie na sieci przedstawione do tej pory\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Regresja liniowa/logistyczna lub klasyfikacja wieloklasowa na całym tekście\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"W regresji liniowej czy logistycznej bądź w klasyfikacji wieloklasowej\n",
|
||||
"(z funkcją Softmax) stosowaliśmy następujący schemat:\n",
|
||||
"\n",
|
||||
"Do tej pory patrzyliśmy na to tak, że po prostu cały tekst jest od\n",
|
||||
"razu przetwarzany przez (prostą) sieć neuronową, popatrzmy na ten\n",
|
||||
"przypadek, jak na sytuację przetwarzania sekwencyjnego. Będzie to\n",
|
||||
"trochę sztuczne, ale uogólnimy to potem w sensowny sposób.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Wektoryzacja\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Po pierwsze, zauważmy, że w wielu schematach wektoryzacji (np. tf), wektor\n",
|
||||
"dokumentów jest po prostu sumą wektorów poszczególnych składowych:\n",
|
||||
"\n",
|
||||
"$$\\vec{v}(d) = \\vec{v}(t^1,\\ldots,t^K) = \\vec{v}(t^1) + \\ldots + \\vec{v}(t^K) = \\sum_{k=1}^K \\vec{v}(t^k),$$\n",
|
||||
"\n",
|
||||
"gdzie w schemacie tf $\\vec{v}(t^k)$ to po prostu wektor *one-hot* dla słowa.\n",
|
||||
"\n",
|
||||
"**Pytanie** Jaką postać przyjmie $\\vec{v}(t^k)$ dla wektoryzacji tf-idf?\n",
|
||||
"\n",
|
||||
"Wektory $\\vec{v}(t^k)$ mogą być również gęstymi wektorami\n",
|
||||
"($\\vec{v}(t^k) \\in \\mathcal{R}^n$, gdzie $n$ jest rzędu 10-1000), np.\n",
|
||||
"w modelu Word2vec albo mogą to być **wyuczalne** wektory (zanurzenia\n",
|
||||
"słów, *embeddings*), tzn. wektory, które są parametrami uczonej sieci!\n",
|
||||
"\n",
|
||||
"**Pytanie** Ile wag (parametrów) wnoszą wyuczalne wektory do sieci?\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Prosta wektoryzacja wyrażona w modelu sekwencyjnym\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Jak zapisać równoważnie powyższą wektoryzację w modelu **sekwencyjnym**, tj. przy założeniu, że\n",
|
||||
"przetwarzamy wejście token po tokenie (a nie „naraz”)? Ogólnie wprowadzimy bardzo\n",
|
||||
"ogólny model sieci **rekurencyjnej**.\n",
|
||||
"\n",
|
||||
"Po pierwsze zakładamy, że sieć ma pewien stan $\\vec{s^k} \\in\n",
|
||||
"\\mathcal{R}^m$ (stan jest wektorem o długości $m$), który może\n",
|
||||
"zmieniać się z każdym krokiem (przetwarzanym tokenem). Zmiana stanu\n",
|
||||
"jest określona przez pewną funkcję $R : \\mathcal{R}^m \\times\n",
|
||||
"\\mathcal{R}^n \\rightarrow \\mathcal{R}^m$ ($n$ to rozmiar wektorów\n",
|
||||
"$\\vec{v}(t^k)$):\n",
|
||||
"\n",
|
||||
"$$\\vec{s^k} = R(\\vec{s^{k-1}}, \\vec{v}(t^k)).$$\n",
|
||||
"\n",
|
||||
"W przypadku wektoryzacji tf-idf mamy do czynienia z prostym\n",
|
||||
"sumowaniem, więc $R$ przyjmuje bardzo prostą postać:\n",
|
||||
"\n",
|
||||
"$$\\vec{s^0} = [0,\\dots,0],$$\n",
|
||||
"\n",
|
||||
"$$R(\\vec{s}, \\vec{x}) = \\vec{s} + \\vec{x}.$$\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"##### Wyjście z modelu\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Dla regresji liniowej/logistycznej, oprócz funkcji $R$, która określa\n",
|
||||
"zmianę stanu, potrzebujemy funkcji $O$, która określa wyjście systemu w każdym kroku.\n",
|
||||
"\n",
|
||||
"$$y^k = O(\\vec{s^k})$$\n",
|
||||
"\n",
|
||||
"W zadaniach klasyfikacji czy regresji, kiedy patrzymy na cały tekst w\n",
|
||||
"zasadzie wystarczy wziąć *ostatnią* wartość (tj. $y^K$). Można sobie\n",
|
||||
"wyobrazić sytuację, kiedy wartości $y^k$ dla $k < K$ również mogą być jakoś przydatne\n",
|
||||
"(np. klasyfikujemy na bieżąco tekst wpisywany przez użytkownika).\n",
|
||||
"\n",
|
||||
"W każdym razie dla regresji liniowej funkcja $O$ przyjmie postać:\n",
|
||||
"\n",
|
||||
"$$O(\\vec{s}) = \\vec{w}\\vec{s}$$,\n",
|
||||
"\n",
|
||||
"gdzie $\\vec{w}$ jest wektorem wyuczalnych wag, dla regresji zaś logistycznej:\n",
|
||||
"\n",
|
||||
"$$O(\\vec{s}) = \\sigma(\\vec{w}\\vec{s})$$\n",
|
||||
"\n",
|
||||
"**Pytanie**: jaką postać przyjmie $O$ dla klasyfikacji wieloklasowej?\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Prosta sieć rekurencyjna\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"W najprostszej sieci rekurencyjnej (*Vanilla RNN*, sieć Elmana,\n",
|
||||
"czasami po prostu RNN) w każdym kroku oprócz właściwego wejścia\n",
|
||||
"($\\vec{v}(t^k)$) będziemy również podawać na wejściu poprzedni stan\n",
|
||||
"sieci ($\\vec{s^{k-1}}$).\n",
|
||||
"\n",
|
||||
"Innymi słowy, funkcja $R$ przyjmie następującą postać:\n",
|
||||
"\n",
|
||||
"$$s^k = \\sigma(W\\langle\\vec{v}(t^k), \\vec{s^{k-1}}\\rangle + \\vec{b}),$$\n",
|
||||
"\n",
|
||||
"gdzie:\n",
|
||||
"\n",
|
||||
"- $\\langle\\vec{x},\\vec{y}\\rangle$ to konkatenacja dwóch wektorów,\n",
|
||||
"- $W \\in \\mathcal{R}^{m \\times (n+m)}$ — macierz wag,\n",
|
||||
"- $b \\in \\mathcal{R}^m$ — wektor obciążeń (*biases*).\n",
|
||||
"\n",
|
||||
"Taką sieć RNN można przedstawić schematycznie w następujący sposób:\n",
|
||||
"\n",
|
||||
"![Pojedynczy krok sieci rekurencyjnej](./img-rnn.png)\n",
|
||||
"\n",
|
||||
"Zauważmy, że zamiast macierzy $W$ działającej na konkatenacji wektorów można wprowadzić dwie\n",
|
||||
"macierze $U$ i $V$ i tak zapisać wzór:\n",
|
||||
"\n",
|
||||
"$$s^k = \\sigma(U\\vec{v}(t^k) + V\\vec{s^{k-1}} + \\vec{b}).$$\n",
|
||||
"\n",
|
||||
"Jeszcze inne spojrzenie na sieć RNN:\n",
|
||||
"\n",
|
||||
"![Pojedynczy krok sieci rekurencyjnej II](./rnn.png)\n",
|
||||
"\n",
|
||||
"Powyższy rysunek przedstawia pojedynczy krok sieci RNN. Dla całego\n",
|
||||
"wejścia (powiedzmy, 3-wyrazowego) możemy sieć rozwinąć (*unroll*):\n",
|
||||
"\n",
|
||||
"![Rozwinięta sieć rekurencyjna](./rnn-seq.png)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Zastosowanie sieci RNN do etykietowania sekwencji\n",
|
||||
"\n",
|
||||
"Sieć RNN może w prosty sposób być użyta do etykietowania sekwencji (w każdym kroku zwracamy etykietę)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Problemy z prostymi sieciami RNN\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"W praktyce proste sieci RNN są bardzo trudne w uczeniu, zazwyczaj\n",
|
||||
"pojawia się problem **zanikających** (rzadziej: **eksplodujących**)\n",
|
||||
"gradientów: w propagacji wstecznej błąd szybko zanika i nie jest w\n",
|
||||
"stanie dotrzeć do początkowych wejść.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Sieci RNN z bramkami\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"W prostych sieciach RNN podstawowa trudność polega na tym, że mamy\n",
|
||||
"niewielką kontrolę nad tym jak pamięć (stan) jest aktualizowana. Aby\n",
|
||||
"zwiększyć tę kontrolę, potrzebujemy **bramek**.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Bramki\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Zazwyczaj do tej pory rozpatrywaliśmy iloczyn skalarny wektorów, w\n",
|
||||
"wyniku którego otrzymujemy liczbę (w PyTorchu wyrażany za pomocą operatora `@`), np.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# Out[2]:\n",
|
||||
"tensor(-5)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"a = torch.tensor([-1, 0, 3])\n",
|
||||
"b = torch.tensor([2, 5, -1])\n",
|
||||
"a @ b"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Czasami przydatny jest **iloczyn Hadamarda**, czyli przemnożenie\n",
|
||||
"wektorów (albo macierzy) po współrzędnych. W PyTorchu taki iloczyn\n",
|
||||
"wyrażany jest za pomocą operatora `*`, w notacji matematycznej będziemy używali\n",
|
||||
"znaku $\\odot$.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# Out[3]:\n",
|
||||
"tensor([-2, 0, -3])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"a = torch.tensor([-1, 0, 3])\n",
|
||||
"b = torch.tensor([2, 5, -1])\n",
|
||||
"a * b"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Zauważmy, że iloczyn Hadamarda przez wektor złożony z zer i jedynek daje nam *filtr*, możemy\n",
|
||||
"selektywnie wygaszać pozycje wektora, np. tutaj wyzerowaliśmy 2. i 5. pozycję wektora:\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# Out[4]:\n",
|
||||
"tensor([1., 0., 3., 4., 0.])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"a = torch.tensor([1., 2., 3., 4., 5.])\n",
|
||||
"b = torch.tensor([1., 0., 1., 1., 0.])\n",
|
||||
"a * b"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Co więcej, za pomocą bramki możemy selektywnie kontrolować, co\n",
|
||||
"zapamiętujemy, a co zapominamy. Rozpatrzmy mianowicie wektor zer i\n",
|
||||
"jedynek $\\vec{g} \\in \\{0,1\\}^m$, dla stanu (pamięci) $\\vec{s}$ i nowej informacji\n",
|
||||
"$\\vec{x}$ możemy dokonywać aktualizacji w następujący sposób:\n",
|
||||
"\n",
|
||||
"$$\\vec{s} \\leftarrow \\vec{g} \\odot \\vec{x} + (1 - \\vec{g}) \\odot \\vec{s}$$\n",
|
||||
"\n",
|
||||
"Na przykład, za pomocą bramki można wpisać nową wartość na 2. i 5. pozycję wektora.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# Out[8]:\n",
|
||||
"tensor([ 1., 7., 3., 4., -8.])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"s = torch.tensor([1., 2., 3., 4., 5.])\n",
|
||||
"x = torch.tensor([8., 7., 15., -3., -8.])\n",
|
||||
"\n",
|
||||
"g = torch.tensor([0., 1., 0., 0., 1.])\n",
|
||||
"\n",
|
||||
"s = g * x + (1 - g) * s\n",
|
||||
"s"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wektor bramki nie musi być z góry określony, może być wyuczalny. Wtedy\n",
|
||||
"jednak lepiej założyć, że bramka jest „miękka”, np. jej wartości\n",
|
||||
"pochodzą z sigmoidy zastosowanej do jakiejś wcześniejszej warstwy.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"# Out[14]:\n",
|
||||
"tensor([ 1.5310, 6.9998, 5.7777, 4.0000, -5.2159])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"s = torch.tensor([1., 2., 3., 4., 5.])\n",
|
||||
"x = torch.tensor([8., 7., 15., -3., -8.])\n",
|
||||
"\n",
|
||||
"pre_g = torch.tensor([-2.5, 10.0, -1.2, -101., 1.3])\n",
|
||||
"g = torch.sigmoid(pre_g)\n",
|
||||
"\n",
|
||||
"s = g * x + (1 - g) * s\n",
|
||||
"s"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Pytanie:** dlaczego sigmoida zamiast tanh?\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Sieć LSTM\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Architektura LSTM (*Long Short-Term Memory*) pozwala rozwiązać problem\n",
|
||||
"znikających gradientów — za cenę komplikacji obliczeń.\n",
|
||||
"\n",
|
||||
"W sieci LSTM stan $\\vec{s^k}$ ma dwie połówki, tj. $\\vec{s^k} =\n",
|
||||
"\\langle\\vec{c^k},\\vec{h^k}\\rangle$, gdzie\n",
|
||||
"\n",
|
||||
"- $\\vec{c^k}$ to **komórka pamięci**, która nie zmienia swojej wartości, chyba że celowo ją zmodyfikujemy\n",
|
||||
" za pomocą bramek,\n",
|
||||
"- $\\vec{h^k}$ to ukryty stan (przypominający $\\vec{s^k}$ ze zwykłej sieci RNN).\n",
|
||||
"\n",
|
||||
"Sieć LSTM zawiera 3 bramki:\n",
|
||||
"\n",
|
||||
"- bramkę zapominania (*forget gate*), która steruje wymazywaniem informacji z komórki\n",
|
||||
" pamięci $\\vec{c^k}$,\n",
|
||||
"- bramkę wejścia (*input gate*), która steruje tym, na ile nowe informacje aktualizują\n",
|
||||
" komórkę pamięci $\\vec{c^k}$,\n",
|
||||
"- bramkę wyjścia (*output gate*), która steruje tym, co z komórki\n",
|
||||
" pamięci przekazywane jest na wyjście.\n",
|
||||
"\n",
|
||||
"Wszystkie trzy bramki definiowane są za pomocą bardzo podobnego wzoru — warstwy liniowej na\n",
|
||||
"poprzedniej wartości warstwy ukrytej i bieżącego wejścia.\n",
|
||||
"\n",
|
||||
"$$\\vec{i} = \\sigma(W_i\\langle\\vec{v}(t^k),\\vec{h^{k-1}}\\rangle)$$\n",
|
||||
"\n",
|
||||
"$$\\vec{f} = \\sigma(W_f\\langle\\vec{v}(t^k),\\vec{h^{k-1}}\\rangle)$$\n",
|
||||
"\n",
|
||||
"$$\\vec{o} = \\sigma(W_o\\langle\\vec{v}(t^k),\\vec{h^{k-1}}\\rangle)$$\n",
|
||||
"\n",
|
||||
"Jak widać, wzory różnią się tylko macierzami wag $W_*$.\n",
|
||||
"\n",
|
||||
"Zmiana komórki pamięci jest zdefiniowana jak następuje:\n",
|
||||
"\n",
|
||||
"$$\\vec{c^k} = \\vec{f} \\odot \\vec{c^{k-1}} + \\vec{i} \\odot \\vec{z^k},$$\n",
|
||||
"\n",
|
||||
"gdzie\n",
|
||||
"\n",
|
||||
"$$\\vec{z^k} = \\operatorname{tanh}(W_z\\langle\\vec{v}(t^k),\\vec{h^{k-1}}\\rangle)$$\n",
|
||||
"\n",
|
||||
"Stan ukryty zmienia się w następujący sposób:\n",
|
||||
"\n",
|
||||
"$$\\vec{h^k} = \\vec{o} \\odot \\operatorname{tanh}(\\vec{c^k}).$$\n",
|
||||
"\n",
|
||||
"Ostateczne wyjście może być wyliczane na podstawie wektora $\\vec{h^k}$:\n",
|
||||
"\n",
|
||||
"$$O(\\vec{s}) = O(\\langle\\vec{c},\\vec{h}\\rangle) = \\vec{h}$$\n",
|
||||
"\n",
|
||||
"**Pytanie**: Ile wag/parametrów ma sieć RNN o rozmiarze wejścia $n$ i rozmiarze warstwy ukrytej $m$?\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Literatura\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Yoav Goldberg, *Neural Network Methods for Natural Language Processing*,\n",
|
||||
"Morgan & Claypool Publishers, 2017\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Filip Graliński",
|
||||
"email": "filipg@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
},
|
||||
"org": null,
|
||||
"subtitle": "11.Sieci rekurencyjne[wykład]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
288
wyk/11_rnn.org
Normal file
@ -0,0 +1,288 @@
|
||||
|
||||
* Rekurencyjne sieci neuronowe
|
||||
** Inne spojrzenie na sieci przedstawione do tej pory
|
||||
*** Regresja liniowa/logistyczna lub klasyfikacja wieloklasowa na całym tekście
|
||||
|
||||
W regresji liniowej czy logistycznej bądź w klasyfikacji wieloklasowej
|
||||
(z funkcją Softmax) stosowaliśmy następujący schemat:
|
||||
|
||||
Do tej pory patrzyliśmy na to tak, że po prostu cały tekst jest od
|
||||
razu przetwarzany przez (prostą) sieć neuronową, popatrzmy na ten
|
||||
przypadek, jak na sytuację przetwarzania sekwencyjnego. Będzie to
|
||||
trochę sztuczne, ale uogólnimy to potem w sensowny sposób.
|
||||
|
||||
**** Wektoryzacja
|
||||
|
||||
Po pierwsze, zauważmy, że w wielu schematach wektoryzacji (np. tf), wektor
|
||||
dokumentów jest po prostu sumą wektorów poszczególnych składowych:
|
||||
|
||||
$$\vec{v}(d) = \vec{v}(t^1,\ldots,t^K) = \vec{v}(t^1) + \ldots + \vec{v}(t^K) = \sum_{k=1}^K \vec{v}(t^k),$$
|
||||
|
||||
gdzie w schemacie tf $\vec{v}(t^k)$ to po prostu wektor /one-hot/ dla słowa.
|
||||
|
||||
*Pytanie* Jaką postać przyjmie $\vec{v}(t^k)$ dla wektoryzacji tf-idf?
|
||||
|
||||
Wektory $\vec{v}(t^k)$ mogą być również gęstymi wektorami
|
||||
($\vec{v}(t^k) \in \mathcal{R}^n$, gdzie $n$ jest rzędu 10-1000), np.
|
||||
w modelu Word2vec albo mogą to być *wyuczalne* wektory (zanurzenia
|
||||
słów, /embeddings/), tzn. wektory, które są parametrami uczonej sieci!
|
||||
|
||||
*Pytanie* Ile wag (parametrów) wnoszą wyuczalne wektory do sieci?
|
||||
|
||||
**** Prosta wektoryzacja wyrażona w modelu sekwencyjnym
|
||||
|
||||
Jak zapisać równoważnie powyższą wektoryzację w modelu *sekwencyjnym*, tj. przy założeniu, że
|
||||
przetwarzamy wejście token po tokenie (a nie „naraz”)? Ogólnie wprowadzimy bardzo
|
||||
ogólny model sieci *rekurencyjnej*.
|
||||
|
||||
Po pierwsze zakładamy, że sieć ma pewien stan $\vec{s^k} \in
|
||||
\mathcal{R}^m$ (stan jest wektorem o długości $m$), który może
|
||||
zmieniać się z każdym krokiem (przetwarzanym tokenem). Zmiana stanu
|
||||
jest określona przez pewną funkcję $R : \mathcal{R}^m \times
|
||||
\mathcal{R}^n \rightarrow \mathcal{R}^m$ ($n$ to rozmiar wektorów
|
||||
$\vec{v}(t^k)$):
|
||||
|
||||
$$\vec{s^k} = R(\vec{s^{k-1}}, \vec{v}(t^k)).$$
|
||||
|
||||
W przypadku wektoryzacji tf-idf mamy do czynienia z prostym
|
||||
sumowaniem, więc $R$ przyjmuje bardzo prostą postać:
|
||||
|
||||
$$\vec{s^0} = [0,\dots,0],$$
|
||||
|
||||
$$R(\vec{s}, \vec{x}) = \vec{s} + \vec{x}.$$
|
||||
|
||||
**** Wyjście z modelu
|
||||
|
||||
Dla regresji liniowej/logistycznej, oprócz funkcji $R$, która określa
|
||||
zmianę stanu, potrzebujemy funkcji $O$, która określa wyjście systemu w każdym kroku.
|
||||
|
||||
$$y^k = O(\vec{s^k})$$
|
||||
|
||||
W zadaniach klasyfikacji czy regresji, kiedy patrzymy na cały tekst w
|
||||
zasadzie wystarczy wziąć /ostatnią/ wartość (tj. $y^K$). Można sobie
|
||||
wyobrazić sytuację, kiedy wartości $y^k$ dla $k < K$ również mogą być jakoś przydatne
|
||||
(np. klasyfikujemy na bieżąco tekst wpisywany przez użytkownika).
|
||||
|
||||
W każdym razie dla regresji liniowej funkcja $O$ przyjmie postać:
|
||||
|
||||
$$O(\vec{s}) = \vec{w}\vec{s}$$,
|
||||
|
||||
gdzie $\vec{w}$ jest wektorem wyuczalnych wag, dla regresji zaś logistycznej:
|
||||
|
||||
$$O(\vec{s}) = \sigma(\vec{w}\vec{s})$$
|
||||
|
||||
*Pytanie*: jaką postać przyjmie $O$ dla klasyfikacji wieloklasowej?
|
||||
|
||||
** Prosta sieć rekurencyjna
|
||||
|
||||
W najprostszej sieci rekurencyjnej (/Vanilla RNN/, sieć Elmana,
|
||||
czasami po prostu RNN) w każdym kroku oprócz właściwego wejścia
|
||||
($\vec{v}(t^k)$) będziemy również podawać na wejściu poprzedni stan
|
||||
sieci ($\vec{s^{k-1}}$).
|
||||
|
||||
Innymi słowy, funkcja $R$ przyjmie następującą postać:
|
||||
|
||||
$$s^k = \sigma(W\langle\vec{v}(t^k), \vec{s^{k-1}}\rangle + \vec{b}),$$
|
||||
|
||||
gdzie:
|
||||
|
||||
- $\langle\vec{x},\vec{y}\rangle$ to konkatenacja dwóch wektorów,
|
||||
- $W \in \mathcal{R}^{m \times (n+m)}$ — macierz wag,
|
||||
- $b \in \mathcal{R}^m$ — wektor obciążeń (/biases/).
|
||||
|
||||
Taką sieć RNN można przedstawić schematycznie w następujący sposób:
|
||||
|
||||
[[./img-rnn.png]]
|
||||
|
||||
Zauważmy, że zamiast macierzy $W$ działającej na konkatenacji wektorów można wprowadzić dwie
|
||||
macierze $U$ i $V$ i tak zapisać wzór:
|
||||
|
||||
$$s^k = \sigma(U\vec{v}(t^k) + V\vec{s^{k-1}} + \vec{b}).$$
|
||||
|
||||
Jeszcze inne spojrzenie na sieć RNN:
|
||||
|
||||
[[./rnn.png]]
|
||||
|
||||
Powyższy rysunek przedstawia pojedynczy krok sieci RNN. Dla całego
|
||||
wejścia (powiedzmy, 3-wyrazowego) możemy sieć rozwinąć (/unroll/):
|
||||
|
||||
[[./rnn-seq.png]]
|
||||
|
||||
*** Zastosowanie sieci RNN do etykietowania sekwencji
|
||||
|
||||
*** Problemy z prostymi sieciami RNN
|
||||
|
||||
W praktyce proste sieci RNN są bardzo trudne w uczeniu, zazwyczaj
|
||||
pojawia się problem *zanikających* (rzadziej: *eksplodujących*)
|
||||
gradientów: w propagacji wstecznej błąd szybko zanika i nie jest w
|
||||
stanie dotrzeć do początkowych wejść.
|
||||
|
||||
** Sieci RNN z bramkami
|
||||
|
||||
W prostych sieciach RNN podstawowa trudność polega na tym, że mamy
|
||||
niewielką kontrolę nad tym jak pamięć (stan) jest aktualizowana. Aby
|
||||
zwiększyć tę kontrolę, potrzebujemy *bramek*.
|
||||
|
||||
*** Bramki
|
||||
|
||||
Zazwyczaj do tej pory rozpatrywaliśmy iloczyn skalarny wektorów, w
|
||||
wyniku którego otrzymujemy liczbę (w PyTorchu wyrażany za pomocą operatora ~@~), np.
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
import torch
|
||||
|
||||
a = torch.tensor([-1, 0, 3])
|
||||
b = torch.tensor([2, 5, -1])
|
||||
a @ b
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[2]:
|
||||
: tensor(-5)
|
||||
:end:
|
||||
|
||||
Czasami przydatny jest *iloczyn Hadamarda*, czyli przemnożenie
|
||||
wektorów (albo macierzy) po współrzędnych. W PyTorchu taki iloczyn
|
||||
wyrażany jest za pomocą operatora ~*~, w notacji matematycznej będziemy używali
|
||||
znaku $\odot$.
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
import torch
|
||||
|
||||
a = torch.tensor([-1, 0, 3])
|
||||
b = torch.tensor([2, 5, -1])
|
||||
a * b
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[3]:
|
||||
: tensor([-2, 0, -3])
|
||||
:end:
|
||||
|
||||
Zauważmy, że iloczyn Hadamarda przez wektor złożony z zer i jedynek daje nam /filtr/, możemy
|
||||
selektywnie wygaszać pozycje wektora, np. tutaj wyzerowaliśmy 2. i 5. pozycję wektora:
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
import torch
|
||||
|
||||
a = torch.tensor([1., 2., 3., 4., 5.])
|
||||
b = torch.tensor([1., 0., 1., 1., 0.])
|
||||
a * b
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[4]:
|
||||
: tensor([1., 0., 3., 4., 0.])
|
||||
:end:
|
||||
|
||||
|
||||
Co więcej, za pomocą bramki możemy selektywnie kontrolować, co
|
||||
zapamiętujemy, a co zapominamy. Rozpatrzmy mianowicie wektor zer i
|
||||
jedynek $\vec{g} \in \{0,1\}^m$, dla stanu (pamięci) $\vec{s}$ i nowej informacji
|
||||
$\vec{x}$ możemy dokonywać aktualizacji w następujący sposób:
|
||||
|
||||
$$\vec{s} \leftarrow \vec{g} \odot \vec{x} + (1 - \vec{g}) \odot \vec{s}$$
|
||||
|
||||
Na przykład, za pomocą bramki można wpisać nową wartość na 2. i 5. pozycję wektora.
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
import torch
|
||||
|
||||
s = torch.tensor([1., 2., 3., 4., 5.])
|
||||
x = torch.tensor([8., 7., 15., -3., -8.])
|
||||
|
||||
g = torch.tensor([0., 1., 0., 0., 1.])
|
||||
|
||||
s = g * x + (1 - g) * s
|
||||
s
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[8]:
|
||||
: tensor([ 1., 7., 3., 4., -8.])
|
||||
:end:
|
||||
|
||||
Wektor bramki nie musi być z góry określony, może być wyuczalny. Wtedy
|
||||
jednak lepiej założyć, że bramka jest „miękka”, np. jej wartości
|
||||
pochodzą z sigmoidy zastosowanej do jakiejś wcześniejszej warstwy.
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
import torch
|
||||
|
||||
s = torch.tensor([1., 2., 3., 4., 5.])
|
||||
x = torch.tensor([8., 7., 15., -3., -8.])
|
||||
|
||||
pre_g = torch.tensor([-2.5, 10.0, -1.2, -101., 1.3])
|
||||
g = torch.sigmoid(pre_g)
|
||||
|
||||
s = g * x + (1 - g) * s
|
||||
s
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[14]:
|
||||
: tensor([ 1.5310, 6.9998, 5.7777, 4.0000, -5.2159])
|
||||
:end:
|
||||
|
||||
*Pytanie:* dlaczego sigmoida zamiast tanh?
|
||||
|
||||
*** Sieć LSTM
|
||||
|
||||
Architektura LSTM (/Long Short-Term Memory/) pozwala rozwiązać problem
|
||||
znikających gradientów — za cenę komplikacji obliczeń.
|
||||
|
||||
W sieci LSTM stan $\vec{s^k}$ ma dwie połówki, tj. $\vec{s^k} =
|
||||
\langle\vec{c^k},\vec{h^k}\rangle$, gdzie
|
||||
|
||||
- $\vec{c^k}$ to *komórka pamięci*, która nie zmienia swojej wartości, chyba że celowo ją zmodyfikujemy
|
||||
za pomocą bramek,
|
||||
- $\vec{h^k}$ to ukryty stan (przypominający $\vec{s^k}$ ze zwykłej sieci RNN).
|
||||
|
||||
Sieć LSTM zawiera 3 bramki:
|
||||
|
||||
- bramkę zapominania (/forget gate/), która steruje wymazywaniem informacji z komórki
|
||||
pamięci $\vec{c^k}$,
|
||||
- bramkę wejścia (/input gate/), która steruje tym, na ile nowe informacje aktualizują
|
||||
komórkę pamięci $\vec{c^k}$,
|
||||
- bramkę wyjścia (/output gate/), która steruje tym, co z komórki
|
||||
pamięci przekazywane jest na wyjście.
|
||||
|
||||
Wszystkie trzy bramki definiowane są za pomocą bardzo podobnego wzoru — warstwy liniowej na
|
||||
poprzedniej wartości warstwy ukrytej i bieżącego wejścia.
|
||||
|
||||
$$\vec{i} = \sigma(W_i\langle\vec{v}(t^k),\vec{h^{k-1}}\rangle)$$
|
||||
|
||||
$$\vec{f} = \sigma(W_f\langle\vec{v}(t^k),\vec{h^{k-1}}\rangle)$$
|
||||
|
||||
$$\vec{o} = \sigma(W_o\langle\vec{v}(t^k),\vec{h^{k-1}}\rangle)$$
|
||||
|
||||
Jak widać, wzory różnią się tylko macierzami wag $W_*$.
|
||||
|
||||
Zmiana komórki pamięci jest zdefiniowana jak następuje:
|
||||
|
||||
$$\vec{c^k} = \vec{f} \odot \vec{c^{k-1}} + \vec{i} \odot \vec{z^k}$$,
|
||||
|
||||
gdzie
|
||||
|
||||
$$\vec{z^k} = \operatorname{tanh}(W_z\langle\vec{v}(t^k),\vec{h^{k-1}}\rangle)$$
|
||||
|
||||
Stan ukryty zmienia się w następujący sposób:
|
||||
|
||||
$$\vec{h^k} = \vec{o} \odot \operatorname{tanh}(\vec{c^k})$$.
|
||||
|
||||
Ostateczne wyjście może być wyliczane na podstawie wektora $\vec{h^k}$:
|
||||
|
||||
$$O(\vec{s}) = O(\langle\vec{c},\vec{h}\rangle) = \vec{h}$$
|
||||
|
||||
*Pytanie*: Ile wag/parametrów ma sieć RNN o rozmiarze wejścia $n$ i rozmiarze warstwy ukrytej $m$?
|
||||
|
||||
|
||||
** Literatura
|
||||
|
||||
Yoav Goldberg, /Neural Network Methods for Natural Language Processing/,
|
||||
Morgan & Claypool Publishers, 2017
|
859
wyk/12_bpe.ipynb
Normal file
396
wyk/12_bpe.org
Normal file
@ -0,0 +1,396 @@
|
||||
* Podział na jednostki podwyrazowe
|
||||
** Słownik nie może być za duży…
|
||||
|
||||
Jeśli używamy wyuczalnych zanurzeń słów (embeddingów), wówczas musimy
|
||||
je dopisać do listy parametrów całego modelu — jest to $|V|n$ wag,
|
||||
gdzie $n$ to rozmiar embeddingów; w wypadku uczenia dodatkowo musimy
|
||||
jeszcze pamiętać związane z embeddingami gradienty. Pamięć RAM karty
|
||||
graficznej jest rzecz jasna ograniczona, słownik więc nie może być
|
||||
dowolnie duży. Dla danego modelu karty graficznej dość łatwo ustalić
|
||||
maksymalny rozmiar słownika — jest „twarde” ograniczenie, które musimy
|
||||
spełnić.
|
||||
|
||||
*** Czy rzeczywiście słownik może być taki duży?
|
||||
|
||||
Ile jest różnych form fleksyjnych w języku polskim? Zobaczmy w słowniku PoliMorf…
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! wget -q 'http://zil.ipipan.waw.pl/PoliMorf?action=AttachFile&do=get&target=PoliMorf-0.6.7.tab.gz' -O - | zcat | cut -f 1 | uniq | head -n 20
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[2]:
|
||||
:end:
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! wget -q 'http://zil.ipipan.waw.pl/PoliMorf?action=AttachFile&do=get&target=PoliMorf-0.6.7.tab.gz' -O - | zcat | cut -f 1 | sort -u | wc -l
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[3]:
|
||||
:end:
|
||||
|
||||
*Pytanie* W którym języku europejskim wyrazów będzie jeszcze więcej niż w języku polskim?
|
||||
|
||||
Tak naprawdę form jest jeszcze więcej, oczywiście PoliMorf nie wyczerpuje zbioru…
|
||||
|
||||
*Pytanie* Podaj przykłady „oczywistych” wyrazów, których nie ma w PoliMorfie. Jak w sposób systematyczny szukać takich wyrazów?
|
||||
|
||||
Z drugiej strony, w PoliMorfie jest dużo dziwnych, „sztucznych” wyrazów.
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! wget -q 'http://zil.ipipan.waw.pl/PoliMorf?action=AttachFile&do=get&target=PoliMorf-0.6.7.tab.gz' -O - | zcat | cut -f 1 | shuf -n 20
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[4]:
|
||||
:end:
|
||||
|
||||
Inaczej, zobaczmy, ile różnych wyrazów jest w jakimś rzeczywistym zbiorze tekstów, rozpatrzmy
|
||||
teksty zebrane na potrzeby identyfikacji płci autora tekstu:
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! git clone --single-branch --depth 1 git://gonito.net/petite-difference-challenge2
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[7]:
|
||||
:end:
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! xzcat petite-difference-challenge2/train/in.tsv.xz | perl -C -ne 'print "$&\n" while/\p{L}+/g;' | sort -u > vocab.txt
|
||||
#+END_SRC
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! head -n 50 vocab.txt
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[11]:
|
||||
:end:
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! wc -l vocab.txt
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[9]:
|
||||
:end:
|
||||
|
||||
Co gorsza, nawet jak weźmiemy cały taki słownik bez ograniczeń i tak
|
||||
nie pokryje on sporej części tekstów przetwarzanych w czasie inferencji.
|
||||
Zobaczmy, ilu wyrazów ze zbioru deweloperskiego nie będzie w słowniku.
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! cat petite-difference-challenge2/dev-0/in.tsv | perl -C -ne 'print "$&\n" while/\p{L}+/g;' | sort -u | comm vocab.txt - -13 | wc -l
|
||||
#+END_SRC
|
||||
|
||||
Takie wyrazy nazywamy wyrazami *OOV* (/out-of-vocabulary/).
|
||||
|
||||
** Obcięcie słownika
|
||||
|
||||
Najprostszy sposób ograniczenia słownika to po prostu obcięcie do $N$ najczęstszych słów.
|
||||
|
||||
Spróbujmy zastosować do korpusu „płci”:
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! xzcat petite-difference-challenge2/train/in.tsv.xz | perl -C -ne 'print "$&\n" while/\p{L}+/g;' | sort | uniq -c | sort -k 1rn | head -n 50000 | sort -k 2 > vocab50000.txt
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[8]:
|
||||
:end:
|
||||
|
||||
Daje to lepszy efekt niż można się spodziewać. Odrzucamy w ten sposób
|
||||
tylko bardzo rzadkie słowa (albo takie, które wystąpiły tylko raz w
|
||||
korpusie — tzw. /hapax legomena/), choć tych słów jest bardzo dużo.
|
||||
|
||||
*Zagadka*: 50000 najczęstszych słów (1,9% *typów*) pokrywa jaki odsetek *wystąpień*?
|
||||
|
||||
Rozkład normalny w języku nie jest… normalny — nie spotkamy się z nim
|
||||
badając języki. W tekstach dominują „skrzywione” rozkłady z długimi,
|
||||
„chudymi” ogonami.
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! xzcat petite-difference-challenge2/train/in.tsv.xz | perl -C -ne 'print "$&\n" while/\p{L}+/g;' | sort | uniq -c | sort -k 1rn | cut -f 1 > freqs.txt
|
||||
#+END_SRC
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :results file
|
||||
%matplotlib inline
|
||||
import matplotlib.pyplot as plt
|
||||
import re
|
||||
from math import log
|
||||
|
||||
freqs = []
|
||||
|
||||
with open('freqs.txt', 'r') as fh:
|
||||
for line in fh:
|
||||
m = re.match(r'\s*(\d+)', line)
|
||||
if m:
|
||||
freqs.append(int(m.group(1)))
|
||||
|
||||
plt.plot(range(len(freqs)), freqs)
|
||||
fname = 'word-distribution.png'
|
||||
plt.savefig(fname)
|
||||
fname
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
# Out[25]:
|
||||
: 'word-distribution.png'
|
||||
[[file:./obipy-resources/c0TrCn.png]]
|
||||
|
||||
|
||||
** Lematyzacja
|
||||
|
||||
Lematyzacja wydaje się dobrym pomysłem, zwłaszcza dla języków o bogatej fleksji:
|
||||
|
||||
- znacznie redukujemy słownik,
|
||||
- formy fleksyjne tego samego wyrazu są traktowane tak samo (co wydaje się słuszne).
|
||||
|
||||
W praktyce współcześnie *nie* stosuje się lematyzacji (w połączeniu z
|
||||
metodami opartymi na sieciach neuronowych):
|
||||
|
||||
- lematyzacja wymaga wiedzy językowej (reguł lub słownika),
|
||||
wytworzenie takiej wiedzy może być kosztowne, obecnie preferowane
|
||||
są metody niezależne od języka;
|
||||
- tracimy pewną informację niesioną przez formę fleksyjną (co w szczególnych
|
||||
przypadkach może być niefortunne, np. /aspiracja/ i /aspiracje/);
|
||||
- lematyzacja nie jest trywialnym problemem ze względu na niejednoznaczności
|
||||
(/Lekarzu, lecz się sam/);
|
||||
- niektóre niejednoznaczności są seryjne, wybór lematu może być arbitralny,
|
||||
np. czy /posiadanie/, /gotowanie/, /skakanie/ to rzeczowniki czy czasowniki?
|
||||
a /urządzenie/, /mieszkanie/?
|
||||
- zazwyczaj sieci neuronowe (czy nawet prostsze modele typu Word2vec)
|
||||
są w stanie nauczyć się rekonstruowania zależności między formami fleksyjnymi
|
||||
(i więcej: błędnych form, błędów ortograficznych, form archaicznych itd.)
|
||||
|
||||
** Zejście na poziom znaków
|
||||
|
||||
Skoro słownik wyrazów jest zbyt duży, to może zejść na poziom znaków?
|
||||
|
||||
- pojedynczy znak alfabetu wprawdzie nic nie znaczy (co znaczy /h/?)
|
||||
|
||||
- … ale rozmiar wejścia przy kodowaniu gorącą jedynką
|
||||
dramatycznie się zmniejsza
|
||||
|
||||
- może działać, jeśli dodać wielowarstwową sieć
|
||||
neuronową
|
||||
|
||||
- … ale może być bardzo kosztowne obliczeniowo
|
||||
|
||||
A może coś pośredniego między znakami a wyrazami?
|
||||
|
||||
** BPE
|
||||
|
||||
Ani znaki, ani wyrazy — coś pomiędzy: jednostki podwyrazowe (/subword
|
||||
units/). Moglibyśmy np. dzielić wyraz /superkomputera/ na dwie
|
||||
jednostki /super/+/komputera/, a może nawet trzy: /super/+/komputer/+/a/?
|
||||
|
||||
Najpopularniejszy algorytm podziału na jednostki podwyrazowe to BPE
|
||||
(/byte-pair encoding/), zainspirowany algorytmami kompresji danych.
|
||||
Lista jednostek jest automatycznie indukowana na podstawie tekstu (nie
|
||||
potrzeba żadnej wiedzy o języku!). Ich liczba musi być natomiast z góry
|
||||
określona.
|
||||
|
||||
W kroku początkowym zaznaczamy końce wyrazów (tokenów), robimy to po
|
||||
to, żeby jednostki podwyrazowe nie przekraczały granic wyrazów.
|
||||
|
||||
Następnie wykonujemy tyle kroków iteracji, ile wynosi rozmiar zadanego
|
||||
słownika. W każdym kroku szukamy najczęstszego bigramu, od tego
|
||||
momentu traktujemy go jako całostkę (wkładamy go do „pudełka”).
|
||||
|
||||
[[./bpe.png]]
|
||||
|
||||
*** Implementacja w Pythonie
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
from collections import Counter
|
||||
|
||||
def replace_bigram(l, b, r):
    """Replace each occurrence of the bigram *b* in the token list *l* with *r*.

    The list is scanned left to right and modified in place; the same list
    object is returned for convenience.

    :param l: list of tokens (strings); mutated in place
    :param b: pair ``(left, right)`` of adjacent tokens to merge
    :param r: token that replaces each matched pair
    :return: the (mutated) list *l*
    """
    i = 0
    while i < len(l) - 1:
        if (l[i], l[i + 1]) == b:
            # Collapse the two tokens into one; the list shrinks by one.
            l[i:i + 2] = [r]
        # Always advance, so a freshly merged token is never the left
        # element of another merge within the same pass.
        i += 1
    return l

def learn_bpe_vocab(d, max_vocab_size):
    """Learn a BPE vocabulary (list of merged subwords) from the text *d*.

    Spaces are turned into the end-of-word marker ``$`` and one more ``$``
    is appended, so every word ends with ``$``.  In each iteration the most
    frequent adjacent token pair is merged into a new subword.  Pairs whose
    left token already ends with ``$`` are never considered, so subwords do
    not cross word boundaries.

    :param d: training text (words separated by single spaces)
    :param max_vocab_size: maximum number of merges to learn
    :return: list of learned subwords in the order they were created; it may
        be shorter than *max_vocab_size* when the text runs out of
        mergeable pairs
    """
    d = list(d.replace(' ', '$') + '$')

    vocab = []

    for _ in range(max_vocab_size):
        bigrams = [(d[i], d[i + 1]) for i in range(len(d) - 1) if d[i][-1] != '$']
        if not bigrams:
            # Every word is already a single token (or the text is empty):
            # nothing is left to merge, so stop instead of failing on
            # most_common(1)[0].
            break
        selected_bigram = Counter(bigrams).most_common(1)[0][0]

        new_subword = selected_bigram[0] + selected_bigram[1]
        d = replace_bigram(d, selected_bigram, new_subword)

        vocab.append(new_subword)

    return vocab
|
||||
|
||||
vocab1 = learn_bpe_vocab('to be or not to be that is the question', 10)
|
||||
vocab1
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[1]:
|
||||
: ['e$', 'to', 'to$', 'be$', 't$', 'th', 'or', 'or$', 'no', 'not$']
|
||||
:end:
|
||||
|
||||
Słownik jednostek podwyrazowych możemy zastosować do dowolnego tekstu, np. do tekstu,
|
||||
na którym słownik był wyuczony:
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
def apply_bpe_vocab(vocab, d):
    """Segment the text *d* into subword units using the BPE vocabulary *vocab*.

    The text is first split into single characters, with spaces replaced by
    the end-of-word marker ``$`` (and a final ``$`` appended).  Adjacent
    tokens are then merged greedily, left to right, whenever their
    concatenation is in the vocabulary; passes are repeated until a full
    pass makes no merge.

    :param vocab: iterable of subwords (e.g. the result of learning BPE)
    :param d: text to segment (words separated by single spaces)
    :return: list of tokens; characters not covered by any subword remain
        as single-character tokens
    """
    d = list(d.replace(' ', '$') + '$')
    vocab_set = set(vocab)  # O(1) membership tests inside the loops

    modified = True
    while modified:
        ix = 0
        modified = False
        while ix < len(d) - 1:
            bigram = d[ix] + d[ix + 1]
            # A token ending with '$' closes a word; never glue it to the
            # next word, mirroring the restriction used when learning.
            if not d[ix].endswith('$') and bigram in vocab_set:
                d[ix:ix + 2] = [bigram]
                modified = True
                # ix is not advanced: the merged token may merge again
                # with its new right neighbour.
            else:
                ix += 1

    return d
|
||||
|
||||
' '.join(apply_bpe_vocab(vocab1, 'to be or not to be that is the question'))
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[5]:
|
||||
: 'to$ be$ or$ not$ to$ be$ th a t$ i s $ th e$ q u e s t i o n $'
|
||||
:end:
|
||||
|
||||
Zauważmy, że oprócz jednostek podwyrazowych zostały izolowane litery,
|
||||
zazwyczaj dodajemy je do słownika. (I zazwyczaj, słownik jest trochę
|
||||
większy niż wartość podana jako parametr przy uczeniu BPE — jest
|
||||
większy o znaki i specjalne tokeny typu ~UNK~, ~BOS~, ~EOS~, ~PAD~.)
|
||||
|
||||
*Pytanie*: Jaki problem może się pojawić przy zastosowaniu BPE dla tekstu,
|
||||
gdzie pojawiają się chińskie znaki? Jak można sobie z nim poradzić?
|
||||
|
||||
Słownik jednostek podwyrazowych można stosować dla dowolnego tekstu:
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
' '.join(apply_bpe_vocab(vocab1, 'tom will be the best'))
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[6]:
|
||||
: 'to m $ w i l l $ be$ th e$ b e s t$'
|
||||
:end:
|
||||
|
||||
Jak można zauważyć algorytm BPE daje dwa rodzaje jednostek podwyrazowych:
|
||||
|
||||
- jednostki, które mogą być doklejane na początku wyrazu;
|
||||
- jednostki, które stanowią koniec wyrazu, w szczególności są całym wyrazem.
|
||||
|
||||
*** Gotowa implementacja
|
||||
|
||||
Po raz pierwszy BPE użyto do neuronowego tłumaczenia maszynowego.
|
||||
Użyjmy modułu autorstwa Rica Sennricha (https://github.com/rsennrich/subword-nmt).
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! pip install subword-nmt
|
||||
#+END_SRC
|
||||
|
||||
Wyindukujmy słownik dla zbioru uczącego zadania identyfikacji płci
|
||||
autora tekstu:
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! xzcat petite-difference-challenge2/train/in.tsv.xz | perl -C -ne 'print "$&\n" while/\p{L}+/g;' | python -m subword_nmt.learn_bpe -s 50000 -v > bpe_vocab.txt
|
||||
#+END_SRC
|
||||
|
||||
Procedura trwa kilka minut, trzeba uzbroić się w cierpliwość (ale wypisywanie bigramów przyspieszy!).
|
||||
|
||||
#+BEGIN_SRC
|
||||
pair 0: n i -> ni (frequency 17625075)
|
||||
pair 1: i e -> ie (frequency 11471590)
|
||||
pair 2: c z -> cz (frequency 9143490)
|
||||
pair 3: ni e</w> -> nie</w> (frequency 7901783)
|
||||
pair 4: p o -> po (frequency 7790826)
|
||||
pair 5: r z -> rz (frequency 7542046)
|
||||
pair 6: s t -> st (frequency 7269069)
|
||||
pair 7: e m</w> -> em</w> (frequency 7207280)
|
||||
pair 8: d z -> dz (frequency 6860931)
|
||||
pair 9: s z -> sz (frequency 6609907)
|
||||
pair 10: r a -> ra (frequency 6601618)
|
||||
pair 11: o w -> ow (frequency 6395963)
|
||||
pair 12: i e</w> -> ie</w> (frequency 5906869)
|
||||
pair 13: n a -> na (frequency 5300380)
|
||||
pair 14: r o -> ro (frequency 5181363)
|
||||
pair 15: n a</w> -> na</w> (frequency 5125807)
|
||||
pair 16: a ł -> ał (frequency 4786696)
|
||||
pair 17: j e -> je (frequency 4599579)
|
||||
pair 18: s i -> si (frequency 4300984)
|
||||
pair 19: a l -> al (frequency 4276823)
|
||||
pair 20: t e -> te (frequency 4033344)
|
||||
pair 21: w i -> wi (frequency 3939063)
|
||||
pair 22: c h</w> -> ch</w> (frequency 3919410)
|
||||
pair 23: c h -> ch (frequency 3661410)
|
||||
pair 24: k o -> ko (frequency 3629840)
|
||||
pair 25: z a -> za (frequency 3625424)
|
||||
pair 26: t a -> ta (frequency 3570094)
|
||||
pair 27: p rz -> prz (frequency 3494551)
|
||||
pair 28: g o</w> -> go</w> (frequency 3279997)
|
||||
pair 29: a r -> ar (frequency 3081492)
|
||||
pair 30: si ę</w> -> się</w> (frequency 2973681)
|
||||
...
|
||||
pair 49970: brz mieniu</w> -> brzmieniu</w> (frequency 483)
|
||||
pair 49971: bieżą cych</w> -> bieżących</w> (frequency 483)
|
||||
pair 49972: biegu nkę</w> -> biegunkę</w> (frequency 483)
|
||||
pair 49973: ban kowości</w> -> bankowości</w> (frequency 483)
|
||||
pair 49974: ba ku</w> -> baku</w> (frequency 483)
|
||||
pair 49975: ba cznie</w> -> bacznie</w> (frequency 483)
|
||||
pair 49976: Przypad kowo</w> -> Przypadkowo</w> (frequency 483)
|
||||
pair 49977: MA Ł -> MAŁ (frequency 483)
|
||||
pair 49978: Lep pera</w> -> Leppera</w> (frequency 483)
|
||||
pair 49979: Ko za -> Koza (frequency 483)
|
||||
pair 49980: Jak byś</w> -> Jakbyś</w> (frequency 483)
|
||||
pair 49981: Geni alne</w> -> Genialne</w> (frequency 483)
|
||||
pair 49982: Że nada</w> -> Żenada</w> (frequency 482)
|
||||
pair 49983: ń czykiem</w> -> ńczykiem</w> (frequency 482)
|
||||
pair 49984: zwie ń -> zwień (frequency 482)
|
||||
pair 49985: zost ałaś</w> -> zostałaś</w> (frequency 482)
|
||||
pair 49986: zni szczona</w> -> zniszczona</w> (frequency 482)
|
||||
pair 49987: ze stawi -> zestawi (frequency 482)
|
||||
pair 49988: za sób</w> -> zasób</w> (frequency 482)
|
||||
pair 49989: węd rówkę</w> -> wędrówkę</w> (frequency 482)
|
||||
pair 49990: wysko czyła</w> -> wyskoczyła</w> (frequency 482)
|
||||
pair 49991: wyle czenia</w> -> wyleczenia</w> (frequency 482)
|
||||
pair 49992: wychowaw cze</w> -> wychowawcze</w> (frequency 482)
|
||||
pair 49993: w t -> wt (frequency 482)
|
||||
pair 49994: un da -> unda (frequency 482)
|
||||
pair 49995: udzie lałem</w> -> udzielałem</w> (frequency 482)
|
||||
pair 49996: tę czy</w> -> tęczy</w> (frequency 482)
|
||||
pair 49997: tro sce</w> -> trosce</w> (frequency 482)
|
||||
pair 49998: słusz ności</w> -> słuszności</w> (frequency 482)
|
||||
pair 49999: su me</w> -> sume</w> (frequency 482)
|
||||
#+END_SRC
|
||||
|
||||
Zastosujmy teraz wyindukowany słownik BPE dla jakiegoś rzeczywistego tekstu.
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! echo 'Cierpiałem na straszne lagi – kilkanaście sekund lub dłużej czarnego ekranu przy próbie przełączenia się / uruchomienia prawie każdej aplikacji. Dodatkowo telefon mi się wyłączał czasem bez powodu – sam z siebie, albo resetował. Ostatnio nawet przeglądarka zaczęła się często zawieszać i Android proponował wymuszone zamknięcie. Do tego te problemy z połączeniem do komputera przez USB.' | perl -C -ne 'print "$& " while/\p{L}+/g;' | python -m subword_nmt.apply_bpe -c bpe_vocab.txt
|
||||
#+END_SRC
|
||||
|
||||
Ta konkretna implementacja zaznacza za pomocą sekwencji ~@@ ~ koniec jednostki podwyrazowej.
|
133
wyk/13_generative_approach.ipynb
Normal file
@ -0,0 +1,133 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 13. <i>Podejście generatywne w ekstrakcji informacji</i> [wykład]</h2> \n",
|
||||
"<h3> Filip Graliński (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Ekstrakcja informacji a podejście generatywne\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Podejście generatywne\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Do tej pory zadanie ekstrakcji informacji traktowaliśmy jako zadanie etykietowania sekwencji, tzn. uczyliśmy system zaznaczać tokeny składające się na ekstrahowane informacje.\n",
|
||||
"\n",
|
||||
"![Ekstrakcja informacji jako etykietowanie sekwencji, schemat](./ie-seqlab.png)\n",
|
||||
"\n",
|
||||
"Możliwe jest inne podejście, **generatywne**, w którym podchodzimy do problemu ekstrakcji informacji jak do swego rodzaju **tłumaczenia maszynowego** — „tłumaczymy” tekst (wraz z pytaniem lub etykietą) na informację.\n",
|
||||
"\n",
|
||||
"![Ekstrakcja informacji w podejściu generatywnym](./ie-gener.png)\n",
|
||||
"\n",
|
||||
"To podejście może się wydawać trudniejsze niż etykietowanie sekwencji, ale przy wystarczająco zaawansowanej architekturze sieci jest wykonalne.\n",
|
||||
"\n",
|
||||
"Zalety:\n",
|
||||
"\n",
|
||||
"- informacja nie musi być dosłownie zapisana w tekście, ekstraktor może nauczyć się również normalizacji czy parafrazowania,\n",
|
||||
"- nie wprowadzamy wielu kroków przetwarzania (gdzie błędy mogą się\n",
|
||||
" namnażać), system działa na zasadzie *end-to-end*.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Atencja\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Pierwsze systemy neuronowego tłumaczenia maszynowego używały sieci LSTM. Dopiero jednak dodanie tzw. atencji (*attention*) umożliwiło duży przeskok jakościowy. Najpierw atencję dodano do sieci rekurencyjnych, później powstały sieci oparte *wyłącznie* na atencji — modele Transformer.\n",
|
||||
"\n",
|
||||
"Idea atencji polega na tym, że sieć może kierować selektywnie „snop” uwagi na wyrazy na wejściu lub do tej pory wygenerowane wyrazy.\n",
|
||||
"\n",
|
||||
"Mechanizm atencji korzysta:\n",
|
||||
"\n",
|
||||
"- z poprzedniego stanu sieci $\\vec{s^{k-1}}$ (to jest „miejsce”, z którego „kierujemy” atencję),\n",
|
||||
"- z wektora reprezentującego słowo $\\vec{v}(t_i)$ (to jest „miejsce”, na które kierujemy atencję), gdzie\n",
|
||||
" $\\vec{v}(t_i)$ to reprezentacja wektorowa wyrazu $t_i$ (statyczny embedding lub reprezentacja wektorowa\n",
|
||||
" z poprzedniej warstwy dla sieci wielowarstwowej),\n",
|
||||
"\n",
|
||||
"aby wytworzyć wektor kontekstu $\\vec{\\xi^k}$ (który z kolei będzie w jakiś sposób wnosił wkład do wyliczenia nowej wartości stanu $\\vec{s^k}$ lub wyjścia $y^k$).\n",
|
||||
"\n",
|
||||
"Najpierw wyliczymy skalarne wartości atencji, tzn. liczby, które będą sygnalizowały, jak bardzo wektor $\\vec{v}(t_i)$ „pasuje” do $\\vec{s^{k-1}}$, w najprostszej wersji można po prostu skorzystać z iloczynu skalarnego (o ile $n=m$),\n",
|
||||
"\n",
|
||||
"$$a(\\vec{s^{k-1}}, \\vec{v}(t_i)) = \\vec{s^{k-1}}\\vec{v}(t_i).$$\n",
|
||||
"\n",
|
||||
"**Pytanie**: co jeśli $n$ nie jest równe $m$, tzn. rozmiar embeddingu nie jest równy rozmiarowi wektora stanu?\n",
|
||||
"\n",
|
||||
"W przypadku sieci LSTM korzysta się częściej z bardziej skomplikowanego wzoru zawierającego dodatkowe wyuczalne wagi:\n",
|
||||
"\n",
|
||||
"$$a(\\vec{s^{k-1}}, \\vec{v}(t_i)) = \\vec{w_a}\\operatorname{tanh}(W_a\\vec{s^{k-1}} + U_a\\vec{v}(t_i))$$\n",
|
||||
"\n",
|
||||
"**Pytanie**: jakie rozmiary mają macierze $W_a$, $U_a$ i wektor $w_a$?\n",
|
||||
"\n",
|
||||
"Powtórzmy, że wartości $a$ są wartościami skalarnymi, natomiast nie są one znormalizowane (nie sumują się do jedynki), normalizujemy je używając schematu podobnego do softmaxa:\n",
|
||||
"\n",
|
||||
"$$\\alpha_{i} = \\frac{e^{a(\\vec{s^{k-1}}, \\vec{v}(t_i))}}{\\sum_j e^{a(\\vec{s^{k-1}}, \\vec{v}(t_j))}}$$\n",
|
||||
"\n",
|
||||
"Wektor kontekstu $\\vec{\\xi^k}$ będzie po prostu średnią ważoną wektorowych reprezentacji słów:\n",
|
||||
"\n",
|
||||
"$$\\vec{\\xi^k} = \\sum_i \\alpha_i\\vec{v}(t_i)$$\n",
|
||||
"\n",
|
||||
"**Pytanie**: zasadniczo atencja jest środkiem do celu (żeby sieć się sprawniej uczyła), czy atencja sama w sobie może być do czegoś przydatna?\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Filip Graliński",
|
||||
"email": "filipg@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
},
|
||||
"org": null,
|
||||
"subtitle": "13.Podejście generatywne w ekstrakcji informacji[wykład]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
55
wyk/13_generative_approach.org
Normal file
@ -0,0 +1,55 @@
|
||||
* Ekstrakcja informacji a podejście generatywne
|
||||
** Podejście generatywne
|
||||
|
||||
Do tej pory zadanie ekstrakcji informacji traktowaliśmy jako zadanie etykietowania sekwencji, tzn. uczyliśmy system zaznaczać tokeny składające się na ekstrahowane informacje.
|
||||
|
||||
[[./ie-seqlab.png]]
|
||||
|
||||
Możliwe jest inne podejście, *generatywne*, w którym podchodzimy do problemu ekstrakcji informacji jak do swego rodzaju *tłumaczenia maszynowego* — „tłumaczymy” tekst (wraz z pytaniem lub etykietą) na informację.
|
||||
|
||||
[[./ie-gener.png]]
|
||||
|
||||
To podejście może się wydawać trudniejsze niż etykietowanie sekwencji, ale przy wystarczająco zaawansowanej architekturze sieci jest wykonalne.
|
||||
|
||||
Zalety:
|
||||
|
||||
- informacja nie musi być dosłownie zapisana w tekście, ekstraktor może nauczyć się również normalizacji czy parafrazowania,
|
||||
- nie wprowadzamy wielu kroków przetwarzania (gdzie błędy mogą się
|
||||
namnażać), system działa na zasadzie /end-to-end/.
|
||||
|
||||
** Atencja
|
||||
|
||||
Pierwsze systemy neuronowego tłumaczenia maszynowego używały sieci LSTM. Dopiero jednak dodanie tzw. atencji (/attention/) umożliwiło duży przeskok jakościowy. Najpierw atencję dodano do sieci rekurencyjnych, później powstały sieci oparte /wyłącznie/ na atencji — modele Transformer.
|
||||
|
||||
Idea atencji polega na tym, że sieć może kierować selektywnie „snop” uwagi na wyrazy na wejściu lub do tej pory wygenerowane wyrazy.
|
||||
|
||||
Mechanizm atencji korzysta:
|
||||
|
||||
- z poprzedniego stanu sieci $\vec{s^{k-1}}$ (to jest „miejsce”, z którego „kierujemy” atencję),
|
||||
- z wektora reprezentującego słowo $\vec{v}(t_i)$ (to jest „miejsce”, na które kierujemy atencję), gdzie
|
||||
$\vec{v}(t_i)$ to reprezentacja wektorowa wyrazu $t_i$ (statyczny embedding lub reprezentacja wektorowa
|
||||
z poprzedniej warstwy dla sieci wielowarstwowej),
|
||||
|
||||
aby wytworzyć wektor kontekstu $\vec{\xi^k}$ (który z kolei będzie w jakiś sposób wnosił wkład do wyliczenia nowej wartości stanu $\vec{s^k}$ lub wyjścia $y^k$).
|
||||
|
||||
Najpierw wyliczymy skalarne wartości atencji, tzn. liczby, które będą sygnalizowały, jak bardzo wektor $\vec{v}(t_i)$ „pasuje” do $\vec{s^{k-1}}$, w najprostszej wersji można po prostu skorzystać z iloczynu skalarnego (o ile $n=m$),
|
||||
|
||||
$$a(\vec{s^{k-1}}, \vec{v}(t_i)) = \vec{s^{k-1}}\vec{v}(t_i).$$
|
||||
|
||||
*Pytanie*: co jeśli $n$ nie jest równe $m$, tzn. rozmiar embeddingu nie jest równy rozmiarowi wektora stanu?
|
||||
|
||||
W przypadku sieci LSTM korzysta się częściej z bardziej skomplikowanego wzoru zawierającego dodatkowe wyuczalne wagi:
|
||||
|
||||
$$a(\vec{s^{k-1}}, \vec{v}(t_i)) = \vec{w_a}\operatorname{tanh}(W_a\vec{s^{k-1}} + U_a\vec{v}(t_i))$$
|
||||
|
||||
*Pytanie*: jakie rozmiary mają macierze $W_a$, $U_a$ i wektor $w_a$?
|
||||
|
||||
Powtórzmy, że wartości $a$ są wartościami skalarnymi, natomiast nie są one znormalizowane (nie sumują się do jedynki), normalizujemy je używając schematu podobnego do softmaxa:
|
||||
|
||||
$$\alpha_{i} = \frac{e^{a(\vec{s^{k-1}}, \vec{v}(t_i))}}{\sum_j e^{a(\vec{s^{k-1}}, \vec{v}(t_j))}}$$
|
||||
|
||||
Wektor kontekstu $\vec{\xi^k}$ będzie po prostu średnią ważoną wektorowych reprezentacji słów:
|
||||
|
||||
$$\vec{\xi^k} = \sum_i \alpha_i\vec{v}(t_i)$$
|
||||
|
||||
*Pytanie*: zasadniczo atencja jest środkiem do celu (żeby sieć się sprawniej uczyła), czy atencja sama w sobie może być do czegoś przydatna?
|
389
wyk/14_pretrenowanie.ipynb
Normal file
@ -0,0 +1,389 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 14. <i>Pretrenowane modele języka</i> [wykład]</h2> \n",
|
||||
"<h3> Filip Graliński (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Pretrenowanie modeli\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"System AlphaZero uczy się grając sam ze sobą — wystarczy 24 godziny,\n",
|
||||
"by system nauczył się grać w szachy lub go na nadludzkim poziomie.\n",
|
||||
"\n",
|
||||
"**Pytanie**: Dlaczego granie samemu ze sobą nie jest dobrym sposobem\n",
|
||||
" nauczenia się grania w szachy dla człowieka, a dla maszyny jest?\n",
|
||||
"\n",
|
||||
"Co jest odpowiednikiem grania samemu ze sobą w świecie przetwarzania tekstu?\n",
|
||||
"Tzn. **pretrenowanie** (*pretraining*) na dużym korpusie tekstu. (Tekst jest tani!)\n",
|
||||
"\n",
|
||||
"Jest kilka sposobów na pretrenowanie modelu, w każdym razie sprowadza\n",
|
||||
"się ono do odgadywania następnego bądź zamaskowanego słowa.\n",
|
||||
"W każdym razie zawsze stosujemy softmax (być może ze „sztuczkami” takimi jak\n",
|
||||
"negatywne próbkowanie albo hierarchiczny softmax) na pewnej **reprezentacji kontekstowej**:\n",
|
||||
"\n",
|
||||
"$$\\vec{p} = \\operatorname{softmax}(f(\\vec{c})).$$\n",
|
||||
"\n",
|
||||
"Model jest karany przy użyciu funkcji log loss:\n",
|
||||
"\n",
|
||||
"$$-\\log(p_j),$$\n",
|
||||
"\n",
|
||||
"gdzie $w_j$ jest wyrazem, który pojawił się rzeczywiście w korpusie.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Przewidywanie słowa (GPT-2)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Jeden ze sposobów pretrenowania modelu to po prostu przewidywanie\n",
|
||||
"następnego słowa.\n",
|
||||
"\n",
|
||||
"Zainstalujmy najpierw bibliotekę transformers.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"! pip install transformers"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"50257\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('Âł', 0.6182783842086792),\n",
|
||||
" ('È', 0.1154019758105278),\n",
|
||||
" ('Ñģ', 0.026960616931319237),\n",
|
||||
" ('_____', 0.024418892338871956),\n",
|
||||
" ('________', 0.014962316490709782),\n",
|
||||
" ('ÃĤ', 0.010653386823832989),\n",
|
||||
" ('ä¸Ń', 0.008340531960129738),\n",
|
||||
" ('Ñ', 0.007557711564004421),\n",
|
||||
" ('Ê', 0.007046067621558905),\n",
|
||||
" ('ãĢ', 0.006875576451420784),\n",
|
||||
" ('ile', 0.006685272324830294),\n",
|
||||
" ('____', 0.006307446397840977),\n",
|
||||
" ('âĢĭ', 0.006306538358330727),\n",
|
||||
" ('ÑĢ', 0.006197483278810978),\n",
|
||||
" ('ĠBelarus', 0.006108700763434172),\n",
|
||||
" ('Æ', 0.005720408633351326),\n",
|
||||
" ('ĠPoland', 0.0053678699769079685),\n",
|
||||
" ('á¹', 0.004606408067047596),\n",
|
||||
" ('îĢ', 0.004161055199801922),\n",
|
||||
" ('????', 0.004056799225509167),\n",
|
||||
" ('_______', 0.0038176667876541615),\n",
|
||||
" ('ä¸', 0.0036082742735743523),\n",
|
||||
" ('Ì', 0.003221835708245635),\n",
|
||||
" ('urs', 0.003080119378864765),\n",
|
||||
" ('________________', 0.0027312245219945908),\n",
|
||||
" ('ĠLithuania', 0.0023860156070441008),\n",
|
||||
" ('ich', 0.0021211160346865654),\n",
|
||||
" ('iz', 0.002069818088784814),\n",
|
||||
" ('vern', 0.002001357264816761),\n",
|
||||
" ('ÅĤ', 0.001717406208626926)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"from transformers import GPT2Tokenizer, GPT2LMHeadModel\n",
|
||||
"tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')\n",
|
||||
"model = GPT2LMHeadModel.from_pretrained('gpt2-large')\n",
|
||||
"text = 'Warsaw is the capital city of'\n",
|
||||
"encoded_input = tokenizer(text, return_tensors='pt')\n",
|
||||
"output = model(**encoded_input)\n",
|
||||
"next_token_probs = torch.softmax(output[0][:, -1, :][0], dim=0)\n",
|
||||
"\n",
|
||||
"nb_of_tokens = next_token_probs.size()[0]\n",
|
||||
"print(nb_of_tokens)\n",
|
||||
"\n",
|
||||
"_, top_k_indices = torch.topk(next_token_probs, 30, sorted=True)\n",
|
||||
"\n",
|
||||
"words = tokenizer.convert_ids_to_tokens(top_k_indices)\n",
|
||||
"\n",
|
||||
"top_probs = []\n",
|
||||
"\n",
|
||||
"for ix in range(len(top_k_indices)):\n",
|
||||
" top_probs.append((words[ix], next_token_probs[top_k_indices[ix]].item()))\n",
|
||||
"\n",
|
||||
"top_probs"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Zalety tego podejścia:\n",
|
||||
"\n",
|
||||
"- prostota,\n",
|
||||
"- dobra podstawa do strojenia systemów generowania tekstu zwłaszcza\n",
|
||||
" „otwartego” (systemy dialogowe, generowanie (fake) newsów, streszczanie tekstu),\n",
|
||||
" ale niekoniecznie tłumaczenia maszynowego,\n",
|
||||
"- zaskakująca skuteczność przy uczeniu *few-shot* i *zero-shot*.\n",
|
||||
"\n",
|
||||
"Wady:\n",
|
||||
"\n",
|
||||
"- asymetryczność, przetwarzanie tylko z lewej do prawej, preferencja\n",
|
||||
" dla lewego kontekstu,\n",
|
||||
"- mniejsza skuteczność przy dostrajaniu do zadań klasyfikacji i innych zadań\n",
|
||||
" niepolegających na prostym generowaniu.\n",
|
||||
"\n",
|
||||
"Przykłady modeli: GPT, GPT-2, GPT-3, DialoGPT.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Maskowanie słów (BERT)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Inną metodą jest maskowanie słów (*Masked Language Modeling*, *MLM*).\n",
|
||||
"\n",
|
||||
"W tym podejściu zastępujemy losowo wybrane słowa specjalnym\n",
|
||||
"tokenem (`[MASK]`) i każemy modelowi odgadywać w ten sposób\n",
|
||||
"zamaskowane słowa (z uwzględnieniem również prawego kontekstu!).\n",
|
||||
"\n",
|
||||
"Mówiąc ściśle, w jednym z pierwszych modeli tego typu (BERT)\n",
|
||||
"zastosowano schemat, w którym również niezamaskowane słowa są odgadywane (!):\n",
|
||||
"\n",
|
||||
"- wybieramy losowe 15% wyrazów do odgadnięcia\n",
|
||||
"- 80% z nich zastępujemy tokenem `[MASK]`,\n",
|
||||
"- 10% zastępujemy innym losowym wyrazem,\n",
|
||||
"- 10% pozostawiamy bez zmian.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/home/filipg/.local/lib/python3.9/site-packages/transformers/models/auto/modeling_auto.py:806: FutureWarning: The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"W którym państwie leży Bombaj? W USA. (score: 0.16715531051158905)\n",
|
||||
"W którym państwie leży Bombaj? W India. (score: 0.09912960231304169)\n",
|
||||
"W którym państwie leży Bombaj? W Indian. (score: 0.039642028510570526)\n",
|
||||
"W którym państwie leży Bombaj? W Nepal. (score: 0.027137665078043938)\n",
|
||||
"W którym państwie leży Bombaj? W Pakistan. (score: 0.027065709233283997)\n",
|
||||
"W którym państwie leży Bombaj? W Polsce. (score: 0.023737527430057526)\n",
|
||||
"W którym państwie leży Bombaj? W .... (score: 0.02306722290813923)\n",
|
||||
"W którym państwie leży Bombaj? W Bangladesh. (score: 0.022106658667325974)\n",
|
||||
"W którym państwie leży Bombaj? W .... (score: 0.01628892682492733)\n",
|
||||
"W którym państwie leży Bombaj? W Niemczech. (score: 0.014501162804663181)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from transformers import AutoModelWithLMHead, AutoTokenizer\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"tokenizer = AutoTokenizer.from_pretrained(\"xlm-roberta-large\")\n",
|
||||
"model = AutoModelWithLMHead.from_pretrained(\"xlm-roberta-large\")\n",
|
||||
"\n",
|
||||
"sequence = f'W którym państwie leży Bombaj? W {tokenizer.mask_token}.'\n",
|
||||
"\n",
|
||||
"input_ids = tokenizer.encode(sequence, return_tensors=\"pt\")\n",
|
||||
"mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]\n",
|
||||
"\n",
|
||||
"token_logits = model(input_ids)[0]\n",
|
||||
"mask_token_logits = token_logits[0, mask_token_index, :]\n",
|
||||
"mask_token_logits = torch.softmax(mask_token_logits, dim=1)\n",
|
||||
"\n",
|
||||
"top_10 = torch.topk(mask_token_logits, 10, dim=1)\n",
|
||||
"top_10_tokens = zip(top_10.indices[0].tolist(), top_10.values[0].tolist())\n",
|
||||
"\n",
|
||||
"for token, score in top_10_tokens:\n",
|
||||
" print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])), f\"(score: {score})\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Przykłady: BERT, RoBERTa (również Polish RoBERTa).\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Podejście generatywne (koder-dekoder).\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"System ma wygenerować odpowiedź na różne pytania (również\n",
|
||||
"odpowiadające zadaniu MLM), np.:\n",
|
||||
"\n",
|
||||
"- \"translate English to German: That is good.\" => \"Das ist gut.\"\n",
|
||||
"- \"cola sentence: The course is jumping well.\" => \"not acceptable\"\n",
|
||||
"- \"summarize: state authorities dispatched emergency crews tuesday to survey the damage after an onslaught of severe weather in mississippi…\"\n",
|
||||
" => \"six people hospitalized after a storm in attala county\"\n",
|
||||
"- \"Thank you for <X> me to your party <Y> week.\" => <X> for inviting <Y> last <Z>\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['World War II ended in World War II.',\n",
|
||||
" 'World War II ended in 1945..',\n",
|
||||
" 'World War II ended in 1945.',\n",
|
||||
" 'World War II ended in 1945.',\n",
|
||||
" 'World War II ended in 1945.']"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration\n",
|
||||
"\n",
|
||||
"T5_PATH = 't5-base'\n",
|
||||
"\n",
|
||||
"t5_tokenizer = T5Tokenizer.from_pretrained(T5_PATH)\n",
|
||||
"t5_config = T5Config.from_pretrained(T5_PATH)\n",
|
||||
"t5_mlm = T5ForConditionalGeneration.from_pretrained(T5_PATH, config=t5_config)\n",
|
||||
"\n",
|
||||
"slot = '<extra_id_0>'\n",
|
||||
"\n",
|
||||
"text = f'World War II ended in {slot}.'\n",
|
||||
"\n",
|
||||
"encoded = t5_tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')\n",
|
||||
"input_ids = encoded['input_ids']\n",
|
||||
"\n",
|
||||
"outputs = t5_mlm.generate(input_ids=input_ids,\n",
|
||||
" num_beams=200, num_return_sequences=5,\n",
|
||||
" max_length=5)\n",
|
||||
"\n",
|
||||
"_0_index = text.index(slot)\n",
|
||||
"_result_prefix = text[:_0_index]\n",
|
||||
"_result_suffix = text[_0_index+len(slot):]\n",
|
||||
"\n",
|
||||
"def _filter(output, end_token='<extra_id_1>'):\n",
|
||||
" _txt = t5_tokenizer.decode(output[2:], skip_special_tokens=False, clean_up_tokenization_spaces=False)\n",
|
||||
" if end_token in _txt:\n",
|
||||
" _end_token_index = _txt.index(end_token)\n",
|
||||
" return _result_prefix + _txt[:_end_token_index] + _result_suffix\n",
|
||||
" else:\n",
|
||||
" return _result_prefix + _txt + _result_suffix\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"results = [_filter(out) for out in outputs]\n",
|
||||
"results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"(Zob. [https://arxiv.org/pdf/1910.10683.pdf](https://arxiv.org/pdf/1910.10683.pdf))\n",
|
||||
"\n",
|
||||
"Przykłady: T5, mT5\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Filip Graliński",
|
||||
"email": "filipg@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
},
|
||||
"org": null,
|
||||
"subtitle": "14.Pretrenowane modele języka[wykład]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
212
wyk/14_pretrenowanie.org
Normal file
@ -0,0 +1,212 @@
|
||||
* Pretrenowanie modeli
|
||||
|
||||
System AlphaZero uczy się grając sam ze sobą — wystarczy 24 godziny,
|
||||
by system nauczył się grać w szachy lub go na nadludzkim poziomie.
|
||||
|
||||
*Pytanie*: Dlaczego granie samemu ze sobą nie jest dobrym sposobem
|
||||
nauczenia się grania w szachy dla człowieka, a dla maszyny jest?
|
||||
|
||||
Co jest odpowiednikiem grania samemu ze sobą w świecie przetwarzania tekstu?
|
||||
Tzn. *pretrenowanie* (/pretraining/) na dużym korpusie tekstu. (Tekst jest tani!)
|
||||
|
||||
Jest kilka sposobów na pretrenowanie modelu, w każdym razie sprowadza
|
||||
się ono do odgadywania następnego bądź zamaskowanego słowa.
|
||||
W każdym razie zawsze stosujemy softmax (być może ze „sztuczkami” takimi jak
|
||||
negatywne próbkowanie albo hierarchiczny softmax) na pewnej *reprezentacji kontekstowej*:
|
||||
|
||||
$$\vec{p} = \operatorname{softmax}(f(\vec{c})).$$
|
||||
|
||||
Model jest karany przy użyciu funkcji log loss:
|
||||
|
||||
$$-\log(p_j),$$
|
||||
|
||||
gdzie $w_j$ jest wyrazem, który pojawił się rzeczywiście w korpusie.
|
||||
|
||||
** Przewidywanie słowa (GPT-2)
|
||||
|
||||
Jeden ze sposobów pretrenowania modelu to po prostu przewidywanie
|
||||
następnego słowa.
|
||||
|
||||
Zainstalujmy najpierw bibliotekę transformers.
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
! pip install transformers
|
||||
#+END_SRC
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
import torch
|
||||
from transformers import GPT2Tokenizer, GPT2LMHeadModel
|
||||
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
|
||||
model = GPT2LMHeadModel.from_pretrained('gpt2-large')
|
||||
text = "Warsaw is the capital city of"
|
||||
encoded_input = tokenizer(text, return_tensors='pt')
|
||||
output = model(**encoded_input)
|
||||
next_token_probs = torch.softmax(output[0][:, -1, :][0], dim=0)
|
||||
|
||||
nb_of_tokens = next_token_probs.size()[0]
|
||||
|
||||
_, top_k_indices = torch.topk(next_token_probs, 30, sorted=True)
|
||||
top_k_indices
|
||||
# words = tokenizer.convert_ids_to_tokens(top)
|
||||
|
||||
# top_probs = []
|
||||
|
||||
# for ix in range(len(top)):
|
||||
# top_probs.append((words[ix], next_token_probs[top[ix]].item()))
|
||||
|
||||
# top_probs
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[8]:
|
||||
#+BEGIN_EXAMPLE
|
||||
[('Ġthe', 0.4415026307106018),
|
||||
('ĠPoland', 0.236798495054245),
|
||||
('ĠBelarus', 0.10114768147468567),
|
||||
('ĠUkraine', 0.058283545076847076),
|
||||
('Ġeastern', 0.020564062520861626),
|
||||
('ĠEastern', 0.011137397028505802),
|
||||
('ĠPolish', 0.010205904021859169),
|
||||
('ĠWestern', 0.00833223108202219),
|
||||
('Ġwestern', 0.006872199941426516),
|
||||
('Ġa', 0.004939113277941942),
|
||||
('ĠSlovakia', 0.003553805174306035),
|
||||
('ĠLithuania', 0.003335304092615843),
|
||||
('ĠRussia', 0.002872465644031763),
|
||||
('Ġcentral', 0.002493523992598057),
|
||||
('Ġmodern', 0.0022767107002437115),
|
||||
('ĠCzech', 0.0022264323197305202),
|
||||
('ĠPr', 0.002146221464499831),
|
||||
('Ġformer', 0.0021054286044090986),
|
||||
('Ġwhat', 0.0017435317859053612),
|
||||
('ĠSlov', 0.0014634730760008097),
|
||||
('ĠUkrainian', 0.0014347084797918797),
|
||||
('ĠCentral', 0.0013676199596375227),
|
||||
('ĠSouth', 0.0013484350638464093),
|
||||
('Ġone', 0.001204205909743905),
|
||||
('ĠNorthern', 0.0011802552035078406),
|
||||
('ĠWest', 0.001175572513602674),
|
||||
('ĠEast', 0.0011596156982704997),
|
||||
('Ġsouthern', 0.0011580033460631967),
|
||||
('Ġnorthern', 0.001110077602788806),
|
||||
('Ġ"', 0.0010494199814274907)]
|
||||
#+END_EXAMPLE
|
||||
:end:
|
||||
|
||||
Zalety tego podejścia:
|
||||
|
||||
- prostota,
|
||||
- dobra podstawa do strojenia systemów generowania tekstu zwłaszcza
|
||||
„otwartego” (systemy dialogowe, generowanie (fake) newsów, streszczanie tekstu),
|
||||
ale niekoniecznie tłumaczenia maszynowego,
|
||||
- zaskakująca skuteczność przy uczeniu /few-shot/ i /zero-shot/.
|
||||
|
||||
Wady:
|
||||
|
||||
- asymetryczność, przetwarzanie tylko z lewej do prawej, preferencja
|
||||
dla lewego kontekstu,
|
||||
- mniejsza skuteczność przy dostrajaniu do zadań klasyfikacji i innych zadań
|
||||
niepolegających na prostym generowaniu.
|
||||
|
||||
Przykłady modeli: GPT, GPT-2, GPT-3, DialoGPT.
|
||||
|
||||
** Maskowanie słów (BERT)
|
||||
|
||||
Inną metodą jest maskowanie słów (/Masked Language Modeling/, /MLM/).
|
||||
|
||||
W tym podejściu zastępujemy losowo wybrane słowa specjalnym
|
||||
tokenem (~[MASK]~) i każemy modelowi odgadywać w ten sposób
|
||||
zamaskowane słowa (z uwzględnieniem również prawego kontekstu!).
|
||||
|
||||
Mówiąc ściśle, w jednym z pierwszych modeli tego typu (BERT)
|
||||
zastosowano schemat, w którym również niezamaskowane słowa są odgadywane (!):
|
||||
|
||||
- wybieramy losowe 15% wyrazów do odgadnięcia
|
||||
- 80% z nich zastępujemy tokenem ~[MASK]~,
|
||||
- 10% zastępujemy innym losowym wyrazem,
|
||||
- 10% pozostawiamy bez zmian.
|
||||
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
from transformers import AutoModelWithLMHead, AutoTokenizer
|
||||
import torch
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
|
||||
model = AutoModelWithLMHead.from_pretrained("xlm-roberta-large")
|
||||
|
||||
sequence = f'II wojna światowa zakończyła się w {tokenizer.mask_token} roku.'
|
||||
|
||||
input_ids = tokenizer.encode(sequence, return_tensors="pt")
|
||||
mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
|
||||
|
||||
token_logits = model(input_ids)[0]
|
||||
mask_token_logits = token_logits[0, mask_token_index, :]
|
||||
mask_token_logits = torch.softmax(mask_token_logits, dim=1)
|
||||
|
||||
top_10 = torch.topk(mask_token_logits, 10, dim=1)
|
||||
top_10_tokens = zip(top_10.indices[0].tolist(), top_10.values[0].tolist())
|
||||
|
||||
for token, score in top_10_tokens:
|
||||
print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])), f"(score: {score})")
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[3]:
|
||||
:end:
|
||||
|
||||
|
||||
Przykłady: BERT, RoBERTa (również Polish RoBERTa).
|
||||
|
||||
** Podejście generatywne (koder-dekoder).
|
||||
|
||||
System ma wygenerować odpowiedź na różne pytania (również
|
||||
odpowiadające zadaniu MLM), np.:
|
||||
|
||||
- "translate English to German: That is good." => "Das ist gut."
|
||||
- "cola sentence: The course is jumping well." => "not acceptable"
|
||||
- "summarize: state authorities dispatched emergency crews tuesday to survey the damage after an onslaught of severe weather in mississippi..."
|
||||
=> "six people hospitalized after a storm in attala county"
|
||||
- "Thank you for <X> me to your party <Y> week." => <X> for inviting <Y> last <Z>
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration
|
||||
|
||||
T5_PATH = 't5-base'
|
||||
|
||||
t5_tokenizer = T5Tokenizer.from_pretrained(T5_PATH)
|
||||
t5_config = T5Config.from_pretrained(T5_PATH)
|
||||
t5_mlm = T5ForConditionalGeneration.from_pretrained(T5_PATH, config=t5_config)
|
||||
|
||||
slot = '<extra_id_0>'
|
||||
|
||||
text = f'Warsaw is the {slot} of Poland.'
|
||||
|
||||
encoded = t5_tokenizer.encode_plus(text, add_special_tokens=True, return_tensors='pt')
|
||||
input_ids = encoded['input_ids']
|
||||
|
||||
outputs = t5_mlm.generate(input_ids=input_ids,
|
||||
num_beams=200, num_return_sequences=5,
|
||||
max_length=5)
|
||||
|
||||
_0_index = text.index(slot)
|
||||
_result_prefix = text[:_0_index]
|
||||
_result_suffix = text[_0_index+len(slot):]
|
||||
|
||||
def _filter(output, end_token='<extra_id_1>'):
|
||||
_txt = t5_tokenizer.decode(output[2:], skip_special_tokens=False, clean_up_tokenization_spaces=False)
|
||||
if end_token in _txt:
|
||||
_end_token_index = _txt.index(end_token)
|
||||
return _result_prefix + _txt[:_end_token_index] + _result_suffix
|
||||
else:
|
||||
return _result_prefix + _txt + _result_suffix
|
||||
|
||||
|
||||
results = [_filter(out) for out in outputs]
|
||||
results
|
||||
#+END_SRC
|
||||
|
||||
(Zob. https://arxiv.org/pdf/1910.10683.pdf)
|
||||
|
||||
Przykłady: T5, mT5
|
273
wyk/15_transformer.ipynb
Normal file
@ -0,0 +1,273 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"![Logo 1](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech1.jpg)\n",
|
||||
"<div class=\"alert alert-block alert-info\">\n",
|
||||
"<h1> Ekstrakcja informacji </h1>\n",
|
||||
"<h2> 15. <i>Sieci Transformer i ich zastosowanie w ekstrakcji informacji</i> [wykład]</h2> \n",
|
||||
"<h3> Filip Graliński (2021)</h3>\n",
|
||||
"</div>\n",
|
||||
"\n",
|
||||
"![Logo 2](https://git.wmi.amu.edu.pl/AITech/Szablon/raw/branch/master/Logotyp_AITech2.jpg)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Modele Transformer\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Atencja\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Atencję w modelach Transformer można interpretować jako rodzaj\n",
|
||||
"„miękkiego” odpytywania swego rodzaju bazy danych, w której\n",
|
||||
"przechowywane są pary klucz-wartość. Mamy trzy rodzaje wektorów (a\n",
|
||||
"właściwie macierzy, bo wektory są od razu upakowane w macierze):\n",
|
||||
"\n",
|
||||
"- $Q$ - macierz zapytań,\n",
|
||||
"- $K$ - macierz kluczy,\n",
|
||||
"- $V$ - macierz wartości odpowiadających kluczom $K$.\n",
|
||||
"\n",
|
||||
"W atencji modeli Transformer patrzymy jak bardzo zapytania $Q$ pasują\n",
|
||||
"do kluczy $K$ i na tej podstawie zwracamy wartości $V$ (im bardziej\n",
|
||||
"**klucz** pasuje do **zapytania**, tym większy wkład wnosi odpowiednia **wartość**).\n",
|
||||
"Ten rodzaj odpytywania można zrealizować z pomocą mnożenia macierzy i funkcji softmax:\n",
|
||||
"\n",
|
||||
"$$\\operatorname{Attention}(Q,K,V) = \\operatorname{softmax}(QK^T)V$$\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Uproszczony przykład\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Załóżmy, że rozmiar embeddingu wynosi 4, w macierzach rozpatrywać\n",
|
||||
"będziemy po 3 wektory naraz (możemy sobie wyobrazić, że zdanie zawiera 3 wyrazy).\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"tensor([[20.5700, 36.2400, 31.1000],\n",
|
||||
" [15.1100, 13.9100, 7.9500],\n",
|
||||
" [ 2.2100, 7.1800, 7.4000]])"
|
||||
]
|
||||
},
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"Q = torch.tensor([\n",
|
||||
" [0.3, -2.0, 0.4, 6.0],\n",
|
||||
" [-1.0, 1.5, 0.2, 3.0],\n",
|
||||
" [0.3, -1.0, 0.2, 1.0]])\n",
|
||||
"\n",
|
||||
"K = torch.tensor([\n",
|
||||
" [-0.5, 1.7, 0.3, 4.0],\n",
|
||||
" [0.4, -1.5, 0.3, 5.5],\n",
|
||||
" [-1.0, -3.5, 1.0, 4.0]])\n",
|
||||
"\n",
|
||||
"M = Q @ torch.transpose(K, 0, 1)\n",
|
||||
"M"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Jak widać, najbardziej pierwszy wektor $Q$ pasuje do drugiego wektora $K$.\n",
|
||||
"Znormalizujmy te wartości, używając funkcji softmax.\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"tensor([[1.5562e-07, 9.9418e-01, 5.8236e-03],\n",
|
||||
" [7.6807e-01, 2.3134e-01, 5.9683e-04],\n",
|
||||
" [3.0817e-03, 4.4385e-01, 5.5307e-01]])"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"Mn = torch.softmax(M, 1)\n",
|
||||
"Mn"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Drugi wektor zapytania najbardziej pasuje do pierwszego klucza, trochę\n",
|
||||
"mniej do drugiego klucza, o wiele mniej do trzeciego klucza. Te\n",
|
||||
"wektory to oczywiście wektory atencji (drugie słowo najbardziej\n",
|
||||
"„patrzy” na pierwsze słowo).\n",
|
||||
"\n",
|
||||
"Teraz będziemy przemnażać przez wektory wartości:\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"tensor([[ 3.9750e+00, 9.9419e-02, 1.0116e-01, 1.5765e-01, 5.8255e-04],\n",
|
||||
" [ 9.2517e-01, 6.9357e+00, 2.3313e-02, -3.8112e+00, 9.2174e-01],\n",
|
||||
" [ 1.6095e+00, 7.2120e-02, 2.1031e-01, 5.5597e+00, 5.9005e-02]])"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"V = torch.tensor([\n",
|
||||
" [0.0, 9.0, 0.0, -5.0, 1.2],\n",
|
||||
" [4.0, 0.1, 0.1, 0.1, 0.0],\n",
|
||||
" [-0.3, 0.0, 0.3, 10.0, 0.1]])\n",
|
||||
"\n",
|
||||
"Mn @ V"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Dodatkowa normalizacja\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"W praktyce dobrze jest znormalizować pierwszy iloczyn przez\n",
|
||||
"$\\sqrt{d_k}$, gdzie $d_k$ to rozmiar wektora klucza.\n",
|
||||
"\n",
|
||||
"$$\\operatorname{Attention}(Q,K,V) = \\operatorname{softmax}(\\frac{QK^T}{\\sqrt{d_k}})V$$\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Skąd się biorą Q, K i V?\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Wektory (macierze) $Q$, $K$ i $V$ w pierwszej warstwie pochodzą z\n",
|
||||
"embeddingów tokenów $E$ (właściwie jednostek BPE).\n",
|
||||
"\n",
|
||||
"- $Q$ = $EW^Q$\n",
|
||||
"- $K$ = $EW^K$\n",
|
||||
"- $V$ = $EW^V$\n",
|
||||
"\n",
|
||||
"W kolejnych warstwach zamiast $E$ wykorzystywane jest wyjście z poprzedniej warstwy.\n",
|
||||
"\n",
|
||||
"## Zastosowanie w ekstrakcji informacji\n",
|
||||
"\n",
|
||||
"W prosty sposób możemy do sieci Transformer dołączyć głowicę realizującą etykietowanie sekwencji."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Literatura\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"[https://arxiv.org/pdf/1706.03762.pdf](https://arxiv.org/pdf/1706.03762.pdf)\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Filip Graliński",
|
||||
"email": "filipg@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.6"
|
||||
},
|
||||
"org": null,
|
||||
"subtitle": "15.Sieci Transformer i ich zastosowanie w ekstrakcji informacji[wykład]",
|
||||
"title": "Ekstrakcja informacji",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
124
wyk/15_transformer.org
Normal file
@ -0,0 +1,124 @@
|
||||
* Modele Transformer
|
||||
|
||||
** Atencja
|
||||
|
||||
Atencję w modelach Transformer można interpretować jako rodzaj
|
||||
„miękkiego” odpytywania swego rodzaju bazy danych, w której
|
||||
przechowywane są pary klucz-wartość. Mamy trzy rodzaje wektorów (a
|
||||
właściwie macierzy, bo wektory są od razu upakowane w macierze):
|
||||
|
||||
- $Q$ - macierz zapytań,
|
||||
- $K$ - macierz kluczy,
|
||||
- $V$ - macierz wartości odpowiadających kluczom $K$.
|
||||
|
||||
W atencji modeli Transformer patrzymy jak bardzo zapytania $Q$ pasują
|
||||
do kluczy $K$ i na tej podstawie zwracamy wartości $V$ (im bardziej
|
||||
*klucz* pasuje do *zapytania*, tym większy wkład wnosi odpowiednia *wartość*).
|
||||
Ten rodzaj odpytywania można zrealizować z pomocą mnożenia macierzy i funkcji softmax:
|
||||
|
||||
$$\operatorname{Attention}(Q,K,V) = \operatorname{softmax}(QK^T)V$$
|
||||
|
||||
*** Uproszczony przykład
|
||||
|
||||
Załóżmy, że rozmiar embeddingu wynosi 4, w macierzach rozpatrywać
|
||||
będziemy po 3 wektory naraz (możemy sobie wyobrazić, że zdanie zawiera 3 wyrazy).
|
||||
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
import torch
|
||||
|
||||
Q = torch.tensor([
|
||||
[0.3, -2.0, 0.4, 6.0],
|
||||
[-1.0, 1.5, 0.2, 3.0],
|
||||
[0.3, -1.0, 0.2, 1.0]])
|
||||
|
||||
K = torch.tensor([
|
||||
[-0.5, 1.7, 0.3, 4.0],
|
||||
[0.4, -1.5, 0.3, 5.5],
|
||||
[-1.0, -3.5, 1.0, 4.0]])
|
||||
|
||||
M = Q @ torch.transpose(K, 0, 1)
|
||||
M
|
||||
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[11]:
|
||||
#+BEGIN_EXAMPLE
|
||||
tensor([[20.5700, 36.2400, 31.1000],
|
||||
[15.1100, 13.9100, 7.9500],
|
||||
[ 2.2100, 7.1800, 7.4000]])
|
||||
#+END_EXAMPLE
|
||||
:end:
|
||||
|
||||
Jak widać, najbardziej pierwszy wektor $Q$ pasuje do drugiego wektora $K$.
|
||||
Znormalizujmy te wartości, używając funkcji softmax.
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
import torch
|
||||
|
||||
Mn = torch.softmax(M, 1)
|
||||
Mn
|
||||
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[12]:
|
||||
#+BEGIN_EXAMPLE
|
||||
tensor([[1.5562e-07, 9.9418e-01, 5.8236e-03],
|
||||
[7.6807e-01, 2.3134e-01, 5.9683e-04],
|
||||
[3.0817e-03, 4.4385e-01, 5.5307e-01]])
|
||||
#+END_EXAMPLE
|
||||
:end:
|
||||
|
||||
Drugi wektor zapytania najbardziej pasuje do pierwszego klucza, trochę
|
||||
mniej do drugiego klucza, o wiele mniej do trzeciego klucza. Te
|
||||
wektory to oczywiście wektory atencji (drugie słowo najbardziej
|
||||
„patrzy” na pierwsze słowo).
|
||||
|
||||
Teraz będziemy przemnażać przez wektory wartości:
|
||||
|
||||
#+BEGIN_SRC ipython :session mysession :exports both :results raw drawer
|
||||
import torch
|
||||
|
||||
V = torch.tensor([
|
||||
[0.0, 9.0, 0.0, -5.0],
|
||||
[4.0, 0.1, 0.1, 0.1],
|
||||
[-0.3, 0.0, 0.3, 10.0]])
|
||||
|
||||
Mn @ V
|
||||
#+END_SRC
|
||||
|
||||
#+RESULTS:
|
||||
:results:
|
||||
# Out[13]:
|
||||
#+BEGIN_EXAMPLE
|
||||
tensor([[ 3.9750, 0.0994, 0.1012, 0.1577],
|
||||
[ 0.9252, 6.9357, 0.0233, -3.8112],
|
||||
[ 1.6095, 0.0721, 0.2103, 5.5597]])
|
||||
#+END_EXAMPLE
|
||||
:end:
|
||||
|
||||
*** Dodatkowa normalizacja
|
||||
|
||||
W praktyce dobrze jest znormalizować pierwszy iloczyn przez
|
||||
$\sqrt{d_k}$, gdzie $d_k$ to rozmiar wektora klucza.
|
||||
|
||||
$$\operatorname{Attention}(Q,K,V) = \operatorname{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$
|
||||
|
||||
*** Skąd się biorą Q, K i V?
|
||||
|
||||
Wektory (macierze) $Q$, $K$ i $V$ w pierwszej warstwie pochodzą z
|
||||
embeddingów tokenów $E$ (właściwie jednostek BPE).
|
||||
|
||||
- $Q$ = $EW^Q$
|
||||
- $K$ = $EW^K$
|
||||
- $V$ = $EW^V$
|
||||
|
||||
W kolejnych warstwach zamiast $E$ wykorzystywane jest wyjście z poprzedniej warstwy.
|
||||
|
||||
** Literatura
|
||||
|
||||
https://arxiv.org/pdf/1706.03762.pdf
|
BIN
wyk/bpe.png
Normal file
After Width: | Height: | Size: 124 KiB |
1
wyk/crf-viterbi.drawio
Normal file
@ -0,0 +1 @@
|
||||
<mxfile host="app.diagrams.net" modified="2021-05-26T13:43:59.377Z" agent="5.0 (X11)" etag="4emVx1cQBqc02lQgOp_X" version="14.6.13" type="device"><diagram id="7pFB8Xg2-vPC_YrQG171" name="Page-1">5ZpNc5swEIZ/Dcd2jAQ2PsbYTQ/pTKaZTt3eFJCNWowYWY5xf31FEAa0dr4DDLl40IIk9Lyr1UrYwv4muxQkjb7xkMYWGoWZhecWQlPHVr+54VAYXIwLw1qwsDDZleGG/aPaONLWHQvptvGg5DyWLG0aA54kNJANGxGC75uPrXjc7DUlawoMNwGJofUnC2VUWD00qexfKVtHZc/2eFrc2ZDyYT2SbURCvq+Z8MLCvuBcFlebzKdxzq7kUtT7cubu8cUETeRTKiwvJ+j3gvFb5s3Dq/2PdDlzPulW7ki80wO+iIl+X3koIUiaqS5mkdzEymCry60U/C/1ecyFsiQ8UU/OViyODROJ2TpRxUC9JFX22R0Vkim8F/rGhoVh3s1sHzFJb1IS5H3ulS8pm+C7JKT5+49USb+qaoBmZxnYR7LKIynfUCkO6pGywliLob3RdnR5X2lbmqKarKWGRHvT+thyBVxdaObP4I8A/82A8SO3Z/gxwJ/yPaOW71rTmeU7ljcZrhq4b2o4QA0f0FfNqNB/jlFNFLJNi/VgxbKc2tuED7dJzIPE8Ali+L2IjQGx634TQyd8rFViE0Dse7+J4VHHxJAHAYUqR9JFLmTE1zwh8aKyGsGqeuaK81TD+0OlPOiEj+wkb6KlGZPL2vWvvKnPri7NM93yfeFQFhI13mW9UKuVF6tq96WyXjG+fFAPa6YY8J0I6AOwNCtJxJrKh/KuM04gaEwku2u+yJtL6vU90CIzT+s60E7fNdDWcwUL4ZUX0CAAiYW6c+u5jvtWyZjBuPPQXG75+hubTWSdx2Yb7t96NpOx27OUyYZbrp7lTCay7icm3Cb1bGKayLqfmHAvs7XcmQKp2hvZljsHAIezr/SMKT+GYrS6r7Rh0m8PFz844zoxF9rFD9NNNFz83uNZVbv0YeqKh0sfO0bs6dr5EUxqBxx7DOe3y5WgM/ow2Rtw6AHH6507P0wcBxx7DOfHo66d3wX08xwUFTkoGnQOCpZh1PVUgEf1NTGGvSEAy8K465kBNwQ1MfCHEgN1vUZjeIYF6NMkvMj/zJFjjMl2y4KmFi89QH38zB9yrHFyT3AqbU8+ydc9XHOmOq5kmhoyuQb/4lOErlVJABrC5jGvbTRUfKsADd1reRz2K+SFOcCz5X3dx6Dq+8+vso12PgaVpF/8NagdVzselpUeMn6pq03OLLptuRo8dLv9KIdujiGiPe044cGn11g1Cn/YSthTM9t5NyVUsfr/YDGLqj9h4sV/</diagram></mxfile>
|
BIN
wyk/crf-viterbi.png
Normal file
After Width: | Height: | Size: 13 KiB |
1
wyk/ie-gener.drawio
Normal file
@ -0,0 +1 @@
|
||||
<mxfile host="app.diagrams.net" modified="2021-06-09T07:54:35.721Z" agent="5.0 (X11)" etag="NciLNBJF1axAiSJ0r0sv" version="14.7.3" type="device"><diagram id="HvCQlNLg7fWOxGx64C6g" name="Page-1">3Vjbbts4EP0aAe2DF9bNiR9jO9lisVekQLF9Y8SxRJsSVYqKrHz9Di+SJctF0m6TtvGDRB2RM+QcnhnKXrjOD79KUmZ/CArcC+b04IUbLwiWkY9XDbQWiMPQAqlk1EL+EbhlD+DAuUNrRqEadVRCcMXKMZiIooBEjTAipWjG3baCj72WJIUJcJsQPkU/MKoyi14GF0f8HbA06zz7i6V9k5Ous1tJlREqmgEUXnvhWgqhbCs/rIHr2HVxseNuPvO2n5iEQj1lgPgrjt7/+du/G76THw8f5Wyxv545K/eE127BXrDgaG+V4atFqlsdUpWk0OtQrQvO4lOtJ7+6I8k+laIu6CwRXEgvvMJ+Mr17E8Sxp6e1xutJ++1xfOfoHRDKirTzh0uxLsfTQHg6t/LFJ/u7kJDr8JZVre/UGEOPTOmtl4Oyo3FjVrgxQdX6LS6wZBVLzDLnwJnrVQE1NjTI6ioX+lFBXhqbrEgYZbQutOlaXzi5Q/+6t+p8g9l0aUG0G84+1eSXxyM5iFswCleg4GC2gco5Aj42t4zztQ3ZphAF2CibpYSb+Pj0XpQIzDB84arJmIJbxLXVBjMEYuIe5JYbKWSMUigQM4SA3ql6VKWk2EPnywvCufn100QDOL3P6sDv1YVZCQRSIVvs0g24dIJ0GclfuufmqO8gclg20rYDicspaW/7KDtsOOV9gQqDiQoV7CvNbAM7bx17y1XCRNM+gaRx6BxNZ5jDHZJi4DcJRg0QX+mYMsx7V+5FjtRoN2cpHNP1LUjxHyflHCfBc1ESnkmMJ8HHhF7q5pbD4UqXGgwFFNQ1NwknFSp9TI81AnRSdB4N1CAQ8ZlAdJgEThS7H5s/Fx3n4W/BTFZxPERRPOIBpTc2UYlaJuBGDavNiaHYf8SQIjIFNTFkyOqX/fX8RRP+tKIk2Sc7nSBNlnwgFaEPDEymtymRdfkQiZwpMcPbIFuyL8iWP6kQo8WJEC++sxCXEyK7SquLXV19XeH6llF/1moVh2M++kAP+Fi+JB9doh4Q0rR9kYJOLXey08qbprXSy0RDCjDHma2QOUl28Pb1Cik+PWYspsT1Ve9lmDtX0yxblN13dP1Tm7MnVP3B0gtvpqwOsuJg8P8TI4et+oGlODk4Rk88OD4fo9MqV7aKFFqGN6DaPQNFXq/E/PkJIeF3rlXdmWeUGwfJrzUfdfZkb0ia53XF9A0f1pF3eWE84PLb5KHAoz9GQ1QKv6rsV+KO7PXqrJVByn2tBMfLk+J3+Xw5FB+P/8XYU+jxD63w+j8=</diagram></mxfile>
|
BIN
wyk/ie-gener.png
Normal file
After Width: | Height: | Size: 30 KiB |
1
wyk/ie-seqlab.drawio
Normal file
@ -0,0 +1 @@
|
||||
<mxfile host="app.diagrams.net" modified="2021-06-09T07:51:04.277Z" agent="5.0 (X11)" etag="tdTX7mJTGI1dKBKJk9w0" version="14.7.3" type="device"><diagram id="HvCQlNLg7fWOxGx64C6g" name="Page-1">7Vhtk5MwEP41zOiHOhQKd/14bc+3Uet4zjj6LSVbyDUQDOEo/no3EEp5uempd+qc9kMbHpLd7PNkl6WWu4z3LyRJo7eCArccm+4td2U5znw2xW8NlDXguW4NhJLRGpq2wBX7Bga0DZozCllnohKCK5Z2wUAkCQSqgxEpRdGdthW86zUlIQyAq4DwIfqJURXV6Llz1uIvgYVR43nqz+s7MWkmm0iyiFBRHEHupeUupRCqHsX7JXDNXcNLve75LXcPG5OQqLssEGtv9vHd688rfi2/7L/Iib+7nBgrN4TnJmDL8TnaW0R4yw/1qEGylCQ6DlUacvyvud78YkOCXShFntBJILiQlnuB82S4eeJ4nqW3tcTv3vhpu75x9BIIZUnY+MNQapfdbSA83Fv62zf7RkiINb1plutfWhlDj0zpoxeDqlfjwczwYILK9V0MMGUZC6owbeDMzMqAVjY0yPIsFvpSQZxWNlkSMMponmjTuf7iZIP+9WzV+Ibq0IUJ0W44+5qTZ6eZPOLN6dDlKNhXx0DFHIEpDreM82VN2SoRCdQsV6G4K6+9+ihSBCZIn7soIqbgCnFttcAKgZi4AbnlVSpEjFJIEKsEAX1S9apMSbGDxpfluHb1OWwTDeD2bs2D6SG7sCqBQClkiVOaBecmIU1Fmvrmumjz25kZLOrktgGJqSnhwXabdjgwmfcDWegMslDBLtPKFnBtLT1rvgiYKMo7iNSlzsg0ohyekBCJXwXIGiC+0JwyrHsX5kaM0mg3oxJ25boPUaY9Uc6Gooxp4jyUJO5IYeyRjwU91cMth/2FftQgFZBQM1wFnGSY6V15aiNABw+dk0QdEeGNENFgEjhR7KZrfowd4+G9YFVVMTrMvK4OA4IzkcsAzKrjp03PkOecMKSIDEENDFViHcL+ef1mA/1AlTsGShQkYfB4M2nm/2WZ5P1vMR5Bi/EAQXvTkZj7rcwtOvU7nI3sq7MVVSjbOttaR0ssYAx0yO+gGPrHXxLrxEw2WTp6ba9Pz7nLmtswezFZrd+sP+Do1WG0PmKijqwb7TD+/z3fUVH0vDv0fP5Yzzd7qLLo/5sNxrl7Tw3G3P6zDcbZQL9Uikzp0lWUVGzKQHcZdtXI54+33Tif/WXtxnygS/+R8jNF7z5Zf9BKN+/nl+0N9Jj/Tj2aF7sjQYry8FILwwfYk6LUOSNJpFt1qHqTrZAxCa7h6eNNpPnZ6UfU4S35F5XDy/Z/x7ogtn/eupffAQ==</diagram></mxfile>
|
BIN
wyk/ie-seqlab.png
Normal file
After Width: | Height: | Size: 25 KiB |
BIN
wyk/img-feed-forward.png
Normal file
After Width: | Height: | Size: 21 KiB |
BIN
wyk/img-linear-regression.png
Normal file
After Width: | Height: | Size: 15 KiB |
BIN
wyk/img-logistic-regression-aardvark.png
Normal file
After Width: | Height: | Size: 15 KiB |
BIN
wyk/img-logistic-regression-hashing.png
Normal file
After Width: | Height: | Size: 15 KiB |
BIN
wyk/img-logistic-regression.png
Normal file
After Width: | Height: | Size: 15 KiB |
44
wyk/pytorch_regression/analyzer.py
Normal file
@ -0,0 +1,44 @@
|
||||
import regex as re
|
||||
from sklearn.feature_extraction.text import HashingVectorizer
|
||||
import torch
|
||||
|
||||
|
||||
token_root_len = 7
|
||||
|
||||
|
||||
class Analyzer(object):
    """Callable tokenizer that returns truncated tokens of a document.

    Instances are passed as the ``analyzer`` argument of the module's
    ``HashingVectorizer``.
    """

    def __init__(self):
        # A token is a maximal run of Unicode letters (\p{L}) and digits;
        # \p{L} needs the third-party `regex` module, imported here as `re`.
        self.token_pat = re.compile(r'(?:\p{L}|\d)+')

    def __call__(self, doc):
        """Return the tokens of ``doc``, each cut to ``token_root_len`` chars."""
        tokens = self.token_pat.findall(doc)
        return [token[:token_root_len] for token in tokens]
|
||||
|
||||
|
||||
# hiperparametr - liczba bitów hasza
|
||||
vector_length = 2**18
|
||||
|
||||
|
||||
vectorizer = HashingVectorizer(n_features=vector_length, analyzer=Analyzer())
|
||||
|
||||
midpoint = 1913.0
|
||||
|
||||
|
||||
def vectorize_text(content):
    """Vectorize a single text and return it as a 1-D PyTorch tensor.

    Conversion chain: scikit-learn sparse matrix -> dense NumPy array ->
    PyTorch tensor; the single row of the one-document batch is returned.
    """
    sparse_batch = vectorizer.fit_transform([content])
    dense_batch = sparse_batch.toarray()
    return torch.from_numpy(dense_batch)[0]
|
||||
|
||||
|
||||
def vectorize_batch(contents):
    """Vectorize a sequence of texts into a 2-D PyTorch tensor (one row each).

    Conversion chain: scikit-learn sparse matrix -> dense NumPy array ->
    PyTorch tensor.
    """
    sparse_batch = vectorizer.fit_transform(contents)
    dense_batch = sparse_batch.toarray()
    return torch.from_numpy(dense_batch)
|
||||
|
||||
|
||||
|
||||
def process_line(line):
    """Parse one tab-separated input line into a ``(content, target)`` pair.

    The line must hold exactly five fields: ``year_from``, ``year_to``, two
    ignored fields and the text content; otherwise unpacking raises
    ``ValueError``.

    The target is the mean of the two years, shifted by ``midpoint`` and
    divided by 100, wrapped in a 0-dimensional tensor.
    """
    year_from, year_to, _, _, content = line.strip('\n').split('\t')
    mean_year = (float(year_from) + float(year_to)) / 2
    # Years within a century of `midpoint` land in the (-1, 1) range.
    year_normalized = (mean_year - midpoint) / 100.0
    return (content, torch.tensor(year_normalized))
|
41
wyk/pytorch_regression/analyzer_classification.py
Normal file
@ -0,0 +1,41 @@
|
||||
import regex as re
|
||||
from sklearn.feature_extraction.text import HashingVectorizer
|
||||
import torch
|
||||
|
||||
|
||||
token_root_len = 7
|
||||
|
||||
|
||||
class Analyzer(object):
    """Callable tokenizer that returns truncated tokens of a document.

    Instances are passed as the ``analyzer`` argument of the module's
    ``HashingVectorizer``.
    """

    def __init__(self):
        # A token is a maximal run of Unicode letters (\p{L}) and digits;
        # \p{L} needs the third-party `regex` module, imported here as `re`.
        self.token_pat = re.compile(r'(?:\p{L}|\d)+')

    def __call__(self, doc):
        """Return the tokens of ``doc``, each cut to ``token_root_len`` chars."""
        found = self.token_pat.findall(doc)
        return [word[:token_root_len] for word in found]
|
||||
|
||||
|
||||
# hiperparametr - liczba bitów hasza
|
||||
vector_length = 2**18
|
||||
|
||||
|
||||
vectorizer = HashingVectorizer(n_features=vector_length, analyzer=Analyzer())
|
||||
|
||||
def vectorize_text(content):
    """Vectorize a single text and return it as a 1-D PyTorch tensor.

    Conversion chain: scikit-learn sparse matrix -> dense NumPy array ->
    PyTorch tensor; the single row of the one-document batch is returned.
    """
    one_doc_matrix = vectorizer.fit_transform([content])
    as_numpy = one_doc_matrix.toarray()
    return torch.from_numpy(as_numpy)[0]
|
||||
|
||||
|
||||
def vectorize_batch(contents):
    """Vectorize a sequence of texts into a 2-D PyTorch tensor (one row each).

    Conversion chain: scikit-learn sparse matrix -> dense NumPy array ->
    PyTorch tensor.
    """
    doc_matrix = vectorizer.fit_transform(contents)
    as_numpy = doc_matrix.toarray()
    return torch.from_numpy(as_numpy)
|
||||
|
||||
|
||||
|
||||
def process_line(line):
    """Parse one ``label<TAB>content`` line into a ``(content, target)`` pair.

    The label is converted to ``float`` and wrapped in a 0-dimensional
    tensor. ``ValueError`` is raised when the line does not consist of
    exactly two tab-separated fields or the label is not numeric.
    """
    label, content = line.strip('\n').split('\t')
    # No rescaling here: the label is used as the target value as-is.
    return (content, torch.tensor(float(label)))
|
24
wyk/pytorch_regression/linear0-infer.py
Executable file
@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
|
||||
from analyzer import midpoint, vectorize_text
|
||||
import torch
|
||||
|
||||
w = torch.load('model.bin')
|
||||
|
||||
|
||||
def model(w, x):
    """Linear model: return the product of the input ``x`` and weights ``w``."""
    return torch.matmul(x, w)
|
||||
|
||||
|
||||
for line in sys.stdin:
|
||||
line = line.strip('\n')
|
||||
content = line
|
||||
|
||||
x = vectorize_text(content)
|
||||
|
||||
y_hat = model(w, x)
|
||||
|
||||
# na wyjściu musimy mieć z powrotem rok
|
||||
print(max(1814.0, min(2013.999, y_hat.item() * 100.0 + midpoint)))
|
68
wyk/pytorch_regression/linear0.py
Executable file
@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
import torch
|
||||
|
||||
# Preprocessing i wektoryzację tekstów wydzielamy do osobnego modułu,
|
||||
# z którego będzie korzystał zarówno kod do uczenia, jak i predykcji.
|
||||
from analyzer import vectorizer, vector_length, process_line, vectorize_text
|
||||
|
||||
# Nasz model to zestaw wag w. Na koniec dojdziemy do modeli
|
||||
# Transformer o wielu milionach wag; w pewnym sensie algorytm
|
||||
# uczenia się nie zmieni.
|
||||
# W naszym zadaniu lepiej zacząć od zerowych wag zamiast losowych.
|
||||
# Znaczenie `requires_grad` zostanie omówione w kolejnym skrypcie.
|
||||
w = torch.zeros(vector_length, dtype=torch.double, requires_grad=False)
|
||||
|
||||
# Hiperparametr uczenia
|
||||
learning_rate = torch.tensor(.0032, dtype=torch.double)
|
||||
|
||||
|
||||
def model(w, x):
    """Linear model: return the product of the input ``x`` and weights ``w``.

    ``torch.matmul`` is the function behind PyTorch's ``@`` operator.
    """
    prediction = torch.matmul(x, w)
    return prediction
|
||||
|
||||
|
||||
# Funkcja kosztu.
|
||||
def loss_fun(y_hat, y_exp):
    """Squared-error loss between a prediction and the expected value."""
    error = y_hat - y_exp
    return error * error
|
||||
|
||||
|
||||
# Co ile kroków będziemy wypisywali informacje o średniej funkcji kosztu.
|
||||
# To nie jest hiperparametr uczenia, nie ma to żadnego, ani pozytywnego, ani
|
||||
# negatywnego wpływu na uczenie.
|
||||
step = 5000
|
||||
i = 1
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
for line in sys.stdin:
|
||||
content, y_exp = process_line(line)
|
||||
|
||||
x = vectorize_text(content)
|
||||
|
||||
# wartość z predykcji
|
||||
y_hat = model(w, x)
|
||||
|
||||
# wyliczamy funkcję kosztu
|
||||
loss = loss_fun(y_hat, y_exp)
|
||||
|
||||
# Obliczamy gradient i aktualizujemy wagi.
|
||||
# TU SIĘ ODBYWA UCZENIE!
|
||||
grad = (y_hat - y_exp)
|
||||
w = w - learning_rate * x * grad
|
||||
|
||||
# (Niedogodność tej wersji: jeśli zmienimy model lub funkcję kosztu,
|
||||
# będziemy musieli zróżniczkować wszystko i ustalić formułę na grad.
|
||||
|
||||
# za jakiś czas pokazujemy uśrednioną funkcję kosztu
|
||||
closs += loss
|
||||
if i % step == 0:
|
||||
print("Sample item: ", y_exp.item(), " => ", y_hat.item(),
|
||||
" | Avg loss: ", (closs / step).item())
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
i += 1
|
||||
|
||||
|
||||
# serializujemy nasz model
|
||||
torch.save(w, "model.bin")
|
23
wyk/pytorch_regression/linear1-infer.py
Executable file
@ -0,0 +1,23 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
|
||||
from analyzer import vectorizer
|
||||
import torch
|
||||
|
||||
w = torch.load('model.bin')
|
||||
|
||||
|
||||
def fun(w, x):
    """Linear model: return the product of the input ``x`` and weights ``w``."""
    return torch.matmul(x, w)
|
||||
|
||||
|
||||
for line in sys.stdin:
|
||||
line = line.strip('\n')
|
||||
content = line
|
||||
|
||||
x = (torch.from_numpy(vectorizer.fit_transform([content]).toarray()))[0]
|
||||
|
||||
y_hat = fun(w, x)
|
||||
|
||||
print(max(1814.0, min(2013.999, y_hat.item() * 100.0 + 1913.0)))
|
70
wyk/pytorch_regression/linear1.py
Executable file
@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# To samo co linear0.py tylko z automatycznym różniczkowaniem
|
||||
|
||||
import sys
|
||||
import torch
|
||||
|
||||
# Preprocessing i wektoryzację tekstów wydzielamy do osobnego modułu,
|
||||
# z którego będzie korzystał zarówno kod do uczenia, jak i predykcji.
|
||||
from analyzer import vectorizer, vector_length, process_line, vectorize_text
|
||||
|
||||
# Nasz model to zestaw wag w. Na koniec dojdziemy do modeli
|
||||
# Transformer o wielu milionach wag; w pewnym sensie algorytm
|
||||
# uczenia się nie zmieni.
|
||||
# W naszym zadaniu lepiej zacząć od zerowych wag zamiast losowych.
|
||||
# Tym razem zaznaczamy, że względem w będziemy różniczkować
|
||||
w = torch.zeros(vector_length, dtype=torch.double, requires_grad=True)
|
||||
|
||||
# Hiperparametr uczenia
|
||||
learning_rate = torch.tensor(.0032, dtype=torch.double)
|
||||
|
||||
|
||||
def model(w, x):
    """Linear model: return the product of the input ``x`` and weights ``w``.

    ``torch.matmul`` is the function behind PyTorch's ``@`` operator.
    """
    return torch.matmul(x, w)
|
||||
|
||||
|
||||
# Funkcja kosztu.
|
||||
def loss_fun(y_hat, y_exp):
    """Squared-error loss between a prediction and the expected value."""
    residual = y_hat - y_exp
    return residual * residual
|
||||
|
||||
|
||||
# Co ile kroków będziemy wypisywali informacje o średniej funkcji kosztu.
|
||||
# To nie jest hiperparametr uczenia, nie ma to żadnego, ani pozytywnego, ani
|
||||
# negatywnego wpływu na uczenie.
|
||||
step = 5000
|
||||
i = 1
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
for line in sys.stdin:
|
||||
content, y_exp = process_line(line)
|
||||
|
||||
x = vectorize_text(content)
|
||||
|
||||
# wartość z predykcji
|
||||
y_hat = model(w, x)
|
||||
|
||||
# wyliczamy funkcję kosztu
|
||||
loss = loss_fun(y_hat, y_exp)
|
||||
|
||||
loss.backward()
|
||||
|
||||
with torch.no_grad():
|
||||
w = w - learning_rate * w.grad
|
||||
closs += loss
|
||||
|
||||
# za jakiś czas pokazujemy uśrednioną funkcję kosztu
|
||||
if i % step == 0:
|
||||
print("Sample item: ", y_exp.item(), " => ", y_hat.item(),
|
||||
" | Avg loss: ", (closs / step).item())
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
i += 1
|
||||
|
||||
# ponownie włączamy śledzenie gradientu dla nowego tensora wag (gradient startuje od zera, same wagi pozostają bez zmian)
|
||||
w.requires_grad_(True)
|
||||
|
||||
|
||||
# serializujemy nasz model
|
||||
torch.save(w, "model.bin")
|
70
wyk/pytorch_regression/linear1b.py
Executable file
@ -0,0 +1,70 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# To samo co linear0.py tylko z automatycznym różniczkowaniem
|
||||
|
||||
import sys
|
||||
import torch
|
||||
|
||||
# Preprocessing i wektoryzację tekstów wydzielamy do osobnego modułu,
|
||||
# z którego będzie korzystał zarówno kod do uczenia, jak i predykcji.
|
||||
from analyzer import vectorizer, vector_length, process_line, vectorize_text
|
||||
|
||||
# Nasz model to zestaw wag w. Na koniec dojdziemy do modeli
|
||||
# Transformer o wielu milionach wag; w pewnym sensie algorytm
|
||||
# uczenia się nie zmieni.
|
||||
# W naszym zadaniu lepiej zacząć od zerowych wag zamiast losowych.
|
||||
# Tym razem zaznaczamy, że względem w będziemy różniczkować
|
||||
w = torch.zeros(vector_length, dtype=torch.double, requires_grad=True)
|
||||
|
||||
# Hiperparametr uczenia
|
||||
learning_rate = torch.tensor(.0032, dtype=torch.double)
|
||||
|
||||
|
||||
def model(w, x):
    """Return ``tanh`` of the product of the input ``x`` and weights ``w``.

    The hyperbolic tangent squashes the linear output into (-1, 1).
    """
    linear_output = torch.matmul(x, w)
    return torch.tanh(linear_output)
|
||||
|
||||
|
||||
# Funkcja kosztu.
|
||||
def loss_fun(y_hat, y_exp):
    """Squared-error loss between a prediction and the expected value."""
    delta = y_hat - y_exp
    return delta * delta
|
||||
|
||||
|
||||
# Co ile kroków będziemy wypisywali informacje o średniej funkcji kosztu.
|
||||
# To nie jest hiperparametr uczenia, nie ma to żadnego, ani pozytywnego, ani
|
||||
# negatywnego wpływu na uczenie.
|
||||
step = 5000
|
||||
i = 1
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
for line in sys.stdin:
|
||||
content, y_exp = process_line(line)
|
||||
|
||||
x = vectorize_text(content)
|
||||
|
||||
# wartość z predykcji
|
||||
y_hat = model(w, x)
|
||||
|
||||
# wyliczamy funkcję kosztu
|
||||
loss = loss_fun(y_hat, y_exp)
|
||||
|
||||
loss.backward()
|
||||
|
||||
with torch.no_grad():
|
||||
w = w - learning_rate * w.grad
|
||||
closs += loss
|
||||
|
||||
# za jakiś czas pokazujemy uśrednioną funkcję kosztu
|
||||
if i % step == 0:
|
||||
print("Sample item: ", y_exp.item(), " => ", y_hat.item(),
|
||||
" | Avg loss: ", (closs / step).item())
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
i += 1
|
||||
|
||||
# ponownie włączamy śledzenie gradientu dla nowego tensora wag (gradient startuje od zera, same wagi pozostają bez zmian)
|
||||
w.requires_grad_(True)
|
||||
|
||||
|
||||
# serializujemy nasz model
|
||||
torch.save(w, "model.bin")
|
72
wyk/pytorch_regression/linear2.py
Executable file
@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# To samo co linear0.py tylko z automatycznym różniczkowaniem
|
||||
|
||||
import sys
|
||||
import torch
|
||||
from torch import optim
|
||||
|
||||
# Preprocessing i wektoryzację tekstów wydzielamy do osobnego modułu,
|
||||
# z którego będzie korzystał zarówno kod do uczenia, jak i predykcji.
|
||||
from analyzer import vectorizer, vector_length, process_line, vectorize_text
|
||||
|
||||
# Nasz model to zestaw wag w. Na koniec dojdziemy do modeli
|
||||
# Transformer o wielu milionach wag; w pewnym sensie algorytm
|
||||
# uczenia się nie zmieni.
|
||||
# W naszym zadaniu lepiej zacząć od zerowych wag zamiast losowych.
|
||||
# Tym razem zaznaczamy, że względem w będziemy różniczkować
|
||||
w = torch.zeros(vector_length, dtype=torch.double, requires_grad=True)
|
||||
|
||||
# Tym razem użyjemy optymalizatora
|
||||
optimizer = optim.SGD([w], lr=.0064)
|
||||
|
||||
|
||||
def model(w, x):
    """Linear model: return the product of the input ``x`` and weights ``w``.

    ``torch.matmul`` is the function behind PyTorch's ``@`` operator.
    """
    y_hat = torch.matmul(x, w)
    return y_hat
|
||||
|
||||
|
||||
# Funkcja kosztu.
|
||||
def loss_fun(y_hat, y_exp):
    """Squared-error loss between a prediction and the expected value."""
    error = y_hat - y_exp
    return error * error
|
||||
|
||||
|
||||
# Co ile kroków będziemy wypisywali informacje o średniej funkcji kosztu.
|
||||
# To nie jest hiperparametr uczenia, nie ma to żadnego, ani pozytywnego, ani
|
||||
# negatywnego wpływu na uczenie.
|
||||
step = 5000
|
||||
i = 1
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
for line in sys.stdin:
|
||||
optimizer.zero_grad()
|
||||
|
||||
content, y_exp = process_line(line)
|
||||
|
||||
x = vectorize_text(content)
|
||||
|
||||
# wartość z predykcji
|
||||
y_hat = model(w, x)
|
||||
|
||||
# wyliczamy funkcję kosztu
|
||||
loss = loss_fun(y_hat, y_exp)
|
||||
|
||||
loss.backward()
|
||||
|
||||
with torch.no_grad():
|
||||
closs += loss
|
||||
|
||||
# Optymalizator automagicznie zadba o aktualizację wag!
|
||||
optimizer.step()
|
||||
|
||||
# za jakiś czas pokazujemy uśrednioną funkcję kosztu
|
||||
if i % step == 0:
|
||||
print("Sample item: ", y_exp.item(), " => ", y_hat.item(),
|
||||
" | Avg loss: ", (closs / step).item())
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
i += 1
|
||||
|
||||
|
||||
# serializujemy nasz model
|
||||
torch.save(w, "model.bin")
|
20
wyk/pytorch_regression/linear3-infer.py
Executable file
@ -0,0 +1,20 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
import sys
|
||||
|
||||
from analyzer import midpoint, vectorize_text
|
||||
import torch
|
||||
|
||||
model = torch.load('model.bin')
|
||||
|
||||
|
||||
for line in sys.stdin:
|
||||
line = line.strip('\n')
|
||||
content = line
|
||||
|
||||
x = vectorize_text(content)
|
||||
|
||||
y_hat = model(x)
|
||||
|
||||
# na wyjściu musimy mieć z powrotem rok
|
||||
print(max(1814.0, min(2013.999, y_hat.item() * 100.0 + midpoint)))
|
64
wyk/pytorch_regression/linear3.py
Executable file
@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# To samo co linear0.py tylko z automatycznym różniczkowaniem
|
||||
|
||||
import sys
|
||||
import torch
|
||||
from torch import optim
|
||||
|
||||
# Preprocessing i wektoryzację tekstów wydzielamy do osobnego modułu,
|
||||
# z którego będzie korzystał zarówno kod do uczenia, jak i predykcji.
|
||||
from analyzer import vectorizer, vector_length, process_line, vectorize_text
|
||||
|
||||
from my_linear_regressor import MyLinearRegressor
|
||||
|
||||
regressor = MyLinearRegressor(vector_length)
|
||||
|
||||
# Tym razem użyjemy optymalizatora
|
||||
optimizer = optim.SGD(regressor.parameters(), lr=.0064)
|
||||
|
||||
|
||||
# Funkcja kosztu.
|
||||
def loss_fun(y_hat, y_exp):
    """Squared-error loss between a prediction and the expected value."""
    residual = y_hat - y_exp
    return residual * residual
|
||||
|
||||
|
||||
# Co ile kroków będziemy wypisywali informacje o średniej funkcji kosztu.
|
||||
# To nie jest hiperparametr uczenia, nie ma to żadnego, ani pozytywnego, ani
|
||||
# negatywnego wpływu na uczenie.
|
||||
step = 5000
|
||||
i = 1
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
for line in sys.stdin:
|
||||
optimizer.zero_grad()
|
||||
|
||||
content, y_exp = process_line(line)
|
||||
|
||||
x = vectorize_text(content)
|
||||
|
||||
# wartość z predykcji
|
||||
y_hat = regressor(x)
|
||||
|
||||
# wyliczamy funkcję kosztu
|
||||
loss = loss_fun(y_hat, y_exp)
|
||||
|
||||
loss.backward()
|
||||
|
||||
with torch.no_grad():
|
||||
closs += loss
|
||||
|
||||
# Optymalizator automagicznie zadba o aktualizację wag!
|
||||
optimizer.step()
|
||||
|
||||
# za jakiś czas pokazujemy uśrednioną funkcję kosztu
|
||||
if i % step == 0:
|
||||
print("Sample item: ", y_exp.item(), " => ", y_hat.item(),
|
||||
" | Avg loss: ", (closs / step).item())
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
i += 1
|
||||
|
||||
|
||||
# serializujemy nasz model
|
||||
torch.save(regressor, "model.bin")
|
81
wyk/pytorch_regression/linear4-batches.py
Executable file
@ -0,0 +1,81 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Wprowadzamy minibatche
|
||||
|
||||
import sys
|
||||
import torch
|
||||
from torch import optim
|
||||
import itertools
|
||||
|
||||
# Preprocessing i wektoryzację tekstów wydzielamy do osobnego modułu,
|
||||
# z którego będzie korzystał zarówno kod do uczenia, jak i predykcji.
|
||||
from analyzer import vectorizer, vector_length, process_line, vectorize_batch
|
||||
|
||||
from my_linear_regressor import MyLinearRegressor
|
||||
|
||||
regressor = MyLinearRegressor(vector_length)
|
||||
|
||||
# Rozmiar minibatcha
|
||||
batch_size = 16
|
||||
|
||||
# Pomocnicza funkcja do batchowania
|
||||
def grouper(n, iterable):
    """Yield consecutive tuples of up to ``n`` items taken from ``iterable``.

    The last tuple may be shorter than ``n``; nothing is yielded for an
    empty iterable.
    """
    source = iter(iterable)
    # iter(callable, sentinel) keeps calling the lambda until it returns the
    # sentinel -- here the empty tuple, which marks an exhausted source.
    yield from iter(lambda: tuple(itertools.islice(source, n)), ())
|
||||
|
||||
|
||||
# Tym razem użyjemy optymalizatora
|
||||
optimizer = optim.Adam(regressor.parameters())
|
||||
|
||||
|
||||
# Funkcja kosztu.
|
||||
def loss_fun(y_hat, y_exp):
    """Mean squared error over a minibatch.

    Args:
        y_hat: tensor of predictions.
        y_exp: tensor of expected values.

    Returns:
        A 0-dimensional tensor: the sum of squared errors divided by the
        number of expected values.
    """
    squared_errors = (y_hat - y_exp) ** 2
    # Divide by the actual number of targets instead of a fixed batch size:
    # the last batch read from the input may be shorter than the others, and
    # a fixed divisor would under-scale its loss (and gradient).
    return torch.sum(squared_errors) / y_exp.numel()
|
||||
|
||||
|
||||
# Co ile kroków będziemy wypisywali informacje o średniej funkcji kosztu.
|
||||
# To nie jest hiperparametr uczenia, nie ma to żadnego, ani pozytywnego, ani
|
||||
# negatywnego wpływu na uczenie.
|
||||
step = 500
|
||||
i = 1
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
for batch in grouper(batch_size, sys.stdin):
|
||||
t = [process_line(line) for line in batch]
|
||||
contents = [entry[0] for entry in t]
|
||||
# y_exp będzie teraz wektorem!
|
||||
y_exp = torch.tensor([entry[1] for entry in t], dtype=torch.double)
|
||||
|
||||
optimizer.zero_grad()
|
||||
|
||||
x = vectorize_batch(contents)
|
||||
|
||||
# wartość z predykcji (też wektor!)
|
||||
y_hat = regressor(x)
|
||||
|
||||
# wyliczamy funkcję kosztu
|
||||
loss = loss_fun(y_hat, y_exp)
|
||||
|
||||
loss.backward()
|
||||
|
||||
with torch.no_grad():
|
||||
closs += loss
|
||||
|
||||
# Optymalizator automagicznie zadba o aktualizację wag!
|
||||
optimizer.step()
|
||||
|
||||
# za jakiś czas pokazujemy uśrednioną funkcję kosztu
|
||||
if i % step == 0:
|
||||
print("Sample item: ", y_exp[0].item(), " => ", y_hat[0].item(),
|
||||
" | Avg loss: ", (closs / step).item())
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
i += 1
|
||||
|
||||
|
||||
# serializujemy nasz model
|
||||
torch.save(regressor, "model.bin")
|
64
wyk/pytorch_regression/linear4.py
Executable file
@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# To samo co linear0.py tylko z automatycznym różniczkowaniem
|
||||
|
||||
import sys
|
||||
import torch
|
||||
from torch import optim
|
||||
|
||||
# Preprocessing i wektoryzację tekstów wydzielamy do osobnego modułu,
|
||||
# z którego będzie korzystał zarówno kod do uczenia, jak i predykcji.
|
||||
from analyzer import vectorizer, vector_length, process_line, vectorize_text
|
||||
|
||||
from my_linear_regressor import MyLinearRegressor
|
||||
|
||||
regressor = MyLinearRegressor(vector_length)
|
||||
|
||||
# Tym razem użyjemy optymalizatora
|
||||
optimizer = optim.Adam(regressor.parameters())
|
||||
|
||||
|
||||
# Funkcja kosztu.
|
||||
def loss_fun(y_hat, y_exp):
    """Squared-error loss between a prediction and the expected value."""
    delta = y_hat - y_exp
    return delta * delta
|
||||
|
||||
|
||||
# Co ile kroków będziemy wypisywali informacje o średniej funkcji kosztu.
|
||||
# To nie jest hiperparametr uczenia, nie ma to żadnego, ani pozytywnego, ani
|
||||
# negatywnego wpływu na uczenie.
|
||||
step = 5000
|
||||
i = 1
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
for line in sys.stdin:
|
||||
optimizer.zero_grad()
|
||||
|
||||
content, y_exp = process_line(line)
|
||||
|
||||
x = vectorize_text(content)
|
||||
|
||||
# wartość z predykcji
|
||||
y_hat = regressor(x)
|
||||
|
||||
# wyliczamy funkcję kosztu
|
||||
loss = loss_fun(y_hat, y_exp)
|
||||
|
||||
loss.backward()
|
||||
|
||||
with torch.no_grad():
|
||||
closs += loss
|
||||
|
||||
# Optymalizator automagicznie zadba o aktualizację wag!
|
||||
optimizer.step()
|
||||
|
||||
# za jakiś czas pokazujemy uśrednioną funkcję kosztu
|
||||
if i % step == 0:
|
||||
print("Sample item: ", y_exp.item(), " => ", y_hat.item(),
|
||||
" | Avg loss: ", (closs / step).item())
|
||||
closs = torch.tensor(0.0, dtype=torch.double, requires_grad=False)
|
||||
|
||||
i += 1
|
||||
|
||||
|
||||
# serializujemy nasz model
|
||||
torch.save(regressor, "model.bin")
|
81
wyk/pytorch_regression/linear5.py
Executable file
@ -0,0 +1,81 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Wprowadzamy minibatche
|
||||
|
||||
import sys
|
||||
import torch
|
||||
from torch import optim
|
||||
import itertools
|
||||
|
||||
# Preprocessing i wektoryzację tekstów wydzielamy do osobnego modułu,
|
||||
# z którego będzie korzystał zarówno kod do uczenia, jak i predykcji.
|
||||
from analyzer import vectorizer, vector_length, process_line, vectorize_batch
|
||||
|
||||
from my_linear_regressor2 import MyLinearRegressor2
|
||||
|
||||
# Rozmiar minibatcha
|
||||
batch_size = 16
|
||||
|
||||
regressor = MyLinearRegressor2(vector_length)
|
||||
|
||||
# Pomocnicza funkcja do batchowania
|
||||
def grouper(n, iterable):
    """Yield consecutive tuples of up to ``n`` items taken from ``iterable``.

    The last tuple may be shorter than ``n``; nothing is yielded for an
    empty iterable.
    """
    items = iter(iterable)
    # An empty tuple is falsy, so the loop ends once the source is exhausted.
    while batch := tuple(itertools.islice(items, n)):
        yield batch
|
||||
|
||||
|
||||
# Tym razem użyjemy optymalizatora
|
||||
optimizer = optim.Adam(regressor.parameters())
|
||||
|
||||
|
||||
# Funkcja kosztu.
|
||||
def loss_fun(y_hat, y_exp):
    """Mean squared error over a minibatch.

    Args:
        y_hat: tensor of predictions.
        y_exp: tensor of expected values.

    Returns:
        A 0-dimensional tensor: the sum of squared errors divided by the
        number of expected values.
    """
    squared_errors = (y_hat - y_exp) ** 2
    # Divide by the actual number of targets instead of a fixed batch size:
    # the last batch read from the input may be shorter than the others, and
    # a fixed divisor would under-scale its loss (and gradient).
    return torch.sum(squared_errors) / y_exp.numel()
|
||||
|
||||
|
||||
# Co ile kroków będziemy wypisywali informacje o średniej funkcji kosztu.
|
||||
# To nie jest hiperparametr uczenia, nie ma to żadnego, ani pozytywnego, ani
|
||||
# negatywnego wpływu na uczenie.
|
||||
step = 500
|
||||
i = 1
|
||||
closs = torch.tensor(0.0, dtype=torch.float, requires_grad=False)
|
||||
|
||||
for batch in grouper(batch_size, sys.stdin):
|
||||
t = [process_line(line) for line in batch]
|
||||
contents = [entry[0] for entry in t]
|
||||
# y_exp będzie teraz wektorem!
|
||||
y_exp = torch.tensor([entry[1] for entry in t], dtype=torch.float)
|
||||
|
||||
optimizer.zero_grad()
|
||||
|
||||
x = vectorize_batch(contents).float()
|
||||
|
||||
# wartość z predykcji (też wektor!)
|
||||
y_hat = regressor(x)
|
||||
|
||||
# wyliczamy funkcję kosztu
|
||||
loss = loss_fun(y_hat, y_exp)
|
||||
|
||||
loss.backward()
|
||||
|
||||
with torch.no_grad():
|
||||
closs += loss
|
||||
|
||||
# Optymalizator automagicznie zadba o aktualizację wag!
|
||||
optimizer.step()
|
||||
|
||||
# za jakiś czas pokazujemy uśrednioną funkcję kosztu
|
||||
if i % step == 0:
|
||||
print("Sample item: ", y_exp[0].item(), " => ", y_hat[0].item(),
|
||||
" | Avg loss: ", (closs / step).item())
|
||||
closs = torch.tensor(0.0, dtype=torch.float, requires_grad=False)
|
||||
|
||||
i += 1
|
||||
|
||||
|
||||
# serializujemy nasz model
|
||||
torch.save(regressor, "model.bin")
|
81
wyk/pytorch_regression/linear6.py
Executable file
@ -0,0 +1,81 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Wprowadzamy minibatche
|
||||
|
||||
import sys
|
||||
import torch
|
||||
from torch import optim
|
||||
import itertools
|
||||
|
||||
# Preprocessing and text vectorization are split out into a separate module,
# which is used by both the training code and the prediction code.
|
||||
from analyzer import vectorizer, vector_length, process_line, vectorize_batch
|
||||
|
||||
from my_neural_network import MyNeuralNetwork
|
||||
|
||||
# Minibatch size
batch_size = 16

# The model being trained; vector_length (imported from the analyzer module)
# is passed as the size of its input vectors.
regressor = MyNeuralNetwork(vector_length)
|
||||
|
||||
# Helper that splits a stream into batches
def grouper(n, iterable):
    """Yield consecutive tuples of at most ``n`` items taken from ``iterable``.

    The final tuple may hold fewer than ``n`` items. Nothing is yielded once
    the input is exhausted, so an empty input produces no batches at all.
    """
    source = iter(iterable)
    # The two-argument form of iter() keeps calling the lambda until it
    # returns the sentinel -- here the empty tuple, i.e. "nothing left".
    yield from iter(lambda: tuple(itertools.islice(source, n)), ())
|
||||
|
||||
|
||||
# This time we use an optimizer: Adam (constructed with its default
# settings) over all parameters of the regressor.
optimizer = optim.Adam(regressor.parameters())
|
||||
|
||||
|
||||
# Cost function.
def loss_fun(y_hat, y_exp):
    """Return the mean squared error between predictions and expected values.

    The sum of squared errors is divided by the number of items actually
    present rather than by the nominal minibatch size, because the last batch
    produced by ``grouper`` may be shorter; dividing it by the nominal size
    would under-scale its loss (and its gradient).

    Args:
        y_hat: tensor with the predicted values.
        y_exp: tensor with the expected values (broadcastable with ``y_hat``).

    Returns:
        A 0-dim tensor holding the mean squared error.
    """
    squared_error = (y_hat - y_exp) ** 2
    # max(..., 1) keeps an (unexpected) empty batch at a loss of 0, not NaN.
    return torch.sum(squared_error) / max(squared_error.numel(), 1)
|
||||
|
||||
|
||||
# Number of steps between two reports of the average cost.
# This is not a training hyperparameter: it has no effect, positive or
# negative, on the training itself.
step = 500
closs = torch.zeros((), dtype=torch.float)

for i, batch in enumerate(grouper(batch_size, sys.stdin), start=1):
    parsed = [process_line(line) for line in batch]
    contents = [entry[0] for entry in parsed]
    # y_exp is a vector now: one expected value per batch item
    y_exp = torch.tensor([entry[1] for entry in parsed], dtype=torch.float)

    optimizer.zero_grad()

    # the predictions form a vector as well
    y_hat = regressor(vectorize_batch(contents).float())

    # compute the cost and back-propagate it
    loss = loss_fun(y_hat, y_exp)
    loss.backward()

    # detached, so the running total never joins the autograd graph
    closs += loss.detach()

    # the optimizer takes care of updating the weights
    optimizer.step()

    # every `step` batches report the averaged cost and open a new window
    if i % step == 0:
        sample_expected = y_exp[0].item()
        sample_predicted = y_hat[0].item()
        print("Sample item: ", sample_expected, " => ", sample_predicted,
              " | Avg loss: ", (closs / step).item())
        closs = torch.zeros((), dtype=torch.float)

# serialize the trained model
torch.save(regressor, "model.bin")
|
81
wyk/pytorch_regression/logistic6.py
Executable file
@ -0,0 +1,81 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
# Wprowadzamy minibatche
|
||||
|
||||
import sys
|
||||
import torch
|
||||
from torch import optim
|
||||
import itertools
|
||||
|
||||
# Preprocessing and text vectorization are split out into a separate module,
# which is used by both the training code and the prediction code.
|
||||
from analyzer_classification import vectorizer, vector_length, process_line, vectorize_batch
|
||||
|
||||
from my_neural_network import MyNeuralNetwork
|
||||
|
||||
# Minibatch size
batch_size = 16

# The model being trained; vector_length (imported from the analyzer module)
# is passed as the size of its input vectors.
regressor = MyNeuralNetwork(vector_length)
|
||||
|
||||
# Helper that splits a stream into batches
def grouper(n, iterable):
    """Yield consecutive tuples of at most ``n`` items taken from ``iterable``.

    The final tuple may hold fewer than ``n`` items. Nothing is yielded once
    the input is exhausted, so an empty input produces no batches at all.
    """
    source = iter(iterable)
    # The two-argument form of iter() keeps calling the lambda until it
    # returns the sentinel -- here the empty tuple, i.e. "nothing left".
    yield from iter(lambda: tuple(itertools.islice(source, n)), ())
|
||||
|
||||
|
||||
# This time we use an optimizer: Adam (constructed with its default
# settings) over all parameters of the regressor.
optimizer = optim.Adam(regressor.parameters())
|
||||
|
||||
|
||||
# Cost function.
def loss_fun(y_hat, y_exp):
    """Return the mean squared error between predictions and expected values.

    The sum of squared errors is divided by the number of items actually
    present rather than by the nominal minibatch size, because the last batch
    produced by ``grouper`` may be shorter; dividing it by the nominal size
    would under-scale its loss (and its gradient).

    Args:
        y_hat: tensor with the predicted values.
        y_exp: tensor with the expected values (broadcastable with ``y_hat``).

    Returns:
        A 0-dim tensor holding the mean squared error.
    """
    squared_error = (y_hat - y_exp) ** 2
    # max(..., 1) keeps an (unexpected) empty batch at a loss of 0, not NaN.
    return torch.sum(squared_error) / max(squared_error.numel(), 1)
|
||||
|
||||
|
||||
# Number of steps between two reports of the average cost.
# This is not a training hyperparameter: it has no effect, positive or
# negative, on the training itself.
step = 500
closs = torch.zeros((), dtype=torch.float)

for i, batch in enumerate(grouper(batch_size, sys.stdin), start=1):
    parsed = [process_line(line) for line in batch]
    contents = [entry[0] for entry in parsed]
    # y_exp is a vector now: one expected value per batch item
    y_exp = torch.tensor([entry[1] for entry in parsed], dtype=torch.float)

    optimizer.zero_grad()

    # the predictions form a vector as well
    y_hat = regressor(vectorize_batch(contents).float())

    # compute the cost and back-propagate it
    loss = loss_fun(y_hat, y_exp)
    loss.backward()

    # detached, so the running total never joins the autograd graph
    closs += loss.detach()

    # the optimizer takes care of updating the weights
    optimizer.step()

    # every `step` batches report the averaged cost and open a new window
    if i % step == 0:
        sample_expected = y_exp[0].item()
        sample_predicted = y_hat[0].item()
        print("Sample item: ", sample_expected, " => ", sample_predicted,
              " | Avg loss: ", (closs / step).item())
        closs = torch.zeros((), dtype=torch.float)

# serialize the trained model
torch.save(regressor, "model.bin")
|
12
wyk/pytorch_regression/my_linear_regressor.py
Normal file
@ -0,0 +1,12 @@
|
||||
import torch.nn as nn
|
||||
import torch
|
||||
|
||||
|
||||
class MyLinearRegressor(nn.Module):
    """Bias-free linear regressor whose weight vector is a single parameter ``w``."""

    def __init__(self, vlen):
        """Create the model with ``vlen`` double-precision weights, all zero."""
        super().__init__()
        initial_weights = torch.zeros(vlen, dtype=torch.double)
        # Assigning an nn.Parameter to a module attribute registers it under
        # that attribute's name, so it is reported by parameters().
        self.w = nn.Parameter(initial_weights)

    def forward(self, x):
        """Return the product of ``x`` and the weight vector."""
        return torch.matmul(x, self.w)
|
10
wyk/pytorch_regression/my_linear_regressor2.py
Normal file
@ -0,0 +1,10 @@
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class MyLinearRegressor2(nn.Module):
    """Bias-free linear regressor built on a single ``nn.Linear`` layer."""

    def __init__(self, vlen):
        """Create the layer mapping ``vlen`` input features to one output."""
        super(MyLinearRegressor2, self).__init__()
        self.w = nn.Linear(vlen, 1, bias=False)

    def forward(self, x):
        """Return one prediction per input row.

        For a ``(batch, vlen)`` input the result has shape ``(batch,)``.
        """
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also remove a batch dimension of size 1, turning a one-item
        # batch into a 0-dim tensor that can no longer be indexed with [0].
        return self.w(x).squeeze(-1)
|
15
wyk/pytorch_regression/my_neural_network.py
Normal file
@ -0,0 +1,15 @@
|
||||
import torch.nn as nn
|
||||
import torch
|
||||
|
||||
|
||||
class MyNeuralNetwork(nn.Module):
    """Network with two tanh units whose outputs are mixed by learned scalars.

    Computes ``u1 * tanh(w1(x)) + u2 * tanh(w2(x))``, where ``w1`` and ``w2``
    are linear layers with a single output and ``u1``, ``u2`` are
    one-element parameters.
    """

    def __init__(self, vlen):
        """Create both linear layers for ``vlen`` input features and the mixing weights."""
        super().__init__()
        self.w1 = nn.Linear(vlen, 1)
        self.w2 = nn.Linear(vlen, 1)

        # nn.Parameter tracks gradients by default; values start uniform in [0, 1).
        self.u1 = nn.Parameter(torch.rand(1, dtype=torch.float))
        self.u2 = nn.Parameter(torch.rand(1, dtype=torch.float))

    def forward(self, x):
        """Return one prediction per input row (shape ``(batch,)`` for 2-D input)."""
        # torch.tanh replaces the deprecated torch.nn.functional.tanh spelling;
        # squeeze(-1) removes only the size-1 output dimension of each layer.
        hidden1 = torch.tanh(self.w1(x).squeeze(-1))
        hidden2 = torch.tanh(self.w2(x).squeeze(-1))
        return self.u1 * hidden1 + self.u2 * hidden2
|