{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Modelowanie języka – laboratoria\n",
"### 13 marca 2024\n",
"# 2. Język"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import plotly.express as px\n",
"import numpy as np\n",
"import pandas as pd\n",
"import nltk"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://github.com/sdadas/polish-nlp-resources"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"program : program\n",
"programs : program\n",
"programmer : programm\n",
"programming : program\n",
"programmers : programm\n"
]
}
],
"source": [
"ps = nltk.stem.PorterStemmer()\n",
"\n",
"for w in [\"program\", \"programs\", \"programmer\", \"programming\", \"programmers\"]:\n",
" print(w, \" : \", ps.stem(w))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/pawel/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package stopwords to /home/pawel/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nltk.download('punkt')\n",
"nltk.download('stopwords')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Python',\n",
" 'is',\n",
" 'dynamically-typed',\n",
" 'and',\n",
" 'garbage-collected',\n",
" '.',\n",
" 'It',\n",
" 'supports',\n",
" 'multiple',\n",
" 'programming',\n",
" 'paradigms',\n",
" ',',\n",
" 'including',\n",
" 'structured',\n",
" '(',\n",
" 'particularly',\n",
" ',',\n",
" 'procedural',\n",
" ')',\n",
" ',',\n",
" 'object-oriented',\n",
" 'and',\n",
" 'functional',\n",
" 'programming',\n",
" '.',\n",
" 'It',\n",
" 'is',\n",
" 'often',\n",
" 'described',\n",
" 'as',\n",
" 'a',\n",
" '``',\n",
" 'batteries',\n",
" 'included',\n",
" \"''\",\n",
" 'language',\n",
" 'due',\n",
" 'to',\n",
" 'its',\n",
" 'comprehensive',\n",
" 'standard',\n",
" 'library',\n",
" '.']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text = \"\"\"Python is dynamically-typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly, procedural), object-oriented and functional programming. It is often described as a \"batteries included\" language due to its comprehensive standard library.\"\"\"\n",
"nltk.tokenize.word_tokenize(text)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Python is dynamically-typed and garbage-collected.',\n",
" 'It supports multiple programming paradigms, including structured (particularly, procedural), object-oriented and functional programming.',\n",
" 'It is often described as a \"batteries included\" language due to its comprehensive standard library.']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nltk.tokenize.sent_tokenize(text)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['aber',\n",
" 'alle',\n",
" 'allem',\n",
" 'allen',\n",
" 'aller',\n",
" 'alles',\n",
" 'als',\n",
" 'also',\n",
" 'am',\n",
" 'an',\n",
" 'ander',\n",
" 'andere',\n",
" 'anderem',\n",
" 'anderen',\n",
" 'anderer',\n",
" 'anderes',\n",
" 'anderm',\n",
" 'andern',\n",
" 'anderr',\n",
" 'anders',\n",
" 'auch',\n",
" 'auf',\n",
" 'aus',\n",
" 'bei',\n",
" 'bin',\n",
" 'bis',\n",
" 'bist',\n",
" 'da',\n",
" 'damit',\n",
" 'dann',\n",
" 'der',\n",
" 'den',\n",
" 'des',\n",
" 'dem',\n",
" 'die',\n",
" 'das',\n",
" 'dass',\n",
" 'daß',\n",
" 'derselbe',\n",
" 'derselben',\n",
" 'denselben',\n",
" 'desselben',\n",
" 'demselben',\n",
" 'dieselbe',\n",
" 'dieselben',\n",
" 'dasselbe',\n",
" 'dazu',\n",
" 'dein',\n",
" 'deine',\n",
" 'deinem',\n",
" 'deinen',\n",
" 'deiner',\n",
" 'deines',\n",
" 'denn',\n",
" 'derer',\n",
" 'dessen',\n",
" 'dich',\n",
" 'dir',\n",
" 'du',\n",
" 'dies',\n",
" 'diese',\n",
" 'diesem',\n",
" 'diesen',\n",
" 'dieser',\n",
" 'dieses',\n",
" 'doch',\n",
" 'dort',\n",
" 'durch',\n",
" 'ein',\n",
" 'eine',\n",
" 'einem',\n",
" 'einen',\n",
" 'einer',\n",
" 'eines',\n",
" 'einig',\n",
" 'einige',\n",
" 'einigem',\n",
" 'einigen',\n",
" 'einiger',\n",
" 'einiges',\n",
" 'einmal',\n",
" 'er',\n",
" 'ihn',\n",
" 'ihm',\n",
" 'es',\n",
" 'etwas',\n",
" 'euer',\n",
" 'eure',\n",
" 'eurem',\n",
" 'euren',\n",
" 'eurer',\n",
" 'eures',\n",
" 'für',\n",
" 'gegen',\n",
" 'gewesen',\n",
" 'hab',\n",
" 'habe',\n",
" 'haben',\n",
" 'hat',\n",
" 'hatte',\n",
" 'hatten',\n",
" 'hier',\n",
" 'hin',\n",
" 'hinter',\n",
" 'ich',\n",
" 'mich',\n",
" 'mir',\n",
" 'ihr',\n",
" 'ihre',\n",
" 'ihrem',\n",
" 'ihren',\n",
" 'ihrer',\n",
" 'ihres',\n",
" 'euch',\n",
" 'im',\n",
" 'in',\n",
" 'indem',\n",
" 'ins',\n",
" 'ist',\n",
" 'jede',\n",
" 'jedem',\n",
" 'jeden',\n",
" 'jeder',\n",
" 'jedes',\n",
" 'jene',\n",
" 'jenem',\n",
" 'jenen',\n",
" 'jener',\n",
" 'jenes',\n",
" 'jetzt',\n",
" 'kann',\n",
" 'kein',\n",
" 'keine',\n",
" 'keinem',\n",
" 'keinen',\n",
" 'keiner',\n",
" 'keines',\n",
" 'können',\n",
" 'könnte',\n",
" 'machen',\n",
" 'man',\n",
" 'manche',\n",
" 'manchem',\n",
" 'manchen',\n",
" 'mancher',\n",
" 'manches',\n",
" 'mein',\n",
" 'meine',\n",
" 'meinem',\n",
" 'meinen',\n",
" 'meiner',\n",
" 'meines',\n",
" 'mit',\n",
" 'muss',\n",
" 'musste',\n",
" 'nach',\n",
" 'nicht',\n",
" 'nichts',\n",
" 'noch',\n",
" 'nun',\n",
" 'nur',\n",
" 'ob',\n",
" 'oder',\n",
" 'ohne',\n",
" 'sehr',\n",
" 'sein',\n",
" 'seine',\n",
" 'seinem',\n",
" 'seinen',\n",
" 'seiner',\n",
" 'seines',\n",
" 'selbst',\n",
" 'sich',\n",
" 'sie',\n",
" 'ihnen',\n",
" 'sind',\n",
" 'so',\n",
" 'solche',\n",
" 'solchem',\n",
" 'solchen',\n",
" 'solcher',\n",
" 'solches',\n",
" 'soll',\n",
" 'sollte',\n",
" 'sondern',\n",
" 'sonst',\n",
" 'über',\n",
" 'um',\n",
" 'und',\n",
" 'uns',\n",
" 'unsere',\n",
" 'unserem',\n",
" 'unseren',\n",
" 'unser',\n",
" 'unseres',\n",
" 'unter',\n",
" 'viel',\n",
" 'vom',\n",
" 'von',\n",
" 'vor',\n",
" 'während',\n",
" 'war',\n",
" 'waren',\n",
" 'warst',\n",
" 'was',\n",
" 'weg',\n",
" 'weil',\n",
" 'weiter',\n",
" 'welche',\n",
" 'welchem',\n",
" 'welchen',\n",
" 'welcher',\n",
" 'welches',\n",
" 'wenn',\n",
" 'werde',\n",
" 'werden',\n",
" 'wie',\n",
" 'wieder',\n",
" 'will',\n",
" 'wir',\n",
" 'wird',\n",
" 'wirst',\n",
" 'wo',\n",
" 'wollen',\n",
" 'wollte',\n",
" 'würde',\n",
" 'würden',\n",
" 'zu',\n",
" 'zum',\n",
" 'zur',\n",
" 'zwar',\n",
" 'zwischen']"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nltk.corpus.stopwords.words('german')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('Python', 'is'), ('is', 'dynamically-typed'), ('dynamically-typed', 'and'), ('and', 'garbage-collected'), ('garbage-collected', '.'), ('.', 'It'), ('It', 'supports'), ('supports', 'multiple'), ('multiple', 'programming'), ('programming', 'paradigms'), ('paradigms', ','), (',', 'including'), ('including', 'structured'), ('structured', '('), ('(', 'particularly'), ('particularly', ','), (',', 'procedural'), ('procedural', ')'), (')', ','), (',', 'object-oriented'), ('object-oriented', 'and'), ('and', 'functional'), ('functional', 'programming'), ('programming', '.'), ('.', 'It'), ('It', 'is'), ('is', 'often'), ('often', 'described'), ('described', 'as'), ('as', 'a'), ('a', '``'), ('``', 'batteries'), ('batteries', 'included'), ('included', \"''\"), (\"''\", 'language'), ('language', 'due'), ('due', 'to'), ('to', 'its'), ('its', 'comprehensive'), ('comprehensive', 'standard'), ('standard', 'library'), ('library', '.')]\n"
]
}
],
"source": [
"nltk_tokens = nltk.word_tokenize(text)\n",
"print(list(nltk.bigrams(nltk_tokens)))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"alignmentgroup": "True",
"hovertemplate": "słowo=%{x}
liczba=%{y}