427 lines
183 KiB
Plaintext
427 lines
183 KiB
Plaintext
|
{
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {
|
||
|
"collapsed": true,
|
||
|
"pycharm": {
|
||
|
"is_executing": true
|
||
|
}
|
||
|
},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"!unxz gutenberg_poems_clean.txt.xz"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 1,
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"text = open('gutenberg_poems_clean.txt', encoding = 'utf-8').read()"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 57,
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stderr",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"[nltk_data] Downloading package wordnet to /home/ked/nltk_data...\n",
|
||
|
"[nltk_data] Package wordnet is already up-to-date!\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"import nltk\n",
|
||
|
"import matplotlib.pyplot as plt\n",
|
||
|
"from collections import OrderedDict, defaultdict, Counter\n",
|
||
|
"import regex as re\n",
|
||
|
"from math import log\n",
|
||
|
"from nltk.stem import WordNetLemmatizer\n",
|
||
|
"from nltk.stem.snowball import SnowballStemmer\n",
|
||
|
"import numpy as np\n",
|
||
|
"import matplotlib.ticker as ticker\n",
|
||
|
"nltk.download('wordnet')\n",
|
||
|
"\n",
|
||
|
"def get_characters(t):\n",
|
||
|
" yield from t\n",
|
||
|
"\n",
|
||
|
"def get_words(t):\n",
|
||
|
" for m in re.finditer(r'[\\p{L}0-9\\*]+', t):\n",
|
||
|
" yield m.group(0)\n",
|
||
|
"\n",
|
||
|
"def freq_list(g, top=None):\n",
|
||
|
" c = Counter(g)\n",
|
||
|
" if top is None:\n",
|
||
|
" items = c.items()\n",
|
||
|
" else:\n",
|
||
|
" items = c.most_common(top)\n",
|
||
|
" return OrderedDict(sorted(items, key=lambda t: -t[1]))\n",
|
||
|
"\n",
|
||
|
"def log_rang_log_freq(name, g):\n",
|
||
|
" freq = freq_list(g)\n",
|
||
|
" plt.figure().clear()\n",
|
||
|
" #plt.figure(figsize=(12, 4))\n",
|
||
|
" plt.plot([log(x) for x in range(1, len(freq.values())+1)], [log(y) for y in freq.values()])\n",
|
||
|
" fname = f'{name}.png'\n",
|
||
|
" plt.savefig(fname)\n",
|
||
|
" return fname\n",
|
||
|
"\n",
|
||
|
"def get_top_word_length(g, top=10):\n",
|
||
|
" d = defaultdict(set)\n",
|
||
|
" for item in g:\n",
|
||
|
" d[len(item)].add(item)\n",
|
||
|
" return {k : sorted(list(v)) for k, v in sorted(d.items(), reverse=True)[:top]}\n",
|
||
|
"\n",
|
||
|
"def ngrams(g, size):\n",
|
||
|
" ngram = []\n",
|
||
|
" for item in g:\n",
|
||
|
" ngram.append(item)\n",
|
||
|
" if len(ngram) == size:\n",
|
||
|
" yield tuple(ngram)\n",
|
||
|
" ngram = ngram[1:]\n",
|
||
|
"\n",
|
||
|
"def lemmas(g):\n",
|
||
|
" lemmatizer = WordNetLemmatizer()\n",
|
||
|
" for item in g:\n",
|
||
|
" yield lemmatizer.lemmatize(item)\n",
|
||
|
"\n",
|
||
|
"def stems(g):\n",
|
||
|
" stemmer = SnowballStemmer('english')\n",
|
||
|
" for item in g:\n",
|
||
|
" yield stemmer.stem(item)\n",
|
||
|
"\n",
|
||
|
"def get_freq_list(g):\n",
|
||
|
" c = Counter(g)\n",
|
||
|
" freq_list = list(enumerate(sorted([(word, freq) for word, freq in c.items()], reverse=True, key=lambda t: t[1]), start=1))\n",
|
||
|
" return freq_list\n",
|
||
|
"\n",
|
||
|
"def find_long_popular_words(g, top=None):\n",
|
||
|
" freq_list = get_freq_list(g)\n",
|
||
|
" len_vs_freq = sorted([(freq * len(word), word) for rank, (word, freq) in freq_list], reverse=True, key=lambda t: t[0])\n",
|
||
|
" if top is None:\n",
|
||
|
" return len_vs_freq\n",
|
||
|
" else:\n",
|
||
|
" return len_vs_freq[:top]\n",
|
||
|
"\n",
|
||
|
"def plot_brevity_law(g):\n",
|
||
|
" freq_list = get_freq_list(g)\n",
|
||
|
" plt.figure().clear()\n",
|
||
|
" #plt.figure(figsize=(12, 4))\n",
|
||
|
" x = [tup[0] for tup in freq_list]\n",
|
||
|
" y = [len(tup[1][0]) for tup in freq_list]\n",
|
||
|
" plt.plot(x[::5000], y[::5000])\n",
|
||
|
" plt.xlabel('Word rank (every 5000th)')\n",
|
||
|
" plt.ylabel('Word length (every 5000th)')\n",
|
||
|
" z = np.polyfit(x, y, 1)\n",
|
||
|
" p = np.poly1d(z)\n",
|
||
|
" plt.plot(x, p(x))\n",
|
||
|
" fname = 'brevity_law_plot.png'\n",
|
||
|
" plt.savefig(fname)\n",
|
||
|
"\n",
|
||
|
"def plot_heaps_law(g):\n",
|
||
|
" uniq_words = set()\n",
|
||
|
" text_length = 0\n",
|
||
|
" x = []\n",
|
||
|
" y = []\n",
|
||
|
" for word in g:\n",
|
||
|
" text_length += 1\n",
|
||
|
" uniq_words.add(word)\n",
|
||
|
" x.append(text_length)\n",
|
||
|
" y.append(len(uniq_words))\n",
|
||
|
" plt.figure().clear()\n",
|
||
|
" #plt.figure(figsize=(12, 4))\n",
|
||
|
" plt.xlabel('Text length (in words)')\n",
|
||
|
" plt.ylabel('Unique words')\n",
|
||
|
" plt.ticklabel_format(scilimits=(-5, 8))\n",
|
||
|
" plt.plot(x, y)\n",
|
||
|
" plt.savefig('heaps_law_plot.png')"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 27,
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"43: ninepenceashillingneteighteenpencetwoandsix\n",
|
||
|
"36: crownnettwoandeightpencethreeandnine\n",
|
||
|
"35: Lebensfeuerversicherunggesellschaft\n",
|
||
|
"33: Gottsdonnerkreuzschockschwerenoth\n",
|
||
|
"28: Wintztschitstopschinzoudhoff\n",
|
||
|
"25: Selbstanschauungsvermogen\n",
|
||
|
"24: Aldiborontiphoscophornio, WappenshieldWaffenschild, WunderscheenWunderschoen\n",
|
||
|
"22: honorificabilitudinity\n",
|
||
|
"21: Voelkerwanderungszeit\n",
|
||
|
"20: Alliterationspoesien, Fornminnesfoerenings, Retrogradeacompletes, andkindlystepthisway, richardwashburnchild, xinechoncentlalizqui\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Words grouped by length, for the 10 greatest word lengths\n",
|
||
|
"toplist = get_top_word_length(get_words(text))\n",
|
||
|
"for k,v in toplist.items():\n",
|
||
|
" print(f\"{k}: {', '.join(v)}\")"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 66,
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": "'zipf_plot_words.png'"
|
||
|
},
|
||
|
"execution_count": 66,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": "<Figure size 640x480 with 1 Axes>",
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAh8AAAGdCAYAAACyzRGfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA8TUlEQVR4nO3dd3RUZeLG8WdmUiHJQAJpECAU6US6EKwgqIiCBQsii641CIjrD1CxK5a1IoKwKjYsq4KKigLSayDSpAUJEEoS6kwSSJ35/REZNxpKYDJ3MvP9nDPn7L1zk/uc2SXz7L3vfV+T0+l0CgAAwEPMRgcAAAD+hfIBAAA8ivIBAAA8ivIBAAA8ivIBAAA8ivIBAAA8ivIBAAA8ivIBAAA8KsDoAH/lcDi0b98+hYeHy2QyGR0HAACcAafTqdzcXMXHx8tsPvW1Da8rH/v27VNCQoLRMQAAwFnIzMxU/fr1T3mM15WP8PBwSWXhIyIiDE4DAADOhN1uV0JCgut7/FS8rnycuNUSERFB+QAAoJo5kyETDDgFAAAeRfkAAAAeRfkAAAAeRfkAAAAeRfkAAAAeRfkAAAAeRfkAAAAeVenysWjRIvXr10/x8fEymUyaOXPmSY+99957ZTKZ9Prrr59DRAAA4EsqXT7y8/OVlJSkiRMnnvK4GTNmaMWKFYqPjz/rcAAAwPdUeobTK6+8UldeeeUpj9m7d68eeOAB/fTTT+rbt+9ZhwMAAL7H7WM+HA6HBg8erIcfflitW7d2968HAADVnNvXdnnxxRcVEBCg4cOHn9HxhYWFKiwsdG3b7XZ3RwIAAF7ErVc+1qxZozfeeEPTpk07o4VlJGn8+PGyWq2uV0JCgjsjuTidTo387Fd9uWaPnE5nlZwDAACcnlvLx+LFi5WTk6MGDRooICBAAQEB2rVrlx566CE1atSowp8ZO3asbDab65WZmenOSC4//ZalmWv36V//Xadh03+V7VhxlZwHAACcmltvuwwePFi9evUqt69Pnz4aPHiwhg4dWuHPBAcHKzg42J0xKnR5q1g93Ke5XpuzTd9v2K+03Uf06sDz1a1JVJWfGwAA/KnS5SMvL0/bt293bWdkZGjt2rWKjIxUgwYNFBVV/ss8MDBQsbGxat68+bmnPQcWs0kplzZVj6Z1NPLztco4mK9b/7NC91zURKMuP09BAcy3BgCAJ1T6G3f16tVq37692rdvL0kaNWqU2rdvr8cff9zt4apCUkItzXqgh27unCCnU5q88HddN2mpfj+QZ3Q0AAD8gsnpZaMv7Xa7rFarbDabIiIiqvRcszfu15ivN+josWKFBlo07upWuqVLwhkPlgUAAGUq8/3t1/carmgTp9kjLlJy0ygdLy7VIzM26O6P1uhwfpHR0QAA8Fl+XT4kKdYaoo/u6KpHr2qpQItJczZlq8/ri7Ro2wGjowEA4JP8vnxIktls0l0XNdbMlGQ1jQ7TgdxC3f7eKj0za5MKikuNjgcAgE+hfPyP1vFWfTeshwZf0FCS9O6SDPWfuFTbsnMNTgYAgO+gfPxFaJBFz/Rvo3eHdFJUzSBtycrV1ROWaNrSDGZGBQDADSgfJ9GzZYxmj7xIlzSvq6ISh578bpOGTkvVgdzC0/8wAAA4KcrHKdQND9b7/+isp65praAAsxZsPaArXl+kX7ZkGx0NAIBqi/JxGiaTSUO6N9J3w3qoRWy4DuUX6Y5pqzVu5kYdL2IwKgAAlUX5OEPNY8M1MyVZd/ZIlCR9tGKX+r21RL/tsxmcDACA6oXyUQkhf8yC+uEdXVQ3PFjbc/LUf+JSTV20Qw4Hg1EBADgTlI+zcNF5dfXTyIt0easYFZc69dwPmzX4vZXKshUYHQ0AAK9H+ThLkTWDNGVwRz0/oK1CAy1auv2QrnhjkWZv3G90NAAAvJpfLyznLr8fyNPIz9Zqw96y8R9J9a1qXc+qVnERahUfoRax4aoRFGBwSgAAqk5lvr
8pH25SVOLQa3O3afLC3/XXT9RkkhKjaqplfISrkLSOi1Dd8GBW0AUA+ATKh4EyDx/T2syj2rTfrk377Nq0337SicnqhAWpZdyfhSSpfi01qlPTw4kBADh3lA8vcyC3UJv328sVkh0H8lTRAzJ928VpzBUtlBBZw/NBAQA4S5SPauB4Uam2Zee6Cslv+2z6NfOonE4pKMCsf/ZI1P2XNlVYMGNFAADej/JRTW3aZ9ez32/Sst8PSZLqhAXr4T7n6YaOCbKYGRsCAPBelI9qzOl0au7mHD3/w2ZlHMyXJLWKi9BjV7dU9yZ1DE4HAEDFKB8+oKjEoQ+X79Sb89JlLyiRJPVuFaNHrmrJoFQAgNehfPiQw/lFemPuNn28crdKHU4FWkwa0q2RHrismaw1Ao2OBwCAJMqHT0rPztVzP2zWgq0HJElhwQEa0r2h7uzRWJE1gwxOBwDwd5QPH7Zw2wG98OMWbd5vlyTVCLJo8AUN9c8LG6tueLDB6QAA/ory4eMcDqfmbs7Wm7+ka+PeshISEmjWoK4Ndc9FjRUdEWJwQgCAv6F8+Amn06n5W3P0xrztWpd5VFLZHCG3dE5Q79axahodpmimcAcAeADlw884nU4tTj+oN+ala82uI+XeCw8JULPoMDWNDlOz6HC1iAvX+Qm1FB7CYFUAgPtQPvyU0+nU8t8P6eOVu7Rlf652HsqvcAp3s0lqGRehzo0i/3jV5lYNAOCcUD4gSSooLtXOQ/lKz85Tek6etufkasNemzIPH//bsXHWELWOj1CreKtaxUUoKcGqOGuoAakBANUR5QOntN92XKt3HtHqnYe1aucRbcmy66//K7CYTXr5hna6rkN9Y0ICAKoVygcqJbegWJv35+q3fTb9ts+udZlHlZ6Tp6iaQVr4f5eyuB0A4LQq8/3NtwoUHhKoLomR6pIYKUkqLnWo92uLlHEwX+8uztCIXs0MTggA8CVmowPA+wRazHqo93mSpCmLftehvEKDEwEAfAnlAxW6qk2c2tSLUH5RqSbO/93oOAAAH0L5QIXMZpNGX9FCkvTxil3ac+SYwYkAAL6C8oGT6tG0jro3iVJRqUOvzUk3Og4AwEdQPnBSJtOfVz++/nWP1mYelZc9HAUAqIZ42gWnlJRQS1e2idWPG7PUf+JSBQWYFW8NUUJkDQ1oX0/9kuIVaKHDAgDOHPN84LR2HcrX0PdTlXEo/2+TkdWvHap7L26i/u3rMR8IAPixKp1kbNGiRXr55Ze1Zs0a7d+/XzNmzFD//v0lScXFxXrsscf0ww8/aMeOHbJarerVq5deeOEFxcfHuz08PKuoxKFse4H2HT2u1buO6L0lGTqUX+R6PzwkQHHWELWJt2pI90ZKSqhlXFgAgEdV5vu70tfL8/PzlZSUpIkTJ/7tvWPHjiktLU3jxo1TWlqavv76a23dulXXXHNNZU8DLxQUYFZCZA11bRyllEubasnoy/Rkv1aqV6tsDZjcghJty87T17/u1bUTl2rg5OVavfOwwakBAN7mnG67mEymclc+KpKamqouXbpo165datCgwWl/J1c+qqfcgmJl2Qq09+hxfbtun75bt0/FpU7VCLJoyejLFFkzyOiIAIAqVKVXPirLZrPJZDKpVq1aFb5fWFgou91e7oXqJzwkUM1iwnVJ82i9OvB8LRl9mVrGRehYUammLt5hdDwAgBep0vJRUFCg0aNH65ZbbjlpCxo/frysVqvrlZCQUJWR4CExESEadXnZFO0fLtupI/8zNgQA4N+qrHwUFxdr4MCBcjqdmjRp0kmPGzt2rGw2m+uVmZlZVZHgYb1aRqtVXNkU7e8uyTA6DgDAS1RJ+ThRPHbt2qU5c+ac8t5PcHCwIiIiyr3gG0wmk4b3LFsRd9qynTp6jKsfAIAqKB8nikd6errmzp2rqKgod58C1UjvVjFqERuuvMIS3Th5uT5cvlOH8gqZKRUA/FilZ4XKy8vT9u3bXdsZGRlau3atIiMjFR
cXpxtuuEFpaWmaNWuWSktLlZWVJUmKjIxUUBBPPPgbs9mkJ/q11h3TUpWek6fHv/lNj3/zm8KDA9QkOkwjejXTpc2
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Zipf's law plot for words\n",
|
||
|
"log_rang_log_freq('zipf_plot_words', get_words(text))"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"# Zbadać prawo Zipfa dla innych jednostek niż wyrazy (n-gramy, rdzenie, lematy, itp.)"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 59,
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": "'zipf_plot_ngrams.png'"
|
||
|
},
|
||
|
"execution_count": 59,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": "<Figure size 640x480 with 1 Axes>",
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAh8AAAGdCAYAAACyzRGfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA5+klEQVR4nO3dd3hUZcL+8XtmkkwKyaRASAIJhBophtDbKipreV0Uuy4q6qq7iiKi7sruD9G1RHTXtSHFVdEVLPtqEPvrIkWUDqFIbyEQEggJMylkUmZ+f0RHs4IQnMyZ8v1c17k8OXOSczti5uac5zzH5Ha73QIAAPARs9EBAABAaKF8AAAAn6J8AAAAn6J8AAAAn6J8AAAAn6J8AAAAn6J8AAAAn6J8AAAAnwozOsB/c7lcKioqUmxsrEwmk9FxAADAKXC73aqoqFBaWprM5p8/t+F35aOoqEjp6elGxwAAAKehsLBQ7du3/9l9/K58xMbGSmoMHxcXZ3AaAABwKhwOh9LT0z2f4z/H78rH95da4uLiKB8AAASYUxkywYBTAADgU5QPAADgU5QPAADgU5QPAADgU5QPAADgU5QPAADgU5QPAADgU5QPAADgU5QPAADgU5QPAADgU5QPAADgU5QPAADgU373YLmWUtfg0sR31ysrJVY90+LUM82mNrFWo2MBABByQqZ87DpcqQ/XF+nD9T9saxNr/a6INJaRHqlxykiMltl88ifyAQCA0xMy5SMhOkIPXpSlb4sc+rbIrj2lVTpc4dSibYe1aNthz36trGHqkRqnHmmNS8+0OHVNjlVEGFeoAADwBpPb7XYbHeLHHA6HbDab7Ha74uLiWuw41bX12nKwQpuL7Np80KFvixzaWlyh2nrXT/YNt5jUNTnWc5akR5pNZ6TGKjYyvMXyAQAQSJrz+R2y5eN46hpc2nW4UpuLHJ4zJJuLHHLU1B93/45J0Y2Xa350liQ5NtKnmQEA8AeUDy9yu93aX35M3xY5mpwlOWivOe7+rVtZf3SGpHEsSQfGkQAAghzlwweOVDq1+aCjyVmS3aVVOt67GRFmVlJMhBK/WxrXrUpq9d/bIpQUY1VcVJhMJsoKACBwUD4MUl1br63FFT+cJSlyaMsJxpH8nHCLSQnR35WRVo1FJT0hSr8bnqmkVtweDADwP5QPP1Lf4NJBe42OVNWqrMqpI5W1Kqtquhz50Xql8/jjSyQpPTFKr900QF2SY334bwAAwMk15/O72bfaLlmyRE8//bTWrFmjgwcPKi8vT6NHjz7uvn/4wx80c+ZM/eMf/9CECROae6igEGYxKz0xWumJ0ae0f01dg8qra5uUlNJKp95YVqB9ZdW67KVvNPP6fhrapXULJwcAoGU0e/KKqqoqZWdna9q0aT+7X15enpYvX660tLTTDheKIsMtSrVFqVc7m87q1kajc9rp1l91Ut6dQ9WvQ4Iqaup146sr9e6qQqOjAgBwWppdPi666CI99thjuuyyy064z4EDB3T33Xdrzpw5Cg9nLgxvSGpl1ZxbB+mS7DTVu9z643sbNPWzrXK5/OqqGQAAJ+X1aTtdLpduuOEGPfDAA+rZs6e3f3xIiwy36Llr+2j8eV0lSdMX7dLdb61TTV2DwckAADh1Xp9eferUqQoLC9P48eNPaX+n0ymn0+n52uFweDtSUDGZTJr4627qkBitB9/foI83HtSBo8f08o39eVAeACAgePXMx5o1a/Tcc89p9uzZpzxPRW5urmw2m2dJT0/3ZqSgdUW/9nrzd4MUHx2u/MKjGj3ta20vqTA6FgAAJ+XV8vHVV1/p0KFDysjIUFhYmMLCwlRQUKD77rtPHTt2PO73TJo0SXa73bMUFjKQ8lQN6pSkvDuHqWNStA4cPaYrXvpGX+04fPJvBADAQF4tHzfccIM2bNig/Px8z5KWlqYHHnhAn3/++XG/x2q1Ki4ursmCU5fZOkZ5dw7TwI6JqnDW66bXVu
mtlfuMjgUAwAk1e8xHZWWldu7c6fl6z549ys/PV2JiojIyMpSUlNRk//DwcKWkpKh79+6/PC2OKyEmQv+6daAefG+j8tYd0KT3N2pPaZUevDCLZ8oAAPxOs898rF69Wjk5OcrJyZEkTZw4UTk5OXrooYe8Hg6nzhpm0TNXZ+vekd0kSbOW7NYdc9boWC13wgAA/AvTqwehD/IP6IF/b1Btg0tntrfpnzf2V3JcpNGxAABBrDmf316f5wPGu7RPO825bZASosO1Yb9do6d9ra3F3MIMAPAPlI8gNaBjovLuHKZOrWNUZK/RldOXadG2Q0bHAgCA8hHMOraO0ft3DtXgTomqdNbrltmr9K/lBUbHAgCEOMpHkIuPjtAbtwzSFX3by+WWJs/bpEc/2qwGngkDADAI5SMERISZ9berztT95zfeCfPK0j36/b/WqMpZb3AyAEAoonyECJPJpLvO7aoXrstRRJhZ/9lSoqtnLlOJo8boaACAEMOttiFoTUGZbntjjcqqahUZbtagzCSd1a2Nzu7WWp3btDrl5/IAAPC95nx+Uz5C1L4j1fr9m2u05WDTW3DTbJE6q1sb/aprGw3v0lq26HCDEgIAAgnlA6fE7XZrW0mFvtpeqiU7DmvFnjLV1rs8r5tNUlZKnHIy4pWTkaCcjHhlJsUwZTsA4CcoHzgtx2obtGLPES35rozsPFT5k31sUeHqkx6vwZ2SdMOQDmplbfbjgQAAQYjyAa8ottdo7b5yrdtXrnX7jmrjAbucPzozkhIXqSmjeujCXimMEwGAEEf5QIuorXdpa7FDawvK9erXe7WvrFqSdHa3Nnrkkp7q2DrG4IQAAKNQPtDiauoa9NKiXZqxaJdqG1yKCDPrzhGd9YezOysy3GJ0PACAj1E+4DO7D1fqoQ++1dKdpZKkqHCLzmxvU98OCcpJj1ffDglq3cpqcEoAQEujfMCn3G63Pt54UI9/vEUH7T+dtCwuMkyJMRFKiIlQYnSEbNHhiokIU7TVoujwMMVYLYqKsCgmIkxRERalJ0TrjNRYxpEAQAChfMAQLpdbuw5Xau2+cq0tOKp1heXacahSp/MnLNUWqZFntNWI7m0UHx2ucItZYWazIsJMCreYFRVhUXJspPf/JQAAp4XyAb9RUVOnEodT5dW1Kq+qbfxndZ2qaxtU7axXdd13/6xtUHVtg6pq67X1YIWO1TWc9Gd3SW6l/+mVorO7J6ttnFVJMVZFRTDeBACMQPlAQKupa9A3u0r1xeYSrSkol7Pepbp6l+pcbtU1NK4fq2vQ8R7M2yM1TtcOTNdFvVIVFWFRmNnEAFgA8AHKB4Keo6ZOC7aU6JONxdq4366yqlrVNriOu28ra5hSbZG6ZkC6bhmWyQytANACKB8IOW63W2VVtZq/vkhvryzUtpKK4+43tHOS7jmvq7q2jVViTISPUwJA8KJ8IOTV1rvkcrvlrHeptNKpb3aW6olPtjYZS9IuPko5GfGKjQyXNcysHqlx6tkuTh2TYhTDtPEA0CzN+fzmNyyCUkSYWZIUGW6RLSpcndu00rAurfXUZ9u08YBdB44e8yzH0ybWqqyUWP3xgiz1bm/zZXQACHqc+UBIqnTWK3/fUW0qsquu3qUKZ73yC49qR0mFyqvrPPtFWMy67axMDe/SRoMyExkvAgAnwGUX4BewV9dp75EqvfDlTv1nS4lne4ekaN00tKOu7NdesZHhBiYEAP9D+QC8wO12K2/dAS3cdliLtx2So6be81p8dLgSoyOUGNO4pMVHaVR2mvpmxDMzK4CQRPkAvKy6tl7vrT2g2V/v0a7DVSfc78KeKbpjRGd1SIpWfDR30wAIHZQPoIW43W4dqWqcrfVIVa3KvlvW7Tuq+esPqK7hh/+d4iLDdEZqnPpkxCvWGiZbdIQGdExQVgp/rgEEH8oHYIBNB+ya+tlWbSuu0KEK53H3sZhNuue8rurfIUE5GQlMBw8gaFA+AIMdq21QQVmVVu4p09
7SalU567Wl2KEN++2efVJtkbp+cAd1TIpRis2qM1LjFBVuYcwIgIBE+QD8kNvt1hvLCrRk+2Gt329XaeVPz46k2iK
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Zipf's law plot for character trigrams (n-grams with n = 3)\n",
|
||
|
"log_rang_log_freq('zipf_plot_ngrams', ngrams(get_characters(text), 3))"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 60,
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": "'zipf_plot_lemmas.png'"
|
||
|
},
|
||
|
"execution_count": 60,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": "<Figure size 640x480 with 1 Axes>",
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAh8AAAGdCAYAAACyzRGfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA85ElEQVR4nO3dZ3hUZcLG8XtmUgnJQCgpkEDo1RAIHSmKIiKKLoqIilh21VgQ5RVcwXUtEbsC0twVdMWyLqCioIj0DiFU6RBCCZ1MCmkz835AR1EMBCZzJjP/33XNh3PmTOa+BsjcnPM85zE5nU6nAAAAPMRsdAAAAOBfKB8AAMCjKB8AAMCjKB8AAMCjKB8AAMCjKB8AAMCjKB8AAMCjKB8AAMCjAowO8HsOh0OHDh1SeHi4TCaT0XEAAMBFcDqdysnJUWxsrMzm0s9teF35OHTokOLi4oyOAQAALkFmZqZq165d6jFeVz7Cw8MlnQ0fERFhcBoAAHAxbDab4uLiXN/jpfG68vHLpZaIiAjKBwAAFczFDJlgwCkAAPAoygcAAPAoygcAAPAoygcAAPAoygcAAPAoygcAAPAoygcAAPCoMpePxYsXq2/fvoqNjZXJZNKsWbP+9NgHH3xQJpNJb7/99mVEBAAAvqTM5SMvL0+JiYkaP358qcfNnDlTK1euVGxs7CWHAwAAvqfMdzjt3bu3evfuXeoxBw8e1KOPPqrvvvtOffr0ueRwAADA97h9zIfD4dBdd92l4cOHq3nz5u7+8QAAoIJz+9ouY8aMUUBAgB577LGLOr6wsFCFhYWubZvN5u5IAADAi7j1zMe6dev0zjvvaOrUqRe1sIwkpaamymq1uh5xcXHujOTidDr1xGfp+t+6A3I6neXyHgAA4MLcWj6WLFmio0ePKj4+XgEBAQoICFBGRoaefPJJ1a1b97yvGTlypLKzs12PzMxMd0Zymbs5SzPXH9ST/92gRz5Zr+z84nJ5HwAAUDq3Xna566671LNnz3P29erVS3fddZeGDBly3tcEBwcrODjYnTHO69rm0Rreq7HemrdD32w8rLSMU3rjtkR1ql+93N8bAAD8qszlIzc3V7t27XJt7927V+np6YqMjFR8fLyqVat2zvGBgYGKjo5W48aNLz/tZbCYTUrp0UBdGlTX0M/Stfd4nga9v0p/7VpPT17TWEEB3G8NAABPKPM37tq1a5WUlKSkpCRJ0rBhw5SUlKTRo0e7PVx5SIyrotmPdtHAdnFyOqVJi/bo5veWadfRHKOjAQDgF0xOLxt9abPZZLValZ2drYiIiHJ9r++2ZGnE/zbqVH6xQgLN+vv1TXVnhzoXPVgWAACcVZbvb7++1tCrebS+G9pVVzasroJih0Z9uUX3TVur47mFF34xAAC4JH5dPiSpZkSIpg1pp9E3NFNQgFk/bjuq695erAXbjhodDQAAn+T35UOSzGaT7u2SoK8e6awm0eE6nlukIVPXaNSszTpTZDc6HgAAPoXy8RtNoiM0K6Wz7u2cIEn6aGWG+o5bqi2Hsg1OBgCA76B8/E5IoEWj+zbTh/e2U83wYO06mqt+45dp0qLdcji8amwuAAAVEuXjT3RtVENzh3bVtc2iVGx3KnXONt35r1U6nH3G6GgAAFRolI9SRIYFadJdbfTKLS0VGmjR8t0ndN3bS/TNxsNGRwMAoMKifFyAyWTS7e3i9e3jVyqxtlXZZ4qVMj1NT36+QbmFJUbHAwCgwqF8XKSE6mH64qFOeqRHA5lN0v/SDuj6d5ZoXcYpo6MBAFChUD7KINBi1lO9Guuzv3VUrSqh2n8yX7dNWqG35u1Qid1hdDwAACoEysclaFs3UnOGXqmbk2rJ7nDqnfk7deukFco4kWd0NAAAvB7l4xJFhATqrQGt9M7trRQeEqD1+0/r+neW6L9rM+Vly+UAAOBV/HphOXc5cCpfwz7foNV7T0qSmsZE6IpaVjWNCVezWKuaxIQrIi
TQ4JQAAJSfsnx/Uz7cxO5watLi3Xrz+x0qOc/NyOIjK50tIzFWNYuNUNOYcNWqEsoKugAAn0D5MFBWdoHW7z+lrYdt2nrIpp8O23Qou+C8x0aEBPxcRCLULCZCzWOtahIdLrOZQgIAqFgoH17mVF6RfjpsO1tIDtv00+Ec7TySc94zJJFhQbqyYXV1bVhDVzaqrprhIQYkBgCgbCgfFUBhiV27jubqp8M52nrIpq2Hs7X5oO0PNy5rFhOhbo1rqGvDGmpTp6qCAhgjDADwPpSPCqrY7tD6/ae1aMdRLd5xXJsOnruabliQRR3rV1e3xjV0TdMoRVs5KwIA8A6UDx9xPLdQS3ce16Idx7R4xzGdyCtyPWcySV0aVNdfWtfWtc2jVCkowMCkAAB/R/nwQQ6HU1sP27RoxzH9uO3oObd1DwuyqHfLGP2ldW21T4hkwCoAwOMoH35g/4l8zVh/QDPSDmr/yXzX/lpVQnVL61oa2C5esVVCDUwIAPAnlA8/4nQ6tTbjlP637oC+2XhYOT8PWA0PCdDrtyaqV/NogxMCAPwB5cNPFRTbNW/rEb2/ZI82HDg7WPWBKxP0f9c1UaCFWTIAgPJTlu9vvpF8SEigRX0TY/XFQ510f5cESdKUJXs1cPJKHc4+Y3A6AADOonz4oECLWc/e0EwT72yj8OAArc04pT7vLtWSnceMjgYAAOXDl13XIlqzH+ui5rEROplXpLv/vVpvfr9dR3POf7t3AAA8gTEffqCg2K7nv96qT1bvd+2rW62SkutGqm3dqkquG6l61cNY5A4AcMkYcIrz+jL9oCYs3K3tR3L0+z91a2igmsVEqFlshJrHRqhFLasa1qxMIQEAXBTKB0qVfaZYaftPae2+k1qz75TSM0+rqMTxh+PuaB+vl29uaUBCAEBFQ/lAmRSVOLTjyC8L3Nm05VC21macktMpzXy4k5LiqxodEQDg5cry/c2CIFBQgFktalnVopbVtW/4fzfov+sOaMzcbfrkgQ5cfgEAuA2zXXBeQ69ppKAAs1buOanFO48bHQcA4EMoHzivWlVCdXeHOpKkMXO2yeHwqqtzAIAKjPKBP5XSo4HCgwO09bBNX288ZHQcAICPoHzgT1UNC9LfutWTJL3x/Y7zzogBAKCsKB8o1b1dElQjPFj7T+Zr2vJ9KrZTQAAAl4eptrigj1ZmaNSszZIkk0mqUTlYsVVC1a1RDd3Tqa6qhgUZnBAAYDTu8wG3KrY7dN+0tVqx+7iK7ef+dakUZNGg9vG6p3OCYq0hTMkFAD9VruVj8eLFeu2117Ru3TodPnxYM2fOVL9+/SRJxcXFevbZZ/Xtt99qz549slqt6tmzp1555RXFxsa6PTw8y+Fw6kRekQ5nn9HOI7n619K92nrY5no+OMCsaGuI4iMr6bbkOPVuEa0AC1f2AMAflOX7u8zfDHl5eUpMTNT48eP/8Fx+fr7S0tI0atQopaWlacaMGdq+fbtuvPHGsr4NvJDZbFKN8GBdUbuK/tKmtr55rIs+uKet2tY9ewfUwhKHMk7ka8nO43r0k/Xq9tpCfbhin7zs5BoAwGCXddnFZDKdc+bjfNasWaN27dopIyND8fHxF/yZnPmomAqK7TpqK1SWrUDLdh3Xf1Zm6ERekSTprQGJujmptsEJAQDlqVzPfJRVdna2TCaTqlSpct7nCwsLZbPZznmg4gkJtCi+WiW1S4jUE9c00rIRV+m+LgmSpNe/26GCYrvBCQEA3qJcy0dBQYGefvppDRw48E9bUGpqqqxWq+sRFxdXnpHgISGBFg3v1Vgx1hAdPH1GH67YZ3QkAICXKLfyUVxcrNtuu01Op1MTJkz40+NGjhyp7Oxs1yMzM7O8IsHDQgItGnZNI0nSuB936XR+kcGJAADeoFzKxy/FIyMjQ/PmzSv12k9wcLAiIiLOecB33NK6tppEh8tWUKLhX2zUd1uylJVdYHQsAICBAtz9A38pHjt37tSCBQtUrVo1d7
8FKhCL2aQRvZvong/WaN7WI5q39Ygk6cqG1TW4Y111aVhdIYEWg1MCADypzOUjNzdXu3btcm3v3btX6enpioyMVEx
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Zipf's law plot for lemmas\n",
|
||
|
"log_rang_log_freq('zipf_plot_lemmas', lemmas(get_words(text)))"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 61,
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": "'zipf_plot_stems.png'"
|
||
|
},
|
||
|
"execution_count": 61,
|
||
|
"metadata": {},
|
||
|
"output_type": "execute_result"
|
||
|
},
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": "<Figure size 640x480 with 1 Axes>",
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAh8AAAGdCAYAAACyzRGfAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAA8fUlEQVR4nO3deVhU9eLH8c/MsIqAIgKiqLinuOOKa5rmNcvWa2ma7UWpedusn3VbqW51LTNtt1Lbw8oWNXPJDRfENHPfcANXhiUGmJnfH+R0KVOwmTkDvF/PM8/jOXOY+TQl8+mc7/l+TU6n0ykAAAAvMRsdAAAAVC+UDwAA4FWUDwAA4FWUDwAA4FWUDwAA4FWUDwAA4FWUDwAA4FWUDwAA4FV+Rgf4I4fDoUOHDik0NFQmk8noOAAAoBycTqdyc3MVGxsrs/ns5zZ8rnwcOnRIcXFxRscAAADnITMzUw0aNDjrMT5XPkJDQyWVhg8LCzM4DQAAKA+r1aq4uDjX9/jZ+Fz5OH2pJSwsjPIBAEAlU54hEww4BQAAXkX5AAAAXkX5AAAAXkX5AAAAXkX5AAAAXkX5AAAAXkX5AAAAXkX5AAAAXkX5AAAAXkX5AAAAXkX5AAAAXlXh8rFs2TINGzZMsbGxMplMmjt37l8ee/vtt8tkMmnKlCl/IyIAAKhKKlw+8vPz1b59e02bNu2sx6Wmpmr16tWKjY0973Du5HQ6lTw7XbPT9infVmJ0HAAAqq0Kr2o7ZMgQDRky5KzHHDx4UHfffbfmz5+voUOHnnc4d0rbc0JfbzqsrzcdVso3W3V5x/oa1b2RWsace+lfAADgPm4f8+FwOHT99dfrvvvuU5s2bdz98uftgpgw/d/QC9QkMkR5thK9v3qfBk9ZpqtnrNQXGQdlK7EbHREAgGqhwmc+zuXZZ5+Vn5+fxo0bV67jbTabbDaba9tqtbo7kiQpvIa/bu7dRDf1itfKXcc1a/U+LdiSpbV7T2rt3pOqExKgqxPjNLJbQ8VF1PBIBgAA4ObysX79er300ktKT0+XyWQq18+kpKTosccec2eMszKZTEpqFqmkZpHKshbqwzWZ+mDNfh2xFmrG0l16bdku9W1RV6O6NVL/VlGymMv3zwEAAMrH5HQ6nef9wyaTUlNTNXz4cEnSlClTNHHiRJnNv1/NsdvtMpvNiouL0969e//0Gmc68xEXF6ecnByFhYWdb7QKKbE7tGhrtmat3qcfdxxz7a9fK1jXdo3TNV3iFBUa5JUsAABURlarVeHh4eX6/nZr+Th+/LgOHz5c5pjBgwfr+uuv19ixY9WyZUu3hveEvcfyNWfNfn28LlOnCoolSX5mkwYnxGhUt0bq3iSi3Gd1AACoLiry/V3hyy55eXnauXOna3vPnj3KyMhQRESEGjZsqDp16pQ53t/fXzExMeUqHr6gcWSIHvrHBZp4UQt9s+mwZq3ep/T9p/T1T4f19U+H1SyqpkZ2a6grOjVQeLC/0XEBAKh0KnzmY8mSJerfv/+f9o8ZM0YzZ8780/7GjRtrwoQJmjBhQrle3+gzH2ey5ZBVs9L2ae6GgyooKr0rJtjfokvbx2pU90Zq2yDc4IQAABjLa5ddPMEXy8dpuYXFmrvhoGat3q9tWbmu/e0bhGtk90Ya1i5WwQEWAxMCAGAMyoeHOZ1Ordt3UrNW79O3m46oyO6QJIUF+emqznEa2b2hmtataXBKAAC8h/LhRcfzbPp43QHNWbNPmSd+de0f3CZaT13eVpE1Aw1MBwCAd1A+DOBwOLV0x1HNXr1PP2zNlsMpRdYM0H+ubq/+LaOMjgcAgEdRPgy29YhV4z/IcI0LuaFnYz04pJWC/BkPAgComiry/e32tV0gtYoJ0xd3JWlsUmNJ0syVe3XpK8v1y2HPTB0PAEBlQvnwkCB/ix4d1kYzx3ZRZM1Abc/K02WvrNBby/fI4fCpk00AAHgV5cPD+rWM0vwJvTXwgigV2R
16Yt4WjXlnjbKthUZHAwDAEJQPL6hTM1BvjE7Uk8MTFORv1o87jmnwlGVa8PMRo6MBAOB1lA8vMZlMGtW9kebd3Uut64XpZEGxbn1/vR5K3aSCohKj4wEA4DWUDy9rFhWq1OSeuq1PE0nSnLT9umTqcm06kGNwMgAAvIPyYYBAP4sm/eMCzb65m6LDArX7aL4uf3WFpi/ZJTuDUQEAVRzlw0BJzSL13fg+urhNjEocTj373VaNfHO1Dp369dw/DABAJcUkYz7A6XTqk3UH9O+vflZBkV2BfmZd1DpawzvUV58WdRXgR0cEAPg2ZjitpPYcy9fEjzO0Yf8p177aNfw1tF09De9QX50b1ZbJZDIuIAAAf4HyUYk5nU5tPmhV6oaD+nLjIR3Ls7mei4sI1mXt62t4x1g1iwo1MCUAAGVRPqqIErtDK3cd19yMg5q/+Yjyi+yu5xLqh2l4h/oa3rE+K+cCAAxH+aiCfi2ya+EvWZq74aCWbT+qkt/uign2t+imXvG6tW8ThQX5G5wSAFBdUT6quON5Nn2z6bA+WpepzQdLF6urVcNfd/ZrqtE9GrN6LgDA6ygf1YTT6dSCLVn6z/xt2pmdJ0mKCQvShIHNdVXnBvKzcJcMAMA7KB/VjN3h1OfpBzTl+x06+NscIU3qhujeQS01JCGGO2QAAB5H+aimCovtmp22X9MW79SJ/CJJ0gX1wjQkIUZ9WtRV2/rhspgpIgAA96N8VHO5hcV688c9evPH3WXukKlVw19JzSLVt3ld9W4RqXrhwQamBABUJZQPSCodmDr/5ywt235UK3YdU25h2dVzOzeqrVeu60gJAQD8bZQP/EmJ3aGMzFNatuOYlm0/qp8OnJLDKdULD9LMsV3VMoZJywAA54/ygXPKPFGgsTPXamd2nsKC/PTG6ER1a1LH6FgAgEqqIt/f3ItZTcVF1NCnt/dQYqPashaW6Pq31uibTYeNjgUAqAYoH9VYrRoBmnVzNw1uE60iu0PJc9I1c8Ueo2MBAKo4ykc1F+Rv0asjO+v67o3kdEr//mqLHv9qi2wl9nP/MAAA54HyAVnMJj1+WRvdN7ilJOntFXs0fNpKbc/KNTgZAKAqonxAkmQymZTcv5lev76zIkIC9Mthqy6ZulxvL98jh8OnxiQDACo5ygfKGNQmRt9N6K3+LeuqqMShx+dt0ei31+jQb9O2AwDwd3GrLc7I6XRqdtp+Pfn1FhUWOxRgMeuyDrG6qXe8WsXw7wUAUBbzfMBtdh3N06TPN2nNnhOufb2bR+rGXvHq3SySlXMBAJIoH/CA9P0n9dbyPfp202GdHgISERKgQa2jNaRtPfVsWkf+FBEAqLYoH/CYzBMFemfFXs3NOOhaOVeSatfw1zVd4nR990ZqULuGgQkBAEagfMDjSuwOrdlzQt9sPqzvNmfpWJ5NkmQ2SQMviNZtfZuqc6PaBqcEAHgL5QNeZXc49cPWbL27cq+W7zzm2j+sfaweHNJK9Wuxai4AVHUeXdtl2bJlGjZsmGJjY2UymTR37lzXc8XFxXrggQfUtm1bhYSEKDY2VqNHj9ahQ4cq/A+BysNiNumi1tGadXM3fT+xj65JbCCTSfpq4yFd+PwSTfl+u0rsDqNjAgB8RIXLR35+vtq3b69p06b96bmCggKlp6dr8uTJSk9P1+eff65t27bp0ksvdUtY+L5mUaF67qr2+uquXuoaHyFbiUNTvt+h695IU5a10Oh4AAAf8Lcuu5hMJqWmpmr48OF/eczatWvVtWtX7du3Tw0bNjzna3LZpepwOp36cuMhPZy6WXm2EkXWDFDKFe3Up0WkAv0sRscDALhRRb6//TwdJicnRyaTSbVq1Trj8zabTTabzbVttVo9HQleYjKZdFmH+mrXoJbumLVeW4/k6pb31inQz6wOcbU0qE2M/tklTjUDPf6fIQDAh3h0YobCwkI98MADuvbaa/+yBaWkpCg8PNz1iIuL82QkGCA+MkRzk5M0Nq
mx6oQEyFbiUNqeE3pi3hb1SFmkp7/5RQu3ZGn30TzWkQGAasBjl12Ki4t15ZVX6sCBA1qyZMlflo8znfmIi4vjsks
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Zipf's law plot for stems\n",
|
||
|
"log_rang_log_freq('zipf_plot_stems', stems(get_words(text)))"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"# Podać słowa, które najbardziej łamią prawo wiążące długość z częstością"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 63,
|
||
|
"outputs": [
|
||
|
{
|
||
|
"name": "stdout",
|
||
|
"output_type": "stream",
|
||
|
"text": [
|
||
|
"Word\t\tfrequency * length\n",
|
||
|
"------------------------------\n",
|
||
|
"1. the\t\t\t3346731\n",
|
||
|
"2. and\t\t\t1609434\n",
|
||
|
"3. of\t\t\t958674\n",
|
||
|
"4. And\t\t\t916899\n",
|
||
|
"5. to\t\t\t752068\n",
|
||
|
"6. that\t\t\t726440\n",
|
||
|
"7. with\t\t\t688892\n",
|
||
|
"8. The\t\t\t602034\n",
|
||
|
"9. in\t\t\t579134\n",
|
||
|
"10. his\t\t\t552441\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Words which violate the length-frequency law the most (highest frequency * length product)\n",
|
||
|
"toplist = find_long_popular_words(get_words(text), top=10)\n",
|
||
|
"print('Word\\t\\tfrequency * length')\n",
|
||
|
"print('-'*30)\n",
|
||
|
"for i in range(len(toplist)):\n",
|
||
|
" print(f'{i + 1}. {toplist[i][1]}\\t\\t\\t{toplist[i][0]}')"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "markdown",
|
||
|
"source": [
|
||
|
"# Wymyślić i zbadać 2 zależności dotyczące wyrazów bądź innych jednostek w tekście"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 64,
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": "<Figure size 640x480 with 1 Axes>",
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkkAAAGwCAYAAAC99fF4AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAACv80lEQVR4nOydd3gc5bXG39m+q15sS7IlW5axjRsYG4xMs7ETOiEJ9xJwbmiBQAyEQEggoYQkQBoEkhBIyA1wCZCQUAIECM2muYNtbGxcZVtylVZlpe3lu3/sfrOzqy0zs7NNe37Po8fW1k+zOzNnznnPewTGGANBEARBEAQRgy7fCyAIgiAIgihEKEgiCIIgCIJIAAVJBEEQBEEQCaAgiSAIgiAIIgEUJBEEQRAEQSSAgiSCIAiCIIgEUJBEEARBEASRAEO+F5BtQqEQDhw4gIqKCgiCkO/lEARBEAQhA8YYBgcH0dTUBJ0uPzmdER8kHThwAM3NzfleBkEQBEEQKujs7MS4cePy8t4jPkiqqKgAEN7IlZWVeV4NQRAEQRBycDgcaG5uFs/j+WDEB0m8xFZZWUlBEkEQBEEUGfmUypBwmyAIgiAIIgEUJBEEQRAEQSSAgiSCIAiCIIgEUJBEEARBEASRAAqSCIIgCIIgEkBBEkEQBEEQRAIoSCIIgiAIgkgABUkEQRAEQRAJoCCJIAiCIAgiARQkEQRBEARBJICCJIIgCIIgiARQkEQQBEEQBJEACpIIgiAIQgJjDB5/MN/LIAoACpIIgiAIQsLtL23GsT95E3vtznwvhcgzFCQRBEEQhIRVu+3w+EPYetCR76UQeYaCJIIgCIKQYHf6AAAefyjPKyHyDQVJBEEQBBHBHwyh3+UHANIlERQkEQRBEASnL5JFAgA3BUklDwVJBEEQBBGhZygaJFG5jaAgiSAIgiAi2J1e8f9UbiMoSCIIgiCICL1OaSaJgqRSh4IkgiAIgogQW26jIKnUoSCJIAiCICLYh6LlNhJuExQkEQRBEEQEOwm3CQkUJBEEQRBEBBJuE1IoSCIIgiCICFJNEpXbCAqSCIIgCCKCNJPkpXJbyUNBEkEQBEFEiNEkBSiTVOpQkEQQBEEQANy+IFy+YMzvRGlDQRJBEARBILbUBlAmiaAgiSAIgiAAxJbaAMDtI01SqUNBEkEQBEEgmkmqMBsAAF7qbit5KEgiCIIgCETb/8fWWAFQuY2gIIkgCIIgAETLbWOrw0GSP8gQCFLJrZShIIkgCIIgEJ3bxjNJAOAJUJBUylCQRBAEQRAA7M5wJqmpWhIkkS6ppKEgiSAIgiAQDZLqy80wG8KnR/JKKm0oSCIIgiAIRMttdeUmWE16AICXxNslDQVJBEEQBIGocLu+zAyLIRwkkVdSaUNBEkEQBFHyMMZEnyRpJolsAEobCpIIgiCIksfhCcAfZACA2jKTqEki4XZpk9cg6f3338d5552HpqYmCIKAl156SbzP7/fjBz/4AWbOnImysjI0NTXhG9/4Bg4cOJC/BRMEQRAjEq5HKjcbYDHqYTHychsFSaVMXoMkp9OJY445Bg8//PCw+1wuFz755BPccccd+OSTT/DCCy9g27ZtOP/88/OwUoIgCGIkwzvb6spNAACrkZfbSJNUyhjy+eZnnXUWzjrrrIT3VVVV4a233oq57fe//z1OOOEE7Nu3Dy0tLQmf5/V64fVGJzk7HA7tFkwQWeTjvX3Y0NmPK06aAEEQ8r0cgkjJgX43XvikC0vmjUdNmSnfy4nB4fHjqZV7cd6sJrTU2WQ9R+xsi/wtFiOV24gi0yQNDAxAEARUV1cnfcx9992Hqqoq8ae5uTl3CySIDLjzX5vx01e3YENnf76XQhBp+dP7u/HrN7fj2bX78r2UYbzwcRd+9Z9tePCd7bKfE80kmQFALLdRkFTaFE2Q5PF48IMf/AAXX3wxKisrkz7utttuw8DAgPjT2dmZw1UShHr6XX4AQJ
/Ll+eVEER6DvS7AQAH+z15XslwDg6E19TR45T9HLH9P77cRkFSSZPXcptc/H4//vu//xuMMTzyyCMpH2s2m2E2m3O0MoLQDn4wdpFQlCgCeOaFt80XEj2RgKez1y37OdFyW/j8YTaSTxJRBJkkHiDt3bsXb731VsosEkEUM24KkogiggcVPCApJHjg1jPkld2d1pNUuE37YylT0EESD5B27NiBt99+G3V1dfleEkFkBcaYmEmilmOiGODlKR4sFRJ2SeDW1eeS+RxuJMk1SSTcJvJcbhsaGsLOnTvF3zs6OrBhwwbU1taisbERF154IT755BO8+uqrCAaDOHToEACgtrYWJlNhdVMQRCb4gwyhsI+dmFEiiELFGwhi0BsAEC27FRLSwK2rz42jxlTIeE4kkyR2t5EmichzkLRu3TosXLhQ/P2mm24CAFx66aX48Y9/jJdffhkAcOyxx8Y8b9myZViwYEGulkkQWUcaGFG5jSh0eiWBUb/Lj0AwBIO+MAoT4fEi0fV1ys0kJSu3+UmTVMrkNUhasGABGGNJ7091H0GMJLySIMntC+RxJQSRHnucDqnX5cPoCkueVhOL0xeEV2IA2dmbPkgKhpjYVcqF21RuI4AC1yQRRKkgvVqlTBJR6PTE6ZDig6Z8Eq+RktPh1ufygTFAEIAamxGApLuNgqSShoIkgigA3DGZJDooE4VNfFBUSEFSfLednHIbX3+NzSSWDckniQAoSCKIgsBDmiSiiIj3RiokryTpoFogLNyW+5w6yXgVccAtaZJKGgqSCKIAiBFu05UrUeDEZ44KySuJC7BnjA176g24/XB4/CmfE++RBEQzSV7aH0saCpIIogDwkHCbKCLig6JC8kria2mptYmZoXTi7XiPJICE20QYCpIIogAg4TZRTPDyWkutLfx7AWaS6srNGBdZXzrxdrxHEiAtt9H+WMpQkEQQBYCHhNtEEcF9kiZHTBoLyVBSGvA011gBpHfd5kEfb/8HpGaSpEkqZShIIogCgITbRDHBA5EpDeXh3wtJuB1ZS325GeNqeCYpdZDEy4dSTRKV2wiAgiSCKAhigyTSJBGFC2NM9EkSM0mFVG6TBDzNtTyTlLrcxjNj9eXDy23eQAihEBkblyp5ddwmCCKMtM2YNBBEISN1tJ40OpJJKiDhNs8K1ZaZwIc2pPNKSiTc5t1tQDhQspr0w55HjHwok0QQBYA0k+QPMviDpIMgChMeUNhMejRHhNFOX7AgtHShEEOvpNzWLBFupxpzlUq4DdCFSylDQRJBFADxugfSJRGFilS/U2E2wBRxqC4EXVK/2w9eGauxmdBUbYEghIOcZOJyjz+IQW+4xC3NJOl1gvi3kS6pdKEgiSAKgPiDcCFclRNEIqLu1GYIgiCKnQtBl8TXVmU1wmTQwWzQo6EyPHg3mXib65GMegGVllgFipnE2yUPBUkEUQDEtxlTep8oVOxxImcxSCqATJI9gXP2uJrU4m27RMMkCELMfeSVRFCQRBAFQPxBmDrciEKFZ164pxD/tzAySZEATuJ31MxtAJKIt3sSeCRxrOSVVPJQkEQQBQCV24hioUfsBIvPJBVAkOSMXRuAtK7b9gQeSRzulUTz20oXCpIIogDwBGKvVEm4TRQq0vIUEO0IKwQbgESmkOlct/m668uHZ5Ko3EZQkEQQBYDHR91tRHEgdbQGoh1hhVFuC6+tVlpuq03tuh0tHybKJFG5rdShIIkgCgBPIK7c5idNElGYxJeneHDRUwjltqHhztlcuL2/353QOTuafaJMEjEcCpIIogDgGqQKc7gFmTJJRKEiBhWRbE29mEnKf7kt0aDaxiorDDoB/iDD4UFP8uck0CRZyQKg5KEgiSAKAJ5JqolclZNwmyhEYh2t44TbhVBuS2ABoNcJaKoOZ5MSibcTuW1zouU22h9LFQqSCKIA4JoHLobNRyYp1dgGLV47m68/ksn256KEGEdrLtzmmSSnN++fcaJyGwBx0G
0iXVKiuW0ci6H0gqR8f4aFBgVJBFEAcOF2voKkd7Yexqy738Trmw5q/tqMMVz0p1X46iMraJq6Qn7/7g7M/dnb2N0
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Brevity law\n",
|
||
|
"# https://en.wikipedia.org/wiki/Brevity_law\n",
|
||
|
"plot_brevity_law(get_words(text))"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": 65,
|
||
|
"outputs": [
|
||
|
{
|
||
|
"data": {
|
||
|
"text/plain": "<Figure size 640x480 with 1 Axes>",
|
||
|
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAlUAAAGzCAYAAAAG8+KwAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAABg9klEQVR4nO3de1xUdeI+8Ge4zHAdhvtFAVEUvCAoKmJek0QzS7PNzFo109W0MkrN3VL7ta2l20XNS7t903a3Mq208paGgpl4QxEFQUUUVAYQZIb7Zebz+8PlrJM3sIEzwPN+veaVc85nzjzMGPN45pzPUQghBIiIiIjod7GSOwARERFRa8BSRURERGQGLFVEREREZsBSRURERGQGLFVEREREZsBSRURERGQGLFVEREREZsBSRURERGQGLFVEREREZsBSRURERGQGNnI++dq1a7F27VpcvHgRANC9e3csWrQIo0aNAgBUVVXh1VdfxcaNG1FdXY3Y2FisWbMG3t7e0jZycnIwa9Ys7Nu3D05OTpg8eTKWLl0KG5v//WgJCQmIi4tDWloa/P398cYbb2DKlCkmWVavXo3ly5dDq9UiPDwcq1atQr9+/aT1DclyL0ajEVevXoWzszMUCsV9vGJERETU3IQQKC0thZ+fH6ys7rI/Ssjohx9+ENu3bxdnz54VmZmZ4s9//rOwtbUVp0+fFkIIMXPmTOHv7y/i4+PFsWPHRP/+/cWAAQOkx9fV1YkePXqImJgYceLECbFjxw7h4eEhFi5cKI25cOGCcHBwEHFxcSI9PV2sWrVKWFtbi127dkljNm7cKJRKpfjss89EWlqamD59utBoNCI/P18ac68sDZGbmysA8MYbb7zxxhtvLfCWm5t71895hRCWdUFlNzc3LF++HE888QQ8PT3x5Zdf4oknngAAZGRkoGvXrkhKSkL//v2xc+dOPPLII7h69aq0x2jdunVYsGABCgsLoVQqsWDBAmzfvh2nT5+WnuOpp55CSUkJdu3aBQCIiopC37598fHHHwO4sUfJ398fL774Il5//XXodLp7ZmkInU4HjUaD3NxcqNVqs71mRERE1HT0ej38/f1RUlICFxeXO46T9eu/mxkMBmzevBnl5eWIjo5GcnIyamtrERMTI40JDQ1FQECAVGSSkpIQFhZm8hVcbGwsZs2ahbS0NPTq1QtJSUkm26gfM3fuXABATU0NkpOTsXDhQmm9lZUVYmJikJSUBAANynI71dXVqK6ulu6XlpYCANRqNUsVERFRC3OvQ3dkP1D91KlTcHJygkqlwsyZM7FlyxZ069YNWq0WSqUSGo3GZLy3tze0Wi0AQKvV3nJMU/39e43R6/WorKzEtWvXYDAYbjvm5m3cK8vtLF26FC4uLtLN39+/YS8KERERtTiyl6qQkBCkpKTg8OHDmDVrFiZPnoz09HS5Y5nFwoULodPppFtubq7ckYiIiKiJyP71n1KpRHBwMAAgMjISR48exYoVKzBhwgTU1NSgpKTEZA9Rfn4+fHx8AAA+Pj44cuSIyfby8/OldfX/rV928xi1Wg17e3tYW1vD2tr6tmNu3sa9styOSqWCSqVqxKtBRERELZXse6p+y2g0orq6GpGRkbC1tUV8fLy0LjMzEzk5OYiOjgYAREdH49SpUygoKJDG7NmzB2q1Gt26dZPG3LyN+jH121AqlYiMjDQZYzQaER8fL41pSBYiIiJq4xo1J4CZvf766yIxMVFkZ2eL1NRU8frrrwuFQiF2794thLgxjUFAQIDYu3evOHbsmIiOjhbR0dHS4+unVBgxYoRISUkRu3btEp6enredUmHevHnizJkzYvXq1bedUkGlUokNGzaI9PR0MWPGDKHRaIRWq5XG3CtLQ+h0OgFA6HS6+33JiIiIqJk19PNb1lL13HPPicDAQKFUKoWnp6cYPny4VKiEEKKyslK88MILwtXVVTg4OIhx48aJvLw8k21cvHhRjBo1St
jb2wsPDw/x6quvitraWpMx+/btExEREUKpVIqOHTuK9evX35Jl1apVIiAgQCiVStGvXz9x6NAhk/UNyXIvLFVEREQtT0M/vy1unqrWTK/Xw8XFBTqdjlMqEBERtRAN/fy2uGOqiIiIiFoilioiIiIiM2CpIiIiIjIDlioiIiIiM2CpIiIiIjIDlioiIiJq8WoNRpwvKIOusla2DLJfpoaIiIiosYxGgQvXynAwqwj7zxbi0IVilFXXYdXEXhgT7idLJpYqIiIisngF+iqk5emRXViO01d02H/uGq6VVZuMsbe1Rgn3VBERERH9z6Wicvx991lkFZQht7gCpdV1t4yxs7VCJ08njOrhg/4d3dErwBXWVgoZ0t7AUkVEREQW5/8OZOPHk1el+1YKoKOnE4I9neDvZo/oTu4Y0MkDdrbWMqY0xVJFREREFud4znUAwOO92uGFYZ3QTuMAe6XlFKjbYakiIiIii1JnMOKstgwAMDemCwLcHWRO1DCcUoGIiIgsyuXrlagxGKGysUJ7V3u54zQYSxURERFZlAvXbuylCvJwhJWMB543FksVERERWZScogoAQIBby/jarx6PqSIiIiJZXS+vwe50LQ5mFeH0FR2yCssBAJ7OKpmTNQ5LFRERETU7o1Hgo5/PIjnnOo5mX0eNwWiy3kFpjUd6yjMz+v1iqSIiIqJml3iuECv3npfud/RwxMgePugb5IZgTyf4aexlncjzfrBUERERUbM7lFUEAPBwUmLlU70Q3ckdCkXLKlG/xVJFREREzS4ltwQA8NqIEAwI9pA3jJmwVBEREVGzqDMY8WtWEVbFn8OxSzdmTO/s7SxzKvNhqSIiIqImd6GwDFM3HMWl/06XAAAxXb0Q4a+RL5SZsVQRERFRkyksrca3xy/jswPZKCitho2VArE9fDB3eOdWtZcKYKkiIiIiM6ozGJGhLUVCZgF2nNIiPU8vrVNaW+GbWdHo2V4jX8AmxFJFREREv0tNnRFJF4qwL6MA36dcwfWKWpP1QR6OeLpfAEb39IWfpuVcy6+xWKqIiIio0Q5fKMK/ki6hsLQaRy4Wm6xT2lhhULAHHurmjUFdPOHnYtfip0toCJYqIiIiahSDUWD2lydwrazaZPkfItsjupM7Rvf0hcrGWqZ08mGpIiIiokbJ0OqlQrXsiZ5o72qPyEDXNlmkbsZSRURERA0mhMBft50BADwQ7I4n+/jLnMhyWMkdgIiIiFqOM3mlSLpw4xIzMwZ3kjmNZWGpIiIiogb7/uQVAMBD3bwxpIunzGksC0sVERERNUhNnRHfJl8GAIzr1U7mNJaHx1QRERHRXVXVGnAipwT/dyAb18pq4GJvi+FdveSOZXFYqoiIiOi26gxGXLhWjjlfHsfZ/DIAgK21Ah88Gd7mz/S7HZYqIiIiMpGSW4K3t6Uj9XIJag0CAOCotMbgLp54tn8gBgR7yJzQMrFUERERkYmF353Cmf9es09pY4Xw9i54Z1wYurSyCyCbG0sVERERmbhcXAEA+OTZSDzU1RtWVq3/EjPmwLP/iIiISFJZY0BpdR0AILqTOwtVI7BUERERkSRfXwUAsLe1hrOKX2g1BksVERERSQr/e00/L7UKCgX3UjUGSxURERFJCvQ3SpWnk0rmJC0PSxURERFJ6r/+81bbyZyk5WGpIiIiIomWpeq+sVQRERGRpH5Pla8LS1VjsVQRERGRpL5Ueal5TFVjsVQRERGRpP5AdS9n7qlqLJYqIiIiklz775QKns5KmZO0PLKWqqVLl6Jv375wdnaGl5cXxo4di8zMTJMxQ4cOhUKhMLnNnDnTZExOTg5Gjx4NBwcHeHl5Yd68eairqzMZk5CQgN69e0OlUiE4OBgbNmy4Jc/q1avRoUMH2NnZISoqCkeOHDFZX1VVhdmzZ8Pd3R1OTk4YP3488vPzzfNiEBERyazOYIS+6sbnp8aBpaqxZC1ViYmJmD17Ng4dOoQ9e/agtrYWI0aMQHl5uc
m46dOnIy8vT7otW7ZMWmcwGDB69GjU1NTg4MGD+Pzzz7FhwwYsWrRIGpOdnY3Ro0dj2LBhSElJwdy5c/H888/jp59
|
||
|
},
|
||
|
"metadata": {},
|
||
|
"output_type": "display_data"
|
||
|
}
|
||
|
],
|
||
|
"source": [
|
||
|
"# Heaps' law\n",
|
||
|
"# https://en.wikipedia.org/wiki/Heaps'_law\n",
|
||
|
"plot_heaps_law(get_words(text))"
|
||
|
],
|
||
|
"metadata": {
|
||
|
"collapsed": false
|
||
|
}
|
||
|
}
|
||
|
],
|
||
|
"metadata": {
|
||
|
"kernelspec": {
|
||
|
"display_name": "Python 3",
|
||
|
"language": "python",
|
||
|
"name": "python3"
|
||
|
},
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 2
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython2",
|
||
|
"version": "2.7.6"
|
||
|
}
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 0
|
||
|
}
|