lab 15

lab 11-14
Merge branch 'wip'
2024-06-09 18:22:19 +02:00 · 2024-05-28 23:44:55 +02:00 · 2024-05-27 00:55:27 +02:00 · 2024-05-27 00:53:56 +02:00 · 2024-04-20 19:58:36 +02:00 · 2024-04-16 21:12:25 +02:00
10 changed files with 906 additions and 289 deletions
--- a/lab/lab_01.ipynb
+++ b/lab/lab_01.ipynb
@ -52,9 +52,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 5,
   "id": "narrow-romantic",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T11:05:09.046685900Z",
+     "start_time": "2024-04-13T11:05:08.877692800Z"
+    }
+   },
   "outputs": [],
   "source": [
    "translation_memory = [('Wciśnij przycisk Enter', 'Press the ENTER button'), \n",
@ -71,9 +76,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 6,
   "id": "indonesian-electron",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T11:05:09.131296300Z",
+     "start_time": "2024-04-13T11:05:08.893315Z"
+    }
+   },
   "outputs": [],
   "source": [
    "def tm_lookup(sentence):\n",
@ -82,9 +92,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 7,
   "id": "compact-trinidad",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T11:05:09.162547Z",
+     "start_time": "2024-04-13T11:05:08.924558500Z"
+    }
+   },
   "outputs": [
    {
     "data": {
@ -92,7 +107,7 @@
       "['Press the ENTER button']"
      ]
     },
-     "execution_count": 3,
+     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -119,9 +134,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 8,
   "id": "exposed-daniel",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T11:05:09.162547Z",
+     "start_time": "2024-04-13T11:05:08.946722400Z"
+    }
+   },
   "outputs": [],
   "source": [
    "translation_memory.append(('Drukarka jest wyłączona', 'The printer is switched off'))\n",
@ -139,9 +159,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 9,
   "id": "serial-velvet",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T11:05:09.162547Z",
+     "start_time": "2024-04-13T11:05:08.955053700Z"
+    }
+   },
   "outputs": [
    {
     "data": {
@ -149,7 +174,7 @@
       "['Press the ENTER button', 'Press the ENTER key']"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -176,9 +201,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 10,
   "id": "every-gibson",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T11:05:09.178168700Z",
+     "start_time": "2024-04-13T11:05:08.970677700Z"
+    }
+   },
   "outputs": [
    {
     "data": {
@ -186,7 +216,7 @@
       "[]"
      ]
     },
-     "execution_count": 6,
+     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -213,13 +243,19 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 21,
   "id": "protected-rings",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T11:05:12.496455200Z",
+     "start_time": "2024-04-13T11:05:12.465209700Z"
+    }
+   },
   "outputs": [],
   "source": [
    "def tm_lookup(sentence):\n",
-    "    return ''"
+    "    sentence = sentence.lower()\n",
+    "    return [entry[1] for entry in translation_memory if entry[0].lower() == sentence]"
   ]
  },
  {
@ -232,17 +268,22 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 22,
   "id": "severe-alloy",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T11:05:14.153976900Z",
+     "start_time": "2024-04-13T11:05:14.120474700Z"
+    }
+   },
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "''"
+       "[]"
      ]
     },
-     "execution_count": 18,
+     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -261,13 +302,24 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 23,
   "id": "structural-diesel",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T11:05:15.199517300Z",
+     "start_time": "2024-04-13T11:05:15.105892400Z"
+    }
+   },
   "outputs": [],
   "source": [
+    "import string\n",
+    "\n",
+    "def normalize(sentence):\n",
+    "    return sentence.translate(str.maketrans('', '', string.punctuation)).lower()\n",
+    "\n",
    "def tm_lookup(sentence):\n",
-    "    return ''"
+    "    sentence = normalize(sentence)\n",
+    "    return [entry[1] for entry in translation_memory if normalize(entry[0]) == sentence]"
   ]
  },
  {
@ -280,17 +332,22 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 24,
   "id": "brief-senegal",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T11:05:17.857048100Z",
+     "start_time": "2024-04-13T11:05:17.825799600Z"
+    }
+   },
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "''"
+       "[]"
      ]
     },
-     "execution_count": 12,
+     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -317,13 +374,49 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 25,
   "id": "mathematical-customs",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T12:00:14.223561700Z",
+     "start_time": "2024-04-13T12:00:14.159559100Z"
+    }
+   },
   "outputs": [],
   "source": [
+    "def find_similar(sentence):\n",
+    "    # Return the translations of all TM entries whose normalized source has the\n",
+    "    # same number of words as `sentence` and differs from it in at most one word.\n",
+    "    # `sentence` is expected to be normalized already (tm_lookup does that).\n",
+    "    # The result is always a list, like find_exact, so tm_lookup has a single\n",
+    "    # return type (the previous version returned a bare string on a hit).\n",
+    "    max_mismatches = 1\n",
+    "    words = sentence.split()\n",
+    "    results = []\n",
+    "    for entry in translation_memory:\n",
+    "        entry_words = normalize(entry[0]).split()\n",
+    "        if len(entry_words) != len(words):\n",
+    "            continue\n",
+    "        mismatches = 0\n",
+    "        for word, entry_word in zip(words, entry_words):\n",
+    "            if word != entry_word:\n",
+    "                mismatches += 1\n",
+    "        if mismatches <= max_mismatches:\n",
+    "            results.append(entry[1])\n",
+    "    return results\n",
+    "\n",
+    "\n",
+    "def find_exact(sentence):\n",
+    "    return [entry[1] for entry in translation_memory if normalize(entry[0]) == sentence]\n",
+    "\n",
+    "\n",
    "def tm_lookup(sentence):\n",
-    "    return ''"
+    "    sentence = normalize(sentence)\n",
+    "    exact_match = find_exact(sentence)\n",
+    "    if not exact_match:\n",
+    "        return find_similar(sentence)\n",
+    "    else:\n",
+    "        return exact_match"
   ]
  },
  {
@ -344,9 +437,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 26,
   "id": "humanitarian-wrong",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T12:00:18.016836500Z",
+     "start_time": "2024-04-13T12:00:17.992836400Z"
+    }
+   },
   "outputs": [],
   "source": [
    "glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]"
@ -362,9 +460,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 27,
   "id": "located-perception",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T12:02:06.039160400Z",
+     "start_time": "2024-04-13T12:02:06.015160400Z"
+    }
+   },
   "outputs": [],
   "source": [
    "def glossary_lookup(sentence):\n",
@ -374,9 +477,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 28,
   "id": "advised-casting",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T12:02:06.846998600Z",
+     "start_time": "2024-04-13T12:02:06.823447800Z"
+    }
+   },
   "outputs": [
    {
     "data": {
@ -384,7 +492,7 @@
       "[('przycisk', 'button'), ('drukarka', 'printer')]"
      ]
     },
-     "execution_count": 17,
+     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -406,7 +514,9 @@
   "id": "defensive-fifteen",
   "metadata": {},
   "source": [
-    "Odpowiedź:"
+    "Odpowiedź: \n",
+    "złożoność pesymistyczna: m*n\n",
+    "złożoność optymistyczna: m"
   ]
  },
  {
@ -421,11 +531,17 @@
   "cell_type": "code",
   "execution_count": 19,
   "id": "original-tunisia",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T11:05:09.247171300Z",
+     "start_time": "2024-04-13T11:05:09.124790700Z"
+    }
+   },
   "outputs": [],
   "source": [
    "def glossary_lookup(sentence):\n",
-    "    return ''"
+    "    sentence_words = sentence.lower().split()\n",
+    "    return [entry for entry in glossary if entry[0].lower() in sentence_words]"
   ]
  },
  {
@ -440,11 +556,25 @@
   "cell_type": "code",
   "execution_count": 20,
   "id": "adolescent-semiconductor",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-13T11:05:09.247171300Z",
+     "start_time": "2024-04-13T11:05:09.146924500Z"
+    }
+   },
   "outputs": [],
   "source": [
    "def glossary_lookup(sentence):\n",
-    "    return ''"
+    "    sentence_words = sentence.lower().split()\n",
+    "    entry_words = []\n",
+    "    for entry in glossary:\n",
+    "        entry_words.append((entry[0].lower(), entry[1]))\n",
+    "    result = []\n",
+    "    for word in sentence_words:\n",
+    "        for entry_word in entry_words:\n",
+    "            if entry_word[0] == word:\n",
+    "                result.append(entry_word)\n",
+    "    return result"
   ]
  }
 ],
@ -452,7 +582,7 @@
  "author": "Rafał Jaworski",
  "email": "rjawor@amu.edu.pl",
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -467,7 +597,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.9.2"
  },
  "subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
  "title": "Komputerowe wspomaganie tłumaczenia",
--- a/lab/lab_02.ipynb
+++ b/lab/lab_02.ipynb
@ -40,9 +40,11 @@
   ]
  },
  {
-   "cell_type": "markdown",
-   "id": "existing-approval",
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "961796fd-4463-4a17-ac15-afe712b3959e",
   "metadata": {},
+   "outputs": [],
   "source": [
    "Jedną z funkcji dostępnych we wszystkich większych programach do wspomagania tłumaczenia jest znajdowanie bardzo pewnych dopasowań w pamięci tłumaczeń. Są one zwane **ICE** (In-Context Exact match) lub 101% match. Są to takie dopasowania z pamięci tłumaczeń, dla których nie tylko zdanie źródłowe z TM jest identyczne z tłumaczonym, ale także poprzednie zdanie źródłowe z TM zgadza się z poprzednim zdaniem tłumaczonym oraz następne z TM z następnym tłumaczonym."
   ]
@ -85,8 +87,31 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "def exact_match(sentence):\n",
+    "    for key, entry in enumerate(translation_memory):\n",
+    "        if entry[0] == sentence:\n",
+    "            return key, entry[1]\n",
+    "    return None, None\n",
+    "\n",
+    "\n",
+    "def has_exact_match_on_index(index, sentence):\n",
+    "    return translation_memory[index][0] == sentence\n",
+    "\n",
+    "\n",
    "def ice_lookup(sentence, prev_sentence, next_sentence):\n",
-    "    return []"
+    "    # ICE lookup: an exact match whose TM neighbours also match the given context.\n",
+    "    index, match = exact_match(sentence)\n",
+    "    if index is None:\n",
+    "        return []\n",
+    "    # Compare a neighbour only if it exists (index + 1 < len avoids IndexError on the last entry).\n",
+    "    if next_sentence \\\n",
+    "            and index + 1 < len(translation_memory) \\\n",
+    "            and not has_exact_match_on_index(index + 1, next_sentence):\n",
+    "        return []\n",
+    "    if prev_sentence and index > 0 \\\n",
+    "            and not has_exact_match_on_index(index - 1, prev_sentence):\n",
+    "        return []\n",
+    "    return [match]"
   ]
  },
  {
@ -141,7 +166,7 @@
   "id": "graduate-theorem",
   "metadata": {},
   "source": [
-    "Odpowiedź:"
+    "Odpowiedź: Nie. 1, 3, 4."
   ]
  },
  {
@ -179,7 +204,7 @@
   "id": "metallic-leave",
   "metadata": {},
   "source": [
-    "Odpowiedź:"
+    "Odpowiedź: Tak. 1, 2, 3, 4."
   ]
  },
  {
@ -206,7 +231,17 @@
   "id": "bibliographic-stopping",
   "metadata": {},
   "source": [
-    "Odpowiedź:"
+    "Odpowiedź: Tak.\n",
+    "1. Liczba operacji wykonanych nie może być ujemna.\n",
+    "2. Gdy x == y, nie są wymagane żadne operacje edycyjne, więc wynik funkcji to 0.\n",
+    "3. Zmiana jednego łańcucha znaków w drugi, wymaga tyle samo operacji edycji, co zmiana drugiego w pierwszy.\n",
+    "   Studia -> Studiel = 2; Studiel -> Studia = 2; 2 == 2\n",
+    "4. Istnieją trzy opcje\n",
+    "   -   Jeżeli x == y == z, to 0 + 0 == 0\n",
+    "   -   Jeżeli x == y, x != z, a x -> z = n, to y -> z = n, więc albo 0 + n == n, albo n + n > 0\n",
+    "   -   Jeżeli x != y != z, to im z jest bliżej do x, tym jest dalej od y (jednostką odległości jest liczba przekształceń). Można by to przedstawić graficznie jako trójkąt (x, y, z). z stanowi punkt na pośredniej drodze pomiędzy x i y, która nie może być krótsza niż droga bezpośrednia - wynika to z nierówności trójkąta.\n",
+    "       Studia -> Studiel = 2; Studiel -> udia = 4; udia -> Studia = 2;\n",
+    "       2 + 4 > 2; 2 + 2 == 4"
   ]
  },
  {
@ -214,6 +249,7 @@
   "id": "attended-channels",
   "metadata": {},
   "source": [
+    "\n",
    "W Pythonie dostępna jest biblioteka zawierająca implementację dystansu Levenshteina. Zainstaluj ją w swoim systemie przy użyciu polecenia:\n",
    "\n",
    "`pip3 install python-Levenshtein`\n",
@ -223,21 +259,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
-   "id": "secondary-wrist",
+   "execution_count": null,
+   "id": "355e4914-08da-4bd4-b8a2-67b055831c30",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "2"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "from Levenshtein import distance as levenshtein_distance\n",
    "\n",
@ -314,22 +339,9 @@
   ]
  },
  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "invisible-cambodia",
+   "cell_type": "raw",
+   "id": "4a47854f-df2e-451f-8e09-99f59210f86f",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "0.631578947368421"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
   "source": [
    "levenshtein_similarity('Spróbuj wyłączyć i włączyć komputer', 'Nie próbuj wyłączać i włączać drukarki')"
   ]
@ -350,7 +362,11 @@
   "outputs": [],
   "source": [
    "def fuzzy_lookup(sentence, threshold):\n",
-    "    return []"
+    "    results = []\n",
+    "    for entry in translation_memory:\n",
+    "        if levenshtein_similarity(entry[0], sentence) >= threshold:\n",
+    "            results.append(entry[1])\n",
+    "    return results"
   ]
  }
 ],
@ -358,7 +374,7 @@
  "author": "Rafał Jaworski",
  "email": "rjawor@amu.edu.pl",
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -373,7 +389,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.9.2"
  },
  "subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
  "title": "Komputerowe wspomaganie tłumaczenia",
--- a/lab/lab_03.ipynb
+++ b/lab/lab_03.ipynb
@ -63,7 +63,7 @@
   "id": "diverse-sunglasses",
   "metadata": {},
   "source": [
-    "Odpowiedź:"
+    "Odpowiedź: \"metal cabinet guides\". https://translate.google.pl/"
   ]
  },
  {
@ -115,7 +115,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "dictionary = ['program', 'application', 'applet' 'compile']"
+    "dictionary = ['program', 'application', 'applet', 'compile']"
   ]
  },
  {
@ -133,8 +133,18 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "import re\n",
+    "\n",
    "def terminology_lookup():\n",
-    "    return []"
+    "    result = []\n",
+    "    regex = ''\n",
+    "    for word in dictionary:\n",
+    "        if regex != '':\n",
+    "            regex += '|'\n",
+    "        regex += '(' + word + ')'\n",
+    "    for occurrence in re.finditer(regex, text, re.I):\n",
+    "        result.append((occurrence.group(), occurrence.start(), occurrence.end()))\n",
+    "    return result"
   ]
  },
  {
@ -161,116 +171,34 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
   "id": "tribal-attention",
-   "metadata": {},
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2024-04-20T15:23:32.727687100Z",
+     "start_time": "2024-04-20T15:23:24.826454500Z"
+    }
+   },
   "outputs": [
    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      " \n",
-      "for\n",
-      "all\n",
-      "Java\n",
-      "programmer\n",
-      ":\n",
-      "this\n",
-      "section\n",
-      "explain\n",
-      "how\n",
-      "to\n",
-      "compile\n",
-      "and\n",
-      "run\n",
-      "a\n",
-      "swing\n",
-      "application\n",
-      "from\n",
-      "the\n",
-      "command\n",
-      "line\n",
-      ".\n",
-      "for\n",
-      "information\n",
-      "on\n",
-      "compile\n",
-      "and\n",
-      "run\n",
-      "a\n",
-      "swing\n",
-      "application\n",
-      "use\n",
-      "NetBeans\n",
-      "IDE\n",
-      ",\n",
-      "see\n",
-      "Running\n",
-      "Tutorial\n",
-      "Examples\n",
-      "in\n",
-      "NetBeans\n",
-      "IDE\n",
-      ".\n",
-      "the\n",
-      "compilation\n",
-      "instruction\n",
-      "work\n",
-      "for\n",
-      "all\n",
-      "swing\n",
-      "program\n",
-      "—\n",
-      "applet\n",
-      ",\n",
-      "as\n",
-      "well\n",
-      "as\n",
-      "application\n",
-      ".\n",
-      "here\n",
-      "be\n",
-      "the\n",
-      "step\n",
-      "-PRON-\n",
-      "need\n",
-      "to\n",
-      "follow\n",
-      ":\n",
-      "install\n",
-      "the\n",
-      "late\n",
-      "release\n",
-      "of\n",
-      "the\n",
-      "Java\n",
-      "SE\n",
-      "platform\n",
-      ",\n",
-      "if\n",
-      "-PRON-\n",
-      "have\n",
-      "not\n",
-      "already\n",
-      "do\n",
-      "so\n",
-      ".\n",
-      "create\n",
-      "a\n",
-      "program\n",
-      "that\n",
-      "use\n",
-      "Swing\n",
-      "component\n",
-      ".\n",
-      "compile\n",
-      "the\n",
-      "program\n",
-      ".\n",
-      "run\n",
-      "the\n",
-      "program\n",
-      ".\n"
+     "ename": "KeyboardInterrupt",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
+      "\u001B[1;31mKeyboardInterrupt\u001B[0m                         Traceback (most recent call last)",
+      "Cell \u001B[1;32mIn[1], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mspacy\u001B[39;00m\n\u001B[0;32m      2\u001B[0m nlp \u001B[38;5;241m=\u001B[39m spacy\u001B[38;5;241m.\u001B[39mload(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124men_core_web_sm\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m      4\u001B[0m doc \u001B[38;5;241m=\u001B[39m nlp(text)\n",
+      "File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\__init__.py:13\u001B[0m\n\u001B[0;32m     10\u001B[0m \u001B[38;5;66;03m# These are imported as part of the API\u001B[39;00m\n\u001B[0;32m     11\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mthinc\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mapi\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Config, prefer_gpu, require_cpu, require_gpu  \u001B[38;5;66;03m# noqa: F401\u001B[39;00m\n\u001B[1;32m---> 13\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m pipeline  \u001B[38;5;66;03m# noqa: F401\u001B[39;00m\n\u001B[0;32m     14\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m util\n\u001B[0;32m     15\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mabout\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m __version__  \u001B[38;5;66;03m# noqa: F401\u001B[39;00m\n",
+      "File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\pipeline\\__init__.py:1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mattributeruler\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m AttributeRuler\n\u001B[0;32m      2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdep_parser\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m DependencyParser\n\u001B[0;32m      3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01medit_tree_lemmatizer\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m EditTreeLemmatizer\n",
+      "File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\pipeline\\attributeruler.py:8\u001B[0m\n\u001B[0;32m      6\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m util\n\u001B[0;32m      7\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01merrors\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Errors\n\u001B[1;32m----> 8\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mlanguage\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Language\n\u001B[0;32m      9\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mmatcher\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Matcher\n\u001B[0;32m     10\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mscorer\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Scorer\n",
+      "File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\language.py:43\u001B[0m\n\u001B[0;32m     41\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mlang\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mtokenizer_exceptions\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m BASE_EXCEPTIONS, URL_MATCH\n\u001B[0;32m     42\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mlookups\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m load_lookups\n\u001B[1;32m---> 43\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mpipe_analysis\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m analyze_pipes, print_pipe_analysis, validate_attrs\n\u001B[0;32m     44\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mschemas\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m (\n\u001B[0;32m     45\u001B[0m     ConfigSchema,\n\u001B[0;32m     46\u001B[0m     ConfigSchemaInit,\n\u001B[1;32m   (...)\u001B[0m\n\u001B[0;32m     49\u001B[0m     validate_init_settings,\n\u001B[0;32m     50\u001B[0m )\n\u001B[0;32m     51\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mscorer\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Scorer\n",
+      "File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\pipe_analysis.py:6\u001B[0m\n\u001B[0;32m      3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mwasabi\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m msg\n\u001B[0;32m      5\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01merrors\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Errors\n\u001B[1;32m----> 6\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mtokens\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Doc, Span, Token\n\u001B[0;32m      7\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutil\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m dot_to_dict\n\u001B[0;32m      9\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m TYPE_CHECKING:\n\u001B[0;32m     10\u001B[0m     \u001B[38;5;66;03m# This lets us add type hints for mypy etc. without causing circular imports\u001B[39;00m\n",
+      "File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\tokens\\__init__.py:1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m_serialize\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m DocBin\n\u001B[0;32m      2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdoc\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Doc\n\u001B[0;32m      3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mmorphanalysis\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m MorphAnalysis\n",
+      "File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\tokens\\_serialize.py:14\u001B[0m\n\u001B[0;32m     12\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01merrors\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Errors\n\u001B[0;32m     13\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutil\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m SimpleFrozenList, ensure_path\n\u001B[1;32m---> 14\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mvocab\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Vocab\n\u001B[0;32m     15\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m_dict_proxies\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m SpanGroups\n\u001B[0;32m     16\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdoc\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m DOCBIN_ALL_ATTRS \u001B[38;5;28;01mas\u001B[39;00m ALL_ATTRS\n",
+      "File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\vocab.pyx:1\u001B[0m, in \u001B[0;36minit spacy.vocab\u001B[1;34m()\u001B[0m\n",
+      "File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\tokens\\doc.pyx:1\u001B[0m, in \u001B[0;36minit spacy.tokens.doc\u001B[1;34m()\u001B[0m\n",
+      "File \u001B[1;32m<frozen importlib._bootstrap>:404\u001B[0m, in \u001B[0;36mparent\u001B[1;34m(self)\u001B[0m\n",
+      "\u001B[1;31mKeyboardInterrupt\u001B[0m: "
     ]
    }
   ],
@ -308,7 +236,12 @@
   "outputs": [],
   "source": [
    "def terminology_lookup():\n",
-    "    return []"
+    "    result = []\n",
+    "    for token in doc:\n",
+    "        if token.lemma_ in dictionary:\n",
+    "            result.append((token, token.idx, token.idx + len(token)))\n",
+    "\n",
+    "    return result"
   ]
  },
  {
@ -343,7 +276,13 @@
   "outputs": [],
   "source": [
    "def get_nouns(text):\n",
-    "    return []"
+    "    result = []\n",
+    "    doc = nlp(text)\n",
+    "    for token in doc:\n",
+    "        if token.pos_ == 'NOUN':\n",
+    "            result.append(token)\n",
+    "\n",
+    "    return result"
   ]
  },
  {
@ -380,7 +319,16 @@
   "outputs": [],
   "source": [
    "def extract_terms(text):\n",
-    "    return []"
+    "    result = {}\n",
+    "    doc = nlp(text)\n",
+    "    for token in doc:\n",
+    "        if token.pos_ == 'NOUN':\n",
+    "            if result.get(token.lemma_) is None:\n",
+    "                result[token.lemma_] = 1\n",
+    "            else:\n",
+    "                result[token.lemma_] += 1\n",
+    "\n",
+    "    return result"
   ]
  },
  {
@ -399,7 +347,16 @@
   "outputs": [],
   "source": [
    "def extract_terms(text):\n",
-    "    return []"
+    "    result = {}\n",
+    "    doc = nlp(text)\n",
+    "    for token in doc:\n",
+    "        if token.pos_ in ['NOUN', 'VERB', 'ADJ']:\n",
+    "            if result.get(token.lemma_) is None:\n",
+    "                result[token.lemma_] = 1\n",
+    "            else:\n",
+    "                result[token.lemma_] += 1\n",
+    "\n",
+    "    return result"
   ]
  }
 ],
@ -407,7 +364,7 @@
  "author": "Rafał Jaworski",
  "email": "rjawor@amu.edu.pl",
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -422,7 +379,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.9.2"
  },
  "subtitle": "3. Terminologia",
  "title": "Komputerowe wspomaganie tłumaczenia",
--- a/lab/lab_04-05.ipynb
+++ b/lab/lab_04-05.ipynb
--- a/lab/lab_06-07.ipynb
+++ b/lab/lab_06-07.ipynb
@ -60,8 +60,14 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "import regex\n",
+    "\n",
+    "\n",
    "def find_tags(text):\n",
-    "    return []"
+    "    result = []\n",
+    "    for occurance in regex.finditer(\"(\\</?\\w+\\>)\", text, regex.IGNORECASE):\n",
+    "        result.append(occurance.span())\n",
+    "    return result"
   ]
  },
  {
@ -79,8 +85,12 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "import regex\n",
+    "\n",
+    "\n",
+    "# Assuming text is a single word\n",
    "def is_translatable(text):\n",
-    "    return True"
+    "    return regex.fullmatch(\"[A-Z\\-]+\", text, regex.IGNORECASE) is not None"
   ]
  },
  {
@ -98,8 +108,26 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "import regex\n",
+    "\n",
+    "\n",
    "def find_dates(text):\n",
-    "    return []"
+    "    # Find the first D/M/YYYY date (separators / . -) anywhere in text.\n",
+    "    # Returns a dict with int values under 'day', 'month', 'year', or None when text has no date.\n",
+    "    regex_format = regex.compile(\"(?P<day>[0-9]{1,2})[/.-](?P<month>[0-9]{1,2})[/.-](?P<year>[0-9]{4})\")\n",
+    "    # search() scans the whole text; match() only tried position 0.\n",
+    "    matches = regex_format.search(text)\n",
+    "    if matches is None:\n",
+    "        return None\n",
+    "\n",
+    "    return {part: int(matches.group(part)) for part in ('day', 'month', 'year')}\n",
+    "\n",
+    "\n",
+    "print(find_dates(\"01/02/1970\"))\n",
+    "print(find_dates(\"01.02.1970\"))\n",
+    "print(find_dates(\"01-02-1970\"))\n",
+    "print(find_dates(\"1/2/1970\"))\n",
+    "print(find_dates(\"1.2.1970\"))"
   ]
  },
  {
@ -130,8 +158,22 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "formats = {\n",
+    "    'd/m/y': lambda date: f\"{date['day']}/{date['month']}/{date['year']}\",\n",
+    "    'y-m-d': lambda date: f\"{date['year']}-{date['month']}-{date['day']}\",\n",
+    "}\n",
+    "\n",
+    "\n",
    "def correct_dates(source_segment, target_segment, date_format):\n",
-    "    return ''"
+    "    source_date = find_dates(source_segment)\n",
+    "    target_date = find_dates(target_segment)\n",
+    "    if target_date != source_date:\n",
+    "        print('Dates differ')\n",
+    "\n",
+    "    return formats[date_format](source_date)\n",
+    "\n",
+    "\n",
+    "print(correct_dates(\"1.2.1970\", \"1.2.1970\", 'y-m-d'))"
   ]
  },
  {
@ -190,7 +232,7 @@
  "author": "Rafał Jaworski",
  "email": "rjawor@amu.edu.pl",
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -205,7 +247,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.9.2"
  },
  "subtitle": "6,7. Preprocessing i postprocessing",
  "title": "Komputerowe wspomaganie tłumaczenia",
--- a/lab/lab_09-10.ipynb
+++ b/lab/lab_09-10.ipynb
@ -50,37 +50,36 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Nastolatek ukradł znajomemu 4500 złotych. Wcześniej pił z nim alkohol\n",
-      "Czekają nas kolejne podwyżki rachunków. Tym razem za ogrzewanie i ciepłą wodę\n",
-      "Nie żyje Piotr Ś. Czyściciel kamienic miał 47 lat\n",
-      "Maciej Skorża nie zmienił zdania o systemie na mecz z Rakowem. Kolejorz ma szybką okazję do rehabilitacji\n",
-      "Kto zabił Kazimierę Kurkowiak? Poznańskie Archiwum X wraca do sprawy sprzed 30 lat\n",
-      "Mieszkańcy osiedla Kwiatowego zyskają nowy chodnik\n",
-      "Poznańskie ZOO ponownie się otwiera i apeluje o kupowanie biletów online\n",
-      "1700 zł mandatu dla motocyklisty: nie ma prawa jazdy, jechał za szybko\n",
-      "Plac Wolności ma tętnić życiem. Jest koncepcja zagospodarowania\n",
-      "Dzikie wysypisko w Wielkopolskim Parku Narodowym, a w nim paczka z telefonem odbiorcy\n",
-      "Dobre wieści z Łazarza! \"Zielona Perła\" sprzedana!\n",
-      "Sokoły wędrowne w gnieździe na kominie poznańskiej elektrociepłowni! Są 4 młode\n",
-      "720 nowych zakażeń w Wielkopolsce\n",
-      "Uderzył kobietę w sklepie: \"sprawca będzie rozliczony\"\n",
-      "Zespół Szkół Geodezyjno- Drogowych. Przyszłość rysuje się w kolorowych barwach!\n",
-      "Tajemniczy wypadek i pożar pod Kwilczem. Auto spłonęło, w środku nikogo nie było\n",
-      "Nad Jeziorem Maltańskim powstanie duży hotel? \"Ma uzupełniać infrastrukturę sportową\"\n",
-      "Śmiertelny wypadek na trasie S8: samochód potrącił rowerzystę\n",
-      "Specjaliści o poszukiwaniu Natalii Lick: \"niestety trop psa prowadził na Wartostradę\"\n",
-      "Korki przy skrzyżowaniu Grochowska / Grunwaldzka: ruszyły prace!\n",
-      "Restauracja w Kaliszu przyjmuje klientów: sanepid i policja \"odwiedzili\" lokal\n",
-      "Ile kosztuje wywóz odpadów?\n",
-      "Dachowanie auta na trasie Konin - Turek\n",
-      "Kierowca BMW pod wpływem narkotyków, pasażer w ich posiadaniu. Obaj zostali zatrzymani\n",
-      "Leszno: mężczyzna uderzył klientkę sklepu. Poszło o maseczkę?\n",
-      "Od poniedziałku zapłacimy za parkowanie na kolejnych ulicach\n",
-      "Włamał się do obiektu handlowego. Grozi mu nawet 15 lat więzienia\n",
-      "Rondo Śródka: kolizja z udziałem dwóch pojazdów\n",
-      "Europoseł PSL: oświadczenie Episkopatu ma wpływ na proces szczepień. \"Bardzo dużo ludzi zrezygnowało\"\n",
-      "Bezcenna wygrana Enea Energetyka. Poznanianki zagrają w fazie play-off\n",
-      "No to w drogę! Po odmienionych trasach w Wielkopolsce\n"
+      "W Poznaniu uroczyście odsłonięto monument upamiętniający cmentarz żydowski założony jeszcze w XIX wieku\n",
+      "Przez ulice Poznania przejdzie Marsz dla Życia. Będą utrudnienia\n",
+      "Sierść psa zatopiona w żywicy? Taką biżuterię pamiątkową zlecają właściciele czworonożnych pociech\n",
+      "Nagrał film w jednej z poznańskich \"Biedronek\". Kilka spleśniałych cytryn w kartonie. \"Nikt się tym nie przejmuje\"\n",
+      "Gniezno: poszkodowani po ulewie będą mogli ubiegać się o pomoc w ZUS i US. Powstała również specjalna infolinia\n",
+      "Zostawiła jedzenie dla potrzebujących. Coraz więcej głodnych osób, którym nie wystarcza pieniędzy po opłaceniu rachunków\n",
+      "Kolejne ostrzeżenie I stopnia od IMGW. Oprócz burz może wystąpić również grad\n",
+      "Lech przegrał Koroną. Na trybunach marsz żałobny i 'mamy k**** dość'\n",
+      "Warta Poznań po przegranej z Jagielonią Białystok spada do I ligi\n",
+      "Mieszkańcy skarżą się na właściciela samochodu, w którym notorycznie włącza się alarm. \"Uprzykrza nam to życie!\"\n",
+      "Leśne Placówki Montessori\n",
+      "Na autostradzie samochód wpadł w poślizg i stanął w poprzek. Są spore utrudnienia\n",
+      "Wróciła plaga kradzieży katalizatorów. Zmora dla kierowców, którzy nie mogą garażować auta\n",
+      "Nowy basen w Kiekrzu? W tunelu wody przybyło po same kolana\n",
+      "Pierożki Dim Sum z Para Bar Rataje ze specjalną zniżką!\n",
+      "Wielka głowa Darii Zawiałow zablokowała przez chwilę przejście dla pieszych na jednej z poznańskich ulic\n",
+      "Fałszywy pożar w centrum Poznania. Kłęby dymu w kamienicy?\n",
+      "Jest kolejne ostrzeżenie pierwszego stopnia, tym razem hydrologiczne. Gwałtowny wzrost stanu wody\n",
+      "Uwaga. Utrudnienia na drodze i ograniczenie prędkości. Potrwa to około 5 godzin\n",
+      "Chcą pobić rekord w kręceniu lodów. Tona lodów w ciągu doby\n",
+      "Jest ostrzeżenie IMGW dla Wielkopolski. Lepiej schować przedmioty, które mogą przemieścić się pod wypływem silnego wiatru\n",
+      "Nowe Centrum Medyczne Bizpark już w sprzedaży. Znajdź idealny lokal pod swoją działalność medyczną\n",
+      "Rondo Obornickie: zderzenie samochodu z motocyklem. Poszkodowany został odwieziony do szpitala. Chwilowe utrudnienia\n",
+      "Policjanci publikują wizerunek i szukają tego mężczyzny\n",
+      "Grupa Stonewall będzie miała program na antenie TVP3 Poznań. \"To będzie odtrutka na lata dezinformacji\"\n",
+      "Ruszył remont ważnego mostu. Co z kłódkami zakochanych?\n",
+      "Mieszkaniec spotkał wilka w Poznaniu?\n",
+      "Włamanie do... lokomotywy\n",
+      "W nadwarciański krajobraz wpisały się... żurawie. \"Jeden jest największy na świecie\"\n",
+      "Robisz remont? Za to możesz słono zapłacić!\n"
     ]
    }
   ],
@ -108,13 +107,51 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 10,
   "id": "moving-clothing",
   "metadata": {},
   "outputs": [],
   "source": [
-    "def get_names(article_type):\n",
-    "    return []"
+    "from bs4 import element\n",
+    "\n",
+    "def get_names(article_type, page_nr: int = 0):\n",
+    "    url = 'https://www.ceneo.pl/;szukaj-' + article_type + ';0020-30-0-0-' + str(page_nr) + '.htm'\n",
+    "    page = requests.get(url)\n",
+    "    if page_nr != 0 and url != page.url:\n",
+    "        return []\n",
+    "    soup = BeautifulSoup(page.content, 'html.parser')\n",
+    "    result = []\n",
+    "\n",
+    "    def is_product_title_container(tag: element.Tag) -> bool:\n",
+    "        if not tag.has_attr('class'):\n",
+    "            return False\n",
+    "\n",
+    "        classes = tag.attrs['class']\n",
+    "        if len(classes) != 1:\n",
+    "            return False\n",
+    "\n",
+    "        return classes[0] == 'cat-prod-row__name'\n",
+    "\n",
+    "    def is_product_title(tag: element.Tag) -> bool:\n",
+    "        if not tag.has_attr('class'):\n",
+    "            return True\n",
+    "\n",
+    "        classes = tag.attrs['class']\n",
+    "        if len(classes) != 1:\n",
+    "            return False\n",
+    "\n",
+    "        return classes[0] == 'font-bold'\n",
+    "\n",
+    "    for tag in soup.find_all(is_product_title_container):\n",
+    "        href = tag.find('a')\n",
+    "        if type(href) is not element.Tag:\n",
+    "            continue\n",
+    "        spans = href.find_all('span')\n",
+    "        for span in spans:\n",
+    "            if is_product_title(span):\n",
+    "                result.append(span.text)\n",
+    "\n",
+    "    return result"
   ]
  },
  {
@ -135,13 +172,21 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "id": "german-dispute",
   "metadata": {},
   "outputs": [],
   "source": [
    "def scrape_names():\n",
-    "    return []"
+    "    result = []\n",
+    "    search = 'laptop'\n",
+    "    page = 0\n",
+    "    while True:\n",
+    "        local_result = get_names(search, page)\n",
+    "        if len(local_result) == 0:\n",
+    "            return result\n",
+    "        result = result + local_result\n",
+    "        page += 1"
   ]
  },
  {
@ -197,13 +242,39 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "id": "regulation-sheriff",
   "metadata": {},
   "outputs": [],
   "source": [
    "def scrape_wmi():\n",
-    "    return []"
+    "    def get_text(soup_l: BeautifulSoup) -> str:\n",
+    "        for trash in soup_l(['script', 'style']):\n",
+    "            trash.extract()\n",
+    "\n",
+    "        text = soup_l.get_text()\n",
+    "\n",
+    "        return re.sub(r'\\s+', ' ', text)\n",
+    "\n",
+    "    result = []\n",
+    "\n",
+    "    base_url = 'https://wmi.amu.edu.pl/'\n",
+    "    page = requests.get(base_url)\n",
+    "    soup = BeautifulSoup(page.content, 'html.parser')\n",
+    "    result.append(get_text(soup))\n",
+    "    for href in soup.find_all('a'):\n",
+    "        if type(href) != element.Tag:\n",
+    "            continue\n",
+    "\n",
+    "        if not href.has_attr('href'):\n",
+    "            continue\n",
+    "\n",
+    "        if base_url in href.attrs['href']:\n",
+    "            sub_page = requests.get(href.attrs['href'])\n",
+    "            result.append(get_text(BeautifulSoup(sub_page.content, 'html.parser')))\n",
+    "\n",
+    "\n",
+    "    return result"
   ]
  },
  {
@ -222,30 +293,97 @@
    "### Ćwiczenie 4: Pobierz jak najwięcej słów w języku albańskim z serwisu glosbe.com."
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "706d6cba-c7a7-4d1b-9c2f-eb2119f859b5",
+   "metadata": {},
+   "source": [
+    "Nie jest to rozwiązanie zbalansowane, ale pobierze najwięcej słów (przy odpowiedniej rotacji adresów IP, z których korzystamy; założyłem, że kwestia infrastruktury i tego, jak strona jest chroniona przed atakami DoS, jest poza zakresem tego zadania)."
+   ]
+  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "id": "surgical-ozone",
   "metadata": {},
   "outputs": [],
   "source": [
    "def scrape_shqip():\n",
-    "    return []"
+    "    import string\n",
+    "\n",
+    "    result = []\n",
+    "    letters = list(string.ascii_lowercase)\n",
+    "    letters_count = len(letters)\n",
+    "    longest_sensible_english_word_len = 28\n",
+    "    base_url = 'https://glosbe.com/en/sq/'\n",
+    "\n",
+    "    def get_words(word_l: str) -> list[str]:\n",
+    "        def is_translated_word(tag: element.Tag) -> bool:\n",
+    "            if not tag.has_attr('id') or not tag.has_attr('lang'):\n",
+    "                return False\n",
+    "\n",
+    "            if not 'translation__' in tag.attrs['id'] or 'sq' != tag.attrs['lang']:\n",
+    "                return False\n",
+    "\n",
+    "            return True\n",
+    "\n",
+    "        result_l = []\n",
+    "        page = requests.get(base_url + word_l)\n",
+    "        soup = BeautifulSoup(page.content, 'html.parser')\n",
+    "        words_l = soup.find_all(is_translated_word)\n",
+    "        for word_l in words_l:\n",
+    "            text = word_l.text\n",
+    "            result_l.append(re.sub(r'\\s+', ' ', text))\n",
+    "\n",
+    "        return result_l\n",
+    "\n",
+    "    def trans(word_l: list[int]) -> str:\n",
+    "        result_l = ''\n",
+    "        for letter_l in word_l:\n",
+    "            result_l += letters[letter_l]\n",
+    "\n",
+    "        return result_l\n",
+    "\n",
+    "    def increment(word_l: list[int]) -> list[int]:\n",
+    "        done = False\n",
+    "        result_l = []\n",
+    "        for letter_l in word_l:\n",
+    "            if done:\n",
+    "                result_l.append(letter_l)\n",
+    "                continue\n",
+    "            next_letter_l = letter_l + 1\n",
+    "            if next_letter_l == letters_count:\n",
+    "                result_l.append(0)\n",
+    "                continue\n",
+    "\n",
+    "            result_l.append(next_letter_l)\n",
+    "            done = True\n",
+    "\n",
+    "        return result_l\n",
+    "\n",
+    "    for length in range(longest_sensible_english_word_len - 1):\n",
+    "        length += 1\n",
+    "        combos = pow(length, letters_count)\n",
+    "        word = []\n",
+    "        for pos in range(length):\n",
+    "            word.append(0)\n",
+    "        for i in range(combos):\n",
+    "            result.append(get_words(trans(word)))\n",
+    "            word = increment(word)\n",
+    "\n",
+    "    return result"
   ]
  }
 ],
 "metadata": {
  "author": "Rafał Jaworski",
  "email": "rjawor@amu.edu.pl",
-  "lang": "pl",
-  "subtitle": "9,10. Web scraping",
-  "title": "Komputerowe wspomaganie tłumaczenia",
-  "year": "2021",
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
+  "lang": "pl",
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
@ -256,8 +394,11 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  }
+   "version": "3.10.4"
+  },
+  "subtitle": "9,10. Web scraping",
+  "title": "Komputerowe wspomaganie tłumaczenia",
+  "year": "2021"
 },
 "nbformat": 4,
 "nbformat_minor": 5
--- a/lab/lab_11.ipynb
+++ b/lab/lab_11.ipynb
@ -52,13 +52,22 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
   "id": "german-dispute",
   "metadata": {},
   "outputs": [],
   "source": [
    "def sentence_split(text):\n",
-    "    return []"
+    "    def purge(text_l: str) -> str:\n",
+    "        return text_l.strip('.').strip()\n",
+    "    index = 0\n",
+    "    result = []\n",
+    "    for match in regex.finditer(r'\\. \\p{Lu}|\\n', text):\n",
+    "        result.append(purge(text[index:match.start(0)]))\n",
+    "        index = match.start(0)\n",
+    "    result.append(purge(text[index:len(text)]))\n",
+    "\n",
+    "    return result"
   ]
  },
  {
@ -69,6 +78,14 @@
    "### Ćwiczenie 2: Uruchom powyższy algorytm na treści wybranej przez siebie strony internetowej (do ściągnięcia treści strony wykorzystaj kod z laboratoriów nr 7). Zidentyfikuj co najmniej dwa wyjątki od ogólnej reguły podziału na segmenty i ulepsz algorytm."
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "20bc0bf7-35b7-44e5-8750-c22e6de9d048",
+   "metadata": {},
+   "source": [
+    "Dwa wyjątki to zdania zakończone wykrzyknikiem i zdania zakończone znakiem zapytania."
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 3,
@ -76,8 +93,17 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "def sentence_split_enhanced(text):\n",
-    "    return []"
+    "def sentence_split(text):\n",
+    "    def purge(text_l: str) -> str:\n",
+    "        return text_l.strip('.').strip('?').strip('!').strip()\n",
+    "    index = 0\n",
+    "    result = []\n",
+    "    for match in regex.finditer(r'(\\.|\\?|\\!) \\p{Lu}|\\n', text):\n",
+    "        result.append(purge(text[index:match.start(0)]))\n",
+    "        index = match.start(0)\n",
+    "    result.append(purge(text[index:len(text)]))\n",
+    "\n",
+    "    return result"
   ]
  },
  {
@ -117,6 +143,14 @@
    "Wyjściem z Hunaligna jest plik w specjalnym formacie Hunaligna. Problem jednak w tym, że niestety nie można go w prosty sposób zaimportować do jakiegokolwiek narzędzia typu CAT. Potrzebna jest konwersja do któregoś z bardziej popularnych formatów, np. XLIFF."
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "80360005-5110-4f83-bfd6-dbe22a1d5b5b",
+   "metadata": {},
+   "source": [
+    "## *Linki do pobrania tego programu (ftp://ftp.mokk.bme.hu/Hunglish/src/hunalign/latest/hunalign-1.1-windows.zip), dostępne w README na GitHubie, nie działają.*"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "divided-chain",
@ -187,15 +221,12 @@
 "metadata": {
  "author": "Rafał Jaworski",
  "email": "rjawor@amu.edu.pl",
-  "lang": "pl",
-  "subtitle": "11. Urównoleglanie",
-  "title": "Komputerowe wspomaganie tłumaczenia",
-  "year": "2021",
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
+  "lang": "pl",
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
@ -206,8 +237,11 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  }
+   "version": "3.10.4"
+  },
+  "subtitle": "11. Urównoleglanie",
+  "title": "Komputerowe wspomaganie tłumaczenia",
+  "year": "2021"
 },
 "nbformat": 4,
 "nbformat_minor": 5
--- a/lab/lab_12.ipynb
+++ b/lab/lab_12.ipynb
@ -96,6 +96,26 @@
    "### Ćwiczenie 1: Wykorzystując powyższy kod napisz keylogger, który zapisuje wszystkie uderzenia w klawisze do pliku. Format pliku jest dowolny, każdy wpis musi zawierać precyzyjną godzinę uderzenia oraz uderzony klawisz. Uruchom program i przepisz paragraf dowolnie wybranego tekstu."
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8663ef15-88a0-4bb5-aff9-f19cbb3178c1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import keyboard\n",
+    "\n",
+    "\n",
+    "def report_key(event: keyboard.KeyboardEvent):\n",
+    "    file = open('test.txt', 'a')\n",
+    "    file.write(f'[{event.time}] {event.name}\\n')\n",
+    "    file.close()\n",
+    "\n",
+    "\n",
+    "keyboard.on_release(callback=report_key)\n",
+    "keyboard.wait()"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "valuable-bearing",
@ -120,7 +140,40 @@
   "outputs": [],
   "source": [
    "def calculate_typing_speed():\n",
-    "    return 0"
+    "    import re\n",
+    "    import numpy\n",
+    "\n",
+    "    def parse(line_l: str) -> (float, str):\n",
+    "        res = re.findall(r'(\\d+.\\d+)|([a-zA-Z,.]+)', ''.join(line_l.split()))\n",
+    "        return float(res[0][0]), res[1][1]\n",
+    "\n",
+    "    file = open('test.txt',  'r')\n",
+    "    time_per_word = []\n",
+    "    time_per_character = []\n",
+    "    local_time_per_word = []\n",
+    "\n",
+    "    prev_char_timestamp = None\n",
+    "    for line in file:\n",
+    "        time, key = parse(line)\n",
+    "        if prev_char_timestamp is None or time - prev_char_timestamp > 5:\n",
+    "            prev_char_timestamp = time\n",
+    "            local_time_per_word = []\n",
+    "            continue\n",
+    "        elapsed = time - prev_char_timestamp\n",
+    "        time_per_character.append(elapsed)\n",
+    "        if key == 'space' or key == 'enter' or key == ',' or key == '.':\n",
+    "            if len(local_time_per_word) > 0:\n",
+    "                time_per_word.append(numpy.sum(local_time_per_word))\n",
+    "            local_time_per_word = []\n",
+    "            time_per_character.append(elapsed)\n",
+    "            prev_char_timestamp = time\n",
+    "            continue\n",
+    "        local_time_per_word.append(elapsed)\n",
+    "        prev_char_timestamp = time\n",
+    "    file.close()\n",
+    "    time_per_word.append(numpy.sum(local_time_per_word))\n",
+    "    \n",
+    "    return 60 / numpy.average(time_per_character), 60 / numpy.average(time_per_word)"
   ]
  },
  {
@ -147,22 +200,57 @@
   "outputs": [],
   "source": [
    "def find_pauses():\n",
-    "    return []"
+    "    import re\n",
+    "\n",
+    "    def parse(line_l: str) -> (float, str):\n",
+    "        res = re.findall(r'(\\d+.\\d+)|([a-zA-Z,.]+)', ''.join(line_l.split()))\n",
+    "        return float(res[0][0]), res[1][1]\n",
+    "\n",
+    "    file = open('test.txt', 'r')\n",
+    "    stops = []\n",
+    "    stop_reporting_time = 1\n",
+    "\n",
+    "    prev_char_timestamp = None\n",
+    "    lines = file.readlines()\n",
+    "    file.close()\n",
+    "    for i in range(len(lines)):\n",
+    "        time, key = parse(lines[i])\n",
+    "        if prev_char_timestamp is None:\n",
+    "            prev_char_timestamp = time\n",
+    "            continue\n",
+    "        elapsed = time - prev_char_timestamp\n",
+    "        if elapsed > stop_reporting_time:\n",
+    "            context_start = max(0, i - 20)\n",
+    "            context_end = min(len(lines), i + 20)\n",
+    "            context_before = ''\n",
+    "            context_after = ''\n",
+    "            for j in range(context_start, i):\n",
+    "                time_l, key_l = parse(lines[j])\n",
+    "                context_before += key_l\n",
+    "            for j in range(i, context_end):\n",
+    "                time_l, key_l = parse(lines[j])\n",
+    "                context_after += key_l\n",
+    "            stops.append((elapsed, (context_before, context_after)))\n",
+    "        prev_char_timestamp = time\n",
+    "\n",
+    "    def stop_sort(record: tuple):\n",
+    "        return record[0]\n",
+    "\n",
+    "    stops.sort(reverse=True, key=stop_sort)\n",
+    "    \n",
+    "    return stops"
   ]
  }
 ],
 "metadata": {
  "author": "Rafał Jaworski",
  "email": "rjawor@amu.edu.pl",
-  "lang": "pl",
-  "subtitle": "12. Key logging",
-  "title": "Komputerowe wspomaganie tłumaczenia",
-  "year": "2021",
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
+  "lang": "pl",
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
@ -173,8 +261,11 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  }
+   "version": "3.10.4"
+  },
+  "subtitle": "12. Key logging",
+  "title": "Komputerowe wspomaganie tłumaczenia",
+  "year": "2021"
 },
 "nbformat": 4,
 "nbformat_minor": 5
--- a/lab/lab_13-14.ipynb
+++ b/lab/lab_13-14.ipynb
@ -201,7 +201,7 @@
  "author": "Rafał Jaworski",
  "email": "rjawor@amu.edu.pl",
  "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@ -216,7 +216,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.4"
  },
  "subtitle": "13,14. Korekta pisowni",
  "title": "Komputerowe wspomaganie tłumaczenia",
--- a/lab/lab_15.ipynb
+++ b/lab/lab_15.ipynb
Author	SHA1	Message	Date
Patryk	9068e7e85d	lab 15	2024-06-09 18:22:19 +02:00
Patryk	a3dca39152	lab 11-14	2024-05-28 23:44:55 +02:00
Patryk	824f7d373d	Merge branch 'wip' # Conflicts: # lab/lab_02.ipynb	2024-05-27 00:55:27 +02:00
Patryk	6a0efac373	lab 09-10	2024-05-27 00:53:56 +02:00
Patryk Osiński	78982a4f21	wip	2024-04-20 19:58:36 +02:00
Patryk	9b9e46df22	lab 3	2024-04-16 21:12:25 +02:00
Patryk	2b22583359	lab 2	2024-04-16 08:47:38 +02:00
Patryk Osiński	ddd2833663	lab 1	2024-04-13 14:22:23 +02:00