From 9a804fe3dd2546ae35bf24af266191b375f6a5e8 Mon Sep 17 00:00:00 2001
From: s495724 <mickas12@st.amu.edu.pl>
Date: Tue, 16 Apr 2024 13:36:13 +0200
Subject: [PATCH] Upload files to "lab"

---
 lab/lab_01.ipynb | 113 +++++++++++++++++++++++++------
 lab/lab_02.ipynb | 112 ++++++++++++++++++++++++-------
 lab/lab_03.ipynb | 168 ++++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 328 insertions(+), 65 deletions(-)

diff --git a/lab/lab_01.ipynb b/lab/lab_01.ipynb
index 0ffe833..8353712 100644
--- a/lab/lab_01.ipynb
+++ b/lab/lab_01.ipynb
@@ -213,13 +213,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 13,
    "id": "protected-rings",
    "metadata": {},
    "outputs": [],
    "source": [
     "def tm_lookup(sentence):\n",
-    "    return ''"
+    "    return [entry[1].casefold() for entry in translation_memory if entry[0] == sentence]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "99d75100-0f9d-4586-82ef-ab42180472a2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['press the enter button', 'press the enter key']"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tm_lookup('Wciśnij przycisk Enter')"
    ]
   },
   {
@@ -232,17 +253,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 15,
    "id": "severe-alloy",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "''"
+       "[]"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -261,13 +282,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 16,
    "id": "structural-diesel",
    "metadata": {},
    "outputs": [],
    "source": [
+    "import string \n",
+    "\n",
     "def tm_lookup(sentence):\n",
-    "    return ''"
+    "    return [entry[1].casefold() for entry in translation_memory if entry[0] == sentence]\n",
+    "    translator = str.maketrans('', '', string.punctuation) \n",
+    "    return sentence.translate(translator)"
    ]
   },
   {
@@ -280,17 +305,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 20,
    "id": "brief-senegal",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "''"
+       "[]"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -317,15 +342,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 24,
    "id": "mathematical-customs",
    "metadata": {},
    "outputs": [],
    "source": [
     "def tm_lookup(sentence):\n",
+    "    translator = str.maketrans('', '', string.punctuation)\n",
+    "    sentence = sentence.translate(translator).casefold()\n",
+    "    for entry in translation_memory:\n",
+    "        if any(word in entry[0].casefold() for word in sentence.split()):\n",
+    "            return entry[1]\n",
     "    return ''"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "f6537825-62a6-4503-91a5-bbb17d84170b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'System restart required'"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tm_lookup('Wymagane ponowne uruchomienie maszyny')"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "meaningful-virus",
@@ -344,7 +395,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 26,
    "id": "humanitarian-wrong",
    "metadata": {},
    "outputs": [],
@@ -362,7 +413,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 27,
    "id": "located-perception",
    "metadata": {},
    "outputs": [],
@@ -374,7 +425,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 28,
    "id": "advised-casting",
    "metadata": {},
    "outputs": [
@@ -384,7 +435,7 @@
        "[('przycisk', 'button'), ('drukarka', 'printer')]"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 28,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -419,13 +470,35 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 37,
    "id": "original-tunisia",
    "metadata": {},
    "outputs": [],
    "source": [
     "def glossary_lookup(sentence):\n",
-    "    return ''"
+    "    sentence_words = sentence.casefold().split()\n",
+    "    return [entry for entry in glossary if entry[0] in sentence_words]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "b3ae5504-4168-4fe0-ad25-60558242a31d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('przycisk', 'button'), ('drukarka', 'printer')]"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "glossary_lookup('Każda Drukarka posiada przycisk wznowienia drukowania')"
    ]
   },
   {
@@ -438,7 +511,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 38,
    "id": "adolescent-semiconductor",
    "metadata": {},
    "outputs": [],
@@ -452,7 +525,7 @@
   "author": "Rafał Jaworski",
   "email": "rjawor@amu.edu.pl",
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -467,7 +540,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.12"
   },
   "subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
   "title": "Komputerowe wspomaganie tłumaczenia",
diff --git a/lab/lab_02.ipynb b/lab/lab_02.ipynb
index 10c2003..a2b779e 100644
--- a/lab/lab_02.ipynb
+++ b/lab/lab_02.ipynb
@@ -57,7 +57,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 31,
    "id": "confident-prison",
    "metadata": {},
    "outputs": [],
@@ -80,13 +80,49 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 29,
    "id": "continental-submission",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def ice_lookup(sentence, prev_sentence, next_sentence):\n",
-    "    return []"
+    "def ice_lookup(sentence, prev_sentence, next_sentence, translation_memory):\n",
+    "    \n",
+    "    ice_previous = \"\"\n",
+    "    ice_next = \"\"\n",
+    "    \n",
+    "    for original, translation in translation_memory:\n",
+    "        if sentence == original:\n",
+    "            index = translation_memory.index((original, translation))\n",
+    "            if index > 0:\n",
+    "                ice_previous = translation_memory[index - 1][1]\n",
+    "            if index < len(translation_memory) - 1:\n",
+    "                ice_next = translation_memory[index + 1][1]\n",
+    "            break\n",
+    "    \n",
+    "    return (ice_previous, ice_next)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "f125ddd2-89fc-4496-93d9-9d640b7f616e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "prev:  Press the ENTER button , next:  The printer is switched off\n"
+     ]
+    }
+   ],
+   "source": [
+    "sentence = \"Sprawdź ustawienia sieciowe\"\n",
+    "prev_sentence = \"Wciśnij przycisk Enter\"\n",
+    "next_sentence = \"Wymagane ponowne uruchomienie komputera\"\n",
+    "\n",
+    "ice_result = ice_lookup(sentence, prev_sentence, next_sentence, translation_memory)\n",
+    "print('prev: ', ice_result[0], ', next: ', ice_result[1])"
    ]
   },
   {
@@ -119,7 +155,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 36,
    "id": "fourth-pillow",
    "metadata": {},
    "outputs": [],
@@ -141,7 +177,7 @@
    "id": "graduate-theorem",
    "metadata": {},
    "source": [
-    "Odpowiedź:"
+    "Odpowiedź: Funkcja nie jest dobrą funkcją dystansu, gdyż bierze pod uwagę jedynie różnice w długości zdań."
    ]
   },
   {
@@ -154,7 +190,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 56,
    "id": "continued-christopher",
    "metadata": {},
    "outputs": [],
@@ -179,7 +215,7 @@
    "id": "metallic-leave",
    "metadata": {},
    "source": [
-    "Odpowiedź:"
+    "Odpowiedź: Nie jest to dobra funkcja dystansu, gdyż stwierdza jedynie, czy zdania się między sobą różnią."
    ]
   },
   {
@@ -206,7 +242,7 @@
    "id": "bibliographic-stopping",
    "metadata": {},
    "source": [
-    "Odpowiedź:"
+    "Odpowiedź: Dystans Levenshteina jest poprawną funkcją dystansu; opisuje liczbę operacji, które należy wykonać, aby porównywane ze sobą zdania były takie same (np. zamiana liter, wstawienie innej litery, usunięcie litery itp.)."
    ]
   },
   {
@@ -223,7 +259,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 62,
    "id": "secondary-wrist",
    "metadata": {},
    "outputs": [
@@ -233,7 +269,7 @@
        "2"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 62,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -254,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 63,
    "id": "associate-tuner",
    "metadata": {},
    "outputs": [],
@@ -273,7 +309,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 64,
    "id": "focal-pathology",
    "metadata": {},
    "outputs": [
@@ -283,7 +319,7 @@
        "0.9166666666666666"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 64,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -294,7 +330,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 65,
    "id": "roman-ceiling",
    "metadata": {},
    "outputs": [
@@ -304,7 +340,7 @@
        "0.9428571428571428"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 65,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -315,7 +351,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 66,
    "id": "invisible-cambodia",
    "metadata": {},
    "outputs": [
@@ -325,7 +361,7 @@
        "0.631578947368421"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 66,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -344,13 +380,43 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 71,
    "id": "genetic-cradle",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def fuzzy_lookup(sentence, threshold):\n",
-    "    return []"
+    "import difflib\n",
+    "\n",
+    "def fuzzy_lookup(sentence, threshold, translation_memory):\n",
+    "    fuzzy_matches = []\n",
+    "    for original, translation in translation_memory:\n",
+    "        similarity = difflib.SequenceMatcher(None, sentence, original).ratio()\n",
+    "        if similarity >= threshold:\n",
+    "            fuzzy_matches.append((original, translation, similarity))\n",
+    "    return fuzzy_matches"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "id": "6bebcb12-8c73-4beb-b4c2-00553d3b375f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('Wciśnij przycisk Enter', 'Press the ENTER button', 0.8636363636363636)]"
+      ]
+     },
+     "execution_count": 80,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sentence = 'Wcisnij pszycisk ęnter'\n",
+    "threshold = 0.8\n",
+    "fuzzy_lookup(sentence, threshold, translation_memory)"
    ]
   }
  ],
@@ -358,7 +424,7 @@
   "author": "Rafał Jaworski",
   "email": "rjawor@amu.edu.pl",
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -373,7 +439,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.12"
   },
   "subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
   "title": "Komputerowe wspomaganie tłumaczenia",
diff --git a/lab/lab_03.ipynb b/lab/lab_03.ipynb
index 5707f0d..9e5041b 100644
--- a/lab/lab_03.ipynb
+++ b/lab/lab_03.ipynb
@@ -63,7 +63,7 @@
    "id": "diverse-sunglasses",
    "metadata": {},
    "source": [
-    "Odpowiedź:"
+    "Odpowiedź: metal cabinets guides. Proz.com"
    ]
   },
   {
@@ -128,13 +128,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 7,
    "id": "cognitive-cedar",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def terminology_lookup():\n",
-    "    return []"
+    "import re\n",
+    "\n",
+    "def terminology_lookup(text, dictionary):\n",
+    "    pattern = re.compile(r'\\b(?:' + '|'.join(dictionary) + r')\\b', re.IGNORECASE)\n",
+    "    matches = pattern.finditer(text)\n",
+    "    occurance = ''\n",
+    "    for match in matches:\n",
+    "        occurance += (f\"({match.start()}, {match.end()})\")\n",
+    "    return occurance"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "5781b95b-3af9-4c82-8388-b98a11e6c343",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'(80, 91)(164, 175)(468, 475)(516, 523)(533, 540)'"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "terminology_lookup(text, dictionary)"
    ]
   },
   {
@@ -161,7 +189,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 12,
    "id": "tribal-attention",
    "metadata": {},
    "outputs": [
@@ -205,7 +233,7 @@
       "IDE\n",
       ",\n",
       "see\n",
-      "Running\n",
+      "run\n",
       "Tutorial\n",
       "Examples\n",
       "in\n",
@@ -218,7 +246,7 @@
       "work\n",
       "for\n",
       "all\n",
-      "swing\n",
+      "Swing\n",
       "program\n",
       "—\n",
       "applet\n",
@@ -232,7 +260,7 @@
       "be\n",
       "the\n",
       "step\n",
-      "-PRON-\n",
+      "you\n",
       "need\n",
       "to\n",
       "follow\n",
@@ -248,7 +276,7 @@
       "platform\n",
       ",\n",
       "if\n",
-      "-PRON-\n",
+      "you\n",
       "have\n",
       "not\n",
       "already\n",
@@ -260,7 +288,7 @@
       "program\n",
       "that\n",
       "use\n",
-      "Swing\n",
+      "swing\n",
       "component\n",
       ".\n",
       "compile\n",
@@ -302,13 +330,31 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 19,
    "id": "surgical-demonstration",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def terminology_lookup():\n",
-    "    return []"
+    "def terminology_lookup(text, dictionary):\n",
+    "    nlp = spacy.load(\"en_core_web_sm\")\n",
+    "    doc = nlp(text)\n",
+    "\n",
+    "    word_forms = set()\n",
+    "    for word in dictionary:\n",
+    "        word_forms.add(word)\n",
+    "        for token in doc:\n",
+    "            if token.text.lower() == word:\n",
+    "                word_forms.add(token.lemma_)\n",
+    "\n",
+    "    matches = []\n",
+    "    for token in doc:\n",
+    "        if token.text.lower() in word_forms:\n",
+    "            matches.append((token.idx, token.idx + len(token)))\n",
+    "\n",
+    "    occurrences = ''\n",
+    "    for match in matches:\n",
+    "        occurrences += f\"({match[0]}, {match[1]})\"\n",
+    "    return occurrences"
    ]
   },
   {
@@ -337,13 +383,59 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 23,
    "id": "superb-butterfly",
    "metadata": {},
    "outputs": [],
    "source": [
     "def get_nouns(text):\n",
-    "    return []"
+    "    nlp = spacy.load(\"en_core_web_sm\")\n",
+    "    doc = nlp(text)\n",
+    "    nouns = [token.text for token in doc if token.pos_ == \"NOUN\"]\n",
+    "    \n",
+    "    return nouns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "8203c3e5-74a6-42c1-add1-e378f09164fd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['programmers',\n",
+       " 'section',\n",
+       " 'Swing',\n",
+       " 'application',\n",
+       " 'command',\n",
+       " 'line',\n",
+       " 'information',\n",
+       " 'Swing',\n",
+       " 'application',\n",
+       " 'compilation',\n",
+       " 'instructions',\n",
+       " 'programs',\n",
+       " 'applets',\n",
+       " 'applications',\n",
+       " 'steps',\n",
+       " 'release',\n",
+       " 'platform',\n",
+       " 'program',\n",
+       " 'Swing',\n",
+       " 'components',\n",
+       " 'program',\n",
+       " 'program']"
+      ]
+     },
+     "execution_count": 24,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "get_nouns(text)"
    ]
   },
   {
@@ -356,7 +448,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 25,
    "id": "acting-tolerance",
    "metadata": {},
    "outputs": [],
@@ -374,13 +466,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 30,
    "id": "eight-redhead",
    "metadata": {},
    "outputs": [],
    "source": [
     "def extract_terms(text):\n",
-    "    return []"
+    "    nlp = spacy.load(\"en_core_web_sm\")\n",
+    "    doc = nlp(text)\n",
+    "    \n",
+    "    tally = {}\n",
+    "    \n",
+    "    for token in doc:\n",
+    "        if token.pos_ != \"NOUN\":\n",
+    "            continue\n",
+    "        \n",
+    "        lemma = token.lemma_.lower()\n",
+    "        \n",
+    "        if lemma in tally:\n",
+    "            tally[lemma] += 1\n",
+    "        else:\n",
+    "            tally[lemma] = 1\n",
+    "    \n",
+    "    return tally"
    ]
   },
   {
@@ -393,13 +501,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 32,
    "id": "monetary-mambo",
    "metadata": {},
    "outputs": [],
    "source": [
     "def extract_terms(text):\n",
-    "    return []"
+    "    nlp = spacy.load(\"en_core_web_sm\")\n",
+    "    doc = nlp(text)\n",
+    "    \n",
+    "    tally = {}\n",
+    "    \n",
+    "    for token in doc:\n",
+    "        if token.pos_ not in ['NOUN', 'VERB', 'ADJ']:\n",
+    "            continue\n",
+    "        \n",
+    "        lemma = token.lemma_.lower()\n",
+    "        \n",
+    "        if lemma in tally:\n",
+    "            tally[lemma] += 1\n",
+    "        else:\n",
+    "            tally[lemma] = 1\n",
+    "    \n",
+    "    return tally"
    ]
   }
  ],
@@ -407,7 +531,7 @@
   "author": "Rafał Jaworski",
   "email": "rjawor@amu.edu.pl",
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -422,7 +546,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.12"
   },
   "subtitle": "3. Terminologia",
   "title": "Komputerowe wspomaganie tłumaczenia",