2024-05-31 lab15

2024-05-31 lab13-14
lab_11 and lab_12, 2024-05-30
2024-05-31 21:11:08 +02:00 · 2024-05-31 20:52:27 +02:00 · 2024-05-30 23:51:52 +02:00 · 2024-05-12 23:12:15 +02:00 · 2024-04-14 22:21:17 +02:00 · 2024-04-14 19:33:04 +02:00
15 changed files with 4139 additions and 350 deletions
--- a/lab/data/.gitignore
+++ b/lab/data/.gitignore
@ -0,0 +1,2 @@
 corpus/
 NIPS Papers/
--- a/lab/data/lda_topics.txt
+++ b/lab/data/lda_topics.txt
@ -0,0 +1,10 @@
 (0, '0.006*"learning" + 0.005*"model" + 0.005*"data" + 0.004*"function" + 0.004*"set" + 0.004*"using" + 0.004*"number" + 0.004*"neural" + 0.004*"one" + 0.003*"error"')
 (1, '0.008*"learning" + 0.006*"data" + 0.005*"model" + 0.005*"set" + 0.004*"algorithm" + 0.004*"time" + 0.004*"one" + 0.004*"two" + 0.003*"used" + 0.003*"figure"')
 (2, '0.007*"data" + 0.005*"model" + 0.005*"set" + 0.005*"learning" + 0.004*"one" + 0.004*"algorithm" + 0.004*"time" + 0.003*"using" + 0.003*"figure" + 0.003*"training"')
 (3, '0.006*"data" + 0.005*"model" + 0.004*"learning" + 0.004*"two" + 0.004*"algorithm" + 0.004*"using" + 0.004*"function" + 0.004*"set" + 0.003*"number" + 0.003*"given"')
 (4, '0.006*"learning" + 0.005*"data" + 0.005*"model" + 0.005*"set" + 0.004*"algorithm" + 0.004*"time" + 0.004*"using" + 0.004*"two" + 0.004*"function" + 0.003*"one"')
 (5, '0.008*"learning" + 0.006*"data" + 0.005*"algorithm" + 0.004*"model" + 0.004*"two" + 0.004*"function" + 0.004*"number" + 0.003*"figure" + 0.003*"time" + 0.003*"set"')
 (6, '0.007*"learning" + 0.006*"model" + 0.005*"data" + 0.005*"algorithm" + 0.004*"function" + 0.004*"set" + 0.003*"time" + 0.003*"one" + 0.003*"based" + 0.003*"number"')
 (7, '0.007*"learning" + 0.005*"set" + 0.005*"data" + 0.005*"model" + 0.004*"algorithm" + 0.004*"function" + 0.004*"using" + 0.004*"number" + 0.004*"log" + 0.004*"figure"')
 (8, '0.005*"learning" + 0.005*"set" + 0.005*"algorithm" + 0.004*"model" + 0.004*"function" + 0.004*"data" + 0.004*"one" + 0.004*"time" + 0.003*"using" + 0.003*"given"')
 (9, '0.007*"data" + 0.006*"model" + 0.005*"learning" + 0.005*"algorithm" + 0.004*"two" + 0.003*"number" + 0.003*"time" + 0.003*"set" + 0.003*"function" + 0.003*"used"')
--- a/lab/data/top_nouns.txt
+++ b/lab/data/top_nouns.txt
@ -0,0 +1,100 @@
 project victims support visit mediation
 exhibition cooperation year meeting films
 exhibition cooperation year meeting films
 solution occupation settlement problem resolutions
 residence citizens permit security citizen
 residence citizens permit security citizen
 support measures countries farmers member
 data services infrastructure development project
 data services infrastructure development project
 photographs service scans materials films
 photographs service scans materials films
 insurance ZUS contributions benefits administration
 project archaeology research conservation history
 project archaeology research conservation history
 cases % coronavirus countries disease
 % year case cases coronavirus
 ship tug speed accident course
 ship tug speed accident course
 work scientists research science telomerase
 work scientists research science telomerase
 film media part time efforts
 film media part time efforts
 insurance ZUS contributions benefits administration
 use care stewardship resistance antibiotics
 services administration state information e
 services administration state information e
 coronavirus research measures outbreak member
 residence card foreigner work permit
 security e threats policy gas
 security e threats policy gas
 paper 15th reader file date
 paper 15th reader file date
 costs implementation management tasks expenditures
 food cooperation products market agri
 costs implementation management tasks expenditures
 costs implementation management tasks expenditures
 artist work painting paintings time
 artist work painting paintings time
 Home » rights representatives discrimination
 Home » rights representatives discrimination
 command documentation alias files directory
 water basis land status item
 water basis land status item
 % contract contracts . No
 food cooperation products market agri
 % contract contracts . No
 market level services age companies
 market level services age companies
 projects innovation R&D development companies
 projects innovation R&D development companies
 contracts contract % item procedures
 contracts contract % item procedures
 room A office information B
 room A office information B
 advantage production country countries goods
 measles vaccine disease person people
 advantage production country countries goods
 card residence permission business stamp
 card residence permission business stamp
 w % gospodarczego polityki publicznych
 system banks stability risk sector
 camps people concentration policy resistance
 camps people concentration policy resistance
 safety aviation management requirements entity
 safety aviation management requirements entity
 research call philosophy information project
 vaccination pertussis cancer risk disease
 research call philosophy information project
 energy gas % oil countries
 energy gas % oil countries
 cooperation meeting talks forces defence
 project education information coronavirus funding
 food education project measures assistance
 infection disease symptoms fever humans
 energy audit costs use management
 countries % development benefits funds
 years minister year rector persons
 water food fish times year
 land water population data age
 land water population data age
 market labour crisis unemployment countries
 market labour crisis unemployment countries
 accelerator research - operation model
 accelerator research - operation model
 energy policy power development objectives
 priest hand country wedding church
 eggs breakfast food products meat
 eggs breakfast food products meat
 water fish times food year
 honey production bread time taste
 honey production bread time taste
 data job portal vacancies Decision
 data job portal vacancies Decision
 food quality products apples farmers
 food quality products apples farmers
 visa activities child B-1 institution
 visa activities child B-1 institution
 - co preparations operation preparation
 - co preparations operation preparation
 project victims support visit mediation
--- a/lab/data/top_nouns_tfidf.txt
+++ b/lab/data/top_nouns_tfidf.txt
@ -0,0 +1,100 @@
 approval total lawyers priorities judges
 agriculture support guests offers author
 agriculture support guests offers author
 homeland invasion address prisoners sources
 identity positions elaboration issues terms
 identity positions elaboration issues terms
 distancing lenders mechanism check part
 IT Realization Services resolutions bases
 IT Realization Services resolutions bases
 occupation scans browser Service processes
 occupation scans browser Service processes
 am war month Insurance centralisation
 conservation zu provisions basin record
 conservation zu provisions basin record
 culture city abscesses aeronautics disruptors
 infection Recommendations man evening occurrence
 course hull STATE classifier certificate
 course hull STATE classifier certificate
 cooling work culture part laboratory
 cooling work culture part laboratory
 culture reverse advisor documentary service
 culture reverse advisor documentary service
 am war month Insurance centralisation
 pressure ability entry prescribers costs
 economies management role disk stakeholders
 economies management role disk stakeholders
 traders fears carriers illness distancing
 activity employment foreigners Visa graduate
 defense forecast quarter factors opportunity
 defense forecast quarter factors opportunity
 case author screen announcement typefaces
 case author screen announcement typefaces
 revenue office premises o proposals
 storage completion efforts Meeting crisis
 office Types premises protection days
 revenue office premises o proposals
 pictures splashing dobrze viewer culture
 pictures splashing dobrze viewer culture
 creation origin discrimination interest institutions
 creation origin discrimination interest institutions
 names contexts calculator program descriptor
 periods standards total name property
 periods standards total name property
 Art days liability authorities services
 storage completion efforts Meeting crisis
 Art days liability authorities services
 skills provision country economies science
 skills provision country economies science
 Project possibilities cancer members therapies
 Project possibilities cancer members therapies
 price auction actions telecommunications appointment
 price auction actions telecommunications appointment
 records coffee authorisation line times
 records coffee authorisation line times
 example manner source essence identification
 defences vaccines days spread body
 example manner source essence identification
 servants employees Possession insurance examinations
 servants employees Possession insurance examinations
 systemowe dopiero system latach popytem
 efficiency problems uncertainty improvement Risk
 uprising borders rights security campaign
 uprising borders rights security campaign
 part audits Responsibilities services authority
 protection competence version occurrence requisition
 Requirements members methodology data database
 whoop substitute cause exposure course
 Requirements members methodology data database
 erent decisions SOURCES spectrum economies
 erent decisions SOURCES spectrum economies
 invitation effects help armament round
 area teaching tax time travel
 time Recommendation participants guarantees work
 toxin mechanisms attacks Babies therapies
 production replacement control SMEs audit
 significance net ground participants levels
 functioning consultation interest expert procedures
 thing mercury eggs municipality lunch
 agriculture R result development prices
 agriculture R result development prices
 reflection basis sources points results
 reflection basis sources points results
 leaders reach author features publications
 leaders reach author features publications
 consumption Improvement bodies level need
 money delirium advice house couple
 work thanks BEgINNINg range funds
 work thanks BEgINNINg range funds
 option eggs dinner wine quantities
 seeds mead event maples approach
 seeds mead event maples approach
 case complaints consultation Employers actions
 case complaints consultation Employers actions
 activity fruit indications zation rice
 activity fruit indications zation rice
 building work premises Food child
 building work premises Food child
 virtue works culture sectors others
 virtue works culture sectors others
 approval total lawyers priorities judges
--- a/lab/lab_01.ipynb
+++ b/lab/lab_01.ipynb
@ -52,7 +52,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 191,
   "id": "narrow-romantic",
   "metadata": {},
   "outputs": [],
@ -71,7 +71,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 192,
   "id": "indonesian-electron",
   "metadata": {},
   "outputs": [],
@ -82,7 +82,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 193,
   "id": "compact-trinidad",
   "metadata": {},
   "outputs": [
@ -92,7 +92,7 @@
       "['Press the ENTER button']"
      ]
     },
-     "execution_count": 3,
+     "execution_count": 193,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -119,7 +119,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 194,
   "id": "exposed-daniel",
   "metadata": {},
   "outputs": [],
@ -139,7 +139,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 195,
   "id": "serial-velvet",
   "metadata": {},
   "outputs": [
@ -149,7 +149,7 @@
       "['Press the ENTER button', 'Press the ENTER key']"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 195,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -176,7 +176,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 196,
   "id": "every-gibson",
   "metadata": {},
   "outputs": [
@ -186,7 +186,7 @@
       "[]"
      ]
     },
-     "execution_count": 6,
+     "execution_count": 196,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -213,13 +213,37 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 197,
   "id": "protected-rings",
   "metadata": {},
   "outputs": [],
   "source": [
    "def preprocess(sentence):\n",
    "    return sentence.lower()\n",
    "\n",
    "def tm_lookup(sentence):\n",
-    "    return ''"
+    "    return [entry[1] for entry in translation_memory if preprocess(entry[0]) == preprocess(sentence)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "id": "7baee10b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Press the ENTER button', 'Press the ENTER key']"
      ]
     },
     "execution_count": 198,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tm_lookup('Wciśnij przycisk ENTER')"
   ]
  },
  {
@ -232,17 +256,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 199,
   "id": "severe-alloy",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "''"
+       "[]"
      ]
     },
-     "execution_count": 18,
+     "execution_count": 199,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -261,13 +285,40 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 200,
   "id": "structural-diesel",
   "metadata": {},
   "outputs": [],
   "source": [
    "import string\n",
    "\n",
    "def preprocess(s):\n",
    "    translator = str.maketrans('', '', string.punctuation)\n",
    "    return s.translate(translator).lower()\n",
    "\n",
    "def tm_lookup(sentence):\n",
-    "    return ''"
+    "    return [entry[1] for entry in translation_memory if preprocess(entry[0]) == preprocess(sentence)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 201,
   "id": "c03c6709",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Press the ENTER button', 'Press the ENTER key']"
      ]
     },
     "execution_count": 201,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tm_lookup('Wciśnij przycisk [ENTER]')"
   ]
  },
  {
@ -280,17 +331,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 202,
   "id": "brief-senegal",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "''"
+       "[]"
      ]
     },
-     "execution_count": 12,
+     "execution_count": 202,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -317,13 +368,43 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 203,
   "id": "mathematical-customs",
   "metadata": {},
   "outputs": [],
   "source": [
    "def compare_sentences(l1, l2):\n",
    "    return sum([1 for i, j in zip(l1.split(), l2.split()) if i != j]) <= 1\n",
    "\n",
    "import string\n",
    "\n",
    "def preprocess(s):\n",
    "    translator = str.maketrans('', '', string.punctuation)\n",
    "    return s.translate(translator).lower()\n",
    "\n",
    "def tm_lookup(sentence):\n",
-    "    return ''"
+    "    return [entry[1] for entry in translation_memory if compare_sentences(preprocess(entry[0]), preprocess(sentence))]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "id": "6264b722",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['System restart required']"
      ]
     },
     "execution_count": 204,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tm_lookup('Wymagane ponowne uruchomienie maszyny')"
   ]
  },
  {
@ -344,7 +425,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 205,
   "id": "humanitarian-wrong",
   "metadata": {},
   "outputs": [],
@ -362,7 +443,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 206,
   "id": "located-perception",
   "metadata": {},
   "outputs": [],
@ -374,7 +455,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 207,
   "id": "advised-casting",
   "metadata": {},
   "outputs": [
@ -384,7 +465,7 @@
       "[('przycisk', 'button'), ('drukarka', 'printer')]"
      ]
     },
-     "execution_count": 17,
+     "execution_count": 207,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -406,7 +487,7 @@
   "id": "defensive-fifteen",
   "metadata": {},
   "source": [
-    "Odpowiedź:"
+    "Odpowiedź: Jeżeli implementacja wygląda tak jak powyżej, złożoność to `O(n*m)`, gdzie `n` to liczba słów w zdaniu, a `m` to liczba haseł w słowniku, ponieważ dla każdego słowa iteracyjnie przechodzimy przez cały nasz słownik i szukamy odpowiednika."
   ]
  },
  {
@ -419,13 +500,56 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 208,
   "id": "aca5d340",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('przycisk', 'button')]"
      ]
     },
     "execution_count": 208,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "glossary_lookup('Każda Drukarka posiada przycisk wznowienia drukowania')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 209,
   "id": "original-tunisia",
   "metadata": {},
   "outputs": [],
   "source": [
    "def glossary_lookup(sentence):\n",
-    "    return ''"
+    "    sentence_words = [word.lower() for word in sentence.split()]\n",
    "    return [entry for entry in glossary if entry[0] in sentence_words]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 210,
   "id": "716bbbe9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('przycisk', 'button'), ('drukarka', 'printer')]"
      ]
     },
     "execution_count": 210,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')"
   ]
  },
  {
@ -438,13 +562,50 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 211,
   "id": "32dec661",
   "metadata": {},
   "outputs": [],
   "source": [
    "glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]\n",
    "glossary = {\n",
    "    'komputer': 'computer',\n",
    "    'przycisk': 'button',\n",
    "    'drukarka': 'printer'\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 212,
   "id": "adolescent-semiconductor",
   "metadata": {},
   "outputs": [],
   "source": [
    "def glossary_lookup(sentence):\n",
-    "    return ''"
+    "    sentence_words = [word.lower() for word in sentence.split() if word.lower() in glossary]\n",
    "    return [(word, glossary[word]) for word in sentence_words]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 213,
   "id": "d1e991c6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('drukarka', 'printer'), ('przycisk', 'button')]"
      ]
     },
     "execution_count": 213,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')"
   ]
  }
 ],
@ -467,7 +628,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.14"
  },
  "subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
  "title": "Komputerowe wspomaganie tłumaczenia",
--- a/lab/lab_02.ipynb
+++ b/lab/lab_02.ipynb
@ -57,7 +57,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 17,
   "id": "confident-prison",
   "metadata": {},
   "outputs": [],
@ -80,13 +80,27 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 18,
   "id": "continental-submission",
   "metadata": {},
   "outputs": [],
   "source": [
    "def ice_lookup(sentence, prev_sentence, next_sentence):\n",
-    "    return []"
+    "    # Translations of TM entries that match the sentence together with both of its neighbours (ICE match)\n",
    "    ice_matches = []\n",
    "\n",
    "    # Walk the translation memory, skipping the first and last entries because they lack a previous/next neighbour\n",
    "    for index in range(1, len(translation_memory) - 1):\n",
    "        # Unpack the previous, current and next TM entries as (source sentence, translation) pairs\n",
    "        prev_tm_sentence, _ = translation_memory[index - 1]\n",
    "        current_tm_sentence, current_tm_translation = translation_memory[index]\n",
    "        next_tm_sentence, _ = translation_memory[index + 1]\n",
    "\n",
    "        # Require all three source sentences to equal the looked-up sentence and its context\n",
    "        if prev_tm_sentence == prev_sentence and current_tm_sentence == sentence and next_tm_sentence == next_sentence:\n",
    "            ice_matches.append(current_tm_translation)\n",
    "\n",
    "    return ice_matches"
   ]
  },
  {
@ -119,7 +133,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 19,
   "id": "fourth-pillow",
   "metadata": {},
   "outputs": [],
@ -141,7 +155,11 @@
   "id": "graduate-theorem",
   "metadata": {},
   "source": [
-    "Odpowiedź:"
+    "Odpowiedź: Nie, ponieważ w tej funkcji interesuje nas tylko długość zdania, tzn. drugi warunek nie będzie spełniony\n",
    "\n",
    "Przykład: `kot != bok`, a dla tej funkcji zwróci 0\n",
    "\n",
    "Spełnione warunki: 1, 3, 4"
   ]
  },
  {
@ -154,7 +172,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 20,
   "id": "continued-christopher",
   "metadata": {},
   "outputs": [],
@ -179,7 +197,40 @@
   "id": "metallic-leave",
   "metadata": {},
   "source": [
-    "Odpowiedź:"
+    "Odpowiedź: Tak, spełnia wszystkie warunki\n",
    "\n",
    "Sprawdzenie dla warunku 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "349a3547",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "True\n",
      "True\n",
      "True\n"
     ]
    }
   ],
   "source": [
    "# x == y i y == z\n",
    "print(sentence_distance(\"kot\", \"kot\") + sentence_distance(\"kot\", \"kot\") >= sentence_distance(\"kot\", \"kot\"))\n",
    "\n",
    "# x == y i y != z\n",
    "print(sentence_distance(\"kot\", \"kot\") + sentence_distance(\"kot\", \"pies\") >= sentence_distance(\"kot\", \"pies\"))\n",
    "\n",
    "# x != y i y == z\n",
    "print(sentence_distance(\"kot\", \"pies\") + sentence_distance(\"pies\", \"pies\") >= sentence_distance(\"kot\", \"pies\"))\n",
    "\n",
    "# x != y i y != z\n",
    "print(sentence_distance(\"kot\", \"pies\") + sentence_distance(\"pies\", \"kot\") >= sentence_distance(\"kot\", \"kot\"))"
   ]
  },
  {
@ -206,7 +257,11 @@
   "id": "bibliographic-stopping",
   "metadata": {},
   "source": [
-    "Odpowiedź:"
+    "Odpowiedź:\n",
    "- Dystans Levenshteina jest zawsze nieujemny, ponieważ jest liczbą wykonanych operacji\n",
    "- Jeśli dwa ciągi są identyczne, nie potrzeba żadnych operacji do przekształcenia jednego w drugi (dystans 0), a jeśli się różnią, potrzebna jest co najmniej jedna operacja (dystans większy od 0)\n",
    "- Dystans Levenshteina jest symetryczny, ponieważ liczba operacji wymaganych do przekształcenia ciągu A w ciąg B jest taka sama jak liczba operacji potrzebnych do przekształcenia ciągu B w ciąg A (każdą operację można odwrócić: wstawienie odpowiada usunięciu, a zamiana zamianie)\n",
    "- Dystans Levenshteina spełnia nierówność trójkąta `d(X, Y) <= d(X, Z) + d(Z, Y)`. Można to uzasadnić rozważając, że przekształcenie ciągu X w Z, a następnie Z w Y jest jednym z możliwych sposobów przekształcenia X w Y, więc bezpośrednie (minimalne) przekształcenie X w Y nie będzie wymagać więcej operacji niż przekształcenie przez ciąg pośredni Z"
   ]
  },
  {
@ -223,7 +278,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 21,
   "id": "secondary-wrist",
   "metadata": {},
   "outputs": [
@ -233,7 +288,7 @@
       "2"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -254,7 +309,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 22,
   "id": "associate-tuner",
   "metadata": {},
   "outputs": [],
@ -273,7 +328,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 23,
   "id": "focal-pathology",
   "metadata": {},
   "outputs": [
@ -283,7 +338,7 @@
       "0.9166666666666666"
      ]
     },
-     "execution_count": 7,
+     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -294,7 +349,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 24,
   "id": "roman-ceiling",
   "metadata": {},
   "outputs": [
@ -304,7 +359,7 @@
       "0.9428571428571428"
      ]
     },
-     "execution_count": 8,
+     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -315,7 +370,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 25,
   "id": "invisible-cambodia",
   "metadata": {},
   "outputs": [
@ -325,7 +380,7 @@
       "0.631578947368421"
      ]
     },
-     "execution_count": 9,
+     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -344,13 +399,22 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 26,
   "id": "genetic-cradle",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write a fuzzy_lookup function that will search the translation memory for all sentences whose Levenshtein similarity to the searched sentence is greater than or equal to a set threshold.\n",
    "def fuzzy_lookup(sentence, threshold):\n",
-    "    return []"
+    "    fuzzy_matches = []\n",
    "\n",
    "    # Iterujemy przez pamięć tłumaczeń\n",
    "    for tm_sentence, tm_translation in translation_memory:\n",
    "        # Sprawdzamy, czy podobieństwo Levenshteina jest większe niż próg\n",
    "        if levenshtein_similarity(sentence, tm_sentence) >= threshold:\n",
    "            fuzzy_matches.append(tm_translation)\n",
    "\n",
    "    return fuzzy_matches"
   ]
  }
 ],
@ -373,7 +437,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.14"
  },
  "subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
  "title": "Komputerowe wspomaganie tłumaczenia",
--- a/lab/lab_03.ipynb
+++ b/lab/lab_03.ipynb
@ -63,7 +63,7 @@
   "id": "diverse-sunglasses",
   "metadata": {},
   "source": [
-    "Odpowiedź:"
+    "Odpowiedź: Wynik z Google Translate to `metal cabinet guides`"
   ]
  },
  {
@ -86,12 +86,12 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 11,
   "id": "loving-prince",
   "metadata": {},
   "outputs": [],
   "source": [
-    "text = \" For all Java programmers:\"\n",
+    "text =  \" For all Java programmers:\"\n",
    "text += \" This section explains how to compile and run a Swing application from the command line.\"\n",
    "text += \" For information on compiling and running a Swing application using NetBeans IDE,\"\n",
    "text += \" see Running Tutorial Examples in NetBeans IDE. The compilation instructions work for all Swing programs\"\n",
@ -110,7 +110,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 12,
   "id": "bound-auction",
   "metadata": {},
   "outputs": [],
@ -128,13 +128,46 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 13,
   "id": "cognitive-cedar",
   "metadata": {},
   "outputs": [],
   "source": [
    "def terminology_lookup():\n",
-    "    return []"
+    "    for term in dictionary:\n",
    "        start = 0\n",
    "        while True:\n",
    "            start = text.find(term, start)\n",
    "            if start == -1:\n",
    "                break\n",
    "            end = start + len(term)\n",
    "            print(f'{term}: ({start}, {end})')\n",
    "            start = end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "0a4a26ba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "program: (14, 21)\n",
      "program: (291, 298)\n",
      "program: (468, 475)\n",
      "program: (516, 523)\n",
      "program: (533, 540)\n",
      "application: (80, 91)\n",
      "application: (164, 175)\n",
      "application: (322, 333)\n"
     ]
    }
   ],
   "source": [
    "terminology_lookup()"
   ]
  },
  {
@ -161,7 +194,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 15,
   "id": "tribal-attention",
   "metadata": {},
   "outputs": [
@ -169,108 +202,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      " \n",
+      "  for all Java programmer : this section explain how to compile and run a swing application from the command line . for information on compile and run a swing application use NetBeans IDE , see run Tutorial Examples in NetBeans IDE . the compilation instruction work for all Swing program — applet , as well as application . here be the step you need to follow : install the late release of the Java SE platform , if you have not already do so . create a program that use swing component . compile the program . run the program . "
      "for\n",
      "all\n",
      "Java\n",
      "programmer\n",
      ":\n",
      "this\n",
      "section\n",
      "explain\n",
      "how\n",
      "to\n",
      "compile\n",
      "and\n",
      "run\n",
      "a\n",
      "swing\n",
      "application\n",
      "from\n",
      "the\n",
      "command\n",
      "line\n",
      ".\n",
      "for\n",
      "information\n",
      "on\n",
      "compile\n",
      "and\n",
      "run\n",
      "a\n",
      "swing\n",
      "application\n",
      "use\n",
      "NetBeans\n",
      "IDE\n",
      ",\n",
      "see\n",
      "Running\n",
      "Tutorial\n",
      "Examples\n",
      "in\n",
      "NetBeans\n",
      "IDE\n",
      ".\n",
      "the\n",
      "compilation\n",
      "instruction\n",
      "work\n",
      "for\n",
      "all\n",
      "swing\n",
      "program\n",
      "—\n",
      "applet\n",
      ",\n",
      "as\n",
      "well\n",
      "as\n",
      "application\n",
      ".\n",
      "here\n",
      "be\n",
      "the\n",
      "step\n",
      "-PRON-\n",
      "need\n",
      "to\n",
      "follow\n",
      ":\n",
      "install\n",
      "the\n",
      "late\n",
      "release\n",
      "of\n",
      "the\n",
      "Java\n",
      "SE\n",
      "platform\n",
      ",\n",
      "if\n",
      "-PRON-\n",
      "have\n",
      "not\n",
      "already\n",
      "do\n",
      "so\n",
      ".\n",
      "create\n",
      "a\n",
      "program\n",
      "that\n",
      "use\n",
      "Swing\n",
      "component\n",
      ".\n",
      "compile\n",
      "the\n",
      "program\n",
      ".\n",
      "run\n",
      "the\n",
      "program\n",
      ".\n"
     ]
    }
   ],
@ -281,7 +213,7 @@
    "doc = nlp(text)\n",
    "\n",
    "for token in doc:\n",
-    "    print(token.lemma_)"
+    "    print(token.lemma_, end=' ')"
   ]
  },
  {
@ -302,13 +234,40 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 40,
   "id": "surgical-demonstration",
   "metadata": {},
   "outputs": [],
   "source": [
    "def terminology_lookup():\n",
-    "    return []"
+    "    for term in dictionary:\n",
    "        for token in doc:\n",
    "            if token.lemma_ == term:\n",
    "                print(f'{token}: ({token.idx}, {token.idx + len(token)})')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "74f600ea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "programs: (291, 299)\n",
      "program: (468, 475)\n",
      "program: (516, 523)\n",
      "program: (533, 540)\n",
      "application: (80, 91)\n",
      "application: (164, 175)\n",
      "applications: (322, 334)\n"
     ]
    }
   ],
   "source": [
    "terminology_lookup()"
   ]
  },
  {
@ -337,13 +296,56 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 22,
   "id": "superb-butterfly",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_nouns(text):\n",
-    "    return []"
+    "    doc = nlp(text)\n",
    "    return [token.text for token in doc if token.pos_ == 'NOUN']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "2bfedfa3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['programmers',\n",
       " 'section',\n",
       " 'Swing',\n",
       " 'application',\n",
       " 'command',\n",
       " 'line',\n",
       " 'information',\n",
       " 'Swing',\n",
       " 'application',\n",
       " 'compilation',\n",
       " 'instructions',\n",
       " 'programs',\n",
       " 'applets',\n",
       " 'applications',\n",
       " 'steps',\n",
       " 'release',\n",
       " 'platform',\n",
       " 'program',\n",
       " 'Swing',\n",
       " 'components',\n",
       " 'program',\n",
       " 'program']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_nouns(text)"
   ]
  },
  {
@ -356,7 +358,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 19,
   "id": "acting-tolerance",
   "metadata": {},
   "outputs": [],
@ -374,13 +376,54 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 26,
   "id": "eight-redhead",
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_terms(text):\n",
-    "    return []"
+    "    doc = nlp(text)\n",
    "    terms = {}\n",
    "    for token in doc:\n",
    "        if token.pos_ == 'NOUN':\n",
    "            term = token.lemma_\n",
    "            terms[term] = terms.get(term, 0) + 1\n",
    "    return terms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "07c1122a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'programmer': 1,\n",
       " 'section': 1,\n",
       " 'swing': 3,\n",
       " 'application': 3,\n",
       " 'command': 1,\n",
       " 'line': 1,\n",
       " 'information': 1,\n",
       " 'compilation': 1,\n",
       " 'instruction': 1,\n",
       " 'program': 4,\n",
       " 'applet': 1,\n",
       " 'step': 1,\n",
       " 'release': 1,\n",
       " 'platform': 1,\n",
       " 'component': 1}"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "extract_terms(text)"
   ]
  },
  {
@ -393,14 +436,82 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 32,
   "id": "monetary-mambo",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract and count nouns, verbs and adjectives\n",
    "def extract_terms(text):\n",
-    "    return []"
+    "    doc = nlp(text)\n",
    "    terms = {\"nouns\": {}, \"verbs\": {}, \"adjectives\": {}}\n",
    "    for token in doc:\n",
    "        if token.pos_ == 'NOUN':\n",
    "            term = token.lemma_\n",
    "            terms[\"nouns\"][term] = terms[\"nouns\"].get(term, 0) + 1\n",
    "        elif token.pos_ == 'VERB':\n",
    "            term = token.lemma_\n",
    "            terms[\"verbs\"][term] = terms[\"verbs\"].get(term, 0) + 1\n",
    "        elif token.pos_ == 'ADJ':\n",
    "            term = token.lemma_\n",
    "            terms[\"adjectives\"][term] = terms[\"adjectives\"].get(term, 0) + 1\n",
    "\n",
    "    return terms"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "1eb48136",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'adjectives': {'late': 1},\n",
      " 'nouns': {'applet': 1,\n",
      "           'application': 3,\n",
      "           'command': 1,\n",
      "           'compilation': 1,\n",
      "           'component': 1,\n",
      "           'information': 1,\n",
      "           'instruction': 1,\n",
      "           'line': 1,\n",
      "           'platform': 1,\n",
      "           'program': 4,\n",
      "           'programmer': 1,\n",
      "           'release': 1,\n",
      "           'section': 1,\n",
      "           'step': 1,\n",
      "           'swing': 3},\n",
      " 'verbs': {'compile': 3,\n",
      "           'create': 1,\n",
      "           'do': 1,\n",
      "           'explain': 1,\n",
      "           'follow': 1,\n",
      "           'install': 1,\n",
      "           'need': 1,\n",
      "           'run': 4,\n",
      "           'see': 1,\n",
      "           'use': 2,\n",
      "           'work': 1}}\n"
     ]
    }
   ],
   "source": [
    "from pprint import pprint\n",
    "\n",
    "pprint(extract_terms(text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62aeea83",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
@ -422,7 +533,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.14"
  },
  "subtitle": "3. Terminologia",
  "title": "Komputerowe wspomaganie tłumaczenia",
--- a/lab/lab_04-05.ipynb
+++ b/lab/lab_04-05.ipynb
--- a/lab/lab_06-07.ipynb
+++ b/lab/lab_06-07.ipynb
@ -55,13 +55,52 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 7,
   "id": "documented-hacker",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(10, 13), (17, 21)]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "def find_tags(text):\n",
-    "    return []"
+    "    tags = re.finditer(r'<[^>]+>', text)\n",
    "    return [tag.span() for tag in tags]\n",
    "\n",
    "# Test the function\n",
    "text = 'This is a <b>bold</b> text'\n",
    "find_tags(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "1781331d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('<b>', '</b>')"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text[10:13], text[17:21]"
   ]
  },
  {
@ -74,13 +113,28 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 9,
   "id": "unauthorized-study",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "data": {
      "text/plain": [
       "(True, False, False)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def is_translatable(text):\n",
-    "    return True"
+    "    # Text is translatable if it contains only ASCII letters, spaces, and the punctuation marks . , ! ?\n",
    "    return re.fullmatch(r'[a-zA-Z .,!?]+', text) is not None\n",
    "\n",
    "# Test the function\n",
    "is_translatable('Hello, world!'), is_translatable('Hello, 123!'), is_translatable('你好，世界！')"
   ]
  },
  {
@ -93,13 +147,65 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 12,
   "id": "beautiful-mathematics",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(12, 22), (28, 38), (42, 52), (56, 66), (70, 85)]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def find_dates(text):\n",
-    "    return []"
+    "    # Find all dates in 5 formats: yyyy-mm-dd, yyyy/mm/dd, dd-mm-yyyy, dd/mm/yyyy, dd month yyyy\n",
    "    # yyyy-mm-dd\n",
    "    dates = [date.span() for date in re.finditer(r'\\b\\d{4}-\\d{2}-\\d{2}\\b', text)]\n",
    "    # yyyy/mm/dd\n",
    "    dates = dates + [date.span() for date in re.finditer(r'\\b\\d{4}/\\d{2}/\\d{2}\\b', text)]\n",
    "    # dd-mm-yyyy\n",
    "    dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}-\\d{2}-\\d{4}\\b', text)]\n",
    "    # dd/mm/yyyy\n",
    "    dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2}/\\d{2}/\\d{4}\\b', text)]\n",
    "    # dd month yyyy\n",
    "    dates = dates + [date.span() for date in re.finditer(r'\\b\\d{2} [a-zA-Z]+ \\d{4}\\b', text)]\n",
    "    return dates\n",
    "\n",
    "# Test the function\n",
    "text = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020 or 01 January 2020'\n",
    "find_dates(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "215a4cbd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2020-01-01\n",
      "2020/01/01\n",
      "01-01-2020\n",
      "01/01/2020\n",
      "01 January 2020\n"
     ]
    }
   ],
   "source": [
    "print(text[12:22])\n",
    "print(text[28:38])\n",
    "print(text[42:52])\n",
    "print(text[56:66])\n",
    "print(text[70:85])"
   ]
  },
  {
@ -125,13 +231,164 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 37,
-   "id": "finished-essex",
+   "id": "e37a24ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "text = 'The date is 2020-01-02, not 2020/01/02 or 02-01-2020 or 02/01/2020 or 02 January 2020'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "4da1f53f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'The date is 01/02/2020, not 01/02/2020 or 02/01/2020 or 02/01/2020 or 01/02/2020'"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from dateutil.parser import parse\n",
    "\n",
    "def change_data_to_US_format(text):\n",
    "    dates = find_dates(text)\n",
    "\n",
    "    for start, end in dates:\n",
    "        date = text[start:end]\n",
    "        try:\n",
    "            new_date = parse(date).strftime('%m/%d/%Y')\n",
    "            text = text[:start] + new_date + text[end:]\n",
    "        except:\n",
    "            pass\n",
    "    return text\n",
    "\n",
    "# Test the function\n",
    "change_data_to_US_format(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "8a2bf3a3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'The date is 02/01/2020, not 02/01/2020 or 01/02/2020 or 01/02/2020 or 02/01/2020'"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from dateutil.parser import parse\n",
    "\n",
    "def change_data_to_EU_format(text):\n",
    "    dates = find_dates(text)\n",
    "\n",
    "    for start, end in dates:\n",
    "        date = text[start:end]\n",
    "        try:\n",
    "            new_date = parse(date).strftime('%d/%m/%Y')\n",
    "            text = text[:start] + new_date + text[end:]\n",
    "        except:\n",
    "            pass\n",
    "    return text\n",
    "\n",
    "# Test the function\n",
    "change_data_to_EU_format(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "e1c63075",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'The date is 2020.01.02, not 2020.01.02 or 2020.02.01 or 2020.02.01 or 2020.01.02'"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from dateutil.parser import parse\n",
    "\n",
    "def change_data_to_digit_dot_format(text):\n",
    "    dates = find_dates(text)\n",
    "\n",
    "    for start, end in dates:\n",
    "        date = text[start:end]\n",
    "        try:\n",
    "            new_date = parse(date).strftime('%Y.%m.%d')\n",
    "            text = text[:start] + new_date + text[end:]\n",
    "        except:\n",
    "            pass\n",
    "    return text\n",
    "\n",
    "# Test the function\n",
    "change_data_to_digit_dot_format(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "finished-essex",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Data jest 01/01/2020, a nie 01/01/2020 lub 01/01/2020 lub 01/01/2020'"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def correct_dates(source_segment, target_segment, date_format):\n",
-    "    return ''"
+    "    # Check that the source and target segments contain the same number of dates\n",
    "    assert len(find_dates(source_segment)) == len(find_dates(target_segment))\n",
    "\n",
    "    # Check if all dates are the same (ignore the format)\n",
    "    source_dates = find_dates(source_segment)\n",
    "    target_dates = find_dates(target_segment)\n",
    "    for source_date, target_date in zip(source_dates, target_dates):\n",
    "        assert change_data_to_US_format(source_segment[source_date[0]:source_date[1]]) == change_data_to_US_format(target_segment[target_date[0]:target_date[1]]), f\"Dates are different: {source_segment[source_date[0]:source_date[1]]} and {target_segment[target_date[0]:target_date[1]]}\"\n",
    "\n",
    "    # Change the format of dates in the target segment\n",
    "    if date_format == 'US':\n",
    "        target_segment = change_data_to_US_format(target_segment)\n",
    "    elif date_format == 'EU':\n",
    "        target_segment = change_data_to_EU_format(target_segment)\n",
    "    elif date_format == 'digit.dot':\n",
    "        target_segment = change_data_to_digit_dot_format(target_segment)\n",
    "\n",
    "    return target_segment\n",
    "\n",
    "# Test the function\n",
    "source_segment = 'The date is 2020-01-01, not 2020/01/01 or 01-01-2020 or 01/01/2020'\n",
    "target_segment = \"Data jest 01/01/2020, a nie 2020-01-01 lub 01-01-2020 lub 01/01/2020\"\n",
    "correct_dates(source_segment, target_segment, 'US')"
   ]
  },
  {
@ -176,13 +433,84 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 60,
   "id": "romance-judge",
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "\n",
    "def transfer_tags(source_segment, target_segment):\n",
-    "    return ''"
+    "    # Split the segments into tokens\n",
    "    source_tokens = source_segment.split()\n",
    "    target_tokens = target_segment.split()\n",
    "\n",
    "    # Calculate the ratio of the number of tokens in the target to the number of tokens in the source\n",
    "    ratio = len(target_tokens) / len(source_tokens)\n",
    "\n",
    "    # Assign tags to tokens in the target tokens - if the source token has a tag, assign it to the corresponding token in the target tokens\n",
    "    for i, source_token in enumerate(source_tokens):\n",
    "        if re.match(r'<[^>]+>', source_token):\n",
    "            target_index = math.ceil(i * ratio)\n",
    "\n",
    "            if target_index >= len(target_tokens):\n",
    "                target_index = len(target_tokens) - 1\n",
    "\n",
    "            # Assign start tag\n",
    "            target_tokens[target_index] = re.findall(r'<[^>]+>', source_token)[0] + target_tokens[target_index]\n",
    "\n",
    "            # Assign end tag\n",
    "            target_tokens[target_index] = target_tokens[target_index] + re.findall(r'</[^>]+>', source_token)[0]\n",
    "\n",
    "    return ' '.join(target_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "fd8858d8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'To jest <b>ważny</b> tekst'"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Test the function (same number of tokens)\n",
    "source_segment = 'This is <b>bold</b> text'\n",
    "target_segment = 'To jest ważny tekst'\n",
    "transfer_tags(source_segment, target_segment)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "de9e6298",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'To jest bardzo <b>ważny</b> tekst'"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Test the function (different number of tokens)\n",
    "source_segment = 'This is <b>bold</b> text'\n",
    "target_segment = 'To jest bardzo ważny tekst'\n",
    "transfer_tags(source_segment, target_segment)"
   ]
  }
 ],
@ -205,7 +533,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.14"
  },
  "subtitle": "6,7. Preprocessing i postprocessing",
  "title": "Komputerowe wspomaganie tłumaczenia",
--- a/lab/lab_08.ipynb
+++ b/lab/lab_08.ipynb
--- a/lab/lab_09-10.ipynb
+++ b/lab/lab_09-10.ipynb
--- a/lab/lab_11.ipynb
+++ b/lab/lab_11.ipynb
@ -57,8 +57,28 @@
   "metadata": {},
   "outputs": [],
   "source": [
    "import regex\n",
    "\n",
    "def sentence_split(text):\n",
-    "    return []"
+    "    # Regular expression pattern matching the whitespace that follows sentence-ending punctuation (. ! ?) and precedes an uppercase letter\n",
    "    pattern = regex.compile(r'(?<=[.!?])\\s+(?=\\p{Lu})', regex.UNICODE)\n",
    "    \n",
    "    # Split the text using the defined pattern\n",
    "    segments = regex.split(pattern, text)\n",
    "\n",
    "    # Remove leading and trailing whitespace from each segment\n",
    "    segments = [segment.strip() for segment in segments]\n",
    "\n",
    "    # Replace multiple newlines with a single newline\n",
    "    segments = [regex.sub(r'\\n+', '\\n', segment) for segment in segments]\n",
    "\n",
    "    # Replace multiple spaces with a single space\n",
    "    segments = [regex.sub(r'\\s+', ' ', segment) for segment in segments]\n",
    "\n",
    "    # Remove empty segments\n",
    "    segments = [segment for segment in segments if segment]\n",
    "    \n",
    "    return segments"
   ]
  },
  {
@ -71,13 +91,129 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
   "id": "guilty-morocco",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Segment 1: Wydział Matematyki i Informatyki | Wydział Matematyki i Informatyki Brak obsługi JavaScript Do pełnej funkcjonalności strony potrzebujesz włączonej obsługi skryptów.\n",
      "Segment 2: Instrukcje, które pozwolą Ci włączyć skrypty w Twojej przeglądarce znajdziesz tutaj Przejdź do TreśćPrzejdź do Menu głównePrzejdź do Mapa serwisuPrzejdź do Dostępność A A A en pl Wyszukaj Wyszukaj Nawigacja mobilna Wydział - Wydział Matematyki i Informatyki Wydział - Wydział Matematyki i Informatyki NO XML TR1A Wydział Pokaż menu szczegółowe Powrót do głównego menu O wydziale Władze wydziału Struktura wydziału Rada Naukowa Dyscyplin Rady programowe Pracownicy Projekty Historia Biblioteka wydziałowa Informator WMI w mediach Wybory 2024 Kontakt Życie naukowe Pokaż menu szczegółowe Powrót do głównego menu Awanse naukowe Wykłady i seminaria Cykle wykładów Towarzystwa i redakcje Konferencje Doktorzy honoris causa Profesorowie Członkowie Akademii Konkurs im.\n",
      "Segment 3: Edyty Szymańskiej Dla Kandydata Pokaż menu szczegółowe Powrót do głównego menu Rekrutacja krok po kroku Studia I stopnia Studia II stopnia Studia doktoranckie Studia podyplomowe Akademia CISCO Samorząd studencki Koła i organizacje studenckie Uniwersytet Otwarty Dla Studenta Dla Pracownika Dla szkół Pokaż menu szczegółowe Powrót do głównego menu Edukacja matematyczno-informatyczna Współpraca ze szkołami Współpraca Pokaż menu szczegółowe Powrót do głównego menu Współpraca z biznesem Współpraca ze szkołami Targi pracy i staży branży IT Oferty pracy 30-LECIE Pokaż menu szczegółowe Powrót do głównego menu Harmonogram Wykłady naukowe z okazji 30-lecia WMI Wydarzenia KWUMI Galeria Zjazd Absolwentów powrót do góry Uniwersytet im.\n",
      "Segment 4: Adama Mickiewicza w PoznaniuIntranet pracownikaIntranet studenta Stypendium dla olimpijczykówJesteś laureatem lub finalistą olimpiady przedmiotowej?\n",
      "Segment 5: Sprawdź jak uzyskać stypendium!Czytaj więcej Jubileusz 30-leciaWydziału Matematyki i Informatyki UAMCzytaj więcej Z okazji 30-lecia wydziału22 czerwca 2024 r. serdecznie zapraszamy na zjazd absolwentówZAREJESTRUJ SIĘ!\n",
      "Segment 6: Data on CampusZapraszamy na Data on Campus #1Czytaj więcejStypendium dla olimpijczykówSprawdź!Jubileusz 30-leciaWydziału Matematyki i InformatykiZjazd absolwentów22 czerwca 2024 r.Data on Campus #18 czerwca 2024 r.\n",
      "Segment 7: Wiadomości Absolutorium 2024 28 maja 2024 IX edycja konkursu Study@research - laureaci z WMI 21 maja 2024 Pozytywna ocena PKA dla kierunku informatyka 14 maja 2024 Wyjazdowa Rada Pracodawców 13 maja 2024 Sportowe sukcesy WMI 09 maja 2024 Czytaj więcej Wydarzenia 5 czerwca 2024 Publiczna obrona rozprawy doktorskiej mgra Tomasza Ziętkiewicza 8 czerwca 2024 Data on Campus #1 10 czerwca 2024 Wykład 23: Grafowe modele sieci społecznościowych, czyli o światach dużych i małych 11 czerwca 2024 Wykład nr 24: O zbiorach rozmytych, czyli o tym, jak nauczyć komputer rozumieć oraz wykorzystywać informację nieprecyzyjną 13 czerwca 2024 Wykład nr 25: Jak z dwóch kryształów otrzymać jeden, czyli o dodawaniu i odejmowaniu wielościanów 15 czerwca 2024 Ultimate Hackathon Mission 3.0 Czytaj więcej O wydziale Jako jednostka uczelni badawczej, Wydział Matematyki i Informatyki UAM w Poznaniu kontynuuje ponad 100-letnią tradycję poznańskiej matematyki.\n",
      "Segment 8: Jest też jednym z najlepszych ośrodków badawczo-dydaktycznych w zakresie informatyki w Polsce.\n",
      "Segment 9: Obecnie Wydział prowadzi studia na czterech kierunkach: matematyce, informatyce, analizie i przetwarzaniu danych oraz na nauczaniu matematyki i informatyki.\n",
      "Segment 10: Ostatni z wymienionych kierunków stanowi ofertę wyjątkową w skali całego kraju.\n"
     ]
    }
   ],
   "source": [
-    "def sentence_split_enhanced(text):\n",
+    "import requests\n",
-    "    return []"
+    "from bs4 import BeautifulSoup\n",
    "\n",
    "def fetch_webpage_content(url):\n",
    "    response = requests.get(url)\n",
    "    response.raise_for_status()  # Raise an exception for HTTP errors\n",
    "    soup = BeautifulSoup(response.content, 'html.parser')\n",
    "    return soup.get_text()\n",
    "\n",
    "url = \"https://wmi.amu.edu.pl/\"\n",
    "webpage_content = fetch_webpage_content(url)\n",
    "\n",
    "import re\n",
    "import unicodedata\n",
    "\n",
    "segments = sentence_split(webpage_content)\n",
    "for i, segment in enumerate(segments[:10]):\n",
    "    print(f\"Segment {i+1}: {segment}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3cd97d83",
   "metadata": {},
   "source": [
    "### Wyjątek 1: Skróty zakończone kropką\n",
    "Skróty takie jak \"mgr.\", \"prof.\", \"dr.\" mogą powodować niepotrzebne podziały segmentów. Musimy upewnić się, że algorytm nie dzieli zdania po skrótach."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd509273",
   "metadata": {},
   "source": [
    "### Wyjątek 2: Daty i inne liczby zakończone kropką\n",
    "Daty, takie jak \"22 czerwca 2024 r.\", mogą również powodować nieprawidłowe podziały. Musimy uwzględnić takie przypadki."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "20b69c09",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Segment 1: Wydział Matematyki i Informatyki | Wydział Matematyki i Informatyki Brak obsługi JavaScript Do pełnej funkcjonalności strony potrzebujesz włączonej obsługi skryptów.\n",
      "Segment 2: Instrukcje, które pozwolą Ci włączyć skrypty w Twojej przeglądarce znajdziesz tutaj Przejdź do TreśćPrzejdź do Menu głównePrzejdź do Mapa serwisuPrzejdź do Dostępność A A A en pl Wyszukaj Wyszukaj Nawigacja mobilna Wydział - Wydział Matematyki i Informatyki Wydział - Wydział Matematyki i Informatyki NO XML TR1A Wydział Pokaż menu szczegółowe Powrót do głównego menu O wydziale Władze wydziału Struktura wydziału Rada Naukowa Dyscyplin Rady programowe Pracownicy Projekty Historia Biblioteka wydziałowa Informator WMI w mediach Wybory 2024 Kontakt Życie naukowe Pokaż menu szczegółowe Powrót do głównego menu Awanse naukowe Wykłady i seminaria Cykle wykładów Towarzystwa i redakcje Konferencje Doktorzy honoris causa Profesorowie Członkowie Akademii Konkurs im. Edyty Szymańskiej Dla Kandydata Pokaż menu szczegółowe Powrót do głównego menu Rekrutacja krok po kroku Studia I stopnia Studia II stopnia Studia doktoranckie Studia podyplomowe Akademia CISCO Samorząd studencki Koła i organizacje studenckie Uniwersytet Otwarty Dla Studenta Dla Pracownika Dla szkół Pokaż menu szczegółowe Powrót do głównego menu Edukacja matematyczno-informatyczna Współpraca ze szkołami Współpraca Pokaż menu szczegółowe Powrót do głównego menu Współpraca z biznesem Współpraca ze szkołami Targi pracy i staży branży IT Oferty pracy 30-LECIE Pokaż menu szczegółowe Powrót do głównego menu Harmonogram Wykłady naukowe z okazji 30-lecia WMI Wydarzenia KWUMI Galeria Zjazd Absolwentów powrót do góry Uniwersytet im. Adama Mickiewicza w PoznaniuIntranet pracownikaIntranet studenta Stypendium dla olimpijczykówJesteś laureatem lub finalistą olimpiady przedmiotowej?\n",
      "Segment 3: Sprawdź jak uzyskać stypendium!Czytaj więcej Jubileusz 30-leciaWydziału Matematyki i Informatyki UAMCzytaj więcej Z okazji 30-lecia wydziału22 czerwca 2024 r. serdecznie zapraszamy na zjazd absolwentówZAREJESTRUJ SIĘ!\n",
      "Segment 4: Data on CampusZapraszamy na Data on Campus #1Czytaj więcejStypendium dla olimpijczykówSprawdź!Jubileusz 30-leciaWydziału Matematyki i InformatykiZjazd absolwentów22 czerwca 2024 r.Data on Campus #18 czerwca 2024 r. Wiadomości Absolutorium 2024 28 maja 2024 IX edycja konkursu Study@research - laureaci z WMI 21 maja 2024 Pozytywna ocena PKA dla kierunku informatyka 14 maja 2024 Wyjazdowa Rada Pracodawców 13 maja 2024 Sportowe sukcesy WMI 09 maja 2024 Czytaj więcej Wydarzenia 5 czerwca 2024 Publiczna obrona rozprawy doktorskiej mgra Tomasza Ziętkiewicza 8 czerwca 2024 Data on Campus #1 10 czerwca 2024 Wykład 23: Grafowe modele sieci społecznościowych, czyli o światach dużych i małych 11 czerwca 2024 Wykład nr 24: O zbiorach rozmytych, czyli o tym, jak nauczyć komputer rozumieć oraz wykorzystywać informację nieprecyzyjną 13 czerwca 2024 Wykład nr 25: Jak z dwóch kryształów otrzymać jeden, czyli o dodawaniu i odejmowaniu wielościanów 15 czerwca 2024 Ultimate Hackathon Mission 3.0 Czytaj więcej O wydziale Jako jednostka uczelni badawczej, Wydział Matematyki i Informatyki UAM w Poznaniu kontynuuje ponad 100-letnią tradycję poznańskiej matematyki.\n",
      "Segment 5: Jest też jednym z najlepszych ośrodków badawczo-dydaktycznych w zakresie informatyki w Polsce.\n",
      "Segment 6: Obecnie Wydział prowadzi studia na czterech kierunkach: matematyce, informatyce, analizie i przetwarzaniu danych oraz na nauczaniu matematyki i informatyki.\n",
      "Segment 7: Ostatni z wymienionych kierunków stanowi ofertę wyjątkową w skali całego kraju.\n",
      "Segment 8: W ofercie Wydziału można także znaleźć studia podyplomowe. 4 kierunki studiów 1700+ studentów 6000+ absolwentów Studia I stopnia Matematyka Fascynuje Cię królowa nauk?\n",
      "Segment 9: Jesteś umysłem ścisłym?\n",
      "Segment 10: Chcesz studiować matematykę na wiodącej uczelni w Polsce?\n"
     ]
    }
   ],
   "source": [
    "import regex\n",
    "\n",
    "def enhanced_sentence_split(text):\n",
    "    # Lista wyjątków, po których nie dzielimy nawet jeśli jest kropka\n",
    "    exceptions = ['r.', 'tzn.', 'np.', 'itp.', 'etc.', 'dr.', 'prof.', 'im.']\n",
    "\n",
    "    # Regular expression pattern to match sentence-ending punctuation followed by a space and an uppercase letter\n",
    "    pattern = regex.compile(r'(?<=[.!?])\\s+(?=\\p{Lu})', regex.UNICODE)\n",
    "\n",
    "    # Split the text using the defined pattern\n",
    "    segments = regex.split(pattern, text)\n",
    "\n",
    "    # Remove leading and trailing whitespace from each segment\n",
    "    segments = [segment.strip() for segment in segments]\n",
    "\n",
    "    # Rejoin segments that were incorrectly split due to exceptions\n",
    "    i = 0\n",
    "    while i < len(segments) - 1:\n",
    "        for exception in exceptions:\n",
    "            if segments[i].endswith(exception):\n",
    "                segments[i] += ' ' + segments.pop(i + 1)\n",
    "                break\n",
    "        else:\n",
    "            i += 1\n",
    "\n",
    "    # Replace multiple newlines with a single newline\n",
    "    segments = [regex.sub(r'\\n+', '\\n', segment) for segment in segments]\n",
    "\n",
    "    # Replace multiple spaces with a single space\n",
    "    segments = [regex.sub(r'\\s+', ' ', segment) for segment in segments]\n",
    "\n",
    "    # Remove empty segments\n",
    "    segments = [segment for segment in segments if segment]\n",
    "\n",
    "    return segments\n",
    "\n",
    "segments = enhanced_sentence_split(webpage_content)\n",
    "for i, segment in enumerate(segments[:10]):\n",
    "    print(f\"Segment {i+1}: {segment}\")"
   ]
  },
  {
@ -122,46 +258,145 @@
   "id": "divided-chain",
   "metadata": {},
   "source": [
-    "XLIFF jest formatem do przechowywania pamięci tłumaczeń, który opiera się na XML-u. Przykładowy plik XLIFF wygląda następująco:"
+    "XLIFF jest formatem do przechowywania pamięci tłumaczeń, który opiera się na XML-u."
   ]
  },
  {
-   "cell_type": "raw",
+   "cell_type": "code",
-   "id": "appropriate-timber",
+   "execution_count": 4,
   "id": "169d0134",
   "metadata": {},
   "outputs": [],
   "source": [
-    "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
+    "text_hu = fetch_webpage_content(\"https://hu.wikipedia.org/wiki/Sz%C3%A1m%C3%ADt%C3%A1studom%C3%A1ny\")\n",
-    "<xliff xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\">\n",
+    "text_en = fetch_webpage_content(\"https://en.wikipedia.org/wiki/Computer_science\")"
-    "    <file datatype=\"plaintext\" original=\"self\" source-language=\"en\" target-language=\"es\">\n",
+   ]
-    "        <header>\n",
+  },
-    "            <sxmd:metadata xmlns:sxmd=\"urn:x-sap:mlt:xliff12:metadata:1.0\" xmlns=\"urn:x-sap:mlt:tsmetadata:1.0\">\n",
+  {
-    "                <object-name>sample</object-name>\n",
+   "cell_type": "code",
-    "                <collection>KWT</collection>\n",
+   "execution_count": 5,
-    "                <domain>KWT</domain>\n",
+   "id": "f3549418",
-    "                <developer>123</developer>\n",
+   "metadata": {},
-    "                <description>sample XLIFF file</description>\n",
+   "outputs": [],
-    "            </sxmd:metadata>\n",
+   "source": [
-    "        </header>\n",
+    "hu_segments = enhanced_sentence_split(text_hu)\n",
-    "        <body>\n",
+    "en_segments = enhanced_sentence_split(text_en)"
-    "            <trans-unit>\n",
+   ]
-    "                <source>Hello world!</source>\n",
+  },
-    "                <target>Hola mundo!</target>\n",
+  {
-    "            </trans-unit>\n",
+   "cell_type": "code",
-    "            <trans-unit>\n",
+   "execution_count": 6,
-    "                <source>File</source>\n",
+   "id": "143730e4",
-    "                <target>Archivo</target>\n",
+   "metadata": {},
-    "            </trans-unit>\n",
+   "outputs": [
-    "            <trans-unit>\n",
+    {
-    "                <source>New</source>\n",
+     "data": {
-    "                <target>Nuevo</target>\n",
+      "text/plain": [
-    "            </trans-unit>\n",
+       "['Számítástudomány – Wikipédia Ugrás a tartalomhoz Főmenü Főmenü áthelyezés az oldalsávba elrejtés Navigáció KezdőlapTartalomKiemelt szócikkekFriss változtatásokLap találomraTudakozó Részvétel KezdőknekSegítségKözösségi portálKapcsolatfelvételAdományok Keresés Keresés Fiók létrehozása Bejelentkezés Személyes eszközök Fiók létrehozása Bejelentkezés Lapok kijelentkezett szerkesztőknek további információk KözreműködésekVitalap Tartalomjegyzék áthelyezés az oldalsávba elrejtés Bevezető 1Vizsgálati területei A(z) Vizsgálati területei alszakasz kinyitása/becsukása 1.1Számítástudomány 1.2Számítógép-tudomány 2Története és alágai 3Kapcsolódó szócikkek 4Jegyzetek 5További információk Tartalomjegyzék kinyitása/becsukása Számítástudomány 161 nyelv EnglishAfrikaansAlemannischአማርኛAragonésالعربيةمصرىঅসমীয়াAsturianuAzərbaycancaتۆرکجهБашҡортсаBoarischŽemaitėškaBikol CentralБеларускаяБеларуская (тарашкевіца)БългарскиभोजपुरीবাংলাBrezhonegBosanskiCatalàکوردیCorsuČeštinaKaszëbscziCymraegDanskDeutschZazakiΕλληνικάEmiliàn e rumagnòlEsperantoEspañolEestiEuskaraEstremeñuفارسیSuomiVõroFøroysktFrançaisNordfriiskFurlanFryskGaeilgeKriyòl gwiyannenGalegoGaelg客家語/Hak-kâ-ngîעבריתहिन्दीHrvatskiKreyòl ayisyenՀայերենInterlinguaBahasa IndonesiaInterlingueÍslenskaItalianoᐃᓄᒃᑎᑐᑦ / inuktitut日本語La .lojban.ქართულიQaraqalpaqshaTaqbaylitҚазақшаKalaallisutភាសាខ្មែរಕನ್ನಡ한국어KurdîLatinaLadinoLëtzebuergeschLigureLombardLietuviųLatgaļuLatviešuМокшеньMalagasyОлык марийMinangkabauМакедонскиമലയാളംМонголꯃꯤꯇꯩ ꯂꯣꯟBahasa MelayuMirandésမြန်မာဘာသာNapulitanoNedersaksiesनेपालीनेपाल भाषाNederlandsNorsk nynorskNorsk bokmålNovialNouormandOccitanOromooଓଡ଼ିଆPicardपालिPolskiPiemontèisپښتوPortuguêsRuna SimiRomânăArmãneashtiРусскийРусиньскыйसंस्कृतम्Саха тылаSarduSicilianuScotsسنڌيSrpskohrvatski / српскохрватскиTaclḥitၽႃႇသႃႇတႆး සිංහලSimple EnglishSlovenčinaSlovenščinaShqipСрпски / srpskiSeelterskSundaSvenskaKiswahiliதமிழ்తెలుగుTetunТоҷикӣไทยትግርኛTagalogTok PisinTürkçeТатарча / tatarçaئۇيغۇرچە / 
UyghurcheУкраїнськаاردوOʻzbekcha / ўзбекчаVènetoTiếng ViệtWalonWinarayWolof吴语მარგალურიYorùbáZeêuws中文閩南語 / Bân-lâm-gú粵語IsiZulu Hivatkozások szerkesztése SzócikkVitalap magyar OlvasásSzerkesztésLaptörténet Eszközök Eszközök áthelyezés az oldalsávba elrejtés Műveletek OlvasásSzerkesztésLaptörténet Általános Mi hivatkozik erre?Kapcsolódó változtatásokSpeciális lapokHivatkozás erre a változatraLapinformációkHogyan hivatkozz erre a lapra?Rövidített URL készítéseQR-kód letöltéseWikidata-adatlap Nyomtatás/\\u200bexportálás Könyv készítéseLetöltés PDF-kéntNyomtatható változat Társprojektek Wikimédia Commons A Wikipédiából, a szabad enciklopédiából Ez a közzétett változat, ellenőrizve: 2023. június 30.Pontosságellenőrzött Lásd még: Informatika A számítástudomány (computing science) és a számítógép-tudomány (computer science) egymáshoz nagyon közeli, egymást majdnem teljesen átfedő és szorosan összefüggő területeket ölel fel, ezért tárgyalásuk csak együttesen értelmezhető.',\n",
-    "            <trans-unit>\n",
+       " 'Mindkét tudományág lényege, hogy az információkezelést és -feldolgozást állítja vizsgálata fókuszába elméleti és gyakorlati megközelítésben.',\n",
-    "                <source>Exit</source>\n",
+       " 'Kialakulása az 1940-es években kezdődött, nemcsak időben egybeesve, de szoros kapcsolatban is az első elektronikus számítógépek tervezésével.',\n",
-    "                <target>Salir</target>\n",
+       " 'A számítástudomány nem azonos sem az informatikával, sem a számítástechnikával (főleg ha a szilíciumcsipek gyártásának technikáját is ideértjük), sem pedig az információelmélettel, bár vannak kisebb-nagyobb átfedések.',\n",
-    "            </trans-unit>\n",
+       " 'A számítástudománynak nem feladata konkrét szoftverek fejlesztése, bár foglalkozik azzal, miképp lehet a szoftverek hatékony tervezését segíteni, és ennek milyen elméleti alapjai vannak.',\n",
-    "        </body>\n",
+       " 'Nem feladata konkrét információfeldolgozó gépek tervezése, bár szintén foglalkozik azzal, hogyan lehet ezek hatékonyságát elméleti szinten növelni; végképp nem feladata pedig ezek megépítése, bár a tudományág úttörői, mint Alan Turing vagy Neumann János, munkatársként részt vettek a számítógépek korai modelljeinek építésében, kialakításában is (elméleti munkásságukkal szoros kapcsolatban).',\n",
-    "    </file>\n",
+       " 'Vizsgálati területei[szerkesztés] Számítástudomány[szerkesztés] A számítástudomány[1][2] a matematika egyik, igen fiatal tudományága, amely az információfeldolgozó gépek (például számítógépek) tervezésének és működtetésének elméleti, matematikai alapjaival foglalkozik.[3] Némileg elnagyoltan az algoritmusok általános elméletének is nevezhető.[4] „A számítógépek megjelenése, a mechanikus számítási eljárások megindították az algoritmus definíciójának és a programok írásmódjának formalizálását, az algoritmusok és programok szintaktikai (utasítások, vezérlési struktúra), szemantikai (helyesség, ekvivalencia), valamint kiszámíthatósági (a bemeneti értékekhez tartozó kiszámítási idő és memóriaszükséglet) tulajdonságainak mélyreható vizsgálatát.',\n",
-    "</xliff>"
+       " 'E kutatási területeket összefoglalóan matematikai számítástudománynak nevezzük.”[5][6] Az információkezelés és -feldolgozás matematikai alapjai köré csoportosul, és a számítások alapvető természetének megértésére irányul, mely számos alkalmazáshoz vezet a hatékony algoritmusok elemzésében és tervezésében, valamint a megbízható hardver- és szoftverrendszerek tervezésére és ellenőrzésére szolgáló formális módszerek fejlesztésében.',\n",
       " 'Elméleti alapjai: az automataelmélet, a fordítóprogramok, az adatbázis-elmélet.',\n",
       " 'Gyakorlati területei: a számítógépes irányítás és szabályozás, a nagy rendszerek analízise és szintézise, a mérnöki tervezés.',\n",
       " 'Ezek alapjait a halmazok, ítéletek, relációk, függvények, a számelmélet, a különböző algebrai struktúrák, azon belül főként a Boole-algebra adja.',\n",
       " 'Fontos része a kódelmélet, azon belül a zajmentes és zajos csatornák, az optimális és hibajavító kódolás alapelemei, az automaták és formális nyelvek elmélete, a párhuzamos és elosztott számítási rendszerek elmélete, valamint az algebra, a logika és a kategóriák a számítástudományban.',\n",
       " 'Komplex vizsgálati területe a kiszámíthatóságelmélet, valamint annak kiterjesztése, a bonyolultságelmélet, mely azt vizsgálja, miképp lehet osztályozni az algoritmikusan megoldható problémákat, feladatokat a megoldásukhoz szükséges erőforrások mennyisége szerint.',\n",
       " 'A számítógép-tudománnyal átfedésben levő átmeneti elemei: az adatstruktúrák, az algoritmusok, a programozási nyelvek, a szoftvertechnológia, a mesterséges intelligencia, az adatbázis-kezelés.',\n",
       " 'Számítógép-tudomány[szerkesztés] A számítógép-tudomány[7][8][9] tárgya maga a számítógép mint eszköz; az információfeldolgozó gépek tervezésének és használatának elméleti kérdéseit kutatja.',\n",
       " 'A matematika egyik igen fiatal tudományága, amely az információfeldolgozó gépek (például számítógépek) tervezésének és működtetésének elméleti, matematikai alapjaival foglalkozik.',\n",
       " 'Némileg elnagyoltan az algoritmusok általános elméletének is nevezhető.[10] Eredményei és tárgya közé tartoznak a számításokat végző rendszerek és módszerek megértésével, tervezési módszerekkel, algoritmusokkal és eszközökkel, a fogalmak tesztelésének, valamint az analízisnek és verifikációnak módszereivel, a tudásreprezentációval és ennek implementációjával foglalkozó elméletek.',\n",
       " 'Komplex vizsgálati területei a véges automaták, valamint a veremautomaták, mint a Turing-gép speciális esetei.',\n",
       " 'Fő elemei: az algoritmusok és adatszerkezetek, a programozási módszertan és nyelvek, valamint a számítógépes elemek és architektúrák.',\n",
       " 'Története és alágai[szerkesztés] A számítógép-tudomány a matematika egyik legkésőbb, mintegy fél évszázada önállósult ága.',\n",
       " 'Keletkezését 1936-tól, Alan Turing angol matematikus automata- és algoritmuselméleti cikkeinek megjelenésétől, illetve Neumann János, Stephen Cole Kleene, Andrej Markov, George H.',\n",
       " 'Mealy, Edward Forrest Moore, Emil Post, Kurt Gödel, John McCarthy és más kutatók hasonló jellegű munkáinak napvilágra kerülésétől kezdve számíthatjuk.',\n",
       " 'A számítógép-tudomány fejlődése rendkívül gyors, a legtöbb alágnak azonban már van kialakult és közmegegyezéses jellegű elnevezése és feladatköre.',\n",
       " 'Néhány alága, elméletcsoportja:[11] kiszámíthatóságelmélet, rekurzióelmélet: az algoritmusok futásának befejeződését, eredményes lefutásának lehetőségét és viszonyait vizsgálja,[12] más szavakkal: egyes függvényeknek, műveleteknek más függvényekkel való kiszámíthatóságával foglalkozik, tekinthető a számításelmélet egy olyan ágának vagy testvérterületének is, mely Turing-gépek és automaták helyett hagyományos matematikai fogalmakra (függvény, generált struktúra stb.) alapoz.',\n",
       " 'E terület úttörője Stephen Cole Kleene volt (érdekesség, hogy a matematikai logika részének is tekinthető).[13] A bonyolultságelmélet a kiszámíthatóságelmélet kiterjesztése.',\n",
       " 'Azt vizsgálja, hogyan lehet osztályozni az algoritmikusan megoldható problémákat, feladatokat a megoldásukhoz szükséges erőforrások mennyisége szerint.[14] automataelmélet,[8] számításelmélet, bonyolultságelmélet vagy komplexitáselmélet: formális nyelvek, formális nyelvtanok és automaták elmélete: ide sorolhatóak a generatív nyelvtanok, általánosabban a produkciós rendszerek, az automatatípusok által generált és elfogadott nyelvek vizsgálata, az egyes automatatípusok összehasonlítása.',\n",
       " 'Ennek az alágnak rengeteg fontos kutatója volt mind nyugaton, mind a Szovjetunióban, ill.',\n",
       " 'Oroszországban.',\n",
       " 'Fontos terület a Turing-gépek és hasonló automaták elmélete, mégpedig az ezek által futtatott algoritmusok idő-és memóriaigényének vizsgálata.',\n",
       " 'Központi problémája a hatékonysági vagy bonyolultsági osztályok (P, NP stb.) közti kapcsolatok megállapítása, illetve az indeterminisztikus algoritmusok vizsgálata és alkalmazása; absztrakt adatszerkezetek elmélete:[6] ide tartozik a gráfelméleti algoritmusok vizsgálata (keresési problémák és például a matroidok alkalmazása az ilyesfajta problémákra), az informatika bizonyos alapfogalmainak (adatszerkezetek) matematikai leírása; formális szemantika: ez a fordítóprogramok különböző formális nyelvtanokkal való leírásának matematikai elméletéből nőtte ki magát; fontos szerepet játszanak benne az attribútumnyelvtanok és rekurzív nyelvtanok elmélete (például), vagy például a logikai programozás elméleti leírása; logikai tervezés és optimalizálás:[8] ez a hagyományosan mérnöki tudomány a számítógép-tudomány absztrakt modelljeinek tanulmányozásával egy időben alakult ki, nagyrészt tőlük függetlenül, logikai áramköröket ugyanis nemcsak számítógépekben, hanem egyszerűbb automatákban is használnak.',\n",
       " 'Az áramkörök tervezésével és optimalizálásával foglalkozik, logikainak azért nevezik, mert az áramkörmodelleket ún. logikai kapukból építi fel: egy logikai kapu olyan elektronikus szerkezet, amely a bemenő digitális (a gyakorlatban szinte mindig elektronikus) jelek valamilyen logikai függvényét képes előállítani.',\n",
       " 'Optimális egy áramkör (általában), ha a kapuk számát sikerül minimalizálni. mesterségesintelligencia-kutatás[8] (pontosabban ennek matematikai alapjai): az az algoritmusok hatékonyságát azok önállóságának, önműködésének szempontjából vizsgálja; ez az elmélet a számítógép-tudomány, az informatika és a kognitív tudomány érdekes határterületeiből nőtt össze és ki; Számos terület (pl. a párhuzamos algoritmusok elmélete, az axiomatikus bonyolultságelmélet stb.) azonban még mindig inkább csak születőfélben lévő elmélet, mintsem önálló névvel rendelkező tudományág formájában létezik, és nehezebben sorolható a fenti alágak közé.',\n",
       " 'Kapcsolódó szócikkek[szerkesztés] algoritmus Informatika Jegyzetek[szerkesztés] ↑ Katona Gyula – Recski András – Szabó Csaba: A számítástudomány alapjai.',\n",
       " 'Typotex Kft., 2002.; ISBN 978-963-9664-19-7; ISBN 963-9664-19-7. ↑ A BME számítástudományi és információelméleti tanszékének honlapja.',\n",
       " 'Hiv. beill.: 2011. 12. 19. ↑ Computer science Archiválva 2010. május 27-i dátummal a Wayback Machine-ben - Szótári bejegyzés az amerikai NITRD (A Hálózati és Információs Technológia Nemzeti Együttműködést Irányító Hivatala - National Coordination Office for Networking and Information Technology) honlapján. ↑ Dayton Codebreakers.com[halott link] ↑ Giorgio Ausiello: Algoritmusok és rekurzív függvények bonyolultságelmélete.',\n",
       " 'Műszaki Könyvkiadó, Bp., 1984.',\n",
       " 'ISBN 963-10-5159-5. 14. o. ↑ a b U.',\n",
       " 'S.',\n",
       " 'National Research Council Committee on the Fundamentals of Computer Science : Computer Science.',\n",
       " 'Google elektronikus könyv (PDF), (erősen) korlátozott előnézet.',\n",
       " 'Hiv. beill. 2010. július 12.; 11.-13. o.Hiv. beillesztése: 2011. 12. 19. ↑ A kombinatorika és a séta mestere (beszélgetés Szemerédi Endre matematikussal).',\n",
       " 'Magyar Tudomány; 2008./06.; hiv. beill. 2010. augusztus 1.',\n",
       " 'Vö.: „Az elméleti számítástechnika művelése ugyanis sokszor nagyon nehéz, bonyolult matematikai eszközöket és gondolatokat igényel: szóval, az elméleti számítástechnika szerintem a matematika egyik ága!',\n",
       " 'Egyébként Magyarországon folyt vita arról, miképp nevezzék a gyereket, elméleti számítástechnika, számítógép-tudomány és ki tudja, mi még – egyik sem tűnik túl szerencsésnek.',\n",
       " 'Talán az elméleti számítástechnika a legjobb magyar fordítás…” ↑ a b c d Ralston, Anthony: Programozás és számítógép-tudomány.',\n",
       " 'Műszaki Könyvkiadó, Bp., 1974.',\n",
       " 'ISBN 963-10-0616-6. ↑ Az ELTE számítógép-tudományi tanszékének honlapja Archiválva 2010. május 15-i dátummal a Wayback Machine-ben.',\n",
       " 'Hiv. beill.: 2011. 12. 19. ↑ Tudomány és még sok minden. mindenkilapja.hu. [2016. augusztus 15-i dátummal az eredetiből archiválva]. (Hozzáférés: 2016. január 13.) ↑ Tasnádi Attila: Számítástudomány gazdaságinformatikusoknak | bookline. [2008. november 9-i dátummal az eredetiből archiválva]. (Hozzáférés: 2010. július 12.) ↑ Algoritmizálás alapjai. tankonyvtar.hu, 2011. (Hozzáférés: 2016. január 13.) ↑ (ld. angolul). [2004. április 16-i dátummal az eredetiből archiválva]. (Hozzáférés: 2004. október 3.) ↑ Ésik, Zoltán.',\n",
       " 'A számítástudomány alapjai.',\n",
       " 'TypotexKiadó, 5. o. (2011).',\n",
       " 'Hozzáférés ideje: 2016. január 13.',\n",
       " 'További információk[szerkesztés] Alice és Bob – 6. rész: Alice és Bob a kiszámíthatóság határán Alice és Bob – 7. rész: Alice és Bob egymillió dolláros kérdése Alice és Bob – 8. rész: Alice és Bob biztonsága Ralston, Anthony: Programozás és számítógép-tudomány.',\n",
       " 'Műszaki Könyvkiadó, Bp., 1974.',\n",
       " 'ISBN 963-10-0616-6. (er. mű: Introduction to Programming and Computer Science, McGraw-Hill Inc.; ford.',\n",
       " 'Dr. Szabados József).',\n",
       " 'Informatikai portál • összefoglaló, színes tartalomajánló lap Nemzetközi katalógusok LCCN: sh89003285 GND: 4026894-9 NKCS: ph124511 BNF: cb11932109b BNE: XX525961 A lap eredeti címe: „https://hu.wikipedia.org/w/index.php?title=Számítástudomány&oldid=26253398” Kategória: Számítógép-tudományRejtett kategóriák: Minden szócikk halott külső hivatkozásokkalSzócikkek halott külső hivatkozásokkal 2019 áprilisábólWikipédia-szócikkek LCCN azonosítóvalWikipédia-szócikkek GND azonosítóvalWikipédia-szócikkek BNF azonosítóval A lap utolsó módosítása: 2023. június 30., 11:20 A lap szövege Creative Commons Nevezd meg! – Így add tovább! 4.0 licenc alatt van; egyes esetekben más módon is felhasználható.',\n",
       " 'Részletekért lásd a felhasználási feltételeket.',\n",
       " 'Adatvédelmi irányelvek A Wikipédiáról Jogi nyilatkozat Code of Conduct Fejlesztők Statisztikák Sütinyilatkozat Mobil nézet Korlátozott tartalomszélesség ki/be']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hu_segments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "af282c08",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the Hungarian and English segments to separate files\n",
    "with open('hu_segments.txt', 'w', encoding='utf-8') as file:\n",
    "    for segment in hu_segments:\n",
    "        file.write(segment + '\\n')\n",
    "\n",
    "with open('en_segments.txt', 'w', encoding='utf-8') as file:\n",
    "    for segment in en_segments:\n",
    "        file.write(segment + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4134e233",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading dictionary...\n",
      "59 source language sentences read.\n",
      "379 target language sentences read.\n",
      "Sizes differing too much. Ignoring files to avoid a rare loop bug.\n"
     ]
    }
   ],
   "source": [
    "!hunalign/src/hunalign/hunalign hunalign/data/hu-en.stem.dic hu_segments.txt en_segments.txt -hand=hunalign/examples/demo.manual.ladder -text > align.txt"
   ]
  },
  {
@ -174,28 +409,122 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
-   "id": "remarkable-pillow",
+   "id": "a30fb7bc",
   "metadata": {},
   "outputs": [],
   "source": [
-    "def convert2xliff(hunalign_file_name):\n",
+    "import xml.etree.ElementTree as ET\n",
-    "    return 0"
+    "\n",
    "def hunalign_to_xliff(hunalign_content, source_lang, target_lang, xliff_file):\n",
    "    xliff = ET.Element('xliff', {\n",
    "        'xmlns': 'urn:oasis:names:tc:xliff:document:1.2',\n",
    "        'version': '1.2'\n",
    "    })\n",
    "    \n",
    "    file_elem = ET.SubElement(xliff, 'file', {\n",
    "        'datatype': 'plaintext',\n",
    "        'original': 'self',\n",
    "        'source-language': source_lang,\n",
    "        'target-language': target_lang\n",
    "    })\n",
    "    \n",
    "    header = ET.SubElement(file_elem, 'header')\n",
    "    metadata = ET.SubElement(header, 'sxmd:metadata', {\n",
    "        'xmlns:sxmd': 'urn:x-sap:mlt:xliff12:metadata:1.0',\n",
    "        'xmlns': 'urn:x-sap:mlt:tsmetadata:1.0'\n",
    "    })\n",
    "    ET.SubElement(metadata, 'object-name').text = 'sample'\n",
    "    ET.SubElement(metadata, 'collection').text = 'KWT'\n",
    "    ET.SubElement(metadata, 'domain').text = 'KWT'\n",
    "    ET.SubElement(metadata, 'developer').text = '123'\n",
    "    ET.SubElement(metadata, 'description').text = 'sample XLIFF file'\n",
    "    \n",
    "    body = ET.SubElement(file_elem, 'body')\n",
    "    \n",
    "    for i, line in enumerate(hunalign_content.strip().split('\\n')):\n",
    "        src_tgt = line.strip().split(' ||| ')\n",
    "        if len(src_tgt) == 2:\n",
    "            trans_unit = ET.SubElement(body, 'trans-unit', {'id': str(i + 1)})\n",
    "            ET.SubElement(trans_unit, 'source').text = src_tgt[0]\n",
    "            ET.SubElement(trans_unit, 'target').text = src_tgt[1]\n",
    "    \n",
    "    tree = ET.ElementTree(xliff)\n",
    "    ET.indent(tree, space=\"    \", level=0)  # Formatowanie z wcięciami\n",
    "    tree.write(xliff_file, encoding='utf-8', xml_declaration=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "6b68cbed",
   "metadata": {},
   "outputs": [],
   "source": [
    "hunalign_content = \"\"\"\n",
    "0-0 Hello world! ||| Witaj świecie!\n",
    "1-1 This is a test. ||| To jest test.\n",
    "2-2 How are you? ||| Jak się masz?\n",
    "\"\"\"\n",
    "hunalign_to_xliff(hunalign_content, 'en', 'pl', 'output.xliff')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "d799237b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<?xml version='1.0' encoding='utf-8'?>\n",
      "<xliff xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\">\n",
      "    <file datatype=\"plaintext\" original=\"self\" source-language=\"en\" target-language=\"pl\">\n",
      "        <header>\n",
      "            <sxmd:metadata xmlns:sxmd=\"urn:x-sap:mlt:xliff12:metadata:1.0\" xmlns=\"urn:x-sap:mlt:tsmetadata:1.0\">\n",
      "                <object-name>sample</object-name>\n",
      "                <collection>KWT</collection>\n",
      "                <domain>KWT</domain>\n",
      "                <developer>123</developer>\n",
      "                <description>sample XLIFF file</description>\n",
      "            </sxmd:metadata>\n",
      "        </header>\n",
      "        <body>\n",
      "            <trans-unit id=\"1\">\n",
      "                <source>0-0 Hello world!</source>\n",
      "                <target>Witaj świecie!</target>\n",
      "            </trans-unit>\n",
      "            <trans-unit id=\"2\">\n",
      "                <source>1-1 This is a test.</source>\n",
      "                <target>To jest test.</target>\n",
      "            </trans-unit>\n",
      "            <trans-unit id=\"3\">\n",
      "                <source>2-2 How are you?</source>\n",
      "                <target>Jak się masz?</target>\n",
      "            </trans-unit>\n",
      "        </body>\n",
      "    </file>\n",
      "</xliff>\n"
     ]
    }
   ],
   "source": [
    "with open(\"output.xliff\", \"r\") as file:\n",
    "    print(file.read())"
   ]
  }
 ],
 "metadata": {
  "author": "Rafał Jaworski",
  "email": "rjawor@amu.edu.pl",
  "lang": "pl",
  "subtitle": "11. Urównoleglanie",
  "title": "Komputerowe wspomaganie tłumaczenia",
  "year": "2021",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "lang": "pl",
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
@ -206,8 +535,11 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.14"
-  }
+  },
  "subtitle": "11. Urównoleglanie",
  "title": "Komputerowe wspomaganie tłumaczenia",
  "year": "2021"
 },
 "nbformat": 4,
 "nbformat_minor": 5
--- a/lab/lab_12.ipynb
+++ b/lab/lab_12.ipynb
@ -104,6 +104,33 @@
    "Celem powyższego ćwiczenia jest pozyskanie danych testowych. Dalsze analizy będziemy prowadzili już bez key loggera, starając się korzystać jedynie z danych zapisanych w pliku. Oczywiście, jeśli zajdzie taka konieczność, można w każdej chwili wygenerować sobie nowy plik."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "983ebbed",
   "metadata": {},
   "outputs": [],
   "source": [
    "import keyboard\n",
    "from datetime import datetime\n",
    "\n",
    "# Ścieżka do pliku, w którym będą zapisywane dane\n",
    "log_file = \"keylog.txt\"\n",
    "\n",
    "def report_key(event):\n",
    "    with open(log_file, \"a\") as f:\n",
    "        # Pobieramy aktualny czas\n",
    "        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')\n",
    "        # Zapisujemy czas i wciśnięty klawisz do pliku\n",
    "        f.write(f\"{timestamp} - {event.name}\\n\")\n",
    "\n",
    "# Ustawienie callbacka dla zdarzeń klawiatury\n",
    "keyboard.on_release(callback=report_key)\n",
    "\n",
    "# Czekanie na zdarzenia klawiatury\n",
    "keyboard.wait()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "boxed-maple",
@ -114,13 +141,64 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "id": "possible-holder",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import timedelta\n",
    "\n",
    "key_events = []\n",
    "\n",
    "def report_key(event):\n",
    "    # Pobieramy aktualny czas\n",
    "    timestamp = datetime.now()\n",
    "    # Zapisujemy czas i wciśnięty klawisz do listy\n",
    "    key_events.append((timestamp, event.name))\n",
    "\n",
    "# Ustawienie callbacka dla zdarzeń klawiatury\n",
    "keyboard.on_release(callback=report_key)\n",
    "\n",
    "def calculate_typing_speed():\n",
-    "    return 0"
+    "    if not key_events:\n",
    "        return \"No key events recorded.\"\n",
    "\n",
    "    total_time = timedelta()\n",
    "    total_chars = 0\n",
    "    total_words = 0\n",
    "    prev_time = key_events[0][0]\n",
    "\n",
    "    for i in range(1, len(key_events)):\n",
    "        current_time = key_events[i][0]\n",
    "        key = key_events[i][1]\n",
    "\n",
    "        # Obliczamy czas między kolejnymi naciśnięciami klawiszy\n",
    "        time_diff = current_time - prev_time\n",
    "\n",
    "        # Jeśli różnica czasu nie przekracza 5 sekund, dodajemy ją do całkowitego czasu\n",
    "        if time_diff <= timedelta(seconds=5):\n",
    "            total_time += time_diff\n",
    "            total_chars += 1\n",
    "            if key == \"space\":\n",
    "                total_words += 1\n",
    "\n",
    "        prev_time = current_time\n",
    "\n",
    "    # Dodajemy ostatnie słowo (bo nie zawsze kończy się spacją)\n",
    "    total_words += 1\n",
    "\n",
    "    # Obliczamy prędkość pisania\n",
    "    total_minutes = total_time.total_seconds() / 60\n",
    "    chars_per_minute = total_chars / total_minutes if total_minutes > 0 else 0\n",
    "    words_per_minute = total_words / total_minutes if total_minutes > 0 else 0\n",
    "\n",
    "    return f\"Typing Speed: {chars_per_minute:.2f} chars/min, {words_per_minute:.2f} words/min\"\n",
    "\n",
    "# Uruchomienie keyloggera i czekanie na zdarzenia klawiatury\n",
    "keyboard.wait()\n",
    "\n",
    "# Po zakończeniu pisania, wyliczamy prędkość pisania\n",
    "print(calculate_typing_speed())"
   ]
  },
  {
@ -141,28 +219,73 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "id": "close-riverside",
   "metadata": {},
   "outputs": [],
   "source": [
    "key_events = []\n",
    "\n",
    "def report_key(event):\n",
    "    # Pobieramy aktualny czas\n",
    "    timestamp = datetime.now()\n",
    "    # Zapisujemy czas i wciśnięty klawisz do listy\n",
    "    key_events.append((timestamp, event.name))\n",
    "\n",
    "# Ustawienie callbacka dla zdarzeń klawiatury\n",
    "keyboard.on_release(callback=report_key)\n",
    "\n",
    "def find_pauses():\n",
-    "    return []"
+    "    if not key_events:\n",
    "        return \"No key events recorded.\"\n",
    "\n",
    "    pauses = []\n",
    "    prev_time = key_events[0][0]\n",
    "    full_text = ''.join([key[1] if key[1] != \"space\" else \" \" for key in key_events])\n",
    "\n",
    "    for i in range(1, len(key_events)):\n",
    "        current_time = key_events[i][0]\n",
    "        key = key_events[i][1]\n",
    "\n",
    "        # Obliczamy czas między kolejnymi naciśnięciami klawiszy\n",
    "        time_diff = current_time - prev_time\n",
    "\n",
    "        # Jeśli różnica czasu jest większa niż 3 sekundy, zapisujemy przerwę\n",
    "        if time_diff > timedelta(seconds=3):\n",
    "            start_idx = max(0, i - 21)\n",
    "            end_idx = min(len(full_text), i + 20)\n",
    "            context = full_text[start_idx:end_idx]\n",
    "            pauses.append((time_diff.total_seconds(), context))\n",
    "\n",
    "        prev_time = current_time\n",
    "\n",
    "    # Sortowanie przerw malejąco po długości\n",
    "    pauses.sort(reverse=True, key=lambda x: x[0])\n",
    "\n",
    "    return pauses\n",
    "\n",
    "# Uruchomienie keyloggera i czekanie na zdarzenia klawiatury\n",
    "keyboard.wait()\n",
    "\n",
    "# Po zakończeniu pisania, wykrywamy przerwy\n",
    "pauses = find_pauses()\n",
    "\n",
    "# Wyświetlanie przerw\n",
    "for pause in pauses:\n",
    "    length, context = pause\n",
    "    print(f\"Pause length: {length:.2f} seconds, Context: '{context}'\")\n"
   ]
  }
 ],
 "metadata": {
  "author": "Rafał Jaworski",
  "email": "rjawor@amu.edu.pl",
  "lang": "pl",
  "subtitle": "12. Key logging",
  "title": "Komputerowe wspomaganie tłumaczenia",
  "year": "2021",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "lang": "pl",
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
@ -173,8 +296,11 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.14"
-  }
+  },
  "subtitle": "12. Key logging",
  "title": "Komputerowe wspomaganie tłumaczenia",
  "year": "2021"
 },
 "nbformat": 4,
 "nbformat_minor": 5
--- a/lab/lab_13-14.ipynb
+++ b/lab/lab_13-14.ipynb
@ -44,7 +44,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 30,
   "id": "familiar-terrace",
   "metadata": {
    "scrolled": true
@ -120,13 +120,62 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 31,
   "id": "d0970691",
   "metadata": {},
   "outputs": [],
   "source": [
    "pl_dict = set()\n",
    "with ZipFile('data/hunspell_pl.zip') as zipped_dictionary:\n",
    "    with zipped_dictionary.open('hunspell_pl.txt') as dictionary_file:\n",
    "        for line_bytes in dictionary_file:\n",
    "            line = line_bytes.decode('utf-8')\n",
    "            pl_dict.add(line.rstrip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "economic-southeast",
   "metadata": {},
   "outputs": [],
   "source": [
    "def correct_text(text):\n",
-    "    return []"
+    "    words = text.split()\n",
    "\n",
    "    result = []\n",
    "    for word in words:\n",
    "        if word in pl_dict:\n",
    "            result.append((word, \"correct\"))\n",
    "        else:\n",
    "            result.append((word, \"incorrect\"))\n",
    "\n",
    "    return result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "771a6c40",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('kalend', 'incorrect'),\n",
       " ('kalendarz', 'correct'),\n",
       " ('kaledoński', 'correct'),\n",
       " ('kalejdoskopowy', 'correct'),\n",
       " ('kalendarium', 'correct')]"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "correct_text(\"kalend kalendarz kaledoński kalejdoskopowy kalendarium\")"
   ]
  },
  {
@ -168,13 +217,51 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 34,
   "id": "built-sally",
   "metadata": {},
   "outputs": [],
   "source": [
    "def L1(w):\n",
-    "    return []"
+    "    letters = 'abcdefghijklmnopqrstuvwxyząćęłńóśźż'\n",
    "    splits = [(w[:i], w[i:]) for i in range(len(w) + 1)]\n",
    "    \n",
    "    deletes = [L + R[1:] for L, R in splits if R]\n",
    "    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]\n",
    "    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]\n",
    "    inserts = [L + c + R for L, R in splits for c in letters]\n",
    "    \n",
    "    return set(deletes + transposes + replaces + inserts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "dc3ffbfe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['kaqendarz',\n",
       " 'kalenydarz',\n",
       " 'kalendadz',\n",
       " 'kalenżarz',\n",
       " 'kalendlrz',\n",
       " 'kalendaóz',\n",
       " 'kalvendarz',\n",
       " 'kalendarzv',\n",
       " 'katendarz',\n",
       " 'kolendarz']"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(L1(\"kalendarz\"))[:10]"
   ]
  },
  {
@ -187,13 +274,49 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 36,
   "id": "coordinated-cooperation",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_suggestions(w):\n",
-    "    return []"
+    "    # Generate L1(w)\n",
    "    L1_set = L1(w)\n",
    "    # Generate S1(w)\n",
    "    S1 = L1_set.intersection(pl_dict)\n",
    "\n",
    "    # Generate L2(w)\n",
    "    L2_set = set()\n",
    "    for v in L1_set:\n",
    "        L2_set.update(L1(v))\n",
    "    \n",
    "    # Generate S2(w)\n",
    "    S2 = L2_set.intersection(pl_dict)\n",
    "\n",
    "    # Combine S1 and S2 and return as list\n",
    "    suggestions = S1.union(S2)\n",
    "    return list(suggestions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "e0c572ce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['kalendarz', 'kalandar', 'kalendarzyk', 'arendarz']"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "generate_suggestions(\"kalendarz\")"
   ]
  }
 ],
@ -216,7 +339,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
+   "version": "3.10.14"
  },
  "subtitle": "13,14. Korekta pisowni",
  "title": "Komputerowe wspomaganie tłumaczenia",
--- a/lab/lab_15.ipynb
+++ b/lab/lab_15.ipynb
Author	SHA1	Message	Date
Patryk Bartkowiak	9545d2d669	2024-05-31 lab15	2024-05-31 21:11:08 +02:00
Patryk Bartkowiak	d4038eb5ae	2024-05-31 lab13-14	2024-05-31 20:52:27 +02:00
Patryk Bartkowiak	5bbba14a57	lab_11 and lab_12, 2024-05-30	2024-05-30 23:51:52 +02:00
Patryk Bartkowiak	fd590b3a22	2024-05-12 lab 9, 10	2024-05-12 23:12:15 +02:00
Patryk Bartkowiak	9b75563e6a	[2024-04-14] lab 8	2024-04-14 22:21:17 +02:00
Patryk Bartkowiak	7a6ac33f6e	[2024-04-14] lab 6, 7	2024-04-14 19:33:04 +02:00
Patryk Bartkowiak	5de69211e1	[2024-04-14] lab 4, 5	2024-04-14 18:45:52 +02:00
Patryk Bartkowiak	870b673fac	[2024-04-13] labs 1,2,3	2024-04-13 14:10:00 +02:00