Compare commits
2 Commits
Author | SHA1 | Date |
---|---|---|
s495724 | 9cc45ec99e | |
s495724 | 9a804fe3dd |
113
lab/lab_01.ipynb
113
lab/lab_01.ipynb
|
@ -213,13 +213,34 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 13,
|
||||
"id": "protected-rings",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tm_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" return [entry[1].casefold() for entry in translation_memory if entry[0] == sentence]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "99d75100-0f9d-4586-82ef-ab42180472a2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['press the enter button', 'press the enter key']"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tm_lookup('Wciśnij przycisk Enter')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -232,17 +253,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 15,
|
||||
"id": "severe-alloy",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"''"
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -261,13 +282,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 16,
|
||||
"id": "structural-diesel",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import string \n",
|
||||
"\n",
|
||||
"def tm_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" return [entry[1].casefold() for entry in translation_memory if entry[0] == sentence]\n",
|
||||
" translator = str.maketrans('', '', string.punctuation) \n",
|
||||
" return sentence.translate(translator)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -280,17 +305,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 20,
|
||||
"id": "brief-senegal",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"''"
|
||||
"[]"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -317,15 +342,41 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 24,
|
||||
"id": "mathematical-customs",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tm_lookup(sentence):\n",
|
||||
" translator = str.maketrans('', '', string.punctuation)\n",
|
||||
" sentence = sentence.translate(translator).casefold()\n",
|
||||
" for entry in translation_memory:\n",
|
||||
" if any(word in entry[0].casefold() for word in sentence.split()):\n",
|
||||
" return entry[1]\n",
|
||||
" return ''"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "f6537825-62a6-4503-91a5-bbb17d84170b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'System restart required'"
|
||||
]
|
||||
},
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tm_lookup('Wymagane ponowne uruchomienie maszyny')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "meaningful-virus",
|
||||
|
@ -344,7 +395,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 26,
|
||||
"id": "humanitarian-wrong",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -362,7 +413,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 27,
|
||||
"id": "located-perception",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -374,7 +425,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 28,
|
||||
"id": "advised-casting",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -384,7 +435,7 @@
|
|||
"[('przycisk', 'button'), ('drukarka', 'printer')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -419,13 +470,35 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 37,
|
||||
"id": "original-tunisia",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def glossary_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" sentence_words = sentence.casefold().split()\n",
|
||||
" return [entry for entry in glossary if entry[0] in sentence_words]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 39,
|
||||
"id": "b3ae5504-4168-4fe0-ad25-60558242a31d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('przycisk', 'button'), ('drukarka', 'printer')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 39,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"glossary_lookup('Każda Drukarka posiada przycisk wznowienia drukowania')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -438,7 +511,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 38,
|
||||
"id": "adolescent-semiconductor",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -452,7 +525,7 @@
|
|||
"author": "Rafał Jaworski",
|
||||
"email": "rjawor@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
|
@ -467,7 +540,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.10.12"
|
||||
},
|
||||
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
|
112
lab/lab_02.ipynb
112
lab/lab_02.ipynb
|
@ -57,7 +57,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 31,
|
||||
"id": "confident-prison",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -80,13 +80,49 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 29,
|
||||
"id": "continental-submission",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def ice_lookup(sentence, prev_sentence, next_sentence):\n",
|
||||
" return []"
|
||||
"def ice_lookup(sentence, prev_sentence, next_sentence, translation_memory):\n",
|
||||
" \n",
|
||||
" ice_previous = \"\"\n",
|
||||
" ice_next = \"\"\n",
|
||||
" \n",
|
||||
" for original, translation in translation_memory:\n",
|
||||
" if sentence == original:\n",
|
||||
" index = translation_memory.index((original, translation))\n",
|
||||
" if index > 0:\n",
|
||||
" ice_previous = translation_memory[index - 1][1]\n",
|
||||
" if index < len(translation_memory) - 1:\n",
|
||||
" ice_next = translation_memory[index + 1][1]\n",
|
||||
" break\n",
|
||||
" \n",
|
||||
" return (ice_previous, ice_next)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"id": "f125ddd2-89fc-4496-93d9-9d640b7f616e",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"prev: Press the ENTER button , next: The printer is switched off\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sentence = \"Sprawdź ustawienia sieciowe\"\n",
|
||||
"prev_sentence = \"Wciśnij przycisk Enter\"\n",
|
||||
"next_sentence = \"Wymagane ponowne uruchomienie komputera\"\n",
|
||||
"\n",
|
||||
"ice_result = ice_lookup(sentence, prev_sentence, next_sentence, translation_memory)\n",
|
||||
"print('prev: ', ice_result[0], ', next: ', ice_result[1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -119,7 +155,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 36,
|
||||
"id": "fourth-pillow",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -141,7 +177,7 @@
|
|||
"id": "graduate-theorem",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: Funkcja nie jest dobrą funkcją dystansu, gdyż bierze pod uwagaę jedynie różnice w długości zdań."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -154,7 +190,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 56,
|
||||
"id": "continued-christopher",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -179,7 +215,7 @@
|
|||
"id": "metallic-leave",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: Nie jest to dobra funkcja dystansu, gdyż znajduje jedynie fakt, że zdania się mogą między sobą różnić."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -206,7 +242,7 @@
|
|||
"id": "bibliographic-stopping",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: Dystans Lavenshteina jest poprawną funkcją dystansu, opisuje ilość operacji, które należy wykonać, aby porównywane do siebie zdania były takie same (np. zamiana liter, wstawienie innej litery, usunięcie litery, itp)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -223,7 +259,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 62,
|
||||
"id": "secondary-wrist",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -233,7 +269,7 @@
|
|||
"2"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 62,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -254,7 +290,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 63,
|
||||
"id": "associate-tuner",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -273,7 +309,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 64,
|
||||
"id": "focal-pathology",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -283,7 +319,7 @@
|
|||
"0.9166666666666666"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"execution_count": 64,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -294,7 +330,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 65,
|
||||
"id": "roman-ceiling",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -304,7 +340,7 @@
|
|||
"0.9428571428571428"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 65,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -315,7 +351,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 66,
|
||||
"id": "invisible-cambodia",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -325,7 +361,7 @@
|
|||
"0.631578947368421"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 66,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -344,13 +380,43 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 71,
|
||||
"id": "genetic-cradle",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def fuzzy_lookup(sentence, threshold):\n",
|
||||
" return []"
|
||||
"import difflib\n",
|
||||
"\n",
|
||||
"def fuzzy_lookup(sentence, threshold, translation_memory):\n",
|
||||
" fuzzy_matches = []\n",
|
||||
" for original, translation in translation_memory:\n",
|
||||
" similarity = difflib.SequenceMatcher(None, sentence, original).ratio()\n",
|
||||
" if similarity >= threshold:\n",
|
||||
" fuzzy_matches.append((original, translation, similarity))\n",
|
||||
" return fuzzy_matches"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 80,
|
||||
"id": "6bebcb12-8c73-4beb-b4c2-00553d3b375f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('Wciśnij przycisk Enter', 'Press the ENTER button', 0.8636363636363636)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 80,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"sentence = 'Wcisnij pszycisk ęnter'\n",
|
||||
"threshold = 0.8\n",
|
||||
"fuzzy_lookup(sentence, threshold, translation_memory)"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -358,7 +424,7 @@
|
|||
"author": "Rafał Jaworski",
|
||||
"email": "rjawor@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
|
@ -373,7 +439,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.10.12"
|
||||
},
|
||||
"subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
|
168
lab/lab_03.ipynb
168
lab/lab_03.ipynb
|
@ -63,7 +63,7 @@
|
|||
"id": "diverse-sunglasses",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: metal cabinets guides. Proz.com"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -128,13 +128,41 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 7,
|
||||
"id": "cognitive-cedar",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def terminology_lookup():\n",
|
||||
" return []"
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def terminology_lookup(text, dictionary):\n",
|
||||
" pattern = re.compile(r'\\b(?:' + '|'.join(dictionary) + r')\\b', re.IGNORECASE)\n",
|
||||
" matches = pattern.finditer(text)\n",
|
||||
" occurance = ''\n",
|
||||
" for match in matches:\n",
|
||||
" occurance += (f\"({match.start()}, {match.end()})\")\n",
|
||||
" return occurance"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "5781b95b-3af9-4c82-8388-b98a11e6c343",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'(80, 91)(164, 175)(468, 475)(516, 523)(533, 540)'"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"terminology_lookup(text, dictionary)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -161,7 +189,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 12,
|
||||
"id": "tribal-attention",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -205,7 +233,7 @@
|
|||
"IDE\n",
|
||||
",\n",
|
||||
"see\n",
|
||||
"Running\n",
|
||||
"run\n",
|
||||
"Tutorial\n",
|
||||
"Examples\n",
|
||||
"in\n",
|
||||
|
@ -218,7 +246,7 @@
|
|||
"work\n",
|
||||
"for\n",
|
||||
"all\n",
|
||||
"swing\n",
|
||||
"Swing\n",
|
||||
"program\n",
|
||||
"—\n",
|
||||
"applet\n",
|
||||
|
@ -232,7 +260,7 @@
|
|||
"be\n",
|
||||
"the\n",
|
||||
"step\n",
|
||||
"-PRON-\n",
|
||||
"you\n",
|
||||
"need\n",
|
||||
"to\n",
|
||||
"follow\n",
|
||||
|
@ -248,7 +276,7 @@
|
|||
"platform\n",
|
||||
",\n",
|
||||
"if\n",
|
||||
"-PRON-\n",
|
||||
"you\n",
|
||||
"have\n",
|
||||
"not\n",
|
||||
"already\n",
|
||||
|
@ -260,7 +288,7 @@
|
|||
"program\n",
|
||||
"that\n",
|
||||
"use\n",
|
||||
"Swing\n",
|
||||
"swing\n",
|
||||
"component\n",
|
||||
".\n",
|
||||
"compile\n",
|
||||
|
@ -302,13 +330,31 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 19,
|
||||
"id": "surgical-demonstration",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def terminology_lookup():\n",
|
||||
" return []"
|
||||
"def terminology_lookup(text, dictionary):\n",
|
||||
" nlp = spacy.load(\"en_core_web_sm\")\n",
|
||||
" doc = nlp(text)\n",
|
||||
"\n",
|
||||
" word_forms = set()\n",
|
||||
" for word in dictionary:\n",
|
||||
" word_forms.add(word)\n",
|
||||
" for token in doc:\n",
|
||||
" if token.text.lower() == word:\n",
|
||||
" word_forms.add(token.lemma_)\n",
|
||||
"\n",
|
||||
" matches = []\n",
|
||||
" for token in doc:\n",
|
||||
" if token.text.lower() in word_forms:\n",
|
||||
" matches.append((token.idx, token.idx + len(token)))\n",
|
||||
"\n",
|
||||
" occurrences = ''\n",
|
||||
" for match in matches:\n",
|
||||
" occurrences += f\"({match[0]}, {match[1]})\"\n",
|
||||
" return occurrences"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -337,13 +383,59 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 23,
|
||||
"id": "superb-butterfly",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_nouns(text):\n",
|
||||
" return []"
|
||||
" nlp = spacy.load(\"en_core_web_sm\")\n",
|
||||
" doc = nlp(text)\n",
|
||||
" nouns = [token.text for token in doc if token.pos_ == \"NOUN\"]\n",
|
||||
" \n",
|
||||
" return nouns"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "8203c3e5-74a6-42c1-add1-e378f09164fd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['programmers',\n",
|
||||
" 'section',\n",
|
||||
" 'Swing',\n",
|
||||
" 'application',\n",
|
||||
" 'command',\n",
|
||||
" 'line',\n",
|
||||
" 'information',\n",
|
||||
" 'Swing',\n",
|
||||
" 'application',\n",
|
||||
" 'compilation',\n",
|
||||
" 'instructions',\n",
|
||||
" 'programs',\n",
|
||||
" 'applets',\n",
|
||||
" 'applications',\n",
|
||||
" 'steps',\n",
|
||||
" 'release',\n",
|
||||
" 'platform',\n",
|
||||
" 'program',\n",
|
||||
" 'Swing',\n",
|
||||
" 'components',\n",
|
||||
" 'program',\n",
|
||||
" 'program']"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"get_nouns(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -356,7 +448,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 25,
|
||||
"id": "acting-tolerance",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -374,13 +466,29 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 30,
|
||||
"id": "eight-redhead",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extract_terms(text):\n",
|
||||
" return []"
|
||||
" nlp = spacy.load(\"en_core_web_sm\")\n",
|
||||
" doc = nlp(text)\n",
|
||||
" \n",
|
||||
" tally = {}\n",
|
||||
" \n",
|
||||
" for token in doc:\n",
|
||||
" if token.pos_ != \"NOUN\":\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
" lemma = token.lemma_.lower()\n",
|
||||
" \n",
|
||||
" if lemma in tally:\n",
|
||||
" tally[lemma] += 1\n",
|
||||
" else:\n",
|
||||
" tally[lemma] = 1\n",
|
||||
" \n",
|
||||
" return tally"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -393,13 +501,29 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 32,
|
||||
"id": "monetary-mambo",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extract_terms(text):\n",
|
||||
" return []"
|
||||
" nlp = spacy.load(\"en_core_web_sm\")\n",
|
||||
" doc = nlp(text)\n",
|
||||
" \n",
|
||||
" tally = {}\n",
|
||||
" \n",
|
||||
" for token in doc:\n",
|
||||
" if token.pos_ not in ['NOUN', 'VERB', 'ADJ']:\n",
|
||||
" continue\n",
|
||||
" \n",
|
||||
" lemma = token.lemma_.lower()\n",
|
||||
" \n",
|
||||
" if lemma in tally:\n",
|
||||
" tally[lemma] += 1\n",
|
||||
" else:\n",
|
||||
" tally[lemma] = 1\n",
|
||||
" \n",
|
||||
" return tally"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -407,7 +531,7 @@
|
|||
"author": "Rafał Jaworski",
|
||||
"email": "rjawor@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
|
@ -422,7 +546,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.10.12"
|
||||
},
|
||||
"subtitle": "3. Terminologia",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
|
7543
lab/lab_04-05.ipynb
7543
lab/lab_04-05.ipynb
File diff suppressed because one or more lines are too long
|
@ -55,13 +55,40 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 2,
|
||||
"id": "documented-hacker",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def find_tags(text):\n",
|
||||
" return []"
|
||||
" pattern = re.compile(r'</?\\w+.*?>')\n",
|
||||
" \n",
|
||||
" matches = pattern.finditer(text)\n",
|
||||
" \n",
|
||||
" results = [(match.group(), match.start(), match.end()) for match in matches]\n",
|
||||
" return results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "8d1c8b3a-a3b0-43ca-a6c1-7b3481e58b87",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[('<note>', 0, 6), ('<to>', 6, 10), ('</to>', 14, 19), ('<from>', 19, 25), ('</from>', 29, 36), ('<heading>', 36, 45), ('</heading>', 53, 63), ('<body>', 63, 69), ('</body>', 98, 105), ('</note>', 105, 112)]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tekst = \"<note><to>Tove</to><from>Jani</from><heading>Reminder</heading><body>Don't forget me this weekend!</body></note>\"\n",
|
||||
"tagi = find_tags(tekst)\n",
|
||||
"print(tagi)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -74,15 +101,55 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 8,
|
||||
"id": "unauthorized-study",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def is_translatable(text):\n",
|
||||
" if re.search(r'\\b(v?\\d+(\\.\\d+)+)\\b', text):\n",
|
||||
" return False\n",
|
||||
" \n",
|
||||
" if re.search(r'function\\s+\\w+\\s*\\(|if\\s+\\(|\\w+\\s*=\\s*\\w+', text):\n",
|
||||
" return False\n",
|
||||
" \n",
|
||||
" if re.search(r'\\b[A-Z]{2,}\\b', text):\n",
|
||||
" return False\n",
|
||||
" \n",
|
||||
" return True"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "1051142c-8170-43fc-b55a-0c367bf69e31",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"'This should be translated.': True\n",
|
||||
"'Version 4.2.1 should not be translated.': False\n",
|
||||
"'Contact API_KEY for more details.': True\n",
|
||||
"'if (x == 10) { return x; }': False\n",
|
||||
"'Welcome to New York City!': True\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"examples = [\n",
|
||||
" \"This should be translated.\",\n",
|
||||
" \"Version 4.2.1 should not be translated.\",\n",
|
||||
" \"Contact API_KEY for more details.\",\n",
|
||||
" \"if (x == 10) { return x; }\",\n",
|
||||
" \"Welcome to New York City!\"\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"for example in examples:\n",
|
||||
" print(f\"'{example}': {is_translatable(example)}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "plastic-crown",
|
||||
|
@ -93,13 +160,61 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 11,
|
||||
"id": "beautiful-mathematics",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def find_dates(text):\n",
|
||||
" return []"
|
||||
" \n",
|
||||
" date_patterns = re.compile(\n",
|
||||
" r'(?:\\b(\\d{2})[-.](\\d{2})[-.](\\d{4})\\b)|' # DD-MM-YYYY DD.MM.YYYY\n",
|
||||
" r'(?:\\b(\\d{2})[/](\\d{2})[/](\\d{4})\\b)|' # MM/DD/YYYY\n",
|
||||
" r'(?:\\b(\\d{4})[/](\\d{2})[/](\\d{2})\\b)|' # YYYY/MM/DD\n",
|
||||
" r'(?:\\b(\\d{4})[-](\\d{2})[-](\\d{2})\\b)' # YYYY-MM-DD\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" results = []\n",
|
||||
" for match in date_patterns.finditer(text):\n",
|
||||
" groups = match.groups()\n",
|
||||
" if groups[:3] != (None, None, None):\n",
|
||||
" day, month, year = groups[:3]\n",
|
||||
" elif groups[3:6] != (None, None, None):\n",
|
||||
" month, day, year = groups[3:6]\n",
|
||||
" elif groups[6:9] != (None, None, None):\n",
|
||||
" year, month, day = groups[6:9]\n",
|
||||
" elif groups[9:] != (None, None, None):\n",
|
||||
" year, month, day = groups[9:]\n",
|
||||
" \n",
|
||||
" results.append({\n",
|
||||
" \"date\": f\"{day.zfill(2)}-{month.zfill(2)}-{year}\",\n",
|
||||
" \"position\": (match.start(), match.end()),\n",
|
||||
" \"day\": day,\n",
|
||||
" \"month\": month,\n",
|
||||
" \"year\": year\n",
|
||||
" })\n",
|
||||
" \n",
|
||||
" return results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "48cf2ad7-b34d-4af6-84c8-5d941f2e323c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[{'date': '21-03-2023', 'position': (20, 30), 'day': '21', 'month': '03', 'year': '2023'}, {'date': '21-03-2023', 'position': (32, 42), 'day': '21', 'month': '03', 'year': '2023'}, {'date': '21-03-2023', 'position': (44, 54), 'day': '21', 'month': '03', 'year': '2023'}, {'date': '21-03-2023', 'position': (56, 66), 'day': '21', 'month': '03', 'year': '2023'}, {'date': '21-03-2023', 'position': (72, 82), 'day': '21', 'month': '03', 'year': '2023'}]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"example_text = \"Important dates are 21-03-2023, 03/21/2023, 2023/03/21, 21.03.2023, and 2023-03-21.\"\n",
|
||||
"found_dates = find_dates(example_text)\n",
|
||||
"print(found_dates)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -125,13 +240,67 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 23,
|
||||
"id": "finished-essex",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def correct_dates(source_segment, target_segment, date_format):\n",
|
||||
" return ''"
|
||||
" format_map = {\n",
|
||||
" \"Europe\": \"{2}-{1}-{0}\", # DD-MM-YYYY\n",
|
||||
" \"US\": \"{1}/{2}/{0}\", # MM/DD/YYYY\n",
|
||||
" \"digit-dot\": \"{2}.{1}.{0}\" # DD.MM.YYYY\n",
|
||||
" }\n",
|
||||
" \n",
|
||||
" if date_format not in format_map:\n",
|
||||
" raise ValueError(\"Błędny format\")\n",
|
||||
"\n",
|
||||
" source_dates = find_dates(source_segment)\n",
|
||||
" target_dates = find_dates(target_segment)\n",
|
||||
" \n",
|
||||
" if len(source_dates) != len(target_dates):\n",
|
||||
" return f\"Liczba dat w segmencie źródłowym ({len(source_dates)}) i tłumaczeniu ({len(target_dates)}) się nie zgadza.\"\n",
|
||||
"\n",
|
||||
" new_target_segment = target_segment\n",
|
||||
" for source, target in zip(source_dates, target_dates):\n",
|
||||
" if source[\"day\"] != target[\"day\"] or source[\"month\"] != target[\"month\"] or source[\"year\"] != target[\"year\"]:\n",
|
||||
" return \"Daty się nie zgadzają\"\n",
|
||||
"\n",
|
||||
" formatted_date = format_map[date_format].format(target[\"year\"], target[\"month\"].zfill(2), target[\"day\"].zfill(2))\n",
|
||||
" new_target_segment = new_target_segment[:target[\"position\"][0]] + formatted_date + new_target_segment[target[\"position\"][1]:]\n",
|
||||
"\n",
|
||||
" return new_target_segment"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"id": "7dfb44fd-297f-41ea-8f46-a647c98341a3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"source_text = \"The contract starts on 2023-12-21 and ends on 2023-06-21.\"\n",
|
||||
"target_text = \"Umowa zaczyna się 21.12.2023 i kończy 21.06.2023.\"\n",
|
||||
"expected_format = \"digit-dot\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"id": "f0922192-bb51-4194-aeea-4c41c5d195a5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Umowa zaczyna się 21.12.2023 i kończy 21.06.2023.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"text = correct_dates(source_text, target_text, expected_format)\n",
|
||||
"print(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -176,13 +345,63 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 32,
|
||||
"id": "romance-judge",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tokenize(text):\n",
|
||||
" tokens = re.split(r'(\\s+|<[^>]+>)', text)\n",
|
||||
" return [token for token in tokens if token.strip()]\n",
|
||||
"\n",
|
||||
"def transfer_tags(source_segment, target_segment):\n",
|
||||
" return ''"
|
||||
" source_tokens = tokenize(source_segment)\n",
|
||||
" target_tokens = [token for token in tokenize(target_segment) if not re.match(r'<[^>]+>', token)]\n",
|
||||
"\n",
|
||||
" source_tags = [(token, i) for i, token in enumerate(source_tokens) if re.match(r'<[^>]+>', token)]\n",
|
||||
"\n",
|
||||
" if len(source_tags) > 0:\n",
|
||||
" target_with_tags = []\n",
|
||||
" source_word_count = len([token for token in source_tokens if not re.match(r'<[^>]+>', token)])\n",
|
||||
" target_word_count = len(target_tokens)\n",
|
||||
" tag_index = 0\n",
|
||||
" tag_positions = [int(round(tag[1] * target_word_count / source_word_count)) for tag in source_tags]\n",
|
||||
"\n",
|
||||
" for i, token in enumerate(target_tokens):\n",
|
||||
" while tag_index < len(tag_positions) and i == tag_positions[tag_index]:\n",
|
||||
" target_with_tags.append(source_tags[tag_index][0])\n",
|
||||
" tag_index += 1\n",
|
||||
" target_with_tags.append(token)\n",
|
||||
" \n",
|
||||
" while tag_index < len(tag_positions):\n",
|
||||
" target_with_tags.append(source_tags[tag_index][0])\n",
|
||||
" tag_index += 1\n",
|
||||
"\n",
|
||||
" return ' '.join(target_with_tags)\n",
|
||||
" else:\n",
|
||||
" return ' '.join(target_tokens)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"id": "c1ebe058-1da6-43a1-8d99-78999aefca17",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"To jest <b> pogrubienie i </b> to jest kursywa. <i> </i>\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"source_segment = \"This is <b>bold</b> and this is <i>italic</i>.\"\n",
|
||||
"target_segment = \"To jest pogrubienie i to jest kursywa.\"\n",
|
||||
"\n",
|
||||
"result = transfer_tags(source_segment, target_segment)\n",
|
||||
"print(result)"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -190,7 +409,7 @@
|
|||
"author": "Rafał Jaworski",
|
||||
"email": "rjawor@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
|
@ -205,7 +424,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.10.12"
|
||||
},
|
||||
"subtitle": "6,7. Preprocessing i postprocessing",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
|
265
lab/lab_08.ipynb
265
lab/lab_08.ipynb
|
@ -57,13 +57,53 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 18,
|
||||
"id": "moving-clothing",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def calculate_bleu():\n",
|
||||
" return 0"
|
||||
"import zipfile\n",
|
||||
"from nltk.translate.bleu_score import corpus_bleu\n",
|
||||
"import io\n",
|
||||
"\n",
|
||||
"def calculate_bleu(zip_path):\n",
|
||||
" references = []\n",
|
||||
" candidates = []\n",
|
||||
"\n",
|
||||
" with zipfile.ZipFile(zip_path, 'r') as z:\n",
|
||||
" with z.open('EMEA.en-pl.en', 'r') as file_ref, z.open('EMEA.en-pl.pl', 'r') as file_trans:\n",
|
||||
" ref_buffer = io.TextIOWrapper(file_ref, encoding='utf-8')\n",
|
||||
" trans_buffer = io.TextIOWrapper(file_trans, encoding='utf-8')\n",
|
||||
" \n",
|
||||
" for ref_line, trans_line in zip(ref_buffer, trans_buffer):\n",
|
||||
" ref_tokens = [ref_line.strip().split()]\n",
|
||||
" trans_tokens = trans_line.strip().split()\n",
|
||||
" \n",
|
||||
" references.append(ref_tokens)\n",
|
||||
" candidates.append(trans_tokens)\n",
|
||||
"\n",
|
||||
" score = corpus_bleu(references, candidates)\n",
|
||||
" return score"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "b312fc2a-8d95-4eb5-a49f-a8b0707be8bf",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Ocena BLEU: 0.05086746137866238\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"zip_file_path = 'korpusy/emea.zip'\n",
|
||||
"bleu_score = calculate_bleu(zip_file_path)\n",
|
||||
"print(f\"Ocena BLEU: {bleu_score}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -76,13 +116,59 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 20,
|
||||
"id": "lasting-rolling",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def analyze_bleu():\n",
|
||||
" return []"
|
||||
"def analyze_bleu(zip_path, start_end_pairs):\n",
|
||||
" results = {}\n",
|
||||
" \n",
|
||||
" with zipfile.ZipFile(zip_path, 'r') as z:\n",
|
||||
" with z.open('EMEA.en-pl.en', 'r') as file_ref, z.open('EMEA.en-pl.pl', 'r') as file_trans:\n",
|
||||
" ref_buffer = io.TextIOWrapper(file_ref, encoding='utf-8')\n",
|
||||
" trans_buffer = io.TextIOWrapper(file_trans, encoding='utf-8')\n",
|
||||
" \n",
|
||||
" references_full = [line.strip().split() for line in ref_buffer]\n",
|
||||
" candidates_full = [line.strip().split() for line in trans_buffer]\n",
|
||||
"\n",
|
||||
" for label, (start, end) in start_end_pairs.items():\n",
|
||||
"\n",
|
||||
" references_segment = [references_full[i] for i in range(start, min(end, len(references_full)))]\n",
|
||||
" candidates_segment = [candidates_full[i] for i in range(start, min(end, len(candidates_full)))]\n",
|
||||
" \n",
|
||||
" references_segment = [[ref] for ref in references_segment]\n",
|
||||
"\n",
|
||||
" score = corpus_bleu(references_segment, candidates_segment)\n",
|
||||
" results[label] = score\n",
|
||||
" \n",
|
||||
" return results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "8c218410-048d-40fb-b609-9553f8dae28b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Ocena BLEU dla First 100 sentences: 0.03940935286156434\n",
|
||||
"Ocena BLEU dla Sentences 500-600: 1.9718207266585256e-155\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"fragments = {\n",
|
||||
" \"First 100 sentences\": (0, 100),\n",
|
||||
" \"Sentences 500-600\": (500, 600)\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"bleu_scores = analyze_bleu(zip_file_path, fragments)\n",
|
||||
"for label, score in bleu_scores.items():\n",
|
||||
" print(f\"Ocena BLEU dla {label}: {score}\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -120,13 +206,91 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 36,
|
||||
"id": "occupied-swing",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def calculate_wer():\n",
|
||||
" return 0"
|
||||
"import jiwer\n",
|
||||
"\n",
|
||||
"def calculate_wer(zip_path):\n",
|
||||
" def wer(reference, hypothesis):\n",
|
||||
" ref_words = reference.split()\n",
|
||||
" hyp_words = hypothesis.split()\n",
|
||||
" R = len(ref_words)\n",
|
||||
" H = len(hyp_words)\n",
|
||||
" cost_matrix = [[0] * (H + 1) for _ in range(R + 1)]\n",
|
||||
"\n",
|
||||
" for i in range(1, R + 1):\n",
|
||||
" cost_matrix[i][0] = i\n",
|
||||
" for j in range(1, H + 1):\n",
|
||||
" cost_matrix[0][j] = j\n",
|
||||
"\n",
|
||||
" for i in range(1, R + 1):\n",
|
||||
" for j in range(1, H + 1):\n",
|
||||
" substitution_cost = 0 if ref_words[i - 1] == hyp_words[j - 1] else 1\n",
|
||||
" cost_matrix[i][j] = min(\n",
|
||||
" cost_matrix[i - 1][j] + 1,\n",
|
||||
" cost_matrix[i][j - 1] + 1,\n",
|
||||
" cost_matrix[i - 1][j - 1] + substitution_cost\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" i, j = R, H\n",
|
||||
" substitutions = insertions = deletions = correct = 0\n",
|
||||
" while i > 0 and j > 0:\n",
|
||||
" if ref_words[i - 1] == hyp_words[j - 1]:\n",
|
||||
" correct += 1\n",
|
||||
" i -= 1\n",
|
||||
" j -= 1\n",
|
||||
" elif cost_matrix[i][j] == cost_matrix[i - 1][j - 1] + 1:\n",
|
||||
" substitutions += 1\n",
|
||||
" i -= 1\n",
|
||||
" j -= 1\n",
|
||||
" elif cost_matrix[i][j] == cost_matrix[i][j - 1] + 1:\n",
|
||||
" insertions += 1\n",
|
||||
" j -= 1\n",
|
||||
" else:\n",
|
||||
" deletions += 1\n",
|
||||
" i -= 1\n",
|
||||
"\n",
|
||||
" N = substitutions + deletions + correct\n",
|
||||
" WER = (substitutions + deletions + insertions) / N if N > 0 else 0\n",
|
||||
" return WER\n",
|
||||
"\n",
|
||||
" total_wer = 0\n",
|
||||
" num_sentences = 0\n",
|
||||
"\n",
|
||||
" with zipfile.ZipFile(zip_path, 'r') as z:\n",
|
||||
" with z.open('EMEA.en-pl.en', 'r') as file_ref, z.open('EMEA.en-pl.pl', 'r') as file_trans:\n",
|
||||
" ref_buffer = io.TextIOWrapper(file_ref, encoding='utf-8')\n",
|
||||
" trans_buffer = io.TextIOWrapper(file_trans, encoding='utf-8')\n",
|
||||
"\n",
|
||||
" for ref_line, hyp_line in zip(ref_buffer, trans_buffer):\n",
|
||||
" total_wer += wer(ref_line.strip(), hyp_line.strip())\n",
|
||||
" num_sentences += 1\n",
|
||||
"\n",
|
||||
" average_wer = total_wer / num_sentences if num_sentences > 0 else 0\n",
|
||||
" return average_wer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 37,
|
||||
"id": "ceccd005-714b-4dfb-b210-d13a9c5238c9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"WER (Word Error Rate): 82.13%\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"zip_file_path = 'korpusy/emea.zip'\n",
|
||||
"wer_result = calculate_wer(zip_file_path)\n",
|
||||
"print(f\"WER (Word Error Rate): {wer_result:.2%}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -147,13 +311,48 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 41,
|
||||
"id": "immediate-element",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def calculate_levenshtein():\n",
|
||||
" return 0"
|
||||
"import Levenshtein as lev\n",
|
||||
"\n",
|
||||
"def calculate_levenshtein(zip_path):\n",
|
||||
" total_distance = 0\n",
|
||||
" num_sentences = 0\n",
|
||||
"\n",
|
||||
" with zipfile.ZipFile(zip_path, 'r') as z:\n",
|
||||
" with z.open('EMEA.en-pl.en', 'r') as file_ref, z.open('EMEA.en-pl.pl', 'r') as file_trans:\n",
|
||||
" ref_buffer = io.TextIOWrapper(file_ref, encoding='utf-8')\n",
|
||||
" trans_buffer = io.TextIOWrapper(file_trans, encoding='utf-8')\n",
|
||||
"\n",
|
||||
" for ref_line, hyp_line in zip(ref_buffer, trans_buffer):\n",
|
||||
" distance = lev.distance(ref_line.strip(), hyp_line.strip())\n",
|
||||
" total_distance += distance\n",
|
||||
" num_sentences += 1\n",
|
||||
"\n",
|
||||
" average_distance = total_distance / num_sentences if num_sentences > 0 else 0\n",
|
||||
" return average_distance"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"id": "a4581547-7219-4a8d-913d-e0e4fa4d0914",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Średnia wartość dystansu Levenshteina: 61.29\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"average_distance = calculate_levenshtein(zip_file_path)\n",
|
||||
"print(f\"Średnia wartość dystansu Levenshteina: {average_distance:.2f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -177,28 +376,45 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "descending-easter",
|
||||
"execution_count": null,
|
||||
"id": "c0e3f109-795d-4844-b41d-9e0b570577c5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def analyze_translations():\n",
|
||||
" return []"
|
||||
"from PyDictionary import PyDictionary\n",
|
||||
"\n",
|
||||
"def translate_corpus(zip_path, filename):\n",
|
||||
" dictionary = PyDictionary()\n",
|
||||
" translations = {}\n",
|
||||
"\n",
|
||||
" with zipfile.ZipFile(zip_path, 'r') as z:\n",
|
||||
" with z.open(filename, 'r') as file:\n",
|
||||
" buffer = io.TextIOWrapper(file, encoding='utf-8')\n",
|
||||
"\n",
|
||||
" for line in buffer:\n",
|
||||
" words = line.strip().split()\n",
|
||||
" for word in words:\n",
|
||||
" if word not in translations:\n",
|
||||
" try:\n",
|
||||
" german_translation = dictionary.translate(word, \"German\")\n",
|
||||
" translations[word] = german_translation\n",
|
||||
" except Exception as e:\n",
|
||||
" print(f\"Error translating {word}: {str(e)}\")\n",
|
||||
" translations[word] = None\n",
|
||||
"\n",
|
||||
" return translations"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Rafał Jaworski",
|
||||
"email": "rjawor@amu.edu.pl",
|
||||
"lang": "pl",
|
||||
"subtitle": "8. Wykorzystanie tłumaczenia automatycznego we wspomaganiu tłumaczenia",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
"year": "2021",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"lang": "pl",
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
|
@ -209,8 +425,11 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
}
|
||||
"version": "3.10.12"
|
||||
},
|
||||
"subtitle": "8. Wykorzystanie tłumaczenia automatycznego we wspomaganiu tłumaczenia",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
"year": "2021"
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
|
|
1016
lab/lab_09-10.ipynb
1016
lab/lab_09-10.ipynb
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue