From 9a804fe3dd2546ae35bf24af266191b375f6a5e8 Mon Sep 17 00:00:00 2001 From: s495724 Date: Tue, 16 Apr 2024 13:36:13 +0200 Subject: [PATCH] Upload files to "lab" --- lab/lab_01.ipynb | 113 +++++++++++++++++++++++++------ lab/lab_02.ipynb | 112 ++++++++++++++++++++++++------- lab/lab_03.ipynb | 168 ++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 328 insertions(+), 65 deletions(-) diff --git a/lab/lab_01.ipynb b/lab/lab_01.ipynb index 0ffe833..8353712 100644 --- a/lab/lab_01.ipynb +++ b/lab/lab_01.ipynb @@ -213,13 +213,34 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 13, "id": "protected-rings", "metadata": {}, "outputs": [], "source": [ "def tm_lookup(sentence):\n", - " return ''" + " return [entry[1].casefold() for entry in translation_memory if entry[0] == sentence]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "99d75100-0f9d-4586-82ef-ab42180472a2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['press the enter button', 'press the enter key']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm_lookup('Wciśnij przycisk Enter')" ] }, { @@ -232,17 +253,17 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 15, "id": "severe-alloy", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "''" + "[]" ] }, - "execution_count": 18, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -261,13 +282,17 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 16, "id": "structural-diesel", "metadata": {}, "outputs": [], "source": [ + "import string \n", + "\n", "def tm_lookup(sentence):\n", - " return ''" + " return [entry[1].casefold() for entry in translation_memory if entry[0] == sentence]\n", + " translator = str.maketrans('', '', string.punctuation) \n", + " return sentence.translate(translator)" ] }, { @@ -280,17 +305,17 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 20, "id": "brief-senegal", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "''" + "[]" ] }, - "execution_count": 12, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -317,15 +342,41 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 24, "id": "mathematical-customs", "metadata": {}, "outputs": [], "source": [ "def tm_lookup(sentence):\n", + " translator = str.maketrans('', '', string.punctuation)\n", + " sentence = sentence.translate(translator).casefold()\n", + " for entry in translation_memory:\n", + " if any(word in entry[0].casefold() for word in sentence.split()):\n", + " return entry[1]\n", " return ''" ] }, + { + "cell_type": "code", + "execution_count": 25, + "id": "f6537825-62a6-4503-91a5-bbb17d84170b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'System restart required'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm_lookup('Wymagane ponowne uruchomienie maszyny')" + ] + }, { "cell_type": "markdown", "id": "meaningful-virus", @@ -344,7 +395,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 26, "id": "humanitarian-wrong", "metadata": {}, "outputs": [], @@ -362,7 +413,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 27, "id": "located-perception", "metadata": {}, "outputs": [], @@ -374,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 28, "id": "advised-casting", "metadata": {}, "outputs": [ @@ -384,7 +435,7 @@ "[('przycisk', 'button'), ('drukarka', 'printer')]" ] }, - "execution_count": 17, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -419,13 +470,35 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 37, "id": "original-tunisia", "metadata": {}, "outputs": [], "source": [ "def glossary_lookup(sentence):\n", - " return ''" + " sentence_words = sentence.casefold().split()\n", + " return [entry for entry in glossary if entry[0] in sentence_words]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "b3ae5504-4168-4fe0-ad25-60558242a31d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('przycisk', 'button'), ('drukarka', 'printer')]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "glossary_lookup('Każda Drukarka posiada przycisk wznowienia drukowania')" ] }, { @@ -438,7 +511,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 38, "id": "adolescent-semiconductor", "metadata": {}, "outputs": [], @@ -452,7 +525,7 @@ "author": "Rafał Jaworski", "email": "rjawor@amu.edu.pl", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -467,7 +540,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.12" }, "subtitle": "1. Podstawowe techniki wspomagania tłumaczenia", "title": "Komputerowe wspomaganie tłumaczenia", diff --git a/lab/lab_02.ipynb b/lab/lab_02.ipynb index 10c2003..a2b779e 100644 --- a/lab/lab_02.ipynb +++ b/lab/lab_02.ipynb @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 31, "id": "confident-prison", "metadata": {}, "outputs": [], @@ -80,13 +80,49 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 29, "id": "continental-submission", "metadata": {}, "outputs": [], "source": [ - "def ice_lookup(sentence, prev_sentence, next_sentence):\n", - " return []" + "def ice_lookup(sentence, prev_sentence, next_sentence, translation_memory):\n", + " \n", + " ice_previous = \"\"\n", + " ice_next = \"\"\n", + " \n", + " for original, translation in translation_memory:\n", + " if sentence == original:\n", + " index = translation_memory.index((original, translation))\n", + " if index > 0:\n", + " ice_previous = translation_memory[index - 1][1]\n", + " if index < len(translation_memory) - 1:\n", + " ice_next = translation_memory[index + 1][1]\n", + " break\n", + " \n", + " return (ice_previous, ice_next)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "f125ddd2-89fc-4496-93d9-9d640b7f616e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "prev: Press the ENTER button , next: The printer is switched off\n" + ] + } + ], + "source": [ + "sentence = \"Sprawdź ustawienia sieciowe\"\n", + "prev_sentence = \"Wciśnij przycisk Enter\"\n", + "next_sentence = \"Wymagane ponowne uruchomienie komputera\"\n", + "\n", + "ice_result = ice_lookup(sentence, prev_sentence, next_sentence, translation_memory)\n", + "print('prev: ', ice_result[0], ', next: ', ice_result[1])" ] }, { @@ -119,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 36, "id": "fourth-pillow", "metadata": {}, "outputs": [], @@ -141,7 +177,7 @@ "id": "graduate-theorem", "metadata": {}, "source": [ - "Odpowiedź:" + "Odpowiedź: Funkcja nie jest dobrą funkcją dystansu, gdyż bierze pod uwagaę jedynie różnice w długości zdań." ] }, { @@ -154,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 56, "id": "continued-christopher", "metadata": {}, "outputs": [], @@ -179,7 +215,7 @@ "id": "metallic-leave", "metadata": {}, "source": [ - "Odpowiedź:" + "Odpowiedź: Nie jest to dobra funkcja dystansu, gdyż znajduje jedynie fakt, że zdania się mogą między sobą różnić." ] }, { @@ -206,7 +242,7 @@ "id": "bibliographic-stopping", "metadata": {}, "source": [ - "Odpowiedź:" + "Odpowiedź: Dystans Lavenshteina jest poprawną funkcją dystansu, opisuje ilość operacji, które należy wykonać, aby porównywane do siebie zdania były takie same (np. zamiana liter, wstawienie innej litery, usunięcie litery, itp)" ] }, { @@ -223,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 62, "id": "secondary-wrist", "metadata": {}, "outputs": [ @@ -233,7 +269,7 @@ "2" ] }, - "execution_count": 5, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -254,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 63, "id": "associate-tuner", "metadata": {}, "outputs": [], @@ -273,7 +309,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 64, "id": "focal-pathology", "metadata": {}, "outputs": [ @@ -283,7 +319,7 @@ "0.9166666666666666" ] }, - "execution_count": 7, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -294,7 +330,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 65, "id": "roman-ceiling", "metadata": {}, "outputs": [ @@ -304,7 +340,7 @@ "0.9428571428571428" ] }, - "execution_count": 8, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } @@ -315,7 +351,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 66, "id": "invisible-cambodia", "metadata": {}, "outputs": [ @@ -325,7 +361,7 @@ "0.631578947368421" ] }, - "execution_count": 9, + "execution_count": 66, "metadata": {}, "output_type": "execute_result" } @@ -344,13 +380,43 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 71, "id": "genetic-cradle", "metadata": {}, "outputs": [], "source": [ - "def fuzzy_lookup(sentence, threshold):\n", - " return []" + "import difflib\n", + "\n", + "def fuzzy_lookup(sentence, threshold, translation_memory):\n", + " fuzzy_matches = []\n", + " for original, translation in translation_memory:\n", + " similarity = difflib.SequenceMatcher(None, sentence, original).ratio()\n", + " if similarity >= threshold:\n", + " fuzzy_matches.append((original, translation, similarity))\n", + " return fuzzy_matches" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "6bebcb12-8c73-4beb-b4c2-00553d3b375f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Wciśnij przycisk Enter', 'Press the ENTER button', 0.8636363636363636)]" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sentence = 'Wcisnij pszycisk ęnter'\n", + "threshold = 0.8\n", + "fuzzy_lookup(sentence, threshold, translation_memory)" ] } ], @@ -358,7 +424,7 @@ "author": "Rafał Jaworski", "email": "rjawor@amu.edu.pl", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -373,7 +439,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.12" }, "subtitle": "2. Zaawansowane użycie pamięci tłumaczeń", "title": "Komputerowe wspomaganie tłumaczenia", diff --git a/lab/lab_03.ipynb b/lab/lab_03.ipynb index 5707f0d..9e5041b 100644 --- a/lab/lab_03.ipynb +++ b/lab/lab_03.ipynb @@ -63,7 +63,7 @@ "id": "diverse-sunglasses", "metadata": {}, "source": [ - "Odpowiedź:" + "Odpowiedź: metal cabinets guides. Proz.com" ] }, { @@ -128,13 +128,41 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "id": "cognitive-cedar", "metadata": {}, "outputs": [], "source": [ - "def terminology_lookup():\n", - " return []" + "import re\n", + "\n", + "def terminology_lookup(text, dictionary):\n", + " pattern = re.compile(r'\\b(?:' + '|'.join(dictionary) + r')\\b', re.IGNORECASE)\n", + " matches = pattern.finditer(text)\n", + " occurance = ''\n", + " for match in matches:\n", + " occurance += (f\"({match.start()}, {match.end()})\")\n", + " return occurance" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5781b95b-3af9-4c82-8388-b98a11e6c343", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'(80, 91)(164, 175)(468, 475)(516, 523)(533, 540)'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "terminology_lookup(text, dictionary)" ] }, { @@ -161,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "id": "tribal-attention", "metadata": {}, "outputs": [ @@ -205,7 +233,7 @@ "IDE\n", ",\n", "see\n", - "Running\n", + "run\n", "Tutorial\n", "Examples\n", "in\n", @@ -218,7 +246,7 @@ "work\n", "for\n", "all\n", - "swing\n", + "Swing\n", "program\n", "—\n", "applet\n", @@ -232,7 +260,7 @@ "be\n", "the\n", "step\n", - "-PRON-\n", + "you\n", "need\n", "to\n", "follow\n", @@ -248,7 +276,7 @@ "platform\n", ",\n", "if\n", - "-PRON-\n", + "you\n", "have\n", "not\n", "already\n", @@ -260,7 +288,7 @@ "program\n", "that\n", "use\n", - "Swing\n", + "swing\n", "component\n", ".\n", "compile\n", @@ -302,13 +330,31 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 19, "id": "surgical-demonstration", "metadata": {}, "outputs": [], "source": [ - "def terminology_lookup():\n", - " return []" + "def terminology_lookup(text, dictionary):\n", + " nlp = spacy.load(\"en_core_web_sm\")\n", + " doc = nlp(text)\n", + "\n", + " word_forms = set()\n", + " for word in dictionary:\n", + " word_forms.add(word)\n", + " for token in doc:\n", + " if token.text.lower() == word:\n", + " word_forms.add(token.lemma_)\n", + "\n", + " matches = []\n", + " for token in doc:\n", + " if token.text.lower() in word_forms:\n", + " matches.append((token.idx, token.idx + len(token)))\n", + "\n", + " occurrences = ''\n", + " for match in matches:\n", + " occurrences += f\"({match[0]}, {match[1]})\"\n", + " return occurrences" ] }, { @@ -337,13 +383,59 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 23, "id": "superb-butterfly", "metadata": {}, "outputs": [], "source": [ "def get_nouns(text):\n", - " return []" + " nlp = spacy.load(\"en_core_web_sm\")\n", + " doc = nlp(text)\n", + " nouns = [token.text for token in doc if token.pos_ == \"NOUN\"]\n", + " \n", + " return nouns" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "8203c3e5-74a6-42c1-add1-e378f09164fd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['programmers',\n", + " 'section',\n", + " 'Swing',\n", + " 'application',\n", + " 'command',\n", + " 'line',\n", + " 'information',\n", + " 'Swing',\n", + " 'application',\n", + " 'compilation',\n", + " 'instructions',\n", + " 'programs',\n", + " 'applets',\n", + " 'applications',\n", + " 'steps',\n", + " 'release',\n", + " 'platform',\n", + " 'program',\n", + " 'Swing',\n", + " 'components',\n", + " 'program',\n", + " 'program']" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_nouns(text)" ] }, { @@ -356,7 +448,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 25, "id": "acting-tolerance", "metadata": {}, "outputs": [], @@ -374,13 +466,29 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 30, "id": "eight-redhead", "metadata": {}, "outputs": [], "source": [ "def extract_terms(text):\n", - " return []" + " nlp = spacy.load(\"en_core_web_sm\")\n", + " doc = nlp(text)\n", + " \n", + " tally = {}\n", + " \n", + " for token in doc:\n", + " if token.pos_ != \"NOUN\":\n", + " continue\n", + " \n", + " lemma = token.lemma_.lower()\n", + " \n", + " if lemma in tally:\n", + " tally[lemma] += 1\n", + " else:\n", + " tally[lemma] = 1\n", + " \n", + " return tally" ] }, { @@ -393,13 +501,29 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 32, "id": "monetary-mambo", "metadata": {}, "outputs": [], "source": [ "def extract_terms(text):\n", - " return []" + " nlp = spacy.load(\"en_core_web_sm\")\n", + " doc = nlp(text)\n", + " \n", + " tally = {}\n", + " \n", + " for token in doc:\n", + " if token.pos_ not in ['NOUN', 'VERB', 'ADJ']:\n", + " continue\n", + " \n", + " lemma = token.lemma_.lower()\n", + " \n", + " if lemma in tally:\n", + " tally[lemma] += 1\n", + " else:\n", + " tally[lemma] = 1\n", + " \n", + " return tally" ] } ], @@ -407,7 +531,7 @@ "author": "Rafał Jaworski", "email": "rjawor@amu.edu.pl", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -422,7 +546,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.12" }, "subtitle": "3. Terminologia", "title": "Komputerowe wspomaganie tłumaczenia",