From ddd28336637138e5741ad84c823e36d4f8c1af19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20Osi=C5=84ski?= Date: Sat, 13 Apr 2024 14:22:23 +0200 Subject: [PATCH] lab 1 --- lab/lab_01.ipynb | 242 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 180 insertions(+), 62 deletions(-) diff --git a/lab/lab_01.ipynb b/lab/lab_01.ipynb index 0ffe833..8e7aa68 100644 --- a/lab/lab_01.ipynb +++ b/lab/lab_01.ipynb @@ -52,9 +52,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "id": "narrow-romantic", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T11:05:09.046685900Z", + "start_time": "2024-04-13T11:05:08.877692800Z" + } + }, "outputs": [], "source": [ "translation_memory = [('Wciśnij przycisk Enter', 'Press the ENTER button'), \n", @@ -71,9 +76,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "id": "indonesian-electron", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T11:05:09.131296300Z", + "start_time": "2024-04-13T11:05:08.893315Z" + } + }, "outputs": [], "source": [ "def tm_lookup(sentence):\n", @@ -82,17 +92,20 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "id": "compact-trinidad", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T11:05:09.162547Z", + "start_time": "2024-04-13T11:05:08.924558500Z" + } + }, "outputs": [ { "data": { - "text/plain": [ - "['Press the ENTER button']" - ] + "text/plain": "['Press the ENTER button']" }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -119,9 +132,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "id": "exposed-daniel", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T11:05:09.162547Z", + "start_time": "2024-04-13T11:05:08.946722400Z" + } + }, "outputs": [], "source": [ "translation_memory.append(('Drukarka jest wyłączona', 'The printer is switched off'))\n", @@ -139,17 +157,20 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "id": "serial-velvet", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T11:05:09.162547Z", + "start_time": "2024-04-13T11:05:08.955053700Z" + } + }, "outputs": [ { "data": { - "text/plain": [ - "['Press the ENTER button', 'Press the ENTER key']" - ] + "text/plain": "['Press the ENTER button', 'Press the ENTER key']" }, - "execution_count": 5, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -176,17 +197,20 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "id": "every-gibson", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T11:05:09.178168700Z", + "start_time": "2024-04-13T11:05:08.970677700Z" + } + }, "outputs": [ { "data": { - "text/plain": [ - "[]" - ] + "text/plain": "[]" }, - "execution_count": 6, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -213,13 +237,19 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 21, "id": "protected-rings", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T11:05:12.496455200Z", + "start_time": "2024-04-13T11:05:12.465209700Z" + } + }, "outputs": [], "source": [ "def tm_lookup(sentence):\n", - " return ''" + " sentence = sentence.lower()\n", + " return [entry[1] for entry in translation_memory if entry[0].lower() == sentence]" ] }, { @@ -232,17 +262,20 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 22, "id": "severe-alloy", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T11:05:14.153976900Z", + "start_time": "2024-04-13T11:05:14.120474700Z" + } + }, "outputs": [ { "data": { - "text/plain": [ - "''" - ] + "text/plain": "[]" }, - "execution_count": 18, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -261,13 +294,24 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 23, "id": "structural-diesel", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T11:05:15.199517300Z", + "start_time": "2024-04-13T11:05:15.105892400Z" + } + }, "outputs": [], "source": [ + "import string\n", + "\n", + "def normalize(sentence):\n", + " return sentence.translate(str.maketrans('', '', string.punctuation)).lower()\n", + "\n", "def tm_lookup(sentence):\n", - " return ''" + " sentence = normalize(sentence)\n", + " return [entry[1] for entry in translation_memory if normalize(entry[0]) == sentence]" ] }, { @@ -280,17 +324,20 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 24, "id": "brief-senegal", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T11:05:17.857048100Z", + "start_time": "2024-04-13T11:05:17.825799600Z" + } + }, "outputs": [ { "data": { - "text/plain": [ - "''" - ] + "text/plain": "[]" }, - "execution_count": 12, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -317,13 +364,49 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 25, "id": "mathematical-customs", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T12:00:14.223561700Z", + "start_time": "2024-04-13T12:00:14.159559100Z" + } + }, "outputs": [], "source": [ + "def find_similar(sentence):\n", + " mismatches_threshold = 2\n", + " words = sentence.split()\n", + " words_count = len(words)\n", + " for entry in translation_memory:\n", + " entry_words = normalize(entry[0]).split()\n", + " if words_count != len(entry_words):\n", + " continue\n", + " mismatches = 0\n", + " i = 0\n", + " for word in words:\n", + " if word != entry_words[i]:\n", + " if mismatches < mismatches_threshold:\n", + " mismatches += 1\n", + " else:\n", + " break\n", + " i += 1\n", + " if mismatches < mismatches_threshold:\n", + " return entry[1]\n", + " return []\n", + "\n", + "\n", + "def find_exact(sentence):\n", + " return [entry[1] for entry in translation_memory if normalize(entry[0]) == sentence]\n", + "\n", + "\n", "def tm_lookup(sentence):\n", - " return ''" + " sentence = normalize(sentence)\n", + " exact_match = find_exact(sentence)\n", + " if not exact_match:\n", + " return find_similar(sentence)\n", + " else:\n", + " return exact_match" ] }, { @@ -344,9 +427,14 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 26, "id": "humanitarian-wrong", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T12:00:18.016836500Z", + "start_time": "2024-04-13T12:00:17.992836400Z" + } + }, "outputs": [], "source": [ "glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]" @@ -362,9 +450,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 27, "id": "located-perception", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T12:02:06.039160400Z", + "start_time": "2024-04-13T12:02:06.015160400Z" + } + }, "outputs": [], "source": [ "def glossary_lookup(sentence):\n", @@ -374,17 +467,20 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 28, "id": "advised-casting", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T12:02:06.846998600Z", + "start_time": "2024-04-13T12:02:06.823447800Z" + } + }, "outputs": [ { "data": { - "text/plain": [ - "[('przycisk', 'button'), ('drukarka', 'printer')]" - ] + "text/plain": "[('przycisk', 'button'), ('drukarka', 'printer')]" }, - "execution_count": 17, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -406,7 +502,9 @@ "id": "defensive-fifteen", "metadata": {}, "source": [ - "Odpowiedź:" + "Odpowiedź: \n", + "złożoność pesymistyczna: m*n\n", + "złożoność optymistyczna: m" ] }, { @@ -421,11 +519,17 @@ "cell_type": "code", "execution_count": 19, "id": "original-tunisia", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T11:05:09.247171300Z", + "start_time": "2024-04-13T11:05:09.124790700Z" + } + }, "outputs": [], "source": [ "def glossary_lookup(sentence):\n", - " return ''" + " sentence_words = sentence.lower().split()\n", + " return [entry for entry in glossary if entry[0].lower() in sentence_words]" ] }, { @@ -440,11 +544,25 @@ "cell_type": "code", "execution_count": 20, "id": "adolescent-semiconductor", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-04-13T11:05:09.247171300Z", + "start_time": "2024-04-13T11:05:09.146924500Z" + } + }, "outputs": [], "source": [ "def glossary_lookup(sentence):\n", - " return ''" + " sentence_words = sentence.lower().split()\n", + " entry_words = []\n", + " for entry in glossary:\n", + " entry_words.append((entry[0].lower(), entry[1]))\n", + " result = []\n", + " for word in sentence_words:\n", + " for entry_word in entry_words:\n", + " if entry_word[0] == word:\n", + " result.append(entry_word)\n", + " return result" ] } ], @@ -452,7 +570,7 @@ "author": "Rafał Jaworski", "email": "rjawor@amu.edu.pl", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -467,7 +585,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.11" }, "subtitle": "1. Podstawowe techniki wspomagania tłumaczenia", "title": "Komputerowe wspomaganie tłumaczenia",