Compare commits

...

3 Commits
main ... main

Author SHA1 Message Date
Patryk 9b9e46df22 lab 3 2024-04-16 21:12:25 +02:00
Patryk 2b22583359 lab 2 2024-04-16 08:47:38 +02:00
Patryk Osiński ddd2833663 lab 1 2024-04-13 14:22:23 +02:00
3 changed files with 284 additions and 88 deletions

View File

@ -52,9 +52,14 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 5,
"id": "narrow-romantic",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T11:05:09.046685900Z",
"start_time": "2024-04-13T11:05:08.877692800Z"
}
},
"outputs": [],
"source": [
"translation_memory = [('Wciśnij przycisk Enter', 'Press the ENTER button'), \n",
@ -71,9 +76,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 6,
"id": "indonesian-electron",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T11:05:09.131296300Z",
"start_time": "2024-04-13T11:05:08.893315Z"
}
},
"outputs": [],
"source": [
"def tm_lookup(sentence):\n",
@ -82,17 +92,20 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 7,
"id": "compact-trinidad",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T11:05:09.162547Z",
"start_time": "2024-04-13T11:05:08.924558500Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"['Press the ENTER button']"
]
"text/plain": "['Press the ENTER button']"
},
"execution_count": 3,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -119,9 +132,14 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 8,
"id": "exposed-daniel",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T11:05:09.162547Z",
"start_time": "2024-04-13T11:05:08.946722400Z"
}
},
"outputs": [],
"source": [
"translation_memory.append(('Drukarka jest wyłączona', 'The printer is switched off'))\n",
@ -139,17 +157,20 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 9,
"id": "serial-velvet",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T11:05:09.162547Z",
"start_time": "2024-04-13T11:05:08.955053700Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"['Press the ENTER button', 'Press the ENTER key']"
]
"text/plain": "['Press the ENTER button', 'Press the ENTER key']"
},
"execution_count": 5,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -176,17 +197,20 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 10,
"id": "every-gibson",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T11:05:09.178168700Z",
"start_time": "2024-04-13T11:05:08.970677700Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[]"
]
"text/plain": "[]"
},
"execution_count": 6,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -213,13 +237,19 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 21,
"id": "protected-rings",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T11:05:12.496455200Z",
"start_time": "2024-04-13T11:05:12.465209700Z"
}
},
"outputs": [],
"source": [
"def tm_lookup(sentence):\n",
" return ''"
" sentence = sentence.lower()\n",
" return [entry[1] for entry in translation_memory if entry[0].lower() == sentence]"
]
},
{
@ -232,17 +262,20 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 22,
"id": "severe-alloy",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T11:05:14.153976900Z",
"start_time": "2024-04-13T11:05:14.120474700Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"''"
]
"text/plain": "[]"
},
"execution_count": 18,
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
@ -261,13 +294,24 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 23,
"id": "structural-diesel",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T11:05:15.199517300Z",
"start_time": "2024-04-13T11:05:15.105892400Z"
}
},
"outputs": [],
"source": [
"import string\n",
"\n",
"def normalize(sentence):\n",
" return sentence.translate(str.maketrans('', '', string.punctuation)).lower()\n",
"\n",
"def tm_lookup(sentence):\n",
" return ''"
" sentence = normalize(sentence)\n",
" return [entry[1] for entry in translation_memory if normalize(entry[0]) == sentence]"
]
},
{
@ -280,17 +324,20 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 24,
"id": "brief-senegal",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T11:05:17.857048100Z",
"start_time": "2024-04-13T11:05:17.825799600Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"''"
]
"text/plain": "[]"
},
"execution_count": 12,
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@ -317,13 +364,49 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 25,
"id": "mathematical-customs",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T12:00:14.223561700Z",
"start_time": "2024-04-13T12:00:14.159559100Z"
}
},
"outputs": [],
"source": [
"def find_similar(sentence):\n",
" mismatches_threshold = 2\n",
" words = sentence.split()\n",
" words_count = len(words)\n",
" for entry in translation_memory:\n",
" entry_words = normalize(entry[0]).split()\n",
" if words_count != len(entry_words):\n",
" continue\n",
" mismatches = 0\n",
" i = 0\n",
" for word in words:\n",
" if word != entry_words[i]:\n",
" if mismatches < mismatches_threshold:\n",
" mismatches += 1\n",
" else:\n",
" break\n",
" i += 1\n",
" if mismatches < mismatches_threshold:\n",
" return [entry[1]]\n",
" return []\n",
"\n",
"\n",
"def find_exact(sentence):\n",
" return [entry[1] for entry in translation_memory if normalize(entry[0]) == sentence]\n",
"\n",
"\n",
"def tm_lookup(sentence):\n",
" return ''"
" sentence = normalize(sentence)\n",
" exact_match = find_exact(sentence)\n",
" if not exact_match:\n",
" return find_similar(sentence)\n",
" else:\n",
" return exact_match"
]
},
{
@ -344,9 +427,14 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 26,
"id": "humanitarian-wrong",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T12:00:18.016836500Z",
"start_time": "2024-04-13T12:00:17.992836400Z"
}
},
"outputs": [],
"source": [
"glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]"
@ -362,9 +450,14 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 27,
"id": "located-perception",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T12:02:06.039160400Z",
"start_time": "2024-04-13T12:02:06.015160400Z"
}
},
"outputs": [],
"source": [
"def glossary_lookup(sentence):\n",
@ -374,17 +467,20 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 28,
"id": "advised-casting",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T12:02:06.846998600Z",
"start_time": "2024-04-13T12:02:06.823447800Z"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[('przycisk', 'button'), ('drukarka', 'printer')]"
]
"text/plain": "[('przycisk', 'button'), ('drukarka', 'printer')]"
},
"execution_count": 17,
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
@ -406,7 +502,9 @@
"id": "defensive-fifteen",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: \n",
"złożoność pesymistyczna: m*n\n",
"złożoność optymistyczna: m"
]
},
{
@ -421,11 +519,17 @@
"cell_type": "code",
"execution_count": 19,
"id": "original-tunisia",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T11:05:09.247171300Z",
"start_time": "2024-04-13T11:05:09.124790700Z"
}
},
"outputs": [],
"source": [
"def glossary_lookup(sentence):\n",
" return ''"
" sentence_words = sentence.lower().split()\n",
" return [entry for entry in glossary if entry[0].lower() in sentence_words]"
]
},
{
@ -440,11 +544,25 @@
"cell_type": "code",
"execution_count": 20,
"id": "adolescent-semiconductor",
"metadata": {},
"metadata": {
"ExecuteTime": {
"end_time": "2024-04-13T11:05:09.247171300Z",
"start_time": "2024-04-13T11:05:09.146924500Z"
}
},
"outputs": [],
"source": [
"def glossary_lookup(sentence):\n",
" return ''"
" sentence_words = sentence.lower().split()\n",
" entry_words = []\n",
" for entry in glossary:\n",
" entry_words.append((entry[0].lower(), entry[1]))\n",
" result = []\n",
" for word in sentence_words:\n",
" for entry_word in entry_words:\n",
" if entry_word[0] == word:\n",
" result.append(entry_word)\n",
" return result"
]
}
],
@ -452,7 +570,7 @@
"author": "Rafał Jaworski",
"email": "rjawor@amu.edu.pl",
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -467,7 +585,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.10.11"
},
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
"title": "Komputerowe wspomaganie tłumaczenia",

View File

@ -85,8 +85,31 @@
"metadata": {},
"outputs": [],
"source": [
"def exact_match(sentence):\n",
" for key, entry in enumerate(translation_memory):\n",
" if entry[0] == sentence:\n",
" return key, entry[1]\n",
" return None, None\n",
"\n",
"\n",
"def has_exact_match_on_index(index, sentence):\n",
" return translation_memory[index][0] == sentence\n",
"\n",
"\n",
"def ice_lookup(sentence, prev_sentence, next_sentence):\n",
" return []"
" index, match = exact_match(sentence)\n",
" trans_length = len(translation_memory)\n",
" if index is None:\n",
" return []\n",
" if next_sentence \\\n",
" and index + 1 < trans_length \\\n",
" and not has_exact_match_on_index(index + 1, next_sentence):\n",
" return []\n",
" if prev_sentence \\\n",
" and index > 0 \\\n",
" and not has_exact_match_on_index(index - 1, prev_sentence):\n",
" return []\n",
" return [match]"
]
},
{
@ -141,7 +164,7 @@
"id": "graduate-theorem",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Nie. 1, 3, 4."
]
},
{
@ -179,7 +202,7 @@
"id": "metallic-leave",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Tak. 1, 2, 3, 4."
]
},
{
@ -206,7 +229,17 @@
"id": "bibliographic-stopping",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Tak.\n",
"1. Liczba operacji wykonanych nie może być ujemna.\n",
"2. Gdy x == y, nie są wymagane żadne operacje edycyjne, więc wynik funkcji to 0.\n",
"3. Zmiana jednego łańcucha znaków w drugi, wymaga tyle samo operacji edycji, co zmiana drugiego w pierwszy.\n",
" Studia -> Studiel = 2; Studiel -> Studia = 2; 2 == 2\n",
"4. Istnieją trzy opcje\n",
" - Jeżeli x == y == z, to 0 + 0 == 0\n",
" - Jeżeli x == y, x != z, a x -> z = n, to y -> z = n więc albo 0 + n == n, albo n + n > 0\n",
" - Jeżeli x != y != z to im z jest bliżej do x, tym jest dalej od y (jednostką odległości jest liczba przekształceń). Można by to przedstawić graficznie jako trójkąt (x, y, z). z stanowi punkt na pośredniej drodze pomiędzy x i y, która nie może być krótsza niż droga bezpośrednia (złożenie przekształceń x -> z oraz z -> y też jest przekształceniem x -> y, więc droga bezpośrednia ma co najwyżej tyle operacji) - wynika to z własności trójkąta.\n",
" Studia -> Studiel = 2; Studiel -> udia = 4; udia -> Studia = 2;\n",
" 2 + 4 > 2; 2 + 2 == 4"
]
},
{
@ -214,6 +247,7 @@
"id": "attended-channels",
"metadata": {},
"source": [
"\n",
"W Pythonie dostępna jest biblioteka zawierająca implementację dystansu Levenshteina. Zainstaluj ją w swoim systemie przy użyciu polecenia:\n",
"\n",
"`pip3 install python-Levenshtein`\n",
@ -223,19 +257,20 @@
},
{
"cell_type": "code",
"execution_count": 5,
"id": "secondary-wrist",
"execution_count": null,
"id": "4064ce50",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
"ename": "ModuleNotFoundError",
"evalue": "No module named 'Levenshtein'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn [2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mLevenshtein\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m distance \u001b[38;5;28;01mas\u001b[39;00m levenshtein_distance\n\u001b[1;32m 3\u001b[0m levenshtein_distance(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mkotek\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mkotki\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'Levenshtein'"
]
}
],
"source": [
@ -350,7 +385,11 @@
"outputs": [],
"source": [
"def fuzzy_lookup(sentence, threshold):\n",
" return []"
" results = []\n",
" for entry in translation_memory:\n",
" if levenshtein_similarity(entry[0], sentence) >= threshold:\n",
" results.append(entry[1])\n",
" return results"
]
}
],
@ -358,7 +397,7 @@
"author": "Rafał Jaworski",
"email": "rjawor@amu.edu.pl",
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -373,7 +412,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.9.2"
},
"subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
"title": "Komputerowe wspomaganie tłumaczenia",

View File

@ -63,7 +63,7 @@
"id": "diverse-sunglasses",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: \"metal cabinet guides\". https://translate.google.pl/"
]
},
{
@ -115,7 +115,7 @@
"metadata": {},
"outputs": [],
"source": [
"dictionary = ['program', 'application', 'applet' 'compile']"
"dictionary = ['program', 'application', 'applet', 'compile']"
]
},
{
@ -133,8 +133,18 @@
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def terminology_lookup():\n",
" return []"
" result = []\n",
" regex = ''\n",
" for word in dictionary:\n",
" if regex != '':\n",
" regex += '|'\n",
" regex += '(' + re.escape(word) + ')'\n",
" for occurrence in re.finditer(regex, text, re.I):\n",
" result.append((occurrence.group(), occurrence.start(), occurrence.end()))\n",
" return result"
]
},
{
@ -308,7 +318,12 @@
"outputs": [],
"source": [
"def terminology_lookup():\n",
" return []"
" result = []\n",
" for token in doc:\n",
" if token.lemma_ in dictionary:\n",
" result.append((token, token.idx, token.idx + len(token)))\n",
"\n",
" return result"
]
},
{
@ -343,7 +358,13 @@
"outputs": [],
"source": [
"def get_nouns(text):\n",
" return []"
" result = []\n",
" doc = nlp(text)\n",
" for token in doc:\n",
" if token.pos_ == 'NOUN':\n",
" result.append(token)\n",
"\n",
" return result"
]
},
{
@ -380,7 +401,16 @@
"outputs": [],
"source": [
"def extract_terms(text):\n",
" return []"
" result = {}\n",
" doc = nlp(text)\n",
" for token in doc:\n",
" if token.pos_ == 'NOUN':\n",
" if result.get(token.lemma_) is None:\n",
" result[token.lemma_] = 1\n",
" else:\n",
" result[token.lemma_] += 1\n",
"\n",
" return result"
]
},
{
@ -399,7 +429,16 @@
"outputs": [],
"source": [
"def extract_terms(text):\n",
" return []"
" result = {}\n",
" doc = nlp(text)\n",
" for token in doc:\n",
" if token.pos_ in ['NOUN', 'VERB', 'ADJ']:\n",
" if result.get(token.lemma_) is None:\n",
" result[token.lemma_] = 1\n",
" else:\n",
" result[token.lemma_] += 1\n",
"\n",
" return result"
]
}
],
@ -407,7 +446,7 @@
"author": "Rafał Jaworski",
"email": "rjawor@amu.edu.pl",
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -422,7 +461,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.9.2"
},
"subtitle": "3. Terminologia",
"title": "Komputerowe wspomaganie tłumaczenia",