forked from bfijalkowski/KWT-2024
wip
This commit is contained in:
parent
ddd2833663
commit
78982a4f21
@ -103,7 +103,9 @@
|
|||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": "['Press the ENTER button']"
|
"text/plain": [
|
||||||
|
"['Press the ENTER button']"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 7,
|
"execution_count": 7,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -168,7 +170,9 @@
|
|||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": "['Press the ENTER button', 'Press the ENTER key']"
|
"text/plain": [
|
||||||
|
"['Press the ENTER button', 'Press the ENTER key']"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 9,
|
"execution_count": 9,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -208,7 +212,9 @@
|
|||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": "[]"
|
"text/plain": [
|
||||||
|
"[]"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 10,
|
"execution_count": 10,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -273,7 +279,9 @@
|
|||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": "[]"
|
"text/plain": [
|
||||||
|
"[]"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 22,
|
"execution_count": 22,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -335,7 +343,9 @@
|
|||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": "[]"
|
"text/plain": [
|
||||||
|
"[]"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 24,
|
"execution_count": 24,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -478,7 +488,9 @@
|
|||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": "[('przycisk', 'button'), ('drukarka', 'printer')]"
|
"text/plain": [
|
||||||
|
"[('przycisk', 'button'), ('drukarka', 'printer')]"
|
||||||
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 28,
|
"execution_count": 28,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
@ -585,7 +597,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.11"
|
"version": "3.9.2"
|
||||||
},
|
},
|
||||||
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
|
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
@ -40,9 +40,11 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "code",
|
||||||
"id": "existing-approval",
|
"execution_count": null,
|
||||||
|
"id": "961796fd-4463-4a17-ac15-afe712b3959e",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"Jedną z funkcji dostępnych we wszystkich większych programach do wspomagania tłumaczenia jest znajdowanie bardzo pewnych dopasowań w pamięci tłumaczeń. Są one zwane **ICE** (In-Context Exact match) lub 101% match. Są to takie dopasowania z pamięci tłumaczeń, dla których nie tylko zdanie źródłowe z TM jest identyczne z tłumaczonym, ale także poprzednie zdanie źródłowe z TM zgadza się z poprzednim zdaniem tłumaczonym oraz następne z TM z następnym tłumaczonym."
|
"Jedną z funkcji dostępnych we wszystkich większych programach do wspomagania tłumaczenia jest znajdowanie bardzo pewnych dopasowań w pamięci tłumaczeń. Są one zwane **ICE** (In-Context Exact match) lub 101% match. Są to takie dopasowania z pamięci tłumaczeń, dla których nie tylko zdanie źródłowe z TM jest identyczne z tłumaczonym, ale także poprzednie zdanie źródłowe z TM zgadza się z poprzednim zdaniem tłumaczonym oraz następne z TM z następnym tłumaczonym."
|
||||||
]
|
]
|
||||||
@ -85,8 +87,31 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"def exact_match(sentence):\n",
|
||||||
|
" for key, entry in enumerate(translation_memory):\n",
|
||||||
|
" if entry[0] == sentence:\n",
|
||||||
|
" return key, entry[1]\n",
|
||||||
|
" return None, None\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"def has_exact_match_on_index(index, sentence):\n",
|
||||||
|
" return translation_memory[index][0] == sentence\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"def ice_lookup(sentence, prev_sentence, next_sentence):\n",
|
"def ice_lookup(sentence, prev_sentence, next_sentence):\n",
|
||||||
" return []"
|
" index, match = exact_match(sentence)\n",
|
||||||
|
" trans_length = len(translation_memory)\n",
|
||||||
|
" if index is None:\n",
|
||||||
|
" return []\n",
|
||||||
|
" if next_sentence \\\n",
|
||||||
|
" and index < trans_length \\\n",
|
||||||
|
" and not has_exact_match_on_index(index + 1, next_sentence):\n",
|
||||||
|
" return []\n",
|
||||||
|
" if prev_sentence \\\n",
|
||||||
|
" and index > 0 \\\n",
|
||||||
|
" and not has_exact_match_on_index(index - 1, prev_sentence):\n",
|
||||||
|
" return []\n",
|
||||||
|
" return [match]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -141,7 +166,7 @@
|
|||||||
"id": "graduate-theorem",
|
"id": "graduate-theorem",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Odpowiedź:"
|
"Odpowiedź: Nie. 1, 3, 4."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -179,7 +204,7 @@
|
|||||||
"id": "metallic-leave",
|
"id": "metallic-leave",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Odpowiedź:"
|
"Odpowiedź: Tak. 1, 2, 3, 4."
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -206,7 +231,17 @@
|
|||||||
"id": "bibliographic-stopping",
|
"id": "bibliographic-stopping",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Odpowiedź:"
|
"Odpowiedź: Tak.\n",
|
||||||
|
"1. Liczba operacji wykonanych nie może być ujemna.\n",
|
||||||
|
"2. Gdy x == y, nie są wymagane żadne operacje edycyjne, więc wynik funkcji to 0.\n",
|
||||||
|
"3. Zmiana jednego łańcucha znaków w drugi, wymaga tyle samo operacji edycji, co zmiana drugiego w pierwszy.\n",
|
||||||
|
" Studia -> Studiel = 2; Studiel -> Studia = 2; 2 == 2\n",
|
||||||
|
"4. Istnieją trzy opcje\n",
|
||||||
|
" - Jeżeli x == y == z, więc 0 + 0 == 0\n",
|
||||||
|
" - Jeżeli x == y, x != z, a x -> z = n, to y -> z = n więc albo 0 + n == n, albo n + n > 0\n",
|
||||||
|
" - Jeżeli x != y != z to im z jest bliżej do x, tym jest dalej od y (jednostką odległości jest liczba przekształceń). Można by to przedstawić graficznie jako trójkąt (x, y, z). z stanowi punkt na pośredniej drodze pomiędzy x i y, która nie może być dłuższa niż droga bezpośrednia - wynika to z własności trójkąta.\n",
|
||||||
|
" Studia -> Studiel = 2; Studiel -> udia = 4; udia -> Studia = 2;\n",
|
||||||
|
" 2 + 4 > 2; 2 + 2 == 4"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -214,6 +249,7 @@
|
|||||||
"id": "attended-channels",
|
"id": "attended-channels",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
|
"\n",
|
||||||
"W Pythonie dostępna jest biblioteka zawierająca implementację dystansu Levenshteina. Zainstaluj ją w swoim systemie przy użyciu polecenia:\n",
|
"W Pythonie dostępna jest biblioteka zawierająca implementację dystansu Levenshteina. Zainstaluj ją w swoim systemie przy użyciu polecenia:\n",
|
||||||
"\n",
|
"\n",
|
||||||
"`pip3 install python-Levenshtein`\n",
|
"`pip3 install python-Levenshtein`\n",
|
||||||
@ -223,21 +259,10 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": null,
|
||||||
"id": "secondary-wrist",
|
"id": "355e4914-08da-4bd4-b8a2-67b055831c30",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [],
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"2"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 5,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"from Levenshtein import distance as levenshtein_distance\n",
|
"from Levenshtein import distance as levenshtein_distance\n",
|
||||||
"\n",
|
"\n",
|
||||||
@ -314,22 +339,9 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "raw",
|
||||||
"execution_count": 9,
|
"id": "4a47854f-df2e-451f-8e09-99f59210f86f",
|
||||||
"id": "invisible-cambodia",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"0.631578947368421"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 9,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
"source": [
|
||||||
"levenshtein_similarity('Spróbuj wyłączyć i włączyć komputer', 'Nie próbuj wyłączać i włączać drukarki')"
|
"levenshtein_similarity('Spróbuj wyłączyć i włączyć komputer', 'Nie próbuj wyłączać i włączać drukarki')"
|
||||||
]
|
]
|
||||||
@ -350,7 +362,11 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def fuzzy_lookup(sentence, threshold):\n",
|
"def fuzzy_lookup(sentence, threshold):\n",
|
||||||
" return []"
|
" results = []\n",
|
||||||
|
" for entry in translation_memory:\n",
|
||||||
|
" if levenshtein_similarity(entry[0], sentence) >= threshold:\n",
|
||||||
|
" results.append(entry[1])\n",
|
||||||
|
" return results"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -358,7 +374,7 @@
|
|||||||
"author": "Rafał Jaworski",
|
"author": "Rafał Jaworski",
|
||||||
"email": "rjawor@amu.edu.pl",
|
"email": "rjawor@amu.edu.pl",
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -373,7 +389,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.9.2"
|
||||||
},
|
},
|
||||||
"subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
|
"subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
189
lab/lab_03.ipynb
189
lab/lab_03.ipynb
@ -63,7 +63,7 @@
|
|||||||
"id": "diverse-sunglasses",
|
"id": "diverse-sunglasses",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Odpowiedź:"
|
"Odpowiedź: \"metal cabinet guides\". https://translate.google.pl/"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -115,7 +115,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"dictionary = ['program', 'application', 'applet' 'compile']"
|
"dictionary = ['program', 'application', 'applet', 'compile']"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -133,8 +133,18 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"import re\n",
|
||||||
|
"\n",
|
||||||
"def terminology_lookup():\n",
|
"def terminology_lookup():\n",
|
||||||
" return []"
|
" result = []\n",
|
||||||
|
" regex = ''\n",
|
||||||
|
" for word in dictionary:\n",
|
||||||
|
" if regex != '':\n",
|
||||||
|
" regex += '|'\n",
|
||||||
|
" regex += '(' + word + ')'\n",
|
||||||
|
" for occurrence in re.finditer(regex, text, re.I):\n",
|
||||||
|
" result.append((occurrence.group(), occurrence.start(), occurrence.end()))\n",
|
||||||
|
" return result"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -161,116 +171,34 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 1,
|
||||||
"id": "tribal-attention",
|
"id": "tribal-attention",
|
||||||
"metadata": {},
|
"metadata": {
|
||||||
|
"ExecuteTime": {
|
||||||
|
"end_time": "2024-04-20T15:23:32.727687100Z",
|
||||||
|
"start_time": "2024-04-20T15:23:24.826454500Z"
|
||||||
|
}
|
||||||
|
},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"ename": "KeyboardInterrupt",
|
||||||
"output_type": "stream",
|
"evalue": "",
|
||||||
"text": [
|
"output_type": "error",
|
||||||
" \n",
|
"traceback": [
|
||||||
"for\n",
|
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
|
||||||
"all\n",
|
"\u001B[1;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
|
||||||
"Java\n",
|
"Cell \u001B[1;32mIn[1], line 1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[38;5;28;01mimport\u001B[39;00m \u001B[38;5;21;01mspacy\u001B[39;00m\n\u001B[0;32m 2\u001B[0m nlp \u001B[38;5;241m=\u001B[39m spacy\u001B[38;5;241m.\u001B[39mload(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124men_core_web_sm\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m 4\u001B[0m doc \u001B[38;5;241m=\u001B[39m nlp(text)\n",
|
||||||
"programmer\n",
|
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\__init__.py:13\u001B[0m\n\u001B[0;32m 10\u001B[0m \u001B[38;5;66;03m# These are imported as part of the API\u001B[39;00m\n\u001B[0;32m 11\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mthinc\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mapi\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Config, prefer_gpu, require_cpu, require_gpu \u001B[38;5;66;03m# noqa: F401\u001B[39;00m\n\u001B[1;32m---> 13\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m pipeline \u001B[38;5;66;03m# noqa: F401\u001B[39;00m\n\u001B[0;32m 14\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m util\n\u001B[0;32m 15\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mabout\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m __version__ \u001B[38;5;66;03m# noqa: F401\u001B[39;00m\n",
|
||||||
":\n",
|
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\pipeline\\__init__.py:1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mattributeruler\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m AttributeRuler\n\u001B[0;32m 2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdep_parser\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m DependencyParser\n\u001B[0;32m 3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01medit_tree_lemmatizer\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m EditTreeLemmatizer\n",
|
||||||
"this\n",
|
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\pipeline\\attributeruler.py:8\u001B[0m\n\u001B[0;32m 6\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m util\n\u001B[0;32m 7\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01merrors\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Errors\n\u001B[1;32m----> 8\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mlanguage\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Language\n\u001B[0;32m 9\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mmatcher\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Matcher\n\u001B[0;32m 10\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mscorer\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Scorer\n",
|
||||||
"section\n",
|
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\language.py:43\u001B[0m\n\u001B[0;32m 41\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mlang\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mtokenizer_exceptions\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m BASE_EXCEPTIONS, URL_MATCH\n\u001B[0;32m 42\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mlookups\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m load_lookups\n\u001B[1;32m---> 43\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mpipe_analysis\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m analyze_pipes, print_pipe_analysis, validate_attrs\n\u001B[0;32m 44\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mschemas\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m (\n\u001B[0;32m 45\u001B[0m ConfigSchema,\n\u001B[0;32m 46\u001B[0m ConfigSchemaInit,\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 49\u001B[0m validate_init_settings,\n\u001B[0;32m 50\u001B[0m )\n\u001B[0;32m 51\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mscorer\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Scorer\n",
|
||||||
"explain\n",
|
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\pipe_analysis.py:6\u001B[0m\n\u001B[0;32m 3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01mwasabi\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m msg\n\u001B[0;32m 5\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01merrors\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Errors\n\u001B[1;32m----> 6\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mtokens\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Doc, Span, Token\n\u001B[0;32m 7\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutil\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m dot_to_dict\n\u001B[0;32m 9\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m TYPE_CHECKING:\n\u001B[0;32m 10\u001B[0m \u001B[38;5;66;03m# This lets us add type hints for mypy etc. without causing circular imports\u001B[39;00m\n",
|
||||||
"how\n",
|
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\tokens\\__init__.py:1\u001B[0m\n\u001B[1;32m----> 1\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m_serialize\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m DocBin\n\u001B[0;32m 2\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdoc\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Doc\n\u001B[0;32m 3\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mmorphanalysis\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m MorphAnalysis\n",
|
||||||
"to\n",
|
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\tokens\\_serialize.py:14\u001B[0m\n\u001B[0;32m 12\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01merrors\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Errors\n\u001B[0;32m 13\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mutil\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m SimpleFrozenList, ensure_path\n\u001B[1;32m---> 14\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mvocab\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m Vocab\n\u001B[0;32m 15\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01m_dict_proxies\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m SpanGroups\n\u001B[0;32m 16\u001B[0m \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;21;01m.\u001B[39;00m\u001B[38;5;21;01mdoc\u001B[39;00m \u001B[38;5;28;01mimport\u001B[39;00m DOCBIN_ALL_ATTRS \u001B[38;5;28;01mas\u001B[39;00m ALL_ATTRS\n",
|
||||||
"compile\n",
|
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\vocab.pyx:1\u001B[0m, in \u001B[0;36minit spacy.vocab\u001B[1;34m()\u001B[0m\n",
|
||||||
"and\n",
|
"File \u001B[1;32mj:\\.AppData\\Python\\Python310\\site-packages\\spacy\\tokens\\doc.pyx:1\u001B[0m, in \u001B[0;36minit spacy.tokens.doc\u001B[1;34m()\u001B[0m\n",
|
||||||
"run\n",
|
"File \u001B[1;32m<frozen importlib._bootstrap>:404\u001B[0m, in \u001B[0;36mparent\u001B[1;34m(self)\u001B[0m\n",
|
||||||
"a\n",
|
"\u001B[1;31mKeyboardInterrupt\u001B[0m: "
|
||||||
"swing\n",
|
|
||||||
"application\n",
|
|
||||||
"from\n",
|
|
||||||
"the\n",
|
|
||||||
"command\n",
|
|
||||||
"line\n",
|
|
||||||
".\n",
|
|
||||||
"for\n",
|
|
||||||
"information\n",
|
|
||||||
"on\n",
|
|
||||||
"compile\n",
|
|
||||||
"and\n",
|
|
||||||
"run\n",
|
|
||||||
"a\n",
|
|
||||||
"swing\n",
|
|
||||||
"application\n",
|
|
||||||
"use\n",
|
|
||||||
"NetBeans\n",
|
|
||||||
"IDE\n",
|
|
||||||
",\n",
|
|
||||||
"see\n",
|
|
||||||
"Running\n",
|
|
||||||
"Tutorial\n",
|
|
||||||
"Examples\n",
|
|
||||||
"in\n",
|
|
||||||
"NetBeans\n",
|
|
||||||
"IDE\n",
|
|
||||||
".\n",
|
|
||||||
"the\n",
|
|
||||||
"compilation\n",
|
|
||||||
"instruction\n",
|
|
||||||
"work\n",
|
|
||||||
"for\n",
|
|
||||||
"all\n",
|
|
||||||
"swing\n",
|
|
||||||
"program\n",
|
|
||||||
"—\n",
|
|
||||||
"applet\n",
|
|
||||||
",\n",
|
|
||||||
"as\n",
|
|
||||||
"well\n",
|
|
||||||
"as\n",
|
|
||||||
"application\n",
|
|
||||||
".\n",
|
|
||||||
"here\n",
|
|
||||||
"be\n",
|
|
||||||
"the\n",
|
|
||||||
"step\n",
|
|
||||||
"-PRON-\n",
|
|
||||||
"need\n",
|
|
||||||
"to\n",
|
|
||||||
"follow\n",
|
|
||||||
":\n",
|
|
||||||
"install\n",
|
|
||||||
"the\n",
|
|
||||||
"late\n",
|
|
||||||
"release\n",
|
|
||||||
"of\n",
|
|
||||||
"the\n",
|
|
||||||
"Java\n",
|
|
||||||
"SE\n",
|
|
||||||
"platform\n",
|
|
||||||
",\n",
|
|
||||||
"if\n",
|
|
||||||
"-PRON-\n",
|
|
||||||
"have\n",
|
|
||||||
"not\n",
|
|
||||||
"already\n",
|
|
||||||
"do\n",
|
|
||||||
"so\n",
|
|
||||||
".\n",
|
|
||||||
"create\n",
|
|
||||||
"a\n",
|
|
||||||
"program\n",
|
|
||||||
"that\n",
|
|
||||||
"use\n",
|
|
||||||
"Swing\n",
|
|
||||||
"component\n",
|
|
||||||
".\n",
|
|
||||||
"compile\n",
|
|
||||||
"the\n",
|
|
||||||
"program\n",
|
|
||||||
".\n",
|
|
||||||
"run\n",
|
|
||||||
"the\n",
|
|
||||||
"program\n",
|
|
||||||
".\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -308,7 +236,12 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def terminology_lookup():\n",
|
"def terminology_lookup():\n",
|
||||||
" return []"
|
" result = []\n",
|
||||||
|
" for token in doc:\n",
|
||||||
|
" if token.lemma_ in dictionary:\n",
|
||||||
|
" result.append((token, token.idx, token.idx + len(token)))\n",
|
||||||
|
"\n",
|
||||||
|
" return result"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -343,7 +276,13 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def get_nouns(text):\n",
|
"def get_nouns(text):\n",
|
||||||
" return []"
|
" result = []\n",
|
||||||
|
" doc = nlp(text)\n",
|
||||||
|
" for token in doc:\n",
|
||||||
|
" if token.pos_ == 'NOUN':\n",
|
||||||
|
" result.append(token)\n",
|
||||||
|
"\n",
|
||||||
|
" return result"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -380,7 +319,16 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def extract_terms(text):\n",
|
"def extract_terms(text):\n",
|
||||||
" return []"
|
" result = {}\n",
|
||||||
|
" doc = nlp(text)\n",
|
||||||
|
" for token in doc:\n",
|
||||||
|
" if token.pos_ == 'NOUN':\n",
|
||||||
|
" if result.get(token.lemma_) is None:\n",
|
||||||
|
" result[token.lemma_] = 1\n",
|
||||||
|
" else:\n",
|
||||||
|
" result[token.lemma_] += 1\n",
|
||||||
|
"\n",
|
||||||
|
" return result"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -399,7 +347,16 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def extract_terms(text):\n",
|
"def extract_terms(text):\n",
|
||||||
" return []"
|
" result = {}\n",
|
||||||
|
" doc = nlp(text)\n",
|
||||||
|
" for token in doc:\n",
|
||||||
|
" if token.pos_ in ['NOUN', 'VERB', 'ADJ']:\n",
|
||||||
|
" if result.get(token.lemma_) is None:\n",
|
||||||
|
" result[token.lemma_] = 1\n",
|
||||||
|
" else:\n",
|
||||||
|
" result[token.lemma_] += 1\n",
|
||||||
|
"\n",
|
||||||
|
" return result"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -407,7 +364,7 @@
|
|||||||
"author": "Rafał Jaworski",
|
"author": "Rafał Jaworski",
|
||||||
"email": "rjawor@amu.edu.pl",
|
"email": "rjawor@amu.edu.pl",
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -422,7 +379,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.9.2"
|
||||||
},
|
},
|
||||||
"subtitle": "3. Terminologia",
|
"subtitle": "3. Terminologia",
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
File diff suppressed because one or more lines are too long
@ -60,8 +60,14 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"import regex\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"def find_tags(text):\n",
|
"def find_tags(text):\n",
|
||||||
" return []"
|
" result = []\n",
|
||||||
|
" for occurance in regex.finditer(\"(\\</?\\w+\\>)\", text, regex.IGNORECASE):\n",
|
||||||
|
" result.append(occurance.span())\n",
|
||||||
|
" return result"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -79,8 +85,12 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"import regex\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Assuming text is a single word\n",
|
||||||
"def is_translatable(text):\n",
|
"def is_translatable(text):\n",
|
||||||
" return True"
|
" return regex.fullmatch(\"[A-Z\\-]+\", text, regex.IGNORECASE) is not None"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -98,8 +108,26 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"import regex\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"def find_dates(text):\n",
|
"def find_dates(text):\n",
|
||||||
" return []"
|
" regex_format = regex.compile(\"(?P<day>[0-9]{1,2})[/.-](?P<month>[0-9]{1,2})[/.-](?P<year>[0-9]{4})\")\n",
|
||||||
|
" matches = regex.match(regex_format, text)\n",
|
||||||
|
" result = {\n",
|
||||||
|
" 'day': int(matches.group('day')),\n",
|
||||||
|
" 'month': int(matches.group('month')),\n",
|
||||||
|
" 'year': int(matches.group('year')),\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" return result\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"print(find_dates(\"01/02/1970\"))\n",
|
||||||
|
"print(find_dates(\"01.02.1970\"))\n",
|
||||||
|
"print(find_dates(\"01-02-1970\"))\n",
|
||||||
|
"print(find_dates(\"1/2/1970\"))\n",
|
||||||
|
"print(find_dates(\"1.2.1970\"))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -130,8 +158,22 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
"formats = {\n",
|
||||||
|
" 'd/m/y': lambda date: f\"{date['day']}/{date['month']}/{date['year']}\",\n",
|
||||||
|
" 'y-m-d': lambda date: f\"{date['year']}-{date['month']}-{date['day']}\",\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"def correct_dates(source_segment, target_segment, date_format):\n",
|
"def correct_dates(source_segment, target_segment, date_format):\n",
|
||||||
" return ''"
|
" source_date = find_dates(source_segment)\n",
|
||||||
|
" target_date = find_dates(target_segment)\n",
|
||||||
|
" if target_date != source_date:\n",
|
||||||
|
" print('Dates differ')\n",
|
||||||
|
"\n",
|
||||||
|
" return formats[date_format](source_date)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"print(correct_dates(\"1.2.1970\", \"1.2.1970\", 'y-m-d'))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -190,7 +232,7 @@
|
|||||||
"author": "Rafał Jaworski",
|
"author": "Rafał Jaworski",
|
||||||
"email": "rjawor@amu.edu.pl",
|
"email": "rjawor@amu.edu.pl",
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
@ -205,7 +247,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.8.10"
|
"version": "3.9.2"
|
||||||
},
|
},
|
||||||
"subtitle": "6,7. Preprocessing i postprocessing",
|
"subtitle": "6,7. Preprocessing i postprocessing",
|
||||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||||
|
Loading…
Reference in New Issue
Block a user