[2024-04-13] labs 1,2,3

This commit is contained in:
Patryk Bartkowiak 2024-04-13 14:10:00 +02:00
parent 71ca3b66ed
commit 870b673fac
3 changed files with 507 additions and 171 deletions

View File

@ -52,7 +52,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 191,
"id": "narrow-romantic", "id": "narrow-romantic",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -71,7 +71,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 192,
"id": "indonesian-electron", "id": "indonesian-electron",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -82,7 +82,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 193,
"id": "compact-trinidad", "id": "compact-trinidad",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -92,7 +92,7 @@
"['Press the ENTER button']" "['Press the ENTER button']"
] ]
}, },
"execution_count": 3, "execution_count": 193,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -119,7 +119,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 194,
"id": "exposed-daniel", "id": "exposed-daniel",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -139,7 +139,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 195,
"id": "serial-velvet", "id": "serial-velvet",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -149,7 +149,7 @@
"['Press the ENTER button', 'Press the ENTER key']" "['Press the ENTER button', 'Press the ENTER key']"
] ]
}, },
"execution_count": 5, "execution_count": 195,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -176,7 +176,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 196,
"id": "every-gibson", "id": "every-gibson",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -186,7 +186,7 @@
"[]" "[]"
] ]
}, },
"execution_count": 6, "execution_count": 196,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -213,13 +213,37 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 197,
"id": "protected-rings", "id": "protected-rings",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def preprocess(sentence):\n",
"    \"\"\"Lower-case a sentence so that lookups ignore letter case.\"\"\"\n",
"    normalized = sentence.lower()\n",
"    return normalized\n",
"\n",
"def tm_lookup(sentence):\n", "def tm_lookup(sentence):\n",
" return ''" "    \"\"\"Return entry[1] of every translation-memory entry whose entry[0] equals `sentence`, ignoring case.\"\"\"\n",
"    translations = []\n",
"    for entry in translation_memory:\n",
"        # both sides go through the same normalisation before being compared\n",
"        if preprocess(entry[0]) == preprocess(sentence):\n",
"            translations.append(entry[1])\n",
"    return translations"
]
},
{
"cell_type": "code",
"execution_count": 198,
"id": "7baee10b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Press the ENTER button', 'Press the ENTER key']"
]
},
"execution_count": 198,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tm_lookup('Wciśnij przycisk ENTER')"
] ]
}, },
{ {
@ -232,17 +256,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 199,
"id": "severe-alloy", "id": "severe-alloy",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"''" "[]"
] ]
}, },
"execution_count": 18, "execution_count": 199,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -261,13 +285,40 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 200,
"id": "structural-diesel", "id": "structural-diesel",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import string\n",
"\n",
"def preprocess(s):\n",
"    \"\"\"Strip every character listed in string.punctuation from `s` and lower-case the result.\"\"\"\n",
"    punctuation_remover = str.maketrans('', '', string.punctuation)\n",
"    without_punctuation = s.translate(punctuation_remover)\n",
"    return without_punctuation.lower()\n",
"\n",
"def tm_lookup(sentence):\n", "def tm_lookup(sentence):\n",
" return ''" "    \"\"\"Return entry[1] of every translation-memory entry whose entry[0] equals `sentence` after preprocessing.\"\"\"\n",
"    translations = []\n",
"    for entry in translation_memory:\n",
"        if preprocess(entry[0]) == preprocess(sentence):\n",
"            translations.append(entry[1])\n",
"    return translations"
]
},
{
"cell_type": "code",
"execution_count": 201,
"id": "c03c6709",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Press the ENTER button', 'Press the ENTER key']"
]
},
"execution_count": 201,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tm_lookup('Wciśnij przycisk [ENTER]')"
] ]
}, },
{ {
@ -280,17 +331,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 202,
"id": "brief-senegal", "id": "brief-senegal",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"data": { "data": {
"text/plain": [ "text/plain": [
"''" "[]"
] ]
}, },
"execution_count": 12, "execution_count": 202,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -317,13 +368,43 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 14, "execution_count": 203,
"id": "mathematical-customs", "id": "mathematical-customs",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"import string\n",
"\n",
"def compare_sentences(l1, l2):\n",
"    \"\"\"Return True when both sentences have the same number of words and differ in at most one of them.\"\"\"\n",
"    words1, words2 = l1.split(), l2.split()\n",
"    # zip() stops at the shorter sequence, so without this check a one-word query would match every entry\n",
"    if len(words1) != len(words2):\n",
"        return False\n",
"    return sum(1 for first, second in zip(words1, words2) if first != second) <= 1\n",
"\n",
"def preprocess(s):\n",
"    \"\"\"Strip every character listed in string.punctuation from `s` and lower-case the result.\"\"\"\n",
"    translator = str.maketrans('', '', string.punctuation)\n",
"    return s.translate(translator).lower()\n",
"\n",
"def tm_lookup(sentence):\n", "def tm_lookup(sentence):\n",
" return ''" "    \"\"\"Return entry[1] of every translation-memory entry whose preprocessed entry[0] differs from `sentence` by at most one word.\"\"\"\n",
"    return [entry[1] for entry in translation_memory if compare_sentences(preprocess(entry[0]), preprocess(sentence))]"
]
},
{
"cell_type": "code",
"execution_count": 204,
"id": "6264b722",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['System restart required']"
]
},
"execution_count": 204,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tm_lookup('Wymagane ponowne uruchomienie maszyny')"
] ]
}, },
{ {
@ -344,7 +425,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 205,
"id": "humanitarian-wrong", "id": "humanitarian-wrong",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -362,7 +443,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 16, "execution_count": 206,
"id": "located-perception", "id": "located-perception",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -374,7 +455,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 207,
"id": "advised-casting", "id": "advised-casting",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -384,7 +465,7 @@
"[('przycisk', 'button'), ('drukarka', 'printer')]" "[('przycisk', 'button'), ('drukarka', 'printer')]"
] ]
}, },
"execution_count": 17, "execution_count": 207,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -406,7 +487,7 @@
"id": "defensive-fifteen", "id": "defensive-fifteen",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Odpowiedź:" "Odpowiedź: Jeżeli implementacja wygląda tak jak powyżej, złożoność to `O(n*m)`, gdzie `n` to liczba słów w zdaniu, a `m` to liczba haseł w słowniku, ponieważ każde słowo zdania jest porównywane z każdym hasłem słownika"
] ]
}, },
{ {
@ -419,13 +500,56 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 208,
"id": "aca5d340",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('przycisk', 'button')]"
]
},
"execution_count": 208,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"glossary_lookup('Każda Drukarka posiada przycisk wznowienia drukowania')"
]
},
{
"cell_type": "code",
"execution_count": 209,
"id": "original-tunisia", "id": "original-tunisia",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def glossary_lookup(sentence):\n", "def glossary_lookup(sentence):\n",
" return ''" "    \"\"\"Return the glossary entries whose first element occurs among the lower-cased words of `sentence`.\"\"\"\n",
"    lowered_words = []\n",
"    for word in sentence.split():\n",
"        lowered_words.append(word.lower())\n",
"    # results follow glossary order, not the order of words in the sentence\n",
"    return list(filter(lambda entry: entry[0] in lowered_words, glossary))"
]
},
{
"cell_type": "code",
"execution_count": 210,
"id": "716bbbe9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('przycisk', 'button'), ('drukarka', 'printer')]"
]
},
"execution_count": 210,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')"
] ]
}, },
{ {
@ -438,13 +562,50 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 211,
"id": "32dec661",
"metadata": {},
"outputs": [],
"source": [
"glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]\n",
"glossary = {\n",
" 'komputer': 'computer',\n",
" 'przycisk': 'button',\n",
" 'drukarka': 'printer'\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 212,
"id": "adolescent-semiconductor", "id": "adolescent-semiconductor",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def glossary_lookup(sentence):\n", "def glossary_lookup(sentence):\n",
" return ''" "    \"\"\"Return (word, glossary[word]) pairs for each lower-cased word of `sentence` that is a glossary key.\"\"\"\n",
"    found = []\n",
"    for raw_word in sentence.split():\n",
"        key = raw_word.lower()\n",
"        # membership test on the glossary itself, so results follow sentence order\n",
"        if key in glossary:\n",
"            found.append((key, glossary[key]))\n",
"    return found"
]
},
{
"cell_type": "code",
"execution_count": 213,
"id": "d1e991c6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('drukarka', 'printer'), ('przycisk', 'button')]"
]
},
"execution_count": 213,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')"
] ]
} }
], ],
@ -467,7 +628,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.8.10" "version": "3.10.14"
}, },
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia", "subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
"title": "Komputerowe wspomaganie tłumaczenia", "title": "Komputerowe wspomaganie tłumaczenia",

View File

@ -57,7 +57,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 17,
"id": "confident-prison", "id": "confident-prison",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -80,13 +80,27 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 18,
"id": "continental-submission", "id": "continental-submission",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def ice_lookup(sentence, prev_sentence, next_sentence):\n", "def ice_lookup(sentence, prev_sentence, next_sentence):\n",
" return []" "    \"\"\"Return translations of memory entries that match `sentence` together with both of its neighbours.\n",
"\n",
"    An entry qualifies only when its source equals `sentence`, the source of the entry before it\n",
"    equals `prev_sentence` and the source of the entry after it equals `next_sentence`.\n",
"    \"\"\"\n",
"    ice_matches = []\n",
"\n",
"    # The first and last entries are skipped because they lack a neighbour on one side\n",
"    for index in range(1, len(translation_memory) - 1):\n",
"        # Unpack the source sentence of the previous, current and next memory entries\n",
"        prev_tm_sentence, _ = translation_memory[index - 1]\n",
"        current_tm_sentence, current_tm_translation = translation_memory[index]\n",
"        next_tm_sentence, _ = translation_memory[index + 1]\n",
"\n",
"        # All three sentences must agree with their counterparts in the memory\n",
"        if (prev_tm_sentence == prev_sentence and current_tm_sentence == sentence and next_tm_sentence == next_sentence):\n",
"            ice_matches.append(current_tm_translation)\n",
"\n",
"    return ice_matches"
] ]
}, },
{ {
@ -119,7 +133,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 19,
"id": "fourth-pillow", "id": "fourth-pillow",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -141,7 +155,11 @@
"id": "graduate-theorem", "id": "graduate-theorem",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Odpowiedź:" "Odpowiedź: Nie, ponieważ w tej funkcji interesuje nas tylko długość zdania, tzn. drugi warunek nie będzie spełniony\n",
"\n",
"Przykład: `kot != bok`, a mimo to funkcja zwraca dla tej pary 0\n",
"\n",
"Spełnione warunki: 1, 3, 4"
] ]
}, },
{ {
@ -154,7 +172,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 20,
"id": "continued-christopher", "id": "continued-christopher",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -179,7 +197,40 @@
"id": "metallic-leave", "id": "metallic-leave",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Odpowiedź:" "Odpowiedź: Tak, spełnia wszystkie warunki\n",
"\n",
"Sprawdzenie dla warunku 4"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "349a3547",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n",
"True\n",
"True\n",
"True\n"
]
}
],
"source": [
"# x == y i y == z\n",
"print(sentence_distance(\"kot\", \"kot\") + sentence_distance(\"kot\", \"kot\") >= sentence_distance(\"kot\", \"kot\"))\n",
"\n",
"# x == y i y != z\n",
"print(sentence_distance(\"kot\", \"kot\") + sentence_distance(\"kot\", \"pies\") >= sentence_distance(\"kot\", \"pies\"))\n",
"\n",
"# x != y i y == z\n",
"print(sentence_distance(\"kot\", \"pies\") + sentence_distance(\"pies\", \"pies\") >= sentence_distance(\"kot\", \"pies\"))\n",
"\n",
"# x != y i y != z\n",
"print(sentence_distance(\"kot\", \"pies\") + sentence_distance(\"pies\", \"kot\") >= sentence_distance(\"kot\", \"kot\"))"
] ]
}, },
{ {
@ -206,7 +257,11 @@
"id": "bibliographic-stopping", "id": "bibliographic-stopping",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Odpowiedź:" "Odpowiedź:\n",
"- Dystans Levenshteina jest zawsze nieujemny\n",
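"- Jeśli dwa ciągi są identyczne, nie potrzeba żadnych operacji do przekształcenia jednego w drugi, a jeśli się różnią, potrzebna jest co najmniej jedna operacja, więc dystans wynosi 0 tylko dla identycznych ciągów\n",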
"- Dystans Levenshteina jest symetryczny, ponieważ każdą operację można odwrócić (wstawienie ↔ usunięcie, zamiana ↔ zamiana), więc przekształcenie ciągu A w ciąg B wymaga tyle samo operacji co przekształcenie ciągu B w ciąg A\n",
"- Dystans Levenshteina spełnia nierówność trójkąta. Można to uzasadnić rozważając, że bezpośrednie przekształcenie ciągu X w Y nie może wymagać więcej operacji niż przekształcenie przez ciąg pośredni Z (najpierw X w Z, a następnie Z w Y), ponieważ złożenie tych dwóch ciągów operacji samo jest jednym ze sposobów przekształcenia X w Y"
] ]
}, },
{ {
@ -223,7 +278,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 21,
"id": "secondary-wrist", "id": "secondary-wrist",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -233,7 +288,7 @@
"2" "2"
] ]
}, },
"execution_count": 5, "execution_count": 21,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -254,7 +309,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 22,
"id": "associate-tuner", "id": "associate-tuner",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -273,7 +328,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 23,
"id": "focal-pathology", "id": "focal-pathology",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -283,7 +338,7 @@
"0.9166666666666666" "0.9166666666666666"
] ]
}, },
"execution_count": 7, "execution_count": 23,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -294,7 +349,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 24,
"id": "roman-ceiling", "id": "roman-ceiling",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -304,7 +359,7 @@
"0.9428571428571428" "0.9428571428571428"
] ]
}, },
"execution_count": 8, "execution_count": 24,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -315,7 +370,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 25,
"id": "invisible-cambodia", "id": "invisible-cambodia",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -325,7 +380,7 @@
"0.631578947368421" "0.631578947368421"
] ]
}, },
"execution_count": 9, "execution_count": 25,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -344,13 +399,22 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 26,
"id": "genetic-cradle", "id": "genetic-cradle",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Write a fuzzy_lookup function that will search the translation memory for all sentences whose Levenshtein similarity to the searched sentence is greater than or equal to a set threshold.\n",
"def fuzzy_lookup(sentence, threshold):\n", "def fuzzy_lookup(sentence, threshold):\n",
" return []" "    \"\"\"Return translations of memory entries whose Levenshtein similarity to `sentence` is at least `threshold`.\"\"\"\n",
"    return [\n",
"        tm_translation\n",
"        for tm_sentence, tm_translation in translation_memory\n",
"        # the threshold is inclusive: a similarity equal to it still counts as a match\n",
"        if levenshtein_similarity(sentence, tm_sentence) >= threshold\n",
"    ]"
] ]
} }
], ],
@ -373,7 +437,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.8.10" "version": "3.10.14"
}, },
"subtitle": "2. Zaawansowane użycie pamięci tłumaczeń", "subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
"title": "Komputerowe wspomaganie tłumaczenia", "title": "Komputerowe wspomaganie tłumaczenia",

View File

@ -63,7 +63,7 @@
"id": "diverse-sunglasses", "id": "diverse-sunglasses",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Odpowiedź:" "Odpowiedź: Wynik z Google Translate to `metal cabinet guides`"
] ]
}, },
{ {
@ -86,7 +86,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 11,
"id": "loving-prince", "id": "loving-prince",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -110,7 +110,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 12,
"id": "bound-auction", "id": "bound-auction",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -128,13 +128,46 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 13,
"id": "cognitive-cedar", "id": "cognitive-cedar",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def terminology_lookup():\n", "def terminology_lookup():\n",
" return []" "    \"\"\"Print every occurrence of each `dictionary` term in `text` as `term: (start, end)`.\"\"\"\n",
"    for term in dictionary:\n",
"        position = text.find(term, 0)\n",
"        while position != -1:\n",
"            stop = position + len(term)\n",
"            print(f'{term}: ({position}, {stop})')\n",
"            # resume right after the reported occurrence\n",
"            position = text.find(term, stop)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "0a4a26ba",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"program: (14, 21)\n",
"program: (291, 298)\n",
"program: (468, 475)\n",
"program: (516, 523)\n",
"program: (533, 540)\n",
"application: (80, 91)\n",
"application: (164, 175)\n",
"application: (322, 333)\n"
]
}
],
"source": [
"terminology_lookup()"
] ]
}, },
{ {
@ -161,7 +194,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 15,
"id": "tribal-attention", "id": "tribal-attention",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -169,108 +202,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
" \n", " for all Java programmer : this section explain how to compile and run a swing application from the command line . for information on compile and run a swing application use NetBeans IDE , see run Tutorial Examples in NetBeans IDE . the compilation instruction work for all Swing program — applet , as well as application . here be the step you need to follow : install the late release of the Java SE platform , if you have not already do so . create a program that use swing component . compile the program . run the program . "
"for\n",
"all\n",
"Java\n",
"programmer\n",
":\n",
"this\n",
"section\n",
"explain\n",
"how\n",
"to\n",
"compile\n",
"and\n",
"run\n",
"a\n",
"swing\n",
"application\n",
"from\n",
"the\n",
"command\n",
"line\n",
".\n",
"for\n",
"information\n",
"on\n",
"compile\n",
"and\n",
"run\n",
"a\n",
"swing\n",
"application\n",
"use\n",
"NetBeans\n",
"IDE\n",
",\n",
"see\n",
"Running\n",
"Tutorial\n",
"Examples\n",
"in\n",
"NetBeans\n",
"IDE\n",
".\n",
"the\n",
"compilation\n",
"instruction\n",
"work\n",
"for\n",
"all\n",
"swing\n",
"program\n",
"—\n",
"applet\n",
",\n",
"as\n",
"well\n",
"as\n",
"application\n",
".\n",
"here\n",
"be\n",
"the\n",
"step\n",
"-PRON-\n",
"need\n",
"to\n",
"follow\n",
":\n",
"install\n",
"the\n",
"late\n",
"release\n",
"of\n",
"the\n",
"Java\n",
"SE\n",
"platform\n",
",\n",
"if\n",
"-PRON-\n",
"have\n",
"not\n",
"already\n",
"do\n",
"so\n",
".\n",
"create\n",
"a\n",
"program\n",
"that\n",
"use\n",
"Swing\n",
"component\n",
".\n",
"compile\n",
"the\n",
"program\n",
".\n",
"run\n",
"the\n",
"program\n",
".\n"
] ]
} }
], ],
@ -281,7 +213,7 @@
"doc = nlp(text)\n", "doc = nlp(text)\n",
"\n", "\n",
"for token in doc:\n", "for token in doc:\n",
" print(token.lemma_)" " print(token.lemma_, end=' ')"
] ]
}, },
{ {
@ -302,13 +234,40 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 40,
"id": "surgical-demonstration", "id": "surgical-demonstration",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def terminology_lookup():\n", "def terminology_lookup():\n",
" return []" "    \"\"\"Print `token: (start, end)` for every token of `doc` whose lemma equals a `dictionary` term.\"\"\"\n",
"    for term in dictionary:\n",
"        matching_tokens = (candidate for candidate in doc if candidate.lemma_ == term)\n",
"        for candidate in matching_tokens:\n",
"            begin = candidate.idx\n",
"            # the printed span covers the token as written, not its lemma\n",
"            print(f'{candidate}: ({begin}, {begin + len(candidate)})')"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "74f600ea",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"programs: (291, 299)\n",
"program: (468, 475)\n",
"program: (516, 523)\n",
"program: (533, 540)\n",
"application: (80, 91)\n",
"application: (164, 175)\n",
"applications: (322, 334)\n"
]
}
],
"source": [
"terminology_lookup()"
] ]
}, },
{ {
@ -337,13 +296,56 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 6, "execution_count": 22,
"id": "superb-butterfly", "id": "superb-butterfly",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def get_nouns(text):\n", "def get_nouns(text):\n",
" return []" "    \"\"\"Return the text of every token of `nlp(text)` whose pos_ is 'NOUN', in document order.\"\"\"\n",
"    nouns = []\n",
"    for token in nlp(text):\n",
"        if token.pos_ == 'NOUN':\n",
"            nouns.append(token.text)\n",
"    return nouns"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "2bfedfa3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['programmers',\n",
" 'section',\n",
" 'Swing',\n",
" 'application',\n",
" 'command',\n",
" 'line',\n",
" 'information',\n",
" 'Swing',\n",
" 'application',\n",
" 'compilation',\n",
" 'instructions',\n",
" 'programs',\n",
" 'applets',\n",
" 'applications',\n",
" 'steps',\n",
" 'release',\n",
" 'platform',\n",
" 'program',\n",
" 'Swing',\n",
" 'components',\n",
" 'program',\n",
" 'program']"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_nouns(text)"
] ]
}, },
{ {
@ -356,7 +358,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 19,
"id": "acting-tolerance", "id": "acting-tolerance",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
@ -374,13 +376,54 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 26,
"id": "eight-redhead", "id": "eight-redhead",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def extract_terms(text):\n", "def extract_terms(text):\n",
" return []" "    \"\"\"Map the lemma of each NOUN token in `nlp(text)` to the number of times it occurs.\"\"\"\n",
"    counts = {}\n",
"    for token in nlp(text):\n",
"        if token.pos_ != 'NOUN':\n",
"            continue\n",
"        lemma = token.lemma_\n",
"        counts[lemma] = counts.get(lemma, 0) + 1\n",
"    return counts"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "07c1122a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'programmer': 1,\n",
" 'section': 1,\n",
" 'swing': 3,\n",
" 'application': 3,\n",
" 'command': 1,\n",
" 'line': 1,\n",
" 'information': 1,\n",
" 'compilation': 1,\n",
" 'instruction': 1,\n",
" 'program': 4,\n",
" 'applet': 1,\n",
" 'step': 1,\n",
" 'release': 1,\n",
" 'platform': 1,\n",
" 'component': 1}"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_terms(text)"
] ]
}, },
{ {
@ -393,14 +436,82 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 32,
"id": "monetary-mambo", "id": "monetary-mambo",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Extract and count nouns, verbs and adjectives\n",
"def extract_terms(text):\n", "def extract_terms(text):\n",
" return []" "    \"\"\"Count the lemmas of NOUN, VERB and ADJ tokens in `nlp(text)`, grouped under 'nouns', 'verbs' and 'adjectives'.\"\"\"\n",
"    bucket_for_pos = {'NOUN': \"nouns\", 'VERB': \"verbs\", 'ADJ': \"adjectives\"}\n",
"    terms = {\"nouns\": {}, \"verbs\": {}, \"adjectives\": {}}\n",
"    for token in nlp(text):\n",
"        bucket = bucket_for_pos.get(token.pos_)\n",
"        # tokens with any other part-of-speech tag are not counted\n",
"        if bucket is None:\n",
"            continue\n",
"        counts = terms[bucket]\n",
"        counts[token.lemma_] = counts.get(token.lemma_, 0) + 1\n",
"\n",
"    return terms"
] ]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "1eb48136",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'adjectives': {'late': 1},\n",
" 'nouns': {'applet': 1,\n",
" 'application': 3,\n",
" 'command': 1,\n",
" 'compilation': 1,\n",
" 'component': 1,\n",
" 'information': 1,\n",
" 'instruction': 1,\n",
" 'line': 1,\n",
" 'platform': 1,\n",
" 'program': 4,\n",
" 'programmer': 1,\n",
" 'release': 1,\n",
" 'section': 1,\n",
" 'step': 1,\n",
" 'swing': 3},\n",
" 'verbs': {'compile': 3,\n",
" 'create': 1,\n",
" 'do': 1,\n",
" 'explain': 1,\n",
" 'follow': 1,\n",
" 'install': 1,\n",
" 'need': 1,\n",
" 'run': 4,\n",
" 'see': 1,\n",
" 'use': 2,\n",
" 'work': 1}}\n"
]
}
],
"source": [
"from pprint import pprint\n",
"\n",
"pprint(extract_terms(text))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62aeea83",
"metadata": {},
"outputs": [],
"source": []
} }
], ],
"metadata": { "metadata": {
@ -422,7 +533,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.8.10" "version": "3.10.14"
}, },
"subtitle": "3. Terminologia", "subtitle": "3. Terminologia",
"title": "Komputerowe wspomaganie tłumaczenia", "title": "Komputerowe wspomaganie tłumaczenia",