diff --git a/lab/lab_01.ipynb b/lab/lab_01.ipynb index 0ffe833..055ca14 100644 --- a/lab/lab_01.ipynb +++ b/lab/lab_01.ipynb @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 191, "id": "narrow-romantic", "metadata": {}, "outputs": [], @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 192, "id": "indonesian-electron", "metadata": {}, "outputs": [], @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 193, "id": "compact-trinidad", "metadata": {}, "outputs": [ @@ -92,7 +92,7 @@ "['Press the ENTER button']" ] }, - "execution_count": 3, + "execution_count": 193, "metadata": {}, "output_type": "execute_result" } @@ -119,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 194, "id": "exposed-daniel", "metadata": {}, "outputs": [], @@ -139,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 195, "id": "serial-velvet", "metadata": {}, "outputs": [ @@ -149,7 +149,7 @@ "['Press the ENTER button', 'Press the ENTER key']" ] }, - "execution_count": 5, + "execution_count": 195, "metadata": {}, "output_type": "execute_result" } @@ -176,7 +176,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 196, "id": "every-gibson", "metadata": {}, "outputs": [ @@ -186,7 +186,7 @@ "[]" ] }, - "execution_count": 6, + "execution_count": 196, "metadata": {}, "output_type": "execute_result" } @@ -213,13 +213,37 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 197, "id": "protected-rings", "metadata": {}, "outputs": [], "source": [ + "def preprocess(sentence):\n", + " return sentence.lower()\n", + "\n", "def tm_lookup(sentence):\n", - " return ''" + " return [entry[1] for entry in translation_memory if preprocess(entry[0]) == preprocess(sentence)]" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "id": "7baee10b", + "metadata": {}, + "outputs": [ + { + "data": 
{ + "text/plain": [ + "['Press the ENTER button', 'Press the ENTER key']" + ] + }, + "execution_count": 198, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm_lookup('Wciśnij przycisk ENTER')" ] }, { @@ -232,17 +256,17 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 199, "id": "severe-alloy", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "''" + "[]" ] }, - "execution_count": 18, + "execution_count": 199, "metadata": {}, "output_type": "execute_result" } @@ -261,13 +285,40 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 200, "id": "structural-diesel", "metadata": {}, "outputs": [], "source": [ + "import string\n", + "\n", + "def preprocess(s):\n", + " translator = str.maketrans('', '', string.punctuation)\n", + " return s.translate(translator).lower()\n", + "\n", "def tm_lookup(sentence):\n", - " return ''" + " return [entry[1] for entry in translation_memory if preprocess(entry[0]) == preprocess(sentence)]" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + "id": "c03c6709", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Press the ENTER button', 'Press the ENTER key']" + ] + }, + "execution_count": 201, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm_lookup('Wciśnij przycisk [ENTER]')" ] }, { @@ -280,17 +331,17 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 202, "id": "brief-senegal", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "''" + "[]" ] }, - "execution_count": 12, + "execution_count": 202, "metadata": {}, "output_type": "execute_result" } @@ -317,13 +368,43 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 203, "id": "mathematical-customs", "metadata": {}, "outputs": [], "source": [ + "def compare_sentences(l1, l2):\n", + " return sum([1 for i, j in zip(l1.split(), l2.split()) if i != j]) <= 1\n", + "\n", + "import string\n", + 
"\n", + "def preprocess(s):\n", + " translator = str.maketrans('', '', string.punctuation)\n", + " return s.translate(translator).lower()\n", + "\n", "def tm_lookup(sentence):\n", - " return ''" + " return [entry[1] for entry in translation_memory if compare_sentences(preprocess(entry[0]), preprocess(sentence))]" + ] + }, + { + "cell_type": "code", + "execution_count": 204, + "id": "6264b722", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['System restart required']" + ] + }, + "execution_count": 204, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tm_lookup('Wymagane ponowne uruchomienie maszyny')" ] }, { @@ -344,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 205, "id": "humanitarian-wrong", "metadata": {}, "outputs": [], @@ -362,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 206, "id": "located-perception", "metadata": {}, "outputs": [], @@ -374,7 +455,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 207, "id": "advised-casting", "metadata": {}, "outputs": [ @@ -384,7 +465,7 @@ "[('przycisk', 'button'), ('drukarka', 'printer')]" ] }, - "execution_count": 17, + "execution_count": 207, "metadata": {}, "output_type": "execute_result" } @@ -406,7 +487,7 @@ "id": "defensive-fifteen", "metadata": {}, "source": [ - "Odpowiedź:" + "Odpowiedź: Jeżeli implementacja wygląda tak jak powyżej, złożoność to `O(n*m)`, ponieważ dla każdego słowa iteracyjnie przechodzimy przez cały nasz słownik i szukamy odpowiednika" ] }, { @@ -419,13 +500,56 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 208, + "id": "aca5d340", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('przycisk', 'button')]" + ] + }, + "execution_count": 208, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "glossary_lookup('Każda Drukarka posiada przycisk wznowienia drukowania')" + ] + }, 
+ { + "cell_type": "code", + "execution_count": 209, "id": "original-tunisia", "metadata": {}, "outputs": [], "source": [ "def glossary_lookup(sentence):\n", - " return ''" + " sentence_words = [word.lower() for word in sentence.split()]\n", + " return [entry for entry in glossary if entry[0] in sentence_words]" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "id": "716bbbe9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('przycisk', 'button'), ('drukarka', 'printer')]" + ] + }, + "execution_count": 210, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')" ] }, { @@ -438,13 +562,50 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 211, + "id": "32dec661", + "metadata": {}, + "outputs": [], + "source": [ + "glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]\n", + "glossary = {\n", + " 'komputer': 'computer',\n", + " 'przycisk': 'button',\n", + " 'drukarka': 'printer'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 212, "id": "adolescent-semiconductor", "metadata": {}, "outputs": [], "source": [ "def glossary_lookup(sentence):\n", - " return ''" + " sentence_words = [word.lower() for word in sentence.split() if word.lower() in glossary]\n", + " return [(word, glossary[word]) for word in sentence_words]" + ] + }, + { + "cell_type": "code", + "execution_count": 213, + "id": "d1e991c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('drukarka', 'printer'), ('przycisk', 'button')]" + ] + }, + "execution_count": 213, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')" ] } ], @@ -467,7 +628,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.14" }, "subtitle": 
"1. Podstawowe techniki wspomagania tłumaczenia", "title": "Komputerowe wspomaganie tłumaczenia", diff --git a/lab/lab_02.ipynb b/lab/lab_02.ipynb index 10c2003..9cf061b 100644 --- a/lab/lab_02.ipynb +++ b/lab/lab_02.ipynb @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 17, "id": "confident-prison", "metadata": {}, "outputs": [], @@ -80,13 +80,27 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 18, "id": "continental-submission", "metadata": {}, "outputs": [], "source": [ "def ice_lookup(sentence, prev_sentence, next_sentence):\n", - " return []" + " # Wyniki dopasowania ICE\n", + " ice_matches = []\n", + "\n", + " # Iterujemy przez pamięć tłumaczeń, pomijając pierwszy i ostatni element dla bezpieczeństwa kontekstowego\n", + " for index in range(1, len(translation_memory) - 1):\n", + " # Pobieramy obecne, poprzednie i następne zdania z TM\n", + " prev_tm_sentence, _ = translation_memory[index - 1]\n", + " current_tm_sentence, current_tm_translation = translation_memory[index]\n", + " next_tm_sentence, _ = translation_memory[index + 1]\n", + "\n", + " # Sprawdzamy, czy wszystkie trzy zdania zgadzają się z odpowiednikami w TM\n", + " if (prev_tm_sentence == prev_sentence and current_tm_sentence == sentence and next_tm_sentence == next_sentence):\n", + " ice_matches.append(current_tm_translation)\n", + "\n", + " return ice_matches" ] }, { @@ -119,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 19, "id": "fourth-pillow", "metadata": {}, "outputs": [], @@ -141,7 +155,11 @@ "id": "graduate-theorem", "metadata": {}, "source": [ - "Odpowiedź:" + "Odpowiedź: Nie, ponieważ w tej funkcji interesuje nas tylko długość zdania, tzn. 
drugi warunek nie będzie spełniony\n", + "\n", + "Przykład: `kot != bok`, a dla tej funkcji zwróci 0\n", + "\n", + "Spełnione warunki: 1, 3, 4" ] }, { @@ -154,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 20, "id": "continued-christopher", "metadata": {}, "outputs": [], @@ -179,7 +197,40 @@ "id": "metallic-leave", "metadata": {}, "source": [ - "Odpowiedź:" + "Odpowiedź: Tak, spełnia wszystkie warunki\n", + "\n", + "Sprawdzenie dla warunku 4" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "349a3547", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n", + "True\n", + "True\n", + "True\n" + ] + } + ], + "source": [ + "# x == y i y == z\n", + "print(sentence_distance(\"kot\", \"kot\") + sentence_distance(\"kot\", \"kot\") >= sentence_distance(\"kot\", \"kot\"))\n", + "\n", + "# x == y i y != z\n", + "print(sentence_distance(\"kot\", \"kot\") + sentence_distance(\"kot\", \"pies\") >= sentence_distance(\"kot\", \"pies\"))\n", + "\n", + "# x != y i y == z\n", + "print(sentence_distance(\"kot\", \"pies\") + sentence_distance(\"pies\", \"pies\") >= sentence_distance(\"kot\", \"pies\"))\n", + "\n", + "# x != y i y != z\n", + "print(sentence_distance(\"kot\", \"pies\") + sentence_distance(\"pies\", \"kot\") >= sentence_distance(\"kot\", \"kot\"))" ] }, { @@ -206,7 +257,11 @@ "id": "bibliographic-stopping", "metadata": {}, "source": [ - "Odpowiedź:" + "Odpowiedź:\n", + "- Dystans Levenshteina jest zawsze nieujemny\n", + "- Jeśli dwa ciągi są identyczne, nie potrzeba żadnych operacji do przekształcenia jednego w drugi\n", + "- Dystans Levenshteina jest symetryczny, ponieważ liczba operacji wymaganych do przekształcenia ciągu A w ciąg B jest taka sama jak liczba operacji potrzebnych do przekształcenia ciągu B w ciąg A\n", + "- Dystans Levenshteina spełnia nierówność trójkąta. 
Można to uzasadnić tym, że przekształcenie ciągu X w Y przez ciąg pośredni Z (najpierw przekształcając X w Z, a następnie Z w Y) jest jednym z możliwych sposobów przekształcenia X w Y, więc nie może wymagać mniej operacji niż bezpośrednie (minimalne) przekształcenie X w Y: d(X, Y) <= d(X, Z) + d(Z, Y)" ] }, { @@ -223,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 21, "id": "secondary-wrist", "metadata": {}, "outputs": [ @@ -233,7 +288,7 @@ "2" ] }, - "execution_count": 5, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -254,7 +309,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 22, "id": "associate-tuner", "metadata": {}, "outputs": [], @@ -273,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 23, "id": "focal-pathology", "metadata": {}, "outputs": [ @@ -283,7 +338,7 @@ "0.9166666666666666" ] }, - "execution_count": 7, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -294,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 24, "id": "roman-ceiling", "metadata": {}, "outputs": [ @@ -304,7 +359,7 @@ "0.9428571428571428" ] }, - "execution_count": 8, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -315,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 25, "id": "invisible-cambodia", "metadata": {}, "outputs": [ @@ -325,7 +380,7 @@ "0.631578947368421" ] }, - "execution_count": 9, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -344,13 +399,22 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 26, "id": "genetic-cradle", "metadata": {}, "outputs": [], "source": [ + "# Write a fuzzy_lookup function that will search the translation memory for all sentences whose Levenshtein similarity to the searched sentence is greater than or equal to a set threshold.\n", "def fuzzy_lookup(sentence, threshold):\n", - " return []" + " fuzzy_matches = []\n", + "\n", + " # Iterujemy 
przez pamięć tłumaczeń\n", + " for tm_sentence, tm_translation in translation_memory:\n", + " # Sprawdzamy, czy podobieństwo Levenshteina jest większe niż próg\n", + " if levenshtein_similarity(sentence, tm_sentence) >= threshold:\n", + " fuzzy_matches.append(tm_translation)\n", + "\n", + " return fuzzy_matches" ] } ], @@ -373,7 +437,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.14" }, "subtitle": "2. Zaawansowane użycie pamięci tłumaczeń", "title": "Komputerowe wspomaganie tłumaczenia", diff --git a/lab/lab_03.ipynb b/lab/lab_03.ipynb index 5707f0d..f9ba649 100644 --- a/lab/lab_03.ipynb +++ b/lab/lab_03.ipynb @@ -63,7 +63,7 @@ "id": "diverse-sunglasses", "metadata": {}, "source": [ - "Odpowiedź:" + "Odpowiedź: Wynik z Google Translate to `metal cabinet guides`" ] }, { @@ -86,12 +86,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 11, "id": "loving-prince", "metadata": {}, "outputs": [], "source": [ - "text = \" For all Java programmers:\"\n", + "text = \" For all Java programmers:\"\n", "text += \" This section explains how to compile and run a Swing application from the command line.\"\n", "text += \" For information on compiling and running a Swing application using NetBeans IDE,\"\n", "text += \" see Running Tutorial Examples in NetBeans IDE. 
The compilation instructions work for all Swing programs\"\n", @@ -110,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 12, "id": "bound-auction", "metadata": {}, "outputs": [], @@ -128,13 +128,46 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 13, "id": "cognitive-cedar", "metadata": {}, "outputs": [], "source": [ "def terminology_lookup():\n", - " return []" + " for term in dictionary:\n", + " start = 0\n", + " while True:\n", + " start = text.find(term, start)\n", + " if start == -1:\n", + " break\n", + " end = start + len(term)\n", + " print(f'{term}: ({start}, {end})')\n", + " start = end" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0a4a26ba", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "program: (14, 21)\n", + "program: (291, 298)\n", + "program: (468, 475)\n", + "program: (516, 523)\n", + "program: (533, 540)\n", + "application: (80, 91)\n", + "application: (164, 175)\n", + "application: (322, 333)\n" + ] + } + ], + "source": [ + "terminology_lookup()" ] }, { @@ -161,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 15, "id": "tribal-attention", "metadata": {}, "outputs": [ @@ -169,108 +202,7 @@ "name": "stdout", "output_type": "stream", "text": [ - " \n", - "for\n", - "all\n", - "Java\n", - "programmer\n", - ":\n", - "this\n", - "section\n", - "explain\n", - "how\n", - "to\n", - "compile\n", - "and\n", - "run\n", - "a\n", - "swing\n", - "application\n", - "from\n", - "the\n", - "command\n", - "line\n", - ".\n", - "for\n", - "information\n", - "on\n", - "compile\n", - "and\n", - "run\n", - "a\n", - "swing\n", - "application\n", - "use\n", - "NetBeans\n", - "IDE\n", - ",\n", - "see\n", - "Running\n", - "Tutorial\n", - "Examples\n", - "in\n", - "NetBeans\n", - "IDE\n", - ".\n", - "the\n", - "compilation\n", - "instruction\n", - "work\n", - "for\n", - "all\n", - "swing\n", - "program\n", - 
"—\n", - "applet\n", - ",\n", - "as\n", - "well\n", - "as\n", - "application\n", - ".\n", - "here\n", - "be\n", - "the\n", - "step\n", - "-PRON-\n", - "need\n", - "to\n", - "follow\n", - ":\n", - "install\n", - "the\n", - "late\n", - "release\n", - "of\n", - "the\n", - "Java\n", - "SE\n", - "platform\n", - ",\n", - "if\n", - "-PRON-\n", - "have\n", - "not\n", - "already\n", - "do\n", - "so\n", - ".\n", - "create\n", - "a\n", - "program\n", - "that\n", - "use\n", - "Swing\n", - "component\n", - ".\n", - "compile\n", - "the\n", - "program\n", - ".\n", - "run\n", - "the\n", - "program\n", - ".\n" + " for all Java programmer : this section explain how to compile and run a swing application from the command line . for information on compile and run a swing application use NetBeans IDE , see run Tutorial Examples in NetBeans IDE . the compilation instruction work for all Swing program — applet , as well as application . here be the step you need to follow : install the late release of the Java SE platform , if you have not already do so . create a program that use swing component . compile the program . run the program . 
" ] } ], @@ -281,7 +213,7 @@ "doc = nlp(text)\n", "\n", "for token in doc:\n", - " print(token.lemma_)" + " print(token.lemma_, end=' ')" ] }, { @@ -302,13 +234,40 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 40, "id": "surgical-demonstration", "metadata": {}, "outputs": [], "source": [ "def terminology_lookup():\n", - " return []" + " for term in dictionary:\n", + " for token in doc:\n", + " if token.lemma_ == term:\n", + " print(f'{token}: ({token.idx}, {token.idx + len(token)})')" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "74f600ea", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "programs: (291, 299)\n", + "program: (468, 475)\n", + "program: (516, 523)\n", + "program: (533, 540)\n", + "application: (80, 91)\n", + "application: (164, 175)\n", + "applications: (322, 334)\n" + ] + } + ], + "source": [ + "terminology_lookup()" ] }, { @@ -337,13 +296,56 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 22, "id": "superb-butterfly", "metadata": {}, "outputs": [], "source": [ "def get_nouns(text):\n", - " return []" + " doc = nlp(text)\n", + " return [token.text for token in doc if token.pos_ == 'NOUN']" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "2bfedfa3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['programmers',\n", + " 'section',\n", + " 'Swing',\n", + " 'application',\n", + " 'command',\n", + " 'line',\n", + " 'information',\n", + " 'Swing',\n", + " 'application',\n", + " 'compilation',\n", + " 'instructions',\n", + " 'programs',\n", + " 'applets',\n", + " 'applications',\n", + " 'steps',\n", + " 'release',\n", + " 'platform',\n", + " 'program',\n", + " 'Swing',\n", + " 'components',\n", + " 'program',\n", + " 'program']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_nouns(text)" ] }, { @@ -356,7 +358,7 @@ }, { 
"cell_type": "code", - "execution_count": 7, + "execution_count": 19, "id": "acting-tolerance", "metadata": {}, "outputs": [], @@ -374,13 +376,54 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 26, "id": "eight-redhead", "metadata": {}, "outputs": [], "source": [ "def extract_terms(text):\n", - " return []" + " doc = nlp(text)\n", + " terms = {}\n", + " for token in doc:\n", + " if token.pos_ == 'NOUN':\n", + " term = token.lemma_\n", + " terms[term] = terms.get(term, 0) + 1\n", + " return terms" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "07c1122a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'programmer': 1,\n", + " 'section': 1,\n", + " 'swing': 3,\n", + " 'application': 3,\n", + " 'command': 1,\n", + " 'line': 1,\n", + " 'information': 1,\n", + " 'compilation': 1,\n", + " 'instruction': 1,\n", + " 'program': 4,\n", + " 'applet': 1,\n", + " 'step': 1,\n", + " 'release': 1,\n", + " 'platform': 1,\n", + " 'component': 1}" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extract_terms(text)" ] }, { @@ -393,14 +436,82 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 32, "id": "monetary-mambo", "metadata": {}, "outputs": [], "source": [ + "# Extract and count nouns, verbs and adjectives\n", "def extract_terms(text):\n", - " return []" + " doc = nlp(text)\n", + " terms = {\"nouns\": {}, \"verbs\": {}, \"adjectives\": {}}\n", + " for token in doc:\n", + " if token.pos_ == 'NOUN':\n", + " term = token.lemma_\n", + " terms[\"nouns\"][term] = terms[\"nouns\"].get(term, 0) + 1\n", + " elif token.pos_ == 'VERB':\n", + " term = token.lemma_\n", + " terms[\"verbs\"][term] = terms[\"verbs\"].get(term, 0) + 1\n", + " elif token.pos_ == 'ADJ':\n", + " term = token.lemma_\n", + " terms[\"adjectives\"][term] = terms[\"adjectives\"].get(term, 0) + 1\n", + "\n", + " return terms" ] + }, + { + "cell_type": "code", + 
"execution_count": 35, + "id": "1eb48136", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'adjectives': {'late': 1},\n", + " 'nouns': {'applet': 1,\n", + " 'application': 3,\n", + " 'command': 1,\n", + " 'compilation': 1,\n", + " 'component': 1,\n", + " 'information': 1,\n", + " 'instruction': 1,\n", + " 'line': 1,\n", + " 'platform': 1,\n", + " 'program': 4,\n", + " 'programmer': 1,\n", + " 'release': 1,\n", + " 'section': 1,\n", + " 'step': 1,\n", + " 'swing': 3},\n", + " 'verbs': {'compile': 3,\n", + " 'create': 1,\n", + " 'do': 1,\n", + " 'explain': 1,\n", + " 'follow': 1,\n", + " 'install': 1,\n", + " 'need': 1,\n", + " 'run': 4,\n", + " 'see': 1,\n", + " 'use': 2,\n", + " 'work': 1}}\n" + ] + } + ], + "source": [ + "from pprint import pprint\n", + "\n", + "pprint(extract_terms(text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62aeea83", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -422,7 +533,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.14" }, "subtitle": "3. Terminologia", "title": "Komputerowe wspomaganie tłumaczenia",