diff --git a/lab/lab_01.ipynb b/lab/lab_01.ipynb index 0ffe833..a92daae 100644 --- a/lab/lab_01.ipynb +++ b/lab/lab_01.ipynb @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 45, "id": "narrow-romantic", "metadata": {}, "outputs": [], @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 46, "id": "indonesian-electron", "metadata": {}, "outputs": [], @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 47, "id": "compact-trinidad", "metadata": {}, "outputs": [ @@ -92,7 +92,7 @@ "['Press the ENTER button']" ] }, - "execution_count": 3, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -119,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 48, "id": "exposed-daniel", "metadata": {}, "outputs": [], @@ -139,7 +139,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 49, "id": "serial-velvet", "metadata": {}, "outputs": [ @@ -149,7 +149,7 @@ "['Press the ENTER button', 'Press the ENTER key']" ] }, - "execution_count": 5, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -176,17 +176,17 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 52, "id": "every-gibson", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[]" + "['Press the ENTER button', 'Press the ENTER key']" ] }, - "execution_count": 6, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -213,13 +213,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 51, "id": "protected-rings", "metadata": {}, "outputs": [], "source": [ "def tm_lookup(sentence):\n", - " return ''" + " return [entry[1] for entry in translation_memory if entry[0].lower() == sentence.lower()]" ] }, { @@ -232,17 +232,17 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 55, "id": "severe-alloy", "metadata": {}, "outputs": 
[ { "data": { "text/plain": [ - "''" + "['Press the ENTER button', 'Press the ENTER key']" ] }, - "execution_count": 18, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -261,13 +261,21 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 86, "id": "structural-diesel", "metadata": {}, "outputs": [], "source": [ + "import string\n", + "\n", + "def sentence_similar(sentence):\n", + " translator = str.maketrans('', '', string.punctuation)\n", + " return sentence.translate(translator)\n", + "\n", "def tm_lookup(sentence):\n", - " return ''" + " return [entry[1] for entry in translation_memory if entry[0].lower() == sentence_similar(sentence).lower()]\n", + "\n", + "#print(string.punctuation)" ] }, { @@ -280,17 +288,17 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 67, "id": "brief-senegal", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "''" + "['System restart required']" ] }, - "execution_count": 12, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } @@ -317,13 +325,30 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 66, "id": "mathematical-customs", "metadata": {}, "outputs": [], "source": [ "def tm_lookup(sentence):\n", - " return ''" + " inWords = sentence_similar(sentence).lower().split(\" \")\n", + " lenSentence = len(inWords)\n", + " matchWords = 0\n", + " answer = []\n", + " for entry in translation_memory:\n", + " dicWords = entry[0].lower().split(\" \")\n", + " \n", + " for i in range(lenSentence-1):\n", + " if inWords[i] == dicWords[i]:\n", + " matchWords += 1\n", + " \n", + " if matchWords >= lenSentence-1:\n", + " answer.append(entry[1])\n", + " matchWords = 0\n", + " else:\n", + " matchWords = 0\n", + "\n", + " return answer" ] }, { @@ -344,7 +369,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 68, "id": "humanitarian-wrong", "metadata": {}, "outputs": [], @@ -362,19 +387,29 @@ }, { 
"cell_type": "code", - "execution_count": 16, + "execution_count": 84, "id": "located-perception", "metadata": {}, "outputs": [], "source": [ "def glossary_lookup(sentence):\n", " sentence_words = sentence.split()\n", - " return [entry for entry in glossary if entry[0] in sentence_words]" + " return [entry for entry in glossary if entry[0] in sentence_words]\n", + "\n", + "def exercise4help(sentence):\n", + " sentence_words = sentence.split()\n", + " answer = []\n", + " for entry in glossary: #przechodzimy przez każdą tuple więc (n)\n", + " if entry[0] in sentence_words: # (m) porównań\n", + " answer.append(entry)\n", + " return answer\n", + "\n", + "#dla każdego hasła w słowniku, robimy tyle porównań ile jest słów w zdaniu O(n*m)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 76, "id": "advised-casting", "metadata": {}, "outputs": [ @@ -384,13 +419,41 @@ "[('przycisk', 'button'), ('drukarka', 'printer')]" ] }, - "execution_count": 17, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')" + "glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "70ae3dd8-d4ca-4a59-b8a9-ca47583bf54a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3\n" + ] + }, + { + "data": { + "text/plain": [ + "[('przycisk', 'button'), ('drukarka', 'printer')]" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exercise4help('Każda drukarka posiada przycisk wznowienia drukowania')" ] }, { @@ -406,7 +469,7 @@ "id": "defensive-fifteen", "metadata": {}, "source": [ - "Odpowiedź:" + "Odpowiedź: O(m*n)" ] }, { @@ -419,13 +482,27 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 82, "id": "original-tunisia", "metadata": {}, - "outputs": 
[], + "outputs": [ + { + "data": { + "text/plain": [ + "[('przycisk', 'button'), ('drukarka', 'printer')]" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def glossary_lookup(sentence):\n", - " return ''" + " sentence_words = sentence.lower().split()\n", + " return [entry for entry in glossary if entry[0] in sentence_words]\n", + "\n", + "glossary_lookup('Każda DRUKARKA posiada PRZYCISK wznowienia drukowania')" ] }, { @@ -438,13 +515,41 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 108, "id": "adolescent-semiconductor", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[('drukarka', 'printer'), ('drukarka', 'printer'), ('przycisk', 'button')]" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "glossary = {\n", + " 'komputer': 'computer',\n", + " 'przycisk': 'button', \n", + " 'drukarka': 'printer'\n", + "}\n", + "\n", + "#glossary.get('komputer') == None\n", + "\n", "def glossary_lookup(sentence):\n", - " return ''" + " sentence_words = set(sentence.split()) #umieszczamy w zbiorze aby uniknąć przetwarzania mniejszej ilości słów.\n", + " answer = []\n", + " for word in sentence_words: # dla każdego słowa w zdaniu (m)\n", + " translated_word = glossary.get(word) #pobieramy zawartosć ze słownika \n", + " if translated_word != None: # (porównanie m razy)\n", + " answer.append((word,translated_word)) # dodanie do odpowiedzi m razy\n", + " \n", + " return answer\n", + "\n", + "glossary_lookup('drukarka - Każda drukarka posiada przycisk wznowienia drukowania')" ] } ], @@ -452,7 +557,7 @@ "author": "Rafał Jaworski", "email": "rjawor@amu.edu.pl", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -467,7 +572,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": 
"3.8.10" + "version": "3.9.2" }, "subtitle": "1. Podstawowe techniki wspomagania tłumaczenia", "title": "Komputerowe wspomaganie tłumaczenia", diff --git a/lab/lab_02.ipynb b/lab/lab_02.ipynb index 10c2003..ffdf917 100644 --- a/lab/lab_02.ipynb +++ b/lab/lab_02.ipynb @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "confident-prison", "metadata": {}, "outputs": [], @@ -80,13 +80,102 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 21, "id": "continental-submission", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['Wciśnij przycisk Enter']" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "def tm_lookup(sentence):\n", + " return [entry[1] for entry in translation_memory if entry[0].lower() == sentence.lower()]\n", + "\n", "def ice_lookup(sentence, prev_sentence, next_sentence):\n", - " return []" + " if (not prev_sentence) or (not next_sentence):\n", + " return 'no context'\n", + " \n", + " if not sentence:\n", + " return 'enter your sentence'\n", + " \n", + " #Dobrze prawie ale tutaj zwracane są listy. wszystko okey, gdy zdanie poprzedzające i następne mają tamą ilość słów. 
JEST zdecydowanie błędny gdy zdania mają różną ilość słów!\n", + " if tm_lookup(prev_sentence) and tm_lookup(next_sentence):\n", + " return [entry[0] for entry in translation_memory if entry[0].lower() == sentence.lower()]\n", + " else:\n", + " return \"\"\n", + " \n", + "ice_lookup('Wciśnij przycisk Enter','Sprawdź ustawienia sieciowe','Drukarka jest wyłączona')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "ecb19925-7467-4e8a-bfdf-9adee52a5894", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'no context'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ice_lookup('Wciśnij przycisk Enter','Sprawdź ustawienia sieciowe','')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "cf60a398-ae06-4ca8-b658-e011632cdb33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'enter your sentence'" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ice_lookup('','Sprawdź ustawienia sieciowe','Drukarka jest wyłączona')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "d34415a4-d853-435e-b093-fabc4629ff26", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'no context'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ice_lookup('Wciśnij przycisk Enter','','Drukarka jest wyłączona')" ] }, { @@ -141,7 +230,13 @@ "id": "graduate-theorem", "metadata": {}, "source": [ - "Odpowiedź:" + "### Odpowiedź:\n", + "- 1. **spełnia warunek**: dzięki zastosowaniu funkcji `abs()`\n", + "- 3. **spełnia warunek**: przemienność w tym przypadku również zawdzięczamy funkcj `abs()`\n", + "- 4. 
**spełnia warunek**: (z uproszczeniem, że x, y i z to len(z danej zmiennej)) -> |y-x| + |z-y| >= |z-x|, ponieważ\n", + "- |y-x| + |z-y| >= |(y-x) + (z-y)| = |z-x|\n", + "\n", + "2 nie jest spełnione, ponieważ x i y muszą być tymi samymi zdaniami, aby odległość była równa 0. A wyżej wymieniona funkcja zwraca 0 dla wszystkich zdań, które mają taką samą liczbę znaków." ] }, { @@ -179,7 +274,11 @@ "id": "metallic-leave", "metadata": {}, "source": [ - "Odpowiedź:" + "### Odpowiedź:\n", + "- 1. **spełnia warunek**, ponieważ zwróci wartość 0 lub 3, które są >= 0\n", + "- 2. **spełnia warunek**, ponieważ gdy zdanie x jest takie samo jak y, to odległość jest zwracana jako 0\n", + "- 3. **spełnia warunek**, ponieważ sprawdzenie wygląda w taki sposób, że porównujemy, czy x == y (co jest tożsame z y == x), a w przeciwnym wypadku zawsze zwracamy tę samą wartość\n", + "- 4. **spełnia warunek**, ponieważ gdy xyz są takie same to mamy 0>=0 | gdy wszystkie są różne to mamy 6>=3 | gdy tylko jedna para jest taka sama 6>=0 lub 3>=3" ] }, { @@ -206,7 +305,12 @@ "id": "bibliographic-stopping", "metadata": {}, "source": [ - "Odpowiedź:" + "### Odpowiedź: Jest funkcją dystansu\n", + "- 1. **spełnia warunek** Liczba wymaganych operacji edycyjnych nie może być mniejsza niż zero. W przypadku gdy zdania są sobie równe d(x,y) = 0\n", + "- 2. **spełnia warunek** Gdy zdania są sobie równe d(x,y) = 0\n", + "- 3. **spełnia warunek** nieważne, czy zrobimy d(x,y) czy d(y,x), nadal liczba operacji edycyjnych będzie taka sama pa->papa (+2) | papa -> pa (-2)\n", + "- 4. 
**spełnia warunek**: (z uproszczeniem że x i y i z to liczba wymaganych zmian -> |y?x| + |z?y| >= |z?x| =\n", + "- = x + y >= x" ] }, { @@ -223,7 +327,38 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, + "id": "727b188d-eedd-4d19-9cbf-efcce71e145c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Collecting python-Levenshtein\n", + " Downloading python_Levenshtein-0.25.1-py3-none-any.whl.metadata (3.7 kB)\n", + "Collecting Levenshtein==0.25.1 (from python-Levenshtein)\n", + " Downloading Levenshtein-0.25.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)\n", + "Collecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein==0.25.1->python-Levenshtein)\n", + " Downloading rapidfuzz-3.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n", + "Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)\n", + "Downloading Levenshtein-0.25.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m177.4/177.4 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hDownloading rapidfuzz-3.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m40.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein\n", + "Successfully installed Levenshtein-0.25.1 python-Levenshtein-0.25.1 rapidfuzz-3.8.1\n" + ] + } + ], + "source": [ + "pip3 install python-Levenshtein" + ] + }, + { + "cell_type": "code", + "execution_count": 17, "id": "secondary-wrist", "metadata": {}, "outputs": [ @@ -233,7 +368,7 
@@ "2" ] }, - "execution_count": 5, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -254,7 +389,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 18, "id": "associate-tuner", "metadata": {}, "outputs": [], @@ -273,7 +408,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "id": "focal-pathology", "metadata": {}, "outputs": [ @@ -283,7 +418,7 @@ "0.9166666666666666" ] }, - "execution_count": 7, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -294,7 +429,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "id": "roman-ceiling", "metadata": {}, "outputs": [ @@ -304,7 +439,7 @@ "0.9428571428571428" ] }, - "execution_count": 8, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -315,7 +450,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "id": "invisible-cambodia", "metadata": {}, "outputs": [ @@ -325,7 +460,7 @@ "0.631578947368421" ] }, - "execution_count": 9, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -344,21 +479,80 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 37, "id": "genetic-cradle", "metadata": {}, "outputs": [], "source": [ "def fuzzy_lookup(sentence, threshold):\n", - " return []" + " \n", + " answer = []\n", + " \n", + " for entry in translation_memory:\n", + " if levenshtein_similarity(sentence.lower(),entry[0].lower()) >= threshold:\n", + " answer.append(entry[1])\n", + " \n", + " return answer" ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "cc0544a4-a515-4515-a116-f13b96e92812", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Press the ENTER button']" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#'Wciśnij przycisk Enter'\n", + "fuzzy_lookup('KlikNiJ przycisK EnTeR', 0.86)" + ] + }, + { + 
"cell_type": "code", + "execution_count": 41, + "id": "e2b8ff91-a103-45a4-a746-8ce3e9470c4c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Check the network settings']" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#'Sprawdź ustawienia sieciowe'\n", + "fuzzy_lookup('Sprawdź ustawienia sieci', 0.885)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df759469-b92e-490c-a672-96bd4c0d76b2", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "author": "Rafał Jaworski", "email": "rjawor@amu.edu.pl", "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -373,7 +567,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.9.2" }, "subtitle": "2. Zaawansowane użycie pamięci tłumaczeń", "title": "Komputerowe wspomaganie tłumaczenia", diff --git a/lab/lab_03.ipynb b/lab/lab_03.ipynb index 5707f0d..c62fd52 100644 --- a/lab/lab_03.ipynb +++ b/lab/lab_03.ipynb @@ -20,6 +20,13 @@ "id": "aggregate-listing", "metadata": {}, "source": [ + "```python\n", + "import collections\n", + "lista1 = [3,4,5,4,4,7,8,7]\n", + "lista2 = [3,4,5,4,4,7,8,7]\n", + "print((collections.Counter(lista1) + collections.Counter(lista2)).most_common(5))\n", + "```\n", + "\n", "Na dzisiejszych zajęciach zajmiemy się bliżej słownikami używanymi do wspomagania tłumaczenia. Oczywiście na rynku dostępnych jest bardzo wiele słowników w formacie elektronicznym. Wiele z nich jest gotowych do użycia w SDL Trados, memoQ i innych narzędziach CAT. Zawierają one setki tysięcy lub miliony haseł i oferują natychmiastową pomoc tłumaczowi." 
] }, @@ -63,7 +70,12 @@ "id": "diverse-sunglasses", "metadata": {}, "source": [ - "Odpowiedź:" + "### Odpowiedź:\n", + "- **DeepL:** metal cabinet slides / metal cabinet guides\n", + "- **Model GPT-3.5:** metal cabinet slides / metal wardrobe rails.\n", + "- **Model GPT-4:** guides for metal cabinets / metal cabinet guides\n", + "- **Google-translate**: metal cabinet guides\n", + "- **www.tlumaczangielskopolski.pl:** metal cabinet guides\n" ] }, { @@ -86,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 70, "id": "loving-prince", "metadata": {}, "outputs": [], @@ -110,12 +122,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 71, "id": "bound-auction", "metadata": {}, "outputs": [], "source": [ - "dictionary = ['program', 'application', 'applet' 'compile']" + "dictionary = ['program', 'application', 'applet', 'compile']" ] }, { @@ -128,13 +140,46 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 76, "id": "cognitive-cedar", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'applet': [(302, 308)],\n", + " 'application': [(80, 91), (164, 175), (322, 333)],\n", + " 'compile': [(56, 63), (504, 511)],\n", + " 'program': [(14, 21), (291, 298), (468, 475), (516, 523), (533, 540)]}\n" + ] + } + ], "source": [ + "import re\n", + "from pprint import pprint\n", + "\n", "def terminology_lookup():\n", - " return []" + " answer = {pattern:[] for pattern in dictionary} # term -> list of (start, end) spans in text\n", + " low_text = text.lower() # case-insensitive search\n", + " for pattern in dictionary:\n", + " offset = 0 # absolute index where the next search starts\n", + " start = 0\n", + " end = 0\n", + " while True:\n", + " match = (re.search(pattern,low_text[offset:])) # match positions are relative to the slice\n", + " if not match:\n", + " break\n", + " else:\n", + " start = offset + match.start() # convert slice-relative start to 
absolute\n", + " end = offset + match.end() # convert slice-relative end to absolute\n", + " offset = end # continue right after this occurrence\n", + "\n", + " answer[pattern].append((start,end))\n", + " pprint(answer)\n", + " #return answer\n", + "\n", + "terminology_lookup()" ] }, { @@ -161,7 
+206,113 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, + "id": "02e1c16f-be37-4a64-a514-8875b393ccb7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "Requirement already satisfied: spacy in /usr/local/lib/python3.9/dist-packages (3.4.1)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.9 in /usr/local/lib/python3.9/dist-packages (from spacy) (3.0.10)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (1.0.3)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (1.0.8)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.9/dist-packages (from spacy) (2.0.6)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.9/dist-packages (from spacy) (3.0.7)\n", + "Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (8.1.1)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.9.1 in /usr/local/lib/python3.9/dist-packages (from spacy) (0.10.1)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.9/dist-packages (from spacy) (2.4.4)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.9/dist-packages (from spacy) (2.0.8)\n", + "Requirement already satisfied: typer<0.5.0,>=0.3.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (0.4.2)\n", + "Requirement already satisfied: pathy>=0.3.5 in /usr/local/lib/python3.9/dist-packages (from spacy) (0.6.2)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (4.64.1)\n", + "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.9/dist-packages (from spacy) 
(1.21.6)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (2.28.1)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.10.0,>=1.7.4 in /usr/local/lib/python3.9/dist-packages (from spacy) (1.9.2)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.9/dist-packages (from spacy) (3.1.2)\n", + "Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from spacy) (52.0.0)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (21.3)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (3.3.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/lib/python3/dist-packages (from packaging>=20.0->spacy) (2.4.7)\n", + "Requirement already satisfied: smart-open<6.0.0,>=5.2.1 in /usr/local/lib/python3.9/dist-packages (from pathy>=0.3.5->spacy) (5.2.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.9/dist-packages (from pydantic!=1.8,!=1.8.1,<1.10.0,>=1.7.4->spacy) (4.3.0)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.1.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.12)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2022.9.14)\n", + "Requirement already satisfied: blis<0.10.0,>=0.7.8 in /usr/local/lib/python3.9/dist-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.9.1)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in 
/usr/local/lib/python3.9/dist-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.0.1)\n", + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.9/dist-packages (from typer<0.5.0,>=0.3.0->spacy) (8.1.3)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from jinja2->spacy) (2.1.1)\n" + ] + } + ], + "source": [ + "pip3 install spacy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6d7e9f5-4d6f-49c5-8dea-9957bc6da318", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Defaulting to user installation because normal site-packages is not writeable\n", + "\u001b[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl#egg=en_core_web_sm==3.4.1 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. 
Discussion can be found at https://github.com/pypa/pip/issues/11617\u001b[0m\u001b[33m\n", + "\u001b[0mCollecting en-core-web-sm==3.4.1\n", + " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m45.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: spacy<3.5.0,>=3.4.0 in /usr/local/lib/python3.9/dist-packages (from en-core-web-sm==3.4.1) (3.4.1)\n", + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.9 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.0.10)\n", + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.0.3)\n", + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.0.8)\n", + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.0.6)\n", + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.0.7)\n", + "Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (8.1.1)\n", + "Requirement already satisfied: wasabi<1.1.0,>=0.9.1 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.10.1)\n", + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.4.4)\n", + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in 
/usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.0.8)\n", + "Requirement already satisfied: typer<0.5.0,>=0.3.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.4.2)\n", + "Requirement already satisfied: pathy>=0.3.5 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.6.2)\n", + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (4.64.1)\n", + "Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.21.6)\n", + "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.28.1)\n", + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.10.0,>=1.7.4 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.9.2)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.1.2)\n", + "Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (52.0.0)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (21.3)\n", + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.3.0)\n", + "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/lib/python3/dist-packages (from packaging>=20.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.4.7)\n", + "Requirement already satisfied: smart-open<6.0.0,>=5.2.1 in /usr/local/lib/python3.9/dist-packages (from pathy>=0.3.5->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) 
(5.2.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.9/dist-packages (from pydantic!=1.8,!=1.8.1,<1.10.0,>=1.7.4->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (4.3.0)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.1.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.4)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.26.12)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2022.9.14)\n", + "Requirement already satisfied: blis<0.10.0,>=0.7.8 in /usr/local/lib/python3.9/dist-packages (from thinc<8.2.0,>=8.1.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.9.1)\n", + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.9/dist-packages (from thinc<8.2.0,>=8.1.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.0.1)\n", + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.9/dist-packages (from typer<0.5.0,>=0.3.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (8.1.3)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from jinja2->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.1.1)\n", + "Installing collected packages: en-core-web-sm\n", + "Successfully installed en-core-web-sm-3.4.1\n", + "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", + "You can now load the package via spacy.load('en_core_web_sm')\n" + ] + } + ], + "source": [ + "python3 -m spacy download en_core_web_sm" + ] + }, + { + "cell_type": 
"code", + "execution_count": 15, "id": "tribal-attention", "metadata": {}, "outputs": [ @@ -232,7 +383,7 @@ "be\n", "the\n", "step\n", - "-PRON-\n", + "you\n", "need\n", "to\n", "follow\n", @@ -248,7 +399,7 @@ "platform\n", ",\n", "if\n", - "-PRON-\n", + "you\n", "have\n", "not\n", "already\n", @@ -260,7 +411,7 @@ "program\n", "that\n", "use\n", - "Swing\n", + "swing\n", "component\n", ".\n", "compile\n", @@ -302,13 +453,37 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 20, "id": "surgical-demonstration", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'program': [(14, 24), (291, 298), (468, 475), (516, 523), (533, 540)],\n", + " 'application': [(80, 91), (164, 175), (322, 333)],\n", + " 'applet': [(302, 308)],\n", + " 'compile': [(56, 63), (134, 141), (504, 511)]}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "import re\n", + "\n", "def terminology_lookup():\n", - " return []" + " answer = {pattern:[] for pattern in dictionary}\n", + "\n", + " for pattern in dictionary:\n", + " for token in doc:\n", + " if pattern in token.lemma_:\n", + " answer[pattern].append((token.idx,token.idx+len(token.lemma_)))\n", + " return answer\n", + "\n", + "terminology_lookup()" ] }, { @@ -337,13 +512,52 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 73, "id": "superb-butterfly", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['programmers',\n", + " 'section',\n", + " 'Swing',\n", + " 'application',\n", + " 'command',\n", + " 'line',\n", + " 'information',\n", + " 'Swing',\n", + " 'application',\n", + " 'compilation',\n", + " 'instructions',\n", + " 'Swing',\n", + " 'programs',\n", + " 'applets',\n", + " 'applications',\n", + " 'steps',\n", + " 'release',\n", + " 'platform',\n", + " 'program',\n", + " 'Swing',\n", + " 'components',\n", + " 'program',\n", + " 'program']" + ] + }, + 
from collections import Counter
from functools import lru_cache


@lru_cache(maxsize=None)
def _default_nlp(model_name="en_core_web_sm"):
    """Load a spaCy pipeline once and hand back the same object on later calls."""
    import spacy  # local import: only needed when the caller supplies no pipeline

    return spacy.load(model_name)


def get_nouns(text, nlp=None):
    """Return the surface forms of all tokens tagged as nouns.

    Args:
        text: Text to analyse.
        nlp: Optional callable turning ``text`` into an iterable of tokens
            with ``text`` and ``pos_`` attributes (e.g. a loaded spaCy
            pipeline). When omitted, ``en_core_web_sm`` is loaded on first
            use and reused afterwards instead of being reloaded per call.

    Returns:
        list of ``token.text`` for every token whose ``pos_`` is ``"NOUN"``,
        in document order (duplicates kept).
    """
    if nlp is None:
        nlp = _default_nlp()
    return [token.text for token in nlp(text) if token.pos_ == "NOUN"]


def extract_terms(text, nlp=None):
    """Count how often each noun lemma occurs in ``text``.

    Args:
        text: Text to analyse.
        nlp: Optional callable turning ``text`` into an iterable of tokens
            with ``lemma_`` and ``pos_`` attributes. When omitted, the cached
            ``en_core_web_sm`` pipeline is used.

    Returns:
        dict mapping noun lemma -> number of occurrences, keyed in order of
        first appearance.
    """
    if nlp is None:
        nlp = _default_nlp()
    noun_lemmas = (token.lemma_ for token in nlp(text) if token.pos_ == "NOUN")
    # Counter keeps first-seen order, so converting it directly yields the
    # same mapping as copying the items one by one.
    return dict(Counter(noun_lemmas))
from collections import Counter
from functools import lru_cache
from pprint import pprint  # kept for callers that want pprint(extract_terms(text))

# Coarse part-of-speech tag -> key used in the returned tally.
_POS_TO_GROUP = {"NOUN": "nouns", "VERB": "verbs", "ADJ": "adjectives"}


@lru_cache(maxsize=None)
def _default_nlp(model_name="en_core_web_sm"):
    """Load a spaCy pipeline once and hand back the same object on later calls."""
    import spacy  # local import: only needed when the caller supplies no pipeline

    return spacy.load(model_name)


def extract_terms(text, nlp=None):
    """Count lemma frequencies of nouns, verbs and adjectives in ``text``.

    The tally is returned rather than printed, so it can be inspected,
    asserted on or pretty-printed by the caller.

    Args:
        text: Text to analyse.
        nlp: Optional callable turning ``text`` into an iterable of tokens
            with ``lemma_`` and ``pos_`` attributes (e.g. a loaded spaCy
            pipeline). When omitted, ``en_core_web_sm`` is loaded on first
            use and reused afterwards.

    Returns:
        dict with the keys ``"nouns"``, ``"verbs"`` and ``"adjectives"``, each
        mapping lemma -> number of occurrences. All three keys are always
        present; tokens with any other ``pos_`` are ignored.
    """
    if nlp is None:
        nlp = _default_nlp()

    counts = {group: Counter() for group in _POS_TO_GROUP.values()}
    for token in nlp(text):
        group = _POS_TO_GROUP.get(token.pos_)
        if group is not None:
            counts[group][token.lemma_] += 1

    # Hand back plain dicts so the result compares and prints like the
    # hand-built tally did.
    return {group: dict(counter) for group, counter in counts.items()}