Compare commits

...

3 Commits
main ... main

Author SHA1 Message Date
potato e02ff5ab39 lab 2 2024-04-23 23:52:29 +02:00
potato d32188878d Laboratorium 1 2024-04-16 22:51:31 +02:00
potato e5ed49b0b0 Labs 2024-04-15 21:46:15 +02:00
6 changed files with 19034 additions and 114 deletions

View File

@ -82,7 +82,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"id": "compact-trinidad",
"metadata": {},
"outputs": [
@ -92,7 +92,7 @@
"['Press the ENTER button']"
]
},
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@ -119,7 +119,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"id": "exposed-daniel",
"metadata": {},
"outputs": [],
@ -139,7 +139,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"id": "serial-velvet",
"metadata": {},
"outputs": [
@ -149,7 +149,7 @@
"['Press the ENTER button', 'Press the ENTER key']"
]
},
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -176,17 +176,17 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 11,
"id": "every-gibson",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[]"
"['Press the ENTER button', 'Press the ENTER key']"
]
},
"execution_count": 6,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@ -213,13 +213,13 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"id": "protected-rings",
"metadata": {},
"outputs": [],
"source": [
"def tm_lookup(sentence):\n",
" return ''"
" return [entry[1] for entry in translation_memory if entry[0].lower() == sentence.lower()]"
]
},
{
@ -232,17 +232,17 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 13,
"id": "severe-alloy",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"''"
"['Press the ENTER button', 'Press the ENTER key']"
]
},
"execution_count": 18,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@ -261,13 +261,20 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"id": "structural-diesel",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def remove_punctuation(sentence):\n",
" return re.sub(r'[^\\w\\s]', '', sentence)\n",
"\n",
"\n",
"def tm_lookup(sentence):\n",
" return ''"
" return [entry[1] for entry in translation_memory\n",
" if entry[0].lower() == remove_punctuation(sentence.lower())]"
]
},
{
@ -280,17 +287,17 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 27,
"id": "brief-senegal",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"''"
"['System restart required']"
]
},
"execution_count": 12,
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
@ -317,13 +324,26 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 26,
"id": "mathematical-customs",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def remove_punctuation(sentence):\n",
" return re.sub(r'[^\\w\\s]', '', sentence)\n",
"\n",
"\n",
"def tm_lookup(sentence):\n",
" return ''"
" values = []\n",
" for entry in translation_memory:\n",
" key = set(entry[0].lower().split())\n",
" mod_sentence = set(remove_punctuation(sentence.lower()).split())\n",
" remainder = list(key - mod_sentence)\n",
" if len(remainder) <= 1:\n",
" values.append(entry[1])\n",
" return values"
]
},
{
@ -344,7 +364,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 42,
"id": "humanitarian-wrong",
"metadata": {},
"outputs": [],
@ -362,7 +382,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 43,
"id": "located-perception",
"metadata": {},
"outputs": [],
@ -374,17 +394,31 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 44,
"id": "3437b88b",
"metadata": {},
"outputs": [],
"source": [
"glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]\n",
"\n",
"def glossary_lookup(sentence):\n",
" sentence_words = sentence.split()\n",
" return [entry for entry in glossary if entry[0] in sentence_words]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "advised-casting",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('przycisk', 'button'), ('drukarka', 'printer')]"
"[('drukarka', 'printer'), ('przycisk', 'button')]"
]
},
"execution_count": 17,
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
@ -406,7 +440,7 @@
"id": "defensive-fifteen",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Operacja split w pierwszej linijce funkcji może zostać uznana za stałą. Biorąc pod uwagę, że lista krotek zawierająca glosariusz musi być za każdym razem przejrzana cała, jak i cały string, złożoność obliczeniowa będzie wynosić O(n*m), gdzie n to liczba haseł w glosariuszu, a m to liczba słów w zdaniu."
]
},
{
@ -419,13 +453,14 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 48,
"id": "original-tunisia",
"metadata": {},
"outputs": [],
"source": [
"def glossary_lookup(sentence):\n",
" return ''"
" sentence_words = [ element.lower() for element in sentence.split()]\n",
" return [entry for entry in glossary if entry[0].lower() in sentence_words]"
]
},
{
@ -438,13 +473,27 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 49,
"id": "f69a873a",
"metadata": {},
"outputs": [],
"source": [
"glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]\n",
"glossary = { k:v for k,v in glossary}\n",
"translated_words = list(glossary.keys())"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "adolescent-semiconductor",
"metadata": {},
"outputs": [],
"source": [
"def glossary_lookup(sentence):\n",
" return ''"
" words = sentence.split()\n",
" \n",
" return [(word, glossary[word]) for word in words if word in translated_words]"
]
}
],
@ -467,7 +516,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.0"
},
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
"title": "Komputerowe wspomaganie tłumaczenia",

View File

@ -67,7 +67,9 @@
" ('Sprawdź ustawienia sieciowe', 'Check the network settings'),\n",
" ('Drukarka jest wyłączona', 'The printer is switched off'),\n",
" ('Wymagane ponowne uruchomienie komputera', 'System restart required')\n",
" ]"
" ]\n",
"\n",
"translation_memory = { k:v for k,v in translation_memory}"
]
},
{
@ -86,7 +88,11 @@
"outputs": [],
"source": [
"def ice_lookup(sentence, prev_sentence, next_sentence):\n",
" return []"
" s_t = translation_memory.get(sentence, False)\n",
" p_s = translation_memory.get(prev_sentence, False)\n",
" n_s = translation_memory.get(next_sentence, False)\n",
" if s_t and p_s and n_s:\n",
" return s_t "
]
},
{
@ -141,7 +147,7 @@
"id": "graduate-theorem",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Nie jest to poprawna funkcja dystansu fuzzy match. Warunki 1, 3 i 4 są spełnione. Warunek 2 nie jest spełniony, ponieważ odległość pomiędzy dwoma zdaniami/słowami o tej samej długości, ale złożonymi z innych znaków, będzie równa zero."
]
},
{
@ -179,7 +185,7 @@
"id": "metallic-leave",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Tak, jest to poprawna funkcja dystansu. Wszystkie warunki są spełnione."
]
},
{
@ -201,12 +207,54 @@
"### Ćwiczenie 5: Czy dystans Levenshteina jest poprawną funkcją dystansu? Uzasadnij krótko swoją odpowiedź sprawdzając każdy z warunków."
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "79e4deef",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from Levenshtein import distance as levenshtein_distance\n",
"\n",
"# warunek 1\n",
"levenshtein_distance(\"smthn\", \"nothin\")\n",
"# Output: 3\n",
"# zawsze nieujemne\n",
"\n",
"# warunek 2\n",
"levenshtein_distance(\"and\", \"and\")\n",
"# Output: 0\n",
"# dwa takie same zdania są w odległości 0 od siebie\n",
"\n",
"# warunek 3\n",
"levenshtein_distance(\"zombie\", \"mombie\") == levenshtein_distance(\"mombie\", \"zombie\")\n",
"# Output: True\n",
"# symetria: zamiana zdań miejscami nie zmienia odległości\n",
"\n",
"# warunek 4\n",
"x,y,z = 'zombie', 'glombie', 'mombie'\n",
"levenshtein_distance(x,y) + levenshtein_distance(y,z) >= levenshtein_distance(x,z)\n",
"#Output: True\n",
"# miara każdej odległości musi być mniejsza lub równa sumie miar dwóch pozostałych"
]
},
{
"cell_type": "markdown",
"id": "bibliographic-stopping",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Tak, dystans Levenshteina jest poprawną funkcją dystansu – spełnia wszystkie cztery warunki sprawdzone w komórce powyżej."
]
},
{
@ -223,7 +271,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"id": "secondary-wrist",
"metadata": {},
"outputs": [
@ -233,7 +281,7 @@
"2"
]
},
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -254,7 +302,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"id": "associate-tuner",
"metadata": {},
"outputs": [],
@ -273,7 +321,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"id": "focal-pathology",
"metadata": {},
"outputs": [
@ -283,7 +331,7 @@
"0.9166666666666666"
]
},
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -294,7 +342,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 10,
"id": "roman-ceiling",
"metadata": {},
"outputs": [
@ -304,7 +352,7 @@
"0.9428571428571428"
]
},
"execution_count": 8,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -315,7 +363,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"id": "invisible-cambodia",
"metadata": {},
"outputs": [
@ -325,7 +373,7 @@
"0.631578947368421"
]
},
"execution_count": 9,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@ -344,14 +392,43 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"id": "genetic-cradle",
"metadata": {},
"outputs": [],
"source": [
"def fuzzy_lookup(sentence, threshold):\n",
" return []"
" return [ v for k,v in translation_memory.items() if levenshtein_similarity(sentence, k) > threshold ]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "2e72b54a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Press the ENTER button', 'System restart required']"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fuzzy_lookup('Spróbuj wyłączyć i włączyć komputer', 0.25)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1f15316",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@ -373,7 +450,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.0"
},
"subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
"title": "Komputerowe wspomaganie tłumaczenia",

View File

@ -63,7 +63,9 @@
"id": "diverse-sunglasses",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Narzędzie DeepL: https://www.deepl.com/translator\n",
"\n",
"przetłumaczyło tekst \"prowadnice szaf metalowych\" na \"metal cabinet slides\""
]
},
{
@ -86,7 +88,17 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 8,
"id": "8f6b6fa9",
"metadata": {},
"outputs": [],
"source": [
"import re"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "loving-prince",
"metadata": {},
"outputs": [],
@ -110,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 10,
"id": "bound-auction",
"metadata": {},
"outputs": [],
@ -118,6 +130,27 @@
"dictionary = ['program', 'application', 'applet' 'compile']"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "821ee3ee",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\" For all Java programmers: This section explains how to compile and run a Swing application from the command line. For information on compiling and running a Swing application using NetBeans IDE, see Running Tutorial Examples in NetBeans IDE. The compilation instructions work for all Swing programs — applets, as well as applications. Here are the steps you need to follow: Install the latest release of the Java SE platform, if you haven't already done so. Create a program that uses Swing components. Compile the program. Run the program.\""
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text"
]
},
{
"cell_type": "markdown",
"id": "other-trinidad",
@ -128,13 +161,41 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 11,
"id": "cognitive-cedar",
"metadata": {},
"outputs": [],
"source": [
"def terminology_lookup():\n",
" return []"
"count_dictionary = {}\n",
"\n",
"def terminology_lookup(text, tags):\n",
" text = text.lower()\n",
" return [(tag, [[m.start(), m.end()] \n",
" for m in re.finditer(tag, text)])\n",
" for tag in tags if tag in text]\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "9fe3b66f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('program', [[14, 21], [291, 298], [468, 475], [516, 523], [533, 540]]),\n",
" ('application', [[80, 91], [164, 175], [322, 333]])]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"terminology_lookup(text, dictionary)"
]
},
{
@ -159,9 +220,122 @@
"`python3 -m spacy download en_core_web_sm`"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7b7b7569",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: spacy in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (3.7.4)\n",
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (3.0.12)\n",
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (1.0.5)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (1.0.10)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.0.8)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (3.0.9)\n",
"Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (8.2.3)\n",
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (1.1.2)\n",
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.4.8)\n",
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.0.10)\n",
"Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (0.3.4)\n",
"Requirement already satisfied: typer<0.10.0,>=0.3.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (0.9.4)\n",
"Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (6.4.0)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (4.66.2)\n",
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.31.0)\n",
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.7.0)\n",
"Requirement already satisfied: jinja2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (3.1.3)\n",
"Requirement already satisfied: setuptools in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (65.5.0)\n",
"Requirement already satisfied: packaging>=20.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (24.0)\n",
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (3.3.0)\n",
"Requirement already satisfied: numpy>=1.19.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (1.26.4)\n",
"Requirement already satisfied: annotated-types>=0.4.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (0.6.0)\n",
"Requirement already satisfied: pydantic-core==2.18.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (2.18.1)\n",
"Requirement already satisfied: typing-extensions>=4.6.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (4.11.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.7)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.2.1)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2024.2.2)\n",
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.7.11)\n",
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.1.4)\n",
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from typer<0.10.0,>=0.3.0->spacy) (8.1.7)\n",
"Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from weasel<0.4.0,>=0.1.0->spacy) (0.16.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from jinja2->spacy) (2.1.5)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
]
}
],
"source": [
"!pip3 install spacy"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f4d06ed3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting en-core-web-sm==3.7.1\n",
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)\n",
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: spacy<3.8.0,>=3.7.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from en-core-web-sm==3.7.1) (3.7.4)\n",
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.12)\n",
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.5)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.10)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.8)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.9)\n",
"Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.2.3)\n",
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.2)\n",
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.4.8)\n",
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.10)\n",
"Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.3.4)\n",
"Requirement already satisfied: typer<0.10.0,>=0.3.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.9.4)\n",
"Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (6.4.0)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.66.2)\n",
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.31.0)\n",
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.7.0)\n",
"Requirement already satisfied: jinja2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.1.3)\n",
"Requirement already satisfied: setuptools in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (65.5.0)\n",
"Requirement already satisfied: packaging>=20.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (24.0)\n",
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.0)\n",
"Requirement already satisfied: numpy>=1.19.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.4)\n",
"Requirement already satisfied: annotated-types>=0.4.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.6.0)\n",
"Requirement already satisfied: pydantic-core==2.18.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.18.1)\n",
"Requirement already satisfied: typing-extensions>=4.6.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.11.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.7)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.2.1)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2024.2.2)\n",
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.11)\n",
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.4)\n",
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from typer<0.10.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)\n",
"Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from weasel<0.4.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.16.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.5)\n",
"Installing collected packages: en-core-web-sm\n",
"Successfully installed en-core-web-sm-3.7.1\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
"You can now load the package via spacy.load('en_core_web_sm')\n"
]
}
],
"source": [
"!python3 -m spacy download en_core_web_sm"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "tribal-attention",
"metadata": {},
"outputs": [
@ -205,7 +379,7 @@
"IDE\n",
",\n",
"see\n",
"Running\n",
"run\n",
"Tutorial\n",
"Examples\n",
"in\n",
@ -218,7 +392,7 @@
"work\n",
"for\n",
"all\n",
"swing\n",
"Swing\n",
"program\n",
"—\n",
"applet\n",
@ -232,7 +406,7 @@
"be\n",
"the\n",
"step\n",
"-PRON-\n",
"you\n",
"need\n",
"to\n",
"follow\n",
@ -248,7 +422,7 @@
"platform\n",
",\n",
"if\n",
"-PRON-\n",
"you\n",
"have\n",
"not\n",
"already\n",
@ -260,7 +434,7 @@
"program\n",
"that\n",
"use\n",
"Swing\n",
"swing\n",
"component\n",
".\n",
"compile\n",
@ -302,13 +476,13 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "surgical-demonstration",
"metadata": {},
"outputs": [],
"source": [
"def terminology_lookup():\n",
" return []"
" return None"
]
},
{
@ -337,13 +511,77 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 17,
"id": "superb-butterfly",
"metadata": {},
"outputs": [],
"source": [
"def get_nouns(text):\n",
" return []"
" doc = nlp(text)\n",
" return [token.text for token in doc if token.pos_ == \"NOUN\"]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "5e2be152",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'ADP'"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc[1].pos_"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "11430dc5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['programmers',\n",
" 'section',\n",
" 'Swing',\n",
" 'application',\n",
" 'command',\n",
" 'line',\n",
" 'information',\n",
" 'Swing',\n",
" 'application',\n",
" 'compilation',\n",
" 'instructions',\n",
" 'programs',\n",
" 'applets',\n",
" 'applications',\n",
" 'steps',\n",
" 'release',\n",
" 'platform',\n",
" 'program',\n",
" 'Swing',\n",
" 'components',\n",
" 'program',\n",
" 'program']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_nouns(text)"
]
},
{
@ -356,7 +594,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"id": "acting-tolerance",
"metadata": {},
"outputs": [],
@ -374,13 +612,55 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 24,
"id": "eight-redhead",
"metadata": {},
"outputs": [],
"source": [
"def extract_terms(text):\n",
" return []"
" doc = nlp(text)\n",
" noun_counts = {}\n",
" for token in doc:\n",
" if token.pos_ == \"NOUN\":\n",
" noun_counts[token.text] = noun_counts.get(token.text, 0) + 1\n",
" return noun_counts"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "c7d46f26",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'programmers': 1,\n",
" 'section': 1,\n",
" 'Swing': 3,\n",
" 'application': 2,\n",
" 'command': 1,\n",
" 'line': 1,\n",
" 'information': 1,\n",
" 'compilation': 1,\n",
" 'instructions': 1,\n",
" 'programs': 1,\n",
" 'applets': 1,\n",
" 'applications': 1,\n",
" 'steps': 1,\n",
" 'release': 1,\n",
" 'platform': 1,\n",
" 'program': 3,\n",
" 'components': 1}"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_terms(text)"
]
},
{
@ -393,14 +673,61 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 35,
"id": "monetary-mambo",
"metadata": {},
"outputs": [],
"source": [
"def extract_terms(text):\n",
" return []"
" doc = nlp(text.lower())\n",
" noun_counts = {}\n",
" for token in doc:\n",
" if token.pos_ in [\"NOUN\", \"VERB\", \"ADJ\"]:\n",
" noun_counts[token.text] = noun_counts.get(token.text, 0) + 1\n",
" return noun_counts"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "4259ee3f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'explains': 1,\n",
" 'compile': 2,\n",
" 'run': 2,\n",
" 'compiling': 1,\n",
" 'running': 2,\n",
" 'using': 1,\n",
" 'see': 1,\n",
" 'work': 1,\n",
" 'need': 1,\n",
" 'follow': 1,\n",
" 'install': 1,\n",
" 'done': 1,\n",
" 'create': 1,\n",
" 'uses': 1}"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_terms(text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d239c20",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@ -422,7 +749,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.0"
},
"subtitle": "3. Terminologia",
"title": "Komputerowe wspomaganie tłumaczenia",

File diff suppressed because one or more lines are too long

View File

@ -60,8 +60,35 @@
"metadata": {},
"outputs": [],
"source": [
"def find_tags(text):\n",
" return []"
"import re\n",
"\n",
"def find_tags(string):\n",
" pattern = r'<[^>]+>'\n",
" matches = re.finditer(pattern, string)\n",
" tag_indexes = [(match.start(), match.end()) for match in matches]\n",
" return tag_indexes"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3dc08368",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['<tag1>', '</tag1>', '<tag2>', '</tag2>']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"string = \"<tag1>ADIOS</tag1><tag2>OLA</tag2>\"\n",
"[ string[out[0]:out[1]] for out in find_tags(string)]"
]
},
{
@ -74,13 +101,73 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 20,
"id": "unauthorized-study",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n",
"False\n",
"False\n",
"True\n"
]
}
],
"source": [
"import re\n",
"import string\n",
"\n",
def is_translatable(text):
    """Tell whether ``text`` contains anything that needs translating.

    A segment is treated as untranslatable when it is empty or consists
    solely of "filler" characters: ASCII digits, the Roman-numeral letters
    ``I V X L C D M``, whitespace and ASCII punctuation.  Any other
    character (a letter of any script, for instance) makes the segment
    translatable.

    Note that short upper-case words spelled only with Roman-numeral
    letters (e.g. ``"MIX"`` or the pronoun ``"I"``) are therefore reported
    as untranslatable.

    Args:
        text: The segment to classify.

    Returns:
        ``True`` if the segment contains at least one non-filler
        character, ``False`` otherwise.
    """
    filler_class = r'[0-9IVXLCDM\s' + re.escape(string.punctuation) + ']'
    # The whole text must be filler for it to be untranslatable; a single
    # character outside the class is enough to make it translatable.
    return re.fullmatch(filler_class + '*', text) is None


text1 = "This is a sample text."
text2 = "2024."
text3 = "Это пример текста."
text4 = "おはよう"

print(is_translatable(text1))
print(is_translatable(text2))
print(is_translatable(text3))
print(is_translatable(text4))
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "ae92a18c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\"This is a sample text.\" is translatable?\n",
"Yes\n",
"\"2024.\" is translatable?\n",
"No\n",
"\"Это пример текста.\" is translatable?\n",
"Yes\n",
"\"おはよう\" is translatable?\n",
"Yes\n"
]
}
],
"source": [
"import re\n",
"import string\n",
"\n",
def is_translatable(text):
    """Tell whether ``text`` contains anything that needs translating.

    A segment is translatable when it contains at least one letter of any
    script.  Segments made only of digits, punctuation and/or whitespace
    (``"2024."``, ``"..."``, ``"   "``) and the empty string are not, while
    a sentence that merely mentions a number (``"Chapter 2"``) still is.

    Args:
        text: The segment to classify.

    Returns:
        ``True`` if at least one character of ``text`` is alphabetic,
        ``False`` otherwise.
    """
    return any(char.isalpha() for char in text)


examples = ["This is a sample text.", "2024.",
            "Это пример текста.", "おはよう"]

for ex in examples:
    response = 'Yes' if is_translatable(ex) else 'No'
    print(f'"{ex}" is translatable?\n{response}')
]
},
{
@ -93,13 +180,74 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 44,
"id": "beautiful-mathematics",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('03/25/2022', 3, 25, 2022)\tday:\t3\tmonth:\t25\tyear:\t2022\n",
"('25-12-2023', 25, 12, 2023)\tday:\t25\tmonth:\t12\tyear:\t2023\n",
"('09/30/2025', 9, 30, 2025)\tday:\t9\tmonth:\t30\tyear:\t2025\n",
"('03/25/2022', 25, 3, 2022)\tday:\t25\tmonth:\t3\tyear:\t2022\n",
"('09/30/2025', 30, 9, 2025)\tday:\t30\tmonth:\t9\tyear:\t2025\n",
"('12 March 2024', 12, 3, 2024)\tday:\t12\tmonth:\t3\tyear:\t2024\n",
"('25-12-2023', 25, 12, 2023)\tday:\t25\tmonth:\t12\tyear:\t2023\n",
"('12 March 2024', 12, 3, 2024)\tday:\t12\tmonth:\t3\tyear:\t2024\n",
"('15 September, 2026', 15, 9, 2026)\tday:\t15\tmonth:\t9\tyear:\t2026\n"
]
}
],
"source": [
"import re\n",
"\n",
def find_dates(text):
    """Find calendar dates in ``text``.

    Recognised notations:

    * ``DD-MM-YYYY`` -- dash-separated, read day first;
    * ``MM/DD/YYYY`` -- slash-separated, read month first;
    * ``D Month YYYY`` / ``D Month, YYYY`` -- English month name or an
      abbreviation of at least three letters, case-insensitive.

    For the numeric notations, if the default reading gives an impossible
    month (> 12) while the swapped reading is possible, the swapped reading
    is used (so ``25/12/2023`` is 25 December).  Candidates whose day is
    outside 1-31 or whose month is outside 1-12, and textual candidates
    whose middle word is not a month name, are skipped instead of raising.

    Args:
        text: The text to scan.

    Returns:
        A list of ``(matched_text, day, month, year)`` tuples in order of
        appearance, one tuple per date.  ``day``, ``month`` and ``year``
        are ints; a two-digit year is returned as written (``22``, not
        ``2022``).
    """
    month_names = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )

    # One combined pattern scanned once: every date is reported exactly
    # once and in text order.  The look-arounds stop a match from starting
    # or ending in the middle of a longer digit run, and the backreference
    # ``\2`` requires the same separator on both sides.
    date_pattern = re.compile(
        r'(?<!\d)(\d{1,2})([-/])(\d{1,2})\2(\d{2,4})(?!\d)'
        r'|(?<!\d)(\d{1,2}) ([A-Za-z]{3,9}),? (\d{4})(?!\d)'
    )

    dates_found = []
    for match in date_pattern.finditer(text):
        (first, separator, second, numeric_year,
         day_text, month_word, textual_year) = match.groups()

        if month_word is not None:
            # Textual date: resolve the month by (prefix of) its name.
            word = month_word.lower()
            month = next(
                (number for number, name in enumerate(month_names, start=1)
                 if name.startswith(word)),
                None,
            )
            if month is None:
                continue  # e.g. "12 apples 2024" is not a date
            day, year = int(day_text), int(textual_year)
        else:
            if separator == '/':
                month, day = int(first), int(second)
            else:
                day, month = int(first), int(second)
            if month > 12 and day <= 12:
                # The default order is impossible but the other one works.
                day, month = month, day
            year = int(numeric_year)

        if 1 <= day <= 31 and 1 <= month <= 12:
            dates_found.append((match.group(), day, month, year))
    return dates_found


text = "Here are some dates: 03/25/2022, 25-12-2023, 12 March 2024, 09/30/2025, 15 September, 2026"
dates = find_dates(text)
for date in dates:
    print(f"{date}\tday:\t{date[1]}\tmonth:\t{date[2]}\tyear:\t{date[3]}")
]
},
{
@ -205,7 +353,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.0"
},
"subtitle": "6,7. Preprocessing i postprocessing",
"title": "Komputerowe wspomaganie tłumaczenia",

File diff suppressed because it is too large Load Diff