Compare commits

...

7 Commits
main ... main

Author SHA1 Message Date
bpietrzak 9176a79d30 Small change 2024-06-20 19:59:05 +02:00
bpietrzak 35a01dd2d5 Little push 2024-06-20 19:58:33 +02:00
bpietrzak 946ec47b20 Little push 2024-06-20 19:58:19 +02:00
Bartosz Pietrzak 054d45b9ed lab 12-14 2024-05-28 21:18:57 +02:00
potato e02ff5ab39 lab 2 2024-04-23 23:52:29 +02:00
potato d32188878d Laboratorium 1 2024-04-16 22:51:31 +02:00
potato e5ed49b0b0 Labs 2024-04-15 21:46:15 +02:00
10 changed files with 19828 additions and 160 deletions

View File

@ -82,7 +82,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"id": "compact-trinidad",
"metadata": {},
"outputs": [
@ -92,7 +92,7 @@
"['Press the ENTER button']"
]
},
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@ -119,7 +119,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"id": "exposed-daniel",
"metadata": {},
"outputs": [],
@ -139,7 +139,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"id": "serial-velvet",
"metadata": {},
"outputs": [
@ -149,7 +149,7 @@
"['Press the ENTER button', 'Press the ENTER key']"
]
},
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -176,17 +176,17 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 11,
"id": "every-gibson",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[]"
"['Press the ENTER button', 'Press the ENTER key']"
]
},
"execution_count": 6,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@ -213,13 +213,13 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"id": "protected-rings",
"metadata": {},
"outputs": [],
"source": [
"def tm_lookup(sentence):\n",
" return ''"
" return [entry[1] for entry in translation_memory if entry[0].lower() == sentence.lower()]"
]
},
{
@ -232,17 +232,17 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 13,
"id": "severe-alloy",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"''"
"['Press the ENTER button', 'Press the ENTER key']"
]
},
"execution_count": 18,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@ -261,13 +261,20 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"id": "structural-diesel",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def remove_punctuation(sentence):\n",
" return re.sub(r'[^\\w\\s]', '', sentence)\n",
"\n",
"\n",
"def tm_lookup(sentence):\n",
" return ''"
" return [entry[1] for entry in translation_memory\n",
" if entry[0].lower() == remove_punctuation(sentence.lower())]"
]
},
{
@ -280,17 +287,17 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 27,
"id": "brief-senegal",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"''"
"['System restart required']"
]
},
"execution_count": 12,
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
@ -317,13 +324,26 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 26,
"id": "mathematical-customs",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"def remove_punctuation(sentence):\n",
" return re.sub(r'[^\\w\\s]', '', sentence)\n",
"\n",
"\n",
"def tm_lookup(sentence):\n",
" return ''"
" values = []\n",
" for entry in translation_memory:\n",
" key = set(entry[0].lower().split())\n",
" mod_sentence = set(remove_punctuation(sentence.lower()).split())\n",
" remainder = list(key - mod_sentence)\n",
" if len(remainder) <= 1:\n",
" values.append(entry[1])\n",
" return values"
]
},
{
@ -344,7 +364,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 42,
"id": "humanitarian-wrong",
"metadata": {},
"outputs": [],
@ -362,7 +382,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 43,
"id": "located-perception",
"metadata": {},
"outputs": [],
@ -374,17 +394,31 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 44,
"id": "3437b88b",
"metadata": {},
"outputs": [],
"source": [
"glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]\n",
"\n",
"def glossary_lookup(sentence):\n",
" sentence_words = sentence.split()\n",
" return [entry for entry in glossary if entry[0] in sentence_words]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "advised-casting",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('przycisk', 'button'), ('drukarka', 'printer')]"
"[('drukarka', 'printer'), ('przycisk', 'button')]"
]
},
"execution_count": 17,
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
@ -406,7 +440,7 @@
"id": "defensive-fifteen",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Operacja split w pierwszej linijce funkcji moze zostac uznana za stala. Biorac pod uwage ze lista krotek zawierajaca glosariusz musi byc przejrzana za kazdym razem cala, jak i caly string skomplikowaność obliczen bedzie wynosic O(n*m)."
]
},
{
@ -419,13 +453,14 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 48,
"id": "original-tunisia",
"metadata": {},
"outputs": [],
"source": [
"def glossary_lookup(sentence):\n",
" return ''"
" sentence_words = [ element.lower() for element in sentence.split()]\n",
" return [entry for entry in glossary if entry[0].lower() in sentence_words]"
]
},
{
@ -438,13 +473,27 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 49,
"id": "f69a873a",
"metadata": {},
"outputs": [],
"source": [
"glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]\n",
"glossary = { k:v for k,v in glossary}\n",
"translated_words = list(glossary.keys())"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "adolescent-semiconductor",
"metadata": {},
"outputs": [],
"source": [
"def glossary_lookup(sentence):\n",
" return ''"
" words = sentence.split()\n",
" \n",
" return [(word, glossary[word]) for word in words if word in translated_words]"
]
}
],
@ -467,7 +516,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.0"
},
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
"title": "Komputerowe wspomaganie tłumaczenia",

View File

@ -67,7 +67,9 @@
" ('Sprawdź ustawienia sieciowe', 'Check the network settings'),\n",
" ('Drukarka jest wyłączona', 'The printer is switched off'),\n",
" ('Wymagane ponowne uruchomienie komputera', 'System restart required')\n",
" ]"
" ]\n",
"\n",
"translation_memory = { k:v for k,v in translation_memory}"
]
},
{
@ -86,7 +88,11 @@
"outputs": [],
"source": [
"def ice_lookup(sentence, prev_sentence, next_sentence):\n",
" return []"
" s_t = translation_memory.get(sentence, False)\n",
" p_s = translation_memory.get(prev_sentence, False)\n",
" n_s = translation_memory.get(next_sentence, False)\n",
" if s_t and p_s and n_s:\n",
" return s_t "
]
},
{
@ -141,7 +147,7 @@
"id": "graduate-theorem",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Nie jest to poprawna funkcja dystansu fuzzy match. Warunki 1,3,4 sa spelnione. 2 warunek jest nie spełniony poniewaz odleglosc pomiedzy dwoma zdaniami/slowami o tej samej dlugosci ale innych znakach bedzie rowna zero."
]
},
{
@ -179,7 +185,7 @@
"id": "metallic-leave",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Tak, jest to poprawna funkcja dystansu. Wszystkie warunki sa spelnione."
]
},
{
@ -201,12 +207,54 @@
"### Ćwiczenie 5: Czy dystans Levenshteina jest poprawną funkcją dystansu? Uzasadnij krótko swoją odpowiedź sprawdzając każdy z warunków."
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "79e4deef",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from Levenshtein import distance as levenshtein_distance\n",
"\n",
"# warunek 1\n",
"levenshtein_distance(\"smthn\", \"nothin\")\n",
"# Output: 3\n",
"# zawsze nieujemne\n",
"\n",
"# warunek 2\n",
"levenshtein_distance(\"and\", \"and\")\n",
"# Output: 0\n",
"# dwa takie same zdania ktore sa w odleglosci 0 od siebie\n",
"\n",
"# warunek 3\n",
"levenshtein_distance(\"zombie\", \"mombie\") == levenshtein_distance(\"mombie\", \"zombie\")\n",
"# Output: True\n",
"# zamiennosc zdan\n",
"\n",
"# warunek 4\n",
"x,y,z = 'zombie', 'glombie', 'mombie'\n",
"levenshtein_distance(x,y) + levenshtein_distance(y,z) >= levenshtein_distance(x,z)\n",
"#Output: True\n",
"# miara każdej odleglosci musi być mniejsza lub równa sumie miar dwóch pozostałych"
]
},
{
"cell_type": "markdown",
"id": "bibliographic-stopping",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Tak jest poprawną funkcją dystansu."
]
},
{
@ -223,7 +271,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"id": "secondary-wrist",
"metadata": {},
"outputs": [
@ -233,7 +281,7 @@
"2"
]
},
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -254,7 +302,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"id": "associate-tuner",
"metadata": {},
"outputs": [],
@ -273,7 +321,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 9,
"id": "focal-pathology",
"metadata": {},
"outputs": [
@ -283,7 +331,7 @@
"0.9166666666666666"
]
},
"execution_count": 7,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -294,7 +342,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 10,
"id": "roman-ceiling",
"metadata": {},
"outputs": [
@ -304,7 +352,7 @@
"0.9428571428571428"
]
},
"execution_count": 8,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -315,7 +363,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"id": "invisible-cambodia",
"metadata": {},
"outputs": [
@ -325,7 +373,7 @@
"0.631578947368421"
]
},
"execution_count": 9,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@ -344,14 +392,43 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"id": "genetic-cradle",
"metadata": {},
"outputs": [],
"source": [
"def fuzzy_lookup(sentence, threshold):\n",
" return []"
" return [ v for k,v in translation_memory.items() if levenshtein_similarity(sentence, k) > threshold ]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "2e72b54a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Press the ENTER button', 'System restart required']"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fuzzy_lookup('Spróbuj wyłączyć i włączyć komputer', 0.25)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1f15316",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@ -373,7 +450,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.0"
},
"subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
"title": "Komputerowe wspomaganie tłumaczenia",

View File

@ -63,7 +63,9 @@
"id": "diverse-sunglasses",
"metadata": {},
"source": [
"Odpowiedź:"
"Odpowiedź: Narzędzie DeepL: https://www.deepl.com/translator\n",
"\n",
"przetłumaczyło tekst \"prowadnice szaf metalowych\" na \"metal cabinet slides\""
]
},
{
@ -86,7 +88,17 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 8,
"id": "8f6b6fa9",
"metadata": {},
"outputs": [],
"source": [
"import re"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "loving-prince",
"metadata": {},
"outputs": [],
@ -110,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 10,
"id": "bound-auction",
"metadata": {},
"outputs": [],
@ -118,6 +130,27 @@
"dictionary = ['program', 'application', 'applet' 'compile']"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "821ee3ee",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\" For all Java programmers: This section explains how to compile and run a Swing application from the command line. For information on compiling and running a Swing application using NetBeans IDE, see Running Tutorial Examples in NetBeans IDE. The compilation instructions work for all Swing programs — applets, as well as applications. Here are the steps you need to follow: Install the latest release of the Java SE platform, if you haven't already done so. Create a program that uses Swing components. Compile the program. Run the program.\""
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text"
]
},
{
"cell_type": "markdown",
"id": "other-trinidad",
@ -128,13 +161,41 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 11,
"id": "cognitive-cedar",
"metadata": {},
"outputs": [],
"source": [
"def terminology_lookup():\n",
" return []"
"count_dictionary = {}\n",
"\n",
"def terminology_lookup(text, tags):\n",
" text = text.lower()\n",
" return [(tag, [[m.start(), m.end()] \n",
" for m in re.finditer(tag, text)])\n",
" for tag in tags if tag in text]\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "9fe3b66f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('program', [[14, 21], [291, 298], [468, 475], [516, 523], [533, 540]]),\n",
" ('application', [[80, 91], [164, 175], [322, 333]])]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"terminology_lookup(text, dictionary)"
]
},
{
@ -159,9 +220,122 @@
"`python3 -m spacy download en_core_web_sm`"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "7b7b7569",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: spacy in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (3.7.4)\n",
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (3.0.12)\n",
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (1.0.5)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (1.0.10)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.0.8)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (3.0.9)\n",
"Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (8.2.3)\n",
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (1.1.2)\n",
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.4.8)\n",
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.0.10)\n",
"Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (0.3.4)\n",
"Requirement already satisfied: typer<0.10.0,>=0.3.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (0.9.4)\n",
"Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (6.4.0)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (4.66.2)\n",
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.31.0)\n",
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.7.0)\n",
"Requirement already satisfied: jinja2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (3.1.3)\n",
"Requirement already satisfied: setuptools in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (65.5.0)\n",
"Requirement already satisfied: packaging>=20.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (24.0)\n",
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (3.3.0)\n",
"Requirement already satisfied: numpy>=1.19.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (1.26.4)\n",
"Requirement already satisfied: annotated-types>=0.4.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (0.6.0)\n",
"Requirement already satisfied: pydantic-core==2.18.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (2.18.1)\n",
"Requirement already satisfied: typing-extensions>=4.6.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (4.11.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.7)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.2.1)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2024.2.2)\n",
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.7.11)\n",
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.1.4)\n",
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from typer<0.10.0,>=0.3.0->spacy) (8.1.7)\n",
"Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from weasel<0.4.0,>=0.1.0->spacy) (0.16.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from jinja2->spacy) (2.1.5)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
]
}
],
"source": [
"!pip3 install spacy"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f4d06ed3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting en-core-web-sm==3.7.1\n",
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)\n",
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: spacy<3.8.0,>=3.7.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from en-core-web-sm==3.7.1) (3.7.4)\n",
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.12)\n",
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.5)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.10)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.8)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.9)\n",
"Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.2.3)\n",
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.2)\n",
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.4.8)\n",
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.10)\n",
"Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.3.4)\n",
"Requirement already satisfied: typer<0.10.0,>=0.3.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.9.4)\n",
"Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (6.4.0)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.66.2)\n",
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.31.0)\n",
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.7.0)\n",
"Requirement already satisfied: jinja2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.1.3)\n",
"Requirement already satisfied: setuptools in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (65.5.0)\n",
"Requirement already satisfied: packaging>=20.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (24.0)\n",
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.0)\n",
"Requirement already satisfied: numpy>=1.19.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.4)\n",
"Requirement already satisfied: annotated-types>=0.4.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.6.0)\n",
"Requirement already satisfied: pydantic-core==2.18.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.18.1)\n",
"Requirement already satisfied: typing-extensions>=4.6.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.11.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.7)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.2.1)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2024.2.2)\n",
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.11)\n",
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.4)\n",
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from typer<0.10.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)\n",
"Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from weasel<0.4.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.16.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.5)\n",
"Installing collected packages: en-core-web-sm\n",
"Successfully installed en-core-web-sm-3.7.1\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
"You can now load the package via spacy.load('en_core_web_sm')\n"
]
}
],
"source": [
"!python3 -m spacy download en_core_web_sm"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "tribal-attention",
"metadata": {},
"outputs": [
@ -205,7 +379,7 @@
"IDE\n",
",\n",
"see\n",
"Running\n",
"run\n",
"Tutorial\n",
"Examples\n",
"in\n",
@ -218,7 +392,7 @@
"work\n",
"for\n",
"all\n",
"swing\n",
"Swing\n",
"program\n",
"—\n",
"applet\n",
@ -232,7 +406,7 @@
"be\n",
"the\n",
"step\n",
"-PRON-\n",
"you\n",
"need\n",
"to\n",
"follow\n",
@ -248,7 +422,7 @@
"platform\n",
",\n",
"if\n",
"-PRON-\n",
"you\n",
"have\n",
"not\n",
"already\n",
@ -260,7 +434,7 @@
"program\n",
"that\n",
"use\n",
"Swing\n",
"swing\n",
"component\n",
".\n",
"compile\n",
@ -302,13 +476,13 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "surgical-demonstration",
"metadata": {},
"outputs": [],
"source": [
"def terminology_lookup():\n",
" return []"
" return None"
]
},
{
@ -337,13 +511,77 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 17,
"id": "superb-butterfly",
"metadata": {},
"outputs": [],
"source": [
"def get_nouns(text):\n",
" return []"
" doc = nlp(text)\n",
" return [token.text for token in doc if token.pos_ == \"NOUN\"]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "5e2be152",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'ADP'"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc[1].pos_"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "11430dc5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['programmers',\n",
" 'section',\n",
" 'Swing',\n",
" 'application',\n",
" 'command',\n",
" 'line',\n",
" 'information',\n",
" 'Swing',\n",
" 'application',\n",
" 'compilation',\n",
" 'instructions',\n",
" 'programs',\n",
" 'applets',\n",
" 'applications',\n",
" 'steps',\n",
" 'release',\n",
" 'platform',\n",
" 'program',\n",
" 'Swing',\n",
" 'components',\n",
" 'program',\n",
" 'program']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_nouns(text)"
]
},
{
@ -356,7 +594,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"id": "acting-tolerance",
"metadata": {},
"outputs": [],
@ -374,13 +612,55 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 24,
"id": "eight-redhead",
"metadata": {},
"outputs": [],
"source": [
"def extract_terms(text):\n",
" return []"
" doc = nlp(text)\n",
" noun_counts = {}\n",
" for token in doc:\n",
" if token.pos_ == \"NOUN\":\n",
" noun_counts[token.text] = noun_counts.get(token.text, 0) + 1\n",
" return noun_counts"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "c7d46f26",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'programmers': 1,\n",
" 'section': 1,\n",
" 'Swing': 3,\n",
" 'application': 2,\n",
" 'command': 1,\n",
" 'line': 1,\n",
" 'information': 1,\n",
" 'compilation': 1,\n",
" 'instructions': 1,\n",
" 'programs': 1,\n",
" 'applets': 1,\n",
" 'applications': 1,\n",
" 'steps': 1,\n",
" 'release': 1,\n",
" 'platform': 1,\n",
" 'program': 3,\n",
" 'components': 1}"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_terms(text)"
]
},
{
@ -393,14 +673,61 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 35,
"id": "monetary-mambo",
"metadata": {},
"outputs": [],
"source": [
"def extract_terms(text):\n",
" return []"
" doc = nlp(text.lower())\n",
" noun_counts = {}\n",
" for token in doc:\n",
" if token.pos_ in [\"NOUN\", \"VERB\", \"ADJ\"]:\n",
" noun_counts[token.text] = noun_counts.get(token.text, 0) + 1\n",
" return noun_counts"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "4259ee3f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'explains': 1,\n",
" 'compile': 2,\n",
" 'run': 2,\n",
" 'compiling': 1,\n",
" 'running': 2,\n",
" 'using': 1,\n",
" 'see': 1,\n",
" 'work': 1,\n",
" 'need': 1,\n",
" 'follow': 1,\n",
" 'install': 1,\n",
" 'done': 1,\n",
" 'create': 1,\n",
" 'uses': 1}"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_terms(text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8d239c20",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@ -422,7 +749,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.0"
},
"subtitle": "3. Terminologia",
"title": "Komputerowe wspomaganie tłumaczenia",

File diff suppressed because one or more lines are too long

View File

@ -60,8 +60,35 @@
"metadata": {},
"outputs": [],
"source": [
"def find_tags(text):\n",
" return []"
"import re\n",
"\n",
"def find_tags(string):\n",
" pattern = r'<[^>]+>'\n",
" matches = re.finditer(pattern, string)\n",
" tag_indexes = [(match.start(), match.end()) for match in matches]\n",
" return tag_indexes"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3dc08368",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['<tag1>', '</tag1>', '<tag2>', '</tag2>']"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"string = \"<tag1>ADIOS</tag1><tag2>OLA</tag2>\"\n",
"[ string[out[0]:out[1]] for out in find_tags(string)]"
]
},
{
@ -74,13 +101,73 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 20,
"id": "unauthorized-study",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n",
"False\n",
"False\n",
"True\n"
]
}
],
"source": [
"import re\n",
"import string\n",
"\n",
"def is_translatable(text):\n",
" return True"
" return bool(re.match(r'^[^0-9IVXLCDM\\s' + re.escape(string.punctuation) + ']+$', text))\n",
"\n",
"text1 = \"This is a sample text.\"\n",
"text2 = \"2024.\"\n",
"text3 = \"Это пример текста.\"\n",
"text4 = \"おはよう\"\n",
"\n",
"print(is_translatable(text1)) \n",
"print(is_translatable(text2))\n",
"print(is_translatable(text3))\n",
"print(is_translatable(text4))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "ae92a18c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\"This is a sample text.\" is translatable?\n",
"Yes\n",
"\"2024.\" is translatable?\n",
"No\n",
"\"Это пример текста.\" is translatable?\n",
"Yes\n",
"\"おはよう\" is translatable?\n",
"Yes\n"
]
}
],
"source": [
"import re\n",
"import string\n",
"\n",
"def is_translatable(text):\n",
" return bool(re.match(r'^[^\\d]+$|^\\s+$', text))\n",
"\n",
"examples = [\"This is a sample text.\", \"2024.\", \n",
" \"Это пример текста.\", \"おはよう\"]\n",
"\n",
"for ex in examples:\n",
" response = 'Yes' if is_translatable(ex) else 'No'\n",
" print(f'\"{ex}\" is translatable?\\n{response}')"
]
},
{
@ -93,13 +180,74 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 44,
"id": "beautiful-mathematics",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('03/25/2022', 3, 25, 2022)\tday:\t3\tmonth:\t25\tyear:\t2022\n",
"('25-12-2023', 25, 12, 2023)\tday:\t25\tmonth:\t12\tyear:\t2023\n",
"('09/30/2025', 9, 30, 2025)\tday:\t9\tmonth:\t30\tyear:\t2025\n",
"('03/25/2022', 25, 3, 2022)\tday:\t25\tmonth:\t3\tyear:\t2022\n",
"('09/30/2025', 30, 9, 2025)\tday:\t30\tmonth:\t9\tyear:\t2025\n",
"('12 March 2024', 12, 3, 2024)\tday:\t12\tmonth:\t3\tyear:\t2024\n",
"('25-12-2023', 25, 12, 2023)\tday:\t25\tmonth:\t12\tyear:\t2023\n",
"('12 March 2024', 12, 3, 2024)\tday:\t12\tmonth:\t3\tyear:\t2024\n",
"('15 September, 2026', 15, 9, 2026)\tday:\t15\tmonth:\t9\tyear:\t2026\n"
]
}
],
"source": [
"import re\n",
"\n",
"def find_dates(text):\n",
" return []"
" date_formats = [\n",
" (r'(\\d{1,2})[-/](\\d{1,2})[-/](\\d{2,4})', 'day_first'),\n",
" (r'(\\d{1,2})[/](\\d{1,2})[/](\\d{2,4})', 'month_first'),\n",
" (r'(\\d{1,2}) (\\w{3,9}) (\\d{4})', 'day_first'),\n",
" (r'(\\d{1,2})-(\\d{1,2})-(\\d{2,4})', 'day_first'),\n",
" (r'(\\d{1,2}) (\\w{3,9}),? (\\d{4})', 'day_first')\n",
" ]\n",
"\n",
" months = {\n",
" 'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,\n",
" 'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12\n",
" }\n",
"\n",
" dates_found = []\n",
" for date_pattern, format_type in date_formats:\n",
" matches = re.finditer(date_pattern, text)\n",
" for match in matches:\n",
" groups = match.groups()\n",
" if len(groups) == 3:\n",
" if format_type == 'day_first':\n",
" day, month, year = groups\n",
" else:\n",
" month, day, year = groups\n",
" if month.isdigit():\n",
" month = int(month)\n",
" else:\n",
" month = months[month]\n",
" dates_found.append((match.group(), int(day), month, int(year)))\n",
" elif len(groups) == 4:\n",
" if format_type == 'day_first':\n",
" day, month, _, year = groups\n",
" else:\n",
" month, day, _, year = groups\n",
" if month.isdigit():\n",
" month = int(month)\n",
" else:\n",
" month = months[month]\n",
" dates_found.append((match.group(), int(day), month, int(year)))\n",
" return dates_found\n",
"\n",
"text = \"Here are some dates: 03/25/2022, 25-12-2023, 12 March 2024, 09/30/2025, 15 September, 2026\"\n",
"dates = find_dates(text)\n",
"for date in dates:\n",
" print(f\"{date}\\tday:\\t{date[1]}\\tmonth:\\t{date[2]}\\tyear:\\t{date[3]}\")"
]
},
{
@ -205,7 +353,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.0"
},
"subtitle": "6,7. Preprocessing i postprocessing",
"title": "Komputerowe wspomaganie tłumaczenia",

File diff suppressed because it is too large Load Diff

View File

@ -52,13 +52,45 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 10,
"id": "german-dispute",
"metadata": {},
"outputs": [],
"source": [
"def sentence_split(text):\n",
" return []"
" segments = []\n",
" start = 0\n",
" i = 0\n",
" n = len(text)\n",
" while i < n:\n",
" if text[i] == ' ' and i + 1 < n and text[i + 1].isupper():\n",
" segments.append(text[start:i])\n",
" start = i + 1\n",
" i += 1\n",
" segments.append(text[start:])\n",
" return segments\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "adbae35b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Dzien dobry,', 'CZy ten', 'TEKST zostanie poprawnie podzielony?']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text = \"Dzien dobry, CZy ten TEKST zostanie poprawnie podzielony?\"\n",
"sentence_split(text)"
]
},
{
@ -71,13 +103,54 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 13,
"id": "064c2343",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import requests\n",
"from bs4 import BeautifulSoup"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "guilty-morocco",
"metadata": {},
"outputs": [],
"source": [
"def sentence_split_enhanced(text):\n",
" return []"
"def fetch_page_content(url):\n",
" response = requests.get(url)\n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
"\n",
" for unwanted in soup([\"script\", \"style\"]):\n",
" unwanted.decompose()\n",
"\n",
" page_text = soup.get_text()\n",
"\n",
" clean_text = re.sub(r\"\\s+\", \" \", page_text)\n",
" return clean_text"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "7a2d689a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Toki', 'Meaning |', 'New', 'Zealand', 'Pounamu', 'Meanings &', 'Designs |', 'Mountain', 'Jade', 'NZToki (Adze)Worn as a symbol of strength, the', 'Toki carries with it deep meaning and symbolism associated with mana, reverence, strength and bravery.The', 'Meaning of the', 'TokiTraditionally, a', 'Toki (Adze) was an everyday tool used by', 'Māori for different woodworking tasks, such as felling trees, hollowing out waka (canoes), and constructing houses and wharenui (Māori communal houses).', 'Using abrasion techniques,', 'Māori would painstakingly carve an adze from pounamu and lash it to a wooden handle using natural materials such as harakeke (flax).', 'It was then swung powerfully to cut wood.Māori were skilful woodworkers, and as they did not have metal, pounamu served well as a woodcutting tool because of its exceptional toughness and ability to retain a hard, sharp cutting edge.', 'For this reason, the', 'Toki holds great significance and meaning in', 'Māori culture as a powerful symbol of strength and power.DiscoverToki (Adze)', 'NecklacesShop', 'All', 'TokiA', 'Toki', 'Poutangata is an adze worked from pounamu and lashed to an elaborately carved wooden handle, and adorned with the feathers of significant native birds.', 'Mana and', 'Life', 'ForceMauri is considered to be a life force or essence - it is a spark or the essential vitality of a being.', 'Māori hold that all people and all things have mauri.', 'Many believe that as pounamu has a spiritual value, it can take on the mauri of the person wielding or wearing the stone, such as when', 'Māori chiefs with great mana (prestige) wielded the', 'Toki', 'Poutangata.', 'As the stone passes down through generations of whanau (family), it continues to carry the spirit, energy, and strength of previous owners, growing in mana and honour with its rich histories and stories.', 'A', 'Symbol of', 'StrengthToki are worn today as a symbol of strength, with this association linking back to when pounamu adze were powerful woodcutting tools, and wielded by fearsome', 'Māori warriors.', 'They symbolise courage, determination, success, and bravery, and their significance is steeped in', 'Māori tradition and meaning.Shop', 'All', 'TokiWatch how we carve', 'TokiMorepounamu meanings & designsTwist (Pikorua)Koru (spiral)Hei', 'Matau (hook)HeartManaiaHei', 'TikiWe', 'Ship', 'WorldwidePremium', 'Gift', 'WrappingFree', 'Shipping*We', 'Ship', 'WorldwidePremium', 'Gift', 'WrappingFree', 'Shipping*ShopLearnToursFind', 'Us$NZD$AUD£GBP$CAD$USD€EUR¥JPYFollow usStay in the know on new releases, special offers, and more.Your email addressSupportContactFAQsShippingGift', 'WrappingReturnsCarePrivacy', 'PolicyTerms &', 'ConditionsWarrantyGift', 'CardsReviewsAboutOur', 'StorySustainabilityLearn about', 'JadeNewsletterAuckland', 'StoreRotorua', 'StoreOur', 'ToursTen', 'Reasons to', 'GiftNeed', 'Help?+64 7 349 3968customercare@mountainjade.co.nz1288', 'Fenton', 'Street,', 'RotoruaNew', 'ZealandSupportAboutNeed', 'Help?Website by']\n"
]
}
],
"source": [
"text = fetch_page_content('https://www.mountainjade.co.nz/pages/greenstone-meanings-and-designs/toki-adze')\n",
"split = sentence_split(text)\n",
"print(split)"
]
},
{
@ -187,15 +260,12 @@
"metadata": {
"author": "Rafał Jaworski",
"email": "rjawor@amu.edu.pl",
"lang": "pl",
"subtitle": "11. Urównoleglanie",
"title": "Komputerowe wspomaganie tłumaczenia",
"year": "2021",
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"lang": "pl",
"language_info": {
"codemirror_mode": {
"name": "ipython",
@ -206,8 +276,11 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
"version": "3.11.9"
},
"subtitle": "11. Urównoleglanie",
"title": "Komputerowe wspomaganie tłumaczenia",
"year": "2021"
},
"nbformat": 4,
"nbformat_minor": 5

File diff suppressed because one or more lines are too long

View File

@ -44,7 +44,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 1,
"id": "familiar-terrace",
"metadata": {
"scrolled": true
@ -120,13 +120,47 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 27,
"id": "economic-southeast",
"metadata": {},
"outputs": [],
"source": [
"def correct_text(text):\n",
" return []"
"def load_data(zip_file_path):\n",
" with ZipFile(zip_file_path) as zip_f:\n",
" with zip_f.open('hunspell_pl.txt') as f:\n",
" return set([line.strip().lower() for line in f.read().decode('utf-8').splitlines()])\n",
"\n",
"def correct_words(sentence, dictionary):\n",
" \"\"\"\n",
" 0 - incorrect\n",
" 1 - correct\n",
" \"\"\"\n",
" return [(word, 1) if word in dictionary\n",
" else (word, 0)\n",
" for word in sentence.lower().split()]\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "bba15ae8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('hakunamatata', 0)]\n",
"[('czy', 1), ('dobrze', 1), ('pisze', 0)]\n",
"[('ala', 1), ('ma', 1), ('kota', 1)]\n"
]
}
],
"source": [
"data = load_data('data/hunspell_pl.zip')\n",
"print(correct_words('Hakunamatata', data))\n",
"print(correct_words('Czy dobrze pisze', data))\n",
"print(correct_words('Ala ma kota', data))"
]
},
{
@ -168,13 +202,32 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 33,
"id": "built-sally",
"metadata": {},
"outputs": [],
"source": [
"alphabet = 'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'\n",
"\n",
"def L1(w):\n",
" return []"
" edits = set()\n",
" for i in range(len(w) + 1):\n",
" for c in alphabet:\n",
" if i < len(w):\n",
" edits.add(w[:i] + c + w[i+1:])\n",
" edits.add(w[:i] + c + w[i:])\n",
" if i < len(w):\n",
" edits.add(w[:i] + w[i+1:])\n",
" return edits\n",
"\n",
"def L2(l1):\n",
" edits = set()\n",
" for word in l1:\n",
" edits.update(L1(word))\n",
" return edits\n",
"\n",
"def S(edits, dictionary):\n",
" return set(dictionary).intersection(set(edits))"
]
},
{
@ -187,13 +240,37 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 39,
"id": "coordinated-cooperation",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['cześć'], ['mam'], ['na'], {'gajek', 'tajner', 'jasne', 'janne', 'jajem', 'janet', 'jacek', 'janez', 'banek', 'jarek', 'panek', 'ranek', 'majtek', 'danek', 'jadnak', 'jenek', 'najdek', 'jaje', 'jane', 'kajtek', 'jonek', 'jamnik', 'janik', 'pajek', 'janem', 'gajnik', 'jajnik', 'jasiek', 'garnek', 'majek', 'bajek', 'jasek', 'jasnej', 'janek', 'tajnik', 'ganek'}]\n"
]
}
],
"source": [
"def generate_suggestions(w):\n",
" return []"
"def apply_edits(sentence, dictionary):\n",
" suggestions = []\n",
" for word, is_misspelled in sentence:\n",
" if is_misspelled == 0:\n",
" words = generate_suggestions(word, dictionary)\n",
" suggestions.append(words)\n",
" else:\n",
" suggestions.append([word])\n",
" return suggestions\n",
"\n",
"def generate_suggestions(w, dictionary):\n",
" l1 = L1(w)\n",
" l2 = L2(l1)\n",
" s1 = S(l1, dictionary)\n",
" s2 = S(l2, dictionary)\n",
" return s1.union(s2)\n",
"\n",
"print(apply_edits(correct_words('Cześć mam na jajnek', data), data))"
]
}
],
@ -216,7 +293,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.0"
},
"subtitle": "13,14. Korekta pisowni",
"title": "Komputerowe wspomaganie tłumaczenia",

View File

@ -62,11 +62,42 @@
]
},
{
"cell_type": "raw",
"cell_type": "markdown",
"id": "academic-crest",
"metadata": {
"vscode": {
"languageId": "raw"
}
},
"source": []
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9020428a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting language_tool_python\n",
" Downloading language_tool_python-2.8-py3-none-any.whl (35 kB)\n",
"Requirement already satisfied: pip in /home/potato/.virtualenvs/praca/lib/python3.11/site-packages (from language_tool_python) (22.0.2)\n",
"Requirement already satisfied: tqdm in /home/potato/.virtualenvs/praca/lib/python3.11/site-packages (from language_tool_python) (4.66.4)\n",
"Requirement already satisfied: requests in /home/potato/.virtualenvs/praca/lib/python3.11/site-packages (from language_tool_python) (2.31.0)\n",
"Requirement already satisfied: wheel in /home/potato/.virtualenvs/praca/lib/python3.11/site-packages (from language_tool_python) (0.37.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /home/potato/.virtualenvs/praca/lib/python3.11/site-packages (from requests->language_tool_python) (3.7)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /home/potato/.virtualenvs/praca/lib/python3.11/site-packages (from requests->language_tool_python) (2.2.1)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /home/potato/.virtualenvs/praca/lib/python3.11/site-packages (from requests->language_tool_python) (2024.2.2)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /home/potato/.virtualenvs/praca/lib/python3.11/site-packages (from requests->language_tool_python) (3.3.2)\n",
"Installing collected packages: language_tool_python\n",
"Successfully installed language_tool_python-2.8\n"
]
}
],
"source": [
"pip3 install language_tool_python"
"!pip install language_tool_python"
]
},
{
@ -79,16 +110,25 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 2,
"id": "relative-anaheim",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Downloading LanguageTool 6.4: 100%|██████████| 246M/246M [00:07<00:00, 34.5MB/s] \n",
"Unzipping /tmp/tmp2yjlywpj.zip to /home/potato/.cache/language_tool_python.\n",
"Downloaded https://www.languagetool.org/download/LanguageTool-6.4.zip to /home/potato/.cache/language_tool_python.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Match({'ruleId': 'EN_A_VS_AN', 'message': 'Use “an” instead of a if the following word starts with a vowel sound, e.g. an article, an hour.', 'replacements': ['an'], 'offsetInContext': 16, 'context': 'A sentence with a error in the Hitchhikers Guide tot he ...', 'offset': 16, 'errorLength': 1, 'category': 'MISC', 'ruleIssueType': 'misspelling', 'sentence': 'A sentence with a error in the Hitchhikers Guide tot he Galaxy'}),\n",
" Match({'ruleId': 'TOT_HE', 'message': 'Did you mean “to the”?', 'replacements': ['to the'], 'offsetInContext': 43, 'context': '... with a error in the Hitchhikers Guide tot he Galaxy', 'offset': 50, 'errorLength': 6, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': 'A sentence with a error in the Hitchhikers Guide tot he Galaxy'})]\n"
"[Match({'ruleId': 'EN_A_VS_AN', 'message': 'Use “an” instead of a if the following word starts with a vowel sound, e.g. an article, an hour.', 'replacements': ['an'], 'offsetInContext': 16, 'context': 'A sentence with a error in the Hitchhikers Guide tot he ...', 'offset': 16, 'errorLength': 1, 'category': 'MISC', 'ruleIssueType': 'misspelling', 'sentence': \"A sentence with a error in the Hitchhiker's Guide tot he Galaxy\"}),\n",
" Match({'ruleId': 'TOT_HE', 'message': 'Did you mean “to the”?', 'replacements': ['to the'], 'offsetInContext': 43, 'context': '... with a error in the Hitchhikers Guide tot he Galaxy', 'offset': 50, 'errorLength': 6, 'category': 'TYPOS', 'ruleIssueType': 'misspelling', 'sentence': \"A sentence with a error in the Hitchhiker's Guide tot he Galaxy\"})]\n"
]
}
],
@ -122,13 +162,72 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"id": "sound-teaching",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Match({'ruleId': 'SENTENCE_WHITESPACE', 'message': 'Add a space between sentences.', 'replacements': [' The'], 'offsetInContext': 43, 'context': '...h mana, reverence, strength and bravery.The Meaning of the TokiTraditionally, a Tok...', 'offset': 219, 'errorLength': 3, 'category': 'TYPOGRAPHY', 'ruleIssueType': 'whitespace', 'sentence': 'The Meaning of the TokiTraditionally, a Toki (Adze) was an everyday tool used by Māori for different woodworking tasks, such as felling trees, hollowing out waka (canoes), and constructing houses and wharenui (Māori communal houses).'}),\n",
" Match({'ruleId': 'SENTENCE_WHITESPACE', 'message': 'Add a space between sentences.', 'replacements': [' Māori'], 'offsetInContext': 43, 'context': '...t was then swung powerfully to cut wood.Māori were skilful woodworkers, and as they d...', 'offset': 654, 'errorLength': 5, 'category': 'TYPOGRAPHY', 'ruleIssueType': 'whitespace', 'sentence': 'Māori were skilful woodworkers, and as they did not have metal, pounamu served well as a woodcutting tool because of its exceptional toughness and ability to retain a hard, sharp cutting edge.'}),\n",
" Match({'ruleId': 'SENTENCE_WHITESPACE', 'message': 'Add a space between sentences.', 'replacements': [' DiscoverToki'], 'offsetInContext': 43, 'context': '...a powerful symbol of strength and power.DiscoverToki (Adze) NecklacesShop All TokiA Toki Pou...', 'offset': 970, 'errorLength': 12, 'category': 'TYPOGRAPHY', 'ruleIssueType': 'whitespace', 'sentence': 'DiscoverToki (Adze) NecklacesShop All TokiA Toki Poutangata is an adze worked from pounamu and lashed to an elaborately carved wooden handle, and adorned with the feathers of significant native birds.'}),\n",
" Match({'ruleId': 'SENTENCE_WHITESPACE', 'message': 'Add a space between sentences.', 'replacements': [' Shop'], 'offsetInContext': 43, 'context': '... steeped in Māori tradition and meaning.Shop All TokiWatch how we carve TokiMorepoun...', 'offset': 2076, 'errorLength': 4, 'category': 'TYPOGRAPHY', 'ruleIssueType': 'whitespace', 'sentence': 'Shop All TokiWatch how we carve TokiMorepounamu meanings & designsTwist (Pikorua)Koru (spiral)Hei Matau (hook)HeartManaiaHei TikiFollow usStay in the know on new releases, special offers, and more.'}),\n",
" Match({'ruleId': 'SENTENCE_WHITESPACE', 'message': 'Add a space between sentences.', 'replacements': [' Your'], 'offsetInContext': 43, 'context': '... new releases, special offers, and more.Your email addressSupportContactFAQsShipping...', 'offset': 2273, 'errorLength': 4, 'category': 'TYPOGRAPHY', 'ruleIssueType': 'whitespace', 'sentence': 'Your email addressSupportContactFAQsShippingGift WrappingReturnsCarePrivacy PolicyTerms & ConditionsWarrantyGift CardsReviewsAboutOur StorySustainabilityLearn about JadeNewsletterAuckland StoreRotorua StoreOur ToursTen Reasons to GiftNeed Help?+64 7 349 3968customercare@mountainjade.co.nz1288 Fenton Street, RotoruaNew ZealandSupportAboutNeed Help?'}),\n",
" Match({'ruleId': 'SENTENCE_WHITESPACE', 'message': 'Add a space between sentences.', 'replacements': [' Website'], 'offsetInContext': 43, 'context': '...RotoruaNew ZealandSupportAboutNeed Help?Website byWe Ship WorldwidePremium Gift Wrappin...', 'offset': 2622, 'errorLength': 7, 'category': 'TYPOGRAPHY', 'ruleIssueType': 'whitespace', 'sentence': 'Website byWe Ship WorldwidePremium Gift WrappingFree Shipping*We Ship WorldwidePremium Gift WrappingFree Shipping*ShopLearnToursFind Us$NZD$AUD£GBP$CAD$USD€EUR¥JPY'})]\n"
]
},
{
"data": {
"text/plain": [
"[Match({'ruleId': 'SENTENCE_WHITESPACE', 'message': 'Add a space between sentences.', 'replacements': [' The'], 'offsetInContext': 43, 'context': '...h mana, reverence, strength and bravery.The Meaning of the TokiTraditionally, a Tok...', 'offset': 219, 'errorLength': 3, 'category': 'TYPOGRAPHY', 'ruleIssueType': 'whitespace', 'sentence': 'The Meaning of the TokiTraditionally, a Toki (Adze) was an everyday tool used by Māori for different woodworking tasks, such as felling trees, hollowing out waka (canoes), and constructing houses and wharenui (Māori communal houses).'}),\n",
" Match({'ruleId': 'SENTENCE_WHITESPACE', 'message': 'Add a space between sentences.', 'replacements': [' Māori'], 'offsetInContext': 43, 'context': '...t was then swung powerfully to cut wood.Māori were skilful woodworkers, and as they d...', 'offset': 654, 'errorLength': 5, 'category': 'TYPOGRAPHY', 'ruleIssueType': 'whitespace', 'sentence': 'Māori were skilful woodworkers, and as they did not have metal, pounamu served well as a woodcutting tool because of its exceptional toughness and ability to retain a hard, sharp cutting edge.'}),\n",
" Match({'ruleId': 'SENTENCE_WHITESPACE', 'message': 'Add a space between sentences.', 'replacements': [' DiscoverToki'], 'offsetInContext': 43, 'context': '...a powerful symbol of strength and power.DiscoverToki (Adze) NecklacesShop All TokiA Toki Pou...', 'offset': 970, 'errorLength': 12, 'category': 'TYPOGRAPHY', 'ruleIssueType': 'whitespace', 'sentence': 'DiscoverToki (Adze) NecklacesShop All TokiA Toki Poutangata is an adze worked from pounamu and lashed to an elaborately carved wooden handle, and adorned with the feathers of significant native birds.'}),\n",
" Match({'ruleId': 'SENTENCE_WHITESPACE', 'message': 'Add a space between sentences.', 'replacements': [' Shop'], 'offsetInContext': 43, 'context': '... steeped in Māori tradition and meaning.Shop All TokiWatch how we carve TokiMorepoun...', 'offset': 2076, 'errorLength': 4, 'category': 'TYPOGRAPHY', 'ruleIssueType': 'whitespace', 'sentence': 'Shop All TokiWatch how we carve TokiMorepounamu meanings & designsTwist (Pikorua)Koru (spiral)Hei Matau (hook)HeartManaiaHei TikiFollow usStay in the know on new releases, special offers, and more.'}),\n",
" Match({'ruleId': 'SENTENCE_WHITESPACE', 'message': 'Add a space between sentences.', 'replacements': [' Your'], 'offsetInContext': 43, 'context': '... new releases, special offers, and more.Your email addressSupportContactFAQsShipping...', 'offset': 2273, 'errorLength': 4, 'category': 'TYPOGRAPHY', 'ruleIssueType': 'whitespace', 'sentence': 'Your email addressSupportContactFAQsShippingGift WrappingReturnsCarePrivacy PolicyTerms & ConditionsWarrantyGift CardsReviewsAboutOur StorySustainabilityLearn about JadeNewsletterAuckland StoreRotorua StoreOur ToursTen Reasons to GiftNeed Help?+64 7 349 3968customercare@mountainjade.co.nz1288 Fenton Street, RotoruaNew ZealandSupportAboutNeed Help?'}),\n",
" Match({'ruleId': 'SENTENCE_WHITESPACE', 'message': 'Add a space between sentences.', 'replacements': [' Website'], 'offsetInContext': 43, 'context': '...RotoruaNew ZealandSupportAboutNeed Help?Website byWe Ship WorldwidePremium Gift Wrappin...', 'offset': 2622, 'errorLength': 7, 'category': 'TYPOGRAPHY', 'ruleIssueType': 'whitespace', 'sentence': 'Website byWe Ship WorldwidePremium Gift WrappingFree Shipping*We Ship WorldwidePremium Gift WrappingFree Shipping*ShopLearnToursFind Us$NZD$AUD£GBP$CAD$USD€EUR¥JPY'})]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def find_errors(website_url):\n",
" return []"
"import re\n",
"import requests\n",
"from bs4 import BeautifulSoup\n",
"import language_tool_python\n",
"import pprint\n",
"\n",
"\n",
"def fetch_page_content(url):\n",
" response = requests.get(url)\n",
" soup = BeautifulSoup(response.content, 'html.parser')\n",
"\n",
" for unwanted in soup([\"script\", \"style\"]):\n",
" unwanted.decompose()\n",
"\n",
" page_text = soup.get_text()\n",
"\n",
" clean_text = re.sub(r\"\\s+\", \" \", page_text)\n",
" return clean_text\n",
"\n",
"\n",
"def identify_language_errors(site_url):\n",
" issues = []\n",
" page_content = fetch_page_content(site_url)\n",
" language_tool = language_tool_python.LanguageTool('en-NZ')\n",
" matches = language_tool.check(page_content)\n",
" for match in matches:\n",
" if match.ruleId != 'MORFOLOGIK_RULE_EN_NZ':\n",
" issues.append(match)\n",
"\n",
" pprint.pprint(issues)\n",
" return issues\n",
"\n",
"identify_language_errors('https://www.mountainjade.co.nz/pages/greenstone-meanings-and-designs/toki-adze')\n"
]
},
{
@ -170,7 +269,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
"version": "3.11.9"
},
"subtitle": "15. Korekta gramatyczna",
"title": "Komputerowe wspomaganie tłumaczenia",