Compare commits
3 Commits
Author | SHA1 | Date |
---|---|---|
potato | e02ff5ab39 | |
potato | d32188878d | |
potato | e5ed49b0b0 |
111
lab/lab_01.ipynb
111
lab/lab_01.ipynb
|
@ -82,7 +82,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 5,
|
||||
"id": "compact-trinidad",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -92,7 +92,7 @@
|
|||
"['Press the ENTER button']"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -119,7 +119,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 6,
|
||||
"id": "exposed-daniel",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -139,7 +139,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 7,
|
||||
"id": "serial-velvet",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -149,7 +149,7 @@
|
|||
"['Press the ENTER button', 'Press the ENTER key']"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -176,17 +176,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 11,
|
||||
"id": "every-gibson",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[]"
|
||||
"['Press the ENTER button', 'Press the ENTER key']"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -213,13 +213,13 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 9,
|
||||
"id": "protected-rings",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tm_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" return [entry[1] for entry in translation_memory if entry[0].lower() == sentence.lower()]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -232,17 +232,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 13,
|
||||
"id": "severe-alloy",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"''"
|
||||
"['Press the ENTER button', 'Press the ENTER key']"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -261,13 +261,20 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 12,
|
||||
"id": "structural-diesel",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def remove_punctuation(sentence):\n",
|
||||
" return re.sub(r'[^\\w\\s]', '', sentence)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def tm_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" return [entry[1] for entry in translation_memory\n",
|
||||
" if entry[0].lower() == remove_punctuation(sentence.lower())]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -280,17 +287,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 27,
|
||||
"id": "brief-senegal",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"''"
|
||||
"['System restart required']"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"execution_count": 27,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -317,13 +324,26 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 26,
|
||||
"id": "mathematical-customs",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def remove_punctuation(sentence):\n",
|
||||
" return re.sub(r'[^\\w\\s]', '', sentence)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def tm_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" values = []\n",
|
||||
" for entry in translation_memory:\n",
|
||||
" key = set(entry[0].lower().split())\n",
|
||||
" mod_sentence = set(remove_punctuation(sentence.lower()).split())\n",
|
||||
" remainder = list(key - mod_sentence)\n",
|
||||
" if len(remainder) <= 1:\n",
|
||||
" values.append(entry[1])\n",
|
||||
" return values"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -344,7 +364,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 42,
|
||||
"id": "humanitarian-wrong",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -362,7 +382,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 43,
|
||||
"id": "located-perception",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -374,17 +394,31 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 44,
|
||||
"id": "3437b88b",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]\n",
|
||||
"\n",
|
||||
"def glossary_lookup(sentence):\n",
|
||||
" sentence_words = sentence.split()\n",
|
||||
" return [entry for entry in glossary if entry[0] in sentence_words]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
"id": "advised-casting",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('przycisk', 'button'), ('drukarka', 'printer')]"
|
||||
"[('drukarka', 'printer'), ('przycisk', 'button')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"execution_count": 51,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -406,7 +440,7 @@
|
|||
"id": "defensive-fifteen",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: Operacja split w pierwszej linijce funkcji moze zostac uznana za stala. Biorac pod uwage ze lista krotek zawierajaca glosariusz musi byc przejrzana za kazdym razem cala, jak i caly string – skomplikowaność obliczen bedzie wynosic O(n*m)."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -419,13 +453,14 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 48,
|
||||
"id": "original-tunisia",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def glossary_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" sentence_words = [ element.lower() for element in sentence.split()]\n",
|
||||
" return [entry for entry in glossary if entry[0].lower() in sentence_words]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -438,13 +473,27 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 49,
|
||||
"id": "f69a873a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"glossary = [('komputer', 'computer'), ('przycisk', 'button'), ('drukarka', 'printer')]\n",
|
||||
"glossary = { k:v for k,v in glossary}\n",
|
||||
"translated_words = list(glossary.keys())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 50,
|
||||
"id": "adolescent-semiconductor",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def glossary_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" words = sentence.split()\n",
|
||||
" \n",
|
||||
" return [(word, glossary[word]) for word in words if word in translated_words]"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
@ -467,7 +516,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.11.0"
|
||||
},
|
||||
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
|
111
lab/lab_02.ipynb
111
lab/lab_02.ipynb
|
@ -67,7 +67,9 @@
|
|||
" ('Sprawdź ustawienia sieciowe', 'Check the network settings'),\n",
|
||||
" ('Drukarka jest wyłączona', 'The printer is switched off'),\n",
|
||||
" ('Wymagane ponowne uruchomienie komputera', 'System restart required')\n",
|
||||
" ]"
|
||||
" ]\n",
|
||||
"\n",
|
||||
"translation_memory = { k:v for k,v in translation_memory}"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -86,7 +88,11 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"def ice_lookup(sentence, prev_sentence, next_sentence):\n",
|
||||
" return []"
|
||||
" s_t = translation_memory.get(sentence, False)\n",
|
||||
" p_s = translation_memory.get(prev_sentence, False)\n",
|
||||
" n_s = translation_memory.get(next_sentence, False)\n",
|
||||
" if s_t and p_s and n_s:\n",
|
||||
" return s_t "
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -141,7 +147,7 @@
|
|||
"id": "graduate-theorem",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: Nie jest to poprawna funkcja dystansu fuzzy match. Warunki 1,3,4 sa spelnione. 2 warunek jest nie spełniony poniewaz odleglosc pomiedzy dwoma zdaniami/slowami o tej samej dlugosci ale innych znakach bedzie rowna zero."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -179,7 +185,7 @@
|
|||
"id": "metallic-leave",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: Tak, jest to poprawna funkcja dystansu. Wszystkie warunki sa spelnione."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -201,12 +207,54 @@
|
|||
"### Ćwiczenie 5: Czy dystans Levenshteina jest poprawną funkcją dystansu? Uzasadnij krótko swoją odpowiedź sprawdzając każdy z warunków."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "79e4deef",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from Levenshtein import distance as levenshtein_distance\n",
|
||||
"\n",
|
||||
"# warunek 1\n",
|
||||
"levenshtein_distance(\"smthn\", \"nothin\")\n",
|
||||
"# Output: 3\n",
|
||||
"# zawsze nieujemne\n",
|
||||
"\n",
|
||||
"# warunek 2\n",
|
||||
"levenshtein_distance(\"and\", \"and\")\n",
|
||||
"# Output: 0\n",
|
||||
"# dwa takie same zdania ktore sa w odleglosci 0 od siebie\n",
|
||||
"\n",
|
||||
"# warunek 3\n",
|
||||
"levenshtein_distance(\"zombie\", \"mombie\") == levenshtein_distance(\"mombie\", \"zombie\")\n",
|
||||
"# Output: True\n",
|
||||
"# zamiennosc zdan\n",
|
||||
"\n",
|
||||
"# warunek 4\n",
|
||||
"x,y,z = 'zombie', 'glombie', 'mombie'\n",
|
||||
"levenshtein_distance(x,y) + levenshtein_distance(y,z) >= levenshtein_distance(x,z)\n",
|
||||
"#Output: True\n",
|
||||
"# miara każdej odleglosci musi być mniejsza lub równa sumie miar dwóch pozostałych"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bibliographic-stopping",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: Tak jest poprawną funkcją dystansu."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -223,7 +271,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 7,
|
||||
"id": "secondary-wrist",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -233,7 +281,7 @@
|
|||
"2"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -254,7 +302,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 8,
|
||||
"id": "associate-tuner",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -273,7 +321,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 9,
|
||||
"id": "focal-pathology",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -283,7 +331,7 @@
|
|||
"0.9166666666666666"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -294,7 +342,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 10,
|
||||
"id": "roman-ceiling",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -304,7 +352,7 @@
|
|||
"0.9428571428571428"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -315,7 +363,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 11,
|
||||
"id": "invisible-cambodia",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -325,7 +373,7 @@
|
|||
"0.631578947368421"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -344,14 +392,43 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 12,
|
||||
"id": "genetic-cradle",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def fuzzy_lookup(sentence, threshold):\n",
|
||||
" return []"
|
||||
" return [ v for k,v in translation_memory.items() if levenshtein_similarity(sentence, k) > threshold ]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "2e72b54a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Press the ENTER button', 'System restart required']"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"fuzzy_lookup('Spróbuj wyłączyć i włączyć komputer', 0.25)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e1f15316",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -373,7 +450,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.11.0"
|
||||
},
|
||||
"subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
|
369
lab/lab_03.ipynb
369
lab/lab_03.ipynb
|
@ -63,7 +63,9 @@
|
|||
"id": "diverse-sunglasses",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: Narzędzie DeepL: https://www.deepl.com/translator\n",
|
||||
"\n",
|
||||
"przetłumaczyło tekst \"prowadnice szaf metalowych\" na \"metal cabinet slides\""
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -86,7 +88,17 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 8,
|
||||
"id": "8f6b6fa9",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import re"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "loving-prince",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -110,7 +122,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 10,
|
||||
"id": "bound-auction",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -118,6 +130,27 @@
|
|||
"dictionary = ['program', 'application', 'applet' 'compile']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "821ee3ee",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"\" For all Java programmers: This section explains how to compile and run a Swing application from the command line. For information on compiling and running a Swing application using NetBeans IDE, see Running Tutorial Examples in NetBeans IDE. The compilation instructions work for all Swing programs — applets, as well as applications. Here are the steps you need to follow: Install the latest release of the Java SE platform, if you haven't already done so. Create a program that uses Swing components. Compile the program. Run the program.\""
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"text"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "other-trinidad",
|
||||
|
@ -128,13 +161,41 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 11,
|
||||
"id": "cognitive-cedar",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def terminology_lookup():\n",
|
||||
" return []"
|
||||
"count_dictionary = {}\n",
|
||||
"\n",
|
||||
"def terminology_lookup(text, tags):\n",
|
||||
" text = text.lower()\n",
|
||||
" return [(tag, [[m.start(), m.end()] \n",
|
||||
" for m in re.finditer(tag, text)])\n",
|
||||
" for tag in tags if tag in text]\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "9fe3b66f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('program', [[14, 21], [291, 298], [468, 475], [516, 523], [533, 540]]),\n",
|
||||
" ('application', [[80, 91], [164, 175], [322, 333]])]"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"terminology_lookup(text, dictionary)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -159,9 +220,122 @@
|
|||
"`python3 -m spacy download en_core_web_sm`"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "7b7b7569",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Requirement already satisfied: spacy in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (3.7.4)\n",
|
||||
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (3.0.12)\n",
|
||||
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (1.0.5)\n",
|
||||
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (1.0.10)\n",
|
||||
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.0.8)\n",
|
||||
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (3.0.9)\n",
|
||||
"Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (8.2.3)\n",
|
||||
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (1.1.2)\n",
|
||||
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.4.8)\n",
|
||||
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.0.10)\n",
|
||||
"Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (0.3.4)\n",
|
||||
"Requirement already satisfied: typer<0.10.0,>=0.3.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (0.9.4)\n",
|
||||
"Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (6.4.0)\n",
|
||||
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (4.66.2)\n",
|
||||
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.31.0)\n",
|
||||
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (2.7.0)\n",
|
||||
"Requirement already satisfied: jinja2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (3.1.3)\n",
|
||||
"Requirement already satisfied: setuptools in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (65.5.0)\n",
|
||||
"Requirement already satisfied: packaging>=20.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (24.0)\n",
|
||||
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (3.3.0)\n",
|
||||
"Requirement already satisfied: numpy>=1.19.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy) (1.26.4)\n",
|
||||
"Requirement already satisfied: annotated-types>=0.4.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (0.6.0)\n",
|
||||
"Requirement already satisfied: pydantic-core==2.18.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (2.18.1)\n",
|
||||
"Requirement already satisfied: typing-extensions>=4.6.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (4.11.0)\n",
|
||||
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.3.2)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.7)\n",
|
||||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.2.1)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2024.2.2)\n",
|
||||
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.7.11)\n",
|
||||
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from thinc<8.3.0,>=8.2.2->spacy) (0.1.4)\n",
|
||||
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from typer<0.10.0,>=0.3.0->spacy) (8.1.7)\n",
|
||||
"Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from weasel<0.4.0,>=0.1.0->spacy) (0.16.0)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=2.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from jinja2->spacy) (2.1.5)\n",
|
||||
"\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!pip3 install spacy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "f4d06ed3",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Collecting en-core-web-sm==3.7.1\n",
|
||||
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)\n",
|
||||
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m:01\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: spacy<3.8.0,>=3.7.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from en-core-web-sm==3.7.1) (3.7.4)\n",
|
||||
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.12)\n",
|
||||
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.5)\n",
|
||||
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.0.10)\n",
|
||||
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.8)\n",
|
||||
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.0.9)\n",
|
||||
"Requirement already satisfied: thinc<8.3.0,>=8.2.2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.2.3)\n",
|
||||
"Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.1.2)\n",
|
||||
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.4.8)\n",
|
||||
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.0.10)\n",
|
||||
"Requirement already satisfied: weasel<0.4.0,>=0.1.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.3.4)\n",
|
||||
"Requirement already satisfied: typer<0.10.0,>=0.3.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.9.4)\n",
|
||||
"Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (6.4.0)\n",
|
||||
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.66.2)\n",
|
||||
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.31.0)\n",
|
||||
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.7.0)\n",
|
||||
"Requirement already satisfied: jinja2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.1.3)\n",
|
||||
"Requirement already satisfied: setuptools in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (65.5.0)\n",
|
||||
"Requirement already satisfied: packaging>=20.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (24.0)\n",
|
||||
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.0)\n",
|
||||
"Requirement already satisfied: numpy>=1.19.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (1.26.4)\n",
|
||||
"Requirement already satisfied: annotated-types>=0.4.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.6.0)\n",
|
||||
"Requirement already satisfied: pydantic-core==2.18.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.18.1)\n",
|
||||
"Requirement already satisfied: typing-extensions>=4.6.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (4.11.0)\n",
|
||||
"Requirement already satisfied: charset-normalizer<4,>=2 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.3.2)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (3.7)\n",
|
||||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.2.1)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2024.2.2)\n",
|
||||
"Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.7.11)\n",
|
||||
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.1.4)\n",
|
||||
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from typer<0.10.0,>=0.3.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (8.1.7)\n",
|
||||
"Requirement already satisfied: cloudpathlib<0.17.0,>=0.7.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from weasel<0.4.0,>=0.1.0->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (0.16.0)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=2.0 in /Users/potoato/.pyenv/versions/3.11.0/lib/python3.11/site-packages (from jinja2->spacy<3.8.0,>=3.7.2->en-core-web-sm==3.7.1) (2.1.5)\n",
|
||||
"Installing collected packages: en-core-web-sm\n",
|
||||
"Successfully installed en-core-web-sm-3.7.1\n",
|
||||
"\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
|
||||
"\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
|
||||
"You can now load the package via spacy.load('en_core_web_sm')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!python3 -m spacy download en_core_web_sm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "tribal-attention",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
|
@ -205,7 +379,7 @@
|
|||
"IDE\n",
|
||||
",\n",
|
||||
"see\n",
|
||||
"Running\n",
|
||||
"run\n",
|
||||
"Tutorial\n",
|
||||
"Examples\n",
|
||||
"in\n",
|
||||
|
@ -218,7 +392,7 @@
|
|||
"work\n",
|
||||
"for\n",
|
||||
"all\n",
|
||||
"swing\n",
|
||||
"Swing\n",
|
||||
"program\n",
|
||||
"—\n",
|
||||
"applet\n",
|
||||
|
@ -232,7 +406,7 @@
|
|||
"be\n",
|
||||
"the\n",
|
||||
"step\n",
|
||||
"-PRON-\n",
|
||||
"you\n",
|
||||
"need\n",
|
||||
"to\n",
|
||||
"follow\n",
|
||||
|
@ -248,7 +422,7 @@
|
|||
"platform\n",
|
||||
",\n",
|
||||
"if\n",
|
||||
"-PRON-\n",
|
||||
"you\n",
|
||||
"have\n",
|
||||
"not\n",
|
||||
"already\n",
|
||||
|
@ -260,7 +434,7 @@
|
|||
"program\n",
|
||||
"that\n",
|
||||
"use\n",
|
||||
"Swing\n",
|
||||
"swing\n",
|
||||
"component\n",
|
||||
".\n",
|
||||
"compile\n",
|
||||
|
@ -302,13 +476,13 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": null,
|
||||
"id": "surgical-demonstration",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def terminology_lookup():\n",
|
||||
" return []"
|
||||
" return None"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -337,13 +511,77 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 17,
|
||||
"id": "superb-butterfly",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_nouns(text):\n",
|
||||
" return []"
|
||||
" doc = nlp(text)\n",
|
||||
" return [token.text for token in doc if token.pos_ == \"NOUN\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "5e2be152",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'ADP'"
|
||||
]
|
||||
},
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"doc[1].pos_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "11430dc5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['programmers',\n",
|
||||
" 'section',\n",
|
||||
" 'Swing',\n",
|
||||
" 'application',\n",
|
||||
" 'command',\n",
|
||||
" 'line',\n",
|
||||
" 'information',\n",
|
||||
" 'Swing',\n",
|
||||
" 'application',\n",
|
||||
" 'compilation',\n",
|
||||
" 'instructions',\n",
|
||||
" 'programs',\n",
|
||||
" 'applets',\n",
|
||||
" 'applications',\n",
|
||||
" 'steps',\n",
|
||||
" 'release',\n",
|
||||
" 'platform',\n",
|
||||
" 'program',\n",
|
||||
" 'Swing',\n",
|
||||
" 'components',\n",
|
||||
" 'program',\n",
|
||||
" 'program']"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"get_nouns(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -356,7 +594,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": null,
|
||||
"id": "acting-tolerance",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
|
@ -374,13 +612,55 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 24,
|
||||
"id": "eight-redhead",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extract_terms(text):\n",
|
||||
" return []"
|
||||
" doc = nlp(text)\n",
|
||||
" noun_counts = {}\n",
|
||||
" for token in doc:\n",
|
||||
" if token.pos_ == \"NOUN\":\n",
|
||||
" noun_counts[token.text] = noun_counts.get(token.text, 0) + 1\n",
|
||||
" return noun_counts"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"id": "c7d46f26",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'programmers': 1,\n",
|
||||
" 'section': 1,\n",
|
||||
" 'Swing': 3,\n",
|
||||
" 'application': 2,\n",
|
||||
" 'command': 1,\n",
|
||||
" 'line': 1,\n",
|
||||
" 'information': 1,\n",
|
||||
" 'compilation': 1,\n",
|
||||
" 'instructions': 1,\n",
|
||||
" 'programs': 1,\n",
|
||||
" 'applets': 1,\n",
|
||||
" 'applications': 1,\n",
|
||||
" 'steps': 1,\n",
|
||||
" 'release': 1,\n",
|
||||
" 'platform': 1,\n",
|
||||
" 'program': 3,\n",
|
||||
" 'components': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"extract_terms(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -393,14 +673,61 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 35,
|
||||
"id": "monetary-mambo",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extract_terms(text):\n",
|
||||
" return []"
|
||||
" doc = nlp(text.lower())\n",
|
||||
" noun_counts = {}\n",
|
||||
" for token in doc:\n",
|
||||
" if token.pos_ in [\"NOUN\", \"VERB\", \"ADJ\"]:\n",
|
||||
" noun_counts[token.text] = noun_counts.get(token.text, 0) + 1\n",
|
||||
" return noun_counts"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"id": "4259ee3f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'explains': 1,\n",
|
||||
" 'compile': 2,\n",
|
||||
" 'run': 2,\n",
|
||||
" 'compiling': 1,\n",
|
||||
" 'running': 2,\n",
|
||||
" 'using': 1,\n",
|
||||
" 'see': 1,\n",
|
||||
" 'work': 1,\n",
|
||||
" 'need': 1,\n",
|
||||
" 'follow': 1,\n",
|
||||
" 'install': 1,\n",
|
||||
" 'done': 1,\n",
|
||||
" 'create': 1,\n",
|
||||
" 'uses': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 34,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"extract_terms(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "8d239c20",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -422,7 +749,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.11.0"
|
||||
},
|
||||
"subtitle": "3. Terminologia",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -60,8 +60,35 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def find_tags(text):\n",
|
||||
" return []"
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def find_tags(string):\n",
|
||||
" pattern = r'<[^>]+>'\n",
|
||||
" matches = re.finditer(pattern, string)\n",
|
||||
" tag_indexes = [(match.start(), match.end()) for match in matches]\n",
|
||||
" return tag_indexes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "3dc08368",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['<tag1>', '</tag1>', '<tag2>', '</tag2>']"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"string = \"<tag1>ADIOS</tag1><tag2>OLA</tag2>\"\n",
|
||||
"[ string[out[0]:out[1]] for out in find_tags(string)]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -74,13 +101,73 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 20,
|
||||
"id": "unauthorized-study",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"False\n",
|
||||
"False\n",
|
||||
"False\n",
|
||||
"True\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"import string\n",
|
||||
"\n",
|
||||
"def is_translatable(text):\n",
|
||||
" return True"
|
||||
" return bool(re.match(r'^[^0-9IVXLCDM\\s' + re.escape(string.punctuation) + ']+$', text))\n",
|
||||
"\n",
|
||||
"text1 = \"This is a sample text.\"\n",
|
||||
"text2 = \"2024.\"\n",
|
||||
"text3 = \"Это пример текста.\"\n",
|
||||
"text4 = \"おはよう\"\n",
|
||||
"\n",
|
||||
"print(is_translatable(text1)) \n",
|
||||
"print(is_translatable(text2))\n",
|
||||
"print(is_translatable(text3))\n",
|
||||
"print(is_translatable(text4))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"id": "ae92a18c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\"This is a sample text.\" is translatable?\n",
|
||||
"Yes\n",
|
||||
"\"2024.\" is translatable?\n",
|
||||
"No\n",
|
||||
"\"Это пример текста.\" is translatable?\n",
|
||||
"Yes\n",
|
||||
"\"おはよう\" is translatable?\n",
|
||||
"Yes\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"import string\n",
|
||||
"\n",
|
||||
"def is_translatable(text):\n",
|
||||
" return bool(re.match(r'^[^\\d]+$|^\\s+$', text))\n",
|
||||
"\n",
|
||||
"examples = [\"This is a sample text.\", \"2024.\", \n",
|
||||
" \"Это пример текста.\", \"おはよう\"]\n",
|
||||
"\n",
|
||||
"for ex in examples:\n",
|
||||
" response = 'Yes' if is_translatable(ex) else 'No'\n",
|
||||
" print(f'\"{ex}\" is translatable?\\n{response}')"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -93,13 +180,74 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 44,
|
||||
"id": "beautiful-mathematics",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"('03/25/2022', 3, 25, 2022)\tday:\t3\tmonth:\t25\tyear:\t2022\n",
|
||||
"('25-12-2023', 25, 12, 2023)\tday:\t25\tmonth:\t12\tyear:\t2023\n",
|
||||
"('09/30/2025', 9, 30, 2025)\tday:\t9\tmonth:\t30\tyear:\t2025\n",
|
||||
"('03/25/2022', 25, 3, 2022)\tday:\t25\tmonth:\t3\tyear:\t2022\n",
|
||||
"('09/30/2025', 30, 9, 2025)\tday:\t30\tmonth:\t9\tyear:\t2025\n",
|
||||
"('12 March 2024', 12, 3, 2024)\tday:\t12\tmonth:\t3\tyear:\t2024\n",
|
||||
"('25-12-2023', 25, 12, 2023)\tday:\t25\tmonth:\t12\tyear:\t2023\n",
|
||||
"('12 March 2024', 12, 3, 2024)\tday:\t12\tmonth:\t3\tyear:\t2024\n",
|
||||
"('15 September, 2026', 15, 9, 2026)\tday:\t15\tmonth:\t9\tyear:\t2026\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def find_dates(text):\n",
|
||||
" return []"
|
||||
" date_formats = [\n",
|
||||
" (r'(\\d{1,2})[-/](\\d{1,2})[-/](\\d{2,4})', 'day_first'),\n",
|
||||
" (r'(\\d{1,2})[/](\\d{1,2})[/](\\d{2,4})', 'month_first'),\n",
|
||||
" (r'(\\d{1,2}) (\\w{3,9}) (\\d{4})', 'day_first'),\n",
|
||||
" (r'(\\d{1,2})-(\\d{1,2})-(\\d{2,4})', 'day_first'),\n",
|
||||
" (r'(\\d{1,2}) (\\w{3,9}),? (\\d{4})', 'day_first')\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" months = {\n",
|
||||
" 'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6,\n",
|
||||
" 'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" dates_found = []\n",
|
||||
" for date_pattern, format_type in date_formats:\n",
|
||||
" matches = re.finditer(date_pattern, text)\n",
|
||||
" for match in matches:\n",
|
||||
" groups = match.groups()\n",
|
||||
" if len(groups) == 3:\n",
|
||||
" if format_type == 'day_first':\n",
|
||||
" day, month, year = groups\n",
|
||||
" else:\n",
|
||||
" month, day, year = groups\n",
|
||||
" if month.isdigit():\n",
|
||||
" month = int(month)\n",
|
||||
" else:\n",
|
||||
" month = months[month]\n",
|
||||
" dates_found.append((match.group(), int(day), month, int(year)))\n",
|
||||
" elif len(groups) == 4:\n",
|
||||
" if format_type == 'day_first':\n",
|
||||
" day, month, _, year = groups\n",
|
||||
" else:\n",
|
||||
" month, day, _, year = groups\n",
|
||||
" if month.isdigit():\n",
|
||||
" month = int(month)\n",
|
||||
" else:\n",
|
||||
" month = months[month]\n",
|
||||
" dates_found.append((match.group(), int(day), month, int(year)))\n",
|
||||
" return dates_found\n",
|
||||
"\n",
|
||||
"text = \"Here are some dates: 03/25/2022, 25-12-2023, 12 March 2024, 09/30/2025, 15 September, 2026\"\n",
|
||||
"dates = find_dates(text)\n",
|
||||
"for date in dates:\n",
|
||||
" print(f\"{date}\\tday:\\t{date[1]}\\tmonth:\\t{date[2]}\\tyear:\\t{date[3]}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -205,7 +353,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.11.0"
|
||||
},
|
||||
"subtitle": "6,7. Preprocessing i postprocessing",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
|
18091
lab/lab_08.ipynb
18091
lab/lab_08.ipynb
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue