forked from bfijalkowski/KWT-2024
add tasks 1-3
This commit is contained in:
parent
71ca3b66ed
commit
8c25ab8484
179
lab/lab_01.ipynb
179
lab/lab_01.ipynb
@ -52,7 +52,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 45,
|
||||
"id": "narrow-romantic",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -71,7 +71,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 46,
|
||||
"id": "indonesian-electron",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -82,7 +82,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 47,
|
||||
"id": "compact-trinidad",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -92,7 +92,7 @@
|
||||
"['Press the ENTER button']"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"execution_count": 47,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -119,7 +119,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 48,
|
||||
"id": "exposed-daniel",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -139,7 +139,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 49,
|
||||
"id": "serial-velvet",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -149,7 +149,7 @@
|
||||
"['Press the ENTER button', 'Press the ENTER key']"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -176,17 +176,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 52,
|
||||
"id": "every-gibson",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[]"
|
||||
"['Press the ENTER button', 'Press the ENTER key']"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -213,13 +213,13 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 51,
|
||||
"id": "protected-rings",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tm_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" return [entry[1] for entry in translation_memory if entry[0].lower() == sentence.lower()]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -232,17 +232,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"execution_count": 55,
|
||||
"id": "severe-alloy",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"''"
|
||||
"['Press the ENTER button', 'Press the ENTER key']"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"execution_count": 55,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -261,13 +261,21 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 86,
|
||||
"id": "structural-diesel",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import string\n",
|
||||
"\n",
|
||||
"def sentence_similar(sentence):\n",
|
||||
" translator = str.maketrans('', '', string.punctuation)\n",
|
||||
" return sentence.translate(translator)\n",
|
||||
"\n",
|
||||
"def tm_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" return [entry[1] for entry in translation_memory if entry[0].lower() == sentence_similar(sentence).lower()]\n",
|
||||
"\n",
|
||||
"#print(string.punctuation)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -280,17 +288,17 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"execution_count": 67,
|
||||
"id": "brief-senegal",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"''"
|
||||
"['System restart required']"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"execution_count": 67,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -317,13 +325,30 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"execution_count": 66,
|
||||
"id": "mathematical-customs",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def tm_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" inWords = sentence_similar(sentence).lower().split(\" \")\n",
|
||||
" lenSentence = len(inWords)\n",
|
||||
" matchWords = 0\n",
|
||||
" answer = []\n",
|
||||
" for entry in translation_memory:\n",
|
||||
" dicWords = entry[0].lower().split(\" \")\n",
|
||||
" \n",
|
||||
" for i in range(lenSentence-1):\n",
|
||||
" if inWords[i] == dicWords[i]:\n",
|
||||
" matchWords += 1\n",
|
||||
" \n",
|
||||
" if matchWords >= lenSentence-1:\n",
|
||||
" answer.append(entry[1])\n",
|
||||
" matchWords = 0\n",
|
||||
" else:\n",
|
||||
" matchWords = 0\n",
|
||||
"\n",
|
||||
" return answer"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -344,7 +369,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 68,
|
||||
"id": "humanitarian-wrong",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -362,19 +387,29 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"execution_count": 84,
|
||||
"id": "located-perception",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def glossary_lookup(sentence):\n",
|
||||
" sentence_words = sentence.split()\n",
|
||||
" return [entry for entry in glossary if entry[0] in sentence_words]"
|
||||
" return [entry for entry in glossary if entry[0] in sentence_words]\n",
|
||||
"\n",
|
||||
"def exercise4help(sentence):\n",
|
||||
" sentence_words = sentence.split()\n",
|
||||
" answer = []\n",
|
||||
" for entry in glossary: #przechodzimy przez każdą tuple więc (n)\n",
|
||||
" if entry[0] in sentence_words: # (m) porównań\n",
|
||||
" answer.append(entry)\n",
|
||||
" return answer\n",
|
||||
"\n",
|
||||
"#dla każdego hasła w słowniku, robimy tyle porównań ile jest słów w zdaniu O(n*m)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"execution_count": 76,
|
||||
"id": "advised-casting",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -384,13 +419,41 @@
|
||||
"[('przycisk', 'button'), ('drukarka', 'printer')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"execution_count": 76,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')"
|
||||
"glossary_lookup('Każda drukarka posiada przycisk wznowienia drukowania')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 81,
|
||||
"id": "70ae3dd8-d4ca-4a59-b8a9-ca47583bf54a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"3\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('przycisk', 'button'), ('drukarka', 'printer')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 81,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"exercise4help('Każda drukarka posiada przycisk wznowienia drukowania')"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -406,7 +469,7 @@
|
||||
"id": "defensive-fifteen",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"Odpowiedź: O(m*n)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -419,13 +482,27 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"execution_count": 82,
|
||||
"id": "original-tunisia",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('przycisk', 'button'), ('drukarka', 'printer')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 82,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def glossary_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" sentence_words = sentence.lower().split()\n",
|
||||
" return [entry for entry in glossary if entry[0] in sentence_words]\n",
|
||||
"\n",
|
||||
"glossary_lookup('Każda DRUKARKA posiada PRZYCISK wznowienia drukowania')"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -438,13 +515,41 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"execution_count": 108,
|
||||
"id": "adolescent-semiconductor",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('drukarka', 'printer'), ('drukarka', 'printer'), ('przycisk', 'button')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 109,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"glossary = {\n",
|
||||
" 'komputer': 'computer',\n",
|
||||
" 'przycisk': 'button', \n",
|
||||
" 'drukarka': 'printer'\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#glossary.get('komputer') == None\n",
|
||||
"\n",
|
||||
"def glossary_lookup(sentence):\n",
|
||||
" return ''"
|
||||
" sentence_words = set(sentence.split()) #umieszczamy w zbiorze aby uniknąć przetwarzania mniejszej ilości słów.\n",
|
||||
" answer = []\n",
|
||||
" for word in sentence_words: # dla każdego słowa w zdaniu (m)\n",
|
||||
" translated_word = glossary.get(word) #pobieramy zawartosć ze słownika \n",
|
||||
" if translated_word != None: # (porównanie m razy)\n",
|
||||
" answer.append((word,translated_word)) # dodanie do odpowiedzi m razy\n",
|
||||
" \n",
|
||||
" return answer\n",
|
||||
"\n",
|
||||
"glossary_lookup('drukarka - Każda drukarka posiada przycisk wznowienia drukowania')"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -452,7 +557,7 @@
|
||||
"author": "Rafał Jaworski",
|
||||
"email": "rjawor@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@ -467,7 +572,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.9.2"
|
||||
},
|
||||
"subtitle": "1. Podstawowe techniki wspomagania tłumaczenia",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
234
lab/lab_02.ipynb
234
lab/lab_02.ipynb
@ -57,7 +57,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 3,
|
||||
"id": "confident-prison",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -80,13 +80,102 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 21,
|
||||
"id": "continental-submission",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Wciśnij przycisk Enter']"
|
||||
]
|
||||
},
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def tm_lookup(sentence):\n",
|
||||
" return [entry[1] for entry in translation_memory if entry[0].lower() == sentence.lower()]\n",
|
||||
"\n",
|
||||
"def ice_lookup(sentence, prev_sentence, next_sentence):\n",
|
||||
" return []"
|
||||
" if (not prev_sentence) or (not next_sentence):\n",
|
||||
" return 'no context'\n",
|
||||
" \n",
|
||||
" if not sentence:\n",
|
||||
" return 'enter your sentence'\n",
|
||||
" \n",
|
||||
" #Dobrze prawie ale tutaj zwracane są listy. wszystko okey, gdy zdanie poprzedzające i następne mają tamą ilość słów. JEST zdecydowanie błędny gdy zdania mają różną ilość słów!\n",
|
||||
" if tm_lookup(prev_sentence) and tm_lookup(next_sentence):\n",
|
||||
" return [entry[0] for entry in translation_memory if entry[0].lower() == sentence.lower()]\n",
|
||||
" else:\n",
|
||||
" return \"\"\n",
|
||||
" \n",
|
||||
"ice_lookup('Wciśnij przycisk Enter','Sprawdź ustawienia sieciowe','Drukarka jest wyłączona')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "ecb19925-7467-4e8a-bfdf-9adee52a5894",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'no context'"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ice_lookup('Wciśnij przycisk Enter','Sprawdź ustawienia sieciowe','')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "cf60a398-ae06-4ca8-b658-e011632cdb33",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'enter your sentence'"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ice_lookup('','Sprawdź ustawienia sieciowe','Drukarka jest wyłączona')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "d34415a4-d853-435e-b093-fabc4629ff26",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'no context'"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"ice_lookup('Wciśnij przycisk Enter','','Drukarka jest wyłączona')"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -141,7 +230,13 @@
|
||||
"id": "graduate-theorem",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"### Odpowiedź:\n",
|
||||
"- 1. **spełnia warunek**: dzięki zastosowaniu funkcji `abs()`\n",
|
||||
"- 3. **spełnia warunek**: przemienność w tym przypadku również zawdzięczamy funkcji `abs()`\n",
|
||||
"- 4. **spełnia warunek**: (z uproszczeniem, że x, y i z oznaczają len() odpowiednich zdań) -> |y-x| + |z-y| >= |(y-x) + (z-y)| =\n",
|
||||
"- = |y - x + z - y| = |z-x|\n",
|
||||
"\n",
|
||||
"Warunek 2 nie jest spełniony: odległość powinna być równa 0 tylko wtedy, gdy x i y są tymi samymi zdaniami, a wyżej wymieniona funkcja zwraca 0 dla wszystkich zdań, które mają taką samą liczbę znaków."
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -179,7 +274,11 @@
|
||||
"id": "metallic-leave",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"### Odpowiedź:\n",
|
||||
"- 1. **spełnia warunek**, ponieważ zwróci wartość 0 lub 3 które są >= 0\n",
|
||||
"- 2. **spełnia warunek**, ponieważ gdy zdanie x jest takie samo jak y, to odległość jest zwracana jako 0\n",
|
||||
"- 3. **spełnia warunek**, ponieważ sprawdzenie wygląda w taki sposób, że porównujemy, czy x == y (co jest tożsame z y == x), a w przeciwnym wypadku zawsze zwracamy tę samą wartość\n",
|
||||
"- 4. **spełnia warunek**, ponieważ gdy x, y i z są takie same, to mamy 0>=0 | gdy wszystkie są różne, to mamy 6>=3 | gdy dokładnie jedna para jest równa: 6>=0 (x = z) lub 3>=3 (x = y albo y = z)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -206,7 +305,12 @@
|
||||
"id": "bibliographic-stopping",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"### Odpowiedź: Jest funkcją dystansu\n",
|
||||
"- 1. **spełnia warunek** Liczba wymaganych operacji edycyjnych nie może być mniejsza niż zero. W przypadku gdy zdania są sobie równe d(x,y) = 0\n",
|
||||
"- 2. **spełnia warunek** Gdy zdania są sobie równe d(x,y) = 0\n",
|
||||
"- 3. **spełnia warunek** nieważne, czy zrobimy d(x,y) czy d(y,x), nadal liczba operacji edycyjnych będzie taka sama pa->papa (+2) | papa -> pa (-2)\n",
|
||||
"- 4. **spełnia warunek**: d(x,y) + d(y,z) >= d(x,z), ponieważ operacje edycyjne zamieniające x w y,\n",
|
||||
"- a po nich operacje zamieniające y w z, razem zamieniają x w z, więc d(x,z) nie może być większe od tej sumy"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -223,7 +327,38 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 1,
|
||||
"id": "727b188d-eedd-4d19-9cbf-efcce71e145c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Defaulting to user installation because normal site-packages is not writeable\n",
|
||||
"Collecting python-Levenshtein\n",
|
||||
" Downloading python_Levenshtein-0.25.1-py3-none-any.whl.metadata (3.7 kB)\n",
|
||||
"Collecting Levenshtein==0.25.1 (from python-Levenshtein)\n",
|
||||
" Downloading Levenshtein-0.25.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)\n",
|
||||
"Collecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein==0.25.1->python-Levenshtein)\n",
|
||||
" Downloading rapidfuzz-3.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n",
|
||||
"Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)\n",
|
||||
"Downloading Levenshtein-0.25.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m177.4/177.4 kB\u001b[0m \u001b[31m3.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading rapidfuzz-3.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.4/3.4 MB\u001b[0m \u001b[31m40.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein\n",
|
||||
"Successfully installed Levenshtein-0.25.1 python-Levenshtein-0.25.1 rapidfuzz-3.8.1\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pip3 install python-Levenshtein"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"id": "secondary-wrist",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -233,7 +368,7 @@
|
||||
"2"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -254,7 +389,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 18,
|
||||
"id": "associate-tuner",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -273,7 +408,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 3,
|
||||
"id": "focal-pathology",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -283,7 +418,7 @@
|
||||
"0.9166666666666666"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -294,7 +429,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 4,
|
||||
"id": "roman-ceiling",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -304,7 +439,7 @@
|
||||
"0.9428571428571428"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -315,7 +450,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 5,
|
||||
"id": "invisible-cambodia",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -325,7 +460,7 @@
|
||||
"0.631578947368421"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -344,21 +479,80 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 37,
|
||||
"id": "genetic-cradle",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def fuzzy_lookup(sentence, threshold):\n",
|
||||
" return []"
|
||||
" \n",
|
||||
" answer = []\n",
|
||||
" \n",
|
||||
" for entry in translation_memory:\n",
|
||||
" if levenshtein_similarity(sentence.lower(),entry[0].lower()) >= threshold:\n",
|
||||
" answer.append(entry[1])\n",
|
||||
" \n",
|
||||
" return answer"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"id": "cc0544a4-a515-4515-a116-f13b96e92812",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Press the ENTER button']"
|
||||
]
|
||||
},
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#'Wciśnij przycisk Enter'\n",
|
||||
"fuzzy_lookup('KlikNiJ przycisK EnTeR', 0.86)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"id": "e2b8ff91-a103-45a4-a746-8ce3e9470c4c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['Check the network settings']"
|
||||
]
|
||||
},
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#'Sprawdź ustawienia sieciowe'\n",
|
||||
"fuzzy_lookup('Sprawdź ustawienia sieci', 0.885)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "df759469-b92e-490c-a672-96bd4c0d76b2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"author": "Rafał Jaworski",
|
||||
"email": "rjawor@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@ -373,7 +567,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.9.2"
|
||||
},
|
||||
"subtitle": "2. Zaawansowane użycie pamięci tłumaczeń",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
371
lab/lab_03.ipynb
371
lab/lab_03.ipynb
@ -20,6 +20,13 @@
|
||||
"id": "aggregate-listing",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"```python\n",
|
||||
"import collections\n",
|
||||
"lista1 = [3,4,5,4,4,7,8,7]\n",
|
||||
"lista2 = [3,4,5,4,4,7,8,7]\n",
|
||||
"print((collections.Counter(lista1) + collections.Counter(lista2)).most_common(5))\n",
|
||||
"```\n",
|
||||
"\n",
|
||||
"Na dzisiejszych zajęciach zajmiemy się bliżej słownikami używanymi do wspomagania tłumaczenia. Oczywiście na rynku dostępnych jest bardzo wiele słowników w formacie elektronicznym. Wiele z nich jest gotowych do użycia w SDL Trados, memoQ i innych narzędziach CAT. Zawierają one setki tysięcy lub miliony haseł i oferują natychmiastową pomoc tłumaczowi."
|
||||
]
|
||||
},
|
||||
@ -63,7 +70,12 @@
|
||||
"id": "diverse-sunglasses",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Odpowiedź:"
|
||||
"### Odpowiedź:\n",
|
||||
"- **DeepL:** metal cabinet slides / metal cabinet guides\n",
|
||||
"- **Model GPT-3.5:** metal cabinet slides / metal wardrobe rails.\n",
|
||||
"- **Model GPT-4:** guides for metal cabinets / metal cabinet guides\n",
|
||||
"- **Google-translate**: metal cabinet guides\n",
|
||||
"- **www.tlumaczangielskopolski.pl:** metal cabinet guides\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -86,7 +98,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": 70,
|
||||
"id": "loving-prince",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -110,12 +122,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 71,
|
||||
"id": "bound-auction",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dictionary = ['program', 'application', 'applet' 'compile']"
|
||||
"dictionary = ['program', 'application', 'applet', 'compile']"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -128,13 +140,46 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 76,
|
||||
"id": "cognitive-cedar",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'applet': [(302, 308)],\n",
|
||||
" 'application': [(80, 91), (153, 84), (300, 158)],\n",
|
||||
" 'compile': [(56, 63), (497, 448)],\n",
|
||||
" 'program': [(14, 21), (284, 277), (454, 177), (495, 48), (505, 17)]}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"from pprint import pprint\n",
|
||||
"\n",
|
||||
"def terminology_lookup():\n",
|
||||
" return []"
|
||||
" answer = {pattern:[] for pattern in dictionary}\n",
|
||||
" low_text = text.lower()\n",
|
||||
" for pattern in dictionary:\n",
|
||||
" offset = 0\n",
|
||||
" start = 0\n",
|
||||
" end = 0\n",
|
||||
" while True:\n",
|
||||
" match = (re.search(pattern,low_text[offset:]))\n",
|
||||
" if not match:\n",
|
||||
" break\n",
|
||||
" else:\n",
|
||||
" start += match.start()\n",
|
||||
" end = +match.end()\n",
|
||||
" offset += end\n",
|
||||
"\n",
|
||||
" answer[pattern].append((start,end))\n",
|
||||
" pprint(answer)\n",
|
||||
" #return answer\n",
|
||||
"\n",
|
||||
"terminology_lookup()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -161,7 +206,113 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": 1,
|
||||
"id": "02e1c16f-be37-4a64-a514-8875b393ccb7",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Defaulting to user installation because normal site-packages is not writeable\n",
|
||||
"Requirement already satisfied: spacy in /usr/local/lib/python3.9/dist-packages (3.4.1)\n",
|
||||
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.9 in /usr/local/lib/python3.9/dist-packages (from spacy) (3.0.10)\n",
|
||||
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (1.0.3)\n",
|
||||
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (1.0.8)\n",
|
||||
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.9/dist-packages (from spacy) (2.0.6)\n",
|
||||
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.9/dist-packages (from spacy) (3.0.7)\n",
|
||||
"Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (8.1.1)\n",
|
||||
"Requirement already satisfied: wasabi<1.1.0,>=0.9.1 in /usr/local/lib/python3.9/dist-packages (from spacy) (0.10.1)\n",
|
||||
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.9/dist-packages (from spacy) (2.4.4)\n",
|
||||
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.9/dist-packages (from spacy) (2.0.8)\n",
|
||||
"Requirement already satisfied: typer<0.5.0,>=0.3.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (0.4.2)\n",
|
||||
"Requirement already satisfied: pathy>=0.3.5 in /usr/local/lib/python3.9/dist-packages (from spacy) (0.6.2)\n",
|
||||
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (4.64.1)\n",
|
||||
"Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (1.21.6)\n",
|
||||
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (2.28.1)\n",
|
||||
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.10.0,>=1.7.4 in /usr/local/lib/python3.9/dist-packages (from spacy) (1.9.2)\n",
|
||||
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.9/dist-packages (from spacy) (3.1.2)\n",
|
||||
"Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from spacy) (52.0.0)\n",
|
||||
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (21.3)\n",
|
||||
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.9/dist-packages (from spacy) (3.3.0)\n",
|
||||
"Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/lib/python3/dist-packages (from packaging>=20.0->spacy) (2.4.7)\n",
|
||||
"Requirement already satisfied: smart-open<6.0.0,>=5.2.1 in /usr/local/lib/python3.9/dist-packages (from pathy>=0.3.5->spacy) (5.2.1)\n",
|
||||
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.9/dist-packages (from pydantic!=1.8,!=1.8.1,<1.10.0,>=1.7.4->spacy) (4.3.0)\n",
|
||||
"Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.1.1)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4)\n",
|
||||
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.26.12)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2022.9.14)\n",
|
||||
"Requirement already satisfied: blis<0.10.0,>=0.7.8 in /usr/local/lib/python3.9/dist-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.9.1)\n",
|
||||
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.9/dist-packages (from thinc<8.2.0,>=8.1.0->spacy) (0.0.1)\n",
|
||||
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.9/dist-packages (from typer<0.5.0,>=0.3.0->spacy) (8.1.3)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from jinja2->spacy) (2.1.1)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pip3 install spacy"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f6d7e9f5-4d6f-49c5-8dea-9957bc6da318",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Defaulting to user installation because normal site-packages is not writeable\n",
|
||||
"\u001b[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl#egg=en_core_web_sm==3.4.1 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0mCollecting en-core-web-sm==3.4.1\n",
|
||||
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.8/12.8 MB\u001b[0m \u001b[31m45.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: spacy<3.5.0,>=3.4.0 in /usr/local/lib/python3.9/dist-packages (from en-core-web-sm==3.4.1) (3.4.1)\n",
|
||||
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.9 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.0.10)\n",
|
||||
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.0.3)\n",
|
||||
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.0.8)\n",
|
||||
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.0.6)\n",
|
||||
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.0.7)\n",
|
||||
"Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (8.1.1)\n",
|
||||
"Requirement already satisfied: wasabi<1.1.0,>=0.9.1 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.10.1)\n",
|
||||
"Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.4.4)\n",
|
||||
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.0.8)\n",
|
||||
"Requirement already satisfied: typer<0.5.0,>=0.3.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.4.2)\n",
|
||||
"Requirement already satisfied: pathy>=0.3.5 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.6.2)\n",
|
||||
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (4.64.1)\n",
|
||||
"Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.21.6)\n",
|
||||
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.28.1)\n",
|
||||
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.10.0,>=1.7.4 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.9.2)\n",
|
||||
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.1.2)\n",
|
||||
"Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (52.0.0)\n",
|
||||
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (21.3)\n",
|
||||
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /usr/local/lib/python3.9/dist-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.3.0)\n",
|
||||
"Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/lib/python3/dist-packages (from packaging>=20.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.4.7)\n",
|
||||
"Requirement already satisfied: smart-open<6.0.0,>=5.2.1 in /usr/local/lib/python3.9/dist-packages (from pathy>=0.3.5->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (5.2.1)\n",
|
||||
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.9/dist-packages (from pydantic!=1.8,!=1.8.1,<1.10.0,>=1.7.4->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (4.3.0)\n",
|
||||
"Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.1.1)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.4)\n",
|
||||
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.26.12)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/dist-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2022.9.14)\n",
|
||||
"Requirement already satisfied: blis<0.10.0,>=0.7.8 in /usr/local/lib/python3.9/dist-packages (from thinc<8.2.0,>=8.1.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.9.1)\n",
|
||||
"Requirement already satisfied: confection<1.0.0,>=0.0.1 in /usr/local/lib/python3.9/dist-packages (from thinc<8.2.0,>=8.1.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.0.1)\n",
|
||||
"Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/lib/python3.9/dist-packages (from typer<0.5.0,>=0.3.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (8.1.3)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.9/dist-packages (from jinja2->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.1.1)\n",
|
||||
"Installing collected packages: en-core-web-sm\n",
|
||||
"Successfully installed en-core-web-sm-3.4.1\n",
|
||||
"\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
|
||||
"You can now load the package via spacy.load('en_core_web_sm')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"python3 -m spacy download en_core_web_sm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "tribal-attention",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
@ -232,7 +383,7 @@
|
||||
"be\n",
|
||||
"the\n",
|
||||
"step\n",
|
||||
"-PRON-\n",
|
||||
"you\n",
|
||||
"need\n",
|
||||
"to\n",
|
||||
"follow\n",
|
||||
@ -248,7 +399,7 @@
|
||||
"platform\n",
|
||||
",\n",
|
||||
"if\n",
|
||||
"-PRON-\n",
|
||||
"you\n",
|
||||
"have\n",
|
||||
"not\n",
|
||||
"already\n",
|
||||
@ -260,7 +411,7 @@
|
||||
"program\n",
|
||||
"that\n",
|
||||
"use\n",
|
||||
"Swing\n",
|
||||
"swing\n",
|
||||
"component\n",
|
||||
".\n",
|
||||
"compile\n",
|
||||
@ -302,13 +453,37 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 20,
|
||||
"id": "surgical-demonstration",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'program': [(14, 24), (291, 298), (468, 475), (516, 523), (533, 540)],\n",
|
||||
" 'application': [(80, 91), (164, 175), (322, 333)],\n",
|
||||
" 'applet': [(302, 308)],\n",
|
||||
" 'compile': [(56, 63), (134, 141), (504, 511)]}"
|
||||
]
|
||||
},
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import re\n",
|
||||
"\n",
|
||||
"def terminology_lookup():\n",
|
||||
" return []"
|
||||
" answer = {pattern:[] for pattern in dictionary}\n",
|
||||
"\n",
|
||||
" for pattern in dictionary:\n",
|
||||
" for token in doc:\n",
|
||||
" if pattern in token.lemma_:\n",
|
||||
" answer[pattern].append((token.idx,token.idx+len(token.lemma_)))\n",
|
||||
" return answer\n",
|
||||
"\n",
|
||||
"terminology_lookup()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -337,13 +512,52 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 73,
|
||||
"id": "superb-butterfly",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['programmers',\n",
|
||||
" 'section',\n",
|
||||
" 'Swing',\n",
|
||||
" 'application',\n",
|
||||
" 'command',\n",
|
||||
" 'line',\n",
|
||||
" 'information',\n",
|
||||
" 'Swing',\n",
|
||||
" 'application',\n",
|
||||
" 'compilation',\n",
|
||||
" 'instructions',\n",
|
||||
" 'Swing',\n",
|
||||
" 'programs',\n",
|
||||
" 'applets',\n",
|
||||
" 'applications',\n",
|
||||
" 'steps',\n",
|
||||
" 'release',\n",
|
||||
" 'platform',\n",
|
||||
" 'program',\n",
|
||||
" 'Swing',\n",
|
||||
" 'components',\n",
|
||||
" 'program',\n",
|
||||
" 'program']"
|
||||
]
|
||||
},
|
||||
"execution_count": 73,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import spacy\n",
|
||||
"def get_nouns(text):\n",
|
||||
" return []"
|
||||
" nlp = spacy.load(\"en_core_web_sm\")\n",
|
||||
" doc = nlp(text)\n",
|
||||
" nouns = [token.text for token in doc if token.pos_ == \"NOUN\"]\n",
|
||||
" return nouns\n",
|
||||
"\n",
|
||||
"get_nouns(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -374,13 +588,51 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": 74,
|
||||
"id": "eight-redhead",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'programmer': 1,\n",
|
||||
" 'section': 1,\n",
|
||||
" 'swing': 4,\n",
|
||||
" 'application': 3,\n",
|
||||
" 'command': 1,\n",
|
||||
" 'line': 1,\n",
|
||||
" 'information': 1,\n",
|
||||
" 'compilation': 1,\n",
|
||||
" 'instruction': 1,\n",
|
||||
" 'program': 4,\n",
|
||||
" 'applet': 1,\n",
|
||||
" 'step': 1,\n",
|
||||
" 'release': 1,\n",
|
||||
" 'platform': 1,\n",
|
||||
" 'component': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 74,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from collections import Counter\n",
|
||||
"import spacy\n",
|
||||
"\n",
|
||||
"def extract_terms(text):\n",
|
||||
" return []"
|
||||
" nlp = spacy.load(\"en_core_web_sm\")\n",
|
||||
" doc = nlp(text)\n",
|
||||
" tally = {}\n",
|
||||
" nouns = [token.lemma_ for token in doc if token.pos_ == \"NOUN\"]\n",
|
||||
" nouns_counts = Counter(nouns)\n",
|
||||
" \n",
|
||||
" for word, count in nouns_counts.items():\n",
|
||||
" tally.update({word: count})\n",
|
||||
" return tally\n",
|
||||
"\n",
|
||||
"extract_terms(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -393,13 +645,82 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 75,
|
||||
"id": "monetary-mambo",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'adjectives': {'late': 1},\n",
|
||||
" 'nouns': {'applet': 1,\n",
|
||||
" 'application': 3,\n",
|
||||
" 'command': 1,\n",
|
||||
" 'compilation': 1,\n",
|
||||
" 'component': 1,\n",
|
||||
" 'information': 1,\n",
|
||||
" 'instruction': 1,\n",
|
||||
" 'line': 1,\n",
|
||||
" 'platform': 1,\n",
|
||||
" 'program': 4,\n",
|
||||
" 'programmer': 1,\n",
|
||||
" 'release': 1,\n",
|
||||
" 'section': 1,\n",
|
||||
" 'step': 1,\n",
|
||||
" 'swing': 4},\n",
|
||||
" 'verbs': {'compile': 3,\n",
|
||||
" 'create': 1,\n",
|
||||
" 'do': 1,\n",
|
||||
" 'explain': 1,\n",
|
||||
" 'follow': 1,\n",
|
||||
" 'install': 1,\n",
|
||||
" 'need': 1,\n",
|
||||
" 'run': 3,\n",
|
||||
" 'see': 1,\n",
|
||||
" 'use': 2,\n",
|
||||
" 'work': 1}}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from pprint import pprint\n",
|
||||
"from collections import Counter\n",
|
||||
"import spacy\n",
|
||||
"\n",
|
||||
"def extract_terms(text):\n",
|
||||
" return []"
|
||||
" \n",
|
||||
" nlp = spacy.load(\"en_core_web_sm\")\n",
|
||||
" doc = nlp(text)\n",
|
||||
" \n",
|
||||
" nouns, verbs, adjectives = [], [], []\n",
|
||||
" tally = {\"nouns\": {}, \"verbs\": {}, \"adjectives\": {}}\n",
|
||||
" \n",
|
||||
" for token in doc:\n",
|
||||
" if token.pos_ == \"NOUN\":\n",
|
||||
" nouns.append(token.lemma_)\n",
|
||||
" elif token.pos_ == \"VERB\":\n",
|
||||
" verbs.append(token.lemma_)\n",
|
||||
" elif token.pos_ == \"ADJ\":\n",
|
||||
" adjectives.append(token.lemma_)\n",
|
||||
" \n",
|
||||
" nouns_counts = Counter(nouns)\n",
|
||||
" verbs_counts = Counter(verbs)\n",
|
||||
" adjectives_counts = Counter(adjectives)\n",
|
||||
"\n",
|
||||
" for word, count in nouns_counts.items():\n",
|
||||
" tally[\"nouns\"].update({word: count})\n",
|
||||
" \n",
|
||||
" for word, count in verbs_counts.items():\n",
|
||||
" tally[\"verbs\"].update({word: count})\n",
|
||||
" \n",
|
||||
" for word, count in adjectives_counts.items():\n",
|
||||
" tally[\"adjectives\"].update({word: count})\n",
|
||||
"\n",
|
||||
" pprint(tally)\n",
|
||||
" #return tally\n",
|
||||
"\n",
|
||||
"extract_terms(text)"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -407,7 +728,7 @@
|
||||
"author": "Rafał Jaworski",
|
||||
"email": "rjawor@amu.edu.pl",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@ -422,7 +743,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.10"
|
||||
"version": "3.9.2"
|
||||
},
|
||||
"subtitle": "3. Terminologia",
|
||||
"title": "Komputerowe wspomaganie tłumaczenia",
|
||||
|
Loading…
Reference in New Issue
Block a user