Adam Stelmaszyk 2024-05-04 14:59:05 +02:00
parent ff131152f6
commit 7fd3eb01b3
3 changed files with 837 additions and 22 deletions

BIN  lab/.DS_Store (vendored): binary file not shown.
BIN  lab/data/.DS_Store (vendored, new file): binary file not shown.


@@ -58,12 +58,58 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "moving-clothing",
+   "id": "d4f068df",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/anaconda3/lib/python3.11/site-packages/nltk/translate/bleu_score.py:552: UserWarning: \n",
+      "The hypothesis contains 0 counts of 3-gram overlaps.\n",
+      "Therefore the BLEU score evaluates to 0, independently of\n",
+      "how many N-gram overlaps of lower order it contains.\n",
+      "Consider using lower n-gram order or use SmoothingFunction()\n",
+      "  warnings.warn(_msg)\n",
+      "/opt/anaconda3/lib/python3.11/site-packages/nltk/translate/bleu_score.py:552: UserWarning: \n",
+      "The hypothesis contains 0 counts of 4-gram overlaps.\n",
+      "Therefore the BLEU score evaluates to 0, independently of\n",
+      "how many N-gram overlaps of lower order it contains.\n",
+      "Consider using lower n-gram order or use SmoothingFunction()\n",
+      "  warnings.warn(_msg)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "3.984587822441638e-156"
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
+    "import zipfile\n",
+    "import nltk.translate.bleu_score as bleu\n",
+    "import string\n",
+    "\n",
+    "def remove_punctuation(text):\n",
+    "    text_without_punctuations = text.translate(str.maketrans('', '', string.punctuation))\n",
+    "    sentences = text_without_punctuations.split('\\n')\n",
+    "    return [[word.lower() for word in sentence.split()] for sentence in sentences if sentence != '']\n",
+    "\n",
     "def calculate_bleu():\n",
-    "    return 0"
+    "    zip_file = zipfile.ZipFile('data/corpus_corrected.zip')\n",
+    "    files = {name: remove_punctuation(zip_file.read(name).decode('utf-8'))\n",
+    "             for name in zip_file.namelist()}\n",
+    "\n",
+    "    corpus_de_human, corpus_de_nmt = files['corpus_de_human.txt'], files['corpus_de_nmt.txt']\n",
+    "\n",
+    "    return bleu.corpus_bleu(corpus_de_human, corpus_de_nmt)\n",
+    "\n",
+    "calculate_bleu()"
    ]
   },
   {
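
A note on the vanishing score recorded above (3.98e-156): NLTK's corpus_bleu expects a list of reference lists, one list of candidate references per hypothesis. Passing corpus_de_human directly hands each tokenized sentence over as if its individual words were separate references, which, together with unsmoothed 4-gram scoring (the source of the UserWarnings), drives the score toward zero. A minimal sketch of the intended call shape, on made-up toy sentences:

    from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

    # One hypothesis and, for it, one list of tokenized references (toy data).
    references = [[['the', 'cat', 'sits', 'on', 'the', 'mat']]]
    hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat']]

    # Smoothing avoids the hard zero when a higher n-gram order has no overlap.
    score = corpus_bleu(references, hypotheses,
                        smoothing_function=SmoothingFunction().method1)
    print(score)

Applied to the cell above, the call would become bleu.corpus_bleu([[ref] for ref in corpus_de_human], corpus_de_nmt).
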
@@ -76,13 +122,50 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "lasting-rolling",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0 to 100 - 4.97555004481153e-232\n",
+      "500 to 600 - 5.956707985683837e-232\n",
+      "800 to 900 - 4.774461089627919e-232\n",
+      "200 to 300 - 5.56331772444502e-232\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/anaconda3/lib/python3.11/site-packages/nltk/translate/bleu_score.py:552: UserWarning: \n",
+      "The hypothesis contains 0 counts of 2-gram overlaps.\n",
+      "Therefore the BLEU score evaluates to 0, independently of\n",
+      "how many N-gram overlaps of lower order it contains.\n",
+      "Consider using lower n-gram order or use SmoothingFunction()\n",
+      "  warnings.warn(_msg)\n"
+     ]
+    }
+   ],
    "source": [
-    "def analyze_bleu():\n",
-    "    return []"
+    "def analyze_bleu(start_sentence_index, finish_sentence_index):\n",
+    "    zip_file = zipfile.ZipFile('data/corpus_corrected.zip')\n",
+    "    files = {name: remove_punctuation(zip_file.read(name).decode('utf-8'))\n",
+    "             for name in zip_file.namelist()}\n",
+    "\n",
+    "    corpus_de_human = files['corpus_de_human.txt'][start_sentence_index:finish_sentence_index]\n",
+    "    corpus_de_nmt = files['corpus_de_nmt.txt'][start_sentence_index:finish_sentence_index]\n",
+    "\n",
+    "    return bleu.corpus_bleu(corpus_de_human, corpus_de_nmt)\n",
+    "\n",
+    "print(\"0 to 100 - \" + str(analyze_bleu(0, 100)))\n",
+    "print(\"500 to 600 - \" + str(analyze_bleu(500, 600)))\n",
+    "print(\"800 to 900 - \" + str(analyze_bleu(800, 900)))\n",
+    "print(\"200 to 300 - \" + str(analyze_bleu(200, 300)))"
    ]
   },
   {
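
The slice comparison above reports one corpus-level number per range; per-segment variation is easier to see with sentence_bleu, which scores each aligned pair individually. A small sketch on invented token lists:

    from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

    smooth = SmoothingFunction().method1  # keeps short sentences from scoring 0
    pairs = [  # hypothetical (reference, hypothesis) token lists
        (['guten', 'morgen'], ['guten', 'tag']),
        (['wie', 'geht', 'es', 'ihnen'], ['wie', 'geht', 'es', 'ihnen']),
    ]
    for ref, hyp in pairs:
        print(sentence_bleu([ref], hyp, smoothing_function=smooth))
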
@@ -102,6 +185,12 @@
    " * N - the number of words in the reference translation (N=S+D+C)"
   ]
  },
+ {
+  "cell_type": "markdown",
+  "id": "fb4f02ae",
+  "metadata": {},
+  "source": []
+ },
  {
   "cell_type": "markdown",
   "id": "conscious-cookbook",
@@ -120,13 +209,39 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 25,
    "id": "occupied-swing",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.17355216569308377"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
+    "from jiwer import wer\n",
+    "import zipfile\n",
+    "\n",
     "def calculate_wer():\n",
-    "    return 0"
+    "    ourZip = zipfile.ZipFile('data/corpus_corrected.zip')\n",
+    "    files = {name: remove_punctuation(ourZip.read(name).decode('utf-8'))\n",
+    "             for name in ourZip.namelist()}\n",
+    "\n",
+    "    corpus_de_human, corpus_de_nmt = files['corpus_de_human.txt'], files['corpus_de_nmt.txt']\n",
+    "\n",
+    "    sum_wer = 0\n",
+    "    for human_sent, nmt_sent in zip(corpus_de_human, corpus_de_nmt):\n",
+    "        sum_wer += wer(\" \".join(human_sent), \" \".join(nmt_sent))\n",
+    "\n",
+    "    return sum_wer / len(corpus_de_human)\n",
+    "\n",
+    "calculate_wer()"
    ]
   },
   {
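
For reference, jiwer's wer() operates on plain strings, which is why the cell above joins the token lists back together before scoring. A minimal sketch on invented sentences:

    from jiwer import wer

    reference  = "der schnelle braune fuchs"   # hypothetical reference sentence
    hypothesis = "der schnelle fuchs"          # hypothetical MT output
    print(wer(reference, hypothesis))          # 0.25: one deletion over four words
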
@@ -147,13 +262,38 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 35,
    "id": "immediate-element",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2.653"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
+    "import Levenshtein\n",
+    "\n",
     "def calculate_levenshtein():\n",
-    "    return 0"
+    "    ourZip = zipfile.ZipFile('data/corpus_corrected.zip')\n",
+    "    files = {name: remove_punctuation(ourZip.read(name).decode('utf-8'))\n",
+    "             for name in ourZip.namelist()}\n",
+    "\n",
+    "    corpus_de_human, corpus_de_nmt = files['corpus_de_human.txt'], files['corpus_de_nmt.txt']\n",
+    "\n",
+    "    sum_distance = 0\n",
+    "    for human_element, nmt_element in zip(corpus_de_human, corpus_de_nmt):\n",
+    "        sum_distance += Levenshtein.distance(human_element, nmt_element)\n",
+    "\n",
+    "    return sum_distance / len(corpus_de_human)\n",
+    "\n",
+    "calculate_levenshtein()"
    ]
   },
   {
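
Levenshtein.distance is applied above to token lists, so the reported 2.653 is an average number of word-level edits per sentence. That relies on the newer rapidfuzz-backed Levenshtein package accepting arbitrary sequences (an assumption about the installed version); the classic usage is character-level on strings. A short sketch of both on invented inputs:

    import Levenshtein

    # Character-level: classic edit distance between two strings.
    print(Levenshtein.distance("kitten", "sitting"))                    # 3

    # Word-level: distance between token sequences, as the cell above uses it
    # (supported by newer package versions).
    print(Levenshtein.distance(["guten", "morgen"], ["guten", "tag"]))  # 1
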
@@ -177,28 +317,700 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 2,
    "id": "descending-easter",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "press\n",
+      "Invalid Word\n",
+      "None\n",
+      "\n",
+      "\n",
+      "h\n",
+      "Invalid Word\n",
+      "None\n",
+      "\n",
+      "\n",
      [... the same "Invalid Word" / "None" block repeats here for every remaining
      word of the sampled English sentences ("while", "in", "review", "mode",
      ..., "or"); PyDictionary's translate() fails on all of them; several
      hundred lines truncated ...]
+      "outside\n"
+     ]
+    }
+   ],
"source": [ "source": [
"from PyDictionary import PyDictionary\n",
"import zipfile\n",
"\n",
"def transalate(word_list):\n",
" transalted_words = {}\n",
" for word in word_list:\n",
" translation = PyDictionary().translate(word, 'German')\n",
" if translation:\n",
" transalted_words[word] = translation\n",
"\n",
" return transalted_words\n",
"\n",
"def analyze_translations():\n", "def analyze_translations():\n",
" return []" " ourZip = zipfile.ZipFile('data/corpus_corrected.zip')\n",
" files = {name: remove_punctuation(ourZip.read(name).decode('utf-8'))\n",
" for name in ourZip.namelist()}\n",
" \n",
" corpus_de_human, corpus_de_nmt, corpus_en = files['corpus_de_human.txt'], files['corpus_de_nmt.txt'], files['corpus_en.txt']\n",
"\n",
" for human_element, nmt_element, element in zip(corpus_de_human, corpus_de_nmt, corpus_en):\n",
" transalted_words = transalate(element)\n",
"\n",
" words = set(re.findall(r'\\w+', sentence.lower()))\n",
" matches = sum(1 for word in words if translations.get(word.lower()))\n",
" # tranlsations = [PyDictionary().translate(word, 'de') for word in element]\n",
" \n",
"\n",
"\n",
"analyze_translations()"
] ]
   }
  ],
  "metadata": {
   "author": "Rafał Jaworski",
   "email": "rjawor@amu.edu.pl",
-  "lang": "pl",
-  "subtitle": "8. Wykorzystanie tłumaczenia automatycznego we wspomaganiu tłumaczenia",
-  "title": "Komputerowe wspomaganie tłumaczenia",
-  "year": "2021",
   "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
+  "lang": "pl",
   "language_info": {
    "codemirror_mode": {
     "name": "ipython",
@@ -209,8 +1021,11 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  }
+   "version": "3.11.7"
+  },
+  "subtitle": "8. Wykorzystanie tłumaczenia automatycznego we wspomaganiu tłumaczenia",
+  "title": "Komputerowe wspomaganie tłumaczenia",
+  "year": "2021"
  },
 "nbformat": 4,
 "nbformat_minor": 5