diff --git a/lab/.DS_Store b/lab/.DS_Store
index 2ad125d..66aca8f 100644
Binary files a/lab/.DS_Store and b/lab/.DS_Store differ
diff --git a/lab/data/.DS_Store b/lab/data/.DS_Store
new file mode 100644
index 0000000..848faaf
Binary files /dev/null and b/lab/data/.DS_Store differ
diff --git a/lab/lab_08.ipynb b/lab/lab_08.ipynb
index 569c392..ae86cd2 100644
--- a/lab/lab_08.ipynb
+++ b/lab/lab_08.ipynb
@@ -58,12 +58,58 @@
 {
 "cell_type": "code",
 "execution_count": 1,
- "id": "moving-clothing",
+ "id": "d4f068df",
 "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/anaconda3/lib/python3.11/site-packages/nltk/translate/bleu_score.py:552: UserWarning: \n",
+ "The hypothesis contains 0 counts of 3-gram overlaps.\n",
+ "Therefore the BLEU score evaluates to 0, independently of\n",
+ "how many N-gram overlaps of lower order it contains.\n",
+ "Consider using lower n-gram order or use SmoothingFunction()\n",
+ "  warnings.warn(_msg)\n",
+ "/opt/anaconda3/lib/python3.11/site-packages/nltk/translate/bleu_score.py:552: UserWarning: \n",
+ "The hypothesis contains 0 counts of 4-gram overlaps.\n",
+ "Therefore the BLEU score evaluates to 0, independently of\n",
+ "how many N-gram overlaps of lower order it contains.\n",
+ "Consider using lower n-gram order or use SmoothingFunction()\n",
+ "  warnings.warn(_msg)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "3.984587822441638e-156"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
 "source": [
+ "import zipfile\n",
+ "import string\n",
+ "import nltk.translate.bleu_score as bleu\n",
+ "\n",
+ "def remove_punctuation(text):\n",
+ "    # strip punctuation, then lowercase and tokenize every non-empty line\n",
+ "    text_without_punctuation = text.translate(str.maketrans('', '', string.punctuation))\n",
+ "    sentences = text_without_punctuation.split('\\n')\n",
+ "    return [[word.lower() for word in sentence.split()] for sentence in sentences if sentence != '']\n",
+ "\n",
 "def calculate_bleu():\n",
- " return 0"
+ "    archive = zipfile.ZipFile('data/corpus_corrected.zip')  # don't shadow the built-in zip()\n",
+ "    files = {name: remove_punctuation(archive.read(name).decode('utf-8'))\n",
+ "             for name in archive.namelist()}\n",
+ "\n",
+ "    corpus_de_human, corpus_de_nmt = files['corpus_de_human.txt'], files['corpus_de_nmt.txt']\n",
+ "\n",
+ "    # corpus_bleu expects one *list of references* per hypothesis,\n",
+ "    # so each human sentence is wrapped in a singleton list\n",
+ "    return bleu.corpus_bleu([[ref] for ref in corpus_de_human], corpus_de_nmt)\n",
+ "\n",
+ "calculate_bleu()"
 ]
 },
 {
@@ -76,13 +122,50 @@
 },
 {
 "cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
 "id": "lasting-rolling",
 "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0 to 100 - 4.97555004481153e-232\n",
+ "500 to 600 - 5.956707985683837e-232\n",
+ "800 to 900 - 4.774461089627919e-232\n",
+ "200 to 300 - 5.56331772444502e-232\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/anaconda3/lib/python3.11/site-packages/nltk/translate/bleu_score.py:552: UserWarning: \n",
+ "The hypothesis contains 0 counts of 2-gram overlaps.\n",
+ "Therefore the BLEU score evaluates to 0, independently of\n",
+ "how many N-gram overlaps of lower order it contains.\n",
+ "Consider using lower n-gram order or use SmoothingFunction()\n",
+ "  warnings.warn(_msg)\n"
+ ]
+ }
+ ],
 "source": [
- "def analyze_bleu():\n",
- " return []"
+ "def analyze_bleu(start_sentence_index, finish_sentence_index):\n",
+ "    archive = zipfile.ZipFile('data/corpus_corrected.zip')\n",
+ "    files = {name: remove_punctuation(archive.read(name).decode('utf-8'))\n",
+ "             for name in archive.namelist()}\n",
+ "\n",
+ "    corpus_de_human = files['corpus_de_human.txt'][start_sentence_index:finish_sentence_index]\n",
+ "    corpus_de_nmt = files['corpus_de_nmt.txt'][start_sentence_index:finish_sentence_index]\n",
+ "\n",
+ "    return bleu.corpus_bleu([[ref] for ref in corpus_de_human], corpus_de_nmt)\n",
+ "\n",
+ "print(\"0 to 100 - \" + str(analyze_bleu(0, 100)))\n",
+ "print(\"500 to 600 - \" + str(analyze_bleu(500, 600)))\n",
+ "print(\"800 to 900 - \" + str(analyze_bleu(800, 900)))\n",
+ "print(\"200 to 300 - \" + str(analyze_bleu(200, 300)))"
 ]
 },
 {
@@ -102,6 +185,12 @@
 " * N - liczba słów w tłumaczeniu referencyjnym (N=S+D+C)"
 ]
 },
+ {
+ "cell_type": "markdown",
+ "id": "fb4f02ae",
+ "metadata": {},
+ "source": []
+ },
 {
 "cell_type": "markdown",
 "id": "conscious-cookbook",
@@ -120,13 +209,39 @@
 },
 {
 "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 25,
 "id": "occupied-swing",
 "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.17355216569308377"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
 "source": [
+ "from jiwer import wer\n",
+ "\n",
 "def calculate_wer():\n",
- " return 0"
+ "    archive = zipfile.ZipFile('data/corpus_corrected.zip')\n",
+ "    files = {name: remove_punctuation(archive.read(name).decode('utf-8'))\n",
+ "             for name in archive.namelist()}\n",
+ "\n",
+ "    corpus_de_human, corpus_de_nmt = files['corpus_de_human.txt'], files['corpus_de_nmt.txt']\n",
+ "\n",
+ "    # average sentence-level WER; jiwer.wer compares plain strings,\n",
+ "    # so the token lists are joined back with spaces\n",
+ "    sum_wer = 0\n",
+ "    for human_sent, nmt_sent in zip(corpus_de_human, corpus_de_nmt):\n",
+ "        sum_wer += wer(\" \".join(human_sent), \" \".join(nmt_sent))\n",
+ "\n",
+ "    return sum_wer / len(corpus_de_human)\n",
+ "\n",
+ "calculate_wer()"
 ]
 },
 {
@@ -147,13 +262,38 @@
 },
 {
 "cell_type": "code",
- "execution_count": 4,
+ "execution_count": 35,
 "id": "immediate-element",
 "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "2.653"
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
 "source": [
+ "import Levenshtein\n",
+ "\n",
 "def calculate_levenshtein():\n",
- " return 0"
+ "    archive = zipfile.ZipFile('data/corpus_corrected.zip')\n",
+ "    files = {name: remove_punctuation(archive.read(name).decode('utf-8'))\n",
+ "             for name in archive.namelist()}\n",
+ "\n",
+ "    corpus_de_human, corpus_de_nmt = files['corpus_de_human.txt'], files['corpus_de_nmt.txt']\n",
+ "\n",
+ "    # word-level edit distance: each sentence is compared as a list of tokens\n",
+ "    sum_distance = 0\n",
+ "    for human_sentence, nmt_sentence in zip(corpus_de_human, corpus_de_nmt):\n",
+ "        sum_distance += Levenshtein.distance(human_sentence, nmt_sentence)\n",
+ "\n",
+ "    return sum_distance / len(corpus_de_human)\n",
+ "\n",
+ "calculate_levenshtein()"
 ]
 },
 {
@@ -177,28 +317,700 @@
 },
 {
 "cell_type": "code",
- "execution_count": 5,
+ "execution_count": 2,
 "id": "descending-easter",
 "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "press\n",
+ "Invalid Word\n",
+ "None\n",
+ "\n",
+ "\n",
+ "h\n",
+ "Invalid Word\n",
+ "None\n",
+ "\n",
+ "\n",
+ "[... the same 'Invalid Word' / 'None' pattern repeats for every remaining word, ending at 'outside' ...]\n"
+ ]
+ }
+ ],
"\n", + "for\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "working\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "in\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "review\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "mode\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "click\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "remove\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "shortcut\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "selects\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "the\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "nonselected\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "areas\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "for\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "more\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "information\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "see\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "screen\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "class\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "form\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "class\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "and\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "slide\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "class\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "in\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "the\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "actionscript\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "20\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "components\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "language\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "reference\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "clicking\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "the\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "mouse\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "retracts\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "the\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "bezier\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "handles\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "and\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "causes\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "the\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "curved\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "path\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "across\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "the\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "anchor\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "point\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "to\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "revert\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "to\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "straight\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "segments\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "an\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "array\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "of\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "keyvalue\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "pairs\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "that\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "represent\n", + "Invalid Word\n", + "None\n", + 
"\n", + "\n", + "the\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "information\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "in\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "the\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "ilst\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "atom\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "which\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "is\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "the\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "equivalent\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "of\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "id3\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "tags\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "for\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "mp4\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "files\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "when\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "you\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "add\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "3d\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "comments\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "to\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "the\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "default\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "view\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "of\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "a\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "model\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "a\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "new\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "view\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "called\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "3dcommentview\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "is\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "created\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "when\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "you\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "re\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "finished\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "setting\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "options\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "click\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "set\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "default\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "set\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "the\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "location\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "for\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "the\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "stroke\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "in\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "relationship\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "to\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "the\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "marquee\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "by\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "choosing\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "inside\n", + "Invalid Word\n", + "None\n", + "\n", + "\n", + "center\n", + "Invalid 
 "source": [
+ "from PyDictionary import PyDictionary\n",
+ "\n",
+ "def translate(word_list):\n",
+ "    # look up a German translation for each English word;\n",
+ "    # PyDictionary prints 'Invalid Word' and returns None when a lookup fails\n",
+ "    translated_words = {}\n",
+ "    for word in word_list:\n",
+ "        translation = PyDictionary().translate(word, 'German')\n",
+ "        if translation:\n",
+ "            translated_words[word] = translation\n",
+ "    return translated_words\n",
+ "\n",
 "def analyze_translations():\n",
- " return []"
+ "    archive = zipfile.ZipFile('data/corpus_corrected.zip')\n",
+ "    files = {name: remove_punctuation(archive.read(name).decode('utf-8'))\n",
+ "             for name in archive.namelist()}\n",
+ "\n",
+ "    corpus_de_human, corpus_de_nmt, corpus_en = files['corpus_de_human.txt'], files['corpus_de_nmt.txt'], files['corpus_en.txt']\n",
+ "\n",
+ "    match_counts = []\n",
+ "    for human_sentence, nmt_sentence, en_sentence in zip(corpus_de_human, corpus_de_nmt, corpus_en):\n",
+ "        translated_words = translate(en_sentence)\n",
+ "        translations = {t.lower() for t in translated_words.values() if isinstance(t, str)}\n",
+ "        # count dictionary translations that actually appear in each German version\n",
+ "        human_matches = sum(1 for word in human_sentence if word in translations)\n",
+ "        nmt_matches = sum(1 for word in nmt_sentence if word in translations)\n",
+ "        match_counts.append((human_matches, nmt_matches))\n",
+ "\n",
+ "    return match_counts\n",
+ "\n",
+ "analyze_translations()"
 ]
 }
 ],
 "metadata": {
 "author": "Rafał Jaworski",
 "email": "rjawor@amu.edu.pl",
- "lang": "pl",
- "subtitle": "8. Wykorzystanie tłumaczenia automatycznego we wspomaganiu tłumaczenia",
- "title": "Komputerowe wspomaganie tłumaczenia",
- "year": "2021",
 "kernelspec": {
 "display_name": "Python 3",
 "language": "python",
 "name": "python3"
 },
+ "lang": "pl",
 "language_info": {
 "codemirror_mode": {
 "name": "ipython",
@@ -209,8 +1021,11 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
- "version": "3.8.10"
- }
+ "version": "3.11.7"
+ },
+ "subtitle": "8. Wykorzystanie tłumaczenia automatycznego we wspomaganiu tłumaczenia",
+ "title": "Komputerowe wspomaganie tłumaczenia",
+ "year": "2021"
 },
 "nbformat": 4,
 "nbformat_minor": 5
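
The stderr warnings recorded in the first cell come from NLTK itself: when a hypothesis shares no higher-order n-grams with its reference, BLEU evaluates to (near) zero unless a smoothing function is supplied, which is exactly what the warning suggests. A minimal sketch of the smoothed call — assuming the same data/corpus_corrected.zip layout and tokenization the notebook uses, and meant as an illustration rather than part of the patch:

    # Sketch: corpus-level BLEU with smoothing, per the UserWarning's advice.
    # Assumes data/corpus_corrected.zip containing corpus_de_human.txt and
    # corpus_de_nmt.txt, tokenized like remove_punctuation() in the notebook.
    import string
    import zipfile

    from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu

    def tokenize(text):
        # drop punctuation, lowercase, one token list per non-empty line
        text = text.translate(str.maketrans('', '', string.punctuation))
        return [[w.lower() for w in line.split()] for line in text.split('\n') if line]

    archive = zipfile.ZipFile('data/corpus_corrected.zip')
    files = {name: tokenize(archive.read(name).decode('utf-8')) for name in archive.namelist()}

    references = [[ref] for ref in files['corpus_de_human.txt']]  # one reference list per hypothesis
    hypotheses = files['corpus_de_nmt.txt']

    # method1 backs zero n-gram counts off to a small epsilon instead of zeroing BLEU
    print(corpus_bleu(references, hypotheses, smoothing_function=SmoothingFunction().method1))

Smoothing only changes how zero counts are handled; with correctly wrapped references the unsmoothed corpus score should already be non-degenerate, so the smoothed value serves as a sanity check rather than a different metric.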