This commit is contained in:
Adam Stelmaszyk 2024-05-05 08:05:59 +02:00
parent 7fd3eb01b3
commit efcac35c9e

View File

@ -57,7 +57,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 2,
"id": "d4f068df", "id": "d4f068df",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
@ -85,7 +85,7 @@
"3.984587822441638e-156" "3.984587822441638e-156"
] ]
}, },
"execution_count": 1, "execution_count": 2,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -317,668 +317,38 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 1,
"id": "descending-easter", "id": "descending-easter",
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
"name": "stdout", "ename": "NameError",
"output_type": "stream", "evalue": "name 'remove_punctuation' is not defined",
"text": [ "output_type": "error",
"press\n", "traceback": [
"Invalid Word\n", "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"None\n", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\n", "Cell \u001b[0;32mIn[1], line 45\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28mprint\u001b[39m(human_sum)\n\u001b[1;32m 41\u001b[0m \u001b[38;5;66;03m# tranlsations = [PyDictionary().translate(word, 'de') for word in element]\u001b[39;00m\n\u001b[0;32m---> 45\u001b[0m analyze_translations()\n",
"\n", "Cell \u001b[0;32mIn[1], line 19\u001b[0m, in \u001b[0;36manalyze_translations\u001b[0;34m()\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21manalyze_translations\u001b[39m():\n\u001b[1;32m 18\u001b[0m ourZip \u001b[38;5;241m=\u001b[39m zipfile\u001b[38;5;241m.\u001b[39mZipFile(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/corpus_corrected.zip\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 19\u001b[0m files \u001b[38;5;241m=\u001b[39m {name: remove_punctuation(ourZip\u001b[38;5;241m.\u001b[39mread(name)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m ourZip\u001b[38;5;241m.\u001b[39mnamelist()}\n\u001b[1;32m 22\u001b[0m corpus_de_human, corpus_de_nmt, corpus_en \u001b[38;5;241m=\u001b[39m files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_human.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_nmt.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_en.txt\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 24\u001b[0m nmt_sum \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
"h\n", "Cell \u001b[0;32mIn[1], line 19\u001b[0m, in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21manalyze_translations\u001b[39m():\n\u001b[1;32m 18\u001b[0m ourZip \u001b[38;5;241m=\u001b[39m zipfile\u001b[38;5;241m.\u001b[39mZipFile(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/corpus_corrected.zip\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 19\u001b[0m files \u001b[38;5;241m=\u001b[39m {name: remove_punctuation(ourZip\u001b[38;5;241m.\u001b[39mread(name)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m ourZip\u001b[38;5;241m.\u001b[39mnamelist()}\n\u001b[1;32m 22\u001b[0m corpus_de_human, corpus_de_nmt, corpus_en \u001b[38;5;241m=\u001b[39m files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_human.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_nmt.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_en.txt\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 24\u001b[0m nmt_sum \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
"Invalid Word\n", "\u001b[0;31mNameError\u001b[0m: name 'remove_punctuation' is not defined"
"None\n",
"\n",
"\n",
"while\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"in\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"review\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"mode\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"to\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"display\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"keyboard\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"shortcuts\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"for\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"working\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"in\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"review\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"mode\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"click\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"remove\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"shortcut\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"selects\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"nonselected\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"areas\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"for\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"more\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"information\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"see\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"screen\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"class\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"form\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"class\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"and\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"slide\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"class\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"in\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"actionscript\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"20\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"components\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"language\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"reference\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"clicking\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"mouse\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"retracts\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"bezier\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"handles\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"and\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"causes\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"curved\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"path\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"across\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"anchor\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"point\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"to\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"revert\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"to\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"straight\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"segments\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"an\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"array\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"of\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"keyvalue\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"pairs\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"that\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"represent\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"information\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"in\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"ilst\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"atom\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"which\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"is\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"equivalent\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"of\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"id3\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"tags\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"for\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"mp4\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"files\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"when\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"you\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"add\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"3d\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"comments\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"to\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"default\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"view\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"of\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"a\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"model\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"a\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"new\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"view\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"called\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"3dcommentview\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"is\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"created\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"when\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"you\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"re\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"finished\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"setting\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"options\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"click\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"set\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"default\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"set\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"location\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"for\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"stroke\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"in\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"relationship\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"to\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"marquee\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"by\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"choosing\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"inside\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"center\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"or\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"outside\n"
] ]
} }
], ],
"source": [ "source": [
"from PyDictionary import PyDictionary\n", "from PyDictionary import PyDictionary\n",
"import zipfile\n", "import zipfile\n",
"import re\n",
"\n", "\n",
"def transalate(word_list):\n", "def transalate(word_list):\n",
" transalted_words = {}\n", " transalted_words = {}\n",
" for word in word_list:\n", " for word in word_list:\n",
" try:\n",
" translation = PyDictionary().translate(word, 'German')\n", " translation = PyDictionary().translate(word, 'German')\n",
" if translation:\n", " if translation:\n",
" transalted_words[word] = translation\n", " transalted_words[word] = translation\n",
" except Exception as e:\n",
" print('Exception')\n",
"\n", "\n",
" return transalted_words\n", " return transalted_words\n",
"\n", "\n",
@ -989,11 +359,23 @@
" \n", " \n",
" corpus_de_human, corpus_de_nmt, corpus_en = files['corpus_de_human.txt'], files['corpus_de_nmt.txt'], files['corpus_en.txt']\n", " corpus_de_human, corpus_de_nmt, corpus_en = files['corpus_de_human.txt'], files['corpus_de_nmt.txt'], files['corpus_en.txt']\n",
"\n", "\n",
" nmt_sum = 0\n",
" human_sum = 0\n",
"\n",
" for human_element, nmt_element, element in zip(corpus_de_human, corpus_de_nmt, corpus_en):\n", " for human_element, nmt_element, element in zip(corpus_de_human, corpus_de_nmt, corpus_en):\n",
" transalted_words = transalate(element)\n", " transalted_words = transalate(element)\n",
"\n", "\n",
" words = set(re.findall(r'\\w+', sentence.lower()))\n", " # words = set(re.findall(r'\\w+', nmt_element.lower()))\n",
" matches = sum(1 for word in words if translations.get(word.lower()))\n", " nmt_sum += sum(1 for word in nmt_element if transalted_words.get(word.lower()))\n",
"\n",
" # words = set(re.findall(r'\\w+', human_element.lower()))\n",
" human_sum += sum(1 for word in human_element if transalted_words.get(word.lower()))\n",
"\n",
"\n",
" print(nmt_sum)\n",
" print(human_sum)\n",
"\n",
"\n",
" # tranlsations = [PyDictionary().translate(word, 'de') for word in element]\n", " # tranlsations = [PyDictionary().translate(word, 'de') for word in element]\n",
" \n", " \n",
"\n", "\n",