This commit is contained in:
Adam Stelmaszyk 2024-05-05 08:05:59 +02:00
parent 7fd3eb01b3
commit efcac35c9e
1 changed files with 34 additions and 652 deletions

View File

@ -57,7 +57,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"id": "d4f068df",
"metadata": {},
"outputs": [
@ -85,7 +85,7 @@
"3.984587822441638e-156"
]
},
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@ -317,668 +317,38 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "descending-easter",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"press\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"h\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"while\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"in\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"review\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"mode\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"to\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"display\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"keyboard\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"shortcuts\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"for\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"working\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"in\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"review\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"mode\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"click\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"remove\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"shortcut\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"selects\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"nonselected\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"areas\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"for\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"more\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"information\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"see\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"screen\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"class\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"form\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"class\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"and\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"slide\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"class\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"in\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"actionscript\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"20\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"components\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"language\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"reference\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"clicking\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"mouse\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"retracts\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"bezier\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"handles\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"and\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"causes\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"curved\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"path\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"across\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"anchor\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"point\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"to\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"revert\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"to\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"straight\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"segments\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"an\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"array\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"of\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"keyvalue\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"pairs\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"that\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"represent\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"information\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"in\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"ilst\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"atom\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"which\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"is\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"equivalent\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"of\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"id3\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"tags\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"for\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"mp4\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"files\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"when\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"you\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"add\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"3d\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"comments\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"to\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"default\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"view\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"of\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"a\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"model\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"a\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"new\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"view\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"called\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"3dcommentview\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"is\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"created\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"when\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"you\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"re\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"finished\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"setting\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"options\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"click\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"set\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"default\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"set\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"location\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"for\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"stroke\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"in\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"relationship\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"to\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"the\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"marquee\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"by\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"choosing\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"inside\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"center\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"or\n",
"Invalid Word\n",
"None\n",
"\n",
"\n",
"outside\n"
"ename": "NameError",
"evalue": "name 'remove_punctuation' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 45\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28mprint\u001b[39m(human_sum)\n\u001b[1;32m 41\u001b[0m \u001b[38;5;66;03m# tranlsations = [PyDictionary().translate(word, 'de') for word in element]\u001b[39;00m\n\u001b[0;32m---> 45\u001b[0m analyze_translations()\n",
"Cell \u001b[0;32mIn[1], line 19\u001b[0m, in \u001b[0;36manalyze_translations\u001b[0;34m()\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21manalyze_translations\u001b[39m():\n\u001b[1;32m 18\u001b[0m ourZip \u001b[38;5;241m=\u001b[39m zipfile\u001b[38;5;241m.\u001b[39mZipFile(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/corpus_corrected.zip\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 19\u001b[0m files \u001b[38;5;241m=\u001b[39m {name: remove_punctuation(ourZip\u001b[38;5;241m.\u001b[39mread(name)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m ourZip\u001b[38;5;241m.\u001b[39mnamelist()}\n\u001b[1;32m 22\u001b[0m corpus_de_human, corpus_de_nmt, corpus_en \u001b[38;5;241m=\u001b[39m files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_human.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_nmt.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_en.txt\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 24\u001b[0m nmt_sum \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
"Cell \u001b[0;32mIn[1], line 19\u001b[0m, in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21manalyze_translations\u001b[39m():\n\u001b[1;32m 18\u001b[0m ourZip \u001b[38;5;241m=\u001b[39m zipfile\u001b[38;5;241m.\u001b[39mZipFile(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/corpus_corrected.zip\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 19\u001b[0m files \u001b[38;5;241m=\u001b[39m {name: remove_punctuation(ourZip\u001b[38;5;241m.\u001b[39mread(name)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m ourZip\u001b[38;5;241m.\u001b[39mnamelist()}\n\u001b[1;32m 22\u001b[0m corpus_de_human, corpus_de_nmt, corpus_en \u001b[38;5;241m=\u001b[39m files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_human.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_nmt.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_en.txt\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 24\u001b[0m nmt_sum \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
"\u001b[0;31mNameError\u001b[0m: name 'remove_punctuation' is not defined"
]
}
],
"source": [
"from PyDictionary import PyDictionary\n",
"import zipfile\n",
"import re\n",
"\n",
"def transalate(word_list):\n",
" transalted_words = {}\n",
" for word in word_list:\n",
" translation = PyDictionary().translate(word, 'German')\n",
" if translation:\n",
" transalted_words[word] = translation\n",
" try:\n",
" translation = PyDictionary().translate(word, 'German')\n",
" if translation:\n",
" transalted_words[word] = translation\n",
" except Exception as e:\n",
" print('Exception')\n",
"\n",
" return transalted_words\n",
"\n",
@ -989,11 +359,23 @@
" \n",
" corpus_de_human, corpus_de_nmt, corpus_en = files['corpus_de_human.txt'], files['corpus_de_nmt.txt'], files['corpus_en.txt']\n",
"\n",
" nmt_sum = 0\n",
" human_sum = 0\n",
"\n",
" for human_element, nmt_element, element in zip(corpus_de_human, corpus_de_nmt, corpus_en):\n",
" transalted_words = transalate(element)\n",
"\n",
" words = set(re.findall(r'\\w+', sentence.lower()))\n",
" matches = sum(1 for word in words if translations.get(word.lower()))\n",
" # words = set(re.findall(r'\\w+', nmt_element.lower()))\n",
" nmt_sum += sum(1 for word in nmt_element if transalted_words.get(word.lower()))\n",
"\n",
" # words = set(re.findall(r'\\w+', human_element.lower()))\n",
" human_sum += sum(1 for word in human_element if transalted_words.get(word.lower()))\n",
"\n",
"\n",
" print(nmt_sum)\n",
" print(human_sum)\n",
"\n",
"\n",
" # tranlsations = [PyDictionary().translate(word, 'de') for word in element]\n",
" \n",
"\n",