diff --git a/lab/lab_08.ipynb b/lab/lab_08.ipynb index ae86cd2..18d2e4d 100644 --- a/lab/lab_08.ipynb +++ b/lab/lab_08.ipynb @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "d4f068df", "metadata": {}, "outputs": [ @@ -85,7 +85,7 @@ "3.984587822441638e-156" ] }, - "execution_count": 1, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -317,668 +317,38 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "descending-easter", "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "press\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "h\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "while\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "in\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "review\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "mode\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "to\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "display\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "keyboard\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "shortcuts\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "for\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "working\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "in\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "review\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "mode\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "click\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "remove\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "shortcut\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "selects\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "the\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "nonselected\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "areas\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "for\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "more\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "information\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "see\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "screen\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "class\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "form\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "class\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "and\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "slide\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "class\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "in\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "the\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "actionscript\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "20\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "components\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "language\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "reference\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "clicking\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "the\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "mouse\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "retracts\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "the\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "bezier\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "handles\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "and\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "causes\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "the\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "curved\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "path\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "across\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "the\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "anchor\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "point\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "to\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "revert\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "to\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "straight\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "segments\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "an\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "array\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "of\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "keyvalue\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "pairs\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "that\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "represent\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "the\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "information\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "in\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "the\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "ilst\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "atom\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "which\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "is\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "the\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "equivalent\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "of\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "id3\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "tags\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "for\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "mp4\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "files\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "when\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "you\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "add\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "3d\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "comments\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "to\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "the\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "default\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "view\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "of\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "a\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "model\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "a\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "new\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "view\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "called\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "3dcommentview\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "is\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "created\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "when\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "you\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "re\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "finished\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "setting\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "options\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "click\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "set\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "default\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "set\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "the\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "location\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "for\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "the\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "stroke\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "in\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "relationship\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "to\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "the\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "marquee\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "by\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "choosing\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "inside\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "center\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "or\n", - "Invalid Word\n", - "None\n", - "\n", - "\n", - "outside\n" + "ename": "NameError", + "evalue": "name 'remove_punctuation' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 45\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28mprint\u001b[39m(human_sum)\n\u001b[1;32m 41\u001b[0m \u001b[38;5;66;03m# tranlsations = [PyDictionary().translate(word, 'de') for word in element]\u001b[39;00m\n\u001b[0;32m---> 45\u001b[0m analyze_translations()\n", + "Cell \u001b[0;32mIn[1], line 19\u001b[0m, in \u001b[0;36manalyze_translations\u001b[0;34m()\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21manalyze_translations\u001b[39m():\n\u001b[1;32m 18\u001b[0m ourZip \u001b[38;5;241m=\u001b[39m zipfile\u001b[38;5;241m.\u001b[39mZipFile(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/corpus_corrected.zip\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 19\u001b[0m files \u001b[38;5;241m=\u001b[39m {name: remove_punctuation(ourZip\u001b[38;5;241m.\u001b[39mread(name)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m ourZip\u001b[38;5;241m.\u001b[39mnamelist()}\n\u001b[1;32m 22\u001b[0m corpus_de_human, corpus_de_nmt, corpus_en \u001b[38;5;241m=\u001b[39m files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_human.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_nmt.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_en.txt\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 24\u001b[0m nmt_sum \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n", + "Cell \u001b[0;32mIn[1], line 19\u001b[0m, in \u001b[0;36m\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21manalyze_translations\u001b[39m():\n\u001b[1;32m 18\u001b[0m ourZip \u001b[38;5;241m=\u001b[39m zipfile\u001b[38;5;241m.\u001b[39mZipFile(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/corpus_corrected.zip\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 19\u001b[0m files \u001b[38;5;241m=\u001b[39m {name: remove_punctuation(ourZip\u001b[38;5;241m.\u001b[39mread(name)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m ourZip\u001b[38;5;241m.\u001b[39mnamelist()}\n\u001b[1;32m 22\u001b[0m corpus_de_human, corpus_de_nmt, corpus_en \u001b[38;5;241m=\u001b[39m files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_human.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_nmt.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_en.txt\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 24\u001b[0m nmt_sum \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n", + "\u001b[0;31mNameError\u001b[0m: name 'remove_punctuation' is not defined" ] } ], "source": [ "from PyDictionary import PyDictionary\n", "import zipfile\n", + "import re\n", "\n", "def transalate(word_list):\n", " transalted_words = {}\n", " for word in word_list:\n", - " translation = PyDictionary().translate(word, 'German')\n", - " if translation:\n", - " transalted_words[word] = translation\n", + " try:\n", + " translation = PyDictionary().translate(word, 'German')\n", + " if translation:\n", + " transalted_words[word] = translation\n", + " except Exception as e:\n", + " print('Exception')\n", "\n", " return transalted_words\n", "\n", @@ -989,11 +359,23 @@ " \n", " corpus_de_human, corpus_de_nmt, corpus_en = files['corpus_de_human.txt'], files['corpus_de_nmt.txt'], files['corpus_en.txt']\n", "\n", + " nmt_sum = 0\n", + " human_sum = 0\n", + "\n", " for human_element, nmt_element, element in zip(corpus_de_human, corpus_de_nmt, corpus_en):\n", " transalted_words = transalate(element)\n", "\n", - " words = set(re.findall(r'\\w+', sentence.lower()))\n", - " matches = sum(1 for word in words if translations.get(word.lower()))\n", + " # words = set(re.findall(r'\\w+', nmt_element.lower()))\n", + " nmt_sum += sum(1 for word in nmt_element if transalted_words.get(word.lower()))\n", + "\n", + " # words = set(re.findall(r'\\w+', human_element.lower()))\n", + " human_sum += sum(1 for word in human_element if transalted_words.get(word.lower()))\n", + "\n", + "\n", + " print(nmt_sum)\n", + " print(human_sum)\n", + "\n", + "\n", " # tranlsations = [PyDictionary().translate(word, 'de') for word in element]\n", " \n", "\n",