added notes

This commit is contained in:
Adam Stelmaszyk 2024-05-05 08:21:38 +02:00
parent d824ffc9d9
commit 58e6e90d2a

View File

@ -317,24 +317,10 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"id": "descending-easter",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'remove_punctuation' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 45\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28mprint\u001b[39m(human_sum)\n\u001b[1;32m 41\u001b[0m \u001b[38;5;66;03m# tranlsations = [PyDictionary().translate(word, 'de') for word in element]\u001b[39;00m\n\u001b[0;32m---> 45\u001b[0m analyze_translations()\n",
"Cell \u001b[0;32mIn[1], line 19\u001b[0m, in \u001b[0;36manalyze_translations\u001b[0;34m()\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21manalyze_translations\u001b[39m():\n\u001b[1;32m 18\u001b[0m ourZip \u001b[38;5;241m=\u001b[39m zipfile\u001b[38;5;241m.\u001b[39mZipFile(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/corpus_corrected.zip\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 19\u001b[0m files \u001b[38;5;241m=\u001b[39m {name: remove_punctuation(ourZip\u001b[38;5;241m.\u001b[39mread(name)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m ourZip\u001b[38;5;241m.\u001b[39mnamelist()}\n\u001b[1;32m 22\u001b[0m corpus_de_human, corpus_de_nmt, corpus_en \u001b[38;5;241m=\u001b[39m files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_human.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_nmt.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_en.txt\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 24\u001b[0m nmt_sum \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
"Cell \u001b[0;32mIn[1], line 19\u001b[0m, in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21manalyze_translations\u001b[39m():\n\u001b[1;32m 18\u001b[0m ourZip \u001b[38;5;241m=\u001b[39m zipfile\u001b[38;5;241m.\u001b[39mZipFile(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/corpus_corrected.zip\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 19\u001b[0m files \u001b[38;5;241m=\u001b[39m {name: remove_punctuation(ourZip\u001b[38;5;241m.\u001b[39mread(name)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m ourZip\u001b[38;5;241m.\u001b[39mnamelist()}\n\u001b[1;32m 22\u001b[0m corpus_de_human, corpus_de_nmt, corpus_en \u001b[38;5;241m=\u001b[39m files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_human.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_nmt.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_en.txt\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 24\u001b[0m nmt_sum \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
"\u001b[0;31mNameError\u001b[0m: name 'remove_punctuation' is not defined"
]
}
],
"outputs": [],
"source": [
"from PyDictionary import PyDictionary\n",
"import zipfile\n",
@ -365,22 +351,19 @@
" for human_element, nmt_element, element in zip(corpus_de_human, corpus_de_nmt, corpus_en):\n",
" transalted_words = transalate(element)\n",
"\n",
" # words = set(re.findall(r'\\w+', nmt_element.lower()))\n",
" nmt_sum += sum(1 for word in nmt_element if transalted_words.get(word.lower()))\n",
"\n",
" # words = set(re.findall(r'\\w+', human_element.lower()))\n",
" human_sum += sum(1 for word in human_element if transalted_words.get(word.lower()))\n",
"\n",
"\n",
" print(nmt_sum)\n",
" print(human_sum)\n",
"\n",
"\n",
" # tranlsations = [PyDictionary().translate(word, 'de') for word in element]\n",
" \n",
"\n",
"\n",
"analyze_translations()"
"#I think the PyDictionary mode doesn't work, the info from https://github.com/geekpradd/PyDictionary\n",
"#NOTE: Mainintaing this module requires constantly changing the scrapping endpoints which unfortunately I no longer have the bandwidth to do so, so this module is DEPRECATED. Kindly use other substitutes available on PyPI. Thanks!\n",
"#PyDictionary is a Dictionary Module for Python 2/3 to get meanings, translations, synonyms and Antonyms of words. It uses WordNet for getting meanings, Google for translations, and synonym.com for getting synonyms and antonyms.\n",
"#This module uses Python Requests, BeautifulSoup4 and goslate as dependencies\n",
" \n"
]
}
],