forked from bfijalkowski/KWT-2024
added notes
This commit is contained in:
parent d824ffc9d9
commit 58e6e90d2a
@@ -317,24 +317,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 4,
    "id": "descending-easter",
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'remove_punctuation' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[1], line 45\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[38;5;28mprint\u001b[39m(human_sum)\n\u001b[1;32m 41\u001b[0m \u001b[38;5;66;03m# tranlsations = [PyDictionary().translate(word, 'de') for word in element]\u001b[39;00m\n\u001b[0;32m---> 45\u001b[0m analyze_translations()\n",
-      "Cell \u001b[0;32mIn[1], line 19\u001b[0m, in \u001b[0;36manalyze_translations\u001b[0;34m()\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21manalyze_translations\u001b[39m():\n\u001b[1;32m 18\u001b[0m ourZip \u001b[38;5;241m=\u001b[39m zipfile\u001b[38;5;241m.\u001b[39mZipFile(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/corpus_corrected.zip\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 19\u001b[0m files \u001b[38;5;241m=\u001b[39m {name: remove_punctuation(ourZip\u001b[38;5;241m.\u001b[39mread(name)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m ourZip\u001b[38;5;241m.\u001b[39mnamelist()}\n\u001b[1;32m 22\u001b[0m corpus_de_human, corpus_de_nmt, corpus_en \u001b[38;5;241m=\u001b[39m files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_human.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_nmt.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_en.txt\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 24\u001b[0m nmt_sum \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
-      "Cell \u001b[0;32mIn[1], line 19\u001b[0m, in \u001b[0;36m<dictcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21manalyze_translations\u001b[39m():\n\u001b[1;32m 18\u001b[0m ourZip \u001b[38;5;241m=\u001b[39m zipfile\u001b[38;5;241m.\u001b[39mZipFile(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata/corpus_corrected.zip\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m---> 19\u001b[0m files \u001b[38;5;241m=\u001b[39m {name: remove_punctuation(ourZip\u001b[38;5;241m.\u001b[39mread(name)\u001b[38;5;241m.\u001b[39mdecode(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mutf-8\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m ourZip\u001b[38;5;241m.\u001b[39mnamelist()}\n\u001b[1;32m 22\u001b[0m corpus_de_human, corpus_de_nmt, corpus_en \u001b[38;5;241m=\u001b[39m files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_human.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_de_nmt.txt\u001b[39m\u001b[38;5;124m'\u001b[39m], files[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcorpus_en.txt\u001b[39m\u001b[38;5;124m'\u001b[39m]\n\u001b[1;32m 24\u001b[0m nmt_sum \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'remove_punctuation' is not defined"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from PyDictionary import PyDictionary\n",
     "import zipfile\n",
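The outputs removed in this hunk are a NameError traceback: the notebook calls remove_punctuation() inside analyze_translations() without ever defining it. For context, a minimal sketch of such a helper, assuming it only needs to strip ASCII punctuation from the decoded corpus text; the name comes from the traceback, the body is an assumption and not code from this repository:

import string

def remove_punctuation(text):
    # Assumption: drop every ASCII punctuation character and keep everything else.
    return text.translate(str.maketrans('', '', string.punctuation))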
@@ -365,22 +351,19 @@
     " for human_element, nmt_element, element in zip(corpus_de_human, corpus_de_nmt, corpus_en):\n",
     " transalted_words = transalate(element)\n",
     "\n",
-    " # words = set(re.findall(r'\\w+', nmt_element.lower()))\n",
     " nmt_sum += sum(1 for word in nmt_element if transalted_words.get(word.lower()))\n",
     "\n",
-    " # words = set(re.findall(r'\\w+', human_element.lower()))\n",
     " human_sum += sum(1 for word in human_element if transalted_words.get(word.lower()))\n",
     "\n",
     "\n",
     " print(nmt_sum)\n",
     " print(human_sum)\n",
     "\n",
-    "\n",
-    " # tranlsations = [PyDictionary().translate(word, 'de') for word in element]\n",
-    " \n",
-    "\n",
-    "\n",
-    "analyze_translations()"
+    "#I think the PyDictionary mode doesn't work, the info from https://github.com/geekpradd/PyDictionary\n",
+    "#NOTE: Mainintaing this module requires constantly changing the scrapping endpoints which unfortunately I no longer have the bandwidth to do so, so this module is DEPRECATED. Kindly use other substitutes available on PyPI. Thanks!\n",
+    "#PyDictionary is a Dictionary Module for Python 2/3 to get meanings, translations, synonyms and Antonyms of words. It uses WordNet for getting meanings, Google for translations, and synonym.com for getting synonyms and antonyms.\n",
+    "#This module uses Python Requests, BeautifulSoup4 and goslate as dependencies\n",
+    " \n"
    ]
   }
  ],
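The comment lines added in this hunk quote the PyDictionary README: the module is deprecated and its scraping-based translation lookup no longer works, which is presumably why the commented-out PyDictionary line and the analyze_translations() call are replaced by these notes. A minimal sketch of how the same word-counting idea from the loop above can be exercised without PyDictionary, using a hand-made glossary; translate_words and the glossary below are hypothetical stand-ins, not part of this commit:

def translate_words(english_words, glossary):
    # Hypothetical stand-in for the notebook's transalate()/PyDictionary lookup:
    # keys are the German translations, mirroring transalted_words in the diff above.
    return {glossary[w.lower()]: w for w in english_words if w.lower() in glossary}

# Tiny illustrative glossary (English -> German), not real corpus data.
glossary = {"dog": "hund", "water": "wasser", "house": "haus"}

english = ["The", "dog", "drinks", "water"]
german_nmt = ["Der", "Hund", "trinkt", "Wasser"]

translated = translate_words(english, glossary)
# Same counting idea as in the diffed loop: one hit per word found in the translation table.
nmt_hits = sum(1 for word in german_nmt if word.lower() in translated)
print(nmt_hits)  # 2 ("hund" and "wasser")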