import warnings
import zipfile

from nltk.translate.bleu_score import sentence_bleu

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Optional cap on the number of characters kept from each corpus file.
# None keeps the whole text; the former value of -1 sliced off the final
# character of every file.
n_samples = None

# Read the three corpus files, then close the archive as soon as the texts
# are in memory instead of leaving the handle open.
with zipfile.ZipFile('./data/corpus_corrected.zip', 'r') as archive:
    original_text = archive.read('corpus_en.txt').decode('utf-8')[:n_samples]
    reference_text = archive.read('corpus_de_human.txt').decode('utf-8')[:n_samples]
    candidate_text = archive.read('corpus_de_nmt.txt').decode('utf-8')[:n_samples]

# Notebook cell output: character counts of the two German texts.
len(reference_text), len(candidate_text)


def calculate_bleu():
    """Return the BLEU score of ``candidate_text`` against ``reference_text``.

    Both texts are tokenized on whitespace before scoring. ``sentence_bleu``
    expects a list of tokenized references and one tokenized hypothesis;
    handed raw strings it iterates over single characters (and treats every
    reference character as a separate reference), which yields a
    meaningless, near-zero score.

    Returns:
        float: BLEU score of the whole candidate text, treated as a single
        token sequence with one reference.
    """
    reference_tokens = reference_text.split()
    candidate_tokens = candidate_text.split()
    return sentence_bleu([reference_tokens], candidate_tokens)


calculate_bleu()
import matplotlib.pyplot as plt
from jiwer import wer
from Levenshtein import distance


def analyze_bleu():
    """Plot the BLEU score of each consecutive batch of 100 sentence pairs.

    ``reference_text`` and ``candidate_text`` are split into lines, paired
    line by line and tokenized on whitespace. Every batch is scored with
    ``corpus_bleu`` so that n-grams are counted over words; the previous
    code passed whole sentences as if they were tokens.

    If the two texts have a different number of lines, the surplus lines of
    the longer one are ignored. Pairs in which both lines are blank (for
    example the empty string after a trailing newline) are skipped so they
    cannot form a spurious batch.
    """
    # Function-scope import: only `sentence_bleu` is imported at file level.
    from nltk.translate.bleu_score import corpus_bleu

    batch_size = 100

    sentence_pairs = [
        (reference.split(), candidate.split())
        for reference, candidate in zip(
            reference_text.split('\n'), candidate_text.split('\n')
        )
    ]
    sentence_pairs = [
        (reference, candidate)
        for reference, candidate in sentence_pairs
        if reference or candidate
    ]

    bleu_scores = []
    for start in range(0, len(sentence_pairs), batch_size):
        batch = sentence_pairs[start:start + batch_size]
        # corpus_bleu takes, per hypothesis, a list of tokenized references.
        references = [[reference] for reference, _ in batch]
        hypotheses = [candidate for _, candidate in batch]
        bleu_scores.append(corpus_bleu(references, hypotheses))

    plt.plot(bleu_scores)
    plt.ylabel('BLEU score')
    plt.xlabel('Batch')
    plt.show()


analyze_bleu()


def calculate_wer():
    """Return the word error rate of ``candidate_text`` against ``reference_text``."""
    return wer(reference_text, candidate_text)


calculate_wer()


def calculate_levenshtein():
    """Return the Levenshtein distance between ``reference_text`` and ``candidate_text``.

    Both arguments are whole strings, so the distance is measured in
    character edits.
    """
    return distance(reference_text, candidate_text)


calculate_levenshtein()
from PyDictionary import PyDictionary


def analyze_translations():
    """Count dictionary translations that appear in each German text.

    Every distinct whitespace-separated word of ``original_text`` is looked
    up with ``PyDictionary.translate(word, 'de')``. The resulting set of
    German words is then intersected with the word sets of
    ``reference_text`` and ``candidate_text``.

    Returns:
        tuple[int, int]: ``(manual_translations, automatic_translations)``,
        the number of dictionary translations found in the reference text
        and in the candidate text respectively.
    """
    dictionary = PyDictionary()

    german_words = set()
    for word in set(original_text.split()):
        try:
            german_word = dictionary.translate(word, 'de')
        except Exception:
            # Best effort: a failed lookup just skips the word. Unlike the
            # former bare `except:`, this lets KeyboardInterrupt and
            # SystemExit through, so the long loop can be interrupted.
            continue
        # Keep only real, non-empty strings; other results can never match
        # a word of the split texts.
        if isinstance(german_word, str) and german_word:
            german_words.add(german_word)

    manual_translations = len(german_words & set(reference_text.split()))
    automatic_translations = len(german_words & set(candidate_text.split()))

    return manual_translations, automatic_translations


manual_translations, automatic_translations = analyze_translations()

print(f"Manual translations: {manual_translations}")
print(f"Automatic translations: {automatic_translations}")
Wykorzystanie tłumaczenia automatycznego we wspomaganiu tłumaczenia", + "title": "Komputerowe wspomaganie tłumaczenia", + "year": "2021" }, "nbformat": 4, "nbformat_minor": 5