From a0a24dd6b544e28ef162fe7c37f44f7a8d6f3224 Mon Sep 17 00:00:00 2001 From: s440054 Date: Mon, 4 Apr 2022 16:03:59 +0200 Subject: [PATCH] work in progress --- testing.ipynb | 63 ++++++++++++++------------------------------------- 1 file changed, 17 insertions(+), 46 deletions(-) diff --git a/testing.ipynb b/testing.ipynb index d3cb33d..3130f82 100644 --- a/testing.ipynb +++ b/testing.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 18, + "execution_count": 36, "id": "21c9b695", "metadata": {}, "outputs": [], @@ -20,7 +20,7 @@ "\n", "def train_model(data, model):\n", " for _, row in data.iterrows():\n", - " words = nltk.word_tokenize(clean_text(row[\"final\"]))\n", + " words = nltk.word_tokenize(clean_text(row[760]))\n", " for w1, w2 in nltk.bigrams(words, pad_left=True, pad_right=True):\n", " if w1 and w2:\n", " model[w2][w1] += 1\n", @@ -36,7 +36,6 @@ "\n", " total_prob = 0.0\n", " str_prediction = \"\"\n", - "\n", " for word, prob in most_common.items():\n", " total_prob += prob\n", " str_prediction += f\"{word}:{prob} \"\n", @@ -129,7 +128,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 22, "id": "7662d802", "metadata": {}, "outputs": [], @@ -150,19 +149,19 @@ " on_bad_lines='skip',\n", " header=None,\n", " # names=out_cols,\n", - " quoting=csv.QUOTE_NONE,,\n", + " quoting=csv.QUOTE_NONE,\n", " encoding=\"utf-8\"\n", ")\n", "\n", "train_data = data[[7, 6]]\n", "train_data = pd.concat([train_data, train_words], axis=1)\n", "\n", - "train_data[\"final\"] = train_data[7] + train_data[0] + train_data[6]\n" + "train_data[760] = train_data[7] + train_data[0] + train_data[6]\n" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 23, "id": "c3d2cfec", "metadata": {}, "outputs": [ @@ -190,7 +189,7 @@ " 7\n", " 6\n", " 0\n", - " final\n", + " 760\n", " \n", " \n", " \n", @@ -277,7 +276,7 @@ "" ], "text/plain": [ - " 7 \\\n", + " 7 \\\n", "0 said\\nit's all squash. The best I could get\\ni... \n", "1 \\ninto a proper perspective with those\\nminor ... \n", "2 all notU\\nashore and afloat arc subjects for I... \n", @@ -290,7 +289,7 @@ "432020 \\na note of Wood, Dialogue fc Co., for\\nc27,im... \n", "432021 3214c;do White at 3614c: Mixed Western at\\n331... \n", "\n", - " 6 0 \\\n", + " 6 0 \\\n", "0 came fiom the last place to this\\nplace, and t... lie \n", "1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... himself \n", "2 \"Thera were in 1771 only aeventy-nine\\n*ub*erl... of \n", @@ -303,7 +302,7 @@ "432020 settlement with the department.\\nIt is also sh... for \n", "432021 Flour quotations—low extras at 1 R0®2 50;\\ncit... at \n", "\n", - " final \n", + " 760 \n", "0 said\\nit's all squash. The best I could get\\ni... \n", "1 \\ninto a proper perspective with those\\nminor ... \n", "2 all notU\\nashore and afloat arc subjects for I... \n", @@ -319,7 +318,7 @@ "[432022 rows x 4 columns]" ] }, - "execution_count": 8, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -330,38 +329,10 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 26, "id": "bd92ba07", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\842062938.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. 
Use on_bad_lines in the future.\n", - "\n", - "\n", - " data = pd.read_csv(\n", - "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\842062938.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n", - "\n", - "\n", - " data = pd.read_csv(\n" - ] - }, - { - "ename": "UnicodeEncodeError", - "evalue": "'charmap' codec can't encode character '\\u03b2' in position 21: character maps to ", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mUnicodeEncodeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32mc:\\Users\\Norbert\\code\\challenging-america-word-gap-prediction\\testing.ipynb Cell 7'\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 3\u001b[0m train_model(train_data, model)\n\u001b[0;32m 4\u001b[0m predict_data(\u001b[39m\"\u001b[39m\u001b[39mdev-0/in.tsv.xz\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mdev-0/out.tsv\u001b[39m\u001b[39m\"\u001b[39m, model)\n\u001b[1;32m----> 5\u001b[0m predict_data(\u001b[39m\"\u001b[39;49m\u001b[39mtest-A/in.tsv.xz\u001b[39;49m\u001b[39m\"\u001b[39;49m, \u001b[39m\"\u001b[39;49m\u001b[39mtest-A/out.tsv\u001b[39;49m\u001b[39m\"\u001b[39;49m, model)\n", - "\u001b[1;32mc:\\Users\\Norbert\\code\\challenging-america-word-gap-prediction\\testing.ipynb Cell 1'\u001b[0m in \u001b[0;36mpredict_data\u001b[1;34m(read_path, save_path, model)\u001b[0m\n\u001b[0;32m 55\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m 56\u001b[0m prediction \u001b[39m=\u001b[39m predict(words[\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m], model)\n\u001b[1;32m---> 57\u001b[0m file\u001b[39m.\u001b[39;49mwrite(prediction \u001b[39m+\u001b[39;49m \u001b[39m\"\u001b[39;49m\u001b[39m\\n\u001b[39;49;00m\u001b[39m\"\u001b[39;49m)\n", - "File \u001b[1;32mC:\\Python310\\lib\\encodings\\cp1250.py:19\u001b[0m, in \u001b[0;36mIncrementalEncoder.encode\u001b[1;34m(self, input, final)\u001b[0m\n\u001b[0;32m 18\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mencode\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39minput\u001b[39m, final\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m):\n\u001b[1;32m---> 19\u001b[0m \u001b[39mreturn\u001b[39;00m codecs\u001b[39m.\u001b[39;49mcharmap_encode(\u001b[39minput\u001b[39;49m,\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49merrors,encoding_table)[\u001b[39m0\u001b[39m]\n", - "\u001b[1;31mUnicodeEncodeError\u001b[0m: 'charmap' codec can't encode character '\\u03b2' in position 21: character maps to " - ] - } - ], + "outputs": [], "source": [ "\n", "model = defaultdict(lambda: defaultdict(lambda: 0))\n", @@ -371,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 37, "id": "ad23240e", "metadata": {}, "outputs": [ @@ -379,7 +350,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\842062938.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n", + "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\749044266.py:46: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. 
Use on_bad_lines in the future.\n", "\n", "\n", " data = pd.read_csv(\n" @@ -392,7 +363,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 38, "id": "195cb6cf", "metadata": {}, "outputs": [ @@ -400,7 +371,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\751703071.py:47: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n", + "C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\749044266.py:46: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n", "\n", "\n", " data = pd.read_csv(\n"
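
The stderr streams captured in these cell outputs still carry the pandas FutureWarning about the deprecated error_bad_lines argument (the warning itself says to use on_bad_lines), and the output removed from cell bd92ba07 was a UnicodeEncodeError: predict_data wrote its predictions through the Windows cp1250 codec, which cannot encode '\u03b2'. Below is a minimal sketch of how both symptoms could be addressed; the input path, the separator, and the open() call are assumptions, since the hunks only show the tail of the read_csv call and never show how the output file is opened.

    import csv
    import pandas as pd

    # Assumed path and separator; the hunk only shows the trailing arguments
    # of the actual read_csv call (compression is inferred from the .xz suffix).
    data = pd.read_csv(
        "train/in.tsv.xz",
        sep="\t",
        on_bad_lines="skip",   # replaces the deprecated error_bad_lines flagged by the FutureWarning
        header=None,
        quoting=csv.QUOTE_NONE,
        encoding="utf-8",
    )

    # The removed traceback shows file.write() failing inside encodings/cp1250.py;
    # opening the output file with an explicit UTF-8 encoding avoids the
    # 'charmap' UnicodeEncodeError on characters such as '\u03b2'.
    with open("test-A/out.tsv", "w", encoding="utf-8") as out_file:
        out_file.write("word:0.5 :0.5\n")  # placeholder line in the word:prob format the notebook emits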