From 93502cafcdb6cd17af92be94d9aae986bade092f Mon Sep 17 00:00:00 2001
From: Joanna Kurczalska <joakur8@st.amu.edu.pl>
Date: Sat, 8 Apr 2023 11:27:52 +0200
Subject: [PATCH] =?UTF-8?q?Prze=C5=9Blij=20pliki=20do=20''?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 run2.ipynb | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)
 create mode 100644 run2.ipynb

diff --git a/run2.ipynb b/run2.ipynb
new file mode 100644
index 0000000..95d9518
--- /dev/null
+++ b/run2.ipynb
@@ -0,0 +1,156 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nltk\n",
+    "import pandas as pd\n",
+    "import regex as re\n",
+    "from csv import QUOTE_NONE\n",
+    "from collections import Counter, defaultdict\n",
+    "\n",
+    "ENCODING = \"utf-8\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def clean_text(text):\n",
+    "    \"\"\"Return text as a lowercased string without surrounding whitespace.\"\"\"\n",
+    "    return str(text).lower().strip()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_csv(fname):\n",
+    "    \"\"\"Read a headerless tab-separated file into a pandas DataFrame.\n",
+    "\n",
+    "    Quote characters are treated as ordinary text (QUOTE_NONE) and lines\n",
+    "    that pandas considers malformed are skipped instead of raising.\n",
+    "    \"\"\"\n",
+    "    options = dict(sep=\"\\t\", header=None, quoting=QUOTE_NONE)\n",
+    "    options.update(on_bad_lines=\"skip\", encoding=ENCODING)\n",
+    "    return pd.read_csv(fname, **options)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train_model(data, model):\n",
+    "    \"\"\"Fill model[next][prev] with P(prev | next) from the text in column 607.\"\"\"\n",
+    "    for _, record in data.iterrows():\n",
+    "        tokens = nltk.word_tokenize(clean_text(record[607]))\n",
+    "        for prev, nxt in nltk.bigrams(tokens, pad_left=True, pad_right=True):\n",
+    "            if prev and nxt:  # drops pairs that contain a padding symbol\n",
+    "                model[nxt][prev] += 1\n",
+    "    for preceding in model.values():\n",
+    "        total = float(sum(preceding.values()))\n",
+    "        preceding.update({prev: count / total for prev, count in preceding.items()})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict_data(read_path, save_path, model):\n",
+    "    \"\"\"Write one prediction line to save_path for every row read from read_path.\"\"\"\n",
+    "    fallback = \"the:0.3 be:0.2 to:0.2 of:0.1 and:0.1 :0.1\"\n",
+    "    data = get_csv(read_path)\n",
+    "\n",
+    "    with open(save_path, \"w\", encoding=ENCODING) as f:\n",
+    "        for _, row in data.iterrows():\n",
+    "            words = nltk.word_tokenize(clean_text(row[7]))\n",
+    "            # Only words[0] is used, so a single token is enough to predict.\n",
+    "            prediction = predict(words[0], model) if words else fallback\n",
+    "            f.write(prediction + \"\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def predict(word, model):\n",
+    "    \"\"\"Format up to six candidates for word as 'token:prob ... :rest'.\n",
+    "\n",
+    "    The entry for word is read with .get(), so asking about an unseen word\n",
+    "    does not insert a new key into a defaultdict-based model. A fixed default\n",
+    "    distribution is returned when the candidates carry no probability mass.\n",
+    "    \"\"\"\n",
+    "    fallback = \"the:0.3 be:0.2 to:0.2 of:0.1 and:0.1 :0.1\"\n",
+    "    # .get() bypasses defaultdict.__missing__, so the model is left untouched.\n",
+    "    candidates = model.get(word) or {}\n",
+    "    top = Counter(candidates).most_common(6)\n",
+    "\n",
+    "    covered = sum(prob for _, prob in top)\n",
+    "    if covered == 0.0:\n",
+    "        return fallback\n",
+    "\n",
+    "    # The trailing ':' entry gets the remaining mass, floored at 0.01.\n",
+    "    leftover = max(1 - covered, 0.01)\n",
+    "\n",
+    "    listed = \"\".join(f\"{token}:{prob} \" for token, prob in top)\n",
+    "    return f\"{listed}:{leftover}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = get_csv(\"train/in.tsv.xz\")\n",
+    "\n",
+    "train_words = get_csv(\"train/expected.tsv\")\n",
+    "train_data = data[[6, 7]]\n",
+    "train_data = pd.concat([train_data, train_words], axis=1)\n",
+    "\n",
+    "# Join with spaces so tokens at the column boundaries are not glued together.\n",
+    "train_data[607] = train_data[6] + \" \" + train_data[0] + \" \" + train_data[7]\n",
+    "\n",
+    "model = defaultdict(lambda: defaultdict(lambda: 0))\n",
+    "train_model(train_data, model)\n",
+    "\n",
+    "predict_data(\"dev-0/in.tsv.xz\", \"dev-0/out.tsv\", model)\n",
+    "predict_data(\"test-A/in.tsv.xz\", \"test-A/out.tsv\", model)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}