{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loading data...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\r", "0it [00:00, ?it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Training model...\n", "1/2\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "50000it [03:35, 232.50it/s]\n", " 0%| | 8/753550 [00:00<3:31:51, 59.28it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "2/2\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████| 753550/753550 [00:04<00:00, 176601.27it/s]\n", " 0%| | 3/753550 [00:00<8:51:51, 23.61it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Smoothing...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████| 753550/753550 [00:06<00:00, 117904.94it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Predicting...\n", "Dev set\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "10519it [02:07, 82.51it/s] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Test set\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "7414it [01:16, 96.50it/s] \n" ] } ], "source": [ "import pandas as pd\n", "import csv\n", "import regex as re\n", "from nltk import bigrams, word_tokenize\n", "from collections import Counter, defaultdict\n", "import string\n", "import unicodedata\n", "from tqdm import tqdm\n", "\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.max_rows', None)\n", "\n", "NROWS = 50000\n", "ALPHA = 0.1\n", "\n", "\n", "def etl():\n", " data = pd.read_csv(\n", " \"train/in.tsv.xz\",\n", " sep=\"\\t\",\n", " error_bad_lines=False,\n", " header=None,\n", " quoting=csv.QUOTE_NONE,\n", " nrows=NROWS\n", " )\n", " train_labels = pd.read_csv(\n", " \"train/expected.tsv\",\n", " sep=\"\\t\",\n", " error_bad_lines=False,\n", " header=None,\n", " quoting=csv.QUOTE_NONE,\n", " nrows=NROWS\n", " )\n", " \n", " train_data = data[[6, 7]]\n", " train_data = pd.concat([train_data, train_labels], axis=1)\n", "\n", " train_data[\"final\"] = train_data[6] + train_data[0] + train_data[7]\n", "\n", " model = defaultdict(lambda: defaultdict(lambda: 0))\n", " return train_data, model\n", "\n", "\n", "def clean(text):\n", " text = str(text).lower().replace(\"-\\\\n\", \"\").replace(\"\\\\n\", \" \")\n", " return re.sub(r\"\\p{P}\", \"\", text)\n", "\n", "\n", "def train_model(data):\n", " print(\"1/2\")\n", " for _, row in tqdm(data.iterrows()):\n", " words = word_tokenize(clean(row[\"final\"]))\n", " for word_1, word_2 in bigrams(words, pad_left=True, pad_right=True):\n", " if word_1 and word_2:\n", " vocab.add(word_1)\n", " vocab.add(word_2)\n", " model[word_1][word_2] += 1\n", " print(\"2/2\")\n", " for word_1 in tqdm(model):\n", " total_count = float(sum(model[word_1].values()))\n", " for word_2 in model[word_1]:\n", " model[word_1][word_2] /= total_count\n", "\n", "\n", "def predict(word):\n", " predictions = dict(model[word])\n", " most_common = dict(Counter(predictions).most_common(5))\n", "\n", " total_prob = 0.0\n", " str_prediction = \"\"\n", "\n", " for word, prob in most_common.items():\n", " total_prob += prob\n", " str_prediction += f\"{word}:{prob} \"\n", "\n", " if not total_prob:\n", " return \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n", "\n", " if 1 - total_prob >= 0.01:\n", " str_prediction += f\":{1-total_prob}\"\n", " else:\n", " str_prediction += f\":0.01\"\n", "\n", " return str_prediction\n", "\n", "\n", "def predict_data(read_path, save_path):\n", " data = pd.read_csv(\n", " read_path, sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE\n", " )\n", " with open(save_path, \"w\", encoding=\"utf-8\") as file:\n", " for _, row in tqdm(data.iterrows()):\n", " words = word_tokenize(clean(row[6]))\n", " if len(words) < 3:\n", " prediction = \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n", " else:\n", " prediction = predict(words[-1])\n", " file.write(prediction + \"\\n\")\n", " \n", "def plus_alpha_smoothing():\n", " model_len = len(model)\n", " for word_1 in tqdm(model):\n", " word_1_occurrences = sum(model[word_1].values())\n", " for word_2 in model[word_1]:\n", " model[word_1][word_2] += ALPHA\n", " model[word_1][word_2] /= float(word_1_occurrences + ALPHA + len(word_2))\n", "\n", "\n", "print(\"Loading data...\")\n", "train_data, model = etl()\n", "vocab = set()\n", "print(\"Training model...\")\n", "train_model(train_data)\n", "print(\"Smoothing...\")\n", "plus_alpha_smoothing()\n", "print(\"Predicting...\")\n", "print(\"Dev set\")\n", "predict_data(\"dev-0/in.tsv.xz\", \"dev-0/out.tsv\")\n", "print(\"Test set\")\n", "predict_data(\"test-A/in.tsv.xz\", \"test-A/out.tsv\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }