{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Word-gap prediction with a smoothed trigram model\n",
    "\n",
    "The model estimates P(gap word | previous word, next word) from the first 30,000 training rows. It is then scored on `dev-0` with `geval` for several additive-smoothing constants (alpha).\n",
    "\n",
    "**Note:** the outputs stored in the tuning cells below were recorded before two fixes: training rows are now joined with explicit spaces, and contexts with a single token are now scored by the model instead of the fallback. Re-run the notebook to refresh the scores."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict, Counter\n",
    "from nltk import trigrams, word_tokenize\n",
    "import csv\n",
    "import regex as re\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import time\n",
    "\n",
    "in_file = 'train/in.tsv.xz'\n",
    "out_file = 'train/expected.tsv'\n",
    "\n",
    "X_train = pd.read_csv(in_file, sep='\\t', header=None, quoting=csv.QUOTE_NONE, nrows=30000, on_bad_lines='skip')\n",
    "Y_train = pd.read_csv(out_file, sep='\\t', header=None, quoting=csv.QUOTE_NONE, nrows=30000, on_bad_lines='skip')\n",
    "\n",
    "# Columns 6 and 7 are used as the text before and after the gap; after the\n",
    "# concat, column 0 comes from Y_train (the expected word).\n",
    "X_train = X_train[[6, 7]]\n",
    "X_train = pd.concat([X_train, Y_train], axis=1)\n",
    "\n",
    "# Join with explicit spaces: plain concatenation glues the gap word to its\n",
    "# neighbours, so the tokenizer never sees it as a separate token. Missing\n",
    "# cells become '' so that one NaN does not turn the whole row into NaN.\n",
    "left_context = X_train[6].fillna('').astype(str)\n",
    "gap_word = X_train[0].fillna('').astype(str)\n",
    "right_context = X_train[7].fillna('').astype(str)\n",
    "X_train['row'] = left_context + ' ' + gap_word + ' ' + right_context"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fallback answer written whenever the model has nothing to say about a gap.\n",
    "DEFAULT_PREDICTION = 'the:0.04 be:0.04 to:0.04 and:0.02 not:0.02 or:0.02 a:0.02 :0.8'\n",
    "# Number of candidate words written per gap.\n",
    "TOP_K = 7\n",
    "\n",
    "\n",
    "def preprocess(row):\n",
    "    \"\"\"Lower-case the text, undo literal backslash-n line breaks and strip punctuation.\"\"\"\n",
    "    # A hyphen followed by a literal backslash-n is dropped outright (presumably a\n",
    "    # word hyphenated across a line break); any other literal backslash-n becomes a space.\n",
    "    text = row.lower().replace('-\\\\n', '').replace('\\\\n', ' ')\n",
    "    return re.sub(r'\\p{P}', '', text)\n",
    "\n",
    "\n",
    "def _count_gap_trigrams(X_train):\n",
    "    \"\"\"Count middle words per (left word, right word) pair; return (counts, vocabulary size).\"\"\"\n",
    "    counts = defaultdict(lambda: defaultdict(int))\n",
    "    vocabulary = set()\n",
    "    for text in X_train['row']:\n",
    "        words = word_tokenize(preprocess(str(text)))\n",
    "        # Padded trigrams carry None at the edges; the truthiness check drops them.\n",
    "        for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):\n",
    "            if w1 and w2 and w3:\n",
    "                counts[(w1, w3)][w2] += 1\n",
    "                vocabulary.update((w1, w2, w3))\n",
    "    return counts, len(vocabulary)\n",
    "\n",
    "\n",
    "def _smooth_counts(counts, vocabulary_size, alpha):\n",
    "    \"\"\"Turn raw counts into additively smoothed probabilities without modifying `counts`.\"\"\"\n",
    "    model = defaultdict(lambda: defaultdict(int))\n",
    "    for context, middle_counts in counts.items():\n",
    "        denominator = float(sum(middle_counts.values())) + alpha * vocabulary_size\n",
    "        smoothed = model[context]\n",
    "        for word, count in middle_counts.items():\n",
    "            smoothed[word] = (count + alpha) / denominator\n",
    "    return model\n",
    "\n",
    "\n",
    "def train(X_train, Y_train, alpha):\n",
    "    \"\"\"Build the gap model P(middle word | left word, right word).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    X_train : DataFrame whose 'row' column holds left context, gap word and right context.\n",
    "    Y_train : not used; kept so existing calls keep working.\n",
    "    alpha : additive-smoothing constant.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    defaultdict mapping (w1, w3) to a dict {w2: smoothed probability}.\n",
    "    \"\"\"\n",
    "    counts, vocabulary_size = _count_gap_trigrams(X_train)\n",
    "    return _smooth_counts(counts, vocabulary_size, alpha)\n",
    "\n",
    "\n",
    "def predict_word(before, after, model):\n",
    "    \"\"\"Format the TOP_K most likely gap words between `before` and `after`.\n",
    "\n",
    "    Returns 'word:prob word:prob ... :rest', or DEFAULT_PREDICTION when the\n",
    "    (before, after) pair carries no probability mass.\n",
    "    \"\"\"\n",
    "    # .get() rather than indexing: indexing a defaultdict inserts an empty entry\n",
    "    # for every unseen pair, so the model would keep growing while predicting.\n",
    "    candidates = Counter(model.get((before, after), {})).most_common(TOP_K)\n",
    "    p = 0.0\n",
    "    parts = []\n",
    "    for word, probability in candidates:\n",
    "        p += probability\n",
    "        parts.append(f'{word}:{probability}')\n",
    "    if p == 0.0:\n",
    "        return DEFAULT_PREDICTION\n",
    "    # The remaining mass goes to the empty key, never less than 0.01.\n",
    "    parts.append(f':{max(1 - p, 0.01)}')\n",
    "    return ' '.join(parts)\n",
    "\n",
    "\n",
    "def word_gap_prediction(file, model):\n",
    "    \"\"\"Write one prediction line per row of `<file>/in.tsv.xz` to `<file>/out.tsv`.\"\"\"\n",
    "    # NOTE(review): on_bad_lines='skip' drops malformed rows, which would leave\n",
    "    # out.tsv shorter than the input - confirm the data has no such rows.\n",
    "    X_test = pd.read_csv(f'{file}/in.tsv.xz', sep='\\t', header=None, quoting=csv.QUOTE_NONE, on_bad_lines='skip')\n",
    "    # Missing contexts become '' so they fall back instead of being scored as the token 'nan'.\n",
    "    left_contexts = X_test[6].fillna('')\n",
    "    right_contexts = X_test[7].fillna('')\n",
    "    with open(f'{file}/out.tsv', 'w', encoding='utf-8') as output_file:\n",
    "        for left, right in zip(left_contexts, right_contexts):\n",
    "            before = word_tokenize(preprocess(str(left)))\n",
    "            after = word_tokenize(preprocess(str(right)))\n",
    "            # The model is keyed on one word per side, so one token on each side is enough.\n",
    "            if before and after:\n",
    "                output = predict_word(before[-1], after[0], model)\n",
    "            else:\n",
    "                output = DEFAULT_PREDICTION\n",
    "            output_file.write(output + '\\n')\n",
    "\n",
    "\n",
    "def alpha_tuning(alphas):\n",
    "    \"\"\"For each alpha: smooth the model, predict dev-0 and print the geval score.\n",
    "\n",
    "    Reads the notebook-level X_train defined in the first code cell.\n",
    "    \"\"\"\n",
    "    # Counting does not depend on alpha, so the training rows are tokenized once.\n",
    "    counts, vocabulary_size = _count_gap_trigrams(X_train)\n",
    "    for alpha in alphas:\n",
    "        model = _smooth_counts(counts, vocabulary_size, alpha)\n",
    "        # out.tsv is closed when this call returns, so geval can run right away.\n",
    "        word_gap_prediction('dev-0', model)\n",
    "        print(\"Alpha = \",alpha)\n",
    "        print(\"dev-0 score\")\n",
    "        !./geval -t dev-0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Five alphas per decade; each later grid rescales the first one.\n",
    "alphas = np.round(np.arange(0.1, 0.6, 0.1), 2)\n",
    "alphas2 = np.round(alphas * 0.01, 3)\n",
    "alphas3 = np.round(alphas * 0.001, 4)\n",
    "alphas4 = np.round(alphas * 0.0001, 5)\n",
    "alphas5 = np.round(alphas * 0.00001, 6)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Alpha =  0.1\n",
      "dev-0 score\n",
      "789.71\n",
      "Alpha =  0.2\n",
      "dev-0 score\n",
      "819.57\n",
      "Alpha =  0.3\n",
      "dev-0 score\n",
      "833.52\n",
      "Alpha =  0.4\n",
      "dev-0 score\n",
      "841.93\n",
      "Alpha =  0.5\n",
      "dev-0 score\n",
      "847.66\n"
     ]
    }
   ],
   "source": [
    "alpha_tuning(alphas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Alpha =  0.001\n",
      "dev-0 score\n",
      "472.05\n",
      "Alpha =  0.002\n",
      "dev-0 score\n",
      "519.17\n",
      "Alpha =  0.003\n",
      "dev-0 score\n",
      "548.93\n",
      "Alpha =  0.004\n",
      "dev-0 score\n",
      "570.68\n",
      "Alpha =  0.005\n",
      "dev-0 score\n",
      "587.76\n"
     ]
    }
   ],
   "source": [
    "alpha_tuning(alphas2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Alpha =  0.0001\n",
      "dev-0 score\n",
      "367.28\n",
      "Alpha =  0.0002\n",
      "dev-0 score\n",
      "389.51\n",
      "Alpha =  0.0003\n",
      "dev-0 score\n",
      "406.30\n",
      "Alpha =  0.0004\n",
      "dev-0 score\n",
      "419.89\n",
      "Alpha =  0.0005\n",
      "dev-0 score\n",
      "431.39\n"
     ]
    }
   ],
   "source": [
    "alpha_tuning(alphas3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Alpha =  1e-05\n",
      "dev-0 score\n",
      "350.33\n",
      "Alpha =  2e-05\n",
      "dev-0 score\n",
      "346.35\n",
      "Alpha =  3e-05\n",
      "dev-0 score\n",
      "347.66\n",
      "Alpha =  4e-05\n",
      "dev-0 score\n",
      "350.20\n",
      "Alpha =  5e-05\n",
      "dev-0 score\n",
      "353.09\n"
     ]
    }
   ],
   "source": [
    "alpha_tuning(alphas4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Alpha =  1e-06\n",
      "dev-0 score\n",
      "422.25\n",
      "Alpha =  2e-06\n",
      "dev-0 score\n",
      "390.96\n",
      "Alpha =  3e-06\n",
      "dev-0 score\n",
      "376.49\n",
      "Alpha =  4e-06\n",
      "dev-0 score\n",
      "367.96\n",
      "Alpha =  5e-06\n",
      "dev-0 score\n",
      "362.34\n"
     ]
    }
   ],
   "source": [
    "alpha_tuning(alphas5)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}