This commit is contained in:
JulianZablonski 2023-04-23 00:01:17 +02:00
commit 9e1eeced06
16 changed files with 910733 additions and 0 deletions

8
.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token

9
README.md Normal file
View File

@ -0,0 +1,9 @@
Challenging America word-gap prediction
===================================
Guess a word in a gap.
Evaluation metric
-----------------
PerplexityHashed is the metric (as configured in config.txt)

1
config.txt Normal file
View File

@ -0,0 +1 @@
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv

10519
dev-0/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

10519
dev-0/hate-speech-info.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
dev-0/in.tsv.xz Normal file

Binary file not shown.

10519
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

1
in-header.tsv Normal file
View File

@ -0,0 +1 @@
FileId Year LeftContext RightContext
1 FileId Year LeftContext RightContext

284
main.ipynb Normal file
View File

@ -0,0 +1,284 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import lzma\n",
"import pickle\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def clean_line(line):\n",
"    \"\"\"Extract the left/right contexts (TSV columns 6 and 7) from a raw\n",
"    input line and join them into one text, turning the literal '\\n'\n",
"    markers used by the corpus into spaces.\"\"\"\n",
"    # Split once instead of twice (the original split the same line per field).\n",
"    fields = line.split('\\t')\n",
"    prefix = fields[6].replace(r'\\n', ' ')\n",
"    suffix = fields[7].replace(r'\\n', ' ')\n",
"    return f'{prefix} {suffix}'\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def get_words(filename, total_lines=432022):\n",
"    \"\"\"Yield every whitespace-separated token from the contexts of an\n",
"    xz-compressed TSV file, printing an in-place progress indicator.\n",
"\n",
"    total_lines: expected number of lines, used only for the progress\n",
"    percentage (defaults to the train-set size that was hard-coded here).\"\"\"\n",
"    with lzma.open(filename, mode='rt', encoding='utf-8') as file:\n",
"        print('Words')\n",
"        # enumerate replaces the manual count variable.\n",
"        for count, line in enumerate(file, start=1):\n",
"            print(f'\\rProgress: {(count / total_lines * 100):2f}%', end='')\n",
"            text = clean_line(line)\n",
"            yield from text.split()\n",
"        print()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def get_bigrams(filename, V, total_lines=432022):\n",
"    \"\"\"Yield (first, second) word pairs from the contexts of an\n",
"    xz-compressed TSV file; words missing from vocabulary V become 'UNK'.\n",
"\n",
"    total_lines: expected line count, used only for the progress display.\"\"\"\n",
"    with lzma.open(filename, mode='rt', encoding='utf-8') as file:\n",
"        print('Bigrams')\n",
"        for count, line in enumerate(file, start=1):\n",
"            print(f'\\rProgress: {(count / total_lines * 100):2f}%', end='')\n",
"            text = clean_line(line)\n",
"            first_word = ''\n",
"            for second_word in text.split():\n",
"                if V.get(second_word) is None:\n",
"                    second_word = 'UNK'\n",
"                # Skip the pair at the start of a line, mirroring the guard\n",
"                # in get_trigrams; the previous check (if second_word:) was\n",
"                # always true and emitted spurious ('', word) pairs.\n",
"                if first_word:\n",
"                    yield first_word, second_word\n",
"                first_word = second_word\n",
"        print()\n",
"\n",
"def get_trigrams(filename, V, total_lines=432022):\n",
"    \"\"\"Yield (first, second, third) word triples from the contexts of an\n",
"    xz-compressed TSV file; words missing from vocabulary V become 'UNK'.\n",
"\n",
"    total_lines: expected line count, used only for the progress display.\"\"\"\n",
"    with lzma.open(filename, mode='rt', encoding='utf-8') as file:\n",
"        print('Trigrams')\n",
"        for count, line in enumerate(file, start=1):\n",
"            print(f'\\rProgress: {(count / total_lines * 100):2f}%', end='')\n",
"            text = clean_line(line)\n",
"            first_word = ''\n",
"            second_word = ''\n",
"            for third_word in text.split():\n",
"                if V.get(third_word) is None:\n",
"                    third_word = 'UNK'\n",
"                # Only yield once a full trigram is available.\n",
"                if first_word:\n",
"                    yield first_word, second_word, third_word\n",
"                first_word = second_word\n",
"                second_word = third_word\n",
"        print()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Words\n",
"Progress: 100.000000%\n"
]
}
],
"source": [
"# Build the vocabulary: keep the WORD_LIMIT most frequent words and pool\n",
"# all remaining occurrences under the 'UNK' pseudo-word.\n",
"WORD_LIMIT = 3000\n",
"V = Counter(get_words('train/in.tsv.xz'))\n",
"V_common_dict = dict(V.most_common(WORD_LIMIT))\n",
"UNK = sum(value for key, value in V.items() if key not in V_common_dict)\n",
"V_common_dict['UNK'] = UNK\n",
"\n",
"# Persist the vocabulary, then reload it so later cells use exactly what\n",
"# was written to disk.\n",
"with open('V.pickle', 'wb') as handle:\n",
"    pickle.dump(V_common_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
"with open('V.pickle', 'rb') as handle:\n",
"    V_common_dict = pickle.load(handle)\n",
"\n",
"total = sum(V_common_dict.values())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Bigrams\n",
"Progress: 100.000000%\n"
]
}
],
"source": [
"V2 = Counter(get_bigrams('train/in.tsv.xz', V_common_dict))\n",
"V2_dict = dict(V2)\n",
"with open('V2.pickle', 'wb') as handle:\n",
" pickle.dump(V2_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
"\n",
"with open('V2.pickle', 'rb') as handle:\n",
" V2_dict = pickle.load(handle)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Trigrams\n",
"Progress: 100.000000%\n"
]
}
],
"source": [
"V3 = Counter(get_trigrams('train/in.tsv.xz', V_common_dict))\n",
"V3_dict = dict(V3)\n",
"with open('V3.pickle', 'wb') as handle:\n",
" pickle.dump(V3_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
"\n",
"with open('V3.pickle', 'rb') as handle:\n",
" V3_dict = pickle.load(handle)\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"\n",
"def calculate_probability(first_word, second_word=None, third_word=None):\n",
"    \"\"\"Maximum-likelihood n-gram probability estimate.\n",
"\n",
"    With one word: unigram probability. With two: P(second | first).\n",
"    With three: P(third | first, second). Any n-gram absent from the\n",
"    count tables yields 0.\"\"\"\n",
"    try:\n",
"        if second_word is None:\n",
"            return V_common_dict[first_word] / total\n",
"        if third_word is None:\n",
"            return V2_dict[(first_word, second_word)] / V_common_dict[first_word]\n",
"        # Full trigram estimate.\n",
"        return V3_dict[(first_word, second_word, third_word)] / V2_dict[(first_word, second_word)]\n",
"    except KeyError:\n",
"        return 0"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def smoothed(trigrams):\n",
"    \"\"\"Linearly interpolated trigram probability (weights sum to 1.0).\"\"\"\n",
"    first, second, third = trigrams\n",
"    return (0.6 * calculate_probability(first, second, third)\n",
"            + 0.25 * calculate_probability(second, third)\n",
"            + 0.15 * calculate_probability(third))\n",
"\n",
"\n",
"def candidates(left_context, right_context):\n",
"    \"\"\"Score every vocabulary word as the gap filler and format the top 5\n",
"    as 'word:prob' pairs; the probability mass assigned to 'UNK' (or to\n",
"    the weakest candidate) is reported under the empty word.\n",
"\n",
"    left_context / right_context: exactly two words each, already mapped\n",
"    to 'UNK' when out of vocabulary.\"\"\"\n",
"    first, second = left_context\n",
"    fourth, fifth = right_context\n",
"    cand = {}\n",
"    for word in V_common_dict:\n",
"        p1 = smoothed((first, second, word))\n",
"        p2 = smoothed((second, word, fourth))\n",
"        p3 = smoothed((word, fourth, fifth))\n",
"        cand[word] = p1 * p2 * p3\n",
"    top = sorted(cand.items(), key=lambda x: x[1], reverse=True)[:5]\n",
"    # Hoist the normalizing sum out of the comprehension (it was\n",
"    # recomputed for every candidate) and guard against an all-zero mass,\n",
"    # which previously raised ZeroDivisionError.\n",
"    mass = sum(p for _, p in top)\n",
"    if mass > 0:\n",
"        norm = [(w, p / mass) for w, p in top]\n",
"    else:\n",
"        norm = [(w, 1 / len(top)) for w, _ in top]\n",
"    # Move the mass of 'UNK' onto the empty word; if 'UNK' is not among\n",
"    # the top candidates, sacrifice the weakest one instead.\n",
"    for index, (w, p) in enumerate(norm):\n",
"        if w == 'UNK':\n",
"            norm.pop(index)\n",
"            norm.append(('', p))\n",
"            break\n",
"    else:\n",
"        norm[-1] = ('', norm[-1][1])\n",
"    return ' '.join(f'{w}:{p}' for w, p in norm)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"def create_outputs(folder_name):\n",
"    \"\"\"Write word-gap predictions for every row of folder_name/in.tsv.xz\n",
"    into folder_name/out.tsv, one 'word:prob ...' line per input row.\"\"\"\n",
"    print(f'Creating outputs in {folder_name}')\n",
"    with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid, \\\n",
"            open(f'{folder_name}/out.tsv', 'w', encoding='utf-8') as f:\n",
"        for line in fid:\n",
"            separated = line.split('\\t')\n",
"            prefix = separated[6].replace(r'\\n', ' ').split()\n",
"            suffix = separated[7].replace(r'\\n', ' ').split()\n",
"            # Membership test (not value truthiness, as before) so a\n",
"            # zero-count vocabulary entry is not misread as OOV; matches\n",
"            # the 'is None' checks used by the n-gram extractors.\n",
"            left_context = [x if x in V_common_dict else 'UNK' for x in prefix[-2:]]\n",
"            right_context = [x if x in V_common_dict else 'UNK' for x in suffix[:2]]\n",
"            f.write(candidates(left_context, right_context) + '\\n')\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Creating outputs in dev-0\n",
"Creating outputs in test-A\n"
]
}
],
"source": [
"# Generate predictions for both evaluation splits.\n",
"for folder in ['dev-0', 'test-A']:\n",
"    create_outputs(folder)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

1
out-header.tsv Normal file
View File

@ -0,0 +1 @@
Word
1 Word

7414
test-A/hate-speech-info.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
test-A/in.tsv.xz Normal file

Binary file not shown.

7414
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

432022
train/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

432022
train/hate-speech-info.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
train/in.tsv.xz Normal file

Binary file not shown.