commit 9e1eeced06
.gitignore (vendored, new file, 8 lines)
@@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token
README.md (new file, 9 lines)
@@ -0,0 +1,9 @@
Challenging America word-gap prediction
===================================

Guess a word in a gap.

Evaluation metric
-----------------

LikelihoodHashed is the metric
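For context, here is a minimal sketch of the prediction format this challenge grades, inferred from the notebook's `candidates()` function further down: each `out.tsv` line carries a few `word:probability` guesses, and an empty word reserves the leftover probability mass for all other words. The helper `format_predictions` is hypothetical, not part of the repo.

```python
# Hypothetical helper sketching one out.tsv line, inferred from candidates() in main.ipynb.
def format_predictions(guesses, rest):
    # guesses: {word: probability}; rest: mass left for every word not listed
    pairs = ' '.join(f'{word}:{prob}' for word, prob in guesses.items())
    return f'{pairs} :{rest}'

print(format_predictions({'the': 0.5, 'a': 0.3}, 0.2))
# -> the:0.5 a:0.3 :0.2
```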
config.txt (new file, 1 line)
@@ -0,0 +1 @@
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv
dev-0/expected.tsv (new file, 10519 lines)
File diff suppressed because it is too large
dev-0/hate-speech-info.tsv (new file, 10519 lines)
File diff suppressed because it is too large
dev-0/in.tsv.xz (new binary file)
Binary file not shown.
dev-0/out.tsv (new file, 10519 lines)
File diff suppressed because it is too large
in-header.tsv (new file, 1 line)
@@ -0,0 +1 @@
FileId	Year	LeftContext	RightContext
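A sketch of pulling the two contexts out of one input line. Note an assumption here: in-header.tsv names four columns, but the notebook below reads fields 6 and 7, which suggests the actual in.tsv carries extra metadata columns; this sketch simply mirrors the notebook's `clean_line`. The function name `read_contexts` is hypothetical.

```python
import lzma

def read_contexts(path):
    # Yield (left_words, right_words) per line; fields 6 and 7 are assumed to be
    # LeftContext and RightContext, as in main.ipynb's clean_line().
    with lzma.open(path, mode='rt', encoding='utf-8') as file:
        for line in file:
            fields = line.rstrip('\n').split('\t')
            left = fields[6].replace(r'\n', ' ').split()   # words before the gap
            right = fields[7].replace(r'\n', ' ').split()  # words after the gap
            yield left, right

# Usage: next(read_contexts('dev-0/in.tsv.xz'))
```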
main.ipynb (new file, 284 lines)
@@ -0,0 +1,284 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import lzma\n",
    "import pickle\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_line(line):\n",
    "    # Fields 6 and 7 hold the text before and after the gap; the literal\n",
    "    # two-character sequence \\n inside them is flattened to a space.\n",
    "    fields = line.split('\\t')\n",
    "    prefix = fields[6].replace(r'\\n', ' ')\n",
    "    suffix = fields[7].replace(r'\\n', ' ')\n",
    "    return f'{prefix} {suffix}'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_words(filename):\n",
    "    # Stream whitespace-separated tokens from the xz-compressed TSV;\n",
    "    # 432022 is the line count of train/in.tsv.xz, used only for progress display.\n",
    "    with lzma.open(filename, mode='rt', encoding='utf-8') as file:\n",
    "        count = 1\n",
    "        print('Words')\n",
    "        for line in file:\n",
    "            print(f'\\rProgress: {(count / 432022 * 100):2f}%', end='')\n",
    "            text = clean_line(line)\n",
    "            for word in text.split():\n",
    "                yield word\n",
    "            count += 1\n",
    "        print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_bigrams(filename, V):\n",
    "    # Words outside the vocabulary V are replaced by 'UNK' before counting.\n",
    "    # The guard on first_word (mirroring get_trigrams) skips the artificial\n",
    "    # empty start token, so only real word pairs are yielded.\n",
    "    with lzma.open(filename, mode='rt', encoding='utf-8') as file:\n",
    "        count = 1\n",
    "        print('Bigrams')\n",
    "        for line in file:\n",
    "            print(f'\\rProgress: {(count / 432022 * 100):2f}%', end='')\n",
    "            text = clean_line(line)\n",
    "            first_word = ''\n",
    "            for second_word in text.split():\n",
    "                if V.get(second_word) is None:\n",
    "                    second_word = 'UNK'\n",
    "                if first_word:\n",
    "                    yield first_word, second_word\n",
    "                first_word = second_word\n",
    "            count += 1\n",
    "        print()\n",
    "\n",
    "def get_trigrams(filename, V):\n",
    "    with lzma.open(filename, mode='rt', encoding='utf-8') as file:\n",
    "        count = 1\n",
    "        print('Trigrams')\n",
    "        for line in file:\n",
    "            print(f'\\rProgress: {(count / 432022 * 100):2f}%', end='')\n",
    "            text = clean_line(line)\n",
    "            first_word = ''\n",
    "            second_word = ''\n",
    "            for third_word in text.split():\n",
    "                if V.get(third_word) is None:\n",
    "                    third_word = 'UNK'\n",
    "                if first_word:\n",
    "                    yield first_word, second_word, third_word\n",
    "                first_word = second_word\n",
    "                second_word = third_word\n",
    "            count += 1\n",
    "        print()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Words\n",
      "Progress: 100.000000%\n"
     ]
    }
   ],
   "source": [
    "# Keep the WORD_LIMIT most frequent words; pool everything else under 'UNK'.\n",
    "WORD_LIMIT = 3000\n",
    "V = Counter(get_words('train/in.tsv.xz'))\n",
    "V_common_dict = dict(V.most_common(WORD_LIMIT))\n",
    "UNK = 0\n",
    "for key, value in V.items():\n",
    "    if V_common_dict.get(key) is None:\n",
    "        UNK += value\n",
    "V_common_dict['UNK'] = UNK\n",
    "with open('V.pickle', 'wb') as handle:\n",
    "    pickle.dump(V_common_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
    "\n",
    "with open('V.pickle', 'rb') as handle:\n",
    "    V_common_dict = pickle.load(handle)\n",
    "\n",
    "total = sum(V_common_dict.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bigrams\n",
      "Progress: 100.000000%\n"
     ]
    }
   ],
   "source": [
    "V2 = Counter(get_bigrams('train/in.tsv.xz', V_common_dict))\n",
    "V2_dict = dict(V2)\n",
    "with open('V2.pickle', 'wb') as handle:\n",
    "    pickle.dump(V2_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
    "\n",
    "with open('V2.pickle', 'rb') as handle:\n",
    "    V2_dict = pickle.load(handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Trigrams\n",
      "Progress: 100.000000%\n"
     ]
    }
   ],
   "source": [
    "V3 = Counter(get_trigrams('train/in.tsv.xz', V_common_dict))\n",
    "V3_dict = dict(V3)\n",
    "with open('V3.pickle', 'wb') as handle:\n",
    "    pickle.dump(V3_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
    "\n",
    "with open('V3.pickle', 'rb') as handle:\n",
    "    V3_dict = pickle.load(handle)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_probability(first_word, second_word=None, third_word=None):\n",
    "    # Maximum-likelihood estimate: unigram, bigram or trigram depending on\n",
    "    # how many words are given; unseen events fall back to probability 0.\n",
    "    try:\n",
    "        if second_word is None:\n",
    "            return V_common_dict[first_word] / total\n",
    "        if third_word is None:\n",
    "            return V2_dict[(first_word, second_word)] / V_common_dict[first_word]\n",
    "        return V3_dict[(first_word, second_word, third_word)] / V2_dict[(first_word, second_word)]\n",
    "    except KeyError:\n",
    "        return 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def smoothed(trigram):\n",
    "    # Linear interpolation of the trigram, bigram and unigram estimates.\n",
    "    first, second, third = trigram\n",
    "    return (0.6 * calculate_probability(first, second, third)\n",
    "            + 0.25 * calculate_probability(second, third)\n",
    "            + 0.15 * calculate_probability(third))\n",
    "\n",
    "\n",
    "def candidates(left_context, right_context):\n",
    "    # Score every vocabulary word as a gap filler: multiply the smoothed\n",
    "    # probabilities of the three trigram windows that contain the gap.\n",
    "    cand = {}\n",
    "    first, second = left_context\n",
    "    fourth, fifth = right_context\n",
    "    for word in V_common_dict:\n",
    "        p1 = smoothed((first, second, word))\n",
    "        p2 = smoothed((second, word, fourth))\n",
    "        p3 = smoothed((word, fourth, fifth))\n",
    "        cand[word] = p1 * p2 * p3\n",
    "    cand = sorted(cand.items(), key=lambda x: x[1], reverse=True)[:5]\n",
    "    top_mass = sum(prob for _, prob in cand)\n",
    "    norm = [(word, prob / top_mass) for word, prob in cand]\n",
    "    # Reassign the 'UNK' share to the empty word, which stands for\n",
    "    # \"any other word\" in the output format.\n",
    "    unk = None\n",
    "    for index, elem in enumerate(norm):\n",
    "        if elem[0] == 'UNK':\n",
    "            unk = norm.pop(index)\n",
    "            norm.append(('', unk[1]))\n",
    "            break\n",
    "    if unk is None:\n",
    "        norm[-1] = ('', norm[-1][1])\n",
    "    return ' '.join([f'{x[0]}:{x[1]}' for x in norm])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_outputs(folder_name):\n",
    "    print(f'Creating outputs in {folder_name}')\n",
    "    with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:\n",
    "        with open(f'{folder_name}/out.tsv', 'w', encoding='utf-8') as f:\n",
    "            for line in fid:\n",
    "                separated = line.split('\\t')\n",
    "                prefix = separated[6].replace(r'\\n', ' ').split()\n",
    "                suffix = separated[7].replace(r'\\n', ' ').split()\n",
    "                # Use the last two words before the gap and the first two after it,\n",
    "                # mapping out-of-vocabulary words to 'UNK'.\n",
    "                left_context = [x if V_common_dict.get(x) else 'UNK' for x in prefix[-2:]]\n",
    "                right_context = [x if V_common_dict.get(x) else 'UNK' for x in suffix[:2]]\n",
    "                w = candidates(left_context, right_context)\n",
    "                f.write(w + '\\n')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Creating outputs in dev-0\n",
      "Creating outputs in test-A\n"
     ]
    }
   ],
   "source": [
    "create_outputs('dev-0')\n",
    "create_outputs('test-A')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
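To make the smoothing arithmetic above concrete, here is a toy re-run of what `smoothed()` computes: a linear interpolation of trigram, bigram and unigram maximum-likelihood estimates with weights 0.6, 0.25 and 0.15. The counts are made up for illustration; `p` mirrors `calculate_probability` from the notebook.

```python
# Toy counts, invented for illustration only.
unigrams = {'the': 50, 'cat': 10, 'sat': 8}
bigrams = {('the', 'cat'): 6, ('cat', 'sat'): 4}
trigrams = {('the', 'cat', 'sat'): 3}
total = sum(unigrams.values())

def p(first, second=None, third=None):
    # MLE estimates with a hard zero for unseen events, as in calculate_probability.
    try:
        if second is None:
            return unigrams[first] / total
        if third is None:
            return bigrams[(first, second)] / unigrams[first]
        return trigrams[(first, second, third)] / bigrams[(first, second)]
    except KeyError:
        return 0

# smoothed(('the', 'cat', 'sat')) =
#   0.6 * P(sat|the,cat) + 0.25 * P(sat|cat) + 0.15 * P(sat)
score = 0.6 * p('the', 'cat', 'sat') + 0.25 * p('cat', 'sat') + 0.15 * p('sat')
print(score)  # 0.6*0.5 + 0.25*0.4 + 0.15*(8/68) ≈ 0.4176
```

`candidates()` then multiplies three such scores, one for each trigram window containing the gap word, to rank the fillers.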
out-header.tsv (new file, 1 line)
@@ -0,0 +1 @@
Word
test-A/hate-speech-info.tsv (new file, 7414 lines)
File diff suppressed because it is too large
test-A/in.tsv.xz (new binary file)
Binary file not shown.
test-A/out.tsv (new file, 7414 lines)
File diff suppressed because it is too large
train/expected.tsv (new file, 432022 lines)
File diff suppressed because it is too large
train/hate-speech-info.tsv (new file, 432022 lines)
File diff suppressed because it is too large
train/in.tsv.xz (new binary file)
Binary file not shown.