434766 plusalpha
This commit is contained in:
parent dc5a5cfe83
commit 6eb5a5160f
15502 dev-0/out.tsv
File diff suppressed because it is too large
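Each of those 15502 lines is one gap prediction in the word:probability format that predict_word() in the notebook below emits: up to seven candidate middle words, followed by a bare ':' entry carrying the leftover probability mass. For example, the fallback line hard-coded in the source:

the:0.04 be:0.04 to:0.04 and:0.02 not:0.02 or:0.02 a:0.02 :0.8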
290 run.ipynb (new file)
@@ -0,0 +1,290 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict, Counter\n",
    "from nltk import trigrams, word_tokenize\n",
    "import csv\n",
    "import regex as re\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import time\n",
    "\n",
    "in_file = 'train/in.tsv.xz'\n",
    "out_file = 'train/expected.tsv'\n",
    "\n",
    "X_train = pd.read_csv(in_file, sep='\\t', header=None, quoting=csv.QUOTE_NONE, nrows=30000, on_bad_lines='skip')\n",
    "Y_train = pd.read_csv(out_file, sep='\\t', header=None, quoting=csv.QUOTE_NONE, nrows=30000, on_bad_lines='skip')\n",
    "\n",
    "X_train = X_train[[6, 7]]\n",
    "X_train = pd.concat([X_train, Y_train], axis=1)\n",
    "X_train['row'] = X_train[6] + X_train[0] + X_train[7]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train(X_train, Y_train, alpha):\n",
    "    model = defaultdict(lambda: defaultdict(lambda: 0))\n",
    "    vocabulary = set()\n",
    "    for _, (_, row) in enumerate(X_train.iterrows()):\n",
    "        text = preprocess(str(row['row']))\n",
    "        words = word_tokenize(text)\n",
    "        for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):\n",
    "            if w1 and w2 and w3:\n",
    "                model[(w1, w3)][w2] += 1\n",
    "                vocabulary.add(w1)\n",
    "                vocabulary.add(w2)\n",
    "                vocabulary.add(w3)\n",
    "\n",
    "    for _, w13 in enumerate(model):\n",
    "        count = float(sum(model[w13].values()))\n",
    "        denominator = count + alpha * len(vocabulary)\n",
    "        for w2 in model[w13]:\n",
    "            nominator = model[w13][w2] + alpha\n",
    "            model[w13][w2] = nominator / denominator\n",
    "    return model\n",
    "\n",
    "def preprocess(row):\n",
    "    row = re.sub(r'\\p{P}', '', row.lower().replace('-\\\\n', '').replace('\\\\n', ' '))\n",
    "    return row\n",
    "\n",
    "def predict_word(before, after, model):\n",
    "    output = ''\n",
    "    p = 0.0\n",
    "    Y_pred = dict(Counter(dict(model[before, after])).most_common(7))\n",
    "    for key, value in Y_pred.items():\n",
    "        p += value\n",
    "        output += f'{key}:{value} '\n",
    "    if p == 0.0:\n",
    "        output = 'the:0.04 be:0.04 to:0.04 and:0.02 not:0.02 or:0.02 a:0.02 :0.8'\n",
    "        return output\n",
    "    output += f':{max(1 - p, 0.01)}'\n",
    "    return output\n",
    "\n",
    "def word_gap_prediction(file, model):\n",
    "    X_test = pd.read_csv(f'{file}/in.tsv.xz', sep='\\t', header=None, quoting=csv.QUOTE_NONE, on_bad_lines='skip')\n",
    "    with open(f'{file}/out.tsv', 'w', encoding='utf-8') as output_file:\n",
    "        for _, row in X_test.iterrows():\n",
    "            before, after = word_tokenize(preprocess(str(row[6]))), word_tokenize(preprocess(str(row[7])))\n",
    "            if len(before) < 2 or len(after) < 2:\n",
    "                output = 'the:0.04 be:0.04 to:0.04 and:0.02 not:0.02 or:0.02 a:0.02 :0.8'\n",
    "            else:\n",
    "                output = predict_word(before[-1], after[0],model)\n",
    "            output_file.write(output + '\\n')\n",
    "\n",
    "def alpha_tuning(alphas):\n",
    "    for alpha in alphas:\n",
    "        model = train(X_train, Y_train, alpha)\n",
    "        word_gap_prediction('dev-0',model)\n",
    "        time.sleep(10)\n",
    "        print(\"Alpha = \",alpha)\n",
    "        print(\"dev-0 score\")\n",
    "        !./geval -t dev-0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "alphas = np.round(np.arange(0.1, 0.6, 0.1).tolist(),2)\n",
    "alphas2 = np.round(alphas * 0.01,3)\n",
    "alphas3 = np.round(alphas * 0.001,4)\n",
    "alphas4 = np.round(alphas * 0.0001,5)\n",
    "alphas5 = np.round(alphas * 0.00001,6)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Alpha = 0.1\n",
      "dev-0 score\n",
      "789.71\n",
      "Alpha = 0.2\n",
      "dev-0 score\n",
      "819.57\n",
      "Alpha = 0.3\n",
      "dev-0 score\n",
      "833.52\n",
      "Alpha = 0.4\n",
      "dev-0 score\n",
      "841.93\n",
      "Alpha = 0.5\n",
      "dev-0 score\n",
      "847.66\n"
     ]
    }
   ],
   "source": [
    "alpha_tuning(alphas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Alpha = 0.001\n",
      "dev-0 score\n",
      "472.05\n",
      "Alpha = 0.002\n",
      "dev-0 score\n",
      "519.17\n",
      "Alpha = 0.003\n",
      "dev-0 score\n",
      "548.93\n",
      "Alpha = 0.004\n",
      "dev-0 score\n",
      "570.68\n",
      "Alpha = 0.005\n",
      "dev-0 score\n",
      "587.76\n"
     ]
    }
   ],
   "source": [
    "alpha_tuning(alphas2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Alpha = 0.0001\n",
      "dev-0 score\n",
      "367.28\n",
      "Alpha = 0.0002\n",
      "dev-0 score\n",
      "389.51\n",
      "Alpha = 0.0003\n",
      "dev-0 score\n",
      "406.30\n",
      "Alpha = 0.0004\n",
      "dev-0 score\n",
      "419.89\n",
      "Alpha = 0.0005\n",
      "dev-0 score\n",
      "431.39\n"
     ]
    }
   ],
   "source": [
    "alpha_tuning(alphas3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Alpha = 1e-05\n",
      "dev-0 score\n",
      "350.33\n",
      "Alpha = 2e-05\n",
      "dev-0 score\n",
      "346.35\n",
      "Alpha = 3e-05\n",
      "dev-0 score\n",
      "347.66\n",
      "Alpha = 4e-05\n",
      "dev-0 score\n",
      "350.20\n",
      "Alpha = 5e-05\n",
      "dev-0 score\n",
      "353.09\n"
     ]
    }
   ],
   "source": [
    "alpha_tuning(alphas4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Alpha = 1e-06\n",
      "dev-0 score\n",
      "422.25\n",
      "Alpha = 2e-06\n",
      "dev-0 score\n",
      "390.96\n",
      "Alpha = 3e-06\n",
      "dev-0 score\n",
      "376.49\n",
      "Alpha = 4e-06\n",
      "dev-0 score\n",
      "367.96\n",
      "Alpha = 5e-06\n",
      "dev-0 score\n",
      "362.34\n"
     ]
    }
   ],
   "source": [
    "alpha_tuning(alphas5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
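The smoothing in train() above is plain add-alpha (Lidstone) estimation over the middle word of a trigram, keyed on the (left word, right word) context pair; predict_word() then reports the top seven candidates and pushes the uncovered mass into the bare ':' entry. A minimal sketch of the arithmetic, using hypothetical toy counts rather than the real training data:

# Add-alpha estimate as computed in train():
# P(w2 | w1, w3) = (count(w1, w2, w3) + alpha) / (count(w1, _, w3) + alpha * len(vocabulary))
counts = {('in', 'city'): {'the': 3, 'this': 1}}   # toy stand-in for model[(w1, w3)][w2]
vocabulary = {'in', 'the', 'this', 'city'}
alpha = 0.00002

context = ('in', 'city')
total = float(sum(counts[context].values()))        # 4.0
denominator = total + alpha * len(vocabulary)
probs = {w2: (c + alpha) / denominator for w2, c in counts[context].items()}
print(probs)  # 'the' -> ~0.75, 'this' -> ~0.25; an unseen w2 would get alpha / denominator

The grid search walks alpha down five orders of magnitude; the best dev-0 score, 346.35, lands at alpha = 2e-05, which is the value the updated run.py below hard-codes.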
38 run.py
@@ -3,39 +3,45 @@ from nltk import trigrams, word_tokenize
 import csv
 import regex as re
 import pandas as pd
+import numpy as np
+import time
 
 in_file = 'train/in.tsv.xz'
 out_file = 'train/expected.tsv'
 
-X_train = pd.read_csv(in_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, nrows=10000, error_bad_lines=False)
-Y_train = pd.read_csv(out_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, nrows=10000, error_bad_lines=False)
+X_train = pd.read_csv(in_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, nrows=70000, on_bad_lines='skip')
+Y_train = pd.read_csv(out_file, sep='\t', header=None, quoting=csv.QUOTE_NONE, nrows=70000, on_bad_lines='skip')
 
 X_train = X_train[[6, 7]]
 X_train = pd.concat([X_train, Y_train], axis=1)
 X_train['row'] = X_train[6] + X_train[0] + X_train[7]
 
-def train(X_train, Y_train):
+def train(X_train, Y_train, alpha):
     model = defaultdict(lambda: defaultdict(lambda: 0))
+    vocabulary = set()
     for _, (_, row) in enumerate(X_train.iterrows()):
         text = preprocess(str(row['row']))
         words = word_tokenize(text)
         for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
             if w1 and w2 and w3:
                 model[(w1, w3)][w2] += 1
+                vocabulary.add(w1)
+                vocabulary.add(w2)
+                vocabulary.add(w3)
 
     for _, w13 in enumerate(model):
-        count = sum(model[w13].values())
+        count = float(sum(model[w13].values()))
+        denominator = count + alpha * len(vocabulary)
         for w2 in model[w13]:
-            model[w13][w2] += 0.25
-            model[w13][w2] /= float(count + 0.25 + len(w2))
+            nominator = model[w13][w2] + alpha
+            model[w13][w2] = nominator / denominator
 
     return model
 
 def preprocess(row):
     row = re.sub(r'\p{P}', '', row.lower().replace('-\\n', '').replace('\\n', ' '))
     return row
 
-def predict_word(before, after):
+def predict_word(before, after, model):
     output = ''
     p = 0.0
     Y_pred = dict(Counter(dict(model[before, after])).most_common(7))
@@ -48,17 +54,19 @@ def predict_word(before, after):
     output += f':{max(1 - p, 0.01)}'
     return output
 
-def word_gap_prediction(file):
-    X_test = pd.read_csv(f'{file}/in.tsv.xz', sep='\t', header=None, quoting=csv.QUOTE_NONE, error_bad_lines=False)
+def word_gap_prediction(file, model):
+    X_test = pd.read_csv(f'{file}/in.tsv.xz', sep='\t', header=None, quoting=csv.QUOTE_NONE, on_bad_lines='skip')
     with open(f'{file}/out.tsv', 'w', encoding='utf-8') as output_file:
         for _, row in X_test.iterrows():
             before, after = word_tokenize(preprocess(str(row[6]))), word_tokenize(preprocess(str(row[7])))
-            if len(before) < 3 or len(after) < 3:
+            if len(before) < 2 or len(after) < 2:
                 output = 'the:0.04 be:0.04 to:0.04 and:0.02 not:0.02 or:0.02 a:0.02 :0.8'
             else:
-                output = predict_word(before[-1], after[0])
+                output = predict_word(before[-1], after[0],model)
             output_file.write(output + '\n')
 
-model = train(X_train, Y_train)
-word_gap_prediction('dev-0')
-word_gap_prediction('test-A')
+alpha = 0.00002
+model = train(X_train, Y_train, alpha)
+word_gap_prediction('dev-0', model)
+word_gap_prediction('test-A',model)
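Two notes on these changes. First, error_bad_lines=False was deprecated in pandas 1.3 and removed in pandas 2.0, so on_bad_lines='skip' is its current replacement. Second, alpha = 0.00002 is the best value found by the notebook's grid search. If the script also had to run on a pre-1.3 pandas, a tolerant wrapper is one option; a sketch only, with read_tsv as a hypothetical helper, not part of this commit:

import pandas as pd

def read_tsv(path, **kwargs):
    # pandas >= 1.3 understands on_bad_lines; older releases only accept error_bad_lines.
    try:
        return pd.read_csv(path, sep='\t', header=None, on_bad_lines='skip', **kwargs)
    except TypeError:
        return pd.read_csv(path, sep='\t', header=None, error_bad_lines=False, **kwargs)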
10302 test-A/out.tsv
File diff suppressed because it is too large