Compare commits
15 Commits
Author | SHA1 | Date | |
---|---|---|---|
2b76575236 | |||
|
380ef29e71 | ||
4c12c0ab0c | |||
0c68c7fb35 | |||
8edfd77c57 | |||
dbe4dd56ac | |||
3898744e06 | |||
|
42b14b840c | ||
|
035ee66c44 | ||
|
aaccbbeb06 | ||
39c1f3a341 | |||
bb121718aa | |||
9332c1957b | |||
|
d877969ac2 | ||
|
2a4ab01f29 |
10
README.md
10
README.md
@ -1,9 +1 @@
|
|||||||
Challenging America word-gap prediction
|
# Bugfixy inferencji i wstawienie lepszych wyników 24.05.23.
|
||||||
===================================
|
|
||||||
|
|
||||||
Guess a word in a gap.
|
|
||||||
|
|
||||||
Evaluation metric
|
|
||||||
-----------------
|
|
||||||
|
|
||||||
LikelihoodHashed is the metric
|
|
||||||
|
10519
dev-0/in.tsv
10519
dev-0/in.tsv
File diff suppressed because it is too large
Load Diff
BIN
dev-0/in.tsv.xz
Normal file
BIN
dev-0/in.tsv.xz
Normal file
Binary file not shown.
21038
dev-0/out.tsv
21038
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
@ -1,11 +0,0 @@
|
|||||||
import sys
|
|
||||||
|
|
||||||
# Baseline distributions, written verbatim as rows of out.tsv.
CAPITALIZED_PREDICTION = 'the:0.9 :0.1\n'
DEFAULT_PREDICTION = 'the:0.4 a:0.4 :0.2\n'


def predict_line(line):
    """Return the baseline prediction row for one tab-separated input line.

    Only the first character of the last column is inspected: an upper-case
    character selects CAPITALIZED_PREDICTION, anything else (including an
    empty last column) selects DEFAULT_PREDICTION.
    """
    last_column = line.split('\t')[-1]
    # Slice instead of index: an empty last column (e.g. a final line that
    # ends in a tab without a newline) used to raise IndexError.
    if last_column[:1].isupper():
        return CAPITALIZED_PREDICTION
    return DEFAULT_PREDICTION


def main(argv):
    """Write one prediction per line of the file named by argv[1] to out.tsv.

    Raises:
        SystemExit: when no input path is given on the command line.
    """
    if len(argv) < 2:
        raise SystemExit(f'usage: {argv[0]} <in.tsv>')
    with open(argv[1], encoding='utf-8') as f_in, \
            open('out.tsv', 'w', encoding='utf-8') as f_out:
        f_out.writelines(predict_line(line) for line in f_in)


if __name__ == '__main__':
    main(sys.argv)
|
|
9
generations.txt
Normal file
9
generations.txt
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
According to recent news and a half of the north side or the other must be a man of great force of character to the country. He was a member of a family in the United States to the full amount of the principal and interest of this section shall be subject to the
|
||||||
|
|
||||||
|
Recent studies have shown that the present condition of things in which we have been in a very short time after the war and the war was over and that every man who has been in the hands of the United States of the North and the South American States, and those States who had
|
||||||
|
|
||||||
|
Today I was taking a stroll in the park when suddenly and that the said estate has by no Mr. ii him, but he was too young to be a very good reason that the above named de- - . . . They are able six e of a good deal of time and money to the amount of the tax
|
||||||
|
|
||||||
|
The most unbelievable story ever told goes like this to be the most important of these are the only two men who were at the time of the year when they tried an 1 the said sum of money to be paid in case of your South and the West may have to do the work of the committee
|
||||||
|
|
||||||
|
he war between natural and the few who are not in the interest of the said William H. and acres of them, more o the State from the control of the state of New York and New York and New York are the looked for food in the greatest number of the most
|
11
gonito.yaml
Normal file
11
gonito.yaml
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
description: neural network with bigrams
|
||||||
|
tags:
|
||||||
|
- neural-network
|
||||||
|
- left-context
|
||||||
|
- bigrams
|
||||||
|
params:
|
||||||
|
vocab_size: 20000
|
||||||
|
embed_size: 150
|
||||||
|
batch_size: 5000
|
||||||
|
param-files:
|
||||||
|
- "*.yaml"
|
286
solution.ipynb
Normal file
286
solution.ipynb
Normal file
@ -0,0 +1,286 @@
|
|||||||
|
{
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"provenance": []
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"name": "python3",
|
||||||
|
"display_name": "Python 3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python"
|
||||||
|
},
|
||||||
|
"accelerator": "GPU",
|
||||||
|
"gpuClass": "standard"
|
||||||
|
},
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"from torchtext.vocab import build_vocab_from_iterator\n",
|
||||||
|
"import pickle\n",
|
||||||
|
"from torch.utils.data import IterableDataset\n",
|
||||||
|
"import itertools\n",
|
||||||
|
"from torch import nn\n",
|
||||||
|
"import torch\n",
|
||||||
|
"import lzma\n",
|
||||||
|
"from torch.utils.data import DataLoader\n",
|
||||||
|
"from tqdm import tqdm"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "WnglOFA8gGJl"
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
"def simple_preprocess(line):\n",
"    \"\"\"Return *line* with every literal backslash-n sequence replaced by a space.\"\"\"\n",
"    escaped_newline = r'\\n'\n",
"    return ' '.join(line.split(escaped_newline))\n",
|
||||||
|
"\n",
|
||||||
"def get_words_from_line(line):\n",
"    \"\"\"Yield '<s>', each whitespace-separated token of *line*, then '</s>'.\"\"\"\n",
"    cleaned = simple_preprocess(line.strip())\n",
"    yield '<s>'\n",
"    yield from cleaned.split()\n",
"    yield '</s>'\n",
|
||||||
|
"\n",
|
||||||
"def get_word_lines_from_file(file_name, n_size=-1):\n",
"    \"\"\"Yield one token generator per line of an xz-compressed text file.\n",
"\n",
"    Reading stops after n_size lines; a value below 1 (the default is -1)\n",
"    never matches the line counter, so the whole file is read.\n",
"    \"\"\"\n",
"    with lzma.open(file_name, 'r') as compressed:\n",
"        for line_number, raw_line in enumerate(compressed, start=1):\n",
"            yield get_words_from_line(raw_line.decode('utf-8'))\n",
"            if line_number == n_size:\n",
"                break\n",
|
||||||
|
"\n",
|
||||||
"def look_ahead_iterator(gen):\n",
"    \"\"\"Yield (previous, current) pairs for consecutive items of *gen*.\n",
"\n",
"    No pair is produced while the previous item is None, which also covers\n",
"    the very first item.\n",
"    \"\"\"\n",
"    previous = None\n",
"    for current in gen:\n",
"        if previous is None:\n",
"            previous = current\n",
"            continue\n",
"        yield previous, current\n",
"        previous = current\n",
|
||||||
|
"\n",
|
||||||
"def build_vocab(file, vocab_size):\n",
"    \"\"\"Return the vocabulary for *file*, cached on disk per vocab_size.\n",
"\n",
"    A pickled vocabulary from an earlier run is reused when it can be loaded;\n",
"    otherwise the vocabulary is built from the corpus with max_tokens set to\n",
"    vocab_size and '<unk>' as the only special token, then pickled.\n",
"    \"\"\"\n",
"    cache_path = f'bigram_nn_vocab_{vocab_size}.pickle'\n",
"    try:\n",
"        # NOTE: pickle.load can execute arbitrary code; only load cache files\n",
"        # that this notebook wrote itself.\n",
"        with open(cache_path, 'rb') as handle:\n",
"            return pickle.load(handle)\n",
"    except Exception:\n",
"        # Missing or unreadable cache: rebuild below. Unlike the former bare\n",
"        # except, this lets KeyboardInterrupt and SystemExit propagate.\n",
"        pass\n",
"    vocab = build_vocab_from_iterator(\n",
"        get_word_lines_from_file(file),\n",
"        max_tokens=vocab_size,\n",
"        specials=['<unk>'])\n",
"    with open(cache_path, 'wb') as handle:\n",
"        pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)\n",
"    return vocab\n",
|
||||||
|
"\n",
|
||||||
"class Bigrams(IterableDataset):\n",
"    \"\"\"Iterable dataset of consecutive token-index pairs from a corpus file.\"\"\"\n",
"\n",
"    def __init__(self, text_file, vocabulary_size):\n",
"        # Obtain the vocabulary through build_vocab instead of reading a\n",
"        # notebook-global `vocab`, which raised NameError unless another cell\n",
"        # had assigned it first.\n",
"        self.vocab = build_vocab(text_file, vocabulary_size)\n",
"        # Tokens outside the vocabulary are looked up as '<unk>'.\n",
"        self.vocab.set_default_index(self.vocab['<unk>'])\n",
"        self.vocabulary_size = vocabulary_size\n",
"        self.text_file = text_file\n",
"\n",
"    def __iter__(self):\n",
"        \"\"\"Return an iterator over (previous index, next index) pairs.\"\"\"\n",
"        tokens = itertools.chain.from_iterable(\n",
"            get_word_lines_from_file(self.text_file))\n",
"        return look_ahead_iterator(self.vocab[token] for token in tokens)\n",
|
||||||
|
"\n",
|
||||||
"class SimpleBigramNeuralLanguageModel(nn.Module):\n",
"    \"\"\"Embedding -> Linear -> Softmax stack over the vocabulary.\"\"\"\n",
"\n",
"    def __init__(self, vocabulary_size, embedding_size):\n",
"        super().__init__()\n",
"        layers = [\n",
"            nn.Embedding(vocabulary_size, embedding_size),\n",
"            nn.Linear(embedding_size, vocabulary_size),\n",
"            nn.Softmax(dim=1),\n",
"        ]\n",
"        self.model = nn.Sequential(*layers)\n",
"\n",
"    def forward(self, x):\n",
"        \"\"\"Apply the layer stack to *x* and return its output.\"\"\"\n",
"        return self.model(x)"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "aW_3JqSNgLLr"
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
"# Training run: one pass over the bigram stream, with a checkpoint every 1000 steps.\n",
"max_steps = -1  # -1 never equals the step counter, so the loop runs until the data ends\n",
"vocab_size = 20000\n",
"embed_size = 150\n",
"batch_size = 5000\n",
"learning_rate = 0.001\n",
"train_path = 'challenging-america-word-gap-prediction/train/in.tsv.xz'\n",
"vocab = build_vocab(train_path, vocab_size)\n",
"train_dataset = Bigrams(train_path, vocab_size)\n",
"if not torch.cuda.is_available():\n",
"    # Say why the cell stops instead of raising an empty Exception().\n",
"    raise RuntimeError('CUDA is not available; this cell trains on the GPU only')\n",
"device = 'cuda'\n",
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
"data = DataLoader(train_dataset, batch_size=batch_size)\n",
"optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)\n",
"criterion = torch.nn.NLLLoss()\n",
"\n",
"model.train()\n",
"step = 0\n",
"for x, y in data:\n",
"    x = x.to(device)\n",
"    y = y.to(device)\n",
"    optimizer.zero_grad()\n",
"    y_predicted = model(x)\n",
"    # The model ends in Softmax, so a probability can underflow to 0;\n",
"    # clamp before the log so the loss and its gradients stay finite.\n",
"    loss = criterion(torch.log(torch.clamp(y_predicted, min=1e-12)), y)\n",
"    if step % 1000 == 0:\n",
"        print(f'steps: {step}, loss: {loss.item()}')\n",
"        if step != 0:\n",
"            torch.save(model.state_dict(), f'bigram_nn_model_steps-{step}_vocab-{vocab_size}_embed-{embed_size}_batch-{batch_size}.bin')\n",
"    if step == max_steps:\n",
"        break\n",
"    step += 1\n",
"    loss.backward()\n",
"    optimizer.step()"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "QQw_E7Ku4h0a",
|
||||||
|
"outputId": "4a37d9ba-1abd-46ae-b157-cd6d52b951a2"
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/home/ked/PycharmProjects/mj9/venv/lib/python3.10/site-packages/torch/nn/modules/container.py:217: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
|
||||||
|
" input = module(input)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"steps: 0, loss: 10.091094017028809\n",
|
||||||
|
"steps: 1000, loss: 5.73332405090332\n",
|
||||||
|
"steps: 2000, loss: 5.655370712280273\n",
|
||||||
|
"steps: 3000, loss: 5.457630634307861\n",
|
||||||
|
"steps: 4000, loss: 5.38517427444458\n",
|
||||||
|
"steps: 5000, loss: 5.467936992645264\n",
|
||||||
|
"steps: 6000, loss: 5.372152328491211\n",
|
||||||
|
"steps: 7000, loss: 5.272013187408447\n",
|
||||||
|
"steps: 8000, loss: 5.439966201782227\n",
|
||||||
|
"steps: 9000, loss: 5.268238544464111\n",
|
||||||
|
"steps: 10000, loss: 5.1395182609558105\n",
|
||||||
|
"steps: 11000, loss: 5.2558159828186035\n",
|
||||||
|
"steps: 12000, loss: 5.263617515563965\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ename": "KeyboardInterrupt",
|
||||||
|
"evalue": "",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
|
||||||
|
"\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)",
|
||||||
|
"Cell \u001B[0;32mIn[4], line 31\u001B[0m\n\u001B[1;32m 29\u001B[0m \u001B[38;5;28;01mbreak\u001B[39;00m\n\u001B[1;32m 30\u001B[0m step \u001B[38;5;241m+\u001B[39m\u001B[38;5;241m=\u001B[39m \u001B[38;5;241m1\u001B[39m\n\u001B[0;32m---> 31\u001B[0m \u001B[43mloss\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbackward\u001B[49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 32\u001B[0m optimizer\u001B[38;5;241m.\u001B[39mstep()\n",
|
||||||
|
"File \u001B[0;32m~/PycharmProjects/mj9/venv/lib/python3.10/site-packages/torch/_tensor.py:487\u001B[0m, in \u001B[0;36mTensor.backward\u001B[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001B[0m\n\u001B[1;32m 477\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m has_torch_function_unary(\u001B[38;5;28mself\u001B[39m):\n\u001B[1;32m 478\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m handle_torch_function(\n\u001B[1;32m 479\u001B[0m Tensor\u001B[38;5;241m.\u001B[39mbackward,\n\u001B[1;32m 480\u001B[0m (\u001B[38;5;28mself\u001B[39m,),\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 485\u001B[0m inputs\u001B[38;5;241m=\u001B[39minputs,\n\u001B[1;32m 486\u001B[0m )\n\u001B[0;32m--> 487\u001B[0m \u001B[43mtorch\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mautograd\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mbackward\u001B[49m\u001B[43m(\u001B[49m\n\u001B[1;32m 488\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;28;43mself\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mgradient\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mretain_graph\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mcreate_graph\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43minputs\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43minputs\u001B[49m\n\u001B[1;32m 489\u001B[0m \u001B[43m\u001B[49m\u001B[43m)\u001B[49m\n",
|
||||||
|
"File \u001B[0;32m~/PycharmProjects/mj9/venv/lib/python3.10/site-packages/torch/autograd/__init__.py:200\u001B[0m, in \u001B[0;36mbackward\u001B[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001B[0m\n\u001B[1;32m 195\u001B[0m retain_graph \u001B[38;5;241m=\u001B[39m create_graph\n\u001B[1;32m 197\u001B[0m \u001B[38;5;66;03m# The reason we repeat same the comment below is that\u001B[39;00m\n\u001B[1;32m 198\u001B[0m \u001B[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001B[39;00m\n\u001B[1;32m 199\u001B[0m \u001B[38;5;66;03m# calls in the traceback and some print out the last line\u001B[39;00m\n\u001B[0;32m--> 200\u001B[0m \u001B[43mVariable\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_execution_engine\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mrun_backward\u001B[49m\u001B[43m(\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001B[39;49;00m\n\u001B[1;32m 201\u001B[0m \u001B[43m \u001B[49m\u001B[43mtensors\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mgrad_tensors_\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mretain_graph\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mcreate_graph\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43minputs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[1;32m 202\u001B[0m \u001B[43m \u001B[49m\u001B[43mallow_unreachable\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43maccumulate_grad\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43;01mTrue\u001B[39;49;00m\u001B[43m)\u001B[49m\n",
|
||||||
|
"\u001B[0;31mKeyboardInterrupt\u001B[0m: "
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
"# Hyperparameters (same values as in the training cell).\n",
"vocab_size, embed_size, batch_size = 20000, 150, 5000\n",
"vocab = build_vocab(\n",
"    'challenging-america-word-gap-prediction/train/in.tsv.xz',\n",
"    vocab_size,\n",
")\n",
"# Look up unknown tokens as '<unk>'.\n",
"vocab.set_default_index(vocab['<unk>'])"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"id": "N9-wmLOEZ2aV"
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
"# Write one prediction row per input line for dev-0 and test-A.\n",
"topk = 5\n",
"device = 'cuda'\n",
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
"model.load_state_dict(torch.load('bigram_nn_model_steps-10000_vocab-20000_embed-150_batch-5000.bin'))\n",
"model.eval()\n",
"# Inference only: skip autograd bookkeeping for every row.\n",
"with torch.no_grad():\n",
"    for path in ['challenging-america-word-gap-prediction/dev-0', 'challenging-america-word-gap-prediction/test-A']:\n",
"        with lzma.open(f'{path}/in.tsv.xz', 'r') as fh, open(f'{path}/out.tsv', 'w', encoding='utf-8') as f_out:\n",
"            for line in fh:\n",
"                # Tokens of the second-to-last column; its last token is the model input.\n",
"                context_tokens = simple_preprocess(line.decode('utf-8').split('\\t')[-2].strip()).split()\n",
"                # An empty column used to raise IndexError; fall back to '<s>',\n",
"                # the token get_words_from_line emits before the first word.\n",
"                previous_word = context_tokens[-1] if context_tokens else '<s>'\n",
"                ixs = torch.tensor(vocab.forward([previous_word])).to(device)\n",
"                top = torch.topk(model(ixs)[0], topk)\n",
"                top_words = vocab.lookup_tokens(top.indices.tolist())\n",
"                parts = []\n",
"                total_prob = 0\n",
"                for word, prob in zip(top_words, top.values.tolist()):\n",
"                    if word != '<unk>':\n",
"                        parts.append(f'{word}:{prob}')\n",
"                        total_prob += prob\n",
"                # The probability mass not given to a listed word goes to the empty-key entry.\n",
"                parts.append(f':{1 - total_prob}')\n",
"                f_out.write(' '.join(parts) + '\\n')"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "99uioFpVCJL8",
|
||||||
|
"outputId": "d4267cb1-e557-478a-8cf7-91a90db07698"
|
||||||
|
},
|
||||||
|
"execution_count": 24,
|
||||||
|
"outputs": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/home/ked/PycharmProjects/mj9/challenging-america-word-gap-prediction\n",
|
||||||
|
"394.97\r\n",
|
||||||
|
"/home/ked/PycharmProjects/mj9\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%cd challenging-america-word-gap-prediction/\n",
|
||||||
|
"!./geval --test-name dev-0\n",
|
||||||
|
"%cd ../"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": [],
|
||||||
|
"source": [],
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
7414
test-A/in.tsv
7414
test-A/in.tsv
File diff suppressed because it is too large
Load Diff
BIN
test-A/in.tsv.xz
Normal file
BIN
test-A/in.tsv.xz
Normal file
Binary file not shown.
14828
test-A/out.tsv
14828
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user