Compare commits
8 Commits
0455752e54
...
master
Author | SHA1 | Date | |
---|---|---|---|
bd46e2df8f | |||
e4f94b37ed | |||
530b818ee3 | |||
4fc9f1ecca | |||
209ddca5d5 | |||
85c37976a5 | |||
|
6e70850572 | ||
|
ebe71330aa |
21038
dev-0/out.tsv
21038
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
2261
lab8.ipynb
Normal file
2261
lab8.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
18
run.py
18
run.py
@ -1,4 +1,20 @@
|
||||
#!/usr/bin/python3
|
||||
import sys
|
||||
# Guess used when the previous word has no dedicated rule or the row is malformed.
# The trailing ":0.1" is the wildcard mass, so every guess sums to 1.
DEFAULT_PREDICTION = 'the:0.6 a:0.3 :0.1'

# Hand-written guesses keyed by the word immediately preceding the gap.
PREDICTION_BY_PREVIOUS_WORD = {
    'At': 'first:0.6 which:0.3 :0.1',
    'at': 'first:0.6 which:0.3 :0.1',
    'the': 'it:0.5 a:0.4 :0.1',
    'a': 'the:0.7 it:0.2 :0.1',
}


def predict_line(line):
    """Return the ``word:prob ... :rest`` guess for one tab-separated input row.

    The left context is read from column index 6; only its last
    space-separated token is used to pick the guess.

    Args:
        line: One raw input row, tab-separated.

    Returns:
        The prediction string for the row. Rows with fewer than seven columns
        get ``DEFAULT_PREDICTION`` so the output stays aligned line-for-line
        with the input.
    """
    columns = line.split('\t')
    if len(columns) < 7:
        return DEFAULT_PREDICTION
    # Splitting on a single space (not arbitrary whitespace) keeps the original
    # matching rule: a context ending in a space yields '' and falls to the default.
    previous_word = columns[6].split(' ')[-1]
    return PREDICTION_BY_PREVIOUS_WORD.get(previous_word, DEFAULT_PREDICTION)


def main():
    """Print exactly one prediction per line read from standard input."""
    for line in sys.stdin:
        print(predict_line(line))


if __name__ == '__main__':
    main()
|
2436
run17.ipynb
Normal file
2436
run17.ipynb
Normal file
File diff suppressed because it is too large
Load Diff
36
run2.py
Normal file
36
run2.py
Normal file
@ -0,0 +1,36 @@
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForMaskedLM
|
||||
import sys
|
||||
|
||||
MODEL_NAME = "bert-base-uncased"
MASK_TOKEN = "[MASK]"
TOP_K = 500
MAX_LENGTH = 512


def build_masked_text(line, mask_token=MASK_TOKEN):
    """Build the ``<left word> [MASK] <right word>`` query for one input row.

    The left context is read from column index 6 and the right context from
    column index 7 of the tab-separated row. Literal backslash-n sequences in
    those columns are treated as spaces before the neighbouring words are picked.

    Args:
        line: One raw input row, tab-separated.
        mask_token: The mask placeholder inserted at the gap.

    Returns:
        The query text. Missing or empty contexts are simply left out, so a
        malformed row yields just the mask token instead of raising.
    """
    columns = line.rstrip("\n").split("\t")
    left_words = columns[6].replace("\\n", " ").split() if len(columns) > 6 else []
    right_words = columns[7].replace("\\n", " ").split() if len(columns) > 7 else []
    pieces = [
        left_words[-1] if left_words else "",
        mask_token,
        right_words[0] if right_words else "",
    ]
    return " ".join(piece for piece in pieces if piece)


def _is_emittable(token):
    """Tell whether a vocabulary token can be written as a ``word:prob`` item."""
    if not token or ":" in token or any(ch.isspace() for ch in token):
        # ':' and whitespace are the output format's own separators.
        return False
    if token.startswith("##"):
        # WordPiece continuation pieces are not whole words.
        return False
    # Bracketed entries such as [UNK], [PAD] or [unused12] are not words.
    return not (len(token) > 2 and token.startswith("[") and token.endswith("]"))


def format_predictions(tokens, probs):
    """Format parallel token/probability lists as ``w1:p1 w2:p2 ... :rest``.

    Tokens that cannot be written as a word are skipped; their probability is
    folded into the trailing wildcard item together with everything outside
    the given lists, so the line always sums to 1.

    Args:
        tokens: Candidate token strings, best first.
        probs: Probability of each token, same order and length as ``tokens``.

    Returns:
        The space-separated prediction line.
    """
    items = []
    named_mass = 0.0
    for token, prob in zip(tokens, probs):
        if not _is_emittable(token):
            continue
        named_mass += prob
        items.append(f"{token}:{prob}")
    # Clamp so float rounding can never yield a negative wildcard mass.
    items.append(f":{max(1.0 - named_mass, 0.0)}")
    return " ".join(items)


def main():
    """Read rows from stdin and print one masked-LM prediction line per row."""
    # Loaded here, not at import time, so the helpers above stay importable
    # without downloading the model.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)

    for line in sys.stdin:
        text = build_masked_text(line, tokenizer.mask_token)
        # NOTE(review): special tokens are deliberately still omitted, as in
        # the original; confirm whether adding [CLS]/[SEP] improves the scores.
        input_ids = tokenizer.encode(
            text,
            add_special_tokens=False,
            return_tensors="pt",
            max_length=MAX_LENGTH,
            truncation=True,
        )
        mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1][0]

        with torch.inference_mode():
            outputs = model(input_ids)
            predictions = outputs[0][0, mask_token_index].softmax(dim=0)

        top = torch.topk(predictions, min(TOP_K, predictions.numel()))
        tokens = tokenizer.convert_ids_to_tokens(top.indices.tolist())
        print(format_predictions(tokens, top.values.tolist()))


if __name__ == "__main__":
    main()
|
1
run3.ipynb
Normal file
1
run3.ipynb
Normal file
File diff suppressed because one or more lines are too long
273
run7.ipynb
Normal file
273
run7.ipynb
Normal file
@ -0,0 +1,273 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"pycharm": {
|
||||
"is_executing": true
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import itertools\n",
|
||||
"import lzma\n",
|
||||
"\n",
|
||||
"import regex as re\n",
|
||||
"import torch\n",
|
||||
"from torch import nn\n",
|
||||
"from torch.utils.data import IterableDataset, DataLoader\n",
|
||||
"from torchtext.vocab import build_vocab_from_iterator"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from google.colab import drive"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def clean_line(line: str):\n",
|
||||
" separated = line.split('\\t')\n",
|
||||
" prefix = separated[6].replace(r'\\n', ' ')\n",
|
||||
" suffix = separated[7].replace(r'\\n', ' ')\n",
|
||||
" return prefix + ' ' + suffix\n",
|
||||
"\n",
|
||||
"def get_words_from_line(line):\n",
|
||||
" line = clean_line(line)\n",
|
||||
" for m in re.finditer(r'[\\p{L}0-9\\*]+|\\p{P}+', line):\n",
|
||||
" yield m.group(0).lower()\n",
|
||||
"\n",
|
||||
"def get_word_lines_from_file(file_name):\n",
|
||||
" with lzma.open(file_name, mode='rt', encoding='utf-8') as fid:\n",
|
||||
" for line in fid:\n",
|
||||
" yield get_words_from_line(line)\n",
|
||||
"\n",
|
||||
"def look_ahead_iterator(gen):\n",
|
||||
" prev = None\n",
|
||||
" for item in gen:\n",
|
||||
" if prev is not None:\n",
|
||||
" yield (prev, item)\n",
|
||||
" prev = item\n",
|
||||
"\n",
|
||||
"def prediction(word: str) -> str:\n",
"    \"\"\"Return the top next-word guesses for `word` as 'w1:p1 ... :rest'.\n",
"\n",
"    '<unk>' is never emitted as a word; its probability, together with\n",
"    everything outside the top 5, is reported under the empty wildcard key.\n",
"    \"\"\"\n",
"    ixs = torch.tensor(vocab.forward([word])).to(device)\n",
"    # Inference only: skip autograd bookkeeping.\n",
"    with torch.no_grad():\n",
"        out = model(ixs)\n",
"    top = torch.topk(out[0], 5)\n",
"    top_words = vocab.lookup_tokens(top.indices.tolist())\n",
"    top_probs = top.values.tolist()\n",
"    named = [(w, p) for w, p in zip(top_words, top_probs) if w != '<unk>']\n",
"    # The wildcard takes all remaining mass so every output line sums to 1.\n",
"    rest = max(1.0 - sum(p for _, p in named), 0.0)\n",
"    return ' '.join([f'{w}:{p}' for w, p in named] + [f':{rest}'])\n",
|
||||
"\n",
|
||||
"def create_outputs(folder_name):\n",
"    \"\"\"Write one prediction line to <folder_name>/out.tsv per row of <folder_name>/in.tsv.xz.\"\"\"\n",
"    print(f'Creating outputs in {folder_name}')\n",
"    with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:\n",
"        with open(f'{folder_name}/out.tsv', 'w', encoding='utf-8', newline='\\n') as f:\n",
"            for line in fid:\n",
"                separated = line.split('\\t')\n",
"                left_context = separated[6].replace(r'\\n', ' ') if len(separated) > 6 else ''\n",
"                # Tokenize the same way as get_words_from_line (same regex, lowercased) so the\n",
"                # query word can match vocabulary entries instead of falling back to <unk>.\n",
"                tokens = re.findall(r'[\\p{L}0-9\\*]+|\\p{P}+', left_context)\n",
"                # An empty left context is looked up as '' and resolves to the vocab default index.\n",
"                prefix = tokens[-1].lower() if tokens else ''\n",
"                f.write(prediction(prefix) + '\\n')\n",
|
||||
"\n",
|
||||
"class Bigrams(IterableDataset):\n",
|
||||
" def __init__(self, text_file, vocabulary_size):\n",
|
||||
" self.vocab = build_vocab_from_iterator(\n",
|
||||
" get_word_lines_from_file(text_file),\n",
|
||||
" max_tokens=vocabulary_size,\n",
|
||||
" specials=['<unk>'])\n",
|
||||
" self.vocab.set_default_index(self.vocab['<unk>'])\n",
|
||||
" self.vocabulary_size = vocabulary_size\n",
|
||||
" self.text_file = text_file\n",
|
||||
"\n",
|
||||
" def __iter__(self):\n",
|
||||
" return look_ahead_iterator(\n",
|
||||
" (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))\n",
|
||||
"\n",
|
||||
"class SimpleBigramNeuralLanguageModel(nn.Module):\n",
"    \"\"\"Bigram language model: embedding -> linear -> softmax over the vocabulary.\"\"\"\n",
"\n",
"    def __init__(self, vocabulary_size, embedding_size):\n",
"        super(SimpleBigramNeuralLanguageModel, self).__init__()\n",
"        self.model = nn.Sequential(\n",
"            nn.Embedding(vocabulary_size, embedding_size),\n",
"            nn.Linear(embedding_size, vocabulary_size),\n",
"            # Explicit dim: the implicit dimension choice is deprecated and warns on every call.\n",
"            nn.Softmax(dim=-1)\n",
"        )\n",
"\n",
"    def forward(self, x):\n",
"        return self.model(x)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vocab_size = 15000\n",
|
||||
"embed_size = 150\n",
|
||||
"batch_size = 3000\n",
|
||||
"device = 'cuda'\n",
|
||||
"path_to_train = 'train/in.tsv.xz'\n",
|
||||
"path_to_model = 'model1.bin'"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"drive.mount('/content/drive')\n",
|
||||
"%cd /content/drive/MyDrive/"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vocab = build_vocab_from_iterator(\n",
|
||||
" get_word_lines_from_file(path_to_train),\n",
|
||||
" max_tokens=vocab_size,\n",
|
||||
" specials=['<unk>']\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"vocab.set_default_index(vocab['<unk>'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"train_dataset = Bigrams(path_to_train, vocab_size)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
|
||||
"data = DataLoader(train_dataset, batch_size=batch_size)\n",
|
||||
"optimizer = torch.optim.Adam(model.parameters())\n",
|
||||
"criterion = torch.nn.NLLLoss()\n",
|
||||
"\n",
|
||||
"model.train()\n",
|
||||
"step = 0\n",
|
||||
"for x, y in data:\n",
|
||||
" x = x.to(device)\n",
|
||||
" y = y.to(device)\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" ypredicted = model(x)\n",
|
||||
" loss = criterion(torch.log(ypredicted), y)\n",
|
||||
" if step % 100 == 0:\n",
|
||||
" print(step, loss)\n",
|
||||
" step += 1\n",
|
||||
" loss.backward()\n",
|
||||
" optimizer.step()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"torch.save(model.state_dict(), path_to_model)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
|
||||
"model.load_state_dict(torch.load(path_to_model))\n",
|
||||
"model.eval()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"create_outputs('dev-0')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": false
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"create_outputs('test-A')"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 2
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython2",
|
||||
"version": "2.7.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0
|
||||
}
|
14828
test-A/out.tsv
14828
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user