Compare commits


No commits in common. "master" and "0455752e54b3e32f50a9b2c943df173cdfc63fab" have entirely different histories.

8 changed files with 17934 additions and 22957 deletions

File diff suppressed because it is too large.

lab8.ipynb (2261 changed lines)

File diff suppressed because it is too large.

run.py (18 changed lines)

@@ -1,20 +1,4 @@
#!/usr/bin/python3
import sys
for line in sys.stdin:
    splitted_line = line.split('\t')
    left_context = splitted_line[6]
    right_context = splitted_line[7]
    left_context_words = left_context.split(' ')
    right_context_words = right_context.split(' ')
    # print(left_context_words)
    # print()
    # print(right_context_words)
    if left_context_words[-1] == 'At' or left_context_words[-1] == 'at':
        print('first:0.6 which:0.3 :0.1')
    elif left_context_words[-1] == 'the':
        print('it:0.5 a:0.4 :0.1')
    elif left_context_words[-1] == 'a':
        print('the:0.7 it:0.2 :0.1')
    else:
        print('the:0.6 a:0.3 :0.1')
    print('the:0.6 a:0.4')
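run.py and the run2.py script further down both write one line per input row in the form "word:prob word:prob ... :rest", where the trailing ":rest" entry carries the probability mass not assigned to any listed word, so the weights on a line should sum to 1. A minimal sanity check for that format (a hypothetical helper, not part of this repository) could look like this:

# Hypothetical helper: verify that an output line's probabilities,
# including the unlabeled ":rest" entry, sum to roughly 1.
def distribution_mass(line: str) -> float:
    total = 0.0
    for pair in line.strip().split(' '):
        _, _, prob = pair.rpartition(':')  # ":0.1" has an empty word part
        total += float(prob)
    return total

assert abs(distribution_mass('the:0.6 a:0.3 :0.1') - 1.0) < 1e-9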

File diff suppressed because it is too large.

run2.py (36 changed lines)

@@ -1,36 +0,0 @@
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import sys

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

for line in sys.stdin:
    splitted_line = line.split("\t")
    left_context = splitted_line[6].split(" ")[-1]
    right_context = splitted_line[7].split(" ")[0]
    word = "[MASK]"
    text = f"{left_context} {word} {right_context}"
    input_ids = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt", max_length=512, truncation=True)
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1][0]
    with torch.inference_mode():
        outputs = model(input_ids)
    predictions = outputs[0][0, mask_token_index].softmax(dim=0)
    top_k = 500
    top_k_tokens = torch.topk(predictions, top_k).indices.tolist()
    result = ''
    prob_sum = 0
    for token in top_k_tokens:
        word = tokenizer.convert_ids_to_tokens([token])[0]
        prob = predictions[token].item()
        prob_sum += prob
        result += f"{word}:{prob} "
    diff = 1.0 - prob_sum
    result += f":{diff}"
    print(result)
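The ":rest" term plays the same role here: only the top 500 softmax probabilities are printed, so whatever mass they leave uncovered goes into the unlabeled entry. A toy illustration of that remainder (values invented, not taken from the diff):

# Toy illustration: top-k probabilities never cover the whole distribution,
# so the remainder is reported as the unlabeled ":rest" mass.
import torch

logits = torch.tensor([2.0, 1.0, 0.5, 0.1, -1.0])  # made-up scores
probs = logits.softmax(dim=0)
covered = float(torch.topk(probs, k=3).values.sum())
print(covered)        # < 1.0
print(1.0 - covered)  # mass that would be written as ":rest"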

File diff suppressed because one or more lines are too long


@@ -1,273 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"import itertools\n",
"import lzma\n",
"\n",
"import regex as re\n",
"import torch\n",
"from torch import nn\n",
"from torch.utils.data import IterableDataset, DataLoader\n",
"from torchtext.vocab import build_vocab_from_iterator"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from google.colab import drive"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def clean_line(line: str):\n",
" separated = line.split('\\t')\n",
" prefix = separated[6].replace(r'\\n', ' ')\n",
" suffix = separated[7].replace(r'\\n', ' ')\n",
" return prefix + ' ' + suffix\n",
"\n",
"def get_words_from_line(line):\n",
" line = clean_line(line)\n",
" for m in re.finditer(r'[\\p{L}0-9\\*]+|\\p{P}+', line):\n",
" yield m.group(0).lower()\n",
"\n",
"def get_word_lines_from_file(file_name):\n",
" with lzma.open(file_name, mode='rt', encoding='utf-8') as fid:\n",
" for line in fid:\n",
" yield get_words_from_line(line)\n",
"\n",
"def look_ahead_iterator(gen):\n",
" prev = None\n",
" for item in gen:\n",
" if prev is not None:\n",
" yield (prev, item)\n",
" prev = item\n",
"\n",
"def prediction(word: str) -> str:\n",
" ixs = torch.tensor(vocab.forward([word])).to(device)\n",
" out = model(ixs)\n",
" top = torch.topk(out[0], 5)\n",
" top_indices = top.indices.tolist()\n",
" top_probs = top.values.tolist()\n",
" top_words = vocab.lookup_tokens(top_indices)\n",
" zipped = list(zip(top_words, top_probs))\n",
" for index, element in enumerate(zipped):\n",
" unk = None\n",
" if '<unk>' in element:\n",
" unk = zipped.pop(index)\n",
" zipped.append(('', unk[1]))\n",
" break\n",
" if unk is None:\n",
" zipped[-1] = ('', zipped[-1][1])\n",
" return ' '.join([f'{x[0]}:{x[1]}' for x in zipped])\n",
"\n",
"def create_outputs(folder_name):\n",
" print(f'Creating outputs in {folder_name}')\n",
" with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:\n",
" with open(f'{folder_name}/out.tsv', 'w', encoding='utf-8', newline='\\n') as f:\n",
" for line in fid:\n",
" separated = line.split('\\t')\n",
" prefix = separated[6].replace(r'\\n', ' ').split()[-1]\n",
" output_line = prediction(prefix)\n",
" f.write(output_line + '\\n')\n",
"\n",
"class Bigrams(IterableDataset):\n",
" def __init__(self, text_file, vocabulary_size):\n",
" self.vocab = build_vocab_from_iterator(\n",
" get_word_lines_from_file(text_file),\n",
" max_tokens=vocabulary_size,\n",
" specials=['<unk>'])\n",
" self.vocab.set_default_index(self.vocab['<unk>'])\n",
" self.vocabulary_size = vocabulary_size\n",
" self.text_file = text_file\n",
"\n",
" def __iter__(self):\n",
" return look_ahead_iterator(\n",
" (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))\n",
"\n",
"class SimpleBigramNeuralLanguageModel(nn.Module):\n",
" def __init__(self, vocabulary_size, embedding_size):\n",
" super(SimpleBigramNeuralLanguageModel, self).__init__()\n",
" self.model = nn.Sequential(\n",
" nn.Embedding(vocabulary_size, embedding_size),\n",
" nn.Linear(embedding_size, vocabulary_size),\n",
" nn.Softmax()\n",
" )\n",
"\n",
" def forward(self, x):\n",
" return self.model(x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"vocab_size = 15000\n",
"embed_size = 150\n",
"batch_size = 3000\n",
"device = 'cuda'\n",
"path_to_train = 'train/in.tsv.xz'\n",
"path_to_model = 'model1.bin'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"drive.mount('/content/drive')\n",
"%cd /content/drive/MyDrive/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"vocab = build_vocab_from_iterator(\n",
" get_word_lines_from_file(path_to_train),\n",
" max_tokens=vocab_size,\n",
" specials=['<unk>']\n",
")\n",
"\n",
"vocab.set_default_index(vocab['<unk>'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train_dataset = Bigrams(path_to_train, vocab_size)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
"data = DataLoader(train_dataset, batch_size=batch_size)\n",
"optimizer = torch.optim.Adam(model.parameters())\n",
"criterion = torch.nn.NLLLoss()\n",
"\n",
"model.train()\n",
"step = 0\n",
"for x, y in data:\n",
" x = x.to(device)\n",
" y = y.to(device)\n",
" optimizer.zero_grad()\n",
" ypredicted = model(x)\n",
" loss = criterion(torch.log(ypredicted), y)\n",
" if step % 100 == 0:\n",
" print(step, loss)\n",
" step += 1\n",
" loss.backward()\n",
" optimizer.step()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"torch.save(model.state_dict(), path_to_model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
"model.load_state_dict(torch.load(path_to_model))\n",
"model.eval()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"create_outputs('dev-0')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"create_outputs('test-A')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
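The deleted notebook trains a simple bigram neural language model: look_ahead_iterator turns the token stream into (previous, next) pairs, the Embedding-Linear-Softmax network predicts a next-word distribution from the previous word, and prediction/create_outputs emit the same "word:prob ... :rest" lines as the scripts above. A small standalone illustration of the pairing step (the function is copied from the notebook; the sample tokens are invented):

# Standalone illustration of the notebook's look_ahead_iterator: it yields
# (previous, next) token pairs that serve as (input, target) bigram examples.
def look_ahead_iterator(gen):
    prev = None
    for item in gen:
        if prev is not None:
            yield (prev, item)
        prev = item

print(list(look_ahead_iterator(['at', 'the', 'end', 'of', 'the'])))
# [('at', 'the'), ('the', 'end'), ('end', 'of'), ('of', 'the')]

Note that the notebook pairs nn.Softmax with torch.log and NLLLoss; an equivalent and numerically more stable pattern would be LogSoftmax (or CrossEntropyLoss on raw logits), but the code as written is consistent with its own prediction helper, which expects probabilities from the forward pass.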

File diff suppressed because it is too large.