Compare commits

...

8 Commits

Author SHA1 Message Date
bd46e2df8f gpt2 fine tuned 2023-06-17 01:39:03 +02:00
e4f94b37ed lab8 2023-05-27 16:06:02 +02:00
530b818ee3 zad7 2023-05-26 19:25:33 +02:00
4fc9f1ecca trigram, tetragram 2023-05-26 19:20:02 +02:00
209ddca5d5 trigram 2023-05-26 19:09:54 +02:00
85c37976a5 bigram solution 2023-04-11 08:30:15 +02:00
Wirus006
6e70850572 fixes to not infinity and not the same lines 2023-03-29 12:31:19 +02:00
Wirus006
ebe71330aa change probability to prevent infinity 2023-03-29 12:12:26 +02:00
8 changed files with 22957 additions and 17934 deletions

File diff suppressed because it is too large Load Diff

2261
lab8.ipynb Normal file

File diff suppressed because it is too large Load Diff

18
run.py
View File

@ -1,4 +1,20 @@
#!/usr/bin/python3
"""Rule-based baseline: guess a word from the word that precedes it.

Each line of standard input is tab-separated; the column at index 6 is
treated as the left context.  One guess is printed per input line as
space-separated ``word:probability`` entries.  The final entry has an
empty word (presumably the probability left for every other word --
confirm against the evaluation format).
"""
import sys

# Index of the tab-separated column holding the text before the guess.
LEFT_CONTEXT_COLUMN = 6

# Guess used when the preceding word matches none of the rules below.
DEFAULT_GUESS = 'the:0.6 a:0.3 :0.1'

# Guess keyed by the last word of the left context.  Every guess sums to
# 1.0; the original literals ended in ':01', a typo for ':0.1'.
GUESS_BY_PREVIOUS_WORD = {
    'At': 'first:0.6 which:0.3 :0.1',
    'at': 'first:0.6 which:0.3 :0.1',
    'the': 'it:0.5 a:0.4 :0.1',
    'a': 'the:0.7 it:0.2 :0.1',
}


def predict_line(line):
    """Return the guess for one tab-separated input line.

    Args:
        line: A raw input line; the left context is read from the
            column at index ``LEFT_CONTEXT_COLUMN``.

    Returns:
        The guess string for the last space-separated word of the left
        context, or ``DEFAULT_GUESS`` when no rule matches or the line
        has too few columns.
    """
    columns = line.split('\t')
    if len(columns) <= LEFT_CONTEXT_COLUMN:
        # Still answer malformed lines so output stays aligned with input.
        return DEFAULT_GUESS
    previous_word = columns[LEFT_CONTEXT_COLUMN].split(' ')[-1]
    return GUESS_BY_PREVIOUS_WORD.get(previous_word, DEFAULT_GUESS)


def main():
    """Print one guess for every line read from standard input."""
    for line in sys.stdin:
        print(predict_line(line))


if __name__ == '__main__':
    main()

2436
run17.ipynb Normal file

File diff suppressed because it is too large Load Diff

36
run2.py Normal file
View File

@ -0,0 +1,36 @@
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import sys
# Pretrained masked language model used to fill the gap.
MODEL_NAME = "bert-base-uncased"
# Number of highest-probability tokens written for each input line.
TOP_K = 500
# Indices of the tab-separated columns holding the text around the gap.
LEFT_CONTEXT_COLUMN = 6
RIGHT_CONTEXT_COLUMN = 7


def build_masked_text(line, mask_token="[MASK]"):
    """Build the model input for one tab-separated line.

    Only the last space-separated word of the left context and the first
    space-separated word of the right context are kept, with the mask
    token placed between them.

    Args:
        line: A raw input line; contexts are read from the columns at
            indices ``LEFT_CONTEXT_COLUMN`` and ``RIGHT_CONTEXT_COLUMN``.
        mask_token: Placeholder the model is asked to fill in.

    Returns:
        ``"<left word> <mask_token> <right word>"``.  A missing column
        is treated as an empty context instead of raising ``IndexError``.
    """
    columns = line.split("\t")
    left_context = columns[LEFT_CONTEXT_COLUMN] if len(columns) > LEFT_CONTEXT_COLUMN else ""
    right_context = columns[RIGHT_CONTEXT_COLUMN] if len(columns) > RIGHT_CONTEXT_COLUMN else ""
    left_word = left_context.split(" ")[-1]
    right_word = right_context.split(" ")[0]
    return f"{left_word} {mask_token} {right_word}"


def format_predictions(tokens, probabilities):
    """Format predicted tokens as one output line.

    Args:
        tokens: Predicted token strings.
        probabilities: Probability of each token, in the same order.

    Returns:
        Space-separated ``token:probability`` entries followed by a final
        entry with an empty token that carries the probability mass not
        covered by the listed tokens.
    """
    entries = [f"{token}:{prob}" for token, prob in zip(tokens, probabilities)]
    # Rounding can push the listed sum above 1.0; never emit a negative mass.
    remainder = max(0.0, 1.0 - sum(probabilities))
    entries.append(f":{remainder}")
    return " ".join(entries)


def main():
    """Print the top ``TOP_K`` mask predictions for each line of stdin."""
    # Loaded here rather than at import time so importing the module stays cheap.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
    for line in sys.stdin:
        text = build_masked_text(line, tokenizer.mask_token)
        input_ids = tokenizer.encode(
            text,
            add_special_tokens=False,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        )
        # Column position of the first mask token in the single encoded row.
        mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1][0]
        with torch.inference_mode():
            logits = model(input_ids)[0]
            probabilities = logits[0, mask_token_index].softmax(dim=0)
            top = torch.topk(probabilities, TOP_K)
        # topk already returns the probabilities, so no per-token lookups are needed.
        tokens = tokenizer.convert_ids_to_tokens(top.indices.tolist())
        print(format_predictions(tokens, top.values.tolist()))


if __name__ == "__main__":
    main()

1
run3.ipynb Normal file

File diff suppressed because one or more lines are too long

273
run7.ipynb Normal file
View File

@ -0,0 +1,273 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"pycharm": {
"is_executing": true
}
},
"outputs": [],
"source": [
"import itertools\n",
"import lzma\n",
"\n",
"import regex as re\n",
"import torch\n",
"from torch import nn\n",
"from torch.utils.data import IterableDataset, DataLoader\n",
"from torchtext.vocab import build_vocab_from_iterator"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from google.colab import drive"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def clean_line(line: str):\n",
" separated = line.split('\\t')\n",
" prefix = separated[6].replace(r'\\n', ' ')\n",
" suffix = separated[7].replace(r'\\n', ' ')\n",
" return prefix + ' ' + suffix\n",
"\n",
"def get_words_from_line(line):\n",
" line = clean_line(line)\n",
" for m in re.finditer(r'[\\p{L}0-9\\*]+|\\p{P}+', line):\n",
" yield m.group(0).lower()\n",
"\n",
"def get_word_lines_from_file(file_name):\n",
" with lzma.open(file_name, mode='rt', encoding='utf-8') as fid:\n",
" for line in fid:\n",
" yield get_words_from_line(line)\n",
"\n",
"def look_ahead_iterator(gen):\n",
" prev = None\n",
" for item in gen:\n",
" if prev is not None:\n",
" yield (prev, item)\n",
" prev = item\n",
"\n",
"def prediction(word: str) -> str:\n",
" ixs = torch.tensor(vocab.forward([word])).to(device)\n",
" out = model(ixs)\n",
" top = torch.topk(out[0], 5)\n",
" top_indices = top.indices.tolist()\n",
" top_probs = top.values.tolist()\n",
" top_words = vocab.lookup_tokens(top_indices)\n",
" zipped = list(zip(top_words, top_probs))\n",
" for index, element in enumerate(zipped):\n",
" unk = None\n",
" if '<unk>' in element:\n",
" unk = zipped.pop(index)\n",
" zipped.append(('', unk[1]))\n",
" break\n",
" if unk is None:\n",
" zipped[-1] = ('', zipped[-1][1])\n",
" return ' '.join([f'{x[0]}:{x[1]}' for x in zipped])\n",
"\n",
"def create_outputs(folder_name):\n",
" print(f'Creating outputs in {folder_name}')\n",
" with lzma.open(f'{folder_name}/in.tsv.xz', mode='rt', encoding='utf-8') as fid:\n",
" with open(f'{folder_name}/out.tsv', 'w', encoding='utf-8', newline='\\n') as f:\n",
" for line in fid:\n",
" separated = line.split('\\t')\n",
" prefix = separated[6].replace(r'\\n', ' ').split()[-1]\n",
" output_line = prediction(prefix)\n",
" f.write(output_line + '\\n')\n",
"\n",
"class Bigrams(IterableDataset):\n",
" def __init__(self, text_file, vocabulary_size):\n",
" self.vocab = build_vocab_from_iterator(\n",
" get_word_lines_from_file(text_file),\n",
" max_tokens=vocabulary_size,\n",
" specials=['<unk>'])\n",
" self.vocab.set_default_index(self.vocab['<unk>'])\n",
" self.vocabulary_size = vocabulary_size\n",
" self.text_file = text_file\n",
"\n",
" def __iter__(self):\n",
" return look_ahead_iterator(\n",
" (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))\n",
"\n",
"class SimpleBigramNeuralLanguageModel(nn.Module):\n",
" def __init__(self, vocabulary_size, embedding_size):\n",
" super(SimpleBigramNeuralLanguageModel, self).__init__()\n",
" self.model = nn.Sequential(\n",
" nn.Embedding(vocabulary_size, embedding_size),\n",
" nn.Linear(embedding_size, vocabulary_size),\n",
" nn.Softmax()\n",
" )\n",
"\n",
" def forward(self, x):\n",
" return self.model(x)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"vocab_size = 15000\n",
"embed_size = 150\n",
"batch_size = 3000\n",
"device = 'cuda'\n",
"path_to_train = 'train/in.tsv.xz'\n",
"path_to_model = 'model1.bin'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"drive.mount('/content/drive')\n",
"%cd /content/drive/MyDrive/"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"vocab = build_vocab_from_iterator(\n",
" get_word_lines_from_file(path_to_train),\n",
" max_tokens=vocab_size,\n",
" specials=['<unk>']\n",
")\n",
"\n",
"vocab.set_default_index(vocab['<unk>'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train_dataset = Bigrams(path_to_train, vocab_size)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
"data = DataLoader(train_dataset, batch_size=batch_size)\n",
"optimizer = torch.optim.Adam(model.parameters())\n",
"criterion = torch.nn.NLLLoss()\n",
"\n",
"model.train()\n",
"step = 0\n",
"for x, y in data:\n",
" x = x.to(device)\n",
" y = y.to(device)\n",
" optimizer.zero_grad()\n",
" ypredicted = model(x)\n",
" loss = criterion(torch.log(ypredicted), y)\n",
" if step % 100 == 0:\n",
" print(step, loss)\n",
" step += 1\n",
" loss.backward()\n",
" optimizer.step()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"torch.save(model.state_dict(), path_to_model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
"model.load_state_dict(torch.load(path_to_model))\n",
"model.eval()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"create_outputs('dev-0')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"create_outputs('test-A')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

File diff suppressed because it is too large Load Diff