trigram solution & moved previous implementations
This commit is contained in:
parent
37f40392aa
commit
cf92fd9b73
@ -1,265 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"attachments": {},
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"# Zadanie 1\n",
|
|
||||||
"Wyucz prosty bigramowy model języka oparty na regresji logistycznej (jak przedstawiono na wykładzie)."
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from itertools import islice\n",
|
|
||||||
"import regex as re\n",
|
|
||||||
"import sys\n",
|
|
||||||
"from torchtext.vocab import build_vocab_from_iterator\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"def get_words_from_line(line):\n",
|
|
||||||
" line = line.rstrip()\n",
|
|
||||||
" yield '<s>'\n",
|
|
||||||
" for m in re.finditer(r'[\\p{L}0-9\\*]+|\\p{P}+', line):\n",
|
|
||||||
" yield m.group(0).lower()\n",
|
|
||||||
" yield '</s>'\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"def get_word_lines_from_file(file_name):\n",
|
|
||||||
" with open(file_name, 'r') as fh:\n",
|
|
||||||
" for line in fh:\n",
|
|
||||||
" yield get_words_from_line(line)\n",
|
|
||||||
"\n",
|
|
||||||
"vocab_size = 20000\n",
|
|
||||||
"\n",
|
|
||||||
"vocab = build_vocab_from_iterator(\n",
|
|
||||||
" get_word_lines_from_file('test-A/in.tsv'),\n",
|
|
||||||
" max_tokens = vocab_size,\n",
|
|
||||||
" specials = ['<unk>'])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"3798"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 3,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"vocab['welcome']"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 10,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"tensor(5.5038e-05, grad_fn=<SelectBackward0>)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 10,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"from torch import nn\n",
|
|
||||||
"import torch\n",
|
|
||||||
"\n",
|
|
||||||
"embed_size = 100\n",
|
|
||||||
"\n",
|
|
||||||
"class SimpleBigramNeuralLanguageModel(nn.Module):\n",
|
|
||||||
" def __init__(self, vocabulary_size, embedding_size):\n",
|
|
||||||
" super(SimpleBigramNeuralLanguageModel, self).__init__()\n",
|
|
||||||
" self.model = nn.Sequential(\n",
|
|
||||||
" nn.Embedding(vocabulary_size, embedding_size),\n",
|
|
||||||
" nn.Linear(embedding_size, vocabulary_size),\n",
|
|
||||||
" nn.Softmax()\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
" def forward(self, x):\n",
|
|
||||||
" return self.model(x)\n",
|
|
||||||
"\n",
|
|
||||||
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size)\n",
|
|
||||||
"\n",
|
|
||||||
"vocab.set_default_index(vocab['<unk>'])\n",
|
|
||||||
"ixs = torch.tensor(vocab.forward(['welcone']))\n",
|
|
||||||
"out = model(ixs)\n",
|
|
||||||
"out[0][vocab['to']]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 12,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"from torch.utils.data import IterableDataset\n",
|
|
||||||
"import itertools\n",
|
|
||||||
"\n",
|
|
||||||
"def look_ahead_iterator(gen):\n",
|
|
||||||
" prev = None\n",
|
|
||||||
" for item in gen:\n",
|
|
||||||
" if prev is not None:\n",
|
|
||||||
" yield (prev, item)\n",
|
|
||||||
" prev = item\n",
|
|
||||||
"\n",
|
|
||||||
"class Bigrams(IterableDataset):\n",
|
|
||||||
" def __init__(self, text_file, vocabulary_size):\n",
|
|
||||||
" self.vocab = build_vocab_from_iterator(\n",
|
|
||||||
" get_word_lines_from_file(text_file),\n",
|
|
||||||
" max_tokens = vocabulary_size,\n",
|
|
||||||
" specials = ['<unk>'])\n",
|
|
||||||
" self.vocab.set_default_index(self.vocab['<unk>'])\n",
|
|
||||||
" self.vocabulary_size = vocabulary_size\n",
|
|
||||||
" self.text_file = text_file\n",
|
|
||||||
"\n",
|
|
||||||
" def __iter__(self):\n",
|
|
||||||
" return look_ahead_iterator(\n",
|
|
||||||
" (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_file(self.text_file))))\n",
|
|
||||||
"\n",
|
|
||||||
"train_dataset = Bigrams('test-A/in.tsv', vocab_size)"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 13,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stdout",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"0 tensor(10.0928, grad_fn=<NllLossBackward0>)\n",
|
|
||||||
"100 tensor(8.4572, grad_fn=<NllLossBackward0>)\n",
|
|
||||||
"200 tensor(7.6165, grad_fn=<NllLossBackward0>)\n",
|
|
||||||
"300 tensor(6.9356, grad_fn=<NllLossBackward0>)\n",
|
|
||||||
"400 tensor(6.5687, grad_fn=<NllLossBackward0>)\n",
|
|
||||||
"500 tensor(6.2197, grad_fn=<NllLossBackward0>)\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"ename": "KeyboardInterrupt",
|
|
||||||
"evalue": "",
|
|
||||||
"output_type": "error",
|
|
||||||
"traceback": [
|
|
||||||
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
|
||||||
"\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
|
|
||||||
"Cell \u001b[1;32mIn[13], line 15\u001b[0m\n\u001b[0;32m 13\u001b[0m y \u001b[39m=\u001b[39m y\u001b[39m.\u001b[39mto(device)\n\u001b[0;32m 14\u001b[0m optimizer\u001b[39m.\u001b[39mzero_grad()\n\u001b[1;32m---> 15\u001b[0m ypredicted \u001b[39m=\u001b[39m model(x)\n\u001b[0;32m 16\u001b[0m loss \u001b[39m=\u001b[39m criterion(torch\u001b[39m.\u001b[39mlog(ypredicted), y)\n\u001b[0;32m 17\u001b[0m \u001b[39mif\u001b[39;00m step \u001b[39m%\u001b[39m \u001b[39m100\u001b[39m \u001b[39m==\u001b[39m \u001b[39m0\u001b[39m:\n",
|
|
||||||
"File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1499\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1500\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1501\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
|
|
||||||
"Cell \u001b[1;32mIn[10], line 16\u001b[0m, in \u001b[0;36mSimpleBigramNeuralLanguageModel.forward\u001b[1;34m(self, x)\u001b[0m\n\u001b[0;32m 15\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, x):\n\u001b[1;32m---> 16\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmodel(x)\n",
|
|
||||||
"File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1499\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1500\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1501\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
|
|
||||||
"File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\container.py:217\u001b[0m, in \u001b[0;36mSequential.forward\u001b[1;34m(self, input)\u001b[0m\n\u001b[0;32m 215\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39minput\u001b[39m):\n\u001b[0;32m 216\u001b[0m \u001b[39mfor\u001b[39;00m module \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m:\n\u001b[1;32m--> 217\u001b[0m \u001b[39minput\u001b[39m \u001b[39m=\u001b[39m module(\u001b[39minput\u001b[39;49m)\n\u001b[0;32m 218\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39minput\u001b[39m\n",
|
|
||||||
"File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\module.py:1501\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1496\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1497\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1498\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1499\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1500\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1501\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 1502\u001b[0m \u001b[39m# Do not call functions when jit is used\u001b[39;00m\n\u001b[0;32m 1503\u001b[0m full_backward_hooks, non_full_backward_hooks \u001b[39m=\u001b[39m [], []\n",
|
|
||||||
"File \u001b[1;32mc:\\Users\\jadamski\\.conda\\envs\\modelowanie\\lib\\site-packages\\torch\\nn\\modules\\linear.py:114\u001b[0m, in \u001b[0;36mLinear.forward\u001b[1;34m(self, input)\u001b[0m\n\u001b[0;32m 113\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mforward\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39minput\u001b[39m: Tensor) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Tensor:\n\u001b[1;32m--> 114\u001b[0m \u001b[39mreturn\u001b[39;00m F\u001b[39m.\u001b[39;49mlinear(\u001b[39minput\u001b[39;49m, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mweight, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mbias)\n",
|
|
||||||
"\u001b[1;31mKeyboardInterrupt\u001b[0m: "
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"from torch.utils.data import DataLoader\n",
|
|
||||||
"\n",
|
|
||||||
"device = 'cpu' # cuda\n",
|
|
||||||
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
|
|
||||||
"data = DataLoader(train_dataset, batch_size=5000)\n",
|
|
||||||
"optimizer = torch.optim.Adam(model.parameters())\n",
|
|
||||||
"criterion = torch.nn.NLLLoss()\n",
|
|
||||||
"\n",
|
|
||||||
"model.train()\n",
|
|
||||||
"step = 0\n",
|
|
||||||
"for x, y in data:\n",
|
|
||||||
" x = x.to(device)\n",
|
|
||||||
" y = y.to(device)\n",
|
|
||||||
" optimizer.zero_grad()\n",
|
|
||||||
" ypredicted = model(x)\n",
|
|
||||||
" loss = criterion(torch.log(ypredicted), y)\n",
|
|
||||||
" if step % 100 == 0:\n",
|
|
||||||
" print(step, loss)\n",
|
|
||||||
" step += 1\n",
|
|
||||||
" loss.backward()\n",
|
|
||||||
" optimizer.step()\n",
|
|
||||||
"\n",
|
|
||||||
"torch.save(model.state_dict(), 'model1.bin')"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 16,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"data": {
|
|
||||||
"text/plain": [
|
|
||||||
"[('liquid', 6933, 0.0004737793351523578),\n",
|
|
||||||
" ('bia', 5842, 0.00043268679291941226),\n",
|
|
||||||
" ('sole', 6386, 0.0004295798426028341),\n",
|
|
||||||
" ('nmeant', 17711, 0.00034942160709761083),\n",
|
|
||||||
" ('savs', 16709, 0.00034736539237201214),\n",
|
|
||||||
" ('striving', 12414, 0.0003441996523179114),\n",
|
|
||||||
" ('nol', 2640, 0.00032789510441944003),\n",
|
|
||||||
" ('imposing', 8457, 0.0003199590719304979),\n",
|
|
||||||
" ('hound', 17348, 0.00031824613688513637),\n",
|
|
||||||
" ('?\"\\\\', 4294, 0.0003141215711366385)]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"execution_count": 16,
|
|
||||||
"metadata": {},
|
|
||||||
"output_type": "execute_result"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"device = 'cpu' # cuda\n",
|
|
||||||
"model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
|
|
||||||
"#model.load_state_dict(torch.load('model1.bin'))\n",
|
|
||||||
"model.eval()\n",
|
|
||||||
"\n",
|
|
||||||
"ixs = torch.tensor(vocab.forward(['welcome'])).to(device)\n",
|
|
||||||
"\n",
|
|
||||||
"out = model(ixs)\n",
|
|
||||||
"top = torch.topk(out[0], 10)\n",
|
|
||||||
"top_indices = top.indices.tolist()\n",
|
|
||||||
"top_probs = top.values.tolist()\n",
|
|
||||||
"top_words = vocab.lookup_tokens(top_indices)\n",
|
|
||||||
"list(zip(top_words, top_indices, top_probs))"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "modelowanie",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.10.10"
|
|
||||||
},
|
|
||||||
"orig_nbformat": 4
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 2
|
|
||||||
}
|
|
80
bigram.py
80
bigram.py
@ -1,80 +0,0 @@
|
|||||||
import collections
|
|
||||||
import re
|
|
||||||
import random
|
|
||||||
import math
|
|
||||||
|
|
||||||
# Training corpus: tab-separated rows, read later in this script.
input_file_path = "train/in.tsv"


def _new_follower_counts():
    """Return an empty follower table in which unseen words count as 0."""
    return collections.defaultdict(int)


# bigrams[first][second] -> how many times `second` directly followed `first`.
bigrams = collections.defaultdict(_new_follower_counts)
|
|
||||||
|
|
||||||
|
|
||||||
def clean_text(text: str):
    r"""Normalize a raw corpus field for tokenization.

    Line breaks become spaces, every character other than an ASCII letter,
    a digit or whitespace is removed, and the result is lower-cased and
    stripped of leading/trailing whitespace.

    Both a real newline character and the two-character escape ``\n``
    (a backslash followed by ``n``) count as line breaks. Without the second
    case the backslash would be stripped by the character filter and the
    leftover ``n`` glued onto the neighbouring words ("first\nsecond" ->
    "firstnsecond").
    NOTE(review): assumes the TSV encodes in-text line breaks as a literal
    backslash-n -- confirm against the data files.

    Args:
        text: Raw text of one field.

    Returns:
        The cleaned, lower-case string (possibly empty).
    """
    # Handle the escaped form first so its backslash never reaches the filter.
    text = text.replace('\\n', ' ').replace('\n', ' ')
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower().strip()
|
|
||||||
|
|
||||||
|
|
||||||
# Gap words for the training rows; entry i is paired with row i of the
# training file below.
with open('train/expected.tsv', 'r', encoding="utf-8") as f:
    expected = list(f)

with open(input_file_path, 'r', encoding="utf-8") as f:
    data = [raw.split('\t') for raw in f]

# The full training set has over 400 000 rows; slice `data` here
# (e.g. data[:200000]) for a quicker run.

# Rebuild each training sentence by placing the expected word between the
# cleaned fields at index 6 and index 7 of the row.
combined = [
    (clean_text(row[6]) + ' ' + expected[idx] + ' ' + clean_text(row[7])).lower()
    for idx, row in enumerate(data)
]

# Count every pair of adjacent words.
for sentence in combined:
    words = re.findall(r"\b\w+\b", sentence)
    for left, right in zip(words, words[1:]):
        bigrams[left][right] += 1
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Canned fallback predictions; one is picked at random when the bigram table
# cannot supply three candidates for a row.
# Each entry is a complete output line: space-separated `word:probability`
# pairs followed by a bare `:probability` for all remaining words.
most_popular_words = [
    "be:0.5 and:0.2 of:0.1 :0.2",
    "a:0.5 in:0.2 to:0.1 :0.2",
    "have:0.5 too:0.2 it:0.1 :0.2",
    "I:0.5 that:0.2 for:0.1 :0.2",
    "you:0.5 he:0.2 with:0.1 :0.2",
    "on:0.5 do:0.2 say:0.1 :0.2",
    "this:0.5 they:0.2 at:0.1 :0.2",
    "but:0.5 we:0.2 his:0.1 :0.2"
]
|
|
||||||
|
|
||||||
|
|
||||||
# Write one prediction line per test row. Candidates are the words that most
# often followed the last word of the row's left context (field index 6) in
# the training data.
with open('test-A/in.tsv', "r", encoding="utf-8") as input_file, \
        open('test-A/out.tsv', "w", encoding="utf-8") as output_file:

    lines = input_file.readlines()

    for idx, line in enumerate(lines):
        tokens = re.findall(r"\b\w+\b", clean_text(line.split("\t")[6]))

        # A left context that cleans to nothing has no last word; indexing
        # tokens[-1] there would raise IndexError, so fall back to "no
        # candidates". .get() also keeps unseen words from being inserted
        # into the defaultdict on every miss.
        followers = bigrams.get(tokens[-1], {}) if tokens else {}

        # Ranking by raw count gives the same order as ranking by
        # count / total, and the sort is stable, so ties keep their
        # first-seen order.
        ranked = sorted(followers.items(), key=lambda item: item[1], reverse=True)
        print(f'Line {idx} of {len(lines)}')

        if len(ranked) >= 3:
            first, second, third = (word for word, _ in ranked[:3])
            # Fixed probability mass for the top three; the bare ":0.1" is
            # the share left for every other word.
            output_file.write(f"{first}:0.6 {second}:0.2 {third}:0.1 :0.1\n")
        else:
            output_file.write(random.choice(most_popular_words) + "\n")
|
|
||||||
|
|
20898
dev-0/out.tsv
20898
dev-0/out.tsv
File diff suppressed because it is too large
Load Diff
24
simple.py
24
simple.py
@ -1,24 +0,0 @@
|
|||||||
import random
|
|
||||||
|
|
||||||
# Baseline: answer every input row with a randomly chosen canned line.
# Each entry is a complete output line of `word:probability` pairs followed
# by a bare `:probability` for all remaining words.
most_popular_words = [
    "be:0.5 and:0.2 of:0.1 :0.2",
    "a:0.5 in:0.2 to:0.1 :0.2",
    "have:0.5 too:0.2 it:0.1 :0.2",
    "I:0.5 that:0.2 for:0.1 :0.2",
    "you:0.5 he:0.2 with:0.1 :0.2",
    "on:0.5 do:0.2 say:0.1 :0.2",
    "this:0.5 they:0.2 at:0.1 :0.2",
    "but:0.5 we:0.2 his:0.1 :0.2",
]

folder = "dev-0"

# Read the input first: only the number of rows matters.
with open(f"{folder}/in.tsv", "r", encoding='utf-8') as in_file:
    lines = in_file.readlines()

# Emit exactly one randomly picked prediction line per input row.
with open(f"{folder}/out.tsv", "w", encoding='utf-8') as out_file:
    out_file.writelines(
        random.choice(most_popular_words) + "\n" for _ in lines
    )
|
|
||||||
|
|
||||||
|
|
||||||
# Output line format: word:probability word:probability :probability-of-all-remaining-words
|
|
||||||
# "the:0.2 at:0.3 :0.1"
|
|
14586
test-A/out.tsv
14586
test-A/out.tsv
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user