init solution

This commit is contained in:
Łukasz Jędyk 2022-05-16 13:51:25 +02:00
parent 5312a2ba03
commit ef5a97420d
11 changed files with 442798 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
.ipynb_checkpoints/

1
config.txt Normal file
View File

@ -0,0 +1 @@
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv

10519
dev-0/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
dev-0/in.tsv.xz Normal file

Binary file not shown.

BIN
geval Normal file

Binary file not shown.

1
in-header.tsv Normal file
View File

@ -0,0 +1 @@
FileId Year LeftContext RightContext
1 FileId Year LeftContext RightContext

1
out-header.tsv Normal file
View File

@ -0,0 +1 @@
Word
1 Word

253
run.ipynb Normal file
View File

@ -0,0 +1,253 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "56fb2fdb",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import regex as re\n",
"import torch\n",
"import csv\n",
"from torch import nn\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "74f269c9",
"metadata": {},
"outputs": [],
"source": [
"torch.cuda.empty_cache()\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c939f600",
"metadata": {},
"outputs": [],
"source": [
"class Dataset(torch.utils.data.Dataset):\n",
" def __init__(self, sequence_length, file_path):\n",
" self.file_path = file_path\n",
" self.sequence_length = sequence_length\n",
" self.words = self.load()\n",
" self.uniq_words = self.get_uniq_words()\n",
"\n",
" self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}\n",
" self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}\n",
"\n",
" self.words_indexes = [self.word_to_index[w] for w in self.words]\n",
"\n",
" def load(self):\n",
" with open(self.file_path, 'r') as f_in:\n",
" text = [x.rstrip() for x in f_in.readlines() if x.strip()]\n",
" text = ' '.join(text).lower()\n",
" text = re.sub('[^a-ząćęłńóśźż ]', '', text) \n",
" text = text.split(' ')\n",
" return text\n",
" \n",
" def get_uniq_words(self):\n",
" word_counts = Counter(self.words)\n",
" return sorted(word_counts, key=word_counts.get, reverse=True)\n",
"\n",
" def __len__(self):\n",
" return len(self.words_indexes) - self.sequence_length\n",
"\n",
" def __getitem__(self, index):\n",
" return (\n",
" torch.tensor(self.words_indexes[index:index+self.sequence_length]),\n",
" torch.tensor(self.words_indexes[index+1:index+self.sequence_length+1]),\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "9f178921",
"metadata": {},
"outputs": [],
"source": [
"class Model(nn.Module):\n",
" def __init__(self, vocab_size):\n",
" super(Model, self).__init__()\n",
" self.lstm_size = 128\n",
" self.embedding_dim = 256\n",
" self.num_layers = 3\n",
"\n",
" self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embedding_dim)\n",
" self.lstm = nn.LSTM(input_size=self.lstm_size, hidden_size=self.lstm_size, num_layers=self.num_layers, dropout=0.2)\n",
" self.fc = nn.Linear(self.lstm_size, vocab_size)\n",
"\n",
" def forward(self, x, prev_state = None):\n",
" embed = self.embedding(x)\n",
" output, state = self.lstm(embed, prev_state)\n",
" logits = self.fc(output)\n",
" return logits, state\n",
"\n",
" def init_state(self, sequence_length):\n",
" zeros = torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(device)\n",
" return (zeros, zeros)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "39a27465",
"metadata": {},
"outputs": [],
"source": [
"def clean_text(text):\n",
" text = text.lower().replace('-\\\\\\\\\\\\\\\\n', '').replace('\\\\\\\\\\\\\\\\n', ' ')\n",
" text = re.sub(r'\\p{P}', '', text)\n",
" text = text.replace(\"'t\", \" not\").replace(\"'s\", \" is\").replace(\"'ll\", \" will\").replace(\"'m\", \" am\").replace(\"'ve\", \" have\")\n",
"\n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "2f25401d",
"metadata": {},
"outputs": [],
"source": [
"train_data = pd.read_csv('train/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"train_labels = pd.read_csv('train/expected.tsv', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"\n",
"train_data = train_data[[6, 7]]\n",
"train_data = pd.concat([train_data, train_labels], axis=1)\n",
"\n",
"train_data['text'] = train_data[6] + train_data[0] + train_data[7]\n",
"train_data = train_data[['text']]\n",
"\n",
"with open('processed_train.txt', 'w', encoding='utf-8') as file:\n",
" for _, row in train_data.iterrows():\n",
" text = clean_text(str(row['text']))\n",
" file.write(text + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "bb55ce42",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'dataset' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m/tmp/ipykernel_14895/2199368365.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mDataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'processed_train.txt'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mModel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvocab_size\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muniq_words\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'dataset' is not defined"
]
}
],
"source": [
"data = Dataset(5, 'processed_train.txt')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dc097469",
"metadata": {},
"outputs": [],
"source": [
"model = Model(vocab_size = len(data.uniq_words)).to(device)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c9fcdfe7",
"metadata": {},
"outputs": [],
"source": [
"def train(dataset, model, max_epochs, batch_size):\n",
" model.train()\n",
"\n",
" dataloader = DataLoader(dataset, batch_size=batch_size)\n",
" criterion = nn.CrossEntropyLoss()\n",
" optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
"\n",
" for epoch in range(max_epochs):\n",
" for batch, (x, y) in enumerate(dataloader):\n",
" optimizer.zero_grad()\n",
" x = x.to(device)\n",
" y = y.to(device)\n",
"\n",
" y_pred, (state_h, state_c) = model(x)\n",
" loss = criterion(y_pred.transpose(1, 2), y)\n",
"\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
" print({ 'epoch': epoch, 'update in batch': batch, '/' : len(dataloader), 'loss': loss.item() "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "38d52d9a",
"metadata": {},
"outputs": [],
"source": [
"def predict(dataset, model, text, next_words=5):\n",
" model.eval()\n",
" words = text.split(' ')\n",
" state_h, state_c = model.init_state(len(words))\n",
"\n",
" for i in range(0, next_words):\n",
" x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]]).to(device)\n",
" y_pred, (state_h, state_c) = model(x, (state_h, state_c))\n",
"\n",
" last_word_logits = y_pred[0][-1]\n",
" p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().cpu().numpy()\n",
" word_index = np.random.choice(len(last_word_logits), p=p)\n",
" words.append(dataset.index_to_word[word_index])\n",
"\n",
" return words"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bcd99ea4",
"metadata": {},
"outputs": [],
"source": [
"#train(data, model, 1, 128)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

BIN
test-A/in.tsv.xz Normal file

Binary file not shown.

432022
train/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
train/in.tsv.xz Normal file

Binary file not shown.