removing files that are not from this exact branch

Adrian Charkiewicz 2023-05-26 17:43:44 +02:00
parent 4b51db483f
commit 1f742b4802
8 changed files with 1 addition and 13415 deletions

.gitignore (vendored): 1 addition

@@ -6,3 +6,4 @@
 *.o
 .DS_Store
 .token
+geval

File diff suppressed because it is too large

@@ -1,849 +0,0 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "V100"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"gpuClass": "standard"
},
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "LYTCs2MjhLuZ"
},
"outputs": [],
"source": [
"import torch\n",
"from torch import nn\n",
"\n",
"torch.cuda.empty_cache()"
]
},
{
"cell_type": "code",
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "unzqnLN9isoP",
"outputId": "b44d1087-3600-4fc2-9998-cf6520e9e743"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%cd drive/MyDrive/moj7"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hRG7HFaFi6aV",
"outputId": "c498eecc-d661-4842-8ae5-91819e38b7cd"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/drive/MyDrive/moj7\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!ls"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "T5XQ2uY5jH4U",
"outputId": "1ad2d4a8-a575-4021-cbc0-3875f956f874"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"config.txt\t in-header.tsv\tout-header.tsv\t test-A\n",
"dev-0\t\t model1.bin\tprocessed_train.txt train\n",
"filename.pickle model2.bin\tsimplepredict.py train_new.txt\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import regex as re\n",
"import csv\n",
"\n",
"def clean_text(text):\n",
" text = text.lower().replace('-\\\\\\\\\\\\\\\\n', '').replace('\\\\\\\\\\\\\\\\n', ' ')\n",
" text = re.sub(r'\\p{P}', '', text)\n",
" text = text.replace(\"'t\", \" not\").replace(\"'s\", \" is\").replace(\"'ll\", \" will\").replace(\"'m\", \" am\").replace(\"'ve\", \" have\")\n",
"\n",
" return text"
],
"metadata": {
"id": "6_8pn-p3hO2a"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"train_data = pd.read_csv('train/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"train_labels = pd.read_csv('train/expected.tsv', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"\n",
"train_data = train_data[[6, 7]]\n",
"train_data = pd.concat([train_data, train_labels], axis=1)\n",
"\n",
"train_data['text'] = train_data[6] + train_data[0] + train_data[7]\n",
"train_data = train_data[['text']]\n",
"\n",
"with open('processed_train.txt', 'w', encoding='utf-8') as file:\n",
" for _, row in train_data.iterrows():\n",
" text = clean_text(str(row['text']))\n",
" file.write(text + '\\n')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3WU8aYOghO4x",
"outputId": "54b2531c-541d-4b8d-92f9-20bcd52d843f"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-6-c2ca5c6b11cc>:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_data = pd.read_csv('train/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"<ipython-input-6-c2ca5c6b11cc>:1: FutureWarning: The warn_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_data = pd.read_csv('train/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"<ipython-input-6-c2ca5c6b11cc>:2: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_labels = pd.read_csv('train/expected.tsv', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"<ipython-input-6-c2ca5c6b11cc>:2: FutureWarning: The warn_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_labels = pd.read_csv('train/expected.tsv', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import itertools\n",
"import lzma\n",
"import numpy as np\n",
"import regex as re\n",
"import torch\n",
"import pandas as pd\n",
"from torch import nn\n",
"from torch.utils.data import IterableDataset, DataLoader\n",
"import csv\n",
"from itertools import islice, chain\n",
"from torchtext.vocab import build_vocab_from_iterator"
],
"metadata": {
"id": "tw9MDSzpisGN"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"source": [
"device='cuda'"
],
"metadata": {
"id": "tVHkGBzLhO9u"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"source": [
"train_data = pd.read_csv('train/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"train_labels = pd.read_csv('train/expected.tsv', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"train_data = train_data[[6, 7]]\n",
"train_data = pd.concat([train_data, train_labels], axis=1)\n",
"train_data['text'] = train_data[6] + train_data[0] + train_data[7]\n",
"train_data = train_data[['text']]"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ph3ibZmlhPAI",
"outputId": "c4524bf5-d7f9-4c7f-ed89-7f6451725ea2"
},
"execution_count": 9,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-9-28a7685109f8>:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_data = pd.read_csv('train/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"<ipython-input-9-28a7685109f8>:1: FutureWarning: The warn_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_data = pd.read_csv('train/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"<ipython-input-9-28a7685109f8>:2: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_labels = pd.read_csv('train/expected.tsv', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"<ipython-input-9-28a7685109f8>:2: FutureWarning: The warn_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_labels = pd.read_csv('train/expected.tsv', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"train_data"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 424
},
"id": "uASpVNQXhPC1",
"outputId": "45126fc2-5ff5-4be3-f114-c5fa7da9189c"
},
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" text\n",
"0 came fiom the last place to this\\nplace, and t...\n",
"1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...\n",
"2 \"Thera were in 1771 only aeventy-nine\\n*ub*erl...\n",
"3 A gixnl man y nitereRtiiiv dii-clos-\\nur«s reg...\n",
"4 Tin: 188UB TV THF BBABBT QABJE\\nMr. Schiffs *t...\n",
"... ...\n",
"432017 Sam Clendenin bad a fancy for Ui«\\nscience of ...\n",
"432018 Wita.htt halting the party ware dilven to the ...\n",
"432019 It was the last thing that either of\\nthem exp...\n",
"432020 settlement with the department.\\nIt is also sh...\n",
"432021 Flour quotations—low extras at 1 R0®2 50;\\ncit...\n",
"\n",
"[432022 rows x 1 columns]"
]
},
"metadata": {},
"execution_count": 10
}
]
},
{
"cell_type": "code",
"source": [
"with open('train_new.txt', 'w', encoding='utf-8') as file:\n",
" for _, row in train_data.iterrows():\n",
" text = clean_text(str(row['text']))\n",
" file.write(text + '\\n')\n",
"\n"
],
"metadata": {
"id": "_28Jf3EyhPFu"
},
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"source": [
"class SimpleTrigramNeuralLanguageModel(nn.Module):\n",
" def __init__(self, vocabulary_size, embedding_size, hidden_size):\n",
" super(SimpleTrigramNeuralLanguageModel, self).__init__()\n",
" self.embedding = nn.Embedding(vocabulary_size * 2, embedding_size)\n",
" self.linear1 = nn.Linear(embedding_size, hidden_size)\n",
" self.linear2 = nn.Linear(hidden_size, vocabulary_size * 2)\n",
"\n",
" def forward(self, x):\n",
" x = self.embedding(x)\n",
" x = self.linear1(x)\n",
" x = self.linear2(x)\n",
" x = torch.softmax(x, dim=1)\n",
" return x"
],
"metadata": {
"id": "HdaLacIRhPIS"
},
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"source": [
"vocab_size = 38000\n",
"embed_size = 300\n",
"hidden_size = 256"
],
"metadata": {
"id": "k-qcQuVYhPK7"
},
"execution_count": 13,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def words_line(line):\n",
" line = line.rstrip()\n",
" yield '<s>'\n",
" for m in re.finditer(r'[\\p{L}0-9\\*]+|\\p{P}+', line):\n",
" yield m.group(0).lower()\n",
" yield '</s>'\n",
"\n",
"def file_words(file_name):\n",
" with open(file_name, 'r', encoding='utf-8') as fh:\n",
" for line in fh:\n",
" yield words_line(line)"
],
"metadata": {
"id": "w9yhw6n0hPNV"
},
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def iterator_look(gen):\n",
" first_prev = None\n",
" sec_prev = None\n",
" for item in gen:\n",
" if first_prev and sec_prev:\n",
" yield (sec_prev+ first_prev, item)\n",
" sec_prev = first_prev\n",
" first_prev = item"
],
"metadata": {
"id": "suwoA5QFhPP9"
},
"execution_count": 15,
"outputs": []
},
{
"cell_type": "code",
"source": [
"class Trigrams(IterableDataset):\n",
" def __init__(self, text_file, vocabulary_size):\n",
" self.vocab = build_vocab_from_iterator(\n",
" file_words(text_file),\n",
" max_tokens = vocabulary_size,\n",
" specials = ['<unk>']\n",
" )\n",
" self.vocab.set_default_index(self.vocab['<unk>'])\n",
" self.vocabulary_size = vocabulary_size\n",
" self.text_file = text_file\n",
"\n",
" def __iter__(self):\n",
" return iterator_look((self.vocab[t] for t in chain.from_iterable(file_words(self.text_file))))"
],
"metadata": {
"id": "9ZZllfdxhPSd"
},
"execution_count": 16,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def training(xx):\n",
" train_dataset_new = Trigrams('train_new.txt', vocab_size)\n",
" model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size, hidden_size).to(device)\n",
" optimizer = torch.optim.Adam(model.parameters())\n",
" criterion = torch.nn.NLLLoss()\n",
" data = DataLoader(train_dataset_new, batch_size=800)\n",
" step = 0\n",
" for epoch in range(1):\n",
" model.train()\n",
" for x, y in data:\n",
" x = x.to(device)\n",
" y = y.to(device)\n",
" optimizer.zero_grad()\n",
" outputs = model(x)\n",
" loss = criterion(torch.log(outputs), y)\n",
" if step % 100 == 0:\n",
" print(step, loss)\n",
" step += 1\n",
" loss.backward()\n",
" optimizer.step()\n",
" torch.save(model.state_dict(), 'model2.bin')"
],
"metadata": {
"id": "QjZ9Rl7-kUYC"
},
"execution_count": 17,
"outputs": []
},
{
"cell_type": "code",
"source": [
"training(xx=0.0001)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "HOSUqszakUac",
"outputId": "ec9f6d23-3014-4787-e2d7-22520974a7df"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"0 tensor(11.2670, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"100 tensor(8.0867, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"200 tensor(6.8976, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"300 tensor(6.6515, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"400 tensor(6.6224, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"500 tensor(6.7443, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"600 tensor(6.7064, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"700 tensor(6.8224, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"800 tensor(6.8516, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"900 tensor(6.6103, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1000 tensor(6.5455, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1100 tensor(6.8369, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1200 tensor(6.5587, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1300 tensor(6.2804, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1400 tensor(6.5476, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1500 tensor(6.7563, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1600 tensor(6.5324, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1700 tensor(6.6478, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1800 tensor(6.4025, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"1900 tensor(6.4470, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2000 tensor(6.8199, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2100 tensor(6.2291, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2200 tensor(6.4627, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2300 tensor(6.5401, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2400 tensor(6.4382, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2500 tensor(6.4881, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2600 tensor(6.2683, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2700 tensor(6.5393, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2800 tensor(6.8077, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"2900 tensor(6.6460, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3000 tensor(6.4482, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3100 tensor(6.6288, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3200 tensor(6.4752, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3300 tensor(6.3716, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3400 tensor(6.4713, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3500 tensor(6.4488, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3600 tensor(6.5300, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3700 tensor(6.3824, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3800 tensor(6.6311, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"3900 tensor(6.3778, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4000 tensor(6.4160, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4100 tensor(6.5501, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4200 tensor(6.6891, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4300 tensor(6.4745, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4400 tensor(6.7940, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4500 tensor(6.2111, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4600 tensor(6.7691, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4700 tensor(6.2466, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4800 tensor(6.5852, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"4900 tensor(6.1048, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5000 tensor(6.5077, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5100 tensor(6.6974, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5200 tensor(6.4872, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5300 tensor(6.4792, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5400 tensor(6.4319, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5500 tensor(6.4370, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5600 tensor(6.5948, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5700 tensor(6.5184, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5800 tensor(6.4193, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"5900 tensor(6.4801, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6000 tensor(6.4735, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6100 tensor(6.4440, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6200 tensor(6.3385, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6300 tensor(6.2252, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6400 tensor(6.2866, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6500 tensor(6.8166, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6600 tensor(6.4074, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6700 tensor(6.6818, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6800 tensor(5.9832, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"6900 tensor(6.1267, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7000 tensor(6.6872, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7100 tensor(6.4554, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7200 tensor(6.5397, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7300 tensor(6.3267, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7400 tensor(6.4830, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7500 tensor(6.5805, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7600 tensor(6.1212, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7700 tensor(6.2900, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7800 tensor(6.1379, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"7900 tensor(6.1837, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8000 tensor(6.5634, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8100 tensor(6.5012, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8200 tensor(6.3135, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8300 tensor(6.6141, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8400 tensor(6.4679, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8500 tensor(6.2488, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8600 tensor(6.3222, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8700 tensor(6.4057, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8800 tensor(6.2209, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"8900 tensor(6.6274, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9000 tensor(6.4992, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9100 tensor(6.5748, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9200 tensor(6.2457, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9300 tensor(6.4364, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9400 tensor(6.4908, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9500 tensor(6.5462, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9600 tensor(6.3248, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9700 tensor(6.3758, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9800 tensor(6.1925, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"9900 tensor(6.5854, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10000 tensor(6.5270, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10100 tensor(6.3718, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10200 tensor(6.6314, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10300 tensor(6.3025, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10400 tensor(6.2880, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10500 tensor(6.6817, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10600 tensor(6.4151, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10700 tensor(6.5276, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10800 tensor(6.6714, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"10900 tensor(6.4049, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11000 tensor(6.2844, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11100 tensor(6.3522, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11200 tensor(6.5579, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11300 tensor(6.6415, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11400 tensor(6.2489, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11500 tensor(6.1745, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11600 tensor(6.5829, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11700 tensor(6.4514, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11800 tensor(6.4100, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"11900 tensor(6.2816, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12000 tensor(6.4974, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12100 tensor(6.3546, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12200 tensor(6.4354, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12300 tensor(6.2498, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12400 tensor(6.2456, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12500 tensor(6.2744, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12600 tensor(6.3540, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12700 tensor(6.4590, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12800 tensor(6.3227, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"12900 tensor(6.2072, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"13000 tensor(6.1667, device='cuda:0', grad_fn=<NllLossBackward0>)\n",
"13100 tensor(6.4865, device='cuda:0', grad_fn=<NllLossBackward0>)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size, hidden_size).to(device)\n",
"model.load_state_dict(torch.load('model2.bin'))\n",
"model.eval()\n",
"train_dataset_new = Trigrams('train_new.txt', vocab_size)\n",
"\n",
"def predict_words(words):\n",
" ixs = torch.tensor(train_dataset_new.vocab.forward(['with'])).to(device)\n",
" predictions = model(ixs)\n",
" total_prob = 0.0\n",
" prediction = ''\n",
" top = torch.topk(predictions[0], 30)\n",
" top_indices = top.indices.tolist()\n",
" top_probs = top.values.tolist()\n",
" top_words = train_dataset_new.vocab.lookup_tokens(top_indices)\n",
" top_preds = list(zip(top_words, top_indices, top_probs))\n",
"\n",
" for word, _, prob in top_preds:\n",
" if word != '<unk>':\n",
" prediction += f'{word}:{prob} '\n",
" total_prob += prob\n",
" prediction += f':{1 - total_prob}'\n",
" return prediction"
],
"metadata": {
"id": "5K9YlprQkUc8"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model = SimpleTrigramNeuralLanguageModel(vocab_size, embed_size, hidden_size).to(device)\n",
"model.load_state_dict(torch.load('model2.bin'))\n",
"model.eval() "
],
"metadata": {
"id": "MgaRdbD8kUfd"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"with lzma.open(f'dev-0/in.tsv.xz', mode='rt', encoding='utf-8') as fid:\n",
" with open(f'dev-0/out-HIDDEN-SIZE={hidden_size}.tsv', 'w', encoding='utf-8', newline='\\n') as f:\n",
" for line in fid:\n",
" separated = line.split('\\t')\n",
" prefix = separated[6].replace(r'\\n', ' ').split()[-2:]\n",
" output_line = predict_words(prefix)\n",
" f.write(output_line + '\\n')"
],
"metadata": {
"id": "MoL-FV4rkgZB"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"with lzma.open(f'test-A/in.tsv.xz', mode='rt', encoding='utf-8') as fid:\n",
" with open(f'test-A/out-HIDDEN-SIZE={hidden_size}.tsv', 'w', encoding='utf-8', newline='\\n') as f:\n",
" for line in fid:\n",
" separated = line.split('\\t')\n",
" prefix = separated[6].replace(r'\\n', ' ').split()[-2:]\n",
" output_line = predict_words(prefix)\n",
" f.write(output_line + '\\n')"
],
"metadata": {
"id": "jHlOHc8Hkgbg"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"torch.save(model.state_dict(), 'model2.bin')"
],
"metadata": {
"id": "CcX31HX1kgd4"
},
"execution_count": null,
"outputs": []
}
]
}

lm0.py: 37 lines deleted

@@ -1,37 +0,0 @@
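# Fill-the-gap baseline with BERT's masked LM: for each TSV line, mask the word
# between the last left-context token and the first right-context token and
# print the top-500 candidates as "word:prob ..." with the leftover mass last.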
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import sys

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

for line in sys.stdin:
    line_splitted = line.split("\t")
    left_context = line_splitted[6].split(" ")[-1]
    right_context = line_splitted[7].split(" ")[0]
    word = "[MASK]"
    text = f"{left_context} {word} {right_context}"
    input_ids = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt", max_length=512, truncation=True)
    mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1][0]
    with torch.inference_mode():
        outputs = model(input_ids)
    predictions = outputs[0][0, mask_token_index].softmax(dim=0)
    top_k = 500
    top_k_tokens = torch.topk(predictions, top_k).indices.tolist()
    result = ''
    prob_sum = 0
    for token in top_k_tokens:
        word = tokenizer.convert_ids_to_tokens([token])[0]
        prob = predictions[token].item()
        prob_sum += prob
        result += f"{word}:{prob} "
    diff = 1.0 - prob_sum
    result += f":{diff}"
    print(result)

lm1.py: 5 lines deleted

@@ -1,5 +0,0 @@
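# Degenerate baseline: echo the last word of the left context (field 6).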
#!/usr/bin/python3
import sys
for line in sys.stdin:
    line = line.split('\t')[6].split(' ')[-1]
    print(line)

lm2.py: 8 lines deleted

@@ -1,8 +0,0 @@
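# Rule-based baseline: bet on "States" after "United", common words otherwise.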
#!/usr/bin/python3
import sys
for line in sys.stdin:
    if "United" in line:
        print('States:0.9 :0.1')
    else:
        print('the:0.6 a:0.3 :0.1')

File diff suppressed because one or more lines are too long


@@ -1,107 +0,0 @@
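# Trigram language model: count (w1, w2) -> w3 and (w2, w3) -> w1 over the training
# corpus, normalize the counts to probabilities, and back off to the most common
# English words when the context was never seen.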
import csv
import pandas as pd
import regex as re
import nltk
import tqdm
from nltk import trigrams, word_tokenize
from collections import Counter, defaultdict
import string

nltk.download("punkt")

most_common_en_word = "the:0.3 be:0.2 to:0.15 of:0.1 and:0.025 a:0.0125 :0.2125"
train_count = 150000

# train set
train_data = pd.read_csv("train/in.tsv.xz", sep="\t", on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE, nrows=train_count)
# training labels
train_labels = pd.read_csv("train/expected.tsv", sep="\t", on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE, nrows=train_count)
dev_data = pd.read_csv("dev-0/in.tsv.xz", sep="\t", on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)
test_data = pd.read_csv("test-A/in.tsv.xz", sep="\t", on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)

def prepare_text(text):
    text = text.lower().replace("-\\n", "").replace("\\n", " ")
    text = re.sub(r"\p{P}", "", text)
    return text

def train_trigrams():
    for _, row in tqdm.tqdm(train_data.iterrows()):
        text = prepare_text(str(row["final"]))
        words = word_tokenize(text)
        for w1, w2, w3 in trigrams(words, pad_right=True, pad_left=True):
            if all([w1, w2, w3]):
                model[(w2, w3)][w1] += 1
                model[(w1, w2)][w3] += 1
    for w_pair in model:
        ngram_count = float(sum(model[w_pair].values()))
        for w3 in model[w_pair]:
            model[w_pair][w3] /= ngram_count

def predict_probs(word1, word2):
    raw_prediction = dict(model[word1, word2])
    prediction = dict(Counter(raw_prediction).most_common(6))
    total_prob = 0.0
    str_prediction = ""
    for word, prob in prediction.items():
        total_prob += prob
        str_prediction += f"{word}:{prob} "
    if total_prob == 0.0:
        return most_common_en_word
    remaining_prob = 1 - total_prob
    if remaining_prob < 0.01:
        remaining_prob = 0.01
    str_prediction += f":{remaining_prob}"
    return str_prediction

def write_output():
    with open("dev-0/out.tsv", "w") as file:
        for _, row in dev_data.iterrows():
            text = prepare_text(str(row[7]))
            words = word_tokenize(text)
            if len(words) < 3:
                prediction = most_common_en_word
            else:
                prediction = predict_probs(words[0], words[1])
            file.write(prediction + "\n")
    with open("test-A/out.tsv", "w") as file:
        for _, row in test_data.iterrows():
            text = prepare_text(str(row[7]))
            words = word_tokenize(text)
            if len(words) < 3:
                prediction = most_common_en_word
            else:
                prediction = predict_probs(words[0], words[1])
            file.write(prediction + "\n")

if __name__ == "__main__":
    # Prepare train data
    print("Preparing data...")
    train_data = train_data[[6, 7]]
    train_data = pd.concat([train_data, train_labels], axis=1)
    train_data["final"] = train_data[6] + train_data[0] + train_data[7]
    # declare model
    print("Preparing model...")
    model = defaultdict(lambda: defaultdict(lambda: 0))
    # train model
    print("Model training...")
    train_trigrams()
    # write outputs
    print("Writing outputs...")
    write_output()