Upload files to ''

Tomasz Grzybowski 2022-07-02 12:26:40 +02:00
parent e7f3220490
commit 3f225003a0
5 changed files with 5968 additions and 0 deletions

283
kenlm.ipynb Normal file

@@ -0,0 +1,283 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "kenlm.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"from google.colab import drive\n",
"drive.mount('/content/gdrive')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GQG8KfEo5BwV",
"outputId": "7899949c-5bc3-4d13-acb2-88aa47f46655"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /content/gdrive\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pip install https://github.com/kpu/kenlm/archive/master.zip"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GsoWSBmH5DT3",
"outputId": "f67d798f-54f8-4c90-bdef-590424b49dd5"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting https://github.com/kpu/kenlm/archive/master.zip\n",
" Using cached https://github.com/kpu/kenlm/archive/master.zip (550 kB)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pip install english_words"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "rwNPsafM6KSb",
"outputId": "b4e21df6-cf55-4f7a-843c-a87f1acc6082"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting english_words\n",
" Downloading english-words-1.1.0.tar.gz (1.1 MB)\n",
"\u001b[K |████████████████████████████████| 1.1 MB 5.4 MB/s \n",
"\u001b[?25hBuilding wheels for collected packages: english-words\n",
" Building wheel for english-words (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Created wheel for english-words: filename=english_words-1.1.0-py3-none-any.whl size=1106680 sha256=9959ed5d02a4c06063019ede18eebf1ef1be2562a62aa85f86a13d6a3fe1e34b\n",
" Stored in directory: /root/.cache/pip/wheels/25/3d/4c/12a119ce90b46b4f90f9ddf41d719ecabb40faec6103379fc8\n",
"Successfully built english-words\n",
"Installing collected packages: english-words\n",
"Successfully installed english-words-1.1.0\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import nltk\n",
"nltk.download(\"punkt\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "02yP2lJ9_4dT",
"outputId": "5de6ad9b-41e0-4577-9af3-4ceefe85f3d0"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt.zip.\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"source": [
" lmplz_command = f\"{KENLM_LMPLZ_PATH} -o 4 < train.txt > model.arpa\"\n",
" build_binary_command = f\"{KENLM_BUILD_BINARY_PATH} model.arpa model.binary\"\n",
" os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, lmplz_command))\n",
" os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, build_binary_command))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YC397rhc7-CW",
"outputId": "53adb185-9cbf-4ace-8556-7335776313d6"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"256"
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tt_ucItY484I",
"outputId": "e2839c64-b3b9-42fb-c2cf-dc7dc60ad8ab"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:51: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version.\n",
"\n",
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"import csv\n",
"import regex as re\n",
"import kenlm\n",
"from english_words import english_words_alpha_set\n",
"from nltk import word_tokenize\n",
"from math import log10\n",
"from pathlib import Path\n",
"import os\n",
"import numpy as np\n",
"\n",
"\n",
"KENLM_BUILD_PATH = Path(\"gdrive/My Drive/gonito/kenlm/build\")\n",
"KENLM_LMPLZ_PATH = KENLM_BUILD_PATH / \"bin\" / \"lmplz\"\n",
"KENLM_BUILD_BINARY_PATH = KENLM_BUILD_PATH / \"bin\" / \"build_binary\"\n",
"SUDO_PASSWORD = \"\"\n",
"PREDICTION = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'\n",
"\n",
"\n",
"def clean(text):\n",
" text = str(text).lower().replace(\"-\\\\n\", \"\").replace(\"\\\\n\", \" \")\n",
" return re.sub(r\"\\p{P}\", \"\", text)\n",
"\n",
"\n",
"def create_train_data():\n",
" data = pd.read_csv(\"gdrive/My Drive/gonito/train/in.tsv.xz\", sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=50000)\n",
" train_labels = pd.read_csv(\"gdrive/My Drive/gonito/train/expected.tsv\", sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=50000)\n",
"\n",
" train_data = data[[6, 7]]\n",
" train_data = pd.concat([train_data, train_labels], axis=1)\n",
"\n",
" return train_data[6] + train_data[0] + train_data[7]\n",
"\n",
"\n",
"def create_train_file(filename=\"gdrive/My Drive/gonito/train.txt\"):\n",
" with open(filename, \"w\") as f:\n",
" for line in create_train_data():\n",
" f.write(clean(line) + \"\\n\")\n",
" \n",
"\n",
"def train_model():\n",
" lmplz_command = f\"{KENLM_LMPLZ_PATH} -o 4 < train.txt > model.arpa\"\n",
" build_binary_command = f\"{KENLM_BUILD_BINARY_PATH} model.arpa model.binary\"\n",
" os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, lmplz_command))\n",
" os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, build_binary_command))\n",
" \n",
"\n",
"def softmax(x):\n",
" e_x = np.exp(x - np.max(x))\n",
" return e_x / e_x.sum(axis=0)\n",
"\n",
"def predict(model, before, after):\n",
" best_scores = []\n",
" for word in english_words_alpha_set:\n",
" text = ' '.join([before, word, after])\n",
" text_score = model.score(text, bos=False, eos=False)\n",
" if len(best_scores) < 12:\n",
" best_scores.append((word, text_score))\n",
" else:\n",
" worst_score = None\n",
" for score in best_scores:\n",
" if not worst_score:\n",
" worst_score = score\n",
" else:\n",
" if worst_score[1] > score[1]:\n",
" worst_score = score\n",
" if worst_score[1] < text_score:\n",
" best_scores.remove(worst_score)\n",
" best_scores.append((word, text_score))\n",
" probs = sorted(best_scores, key=lambda tup: tup[1], reverse=True)\n",
" pred_str = ''\n",
" for word, prob in probs:\n",
" pred_str += f'{word}:{prob} '\n",
" pred_str += f':{log10(0.99)}'\n",
" return pred_str\n",
"\n",
"def make_prediction(model, path, result_path):\n",
" data = pd.read_csv(path, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
" with open(result_path, 'w', encoding='utf-8') as file_out:\n",
" for _, row in data.iterrows():\n",
" before, after = word_tokenize(clean(str(row[6]))), word_tokenize(clean(str(row[7])))\n",
" if len(before) < 2 or len(after) < 2:\n",
" pred = PREDICTION\n",
" else:\n",
" pred = predict(model, before[-1], after[0])\n",
" file_out.write(pred + '\\n')\n",
"\n",
"\n",
"create_train_file()\n",
"train_model()\n",
"model = kenlm.Model('gdrive/My Drive/gonito/model.binary')\n",
"make_prediction(model, \"gdrive/My Drive/gonito/dev-0/in.tsv.xz\", \"gdrive/My Drive/gonito/dev-0/out.tsv\")\n",
"make_prediction(model, \"gdrive/My Drive/gonito/test-A/in.tsv.xz\", \"gdrive/My Drive/gonito/test-A/out.tsv\")"
]
}
]
}
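A side note on the notebook above: predict() ranks candidate gap words by scoring the trigram "before word after" with KenLM. A minimal standalone sketch of that scoring call, assuming the binary model has already been built at the path used in the notebook:

import kenlm

model = kenlm.Model('gdrive/My Drive/gonito/model.binary')  # path used in the notebook
# score() returns the total log10 probability of the text; bos/eos disable sentence markers
print(model.score('in the beginning', bos=False, eos=False))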

117
kenlm.py Normal file

@@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
"""kenlm.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ov9aRonhHahzGcs1BIMjVHEldjHg4yTs
"""
from google.colab import drive
drive.mount('/content/gdrive')
!pip install https://github.com/kpu/kenlm/archive/master.zip
!pip install english_words
import nltk
nltk.download("punkt")
# NOTE: this block comes from a stray notebook cell; it duplicates train_model() below and
# references KENLM_LMPLZ_PATH, SUDO_PASSWORD and os before they are defined in this script.
lmplz_command = f"{KENLM_LMPLZ_PATH} -o 4 < train.txt > model.arpa"
build_binary_command = f"{KENLM_BUILD_BINARY_PATH} model.arpa model.binary"
os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, lmplz_command))
os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, build_binary_command))
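# Sketch (not part of the original pipeline): the same KenLM commands can be run via
# subprocess without piping the sudo password through a shell. Paths and file names are
# the same assumptions as in train_model() further down; this helper is illustrative
# only and is never called.
import subprocess

def train_model_subprocess():
    with open("train.txt") as fin, open("model.arpa", "w") as fout:
        subprocess.run([str(KENLM_LMPLZ_PATH), "-o", "4"], stdin=fin, stdout=fout, check=True)
    subprocess.run([str(KENLM_BUILD_BINARY_PATH), "model.arpa", "model.binary"], check=True)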
import pandas as pd
import csv
import regex as re
import kenlm
from english_words import english_words_alpha_set
from nltk import word_tokenize
from math import log10
from pathlib import Path
import os
import numpy as np
KENLM_BUILD_PATH = Path("gdrive/My Drive/gonito/kenlm/build")
KENLM_LMPLZ_PATH = KENLM_BUILD_PATH / "bin" / "lmplz"
KENLM_BUILD_BINARY_PATH = KENLM_BUILD_PATH / "bin" / "build_binary"
SUDO_PASSWORD = ""
PREDICTION = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'

def clean(text):
    # lower-case and strip punctuation and escaped line breaks
    text = str(text).lower().replace("-\\n", "").replace("\\n", " ")
    return re.sub(r"\p{P}", "", text)


def create_train_data():
    data = pd.read_csv("gdrive/My Drive/gonito/train/in.tsv.xz", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=50000)
    train_labels = pd.read_csv("gdrive/My Drive/gonito/train/expected.tsv", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=50000)

    train_data = data[[6, 7]]
    train_data = pd.concat([train_data, train_labels], axis=1)

    # left context (col 6) + expected gap word + right context (col 7), one line per row
    return train_data[6] + train_data[0] + train_data[7]


def create_train_file(filename="gdrive/My Drive/gonito/train.txt"):
    with open(filename, "w") as f:
        for line in create_train_data():
            f.write(clean(line) + "\n")


def train_model():
    # build a 4-gram ARPA model with lmplz and compile it to a binary file
    lmplz_command = f"{KENLM_LMPLZ_PATH} -o 4 < train.txt > model.arpa"
    build_binary_command = f"{KENLM_BUILD_BINARY_PATH} model.arpa model.binary"
    os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, lmplz_command))
    os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, build_binary_command))


def softmax(x):
    # numerically stable softmax (not called elsewhere in this script)
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)


def predict(model, before, after):
    # keep the 12 best candidate gap words by KenLM score of "before word after"
    best_scores = []
    for word in english_words_alpha_set:
        text = ' '.join([before, word, after])
        text_score = model.score(text, bos=False, eos=False)
        if len(best_scores) < 12:
            best_scores.append((word, text_score))
        else:
            worst_score = None
            for score in best_scores:
                if not worst_score:
                    worst_score = score
                else:
                    if worst_score[1] > score[1]:
                        worst_score = score
            if worst_score[1] < text_score:
                best_scores.remove(worst_score)
                best_scores.append((word, text_score))
    probs = sorted(best_scores, key=lambda tup: tup[1], reverse=True)
    pred_str = ''
    for word, prob in probs:
        pred_str += f'{word}:{prob} '
    pred_str += f':{log10(0.99)}'
    return pred_str


def make_prediction(model, path, result_path):
    # write one prediction line per input row; fall back to PREDICTION for very short contexts
    data = pd.read_csv(path, sep='\t', header=None, quoting=csv.QUOTE_NONE)
    with open(result_path, 'w', encoding='utf-8') as file_out:
        for _, row in data.iterrows():
            before, after = word_tokenize(clean(str(row[6]))), word_tokenize(clean(str(row[7])))
            if len(before) < 2 or len(after) < 2:
                pred = PREDICTION
            else:
                pred = predict(model, before[-1], after[0])
            file_out.write(pred + '\n')

create_train_file()
train_model()
model = kenlm.Model('gdrive/My Drive/gonito/model.binary')
make_prediction(model, "gdrive/My Drive/gonito/dev-0/in.tsv.xz", "gdrive/My Drive/gonito/dev-0/out.tsv")
make_prediction(model, "gdrive/My Drive/gonito/test-A/in.tsv.xz", "gdrive/My Drive/gonito/test-A/out.tsv")
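A note on kenlm.py above: predict() writes raw KenLM log10 scores into the word:score pairs, and the softmax() helper it defines is never called. A minimal sketch of how the top-k log10 scores could instead be normalized into probabilities before being written out, assuming that was the intended role of softmax(); scores_to_probs is an illustrative name, not from the original code:

import numpy as np

def scores_to_probs(best_scores):
    # best_scores: list of (word, log10_score) tuples as built inside predict()
    words = [w for w, _ in best_scores]
    log_scores = np.array([s for _, s in best_scores])
    probs = np.power(10.0, log_scores - np.max(log_scores))  # shift by the max for stability
    probs /= probs.sum()
    return ' '.join(f'{w}:{p:.4f}' for w, p in zip(words, probs))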

78
n-gram.py Normal file

@@ -0,0 +1,78 @@
import pandas as pd
import csv
import regex as re
from nltk import bigrams, word_tokenize
from collections import Counter, defaultdict
import string
import unicodedata
DEFAULT_PREDICTION = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
data = pd.read_csv("train/in.tsv.xz", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
train_labels = pd.read_csv("train/expected.tsv", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
train_data = data[[6, 7]]
train_data = pd.concat([train_data, train_labels], axis=1)
train_data["final"] = train_data[6] + train_data[0] + train_data[7]
model = defaultdict(lambda: defaultdict(lambda: 0))

def clean(text):
    text = str(text).lower().replace("-\\n", "").replace("\\n", " ")
    return re.sub(r"\p{P}", "", text)


# Count bigram co-occurrences over the concatenated context + gap word + context lines
for _, row in train_data.iterrows():
    words = word_tokenize(clean(row["final"]))
    for w1, w2 in bigrams(words, pad_left=True, pad_right=True):
        if w1 and w2:
            model[w1][w2] += 1

# Normalize counts into conditional probabilities P(w2 | w1)
for w1 in model:
    total_count = float(sum(model[w1].values()))
    for w2 in model[w1]:
        model[w1][w2] /= total_count


def predict(word):
    # top-5 continuations of the given word plus the remaining probability mass
    predictions = dict(model[word])
    most_common = dict(Counter(predictions).most_common(5))

    total_prob = 0.0
    str_prediction = ""

    for word, prob in most_common.items():
        total_prob += prob
        str_prediction += f"{word}:{prob} "

    if not total_prob:
        return DEFAULT_PREDICTION

    if 1 - total_prob >= 0.01:
        str_prediction += f":{1-total_prob}"
    else:
        str_prediction += ":0.01"

    return str_prediction


data = pd.read_csv("dev-0/in.tsv.xz", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
with open("dev-0/out.tsv", "w", encoding="UTF-8") as file:
    for _, row in data.iterrows():
        words = word_tokenize(clean(row[6]))
        if len(words) < 3:
            prediction = DEFAULT_PREDICTION
        else:
            prediction = predict(words[-1])
        file.write(prediction + "\n")

data = pd.read_csv("test-A/in.tsv.xz", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
with open("test-A/out.tsv", "w", encoding="UTF-8") as file:
    for _, row in data.iterrows():
        words = word_tokenize(clean(row[6]))
        if len(words) < 3:
            prediction = DEFAULT_PREDICTION
        else:
            prediction = predict(words[-1])
        file.write(prediction + "\n")
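
A minimal usage sketch of the bigram model built in n-gram.py, assuming the definitions above are in scope; "the" is only an illustrative query word and the values in the comments are hypothetical:

# After the normalization loop, model[w1][w2] holds the estimate of P(w2 | w1).
next_word_probs = dict(model["the"])   # e.g. {"same": 0.04, "first": 0.03, ...}
line = predict("the")                  # e.g. "same:0.04 first:0.03 ... :0.8"
print(line)                            # one such line per input row is written to out.tsv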

5298
neural_network.ipynb Normal file

File diff suppressed because it is too large.

192
neural_network.py Normal file

@@ -0,0 +1,192 @@
# -*- coding: utf-8 -*-
"""neural_network.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1c4GrMHn9isBMqMEfYsJftgWbWqqLO1Np
"""
from google.colab import drive
drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/gonito/'
import torch
torch.cuda.is_available()
import torch
import csv
torch.cuda.empty_cache()
from torch.utils.data import DataLoader
import pandas as pd
from os.path import exists
from torchtext.vocab import build_vocab_from_iterator
import itertools
import regex as re
from csv import QUOTE_NONE
from torch import nn
ENCODING = "utf-8"
REP = re.compile(r"[{}\[\]\&%^$*#\(\)@\t\n0123456789]+")
REM = re.compile(r"'s|[\-]\\n|\-\\n|\p{P}")

def read_csv(fname):
    return pd.read_csv(fname, sep="\t", on_bad_lines='skip', header=None, quoting=QUOTE_NONE, encoding=ENCODING)


def clean_text(text):
    res = str(text).lower().strip()
    res = res.replace("’", "'")  # the replaced character was lost in the diff; a curly apostrophe is assumed here
    res = REM.sub("", res)
    res = REP.sub(" ", res)
    res = res.replace("'t", " not")
    res = res.replace("'s", " is")
    res = res.replace("'ll", " will")
    res = res.replace("won't", "will not")
    res = res.replace("isn't", "is not")
    res = res.replace("aren't", "are not")
    res = res.replace("'ve'", "have")
    return res.replace("'m", " am")


def get_words_from_line(line, specials = True):
    line = line.rstrip()
    if specials:
        yield '<s>'
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    if specials:
        yield '</s>'


def get_word_lines_from_data(d):
    for line in d:
        yield get_words_from_line(line)


class Bigrams(torch.utils.data.IterableDataset):
    def __init__(self, data, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_data(data),
            max_tokens = vocabulary_size,
            specials = ['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.data = data

    @staticmethod
    def look_ahead_iterator(gen):
        # yields consecutive (w1, w2) token-id pairs from a flat token stream
        w1 = None
        for item in gen:
            if w1 is not None:
                yield (w1, item)
            w1 = item

    def __iter__(self):
        return self.look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_data(self.data))))


class SimpleBigramNeuralLanguageModel(torch.nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(),
        )

    def forward(self, x):
        return self.model(x)

data = read_csv("gdrive/My Drive/gonito/train/in.tsv.xz")
train_words = read_csv("gdrive/My Drive/gonito/train/expected.tsv")
train_data = data[[6, 7]]
train_data = pd.concat([train_data, train_words], axis=1)
train_data = train_data[6] + train_data[0] + train_data[7]
train_data = train_data.apply(clean_text)
vocab_size = 30000
embed_size = 150
train_dataset = Bigrams(train_data, vocab_size)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
print(device)
if not exists('model1.bin'):
    data = DataLoader(train_dataset, batch_size=8000)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()
    model.train()
    step = 0
    for i in range(2):
        print(f"EPOCH {i}=========================")
        for x, y in data:
            x = x.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            ypredicted = model(x)
            loss = criterion(torch.log(ypredicted), y)
            if step % 100 == 0:
                print(step, loss)
            step += 1
            loss.backward()
            optimizer.step()
    torch.save(model.state_dict(), 'model1.bin')
else:
    print("Loading model1")
    model.load_state_dict(torch.load('model1.bin'))
vocab = train_dataset.vocab
def predict(tokens):
    ixs = torch.tensor(vocab.forward(tokens)).to(device)
    out = model(ixs)
    top = torch.topk(out[0], 8)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    result = ""
    for word, prob in list(zip(top_words, top_probs)):
        result += f"{word}:{prob} "
    # result += f':0.01'
    return result
DEFAULT_PREDICTION = "a:0.2 the:0.2 to:0.2 of:0.1 and:0.1 of:0.1 :0.1"
def predict_file(result_path, data):
    with open(result_path, "w+", encoding="UTF-8") as f:
        for row in data:
            result = {}
            before = None
            for before in get_words_from_line(clean_text(str(row)), False):
                pass
            before = [before]
            print(before)
            if len(before) < 1:
                result = DEFAULT_PREDICTION
            else:
                result = predict(before)
            result = result.strip()
            f.write(result + "\n")
            print(result)
dev_data = pd.read_csv("gdrive/My Drive/gonito/dev-0/in.tsv.xz", sep='\t', header=None, quoting=csv.QUOTE_NONE)[6]
dev_data = dev_data.apply(clean_text)
predict_file("gdrive/My Drive/gonito/dev-0/out.tsv", dev_data)
test_data = pd.read_csv("gdrive/My Drive/gonito/test-A/in.tsv.xz", sep='\t', header=None, quoting=csv.QUOTE_NONE)[6]
test_data = test_data.apply(clean_text)
predict_file("gdrive/My Drive/gonito/test-A/out.tsv", test_data)