Upload files to ''
This commit is contained in:
parent e7f3220490
commit 3f225003a0
283 kenlm.ipynb Normal file
@@ -0,0 +1,283 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "kenlm.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"source": [
"from google.colab import drive\n",
"drive.mount('/content/gdrive')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GQG8KfEo5BwV",
"outputId": "7899949c-5bc3-4d13-acb2-88aa47f46655"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /content/gdrive\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pip install https://github.com/kpu/kenlm/archive/master.zip"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GsoWSBmH5DT3",
"outputId": "f67d798f-54f8-4c90-bdef-590424b49dd5"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting https://github.com/kpu/kenlm/archive/master.zip\n",
"  Using cached https://github.com/kpu/kenlm/archive/master.zip (550 kB)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!pip install english_words"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "rwNPsafM6KSb",
"outputId": "b4e21df6-cf55-4f7a-843c-a87f1acc6082"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting english_words\n",
"  Downloading english-words-1.1.0.tar.gz (1.1 MB)\n",
"\u001b[K |████████████████████████████████| 1.1 MB 5.4 MB/s \n",
"\u001b[?25hBuilding wheels for collected packages: english-words\n",
"  Building wheel for english-words (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"  Created wheel for english-words: filename=english_words-1.1.0-py3-none-any.whl size=1106680 sha256=9959ed5d02a4c06063019ede18eebf1ef1be2562a62aa85f86a13d6a3fe1e34b\n",
"  Stored in directory: /root/.cache/pip/wheels/25/3d/4c/12a119ce90b46b4f90f9ddf41d719ecabb40faec6103379fc8\n",
"Successfully built english-words\n",
"Installing collected packages: english-words\n",
"Successfully installed english-words-1.1.0\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import nltk\n",
"nltk.download(\"punkt\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "02yP2lJ9_4dT",
"outputId": "5de6ad9b-41e0-4577-9af3-4ceefe85f3d0"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt.zip.\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"execution_count": 12
}
]
},
{
"cell_type": "code",
"source": [
" lmplz_command = f\"{KENLM_LMPLZ_PATH} -o 4 < train.txt > model.arpa\"\n",
" build_binary_command = f\"{KENLM_BUILD_BINARY_PATH} model.arpa model.binary\"\n",
" os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, lmplz_command))\n",
" os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, build_binary_command))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YC397rhc7-CW",
"outputId": "53adb185-9cbf-4ace-8556-7335776313d6"
},
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"256"
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tt_ucItY484I",
"outputId": "e2839c64-b3b9-42fb-c2cf-dc7dc60ad8ab"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:51: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version.\n",
"\n",
"\n"
]
}
],
"source": [
"import pandas as pd\n",
"import csv\n",
"import regex as re\n",
"import kenlm\n",
"from english_words import english_words_alpha_set\n",
"from nltk import word_tokenize\n",
"from math import log10\n",
"from pathlib import Path\n",
"import os\n",
"import numpy as np\n",
"\n",
"\n",
"KENLM_BUILD_PATH = Path(\"gdrive/My Drive/gonito/kenlm/build\")\n",
"KENLM_LMPLZ_PATH = KENLM_BUILD_PATH / \"bin\" / \"lmplz\"\n",
"KENLM_BUILD_BINARY_PATH = KENLM_BUILD_PATH / \"bin\" / \"build_binary\"\n",
"SUDO_PASSWORD = \"\"\n",
"PREDICTION = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'\n",
"\n",
"\n",
"def clean(text):\n",
"    text = str(text).lower().replace(\"-\\\\n\", \"\").replace(\"\\\\n\", \" \")\n",
"    return re.sub(r\"\\p{P}\", \"\", text)\n",
"\n",
"\n",
"def create_train_data():\n",
"    data = pd.read_csv(\"gdrive/My Drive/gonito/train/in.tsv.xz\", sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=50000)\n",
"    train_labels = pd.read_csv(\"gdrive/My Drive/gonito/train/expected.tsv\", sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=50000)\n",
"\n",
"    train_data = data[[6, 7]]\n",
"    train_data = pd.concat([train_data, train_labels], axis=1)\n",
"\n",
"    return train_data[6] + train_data[0] + train_data[7]\n",
"\n",
"\n",
"def create_train_file(filename=\"gdrive/My Drive/gonito/train.txt\"):\n",
"    with open(filename, \"w\") as f:\n",
"        for line in create_train_data():\n",
"            f.write(clean(line) + \"\\n\")\n",
" \n",
"\n",
"def train_model():\n",
"    lmplz_command = f\"{KENLM_LMPLZ_PATH} -o 4 < train.txt > model.arpa\"\n",
"    build_binary_command = f\"{KENLM_BUILD_BINARY_PATH} model.arpa model.binary\"\n",
"    os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, lmplz_command))\n",
"    os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, build_binary_command))\n",
" \n",
"\n",
"def softmax(x):\n",
"    e_x = np.exp(x - np.max(x))\n",
"    return e_x / e_x.sum(axis=0)\n",
"\n",
"def predict(model, before, after):\n",
"    best_scores = []\n",
"    for word in english_words_alpha_set:\n",
"        text = ' '.join([before, word, after])\n",
"        text_score = model.score(text, bos=False, eos=False)\n",
"        if len(best_scores) < 12:\n",
"            best_scores.append((word, text_score))\n",
"        else:\n",
"            worst_score = None\n",
"            for score in best_scores:\n",
"                if not worst_score:\n",
"                    worst_score = score\n",
"                else:\n",
"                    if worst_score[1] > score[1]:\n",
"                        worst_score = score\n",
"            if worst_score[1] < text_score:\n",
"                best_scores.remove(worst_score)\n",
"                best_scores.append((word, text_score))\n",
"    probs = sorted(best_scores, key=lambda tup: tup[1], reverse=True)\n",
"    pred_str = ''\n",
"    for word, prob in probs:\n",
"        pred_str += f'{word}:{prob} '\n",
"    pred_str += f':{log10(0.99)}'\n",
"    return pred_str\n",
"\n",
"def make_prediction(model, path, result_path):\n",
"    data = pd.read_csv(path, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
"    with open(result_path, 'w', encoding='utf-8') as file_out:\n",
"        for _, row in data.iterrows():\n",
"            before, after = word_tokenize(clean(str(row[6]))), word_tokenize(clean(str(row[7])))\n",
"            if len(before) < 2 or len(after) < 2:\n",
"                pred = PREDICTION\n",
"            else:\n",
"                pred = predict(model, before[-1], after[0])\n",
"            file_out.write(pred + '\\n')\n",
"\n",
"\n",
"create_train_file()\n",
"train_model()\n",
"model = kenlm.Model('gdrive/My Drive/gonito/model.binary')\n",
"make_prediction(model, \"gdrive/My Drive/gonito/dev-0/in.tsv.xz\", \"gdrive/My Drive/gonito/dev-0/out.tsv\")\n",
"make_prediction(model, \"gdrive/My Drive/gonito/test-A/in.tsv.xz\", \"gdrive/My Drive/gonito/test-A/out.tsv\")"
]
}
]
}
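
Note, not part of the committed notebook: the FutureWarning captured in the last cell's stderr comes from the error_bad_lines argument, which newer pandas versions deprecate. A minimal sketch of the replacement call (same file and options as create_train_data, but with on_bad_lines="skip"), the form that neural_network.py later in this commit already uses:

import csv
import pandas as pd

# Sketch only: pandas >= 1.3 replaces error_bad_lines=False with on_bad_lines="skip".
data = pd.read_csv(
    "gdrive/My Drive/gonito/train/in.tsv.xz",
    sep="\t",
    on_bad_lines="skip",
    header=None,
    quoting=csv.QUOTE_NONE,
    nrows=50000,
)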
117 kenlm.py Normal file
@@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
"""kenlm.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1ov9aRonhHahzGcs1BIMjVHEldjHg4yTs
"""

from google.colab import drive
drive.mount('/content/gdrive')

!pip install https://github.com/kpu/kenlm/archive/master.zip

!pip install english_words

import nltk
nltk.download("punkt")

lmplz_command = f"{KENLM_LMPLZ_PATH} -o 4 < train.txt > model.arpa"
build_binary_command = f"{KENLM_BUILD_BINARY_PATH} model.arpa model.binary"
os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, lmplz_command))
os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, build_binary_command))

import pandas as pd
import csv
import regex as re
import kenlm
from english_words import english_words_alpha_set
from nltk import word_tokenize
from math import log10
from pathlib import Path
import os
import numpy as np


KENLM_BUILD_PATH = Path("gdrive/My Drive/gonito/kenlm/build")
KENLM_LMPLZ_PATH = KENLM_BUILD_PATH / "bin" / "lmplz"
KENLM_BUILD_BINARY_PATH = KENLM_BUILD_PATH / "bin" / "build_binary"
SUDO_PASSWORD = ""
PREDICTION = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'


def clean(text):
    text = str(text).lower().replace("-\\n", "").replace("\\n", " ")
    return re.sub(r"\p{P}", "", text)


def create_train_data():
    data = pd.read_csv("gdrive/My Drive/gonito/train/in.tsv.xz", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=50000)
    train_labels = pd.read_csv("gdrive/My Drive/gonito/train/expected.tsv", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=50000)

    train_data = data[[6, 7]]
    train_data = pd.concat([train_data, train_labels], axis=1)

    return train_data[6] + train_data[0] + train_data[7]


def create_train_file(filename="gdrive/My Drive/gonito/train.txt"):
    with open(filename, "w") as f:
        for line in create_train_data():
            f.write(clean(line) + "\n")


def train_model():
    lmplz_command = f"{KENLM_LMPLZ_PATH} -o 4 < train.txt > model.arpa"
    build_binary_command = f"{KENLM_BUILD_BINARY_PATH} model.arpa model.binary"
    os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, lmplz_command))
    os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, build_binary_command))


def softmax(x):
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def predict(model, before, after):
    best_scores = []
    for word in english_words_alpha_set:
        text = ' '.join([before, word, after])
        text_score = model.score(text, bos=False, eos=False)
        if len(best_scores) < 12:
            best_scores.append((word, text_score))
        else:
            worst_score = None
            for score in best_scores:
                if not worst_score:
                    worst_score = score
                else:
                    if worst_score[1] > score[1]:
                        worst_score = score
            if worst_score[1] < text_score:
                best_scores.remove(worst_score)
                best_scores.append((word, text_score))
    probs = sorted(best_scores, key=lambda tup: tup[1], reverse=True)
    pred_str = ''
    for word, prob in probs:
        pred_str += f'{word}:{prob} '
    pred_str += f':{log10(0.99)}'
    return pred_str

def make_prediction(model, path, result_path):
    data = pd.read_csv(path, sep='\t', header=None, quoting=csv.QUOTE_NONE)
    with open(result_path, 'w', encoding='utf-8') as file_out:
        for _, row in data.iterrows():
            before, after = word_tokenize(clean(str(row[6]))), word_tokenize(clean(str(row[7])))
            if len(before) < 2 or len(after) < 2:
                pred = PREDICTION
            else:
                pred = predict(model, before[-1], after[0])
            file_out.write(pred + '\n')


create_train_file()
train_model()
model = kenlm.Model('gdrive/My Drive/gonito/model.binary')
make_prediction(model, "gdrive/My Drive/gonito/dev-0/in.tsv.xz", "gdrive/My Drive/gonito/dev-0/out.tsv")
make_prediction(model, "gdrive/My Drive/gonito/test-A/in.tsv.xz", "gdrive/My Drive/gonito/test-A/out.tsv")
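
Note, not part of the committed file: softmax() above is defined but never called, and predict() writes raw KenLM log10 scores into the word:score pairs rather than probabilities. A minimal sketch of how those scores could be normalized for the word:prob output format; model.binary, the sentence context, and the candidate words are hypothetical:

import kenlm
import numpy as np

model = kenlm.Model("model.binary")                 # assumes train_model() has produced it
candidates = ["the", "a", "his"]                    # hypothetical gap fillers
log10_scores = np.array([model.score(f"in {w} house", bos=False, eos=False)
                         for w in candidates])
rel = 10.0 ** (log10_scores - log10_scores.max())   # back to relative probabilities
probs = 0.9 * rel / rel.sum()                       # keep 0.1 of the mass for the ':' rest
print(" ".join(f"{w}:{p:.3f}" for w, p in zip(candidates, probs)) + " :0.1")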
78 n-gram.py Normal file
@@ -0,0 +1,78 @@
import pandas as pd
import csv
import regex as re
from nltk import bigrams, word_tokenize
from collections import Counter, defaultdict
import string
import unicodedata

DEFAULT_PREDICTION = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"

data = pd.read_csv("train/in.tsv.xz", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
train_labels = pd.read_csv("train/expected.tsv", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)

train_data = data[[6, 7]]
train_data = pd.concat([train_data, train_labels], axis=1)

train_data["final"] = train_data[6] + train_data[0] + train_data[7]

model = defaultdict(lambda: defaultdict(lambda: 0))


def clean(text):
    text = str(text).lower().replace("-\\n", "").replace("\\n", " ")
    return re.sub(r"\p{P}", "", text)

for _, row in train_data.iterrows():
    words = word_tokenize(clean(row["final"]))
    for w1, w2 in bigrams(words, pad_left=True, pad_right=True):
        if w1 and w2:
            model[w1][w2] += 1
for w1 in model:
    total_count = float(sum(model[w1].values()))
    for w2 in model[w1]:
        model[w1][w2] /= total_count


def predict(word):
    predictions = dict(model[word])
    most_common = dict(Counter(predictions).most_common(5))

    total_prob = 0.0
    str_prediction = ""

    for word, prob in most_common.items():
        total_prob += prob
        str_prediction += f"{word}:{prob} "

    if not total_prob:
        return DEFAULT_PREDICTION

    if 1 - total_prob >= 0.01:
        str_prediction += f":{1-total_prob}"
    else:
        str_prediction += f":0.01"

    return str_prediction


data = pd.read_csv("dev-0/in.tsv.xz", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
with open("dev-0/out.tsv", "w", encoding="UTF-8") as file:
    for _, row in data.iterrows():
        words = word_tokenize(clean(row[6]))
        if len(words) < 3:
            prediction = DEFAULT_PREDICTION
        else:
            prediction = predict(words[-1])
        file.write(prediction + "\n")


data = pd.read_csv("test-A/in.tsv.xz", sep="\t", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)
with open("test-A/out.tsv", "w", encoding="UTF-8") as file:
    for _, row in data.iterrows():
        words = word_tokenize(clean(row[6]))
        if len(words) < 3:
            prediction = DEFAULT_PREDICTION
        else:
            prediction = predict(words[-1])
        file.write(prediction + "\n")
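
Note, not part of the committed file: a toy-sized run of the same bigram estimation, showing the kind of distribution predict() reads from model; the corpus is made up, and plain str.split stands in for word_tokenize so the snippet needs no NLTK data download:

from collections import Counter, defaultdict
from nltk import bigrams

toy_model = defaultdict(lambda: defaultdict(lambda: 0))
corpus = ["the cat sat on the mat", "the cat ate the fish"]   # hypothetical corpus
for line in corpus:
    for w1, w2 in bigrams(line.split()):
        toy_model[w1][w2] += 1
for w1 in toy_model:
    total_count = float(sum(toy_model[w1].values()))
    for w2 in toy_model[w1]:
        toy_model[w1][w2] /= total_count

print(dict(Counter(dict(toy_model["the"])).most_common(5)))
# -> {'cat': 0.5, 'mat': 0.25, 'fish': 0.25}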
5298 neural_network.ipynb Normal file
File diff suppressed because it is too large
192 neural_network.py Normal file
@@ -0,0 +1,192 @@
# -*- coding: utf-8 -*-
"""neural_network.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1c4GrMHn9isBMqMEfYsJftgWbWqqLO1Np
"""

from google.colab import drive
drive.mount('/content/gdrive')

root_path = 'gdrive/My Drive/gonito/'

import torch
torch.cuda.is_available()

import torch
import csv
torch.cuda.empty_cache()
from torch.utils.data import DataLoader
import pandas as pd
from os.path import exists
from torchtext.vocab import build_vocab_from_iterator
import itertools
import regex as re
from csv import QUOTE_NONE
from torch import nn

ENCODING = "utf-8"

REP = re.compile(r"[{}\[\]\&%^$*#\(\)@\t\n0123456789]+")
REM = re.compile(r"'s|[\-]\\n|\-\\n|\p{P}")

def read_csv(fname):
    return pd.read_csv(fname, sep="\t", on_bad_lines='skip', header=None, quoting=QUOTE_NONE, encoding=ENCODING)

def clean_text(text):
    res = str(text).lower().strip()
    res = res.replace("’", "'")
    res = REM.sub("", res)
    res = REP.sub(" ", res)
    res = res.replace("'t", " not")
    res = res.replace("'s", " is")
    res = res.replace("'ll", " will")
    res = res.replace("won't", "will not")
    res = res.replace("isn't", "is not")
    res = res.replace("aren't", "are not")
    res = res.replace("'ve'", "have")
    return res.replace("'m", " am")

def get_words_from_line(line, specials = True):
    line = line.rstrip()
    if specials:
        yield '<s>'
    for m in re.finditer(r'[\p{L}0-9\*]+|\p{P}+', line):
        yield m.group(0).lower()
    if specials:
        yield '</s>'


def get_word_lines_from_data(d):
    for line in d:
        yield get_words_from_line(line)




class Bigrams(torch.utils.data.IterableDataset):
    def __init__(self, data, vocabulary_size):
        self.vocab = build_vocab_from_iterator(
            get_word_lines_from_data(data),
            max_tokens = vocabulary_size,
            specials = ['<unk>'])
        self.vocab.set_default_index(self.vocab['<unk>'])
        self.vocabulary_size = vocabulary_size
        self.data = data

    @staticmethod
    def look_ahead_iterator(gen):
        w1 = None
        for item in gen:
            if w1 is not None:
                yield (w1, item)
            w1 = item

    def __iter__(self):
        return self.look_ahead_iterator(
            (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_data(self.data))))

class SimpleBigramNeuralLanguageModel(torch.nn.Module):
    def __init__(self, vocabulary_size, embedding_size):
        super(SimpleBigramNeuralLanguageModel, self).__init__()
        self.model = nn.Sequential(
            nn.Embedding(vocabulary_size, embedding_size),
            nn.Linear(embedding_size, vocabulary_size),
            nn.Softmax(),
        )

    def forward(self, x):
        return self.model(x)




data = read_csv("gdrive/My Drive/gonito/train/in.tsv.xz")
train_words = read_csv("gdrive/My Drive/gonito/train/expected.tsv")

train_data = data[[6, 7]]
train_data = pd.concat([train_data, train_words], axis=1)
train_data = train_data[6] + train_data[0] + train_data[7]
train_data = train_data.apply(clean_text)

vocab_size = 30000
embed_size = 150

train_dataset = Bigrams(train_data, vocab_size)



device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)
print(device)
if(not exists('model1.bin')):
    data = DataLoader(train_dataset, batch_size=8000)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = torch.nn.NLLLoss()

    model.train()
    step = 0
    for i in range(2):
        print(f"EPOCH {i}=========================")
        for x, y in data:
            x = x.to(device)
            y = y.to(device)
            optimizer.zero_grad()
            ypredicted = model(x)
            loss = criterion(torch.log(ypredicted), y)
            if step % 100 == 0:
                print(step, loss)
            step += 1
            loss.backward()
            optimizer.step()

    torch.save(model.state_dict(), 'model1.bin')
else:
    print("Loading model1")
    model.load_state_dict(torch.load('model1.bin'))



vocab = train_dataset.vocab

def predict(tokens):
    ixs = torch.tensor(vocab.forward(tokens)).to(device)
    out = model(ixs)
    top = torch.topk(out[0], 8)
    top_indices = top.indices.tolist()
    top_probs = top.values.tolist()
    top_words = vocab.lookup_tokens(top_indices)
    result = ""
    for word, prob in list(zip(top_words, top_probs)):
        result += f"{word}:{prob} "
    # result += f':0.01'
    return result

DEFAULT_PREDICTION = "a:0.2 the:0.2 to:0.2 of:0.1 and:0.1 of:0.1 :0.1"

def predict_file(result_path, data):
    with open(result_path, "w+", encoding="UTF-8") as f:
        for row in data:
            result = {}
            before = None
            for before in get_words_from_line(clean_text(str(row)), False):
                pass
            before = [before]
            print(before)
            if(len(before) < 1):
                result = DEFAULT_PREDICTION
            else:
                result = predict(before)
            result = result.strip()
            f.write(result + "\n")
            print(result)

dev_data = pd.read_csv("gdrive/My Drive/gonito/dev-0/in.tsv.xz", sep='\t', header=None, quoting=csv.QUOTE_NONE)[6]
dev_data = dev_data.apply(clean_text)
predict_file("gdrive/My Drive/gonito/dev-0/out.tsv", dev_data)

test_data = pd.read_csv("gdrive/My Drive/gonito/test-A/in.tsv.xz", sep='\t', header=None, quoting=csv.QUOTE_NONE)[6]
test_data = test_data.apply(clean_text)
predict_file("gdrive/My Drive/gonito/test-A/out.tsv", test_data)
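
Note, not part of the committed file: the model above ends in nn.Softmax() and the training loop applies torch.log before NLLLoss, which is mathematically equivalent to cross-entropy on logits but numerically less stable than staying in log space. A minimal sketch of the usual alternative with the same layer sizes (random tensors stand in for a real batch):

import torch
from torch import nn

vocab_size, embed_size = 30000, 150
model = nn.Sequential(
    nn.Embedding(vocab_size, embed_size),
    nn.Linear(embed_size, vocab_size),
    nn.LogSoftmax(dim=1),                # emits log-probabilities directly
)
criterion = nn.NLLLoss()

x = torch.randint(0, vocab_size, (8,))   # batch of context-word indices
y = torch.randint(0, vocab_size, (8,))   # target next-word indices
loss = criterion(model(x), y)            # no explicit torch.log needed
loss.backward()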