diff --git a/kenlm.ipynb b/kenlm.ipynb
new file mode 100644
index 0000000..6f57391
--- /dev/null
+++ b/kenlm.ipynb
@@ -0,0 +1,283 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "kenlm.ipynb",
+ "provenance": [],
+ "collapsed_sections": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "source": [
+ "from google.colab import drive\n",
+ "drive.mount('/content/gdrive')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "GQG8KfEo5BwV",
+ "outputId": "7899949c-5bc3-4d13-acb2-88aa47f46655"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Mounted at /content/gdrive\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install https://github.com/kpu/kenlm/archive/master.zip"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "GsoWSBmH5DT3",
+ "outputId": "f67d798f-54f8-4c90-bdef-590424b49dd5"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Collecting https://github.com/kpu/kenlm/archive/master.zip\n",
+ " Using cached https://github.com/kpu/kenlm/archive/master.zip (550 kB)\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install english_words"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "rwNPsafM6KSb",
+ "outputId": "b4e21df6-cf55-4f7a-843c-a87f1acc6082"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Collecting english_words\n",
+ " Downloading english-words-1.1.0.tar.gz (1.1 MB)\n",
+ "\u001b[K |████████████████████████████████| 1.1 MB 5.4 MB/s \n",
+ "\u001b[?25hBuilding wheels for collected packages: english-words\n",
+ " Building wheel for english-words (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ " Created wheel for english-words: filename=english_words-1.1.0-py3-none-any.whl size=1106680 sha256=9959ed5d02a4c06063019ede18eebf1ef1be2562a62aa85f86a13d6a3fe1e34b\n",
+ " Stored in directory: /root/.cache/pip/wheels/25/3d/4c/12a119ce90b46b4f90f9ddf41d719ecabb40faec6103379fc8\n",
+ "Successfully built english-words\n",
+ "Installing collected packages: english-words\n",
+ "Successfully installed english-words-1.1.0\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import nltk\n",
+ "nltk.download(\"punkt\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "02yP2lJ9_4dT",
+ "outputId": "5de6ad9b-41e0-4577-9af3-4ceefe85f3d0"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
+ "[nltk_data] Unzipping tokenizers/punkt.zip.\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 12
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ " lmplz_command = f\"{KENLM_LMPLZ_PATH} -o 4 < train.txt > model.arpa\"\n",
+ " build_binary_command = f\"{KENLM_BUILD_BINARY_PATH} model.arpa model.binary\"\n",
+ " os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, lmplz_command))\n",
+ " os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, build_binary_command))"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "YC397rhc7-CW",
+ "outputId": "53adb185-9cbf-4ace-8556-7335776313d6"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "256"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "tt_ucItY484I",
+ "outputId": "e2839c64-b3b9-42fb-c2cf-dc7dc60ad8ab"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:51: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version.\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import csv\n",
+ "import regex as re\n",
+ "import kenlm\n",
+ "from english_words import english_words_alpha_set\n",
+ "from nltk import word_tokenize\n",
+ "from math import log10\n",
+ "from pathlib import Path\n",
+ "import os\n",
+ "import numpy as np\n",
+ "\n",
+ "\n",
+ "KENLM_BUILD_PATH = Path(\"gdrive/My Drive/gonito/kenlm/build\")\n",
+ "KENLM_LMPLZ_PATH = KENLM_BUILD_PATH / \"bin\" / \"lmplz\"\n",
+ "KENLM_BUILD_BINARY_PATH = KENLM_BUILD_PATH / \"bin\" / \"build_binary\"\n",
+ "SUDO_PASSWORD = \"\"\n",
+ "PREDICTION = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'\n",
+ "\n",
+ "\n",
+ "def clean(text):\n",
+ " text = str(text).lower().replace(\"-\\\\n\", \"\").replace(\"\\\\n\", \" \")\n",
+ " return re.sub(r\"\\p{P}\", \"\", text)\n",
+ "\n",
+ "\n",
+ "def create_train_data():\n",
+ " data = pd.read_csv(\"gdrive/My Drive/gonito/train/in.tsv.xz\", sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=50000)\n",
+ " train_labels = pd.read_csv(\"gdrive/My Drive/gonito/train/expected.tsv\", sep=\"\\t\", error_bad_lines=False, header=None, quoting=csv.QUOTE_NONE, nrows=50000)\n",
+ "\n",
+ " train_data = data[[6, 7]]\n",
+ " train_data = pd.concat([train_data, train_labels], axis=1)\n",
+ "\n",
+ " return train_data[6] + train_data[0] + train_data[7]\n",
+ "\n",
+ "\n",
+ "def create_train_file(filename=\"gdrive/My Drive/gonito/train.txt\"):\n",
+ " with open(filename, \"w\") as f:\n",
+ " for line in create_train_data():\n",
+ " f.write(clean(line) + \"\\n\")\n",
+ " \n",
+ "\n",
+ "def train_model():\n",
+ " lmplz_command = f\"{KENLM_LMPLZ_PATH} -o 4 < train.txt > model.arpa\"\n",
+ " build_binary_command = f\"{KENLM_BUILD_BINARY_PATH} model.arpa model.binary\"\n",
+ " os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, lmplz_command))\n",
+ " os.system('echo %s|sudo -S %s' % (SUDO_PASSWORD, build_binary_command))\n",
+ " \n",
+ "\n",
+ "def softmax(x):\n",
+ " e_x = np.exp(x - np.max(x))\n",
+ " return e_x / e_x.sum(axis=0)\n",
+ "\n",
+ "def predict(model, before, after):\n",
+ " best_scores = []\n",
+ " for word in english_words_alpha_set:\n",
+ " text = ' '.join([before, word, after])\n",
+ " text_score = model.score(text, bos=False, eos=False)\n",
+ " if len(best_scores) < 12:\n",
+ " best_scores.append((word, text_score))\n",
+ " else:\n",
+ " worst_score = None\n",
+ " for score in best_scores:\n",
+ " if not worst_score:\n",
+ " worst_score = score\n",
+ " else:\n",
+ " if worst_score[1] > score[1]:\n",
+ " worst_score = score\n",
+ " if worst_score[1] < text_score:\n",
+ " best_scores.remove(worst_score)\n",
+ " best_scores.append((word, text_score))\n",
+ " probs = sorted(best_scores, key=lambda tup: tup[1], reverse=True)\n",
+ " pred_str = ''\n",
+ " for word, prob in probs:\n",
+ " pred_str += f'{word}:{prob} '\n",
+ " pred_str += f':{log10(0.99)}'\n",
+ " return pred_str\n",
+ "\n",
+ "def make_prediction(model, path, result_path):\n",
+ " data = pd.read_csv(path, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
+ " with open(result_path, 'w', encoding='utf-8') as file_out:\n",
+ " for _, row in data.iterrows():\n",
+ " before, after = word_tokenize(clean(str(row[6]))), word_tokenize(clean(str(row[7])))\n",
+ " if len(before) < 2 or len(after) < 2:\n",
+ " pred = PREDICTION\n",
+ " else:\n",
+ " pred = predict(model, before[-1], after[0])\n",
+ " file_out.write(pred + '\\n')\n",
+ "\n",
+ "\n",
+ "create_train_file()\n",
+ "train_model()\n",
+ "model = kenlm.Model('gdrive/My Drive/gonito/model.binary')\n",
+ "make_prediction(model, \"gdrive/My Drive/gonito/dev-0/in.tsv.xz\", \"gdrive/My Drive/gonito/dev-0/out.tsv\")\n",
+ "make_prediction(model, \"gdrive/My Drive/gonito/test-A/in.tsv.xz\", \"gdrive/My Drive/gonito/test-A/out.tsv\")"
+ ]
+ }
+ ]
+}
\ No newline at end of file
diff --git a/kenlm.py b/kenlm.py
new file mode 100644
index 0000000..00aa7e7
--- /dev/null
+++ b/kenlm.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+"""kenlm.ipynb
+
+Automatically generated by Colaboratory.
+
+Original file is located at
+ https://colab.research.google.com/drive/1ov9aRonhHahzGcs1BIMjVHEldjHg4yTs
+"""
+
+from google.colab import drive
+drive.mount('/content/gdrive')
+
+# Dependencies (install before running; `!pip` is notebook-only syntax and a SyntaxError in a .py file):
+#   pip install https://github.com/kpu/kenlm/archive/master.zip
+#   pip install english_words
+
+import nltk
+nltk.download("punkt")
+
+# NOTE: a stray notebook cell used to run the lmplz/build_binary commands right here.
+# It was mis-indented (IndentationError) and referenced KENLM_LMPLZ_PATH, SUDO_PASSWORD
+# and `os` before they were defined/imported below. The identical steps are performed by
+# train_model(), which is called at the end of this script.
+
+import pandas as pd
+import csv
+import regex as re
+import kenlm
+from english_words import english_words_alpha_set
+from nltk import word_tokenize
+from math import log10
+from pathlib import Path
+import os
+import numpy as np
+
+
+KENLM_BUILD_PATH = Path("gdrive/My Drive/gonito/kenlm/build")  # KenLM build directory, relative to the working directory
+KENLM_LMPLZ_PATH = KENLM_BUILD_PATH / "bin" / "lmplz"  # executable train_model() runs to produce model.arpa
+KENLM_BUILD_BINARY_PATH = KENLM_BUILD_PATH / "bin" / "build_binary"  # executable train_model() runs to produce model.binary
+SUDO_PASSWORD = ""  # piped to `sudo -S` by train_model(); keep real credentials out of version control
+PREDICTION = 'the:0.03 be:0.03 to:0.03 of:0.025 and:0.025 a:0.025 in:0.020 that:0.020 have:0.015 I:0.010 it:0.010 for:0.010 not:0.010 on:0.010 with:0.010 he:0.010 as:0.010 you:0.010 do:0.010 at:0.010 :0.77'  # fallback line written by make_prediction() when a context has fewer than two tokens
+
+
+def clean(text):
+    """Lower-case ``text``, delete literal "-\\n" markers, turn literal "\\n" markers into spaces, then strip Unicode punctuation."""
+    return re.sub(r"\p{P}", "", str(text).lower().replace("-\\n", "").replace("\\n", " "))
+
+
+def create_train_data():
+    """Return a Series of training lines: left context, gap word and right context joined by single spaces."""
+    read_opts = dict(sep="\t", on_bad_lines="warn", header=None, quoting=csv.QUOTE_NONE, nrows=50000)
+    contexts = pd.read_csv("gdrive/My Drive/gonito/train/in.tsv.xz", **read_opts)[[6, 7]]
+    gap_words = pd.read_csv("gdrive/My Drive/gonito/train/expected.tsv", **read_opts)[0]
+    # on_bad_lines="warn" still drops malformed rows; it replaces the deprecated error_bad_lines=False (pandas >= 1.3).
+    # The pieces must be space-separated: plain concatenation glued the gap word onto its neighbours.
+    return contexts[6] + " " + gap_words + " " + contexts[7]
+
+
+def create_train_file(filename="gdrive/My Drive/gonito/train.txt"):
+    """Write the cleaned training lines to ``filename``, one per line, as UTF-8 regardless of the locale default."""
+    with open(filename, "w", encoding="utf-8") as f:
+        f.writelines(clean(line) + "\n" for line in create_train_data())
+
+
+def train_model(work_dir="gdrive/My Drive/gonito"):
+    """Estimate a 4-gram ARPA model from <work_dir>/train.txt with lmplz, then binarize it to <work_dir>/model.binary."""
+    import subprocess  # local import; argument lists bypass the shell, so paths containing spaces ("My Drive") survive
+    corpus, arpa, binary = (str(Path(work_dir) / name) for name in ("train.txt", "model.arpa", "model.binary"))
+    for cmd in ([str(KENLM_LMPLZ_PATH), "-o", "4", "--text", corpus, "--arpa", arpa], [str(KENLM_BUILD_BINARY_PATH), arpa, binary]):
+        subprocess.run(["sudo", "-S", *cmd], input=f"{SUDO_PASSWORD}\n", text=True, check=True)  # check=True: fail loudly, not silently
+
+def softmax(x):
+    """Softmax of ``x`` over axis 0; the maximum is subtracted first so the exponentials cannot overflow."""
+    return np.exp(x - np.max(x)) / np.sum(np.exp(x - np.max(x)), axis=0)
+
+def predict(model, before, after):
+    """Rank every dictionary word as a filler between ``before`` and ``after``.
+
+    Each candidate is scored with ``model.score("<before> <word> <after>")`` (no
+    sentence-boundary tokens) and the 12 best-scoring words are kept.
+
+    Args:
+        model: object exposing ``score(text, bos=..., eos=...)``; a ``kenlm.Model``
+            in this script.
+        before: the word immediately left of the gap.
+        after: the word immediately right of the gap.
+
+    Returns:
+        ``"word:score word:score ... :<log10(0.99)>"`` with the candidates in
+        descending score order. NOTE(review): the numbers are the raw values
+        returned by ``model.score`` (presumably log10 probabilities), not
+        normalised probabilities - confirm the evaluator accepts that.
+    """
+    import heapq  # local import keeps this block self-contained
+
+    scored = ((word, model.score(' '.join([before, word, after]), bos=False, eos=False))
+              for word in english_words_alpha_set)
+    top = heapq.nlargest(12, scored, key=lambda pair: pair[1])  # replaces a manual worst-of-12 rescan per word
+    return ''.join(f'{word}:{score} ' for word, score in top) + f':{log10(0.99)}'
+
+def make_prediction(model, path, result_path):
+    """Write one prediction line to ``result_path`` for every row of the tab-separated file at ``path``."""
+    frame = pd.read_csv(path, sep='\t', header=None, quoting=csv.QUOTE_NONE)
+    with open(result_path, 'w', encoding='utf-8') as sink:
+        for _, record in frame.iterrows():
+            left_tokens = word_tokenize(clean(str(record[6])))
+            right_tokens = word_tokenize(clean(str(record[7])))
+            # Fall back to the fixed distribution unless both contexts hold at least two tokens.
+            enough = len(left_tokens) >= 2 and len(right_tokens) >= 2
+            sink.write((predict(model, left_tokens[-1], right_tokens[0]) if enough else PREDICTION) + '\n')
+
+
+create_train_file()
+train_model()
+model = kenlm.Model('gdrive/My Drive/gonito/model.binary')
+for split_name in ("dev-0", "test-A"):  # one prediction file per split directory
+    make_prediction(model, f"gdrive/My Drive/gonito/{split_name}/in.tsv.xz", f"gdrive/My Drive/gonito/{split_name}/out.tsv")
\ No newline at end of file
diff --git a/n-gram.py b/n-gram.py
new file mode 100644
index 0000000..6e19890
--- /dev/null
+++ b/n-gram.py
@@ -0,0 +1,78 @@
+import pandas as pd
+import csv
+import regex as re
+from nltk import bigrams, word_tokenize
+from collections import Counter, defaultdict
+import string
+import unicodedata
+
+DEFAULT_PREDICTION = "the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1"
+
+# on_bad_lines="warn" still drops malformed rows; it replaces the deprecated error_bad_lines=False (pandas >= 1.3).
+data = pd.read_csv("train/in.tsv.xz", sep="\t", on_bad_lines="warn", header=None, quoting=csv.QUOTE_NONE)
+train_labels = pd.read_csv("train/expected.tsv", sep="\t", on_bad_lines="warn", header=None, quoting=csv.QUOTE_NONE)
+
+# Column 6 precedes the expected word (column 0 of expected.tsv) and column 7 follows it; join them with spaces so the word is not fused with its neighbours.
+train_data = pd.concat([data[[6, 7]], train_labels], axis=1)
+train_data["final"] = train_data[6] + " " + train_data[0] + " " + train_data[7]
+
+model = defaultdict(lambda: defaultdict(lambda: 0))
+
+
+def clean(text):
+    """Lower-case ``text``, delete literal "-\\n" markers, turn literal "\\n" markers into spaces, then strip Unicode punctuation."""
+    return re.sub(r"\p{P}", "", str(text).lower().replace("-\\n", "").replace("\\n", " "))
+
+# Count adjacent word pairs; falsy entries (the padding added at both ends) are ignored.
+for text in train_data["final"]:
+    for first, second in bigrams(word_tokenize(clean(text)), pad_left=True, pad_right=True):
+        if first and second:
+            model[first][second] += 1
+for followers in model.values():  # counts -> relative frequencies of the following word
+    row_total = float(sum(followers.values()))
+    for second in followers:
+        followers[second] /= row_total
+
+
+def predict(word):
+    """Return the five most likely successors of ``word`` as a prediction line.
+
+    The line has the form ``"w1:p1 ... w5:p5 :rest"`` where ``rest`` is the
+    probability mass not covered by the listed words, floored at 0.01.
+    ``DEFAULT_PREDICTION`` is returned when ``word`` has no recorded successors.
+    """
+    # .get() instead of model[word]: indexing the defaultdict would insert an
+    # empty entry for every unseen word and grow the model during prediction.
+    followers = model.get(word)
+    if not followers:
+        return DEFAULT_PREDICTION
+
+    best = Counter(followers).most_common(5)
+    covered = sum(prob for _, prob in best)
+    if not covered:
+        return DEFAULT_PREDICTION
+
+    listed = "".join(f"{successor}:{prob} " for successor, prob in best)
+    return listed + f":{max(1 - covered, 0.01)}"
+
+
+def write_predictions(in_path, out_path):
+    """Write one prediction line to ``out_path`` for each row of the TSV file at ``in_path``.
+
+    The prediction is based on the last token of column 6; rows whose column 6
+    has fewer than three tokens get ``DEFAULT_PREDICTION``.
+    """
+    # on_bad_lines="warn" still drops malformed rows; it replaces the deprecated error_bad_lines=False.
+    # NOTE(review): a dropped row makes the output shorter than the input, shifting
+    # every later prediction by one line - confirm the input files are clean.
+    rows = pd.read_csv(in_path, sep="\t", on_bad_lines="warn", header=None, quoting=csv.QUOTE_NONE)
+    with open(out_path, "w", encoding="UTF-8") as file:
+        for context in rows[6]:
+            words = word_tokenize(clean(context))
+            prediction = DEFAULT_PREDICTION if len(words) < 3 else predict(words[-1])
+            file.write(prediction + "\n")
+
+
+# The dev-0 and test-A splits are processed identically, so they share one code path.
+for split in ("dev-0", "test-A"):
+    write_predictions(f"{split}/in.tsv.xz", f"{split}/out.tsv")
diff --git a/neural_network.ipynb b/neural_network.ipynb
new file mode 100644
index 0000000..3600f89
--- /dev/null
+++ b/neural_network.ipynb
@@ -0,0 +1,5298 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "neural_network.ipynb",
+ "provenance": [],
+ "collapsed_sections": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "GPU",
+ "gpuClass": "standard"
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ECxQCLFdh2dg",
+ "outputId": "8402ab56-979b-4a4e-c331-d905cabccd23"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Mounted at /content/gdrive\n"
+ ]
+ }
+ ],
+ "source": [
+ "from google.colab import drive\n",
+ "drive.mount('/content/gdrive')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "root_path = 'gdrive/My Drive/gonito/'"
+ ],
+ "metadata": {
+ "id": "uWXe1O7FjKVZ"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import torch\n",
+ "torch.cuda.is_available()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "Lo2e9lxajQGy",
+ "outputId": "883c9239-c4fb-4607-85ef-6a0cbd6dad3e"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 4
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import torch\n",
+ "import csv\n",
+ "torch.cuda.empty_cache()\n",
+ "from torch.utils.data import DataLoader\n",
+ "import pandas as pd\n",
+ "from os.path import exists\n",
+ "from torchtext.vocab import build_vocab_from_iterator\n",
+ "import itertools\n",
+ "import regex as re\n",
+ "from csv import QUOTE_NONE\n",
+ "from torch import nn\n",
+ "\n",
+ "ENCODING = \"utf-8\"\n",
+ "\n",
+ "REP = re.compile(r\"[{}\\[\\]\\&%^$*#\\(\\)@\\t\\n0123456789]+\")\n",
+ "REM = re.compile(r\"'s|[\\-]\\\\n|\\-\\\\n|\\p{P}\")\n",
+ "\n",
+ "def read_csv(fname):\n",
+ " return pd.read_csv(fname, sep=\"\\t\", on_bad_lines='skip', header=None, quoting=QUOTE_NONE, encoding=ENCODING)\n",
+ "\n",
+ "def clean_text(text):\n",
+ "    \"\"\"Lower-case ``text``, expand common contractions, then drop punctuation (REM) and blank out digits/symbols (REP).\"\"\"\n",
+ "    res = str(text).lower().strip()\n",
+ "    res = res.replace(\"’\", \"'\")\n",
+ "    # Expand contractions first: REM deletes every apostrophe, so replacing them after REM could never match.\n",
+ "    res = res.replace(\"won't\", \"will not\").replace(\"n't\", \" not\")\n",
+ "    res = res.replace(\"'ll\", \" will\").replace(\"'ve\", \" have\").replace(\"'m\", \" am\")\n",
+ "    # Literal backslash-n markers: a hyphen before one re-joins the word, a bare one becomes a space\n",
+ "    # (REM alone removed only the backslash and left a stray \"n\" inside the token).\n",
+ "    res = res.replace(\"-\\\\n\", \"\").replace(\"\\\\n\", \" \")\n",
+ "    # NOTE: the vocabulary depends on this cleaning, so delete a cached model1.bin after changing it.\n",
+ "    res = REM.sub(\"\", res)\n",
+ "    return REP.sub(\" \", res)\n",
+ "\n",
+ "def get_words_from_line(line, specials = True):\n",
+ " line = line.rstrip()\n",
+ " if specials:\n",
+ " yield ''\n",
+ " for m in re.finditer(r'[\\p{L}0-9\\*]+|\\p{P}+', line):\n",
+ " yield m.group(0).lower()\n",
+ " if specials:\n",
+ " yield ''\n",
+ "\n",
+ "\n",
+ "def get_word_lines_from_data(d):\n",
+ " for line in d:\n",
+ " yield get_words_from_line(line)\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "class Bigrams(torch.utils.data.IterableDataset):\n",
+ " def __init__(self, data, vocabulary_size):\n",
+ " self.vocab = build_vocab_from_iterator(\n",
+ " get_word_lines_from_data(data),\n",
+ " max_tokens = vocabulary_size,\n",
+ " specials = [''])\n",
+ " self.vocab.set_default_index(self.vocab[''])\n",
+ " self.vocabulary_size = vocabulary_size\n",
+ " self.data = data\n",
+ "\n",
+ " @staticmethod\n",
+ " def look_ahead_iterator(gen):\n",
+ " w1 = None\n",
+ " for item in gen:\n",
+ " if w1 is not None:\n",
+ " yield (w1, item)\n",
+ " w1 = item\n",
+ "\n",
+ " def __iter__(self):\n",
+ " return self.look_ahead_iterator(\n",
+ " (self.vocab[t] for t in itertools.chain.from_iterable(get_word_lines_from_data(self.data))))\n",
+ "\n",
+ "class SimpleBigramNeuralLanguageModel(torch.nn.Module):\n",
+ "    \"\"\"Embedding -> linear -> softmax: maps token indices to a probability distribution over the vocabulary.\"\"\"\n",
+ "    def __init__(self, vocabulary_size, embedding_size):\n",
+ "        super(SimpleBigramNeuralLanguageModel, self).__init__()\n",
+ "        self.model = nn.Sequential(\n",
+ "            nn.Embedding(vocabulary_size, embedding_size),\n",
+ "            nn.Linear(embedding_size, vocabulary_size),\n",
+ "            nn.Softmax(dim=-1))  # normalise over the vocabulary axis; leaving dim implicit is deprecated\n",
+ "\n",
+ "    def forward(self, x):\n",
+ "        return self.model(x)\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "data = read_csv(\"gdrive/My Drive/gonito/train/in.tsv.xz\")\n",
+ "train_words = read_csv(\"gdrive/My Drive/gonito/train/expected.tsv\")\n",
+ "\n",
+ "train_data = data[[6, 7]]\n",
+ "train_data = pd.concat([train_data, train_words], axis=1)\n",
+ "train_data = train_data[6] + train_data[0] + train_data[7]\n",
+ "train_data = train_data.apply(clean_text)\n",
+ "\n",
+ "vocab_size = 30000\n",
+ "embed_size = 150\n",
+ "\n",
+ "train_dataset = Bigrams(train_data, vocab_size)\n",
+ "\n",
+ "\n",
+ "\n",
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
+ "model = SimpleBigramNeuralLanguageModel(vocab_size, embed_size).to(device)\n",
+ "print(device)\n",
+ "if(not exists('model1.bin')):\n",
+ " data = DataLoader(train_dataset, batch_size=8000)\n",
+ " optimizer = torch.optim.Adam(model.parameters())\n",
+ " criterion = torch.nn.NLLLoss()\n",
+ "\n",
+ " model.train()\n",
+ " step = 0\n",
+ " for i in range(2):\n",
+ " print(f\"EPOCH {i}=========================\")\n",
+ " for x, y in data:\n",
+ " x = x.to(device)\n",
+ " y = y.to(device)\n",
+ " optimizer.zero_grad()\n",
+ " ypredicted = model(x)\n",
+ " loss = criterion(torch.log(ypredicted), y)\n",
+ " if step % 100 == 0:\n",
+ " print(step, loss)\n",
+ " step += 1\n",
+ " loss.backward()\n",
+ " optimizer.step()\n",
+ "\n",
+ " torch.save(model.state_dict(), 'model1.bin')\n",
+ "else:\n",
+ " print(\"Loading model1\")\n",
+ " model.load_state_dict(torch.load('model1.bin'))\n",
+ "\n",
+ "\n",
+ "\n",
+ "vocab = train_dataset.vocab\n",
+ "\n",
+ "def predict(tokens):\n",
+ " ixs = torch.tensor(vocab.forward(tokens)).to(device)\n",
+ " out = model(ixs)\n",
+ " top = torch.topk(out[0], 8)\n",
+ " top_indices = top.indices.tolist()\n",
+ " top_probs = top.values.tolist()\n",
+ " top_words = vocab.lookup_tokens(top_indices)\n",
+ " result = \"\"\n",
+ " for word, prob in list(zip(top_words, top_probs)):\n",
+ " result += f\"{word}:{prob} \"\n",
+ " # result += f':0.01'\n",
+ " return result\n",
+ "\n",
+ "DEFAULT_PREDICTION = \"a:0.2 the:0.2 to:0.2 of:0.1 and:0.1 of:0.1 :0.1\"\n",
+ "\n",
+ "def predict_file(result_path, data):\n",
+ "    \"\"\"Write one prediction line to ``result_path`` for each row of ``data``, based on the row's last token.\"\"\"\n",
+ "    with open(result_path, \"w+\", encoding=\"UTF-8\") as f:\n",
+ "        for row in data:\n",
+ "            last_token = None\n",
+ "            for last_token in get_words_from_line(clean_text(str(row)), False):\n",
+ "                pass  # exhaust the generator; only the final token is needed\n",
+ "            # The old check measured the one-element wrapper list, so it never fired and a row\n",
+ "            # without tokens reached predict() as [None]; test the token itself instead.\n",
+ "            if last_token is None:\n",
+ "                result = DEFAULT_PREDICTION\n",
+ "            else:\n",
+ "                result = predict([last_token]).strip()\n",
+ "            # No per-row print() calls: echoing every token and prediction flooded the\n",
+ "            # notebook output and bloated the saved .ipynb.\n",
+ "            f.write(result + \"\\n\")\n",
+ "\n",
+ "dev_data = pd.read_csv(\"gdrive/My Drive/gonito/dev-0/in.tsv.xz\", sep='\\t', header=None, quoting=csv.QUOTE_NONE)[6]\n",
+ "dev_data = dev_data.apply(clean_text)\n",
+ "predict_file(\"gdrive/My Drive/gonito/dev-0/out.tsv\", dev_data)\n",
+ "\n",
+ "test_data = pd.read_csv(\"gdrive/My Drive/gonito/test-A/in.tsv.xz\", sep='\\t', header=None, quoting=csv.QUOTE_NONE)[6]\n",
+ "test_data = test_data.apply(clean_text)\n",
+ "predict_file(\"gdrive/My Drive/gonito/test-A/out.tsv\", test_data)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "GDsznRxrjNSi",
+ "outputId": "036e6f73-c657-4eaa-b13c-8af613572de7"
+ },
+ "execution_count": null,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "cuda\n",
+ "EPOCH 0=========================\n",
+ "0 tensor(10.4703, device='cuda:0', grad_fn=)\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/container.py:141: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
+ " input = module(input)\n"
+ ]
+ },
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\u001b[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.\u001b[0m\n",
+ "['employed']\n",
+ "in:0.18340028822422028 by:0.09708617627620697 :0.07454902678728104 to:0.057051483541727066 as:0.04201669245958328 at:0.03614005073904991 and:0.033467430621385574 on:0.02761789597570896\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['man']\n",
+ ":0.1252487599849701 who:0.11225556582212448 of:0.04771805554628372 and:0.043321795761585236 in:0.04039822518825531 is:0.02757207676768303 to:0.02686801366508007 was:0.02203497476875782\n",
+ "['acre']\n",
+ ":0.1225854679942131 of:0.0909235030412674 and:0.07578513026237488 in:0.04828302934765816 the:0.02837495319545269 on:0.025319421663880348 or:0.021057847887277603 for:0.020289437845349312\n",
+ "['muchnengaged']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['for']\n",
+ "the:0.2248937338590622 :0.1491411030292511 a:0.059844110161066055 this:0.014163156040012836 his:0.013331228867173195 it:0.011568314395844936 their:0.010646478272974491 tho:0.010418311692774296\n",
+ "['would']\n",
+ "be:0.153887540102005 :0.13322113454341888 have:0.08934537321329117 not:0.08149827271699905 make:0.017474716529250145 do:0.013833458535373211 bo:0.010931842029094696 like:0.010064331814646721\n",
+ "['thesenkilters']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['f']\n",
+ ":0.3357486128807068 the:0.06402868032455444 a:0.026820175349712372 r:0.02347833849489689 m:0.01216911617666483 c:0.010706819593906403 w:0.01030018925666809 and:0.009173448197543621\n",
+ "['different']\n",
+ ":0.22379206120967865 from:0.04116220772266388 parts:0.03810926526784897 kinds:0.018899263814091682 and:0.01785585843026638 states:0.013539664447307587 times:0.01226099207997322 in:0.010963457636535168\n",
+ "['hell']\n",
+ ":0.15849441289901733 gate:0.07461563497781754 and:0.05356302484869957 of:0.032700151205062866 be:0.019207622855901718 to:0.018932465463876724 at:0.016958946362137794 he:0.016710534691810608\n",
+ "['thenenumerator']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['this']\n",
+ ":0.17093975841999054 is:0.04876431077718735 city:0.023564649745821953 country:0.01899288035929203 time:0.015251473523676395 state:0.015107371844351292 was:0.014631252735853195 act:0.014048588462173939\n",
+ "['bayhaancnmiss']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['theso']\n",
+ ":0.19692622125148773 men:0.02251005358994007 are:0.014677719213068485 things:0.009723160415887833 days:0.007474579848349094 states:0.00688559003174305 people:0.006728324573487043 were:0.005231296177953482\n",
+ "['testimony']\n",
+ "of:0.18281643092632294 whereof:0.06235404312610626 to:0.055939286947250366 :0.055649884045124054 in:0.037573203444480896 that:0.03620908036828041 is:0.026429975405335426 and:0.02331814356148243\n",
+ "['again']\n",
+ ":0.15340158343315125 and:0.07057935744524002 the:0.045588500797748566 to:0.03883609175682068 in:0.034155577421188354 at:0.02189015969634056 i:0.018609723076224327 as:0.0158885158598423\n",
+ "['reach']\n",
+ "the:0.21995700895786285 :0.11747518926858902 of:0.11579948663711548 a:0.03944582864642143 and:0.020929697901010513 it:0.017987970262765884 his:0.01599469780921936 them:0.012410901486873627\n",
+ "['of']\n",
+ "the:0.2457614541053772 :0.1656663864850998 a:0.030220337212085724 this:0.019167711958289146 his:0.01403335202485323 tho:0.011820078827440739 said:0.009894217364490032 their:0.008327881805598736\n",
+ "['further']\n",
+ ":0.17722594738006592 ordered:0.04561755433678627 that:0.035907018929719925 enacted:0.03360915929079056 resolved:0.024442153051495552 than:0.018173767253756523 notified:0.01696488820016384 particulars:0.015963932499289513\n",
+ "['duncourse']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['he']\n",
+ ":0.14773434400558472 was:0.08953415602445602 had:0.06403657048940659 is:0.04345237836241722 has:0.03252505883574486 would:0.023517634719610214 will:0.01741141825914383 could:0.016593124717473984\n",
+ "['ton']\n",
+ ":0.13487884402275085 of:0.04617968201637268 and:0.041141998022794724 the:0.036419421434402466 inclusive:0.02917175181210041 per:0.02484716661274433 a:0.019901413470506668 to:0.017307564616203308\n",
+ "['barges']\n",
+ ":0.08957834541797638 and:0.08245287090539932 of:0.051981255412101746 to:0.04459141567349434 in:0.03906521573662758 the:0.02293824590742588 were:0.018135221675038338 for:0.01714289002120495\n",
+ "['thenrepairs']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['plat']\n",
+ "of:0.13270790874958038 :0.10824776440858841 recorded:0.10697674751281738 book:0.0745907723903656 thereof:0.06707336753606796 and:0.05016869679093361 on:0.023678768426179886 nform:0.020634010434150696\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['withn']\n",
+ ":0.1201028972864151 the:0.05784958600997925 a:0.03030189499258995 and:0.02290545217692852 per:0.01597488485276699 in:0.014931217767298222 acres:0.010343622416257858 of:0.00986756756901741\n",
+ "['advised']\n",
+ "to:0.08359719812870026 by:0.08131337910890579 :0.07083816081285477 that:0.06859848648309708 me:0.062309470027685165 him:0.06224164366722107 of:0.05205093324184418 the:0.05154396593570709\n",
+ "['evidences']\n",
+ "of:0.716045081615448 :0.0453171543776989 that:0.03211292624473572 ofnthe:0.02372738905251026 thereof:0.00914797279983759 and:0.008252683095633984 in:0.008041009306907654 the:0.006530883722007275\n",
+ "['may']\n",
+ "be:0.2874893546104431 :0.13178488612174988 have:0.0332358255982399 not:0.027478745207190514 bo:0.014187678694725037 he:0.012491154484450817 a:0.009335900656878948 deem:0.009177502244710922\n",
+ "['ho']\n",
+ ":0.20440232753753662 was:0.0715789869427681 had:0.05162014812231064 is:0.030420489609241486 has:0.02460101991891861 would:0.022303633391857147 will:0.01345206331461668 could:0.013116495683789253\n",
+ "['itn']\n",
+ ":0.18827538192272186 a:0.025874843820929527 the:0.02125515230000019 i:0.018875891342759132 and:0.016849452629685402 in:0.012834791094064713 is:0.01144351251423359 was:0.009813395328819752\n",
+ "['employ']\n",
+ "of:0.1568000316619873 :0.1102624386548996 the:0.07192815840244293 a:0.05583103746175766 nment:0.032724637538194656 and:0.014253983274102211 in:0.013584919273853302 it:0.011354354210197926\n",
+ "['then']\n",
+ ":0.17606347799301147 the:0.05414457619190216 th:0.02774103544652462 he:0.023316092789173126 a:0.019771868363022804 to:0.01886158250272274 i:0.018397148698568344 it:0.014618783257901669\n",
+ "['is']\n",
+ ":0.14478172361850739 a:0.07943203300237656 the:0.05567781254649162 not:0.04187404736876488 to:0.028269024565815926 in:0.017350036650896072 no:0.016243362799286842 now:0.014129472896456718\n",
+ "['peace']\n",
+ "and:0.14449922740459442 :0.12508276104927063 of:0.056388113647699356 in:0.03776905685663223 with:0.02659822814166546 the:0.025074880570173264 to:0.02169540897011757 for:0.01890190690755844\n",
+ "['mttiiin']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['been']\n",
+ ":0.18826988339424133 made:0.03294280916452408 a:0.03194691240787506 in:0.021169094368815422 the:0.017314286902546883 so:0.008444827049970627 given:0.0072046369314193726 taken:0.006368295289576054\n",
+ "['of']\n",
+ "the:0.2457614541053772 :0.1656663864850998 a:0.030220337212085724 this:0.019167711958289146 his:0.01403335202485323 tho:0.011820078827440739 said:0.009894217364490032 their:0.008327881805598736\n",
+ "['expense']\n",
+ "of:0.36909469962120056 :0.08672443777322769 to:0.05099482461810112 and:0.042982835322618484 in:0.021987490355968475 for:0.021453257650136948 the:0.019953222945332527 is:0.0162285715341568\n",
+ "['subterran']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['landn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['by']\n",
+ "the:0.23543408513069153 :0.15149088203907013 a:0.05592188611626625 said:0.01404251903295517 tho:0.01377563364803791 law:0.012522521428763866 his:0.011869429610669613 this:0.01121734082698822\n",
+ "['in']\n",
+ "the:0.2225157767534256 :0.15360069274902344 a:0.04575375095009804 this:0.025838620960712433 his:0.016054201871156693 which:0.01124879065901041 tho:0.011150272563099861 their:0.010169503279030323\n",
+ "['will']\n",
+ "be:0.21885207295417786 :0.13016089797019958 not:0.055081117898225784 have:0.021862562745809555 make:0.012456201016902924 bo:0.01236207690089941 do:0.010900363326072693 give:0.009462139569222927\n",
+ "['action']\n",
+ "of:0.17109675705432892 :0.07247553765773773 is:0.054370518773794174 and:0.05129477381706238 in:0.051144812256097794 on:0.038074787706136703 or:0.03719569370150566 was:0.02310146763920784\n",
+ "['are']\n",
+ ":0.17221172153949738 not:0.04035910218954086 the:0.027153803035616875 in:0.02351563051342964 to:0.018598660826683044 now:0.01452269684523344 a:0.011636641807854176 hereby:0.009858710691332817\n",
+ "['or']\n",
+ ":0.205952450633049 the:0.040565330535173416 a:0.016952330246567726 in:0.01596311666071415 any:0.015423777513206005 to:0.014240958727896214 other:0.014171672984957695 two:0.013151555322110653\n",
+ "['that']\n",
+ "the:0.1460985541343689 :0.117131806910038 he:0.04326099529862404 it:0.03482627123594284 they:0.021876059472560883 is:0.019501497969031334 a:0.019411209970712662 there:0.015709606930613518\n",
+ "['decatur']\n",
+ ":0.15276333689689636 and:0.07046772539615631 county:0.018545925617218018 was:0.015514259226620197 to:0.014468901790678501 is:0.011005360633134842 who:0.00980814266949892 of:0.008667166344821453\n",
+ "['tractionn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['vain']\n",
+ ":0.1348562389612198 to:0.10228743404150009 and:0.049475736916065216 for:0.042661309242248535 the:0.027841171249747276 that:0.024004092440009117 in:0.018741058185696602 endeavor:0.017718017101287842\n",
+ "['which']\n",
+ ":0.12084392458200455 the:0.06722808629274368 is:0.056981410831213 he:0.047483354806900024 was:0.02967275120317936 they:0.02605881169438362 it:0.0243590846657753 has:0.02149120159447193\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['so']\n",
+ ":0.16578255593776703 that:0.06845669448375702 much:0.05655328184366226 far:0.03643251582980156 long:0.02599366568028927 many:0.025185538455843925 as:0.022826656699180603 the:0.01513101439923048\n",
+ "['sulllclentn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['prescription']\n",
+ "is:0.09799686819314957 :0.09258561581373215 and:0.06643876433372498 of:0.057745084166526794 for:0.0504082515835762 was:0.027953259646892548 to:0.021979281678795815 furnished:0.021604644134640694\n",
+ "['about']\n",
+ ":0.15301167964935303 the:0.13289107382297516 a:0.05426119267940521 to:0.03148178383708 it:0.02428416907787323 two:0.016558997333049774 one:0.013876081444323063 three:0.01352003589272499\n",
+ "['n']\n",
+ ":0.2304985672235489 n:0.07303886860609055 y:0.030507313087582588 and:0.01975770853459835 c:0.019707409664988518 the:0.018424810841679573 w:0.017016446217894554 e:0.01662597805261612\n",
+ "['finally']\n",
+ ":0.16785986721515656 the:0.056882165372371674 he:0.027511363849043846 decided:0.018065981566905975 i:0.016650646924972534 a:0.015969231724739075 to:0.015786301344633102 got:0.01192978210747242\n",
+ "['contented']\n",
+ "with:0.12644736468791962 and:0.11518226563930511 :0.10782186686992645 in:0.04543733969330788 the:0.029721848666667938 to:0.028458675369620323 himself:0.023678159341216087 themselves:0.022108634933829308\n",
+ "['ton']\n",
+ ":0.13487884402275085 of:0.04617968201637268 and:0.041141998022794724 the:0.036419421434402466 inclusive:0.02917175181210041 per:0.02484716661274433 a:0.019901413470506668 to:0.017307564616203308\n",
+ "['quigiej']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['mnsundays']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['similar']\n",
+ ":0.17836304008960724 to:0.17380143702030182 cases:0.011765624396502972 character:0.010519596748054028 circumstances:0.010392246767878532 manner:0.008522173389792442 and:0.008275561966001987 nature:0.0076185353100299835\n",
+ "['said']\n",
+ ":0.1593175232410431 mortgage:0.05206912010908127 that:0.03717360645532608 county:0.029243318364024162 to:0.027466053143143654 he:0.019849851727485657 the:0.01763606071472168 court:0.014598236419260502\n",
+ "['lastnfairly']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['pepper']\n",
+ ":0.29400038719177246 and:0.17421139776706696 fish:0.023566890507936478 for:0.017975112423300743 or:0.013260542415082455 :0.011892473325133324 a:0.010433800518512726 the:0.009254380129277706\n",
+ "['is']\n",
+ ":0.14478172361850739 a:0.07943203300237656 the:0.05567781254649162 not:0.04187404736876488 to:0.028269024565815926 in:0.017350036650896072 no:0.016243362799286842 now:0.014129472896456718\n",
+ "['baa']\n",
+ "been:0.20547673106193542 :0.2021472454071045 a:0.034559380263090134 not:0.02000623196363449 to:0.015101452358067036 no:0.014268244616687298 the:0.01298484392464161 beea:0.010241138748824596\n",
+ "['boast']\n",
+ "of:0.4713343381881714 that:0.09632112085819244 :0.07245048135519028 and:0.020917708054184914 the:0.019986502826213837 to:0.014293057844042778 ofnthe:0.013465664349496365 in:0.01280994713306427\n",
+ "['ornsome']\n",
+ "other:0.2924415171146393 :0.15338388085365295 of:0.03273245692253113 one:0.030885839834809303 such:0.02907770499587059 the:0.019298627972602844 in:0.01272398792207241 a:0.006831368897110224\n",
+ "['socialistsn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['considerable']\n",
+ ":0.2001601904630661 distance:0.04652312770485878 number:0.029593581333756447 portion:0.023905429989099503 extent:0.021032564342021942 amount:0.020032186061143875 sum:0.015692351385951042 time:0.015287144109606743\n",
+ "['vn']\n",
+ ":0.14230304956436157 in:0.021454766392707825 the:0.02144487388432026 and:0.020002160221338272 to:0.018045274540781975 a:0.01615276001393795 n:0.015388541854918003 m:0.014946718700230122\n",
+ "['orphans']\n",
+ "court:0.19463521242141724 :0.12963558733463287 of:0.08818478137254715 and:0.07471950352191925 to:0.02052893117070198 in:0.019044985994696617 the:0.013242395594716072 courtnto:0.008674381300807\n",
+ "['ton']\n",
+ ":0.13487884402275085 of:0.04617968201637268 and:0.041141998022794724 the:0.036419421434402466 inclusive:0.02917175181210041 per:0.02484716661274433 a:0.019901413470506668 to:0.017307564616203308\n",
+ "['ut']\n",
+ ":0.1805303394794464 the:0.11023376137018204 a:0.032635729759931564 of:0.014200465753674507 tho:0.012322187423706055 in:0.011237742379307747 all:0.011080697178840637 that:0.010968870483338833\n",
+ "['for']\n",
+ "the:0.2248937338590622 :0.1491411030292511 a:0.059844110161066055 this:0.014163156040012836 his:0.013331228867173195 it:0.011568314395844936 their:0.010646478272974491 tho:0.010418311692774296\n",
+ "['me']\n",
+ ":0.1639685481786728 to:0.10376843065023422 and:0.047942470759153366 that:0.046799782663583755 in:0.03183149918913841 a:0.0275476835668087 i:0.026764782145619392 the:0.022275181487202644\n",
+ "['n']\n",
+ ":0.2304985672235489 n:0.07303886860609055 y:0.030507313087582588 and:0.01975770853459835 c:0.019707409664988518 the:0.018424810841679573 w:0.017016446217894554 e:0.01662597805261612\n",
+ "['sensatlin']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['of']\n",
+ "the:0.2457614541053772 :0.1656663864850998 a:0.030220337212085724 this:0.019167711958289146 his:0.01403335202485323 tho:0.011820078827440739 said:0.009894217364490032 their:0.008327881805598736\n",
+ "['bo']\n",
+ ":0.22734488546848297 a:0.02836987003684044 the:0.02119683474302292 made:0.016840048134326935 in:0.01235697977244854 no:0.008228198625147343 paid:0.007832511328160763 found:0.007096969988197088\n",
+ "['henmust']\n",
+ "be:0.15680639445781708 have:0.1065983921289444 :0.08229566365480423 make:0.0346418060362339 get:0.01745663583278656 not:0.01734565757215023 take:0.014780009165406227 say:0.012753983959555626\n",
+ "['in']\n",
+ "the:0.2225157767534256 :0.15360069274902344 a:0.04575375095009804 this:0.025838620960712433 his:0.016054201871156693 which:0.01124879065901041 tho:0.011150272563099861 their:0.010169503279030323\n",
+ "['lan']\n",
+ ":0.3306066393852234 nguage:0.23375831544399261 and:0.039719358086586 in:0.01737554371356964 i:0.016134483739733696 a:0.012394752353429794 to:0.010068115778267384 :0.008261092007160187\n",
+ "['englandnills']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['gonerat']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['salutarv']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['and']\n",
+ ":0.17935489118099213 the:0.06324105709791183 a:0.01719452068209648 in:0.01547625008970499 that:0.01295827142894268 to:0.012152859009802341 it:0.009857879020273685 i:0.007450021803379059\n",
+ "['distant']\n",
+ ":0.18569689989089966 from:0.06256778538227081 and:0.044124823063611984 in:0.02026318572461605 the:0.017159510403871536 feet:0.016964806243777275 to:0.015524206683039665 as:0.01432793214917183\n",
+ "['to']\n",
+ ":0.1648421436548233 the:0.12624020874500275 be:0.05039070546627045 a:0.02132132649421692 make:0.012324165552854538 do:0.01222158968448639 have:0.012199307791888714 his:0.00816959049552679\n",
+ "['ben']\n",
+ ":0.3822658061981201 a:0.020594149827957153 the:0.017652636393904686 of:0.008986832574009895 to:0.00862148217856884 and:0.008379093371331692 at:0.006962778512388468 on:0.006950080394744873\n",
+ "['cause']\n",
+ "of:0.2258577197790146 :0.07897095382213593 the:0.061442673206329346 to:0.04865115508437157 and:0.03545428439974785 for:0.031726475805044174 a:0.029496151953935623 it:0.017733043059706688\n",
+ "['ain']\n",
+ ":0.36265018582344055 in:0.03626257926225662 to:0.03326069191098213 the:0.03176010027527809 and:0.02541493996977806 a:0.025280524045228958 of:0.019695064052939415 at:0.011601264588534832\n",
+ "['consultationn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['not']\n",
+ ":0.15144947171211243 be:0.0474478118121624 only:0.0388217493891716 to:0.034419793635606766 a:0.02875548042356968 been:0.020789122208952904 the:0.01926942728459835 in:0.014720425941050053\n",
+ "['hn']\n",
+ ":0.39948591589927673 chatham:0.014391939155757427 the:0.014188987202942371 a:0.012618998065590858 in:0.012330169789493084 of:0.010796318762004375 and:0.010346564464271069 n:0.00967817660421133\n",
+ "['rivalry']\n",
+ "between:0.09951947629451752 in:0.0956057757139206 :0.06827601045370102 of:0.06659798324108124 with:0.04274709150195122 and:0.038787346333265305 is:0.028254086151719093 on:0.025708692148327827\n",
+ "['oxtondlng']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['timeoftheirndisbandment']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['said']\n",
+ ":0.1593175232410431 mortgage:0.05206912010908127 that:0.03717360645532608 county:0.029243318364024162 to:0.027466053143143654 he:0.019849851727485657 the:0.01763606071472168 court:0.014598236419260502\n",
+ "['made']\n",
+ ":0.1015426516532898 by:0.07200932502746582 in:0.06612687557935715 a:0.06336113810539246 to:0.057947542518377304 the:0.05160039663314819 and:0.03143705800175667 of:0.02788241021335125\n",
+ "['doubtful']\n",
+ "whether:0.15059790015220642 if:0.12911133468151093 :0.09127353876829147 to:0.022603003308176994 whethernthe:0.021511508151888847 and:0.018875613808631897 for:0.01826886646449566 as:0.018267281353473663\n",
+ "['ofn']\n",
+ ":0.10926195234060287 and:0.055518388748168945 the:0.05312678590416908 to:0.025936240330338478 per:0.025178812444210052 a:0.021900814026594162 in:0.018211573362350464 feet:0.015171929262578487\n",
+ "['probably']\n",
+ ":0.16313578188419342 be:0.07398518174886703 the:0.06690847128629684 a:0.031105272471904755 not:0.02879834920167923 have:0.023335441946983337 in:0.01651296205818653 to:0.01517055556178093\n",
+ "['savannah']\n",
+ ":0.15897585451602936 and:0.08499684929847717 the:0.02358519844710827 for:0.022426621988415718 florida:0.020633049309253693 in:0.018982360139489174 on:0.018451469019055367 ga:0.014527046121656895\n",
+ "['join']\n",
+ "the:0.19142349064350128 in:0.1507730782032013 :0.0976218730211258 with:0.043615054339170456 him:0.02391829714179039 us:0.02306033857166767 and:0.022882595658302307 her:0.022604113444685936\n",
+ "['different']\n",
+ ":0.22379206120967865 from:0.04116220772266388 parts:0.03810926526784897 kinds:0.018899263814091682 and:0.01785585843026638 states:0.013539664447307587 times:0.01226099207997322 in:0.010963457636535168\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['torsn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['enoughn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['none']\n",
+ "of:0.3183564841747284 :0.08367279917001724 in:0.02396748773753643 the:0.01837647706270218 and:0.016430392861366272 to:0.01493738405406475 more:0.014698643237352371 other:0.013850689865648746\n",
+ "['quimby']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['i']\n",
+ ":0.19254687428474426 have:0.051397904753685 am:0.03986556828022003 was:0.03864387422800064 had:0.029659055173397064 could:0.014713815413415432 will:0.013915982097387314 would:0.01353325042873621\n",
+ "['patrons']\n",
+ "of:0.24917373061180115 :0.09819620847702026 and:0.08005258440971375 in:0.023406242951750755 who:0.021632768213748932 to:0.019370002672076225 are:0.0182885080575943 the:0.017605045810341835\n",
+ "['u']\n",
+ ":0.278802752494812 s:0.07459220290184021 a:0.03555775806307793 the:0.015456290915608406 n:0.013914244249463081 and:0.013452518731355667 to:0.011983383446931839 m:0.011041088961064816\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['period']\n",
+ "of:0.34833791851997375 :0.08788375556468964 in:0.04378318786621094 the:0.03858410194516182 and:0.03443180397152901 to:0.01765896938741207 for:0.013331422582268715 is:0.008928538300096989\n",
+ "['in']\n",
+ "the:0.2225157767534256 :0.15360069274902344 a:0.04575375095009804 this:0.025838620960712433 his:0.016054201871156693 which:0.01124879065901041 tho:0.011150272563099861 their:0.010169503279030323\n",
+ "['them']\n",
+ ":0.1220034658908844 to:0.0824308842420578 in:0.04682813212275505 and:0.04125552996993065 the:0.02983928844332695 as:0.01791076920926571 a:0.017844490706920624 for:0.017421135678887367\n",
+ "['then']\n",
+ ":0.17606347799301147 the:0.05414457619190216 th:0.02774103544652462 he:0.023316092789173126 a:0.019771868363022804 to:0.01886158250272274 i:0.018397148698568344 it:0.014618783257901669\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['armed']\n",
+ ":0.18571661412715912 with:0.1591707468032837 and:0.07688059657812119 forces:0.025560399517416954 men:0.023661747574806213 in:0.02278321236371994 to:0.015095989219844341 the:0.01446873601526022\n",
+ "['in']\n",
+ "the:0.2225157767534256 :0.15360069274902344 a:0.04575375095009804 this:0.025838620960712433 his:0.016054201871156693 which:0.01124879065901041 tho:0.011150272563099861 their:0.010169503279030323\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['labora']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['intend']\n",
+ "to:0.6194930672645569 :0.07333464920520782 ned:0.023318199440836906 that:0.01795508712530136 the:0.013708957470953465 tonmake:0.0063772364519536495 and:0.005358982365578413 on:0.0050330692902207375\n",
+ "['esqnalsothat']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['newspapersn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['ton']\n",
+ ":0.13487884402275085 of:0.04617968201637268 and:0.041141998022794724 the:0.036419421434402466 inclusive:0.02917175181210041 per:0.02484716661274433 a:0.019901413470506668 to:0.017307564616203308\n",
+ "['notnwrithe']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['statun']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['a']\n",
+ ":0.24079222977161407 few:0.01626162976026535 large:0.011570313014090061 man:0.010044588707387447 good:0.010007260367274284 great:0.009656419977545738 very:0.00889244582504034 little:0.008056357502937317\n",
+ "['of']\n",
+ "the:0.2457614541053772 :0.1656663864850998 a:0.030220337212085724 this:0.019167711958289146 his:0.01403335202485323 tho:0.011820078827440739 said:0.009894217364490032 their:0.008327881805598736\n",
+ "['started']\n",
+ ":0.14558160305023193 to:0.10844799131155014 in:0.08500660955905914 for:0.07260371744632721 on:0.04054902866482735 out:0.039436932653188705 the:0.03718320280313492 at:0.0325232595205307\n",
+ "['oarsnwere']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['day']\n",
+ "of:0.29467064142227173 :0.084292933344841 and:0.049625616520643234 the:0.026211684569716454 to:0.020970266312360764 in:0.018730850890278816 at:0.01672487147152424 or:0.015357449650764465\n",
+ "['protestn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['with']\n",
+ "the:0.18622300028800964 :0.16538189351558685 a:0.09778723865747452 his:0.021504372358322144 an:0.015204782597720623 all:0.014655662700533867 which:0.011455700732767582 her:0.010617160238325596\n",
+ "['is']\n",
+ ":0.14478172361850739 a:0.07943203300237656 the:0.05567781254649162 not:0.04187404736876488 to:0.028269024565815926 in:0.017350036650896072 no:0.016243362799286842 now:0.014129472896456718\n",
+ "['hen']\n",
+ ":0.16873784363269806 the:0.06611846387386322 and:0.026158079504966736 he:0.024075910449028015 i:0.023457009345293045 in:0.0212318766862154 it:0.020974883809685707 nry:0.01772887073457241\n",
+ "['towardn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['pieco']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['unitedn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['thrilling']\n",
+ ":0.2542187571525574 and:0.06274773925542831 to:0.02287721075117588 of:0.015715472400188446 story:0.015374545007944107 that:0.014183633029460907 as:0.013924400322139263 interest:0.011304641142487526\n",
+ "['swift']\n",
+ ":0.24719104170799255 and:0.08201650530099869 in:0.01719803735613823 as:0.01654921844601631 specific:0.016434546560049057 to:0.014521720819175243 or:0.013171400874853134 from:0.012410074472427368\n",
+ "['he']\n",
+ ":0.14773434400558472 was:0.08953415602445602 had:0.06403657048940659 is:0.04345237836241722 has:0.03252505883574486 would:0.023517634719610214 will:0.01741141825914383 could:0.016593124717473984\n",
+ "['of']\n",
+ "the:0.2457614541053772 :0.1656663864850998 a:0.030220337212085724 this:0.019167711958289146 his:0.01403335202485323 tho:0.011820078827440739 said:0.009894217364490032 their:0.008327881805598736\n",
+ "['ii']\n",
+ ":0.31811708211898804 a:0.023816155269742012 the:0.0205168928951025 i:0.016757093369960785 m:0.014588823541998863 is:0.014371728524565697 and:0.012694701552391052 t:0.0126353669911623\n",
+ "['in']\n",
+ "the:0.2225157767534256 :0.15360069274902344 a:0.04575375095009804 this:0.025838620960712433 his:0.016054201871156693 which:0.01124879065901041 tho:0.011150272563099861 their:0.010169503279030323\n",
+ "['of']\n",
+ "the:0.2457614541053772 :0.1656663864850998 a:0.030220337212085724 this:0.019167711958289146 his:0.01403335202485323 tho:0.011820078827440739 said:0.009894217364490032 their:0.008327881805598736\n",
+ "['railway']\n",
+ ":0.18158483505249023 company:0.1026129424571991 and:0.030799081549048424 in:0.02205001190304756 station:0.019995171576738358 companies:0.016794268041849136 to:0.014651170000433922 or:0.014218137599527836\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['following']\n",
+ ":0.21577127277851105 the:0.05881590396165848 described:0.05686778947710991 is:0.021448928862810135 in:0.014103828929364681 a:0.012171845883131027 day:0.012054827995598316 statement:0.010573654435575008\n",
+ "['idea']\n",
+ "of:0.3579258918762207 that:0.13937389850616455 :0.08310220390558243 is:0.031523965299129486 was:0.028185207396745682 and:0.02015228196978569 in:0.015386296436190605 to:0.014290685765445232\n",
+ "['earn']\n",
+ ":0.11924708634614944 a:0.09131968021392822 the:0.07094936072826385 nings:0.03841331973671913 of:0.025638889521360397 for:0.024241583421826363 and:0.02357146143913269 their:0.018018681555986404\n",
+ "['late']\n",
+ ":0.17705953121185303 in:0.04229867085814476 of:0.04211277514696121 to:0.02988540753722191 war:0.021618669852614403 residence:0.019031599164009094 years:0.017552580684423447 and:0.017194461077451706\n",
+ "['to']\n",
+ ":0.1648421436548233 the:0.12624020874500275 be:0.05039070546627045 a:0.02132132649421692 make:0.012324165552854538 do:0.01222158968448639 have:0.012199307791888714 his:0.00816959049552679\n",
+ "['cd']\n",
+ ":0.22000914812088013 to:0.05910235270857811 the:0.05555296316742897 in:0.044114433228969574 of:0.0230178851634264 and:0.022832300513982773 by:0.021893413737416267 from:0.019184119999408722\n",
+ "['to']\n",
+ ":0.1648421436548233 the:0.12624020874500275 be:0.05039070546627045 a:0.02132132649421692 make:0.012324165552854538 do:0.01222158968448639 have:0.012199307791888714 his:0.00816959049552679\n",
+ "['nunited']\n",
+ "states:0.6553436517715454 :0.14613200724124908 slates:0.04757695272564888 and:0.008219176903367043 in:0.00815338734537363 stales:0.005069442559033632 the:0.003957140259444714 a:0.002728587482124567\n",
+ "['convicts']\n",
+ ":0.10529406368732452 and:0.06539241224527359 in:0.05121273174881935 were:0.03906647861003876 to:0.029881052672863007 from:0.027408134192228317 the:0.025714995339512825 are:0.023727672174572945\n",
+ "['wasn']\n",
+ ":0.17657959461212158 a:0.04198543354868889 the:0.03581542149186134 to:0.03424995020031929 and:0.031244082376360893 in:0.030172841623425484 feet:0.026601005345582962 of:0.023303182795643806\n",
+ "['n']\n",
+ ":0.2304985672235489 n:0.07303886860609055 y:0.030507313087582588 and:0.01975770853459835 c:0.019707409664988518 the:0.018424810841679573 w:0.017016446217894554 e:0.01662597805261612\n",
+ "['broadn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['sliding']\n",
+ ":0.09384260326623917 down:0.05685485526919365 up:0.04962737485766411 and:0.040662068873643875 into:0.02796805463731289 door:0.024196797981858253 back:0.020460547879338264 out:0.015266885980963707\n",
+ "['circumstance']\n",
+ "that:0.13611440360546112 :0.1238645687699318 of:0.09071065485477448 the:0.040128905326128006 and:0.037888962775468826 which:0.028391532599925995 in:0.02767891064286232 to:0.025091128423810005\n",
+ "['tho']\n",
+ ":0.2986600995063782 most:0.00548686645925045 state:0.004983977880328894 first:0.004856238607317209 city:0.004728939849883318 united:0.004388166591525078 other:0.004086659289896488 same:0.004024218302220106\n",
+ "['party']\n",
+ ":0.137193500995636 of:0.07543077319860458 in:0.05912679061293602 and:0.054417192935943604 to:0.04411286860704422 is:0.022773418575525284 the:0.02104310691356659 was:0.015841396525502205\n",
+ "['andn']\n",
+ ":0.11433161795139313 dollars:0.03491411358118057 block:0.026080600917339325 in:0.02532712183892727 the:0.021045975387096405 to:0.018029607832431793 feet:0.017727477476000786 of:0.016631392762064934\n",
+ "['young']\n",
+ ":0.24229498207569122 man:0.08830852806568146 men:0.057896748185157776 and:0.043797869235277176 people:0.033023007214069366 lady:0.02858775481581688 woman:0.02498183399438858 women:0.01666080765426159\n",
+ "['uiia']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['both']\n",
+ ":0.21267655491828918 of:0.05839509144425392 the:0.05706479400396347 in:0.0390687920153141 sides:0.026978710666298866 to:0.01650719717144966 sexes:0.01345144584774971 houses:0.013221687637269497\n",
+ "['n']\n",
+ ":0.2304985672235489 n:0.07303886860609055 y:0.030507313087582588 and:0.01975770853459835 c:0.019707409664988518 the:0.018424810841679573 w:0.017016446217894554 e:0.01662597805261612\n",
+ "['which']\n",
+ ":0.12084392458200455 the:0.06722808629274368 is:0.056981410831213 he:0.047483354806900024 was:0.02967275120317936 they:0.02605881169438362 it:0.0243590846657753 has:0.02149120159447193\n",
+ "['and']\n",
+ ":0.17935489118099213 the:0.06324105709791183 a:0.01719452068209648 in:0.01547625008970499 that:0.01295827142894268 to:0.012152859009802341 it:0.009857879020273685 i:0.007450021803379059\n",
+ "['poor']\n",
+ ":0.2695181667804718 and:0.05291508510708809 man:0.042659807950258255 fellow:0.030337585136294365 girl:0.016272474080324173 to:0.014947036281228065 woman:0.013972467742860317 people:0.012778660282492638\n",
+ "['windnsunday']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['on']\n",
+ "the:0.31753239035606384 :0.11931086331605911 a:0.040675438940525055 his:0.017299221828579903 tho:0.017211757600307465 this:0.015625758096575737 their:0.00978156179189682 account:0.009252509102225304\n",
+ "['tiara']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['people']\n",
+ "of:0.12193400412797928 :0.09777715802192688 and:0.04974498972296715 who:0.045906297862529755 in:0.042085688561201096 to:0.033859916031360626 are:0.031158922240138054 have:0.01813989318907261\n",
+ "['rattlominko']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['thenotnl']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['its']\n",
+ ":0.2697148621082306 own:0.019346537068486214 use:0.006492631044238806 power:0.005430210847407579 way:0.005250113550573587 provisions:0.005071519408375025 present:0.004421460907906294 first:0.003977332729846239\n",
+ "['us']\n",
+ ":0.11166591197252274 to:0.07174031436443329 in:0.03988940268754959 and:0.03797098621726036 the:0.033502474427223206 that:0.03229131922125816 a:0.026689764112234116 as:0.01615416817367077\n",
+ "['jollynnow']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['brothersngilbert']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['statinnnotes']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['thengrealest']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['bomn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['of']\n",
+ "the:0.2457614541053772 :0.1656663864850998 a:0.030220337212085724 this:0.019167711958289146 his:0.01403335202485323 tho:0.011820078827440739 said:0.009894217364490032 their:0.008327881805598736\n",
+ "['jail']\n",
+ ":0.11650297045707703 and:0.07821273803710938 at:0.06639481335878372 for:0.05068277567625046 the:0.02996179647743702 in:0.029850736260414124 on:0.024878855794668198 of:0.02356419526040554\n",
+ "['success']\n",
+ "of:0.19657552242279053 :0.09387556463479996 in:0.08405350893735886 and:0.05519338697195053 the:0.033764082938432693 is:0.029181139543652534 to:0.01938711293041706 it:0.017781412228941917\n",
+ "['ntion']\n",
+ "of:0.2134752869606018 and:0.0679025650024414 to:0.06008179858326912 in:0.03778368607163429 :0.032588180154561996 the:0.03091721422970295 is:0.027959434315562248 that:0.02140127122402191\n",
+ "['at']\n",
+ "the:0.20992843806743622 :0.15566672384738922 a:0.05062214657664299 oclock:0.020457791164517403 least:0.020247263833880424 all:0.018392644822597504 this:0.018265608698129654 any:0.0133592514321208\n",
+ "['van']\n",
+ ":0.5693686604499817 buren:0.08048800379037857 horn:0.027899423614144325 ness:0.017536483705043793 horne:0.01740090921521187 and:0.011475196108222008 brunt:0.011469315737485886 of:0.006038379389792681\n",
+ "['selln']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['near']\n",
+ ":0.22856134176254272 the:0.22382698953151703 future:0.03153371810913086 a:0.023880571126937866 by:0.020288709551095963 to:0.01727149821817875 as:0.014609070494771004 and:0.013822535052895546\n",
+ "['thatntree']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['forcesnof']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['uuited']\n",
+ ":0.4631017744541168 states:0.3957747220993042 slates:0.013747919350862503 in:0.008037136867642403 stales:0.006386904511600733 is:0.004318410996347666 statea:0.0040361033752560616 and:0.0031883548945188522\n",
+ "['of']\n",
+ "the:0.2457614541053772 :0.1656663864850998 a:0.030220337212085724 this:0.019167711958289146 his:0.01403335202485323 tho:0.011820078827440739 said:0.009894217364490032 their:0.008327881805598736\n",
+ "['northwestern']\n",
+ ":0.1632874459028244 railway:0.0334823839366436 railroad:0.03105076402425766 states:0.021967163309454918 corner:0.021039480343461037 line:0.015321964398026466 in:0.01384312566369772 railwayncompany:0.012512766756117344\n",
+ "['for']\n",
+ "the:0.2248937338590622 :0.1491411030292511 a:0.059844110161066055 this:0.014163156040012836 his:0.013331228867173195 it:0.011568314395844936 their:0.010646478272974491 tho:0.010418311692774296\n",
+ "['place']\n",
+ "of:0.17481647431850433 :0.09976828843355179 in:0.07564513385295868 and:0.050318870693445206 the:0.03843297064304352 to:0.036182112991809845 for:0.02668594941496849 at:0.023274289444088936\n",
+ "['in']\n",
+ "the:0.2225157767534256 :0.15360069274902344 a:0.04575375095009804 this:0.025838620960712433 his:0.016054201871156693 which:0.01124879065901041 tho:0.011150272563099861 their:0.010169503279030323\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['n']\n",
+ ":0.2304985672235489 n:0.07303886860609055 y:0.030507313087582588 and:0.01975770853459835 c:0.019707409664988518 the:0.018424810841679573 w:0.017016446217894554 e:0.01662597805261612\n",
+ "['volunteer']\n",
+ ":0.1677529513835907 army:0.05867685750126839 and:0.04254830256104469 regiments:0.028065308928489685 infantry:0.021123217418789864 service:0.020731108263134956 to:0.01664079539477825 forces:0.013440171256661415\n",
+ "['and']\n",
+ ":0.17935489118099213 the:0.06324105709791183 a:0.01719452068209648 in:0.01547625008970499 that:0.01295827142894268 to:0.012152859009802341 it:0.009857879020273685 i:0.007450021803379059\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['commercen']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['intersectionn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['eninthis']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['library']\n",
+ ":0.10060306638479233 of:0.08923526108264923 and:0.07398297637701035 is:0.039051350206136703 in:0.028199104592204094 the:0.02655693143606186 was:0.02064443565905094 at:0.019268440082669258\n",
+ "['prescriben']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['ourn']\n",
+ ":0.38391757011413574 a:0.01613396406173706 the:0.011311794631183147 and:0.009659508243203163 state:0.0074147251434624195 of:0.006076783873140812 new:0.005726506467908621 c:0.0040391492657363415\n",
+ "['claimed']\n",
+ "to:0.25726521015167236 that:0.16377007961273193 by:0.09904193878173828 :0.048546548932790756 for:0.028791476041078568 the:0.026262279599905014 and:0.01992448978126049 in:0.019556449726223946\n",
+ "['old']\n",
+ ":0.2567478120326996 and:0.03556905314326286 man:0.025310911238193512 home:0.007602435536682606 the:0.00690062902867794 gentleman:0.006484318524599075 age:0.006255576387047768 friends:0.0055646891705691814\n",
+ "['was']\n",
+ ":0.16102024912834167 a:0.06752540171146393 the:0.03868721053004265 not:0.028345400467514992 in:0.025672586634755135 to:0.01581629365682602 made:0.012294775806367397 no:0.010229735635221004\n",
+ "['greatern']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['the']\n",
+ ":0.22579598426818848 same:0.007832455448806286 state:0.007307393476366997 first:0.006077633239328861 city:0.005409764591604471 people:0.0050051286816596985 most:0.004963664337992668 united:0.004868470132350922\n",
+ "['a']\n",
+ ":0.24079222977161407 few:0.01626162976026535 large:0.011570313014090061 man:0.010044588707387447 good:0.010007260367274284 great:0.009656419977545738 very:0.00889244582504034 little:0.008056357502937317\n",
+ "['vessel']\n",
+ ":0.10529959201812744 and:0.060197144746780396 was:0.0496145524084568 is:0.030803058296442032 in:0.03064507059752941 to:0.03002854436635971 of:0.028302432969212532 the:0.025401635095477104\n",
+ "['in']\n",
+ "the:0.2225157767534256 :0.15360069274902344 a:0.04575375095009804 this:0.025838620960712433 his:0.016054201871156693 which:0.01124879065901041 tho:0.011150272563099861 their:0.010169503279030323\n",
+ "['distrustnupon']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['thensherman']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['whichn']\n",
+ ":0.0826198160648346 a:0.030106166377663612 is:0.029384031891822815 was:0.028257712721824646 the:0.02513360045850277 are:0.025032449513673782 have:0.02394338883459568 i:0.018231619149446487\n",
+ "['is']\n",
+ ":0.14478172361850739 a:0.07943203300237656 the:0.05567781254649162 not:0.04187404736876488 to:0.028269024565815926 in:0.017350036650896072 no:0.016243362799286842 now:0.014129472896456718\n",
+ "['dandelion']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['ton']\n",
+ ":0.13487884402275085 of:0.04617968201637268 and:0.041141998022794724 the:0.036419421434402466 inclusive:0.02917175181210041 per:0.02484716661274433 a:0.019901413470506668 to:0.017307564616203308\n",
+ "['our']\n",
+ ":0.20765885710716248 own:0.028943264856934547 people:0.016101203858852386 country:0.015432706102728844 state:0.009388145059347153 city:0.008618978783488274 national:0.00786765106022358 government:0.0072469450533390045\n",
+ "['he']\n",
+ ":0.14773434400558472 was:0.08953415602445602 had:0.06403657048940659 is:0.04345237836241722 has:0.03252505883574486 would:0.023517634719610214 will:0.01741141825914383 could:0.016593124717473984\n",
+ "['injunctions']\n",
+ ":0.21202898025512695 and:0.13268283009529114 of:0.11241118609905243 to:0.10384225845336914 in:0.04599481448531151 the:0.03195025399327278 that:0.02663229964673519 for:0.02555491402745247\n",
+ "['hernprecious']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['aqn']\n",
+ ":0.13839282095432281 the:0.05073755607008934 of:0.04705389216542244 and:0.04267781972885132 to:0.030805902555584908 in:0.02351704239845276 a:0.018242696300148964 that:0.010924139991402626\n",
+ "['onlyn']\n",
+ ":0.1767205446958542 per:0.05991939827799797 pounds:0.036194778978824615 the:0.026402555406093597 in:0.025634752586483955 cents:0.024617550894618034 and:0.02451268397271633 a:0.014309866353869438\n",
+ "['to']\n",
+ ":0.1648421436548233 the:0.12624020874500275 be:0.05039070546627045 a:0.02132132649421692 make:0.012324165552854538 do:0.01222158968448639 have:0.012199307791888714 his:0.00816959049552679\n",
+ "['in']\n",
+ "the:0.2225157767534256 :0.15360069274902344 a:0.04575375095009804 this:0.025838620960712433 his:0.016054201871156693 which:0.01124879065901041 tho:0.011150272563099861 their:0.010169503279030323\n",
+ "['like']\n",
+ ":0.18782390654087067 a:0.16584761440753937 the:0.10118193179368973 to:0.063145212829113 that:0.02274155430495739 it:0.017591245472431183 an:0.017437336966395378 this:0.01166405901312828\n",
+ "['tho']\n",
+ ":0.2986600995063782 most:0.00548686645925045 state:0.004983977880328894 first:0.004856238607317209 city:0.004728939849883318 united:0.004388166591525078 other:0.004086659289896488 same:0.004024218302220106\n",
+ "['forninstance']\n",
+ "the:0.1051710844039917 :0.04390120133757591 it:0.029704004526138306 he:0.026990149170160294 is:0.025271844118833542 a:0.024432646110653877 and:0.02340378239750862 was:0.02328353188931942\n",
+ "['cod']\n",
+ "