315 lines
8.8 KiB
Plaintext
315 lines
8.8 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "f834096a",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"from utils import *"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "41336a5e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data = get_csv(\"train/in.tsv.xz\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "fe2e1dd3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"train_labels = get_csv(\"train/expected.tsv\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "4aad410d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"train_data = data[[6,7]]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "d6f0f760",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"train_data = pd.concat([train_data, train_labels], axis=1)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "02bda814",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"train_data[607] = train_data[6] + train_data[0] + train_data[7]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "e3128e14",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"train_data[607] = train_data[607].apply(clean_text)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "fe2c05e4",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0 came fiom the last place to thisnplace and thi...\n",
|
|
"1 mb boot political obeednattempt to imagine a p...\n",
|
|
"2 thera were in only aeventyninenuberlbers lo ...\n",
|
|
"3 a gixnl man y niterertiiiv diiclosurs regard ...\n",
|
|
"4 tin ub tv thf bbabbt qabjenmr schiffs tutemen...\n",
|
|
" ... \n",
|
|
"432017 sam clendenin bad a fancy for uinscience of me...\n",
|
|
"432018 witahtt halting the party ware dilven to the s...\n",
|
|
"432019 it was the last thing that either ofnthem expe...\n",
|
|
"432020 settlement with the departmentnit is also show...\n",
|
|
"432021 flour quotationslow extras at r ® ncity mi...\n",
|
|
"Name: 607, Length: 432022, dtype: object"
|
|
]
|
|
},
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"train_data[607]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "5b9b593c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"with open(\"tmp\", \"w+\") as f:\n",
|
|
" for t in train_data[607]:\n",
|
|
" f.write(t + \"\\n\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "dd9e3fd6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"=== 1/5 Counting and sorting n-grams ===\n",
|
|
"Reading /home/me/challenging-america-word-gap-prediction-kenlm/tmp\n",
|
|
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
|
"************************/home/me/kenlm/lm/builder/corpus_count.cc:179 in void lm::builder::{anonymous}::ComplainDisallowed(StringPiece, lm::WarningAction&) threw FormatLoadException.\n",
|
|
"Special word <s> is not allowed in the corpus. I plan to support models containing <unk> in the future. Pass --skip_symbols to convert these symbols to whitespace.\n",
|
|
"/bin/bash: linia 1: 5055 Przerwane (zrzut pamięci) ../kenlm/build//bin/lmplz -o 4 < tmp > model.arpa\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"KENLM_BUILD_PATH = \"../kenlm/build/\"\n",
|
|
"!$KENLM_BUILD_PATH/bin/lmplz -o 4 < tmp > model.arpa"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "f8cba81c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"!rm tmp"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "2c3db836",
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stderr",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Loading the LM will be faster if you build a binary file.\n",
|
|
"Reading /home/me/challenging-america-word-gap-prediction-kenlm/model.arpa\n",
|
|
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
|
|
"****************************************************************************************************\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import kenlm\n",
|
|
"model = kenlm.Model(\"./model.arpa\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"id": "35fb75ee",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Defaulting to user installation because normal site-packages is not writeable\n",
|
|
"Collecting english_words\n",
|
|
" Downloading english-words-1.1.0.tar.gz (1.1 MB)\n",
|
|
" |████████████████████████████████| 1.1 MB 985 kB/s \n",
|
|
"\u001b[?25hBuilding wheels for collected packages: english-words\n",
|
|
" Building wheel for english-words (setup.py) ... \u001b[?25ldone\n",
|
|
"\u001b[?25h Created wheel for english-words: filename=english_words-1.1.0-py3-none-any.whl size=1106680 sha256=818b2393457321fc616c24465b2c7ce020853e36d9d8e1b03142a5e18076713d\n",
|
|
" Stored in directory: /home/me/.cache/pip/wheels/2c/48/9a/f697d8d989ca4e4c1060f9da73caea372d7e1b78402abff8bb\n",
|
|
"Successfully built english-words\n",
|
|
"Installing collected packages: english-words\n",
|
|
"Successfully installed english-words-1.1.0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"!pip install english_words"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"id": "6adb5045",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from english_words import english_words_alpha_set\n",
|
|
"from math import log10\n",
|
|
"\n",
|
|
"def predict(before, after):\n",
|
|
" result = ''\n",
|
|
" prob = 0.0\n",
|
|
" best = []\n",
|
|
" for word in english_words_alpha_set:\n",
|
|
" text = ' '.join([before, word, after])\n",
|
|
" text_score = model.score(text, bos=False, eos=False)\n",
|
|
" if len(best) < 12:\n",
|
|
" best.append((word, text_score))\n",
|
|
" else:\n",
|
|
" is_better = False\n",
|
|
" worst_score = None\n",
|
|
" for score in best:\n",
|
|
" if not worst_score:\n",
|
|
" worst_score = score\n",
|
|
" else:\n",
|
|
" if worst_score[1] > score[1]:\n",
|
|
" worst_score = score\n",
|
|
" if worst_score[1] < text_score:\n",
|
|
" best.remove(worst_score)\n",
|
|
" best.append((word, text_score))\n",
|
|
" probs = sorted(best, key=lambda tup: tup[1], reverse=True)\n",
|
|
" pred_str = ''\n",
|
|
" for word, prob in probs:\n",
|
|
" pred_str += f'{word}:{prob} '\n",
|
|
" pred_str += f':{log10(0.99)}'\n",
|
|
" return pred_str"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"id": "1ce44bdc",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from nltk import trigrams, word_tokenize\n",
|
|
"\n",
|
|
"def make_prediction(path, result_path):\n",
|
|
" pdata = get_csv(path)\n",
|
|
" with open(result_path, 'w', encoding='utf-8') as file_out:\n",
|
|
" for _, row in pdata.iterrows():\n",
|
|
" before, after = word_tokenize(clean_text(str(row[6]))), word_tokenize(clean_text(str(row[7])))\n",
|
|
" if len(before) < 2 or len(after) < 2:\n",
|
|
" pred = prediction\n",
|
|
" else:\n",
|
|
" pred = predict(before[-1], after[0])\n",
|
|
" file_out.write(pred + '\\n')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 28,
|
|
"id": "154c9ff2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"make_prediction(\"dev-0/in.tsv.xz\", \"dev-0/out.tsv\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"id": "82d0cc3f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"make_prediction(\"test-A/in.tsv.xz\", \"test-A/out.tsv\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "cf45ce49",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.4"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|