{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f834096a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "from math import log10\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# get_csv, clean_text, data_preprocessing, word_tokenize and english_words_alpha_set\n",
    "# are assumed to be provided by the local utils module\n",
    "from utils import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "032ba328",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = get_csv(\"train/in.tsv.xz\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e0d94073",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_labels = get_csv(\"train/expected.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7c055510",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = data[[6, 7]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "bd81e581",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = pd.concat([train_data, train_labels], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "0c4a5486",
   "metadata": {},
   "outputs": [],
   "source": [
    "# column 607: left context (6) + gap word (0) + right context (7), space-separated\n",
    "train_data[607] = train_data[6] + \" \" + train_data[0] + \" \" + train_data[7]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "aec319cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data[607] = train_data[607].apply(clean_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "9b794391",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0 came fiom the last place to thisnplace and thi...\n",
       "1 mb boot political obeednattempt to imagine a p...\n",
       "2 thera were in only aeventyninenuberlbers lo ...\n",
       "3 a gixnl man y niterertiiiv diiclosurs regard ...\n",
       "4 tin ub tv thf bbabbt qabjenmr schiffs tutemen...\n",
       " ... \n",
       "432017 sam clendenin bad a fancy for uinscience of me...\n",
       "432018 witahtt halting the party ware dilven to the s...\n",
       "432019 it was the last thing that either ofnthem expe...\n",
       "432020 settlement with the departmentnit is also show...\n",
       "432021 flour quotationslow extras at r ® ncity mi...\n",
       "Name: 607, Length: 432022, dtype: object"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data[607]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "f21d9139",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"tmp\", \"w+\") as f:\n",
    "    for t in train_data[607]:\n",
    "        f.write(t + \"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "362a6b83",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=== 1/5 Counting and sorting n-grams ===\n",
      "Reading /home/me/challenging-america-word-gap-prediction-kenlm/tmp\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
      "************************/home/me/kenlm/lm/builder/corpus_count.cc:179 in void lm::builder::{anonymous}::ComplainDisallowed(StringPiece, lm::WarningAction&) threw FormatLoadException.\n",
      "Special word <s> is not allowed in the corpus. I plan to support models containing <unk> in the future. Pass --skip_symbols to convert these symbols to whitespace.\n",
      "/bin/bash: line 1: 5055 Aborted (core dumped) ../kenlm/build//bin/lmplz -o 4 < tmp > model.arpa\n"
     ]
    }
   ],
   "source": [
    "KENLM_BUILD_PATH = \"../kenlm/build\"\n",
    "# --skip_symbols converts stray <s>/</s>/<unk> tokens in the corpus to whitespace,\n",
    "# which is the fix suggested by the FormatLoadException above\n",
    "!$KENLM_BUILD_PATH/bin/lmplz -o 4 --skip_symbols < tmp > model.arpa"
   ]
  },
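  {
   "cell_type": "markdown",
   "id": "a1b2c3d4",
   "metadata": {},
   "source": [
    "Optional follow-up (a minimal sketch): once `lmplz` succeeds, the ARPA file can be compiled into KenLM's binary format with the `build_binary` tool, which loads much faster; the output name `model.binary` is just an assumption used here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2c3d4e5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# compile the ARPA model into KenLM's binary format for faster loading\n",
    "!$KENLM_BUILD_PATH/bin/build_binary model.arpa model.binary"
   ]
  },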
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "456fa286",
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "3eaaf27b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading the LM will be faster if you build a binary file.\n",
      "Reading /home/me/challenging-america-word-gap-prediction-kenlm/model.arpa\n",
      "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n"
     ]
    },
    {
     "ename": "OSError",
     "evalue": "Cannot read model './model.arpa' (End of file Byte: 0)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
      "File \u001b[0;32mkenlm.pyx:139\u001b[0m, in \u001b[0;36mkenlm.Model.__init__\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mRuntimeError\u001b[0m: End of file Byte: 0",
      "\nThe above exception was the direct cause of the following exception:\n",
      "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
      "Input \u001b[0;32mIn [14]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mkenlm\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mkenlm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mModel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m./model.arpa\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32mkenlm.pyx:142\u001b[0m, in \u001b[0;36mkenlm.Model.__init__\u001b[0;34m()\u001b[0m\n",
      "\u001b[0;31mOSError\u001b[0m: Cannot read model './model.arpa' (End of file Byte: 0)"
     ]
    }
   ],
   "source": [
    "import kenlm\n",
    "\n",
    "model = kenlm.Model(\"./model.arpa\")"
   ]
  },
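  {
   "cell_type": "markdown",
   "id": "c3d4e5f6",
   "metadata": {},
   "source": [
    "A quick sanity check, assuming the model loaded successfully: `kenlm.Model.score` returns the log10 probability of a sentence and `model.order` reports the n-gram order. The test sentence below is arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4e5f6a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# n-gram order of the loaded model and log10 probability of an arbitrary sentence\n",
    "print(model.order)\n",
    "print(model.score('this is a test', bos=True, eos=True))"
   ]
  },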
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b3a22dcd",
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(before, after):\n",
    "    # keep the 12 candidate gap words with the highest KenLM scores\n",
    "    best = []\n",
    "    for word in english_words_alpha_set:\n",
    "        text = ' '.join([before, word, after])\n",
    "        text_score = model.score(text, bos=False, eos=False)\n",
    "        if len(best) < 12:\n",
    "            best.append((word, text_score))\n",
    "        else:\n",
    "            worst = min(best, key=lambda tup: tup[1])\n",
    "            if worst[1] < text_score:\n",
    "                best.remove(worst)\n",
    "                best.append((word, text_score))\n",
    "    probs = sorted(best, key=lambda tup: tup[1], reverse=True)\n",
    "    pred_str = ''\n",
    "    for word, prob in probs:\n",
    "        pred_str += f'{word}:{prob} '\n",
    "    pred_str += f':{log10(0.99)}'\n",
    "    return pred_str"
   ]
  },
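  {
   "cell_type": "markdown",
   "id": "e5f6a7b8",
   "metadata": {},
   "source": [
    "A small usage sketch for `predict`: it takes the word directly before the gap and the word directly after it, and returns the 12 best candidates as `word:logprob` pairs followed by a catch-all `:logprob` entry. The context words below are arbitrary examples."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6a7b8c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# example: rank candidate words for the gap between 'the' and 'states'\n",
    "print(predict('the', 'states'))"
   ]
  },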
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "913dcf54",
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_prediction(path, result_path):\n",
    "    data = pd.read_csv(path, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
    "    with open(result_path, 'w', encoding='utf-8') as file_out:\n",
    "        for _, row in data.iterrows():\n",
    "            before, after = word_tokenize(data_preprocessing(str(row[6]))), word_tokenize(data_preprocessing(str(row[7])))\n",
    "            if len(before) < 2 or len(after) < 2:\n",
    "                # fallback when the context is too short to score:\n",
    "                # emit only the catch-all entry that predict() appends\n",
    "                pred = f':{log10(0.99)}'\n",
    "            else:\n",
    "                pred = predict(before[-1], after[0])\n",
    "            file_out.write(pred + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01c1b58d",
   "metadata": {},
   "outputs": [],
   "source": [
    "make_prediction(\"dev-0/in.tsv.xz\", \"dev-0/out.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d37cd24",
   "metadata": {},
   "outputs": [],
   "source": [
    "make_prediction(\"test-A/in.tsv.xz\", \"test-A/out.tsv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}