Norbert Litkowski 2022-04-25 01:17:13 +02:00
parent 3e73ddf02d
commit d09c77e228
3 changed files with 18002 additions and 41 deletions

dev-0/out.tsv  Normal file  (10519 lines changed)

File diff suppressed because it is too large

run.ipynb  (110 lines changed)

@@ -14,7 +14,7 @@
{
"cell_type": "code",
"execution_count": 2,
"id": "032ba328",
"id": "41336a5e",
"metadata": {},
"outputs": [],
"source": [
@@ -24,7 +24,7 @@
{
"cell_type": "code",
"execution_count": 3,
"id": "e0d94073",
"id": "fe2e1dd3",
"metadata": {},
"outputs": [],
"source": [
@@ -34,7 +34,7 @@
{
"cell_type": "code",
"execution_count": 4,
"id": "7c055510",
"id": "4aad410d",
"metadata": {},
"outputs": [],
"source": [
@@ -44,7 +44,7 @@
{
"cell_type": "code",
"execution_count": 5,
"id": "bd81e581",
"id": "d6f0f760",
"metadata": {},
"outputs": [],
"source": [
@@ -54,7 +54,7 @@
{
"cell_type": "code",
"execution_count": 6,
"id": "0c4a5486",
"id": "02bda814",
"metadata": {},
"outputs": [],
"source": [
@@ -64,7 +64,7 @@
{
"cell_type": "code",
"execution_count": 7,
"id": "aec319cd",
"id": "e3128e14",
"metadata": {},
"outputs": [],
"source": [
@@ -74,7 +74,7 @@
{
"cell_type": "code",
"execution_count": 8,
"id": "9b794391",
"id": "fe2c05e4",
"metadata": {},
"outputs": [
{
@@ -106,7 +106,7 @@
{
"cell_type": "code",
"execution_count": 15,
"id": "f21d9139",
"id": "5b9b593c",
"metadata": {},
"outputs": [],
"source": [
@@ -118,7 +118,7 @@
{
"cell_type": "code",
"execution_count": 10,
"id": "362a6b83",
"id": "dd9e3fd6",
"metadata": {},
"outputs": [
{
@@ -142,7 +142,7 @@
{
"cell_type": "code",
"execution_count": 11,
"id": "456fa286",
"id": "f8cba81c",
"metadata": {},
"outputs": [],
"source": [
@@ -151,9 +151,11 @@
},
{
"cell_type": "code",
"execution_count": 14,
"id": "3eaaf27b",
"metadata": {},
"execution_count": 16,
"id": "2c3db836",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
@@ -161,23 +163,8 @@
"text": [
"Loading the LM will be faster if you build a binary file.\n",
"Reading /home/me/challenging-america-word-gap-prediction-kenlm/model.arpa\n",
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n"
]
},
{
"ename": "OSError",
"evalue": "Cannot read model './model.arpa' (End of file Byte: 0)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32mkenlm.pyx:139\u001b[0m, in \u001b[0;36mkenlm.Model.__init__\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mRuntimeError\u001b[0m: End of file Byte: 0",
"\nThe above exception was the direct cause of the following exception:\n",
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
"Input \u001b[0;32mIn [14]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mkenlm\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m model \u001b[38;5;241m=\u001b[39m \u001b[43mkenlm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mModel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m./model.arpa\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32mkenlm.pyx:142\u001b[0m, in \u001b[0;36mkenlm.Model.__init__\u001b[0;34m()\u001b[0m\n",
"\u001b[0;31mOSError\u001b[0m: Cannot read model './model.arpa' (End of file Byte: 0)"
"----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n",
"****************************************************************************************************\n"
]
}
],
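The cell above now reads ./model.arpa to completion; in the previous revision the same call died with "OSError: Cannot read model './model.arpa' (End of file Byte: 0)" because the ARPA file was empty or truncated. For reference, a minimal sketch of that loading step, assuming the kenlm Python bindings and a complete model.arpa in the working directory:

import kenlm

# Load the ARPA file directly, as the cell above appears to do; this is
# slow for large models, hence the "build a binary file" hint in the log.
model = kenlm.Model("./model.arpa")

# Optional speed-up (run once in a shell):
#   build_binary model.arpa model.binary
# and then load the binary instead:
#   model = kenlm.Model("./model.binary")

# Quick sanity check: KenLM returns a log10 probability for the string.
print(model.score("the quick brown fox", bos=True, eos=True))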
@@ -188,11 +175,42 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "b3a22dcd",
"execution_count": 23,
"id": "35fb75ee",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Collecting english_words\n",
" Downloading english-words-1.1.0.tar.gz (1.1 MB)\n",
" |████████████████████████████████| 1.1 MB 985 kB/s \n",
"\u001b[?25hBuilding wheels for collected packages: english-words\n",
" Building wheel for english-words (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for english-words: filename=english_words-1.1.0-py3-none-any.whl size=1106680 sha256=818b2393457321fc616c24465b2c7ce020853e36d9d8e1b03142a5e18076713d\n",
" Stored in directory: /home/me/.cache/pip/wheels/2c/48/9a/f697d8d989ca4e4c1060f9da73caea372d7e1b78402abff8bb\n",
"Successfully built english-words\n",
"Installing collected packages: english-words\n",
"Successfully installed english-words-1.1.0\n"
]
}
],
"source": [
"!pip install english_words"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "6adb5045",
"metadata": {},
"outputs": [],
"source": [
"from english_words import english_words_alpha_set\n",
"from math import log10\n",
"\n",
"def predict(before, after):\n",
" result = ''\n",
" prob = 0.0\n",
@@ -224,16 +242,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "913dcf54",
"execution_count": 27,
"id": "1ce44bdc",
"metadata": {},
"outputs": [],
"source": [
"from nltk import trigrams, word_tokenize\n",
"\n",
"def make_prediction(path, result_path):\n",
" data = pd.read_csv(path, sep='\\t', header=None, quoting=csv.QUOTE_NONE)\n",
" pdata = get_csv(path)\n",
" with open(result_path, 'w', encoding='utf-8') as file_out:\n",
" for _, row in data.iterrows():\n",
" before, after = word_tokenize(data_preprocessing(str(row[6]))), word_tokenize(data_preprocessing(str(row[7])))\n",
" for _, row in pdata.iterrows():\n",
" before, after = word_tokenize(clean_text(str(row[6]))), word_tokenize(clean_text(str(row[7])))\n",
" if len(before) < 2 or len(after) < 2:\n",
" pred = prediction\n",
" else:\n",
@@ -243,8 +263,8 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "01c1b58d",
"execution_count": 28,
"id": "154c9ff2",
"metadata": {},
"outputs": [],
"source": [
@@ -253,13 +273,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "6d37cd24",
"execution_count": 29,
"id": "82d0cc3f",
"metadata": {},
"outputs": [],
"source": [
"make_prediction(\"test-A/in.tsv.xz\", \"test-A/out.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf45ce49",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

test-A/out.tsv  Normal file  (7414 lines changed)

File diff suppressed because it is too large