1st try
This commit is contained in:
commit
ec06c05e97
9
README.md
Normal file
9
README.md
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
Challenging America word-gap prediction
|
||||||
|
===================================
|
||||||
|
|
||||||
|
Guess the word that is missing from the gap between a left context and a right context.
|
||||||
|
|
||||||
|
Evaluation metric
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
PerplexityHashed is the metric (see `--metric` in config.txt).
|
1
config.txt
Normal file
1
config.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
--metric PerplexityHashed --precision 2 --in-header in-header.tsv --out-header out-header.tsv
|
1
in-header.tsv
Normal file
1
in-header.tsv
Normal file
@ -0,0 +1 @@
|
|||||||
|
FileId Year LeftContext RightContext
|
|
507
nb.ipynb
Normal file
507
nb.ipynb
Normal file
@ -0,0 +1,507 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 184,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import re\n",
|
||||||
|
"from itertools import islice\n",
|
||||||
|
"from collections import Counter\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from tqdm import tqdm"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 209,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import lzma\n",
|
||||||
|
"from collections import Counter, OrderedDict\n",
|
||||||
|
"import matplotlib.pyplot as plt\n",
|
||||||
|
"from math import log\n",
|
||||||
|
"import re\n",
|
||||||
|
"import numpy as np"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 200,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open(\"train/in.tsv\", encoding='utf8', mode=\"rt\") as file:\n",
|
||||||
|
" a = file.readlines()\n",
|
||||||
|
"\n",
|
||||||
|
"a = [line.split(\"\\t\") for line in a]\n",
|
||||||
|
"text = \" \".join([line[-2] + \" \" + line[-1] for line in a])\n",
|
||||||
|
"text = re.sub(r\"\\\\+n\", \" \", text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"del a"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 199,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"19560075"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 199,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 37,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"words = re.findall(\"\\w+\", text)\n",
|
||||||
|
"bigram_counter = Counter(zip(words, islice(words, 1, None)))\n",
|
||||||
|
"bigram_counter = dict(sorted(bigram_counter.items(), key=lambda item: item[1], reverse=True))\n",
|
||||||
|
"\n",
|
||||||
|
"del words"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 38,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"bigram_counter_short = {}\n",
|
||||||
|
"for key, value in bigram_counter.items():\n",
|
||||||
|
" if value > 5:\n",
|
||||||
|
" bigram_counter_short[key] = value\n",
|
||||||
|
"\n",
|
||||||
|
"bigram_counter = bigram_counter_short\n",
|
||||||
|
"del bigram_counter_short"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 201,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"unigram_counter = Counter(text.split(' '))\n",
|
||||||
|
"unigram_counter = unigram_counter.most_common(10_000)\n",
|
||||||
|
"# unigram_counter = dict(sorted(unigram_counter.items(), key=lambda item: item[1]), reverse=True)\n",
|
||||||
|
"unigram_counter_list = unigram_counter\n",
|
||||||
|
"unigram_counter = dict(unigram_counter) "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 73,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# with open(\"dev-0/in.tsv\", encoding='utf8', mode=\"rt\") as file:\n",
|
||||||
|
"# a = file.readlines()\n",
|
||||||
|
"\n",
|
||||||
|
"# a = [line.split(\"\\t\") for line in a]\n",
|
||||||
|
"# text = \" \".join([line[-2] + \" \" + line[-1] for line in a])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 152,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"C:\\Users\\micha\\AppData\\Local\\Temp\\ipykernel_14716\\2692353843.py:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" test_data = pd.read_csv('dev-0/in.tsv', sep='\\t', error_bad_lines=False, header=None)\n",
|
||||||
|
"Skipping line 654: expected 8 fields, saw 9\n",
|
||||||
|
"Skipping line 2220: expected 8 fields, saw 9\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"test_data = pd.read_csv('dev-0/in.tsv', sep='\\t', error_bad_lines=False, header=None)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 88,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <th>5</th>\n",
|
||||||
|
" <th>6</th>\n",
|
||||||
|
" <th>7</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>662ed514d56f7bc8743aa6f23794c731</td>\n",
|
||||||
|
" <td>LINCOLN TELEGRAPH</td>\n",
|
||||||
|
" <td>ChronAm</td>\n",
|
||||||
|
" <td>1838.834247</td>\n",
|
||||||
|
" <td>43.910755</td>\n",
|
||||||
|
" <td>-69.820862</td>\n",
|
||||||
|
" <td>rin 11K ui i rsognfd inlriliinnts i>r the town...</td>\n",
|
||||||
|
" <td>Northeasterly hv the head of said .^corn’s\\nan...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>0c3ac40edfe6a167ab692fdb9219a93c</td>\n",
|
||||||
|
" <td>THE WYANDOT PIONEER</td>\n",
|
||||||
|
" <td>ChronAm</td>\n",
|
||||||
|
" <td>1857.691781</td>\n",
|
||||||
|
" <td>40.827279</td>\n",
|
||||||
|
" <td>-83.281309</td>\n",
|
||||||
|
" <td>ton County feel an interest in. tn great is-\\n...</td>\n",
|
||||||
|
" <td>and design,\\nand hence, every election, be it ...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>b298097f3afd2f8c06b61fa2308ec725</td>\n",
|
||||||
|
" <td>RICHMOND ENQUIRER</td>\n",
|
||||||
|
" <td>ChronAm</td>\n",
|
||||||
|
" <td>1847.012329</td>\n",
|
||||||
|
" <td>37.538509</td>\n",
|
||||||
|
" <td>-77.434280</td>\n",
|
||||||
|
" <td>But at our own doors we have evidence ten\\ning...</td>\n",
|
||||||
|
" <td>Democrat\\nenlisting lor the Mexican wvir. They...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>1d50cf957a6a9cbbe0ee7773a72a76d4</td>\n",
|
||||||
|
" <td>RAFTSMAN'S JOURNAL</td>\n",
|
||||||
|
" <td>ChronAm</td>\n",
|
||||||
|
" <td>1867.541096</td>\n",
|
||||||
|
" <td>41.027280</td>\n",
|
||||||
|
" <td>-78.439188</td>\n",
|
||||||
|
" <td>The wonderful Flexibility and great comfort\\na...</td>\n",
|
||||||
|
" <td>will preserve their perfect aud grace\\nful sha...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>5a7297b76de00c7d9e1fb159384238c0</td>\n",
|
||||||
|
" <td>RICHMOND ENQUIRER</td>\n",
|
||||||
|
" <td>ChronAm</td>\n",
|
||||||
|
" <td>1826.083562</td>\n",
|
||||||
|
" <td>37.538509</td>\n",
|
||||||
|
" <td>-77.434280</td>\n",
|
||||||
|
" <td>Illinois.—The Legislature met at Ya:.ualia\\non...</td>\n",
|
||||||
|
" <td>to run the line between Arkansas and\\nthe’Vhnc...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>...</th>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>10397</th>\n",
|
||||||
|
" <td>02e9e019df1992daeafe82b041d94aac</td>\n",
|
||||||
|
" <td>WATERBURY EVENING DEMOCRAT</td>\n",
|
||||||
|
" <td>ChronAm</td>\n",
|
||||||
|
" <td>1888.949454</td>\n",
|
||||||
|
" <td>41.558153</td>\n",
|
||||||
|
" <td>-73.051497</td>\n",
|
||||||
|
" <td>the Fitzgeralds should perish like a common\\nt...</td>\n",
|
||||||
|
" <td>Brian, but there was also a touch\\nof self int...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>10398</th>\n",
|
||||||
|
" <td>74fa28868cbc998d15c242baea4e1faa</td>\n",
|
||||||
|
" <td>RICHMOND ENQUIRER</td>\n",
|
||||||
|
" <td>ChronAm</td>\n",
|
||||||
|
" <td>1836.012295</td>\n",
|
||||||
|
" <td>37.538509</td>\n",
|
||||||
|
" <td>-77.434280</td>\n",
|
||||||
|
" <td>herd, so soon as he conveniently can, after th...</td>\n",
|
||||||
|
" <td>Court dotli lurlher adjudge, order, and decree...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>10399</th>\n",
|
||||||
|
" <td>147be715e90bac01c55969d90254f29e</td>\n",
|
||||||
|
" <td>EVENING CAPITAL</td>\n",
|
||||||
|
" <td>ChronAm</td>\n",
|
||||||
|
" <td>1907.004110</td>\n",
|
||||||
|
" <td>38.978640</td>\n",
|
||||||
|
" <td>-76.492786</td>\n",
|
||||||
|
" <td>Drs. James J. Murphy, of Annapo-\\nlis, and Tho...</td>\n",
|
||||||
|
" <td>in the matter\\nor show any inclination to help...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>10400</th>\n",
|
||||||
|
" <td>1357f703947d912523ac23540cb99a0f</td>\n",
|
||||||
|
" <td>RAFTSMAN'S JOURNAL</td>\n",
|
||||||
|
" <td>ChronAm</td>\n",
|
||||||
|
" <td>1868.077869</td>\n",
|
||||||
|
" <td>41.027280</td>\n",
|
||||||
|
" <td>-78.439188</td>\n",
|
||||||
|
" <td>the soles of the feet spikes or corks are fixe...</td>\n",
|
||||||
|
" <td>\\nIn order to prevent \"the giant\" from\\nfright...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>10401</th>\n",
|
||||||
|
" <td>23346293dbc949ee2edc3380db29f33b</td>\n",
|
||||||
|
" <td>THE DEMOCRATIC WHIG</td>\n",
|
||||||
|
" <td>ChronAm</td>\n",
|
||||||
|
" <td>1843.760274</td>\n",
|
||||||
|
" <td>33.495674</td>\n",
|
||||||
|
" <td>-88.427263</td>\n",
|
||||||
|
" <td>tion which his opponent had taken, and whilst\\...</td>\n",
|
||||||
|
" <td>come criterion, by which to judge\\nof a nation...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"<p>10402 rows × 8 columns</p>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" 0 1 2 \\\n",
|
||||||
|
"0 662ed514d56f7bc8743aa6f23794c731 LINCOLN TELEGRAPH ChronAm \n",
|
||||||
|
"1 0c3ac40edfe6a167ab692fdb9219a93c THE WYANDOT PIONEER ChronAm \n",
|
||||||
|
"2 b298097f3afd2f8c06b61fa2308ec725 RICHMOND ENQUIRER ChronAm \n",
|
||||||
|
"3 1d50cf957a6a9cbbe0ee7773a72a76d4 RAFTSMAN'S JOURNAL ChronAm \n",
|
||||||
|
"4 5a7297b76de00c7d9e1fb159384238c0 RICHMOND ENQUIRER ChronAm \n",
|
||||||
|
"... ... ... ... \n",
|
||||||
|
"10397 02e9e019df1992daeafe82b041d94aac WATERBURY EVENING DEMOCRAT ChronAm \n",
|
||||||
|
"10398 74fa28868cbc998d15c242baea4e1faa RICHMOND ENQUIRER ChronAm \n",
|
||||||
|
"10399 147be715e90bac01c55969d90254f29e EVENING CAPITAL ChronAm \n",
|
||||||
|
"10400 1357f703947d912523ac23540cb99a0f RAFTSMAN'S JOURNAL ChronAm \n",
|
||||||
|
"10401 23346293dbc949ee2edc3380db29f33b THE DEMOCRATIC WHIG ChronAm \n",
|
||||||
|
"\n",
|
||||||
|
" 3 4 5 \\\n",
|
||||||
|
"0 1838.834247 43.910755 -69.820862 \n",
|
||||||
|
"1 1857.691781 40.827279 -83.281309 \n",
|
||||||
|
"2 1847.012329 37.538509 -77.434280 \n",
|
||||||
|
"3 1867.541096 41.027280 -78.439188 \n",
|
||||||
|
"4 1826.083562 37.538509 -77.434280 \n",
|
||||||
|
"... ... ... ... \n",
|
||||||
|
"10397 1888.949454 41.558153 -73.051497 \n",
|
||||||
|
"10398 1836.012295 37.538509 -77.434280 \n",
|
||||||
|
"10399 1907.004110 38.978640 -76.492786 \n",
|
||||||
|
"10400 1868.077869 41.027280 -78.439188 \n",
|
||||||
|
"10401 1843.760274 33.495674 -88.427263 \n",
|
||||||
|
"\n",
|
||||||
|
" 6 \\\n",
|
||||||
|
"0 rin 11K ui i rsognfd inlriliinnts i>r the town... \n",
|
||||||
|
"1 ton County feel an interest in. tn great is-\\n... \n",
|
||||||
|
"2 But at our own doors we have evidence ten\\ning... \n",
|
||||||
|
"3 The wonderful Flexibility and great comfort\\na... \n",
|
||||||
|
"4 Illinois.—The Legislature met at Ya:.ualia\\non... \n",
|
||||||
|
"... ... \n",
|
||||||
|
"10397 the Fitzgeralds should perish like a common\\nt... \n",
|
||||||
|
"10398 herd, so soon as he conveniently can, after th... \n",
|
||||||
|
"10399 Drs. James J. Murphy, of Annapo-\\nlis, and Tho... \n",
|
||||||
|
"10400 the soles of the feet spikes or corks are fixe... \n",
|
||||||
|
"10401 tion which his opponent had taken, and whilst\\... \n",
|
||||||
|
"\n",
|
||||||
|
" 7 \n",
|
||||||
|
"0 Northeasterly hv the head of said .^corn’s\\nan... \n",
|
||||||
|
"1 and design,\\nand hence, every election, be it ... \n",
|
||||||
|
"2 Democrat\\nenlisting lor the Mexican wvir. They... \n",
|
||||||
|
"3 will preserve their perfect aud grace\\nful sha... \n",
|
||||||
|
"4 to run the line between Arkansas and\\nthe’Vhnc... \n",
|
||||||
|
"... ... \n",
|
||||||
|
"10397 Brian, but there was also a touch\\nof self int... \n",
|
||||||
|
"10398 Court dotli lurlher adjudge, order, and decree... \n",
|
||||||
|
"10399 in the matter\\nor show any inclination to help... \n",
|
||||||
|
"10400 \\nIn order to prevent \"the giant\" from\\nfright... \n",
|
||||||
|
"10401 come criterion, by which to judge\\nof a nation... \n",
|
||||||
|
"\n",
|
||||||
|
"[10402 rows x 8 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 88,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"test_data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 213,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"10519it [02:47, 62.67it/s]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"results_string = []\n",
|
||||||
|
"\n",
|
||||||
|
"with lzma.open(\"dev-0/in.tsv.xz\", encoding='utf8', mode=\"rt\") as file:\n",
|
||||||
|
" for line in tqdm(file):\n",
|
||||||
|
" line = line.split(\"\\t\")\n",
|
||||||
|
" text_before = str(line[-2]).replace('\\\\n', ' ').replace('\\n', ' ')\n",
|
||||||
|
" text_after = str(line[-1]).replace('\\\\n', ' ').replace('\\n', ' ')\n",
|
||||||
|
"\n",
|
||||||
|
" if text_before[-1] == ' ':\n",
|
||||||
|
" text_before = text_before[:-1]\n",
|
||||||
|
" if text_before[0] == ' ':\n",
|
||||||
|
" text_before = text_before[1:]\n",
|
||||||
|
"\n",
|
||||||
|
" if text_after[-1] == ' ':\n",
|
||||||
|
" text_after = text_after[:-1]\n",
|
||||||
|
" if text_after[0] == ' ':\n",
|
||||||
|
" text_after = text_after[1:]\n",
|
||||||
|
"\n",
|
||||||
|
" word_before = text_before.split(' ')[-1]\n",
|
||||||
|
" word_after = text_after.split(' ')[0]\n",
|
||||||
|
"\n",
|
||||||
|
" best_words = {}\n",
|
||||||
|
"\n",
|
||||||
|
" for word_middle, _ in unigram_counter_list:\n",
|
||||||
|
" current_score = 0\n",
|
||||||
|
" if (word_before, word_middle) in bigram_counter.keys() and (word_middle, word_after) in bigram_counter.keys() and word_before in unigram_counter.keys() and word_after in unigram_counter.keys():\n",
|
||||||
|
" current_score = (bigram_counter[(word_before, word_middle)] / unigram_counter[word_before]) * (bigram_counter[(word_middle, word_after)] / unigram_counter[word_middle])\n",
|
||||||
|
" best_words[word_middle] = current_score\n",
|
||||||
|
"\n",
|
||||||
|
" best_words = sorted(best_words.items(), key=lambda item: item[1], reverse=True)\n",
|
||||||
|
" leftover_probability = 0\n",
|
||||||
|
" for _, value in best_words[:5]:\n",
|
||||||
|
" if value == 0:\n",
|
||||||
|
" break\n",
|
||||||
|
" leftover_probability += value\n",
|
||||||
|
" leftover_probability = max(1 - leftover_probability, 0.01)\n",
|
||||||
|
"\n",
|
||||||
|
" result = f'{best_words[0][0]}:{round(best_words[0][1], 7):.8f} {best_words[1][0]}:{round(best_words[1][1], 7):.8f} {best_words[2][0]}:{round(best_words[2][1], 7):.8f} {best_words[3][0]}:{round(best_words[3][1], 7):.8f} {best_words[4][0]}:{round(best_words[4][1], 7):.8f} :{round(leftover_probability, 3):.8f}'\n",
|
||||||
|
" results_string.append(result)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 214,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
|
||||||
|
" 'own:0.00076790 way:0.00069960 head:0.00058630 work:0.00051990 place:0.00045550 :0.99700000',\n",
|
||||||
|
" 'the:0.00001150 a:0.00001040 Madison:0.00000230 every:0.00000210 Missouri:0.00000120 :1.00000000',\n",
|
||||||
|
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
|
||||||
|
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
|
||||||
|
" 'a:0.00008660 him:0.00000250 no:0.00000170 all:0.00000150 them:0.00000120 :1.00000000',\n",
|
||||||
|
" 'trees:0.00092990 and:0.00057150 is:0.00030980 growers:0.00029090 growing:0.00014300 :0.99800000',\n",
|
||||||
|
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
|
||||||
|
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
|
||||||
|
" 'that:0.00131940 as:0.00047920 sure:0.00009500 and:0.00009330 better:0.00007450 :0.99800000',\n",
|
||||||
|
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
|
||||||
|
" 'be:0.00032910 the:0.00014090 show:0.00012860 this:0.00001380 a:0.00000790 :0.99900000',\n",
|
||||||
|
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
|
||||||
|
" 'country:0.00007340 world:0.00007170 people:0.00006310 city:0.00005530 time:0.00004280 :1.00000000',\n",
|
||||||
|
" 'to:0.00030170 I:0.00005940 and:0.00002870 a:0.00001570 t:0.00001340 :1.00000000',\n",
|
||||||
|
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
|
||||||
|
" 'that:0.00014960 for:0.00010580 God:0.00005010 to:0.00002530 ing:0.00001430 :1.00000000',\n",
|
||||||
|
" 'founded:0.00097130 known:0.00064890 posted:0.00052370 as:0.00032530 fed:0.00027720 :0.99700000',\n",
|
||||||
|
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
|
||||||
|
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 214,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"results_string[:20]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open(r'test-A/out.tsv', 'w') as fp:\n",
|
||||||
|
" for item in results_string:\n",
|
||||||
|
" fp.write(\"%s\\n\" % item)\n",
|
||||||
|
" print('Done')"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "scweet",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.15"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
1
out-header.tsv
Normal file
1
out-header.tsv
Normal file
@ -0,0 +1 @@
|
|||||||
|
Word
|
|
7414
test-A/out.tsv
Normal file
7414
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user