challenging-america-word-ga.../nb.ipynb

508 lines
19 KiB
Plaintext
Raw Normal View History

2023-04-04 23:16:57 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 184,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from itertools import islice\n",
"from collections import Counter\n",
"import pandas as pd\n",
"from tqdm import tqdm"
]
},
{
"cell_type": "code",
"execution_count": 209,
"metadata": {},
"outputs": [],
"source": [
"import lzma\n",
"from collections import Counter, OrderedDict\n",
"import matplotlib.pyplot as plt\n",
"from math import log\n",
"import re\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 200,
"metadata": {},
"outputs": [],
"source": [
"with open(\"train/in.tsv\", encoding='utf8', mode=\"rt\") as file:\n",
" a = file.readlines()\n",
"\n",
"a = [line.split(\"\\t\") for line in a]\n",
"text = \" \".join([line[-2] + \" \" + line[-1] for line in a])\n",
"text = re.sub(r\"\\\\+n\", \" \", text)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"del a"
]
},
{
"cell_type": "code",
"execution_count": 199,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"19560075"
]
},
"execution_count": 199,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(text)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"words = re.findall(\"\\w+\", text)\n",
"bigram_counter = Counter(zip(words, islice(words, 1, None)))\n",
"bigram_counter = dict(sorted(bigram_counter.items(), key=lambda item: item[1], reverse=True))\n",
"\n",
"del words"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"bigram_counter_short = {}\n",
"for key, value in bigram_counter.items():\n",
" if value > 5:\n",
" bigram_counter_short[key] = value\n",
"\n",
"bigram_counter = bigram_counter_short\n",
"del bigram_counter_short"
]
},
{
"cell_type": "code",
"execution_count": 201,
"metadata": {},
"outputs": [],
"source": [
"unigram_counter = Counter(text.split(' '))\n",
"unigram_counter = unigram_counter.most_common(10_000)\n",
"# unigram_counter = dict(sorted(unigram_counter.items(), key=lambda item: item[1]), reverse=True)\n",
"unigram_counter_list = unigram_counter\n",
"unigram_counter = dict(unigram_counter) "
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"# with open(\"dev-0/in.tsv\", encoding='utf8', mode=\"rt\") as file:\n",
"# a = file.readlines()\n",
"\n",
"# a = [line.split(\"\\t\") for line in a]\n",
"# text = \" \".join([line[-2] + \" \" + line[-1] for line in a])"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\micha\\AppData\\Local\\Temp\\ipykernel_14716\\2692353843.py:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" test_data = pd.read_csv('dev-0/in.tsv', sep='\\t', error_bad_lines=False, header=None)\n",
"Skipping line 654: expected 8 fields, saw 9\n",
"Skipping line 2220: expected 8 fields, saw 9\n",
"\n"
]
}
],
"source": [
"test_data = pd.read_csv('dev-0/in.tsv', sep='\\t', error_bad_lines=False, header=None)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" <th>4</th>\n",
" <th>5</th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>662ed514d56f7bc8743aa6f23794c731</td>\n",
" <td>LINCOLN TELEGRAPH</td>\n",
" <td>ChronAm</td>\n",
" <td>1838.834247</td>\n",
" <td>43.910755</td>\n",
" <td>-69.820862</td>\n",
" <td>rin 11K ui i rsognfd inlriliinnts i&gt;r the town...</td>\n",
" <td>Northeasterly hv the head of said .^corns\\nan...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0c3ac40edfe6a167ab692fdb9219a93c</td>\n",
" <td>THE WYANDOT PIONEER</td>\n",
" <td>ChronAm</td>\n",
" <td>1857.691781</td>\n",
" <td>40.827279</td>\n",
" <td>-83.281309</td>\n",
" <td>ton County feel an interest in. tn great is-\\n...</td>\n",
" <td>and design,\\nand hence, every election, be it ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>b298097f3afd2f8c06b61fa2308ec725</td>\n",
" <td>RICHMOND ENQUIRER</td>\n",
" <td>ChronAm</td>\n",
" <td>1847.012329</td>\n",
" <td>37.538509</td>\n",
" <td>-77.434280</td>\n",
" <td>But at our own doors we have evidence ten\\ning...</td>\n",
" <td>Democrat\\nenlisting lor the Mexican wvir. They...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1d50cf957a6a9cbbe0ee7773a72a76d4</td>\n",
" <td>RAFTSMAN'S JOURNAL</td>\n",
" <td>ChronAm</td>\n",
" <td>1867.541096</td>\n",
" <td>41.027280</td>\n",
" <td>-78.439188</td>\n",
" <td>The wonderful Flexibility and great comfort\\na...</td>\n",
" <td>will preserve their perfect aud grace\\nful sha...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5a7297b76de00c7d9e1fb159384238c0</td>\n",
" <td>RICHMOND ENQUIRER</td>\n",
" <td>ChronAm</td>\n",
" <td>1826.083562</td>\n",
" <td>37.538509</td>\n",
" <td>-77.434280</td>\n",
" <td>Illinois.—The Legislature met at Ya:.ualia\\non...</td>\n",
" <td>to run the line between Arkansas and\\ntheVhnc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10397</th>\n",
" <td>02e9e019df1992daeafe82b041d94aac</td>\n",
" <td>WATERBURY EVENING DEMOCRAT</td>\n",
" <td>ChronAm</td>\n",
" <td>1888.949454</td>\n",
" <td>41.558153</td>\n",
" <td>-73.051497</td>\n",
" <td>the Fitzgeralds should perish like a common\\nt...</td>\n",
" <td>Brian, but there was also a touch\\nof self int...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10398</th>\n",
" <td>74fa28868cbc998d15c242baea4e1faa</td>\n",
" <td>RICHMOND ENQUIRER</td>\n",
" <td>ChronAm</td>\n",
" <td>1836.012295</td>\n",
" <td>37.538509</td>\n",
" <td>-77.434280</td>\n",
" <td>herd, so soon as he conveniently can, after th...</td>\n",
" <td>Court dotli lurlher adjudge, order, and decree...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10399</th>\n",
" <td>147be715e90bac01c55969d90254f29e</td>\n",
" <td>EVENING CAPITAL</td>\n",
" <td>ChronAm</td>\n",
" <td>1907.004110</td>\n",
" <td>38.978640</td>\n",
" <td>-76.492786</td>\n",
" <td>Drs. James J. Murphy, of Annapo-\\nlis, and Tho...</td>\n",
" <td>in the matter\\nor show any inclination to help...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10400</th>\n",
" <td>1357f703947d912523ac23540cb99a0f</td>\n",
" <td>RAFTSMAN'S JOURNAL</td>\n",
" <td>ChronAm</td>\n",
" <td>1868.077869</td>\n",
" <td>41.027280</td>\n",
" <td>-78.439188</td>\n",
" <td>the soles of the feet spikes or corks are fixe...</td>\n",
" <td>\\nIn order to prevent \"the giant\" from\\nfright...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10401</th>\n",
" <td>23346293dbc949ee2edc3380db29f33b</td>\n",
" <td>THE DEMOCRATIC WHIG</td>\n",
" <td>ChronAm</td>\n",
" <td>1843.760274</td>\n",
" <td>33.495674</td>\n",
" <td>-88.427263</td>\n",
" <td>tion which his opponent had taken, and whilst\\...</td>\n",
" <td>come criterion, by which to judge\\nof a nation...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10402 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 \\\n",
"0 662ed514d56f7bc8743aa6f23794c731 LINCOLN TELEGRAPH ChronAm \n",
"1 0c3ac40edfe6a167ab692fdb9219a93c THE WYANDOT PIONEER ChronAm \n",
"2 b298097f3afd2f8c06b61fa2308ec725 RICHMOND ENQUIRER ChronAm \n",
"3 1d50cf957a6a9cbbe0ee7773a72a76d4 RAFTSMAN'S JOURNAL ChronAm \n",
"4 5a7297b76de00c7d9e1fb159384238c0 RICHMOND ENQUIRER ChronAm \n",
"... ... ... ... \n",
"10397 02e9e019df1992daeafe82b041d94aac WATERBURY EVENING DEMOCRAT ChronAm \n",
"10398 74fa28868cbc998d15c242baea4e1faa RICHMOND ENQUIRER ChronAm \n",
"10399 147be715e90bac01c55969d90254f29e EVENING CAPITAL ChronAm \n",
"10400 1357f703947d912523ac23540cb99a0f RAFTSMAN'S JOURNAL ChronAm \n",
"10401 23346293dbc949ee2edc3380db29f33b THE DEMOCRATIC WHIG ChronAm \n",
"\n",
" 3 4 5 \\\n",
"0 1838.834247 43.910755 -69.820862 \n",
"1 1857.691781 40.827279 -83.281309 \n",
"2 1847.012329 37.538509 -77.434280 \n",
"3 1867.541096 41.027280 -78.439188 \n",
"4 1826.083562 37.538509 -77.434280 \n",
"... ... ... ... \n",
"10397 1888.949454 41.558153 -73.051497 \n",
"10398 1836.012295 37.538509 -77.434280 \n",
"10399 1907.004110 38.978640 -76.492786 \n",
"10400 1868.077869 41.027280 -78.439188 \n",
"10401 1843.760274 33.495674 -88.427263 \n",
"\n",
" 6 \\\n",
"0 rin 11K ui i rsognfd inlriliinnts i>r the town... \n",
"1 ton County feel an interest in. tn great is-\\n... \n",
"2 But at our own doors we have evidence ten\\ning... \n",
"3 The wonderful Flexibility and great comfort\\na... \n",
"4 Illinois.—The Legislature met at Ya:.ualia\\non... \n",
"... ... \n",
"10397 the Fitzgeralds should perish like a common\\nt... \n",
"10398 herd, so soon as he conveniently can, after th... \n",
"10399 Drs. James J. Murphy, of Annapo-\\nlis, and Tho... \n",
"10400 the soles of the feet spikes or corks are fixe... \n",
"10401 tion which his opponent had taken, and whilst\\... \n",
"\n",
" 7 \n",
"0 Northeasterly hv the head of said .^corns\\nan... \n",
"1 and design,\\nand hence, every election, be it ... \n",
"2 Democrat\\nenlisting lor the Mexican wvir. They... \n",
"3 will preserve their perfect aud grace\\nful sha... \n",
"4 to run the line between Arkansas and\\ntheVhnc... \n",
"... ... \n",
"10397 Brian, but there was also a touch\\nof self int... \n",
"10398 Court dotli lurlher adjudge, order, and decree... \n",
"10399 in the matter\\nor show any inclination to help... \n",
"10400 \\nIn order to prevent \"the giant\" from\\nfright... \n",
"10401 come criterion, by which to judge\\nof a nation... \n",
"\n",
"[10402 rows x 8 columns]"
]
},
"execution_count": 88,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_data"
]
},
{
"cell_type": "code",
"execution_count": 213,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"10519it [02:47, 62.67it/s]\n"
]
}
],
"source": [
"results_string = []\n",
"\n",
"with lzma.open(\"dev-0/in.tsv.xz\", encoding='utf8', mode=\"rt\") as file:\n",
" for line in tqdm(file):\n",
" line = line.split(\"\\t\")\n",
" text_before = str(line[-2]).replace('\\\\n', ' ').replace('\\n', ' ')\n",
" text_after = str(line[-1]).replace('\\\\n', ' ').replace('\\n', ' ')\n",
"\n",
" if text_before[-1] == ' ':\n",
" text_before = text_before[:-1]\n",
" if text_before[0] == ' ':\n",
" text_before = text_before[1:]\n",
"\n",
" if text_after[-1] == ' ':\n",
" text_after = text_after[:-1]\n",
" if text_after[0] == ' ':\n",
" text_after = text_after[1:]\n",
"\n",
" word_before = text_before.split(' ')[-1]\n",
" word_after = text_after.split(' ')[0]\n",
"\n",
" best_words = {}\n",
"\n",
" for word_middle, _ in unigram_counter_list:\n",
" current_score = 0\n",
" if (word_before, word_middle) in bigram_counter.keys() and (word_middle, word_after) in bigram_counter.keys() and word_before in unigram_counter.keys() and word_after in unigram_counter.keys():\n",
" current_score = (bigram_counter[(word_before, word_middle)] / unigram_counter[word_before]) * (bigram_counter[(word_middle, word_after)] / unigram_counter[word_middle])\n",
" best_words[word_middle] = current_score\n",
"\n",
" best_words = sorted(best_words.items(), key=lambda item: item[1], reverse=True)\n",
" leftover_probability = 0\n",
" for _, value in best_words[:5]:\n",
" if value == 0:\n",
" break\n",
" leftover_probability += value\n",
" leftover_probability = max(1 - leftover_probability, 0.01)\n",
"\n",
" result = f'{best_words[0][0]}:{round(best_words[0][1], 7):.8f} {best_words[1][0]}:{round(best_words[1][1], 7):.8f} {best_words[2][0]}:{round(best_words[2][1], 7):.8f} {best_words[3][0]}:{round(best_words[3][1], 7):.8f} {best_words[4][0]}:{round(best_words[4][1], 7):.8f} :{round(leftover_probability, 3):.8f}'\n",
" results_string.append(result)"
]
},
{
"cell_type": "code",
"execution_count": 214,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
" 'own:0.00076790 way:0.00069960 head:0.00058630 work:0.00051990 place:0.00045550 :0.99700000',\n",
" 'the:0.00001150 a:0.00001040 Madison:0.00000230 every:0.00000210 Missouri:0.00000120 :1.00000000',\n",
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
" 'a:0.00008660 him:0.00000250 no:0.00000170 all:0.00000150 them:0.00000120 :1.00000000',\n",
" 'trees:0.00092990 and:0.00057150 is:0.00030980 growers:0.00029090 growing:0.00014300 :0.99800000',\n",
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
" 'that:0.00131940 as:0.00047920 sure:0.00009500 and:0.00009330 better:0.00007450 :0.99800000',\n",
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
" 'be:0.00032910 the:0.00014090 show:0.00012860 this:0.00001380 a:0.00000790 :0.99900000',\n",
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
" 'country:0.00007340 world:0.00007170 people:0.00006310 city:0.00005530 time:0.00004280 :1.00000000',\n",
" 'to:0.00030170 I:0.00005940 and:0.00002870 a:0.00001570 t:0.00001340 :1.00000000',\n",
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
" 'that:0.00014960 for:0.00010580 God:0.00005010 to:0.00002530 ing:0.00001430 :1.00000000',\n",
" 'founded:0.00097130 known:0.00064890 posted:0.00052370 as:0.00032530 fed:0.00027720 :0.99700000',\n",
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000',\n",
" 'the:0.00000000 of:0.00000000 and:0.00000000 to:0.00000000 a:0.00000000 :1.00000000']"
]
},
"execution_count": 214,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"results_string[:20]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(r'test-A/out.tsv', 'w') as fp:\n",
" for item in results_string:\n",
" fp.write(\"%s\\n\" % item)\n",
" print('Done')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "scweet",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.15"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}