408 lines
14 KiB
Plaintext
408 lines
14 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 36,
|
||
"id": "21c9b695",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import csv\n",
|
||
"import regex as re\n",
|
||
"import nltk\n",
|
||
"from collections import Counter, defaultdict\n",
|
||
"import string\n",
|
||
"import unicodedata\n",
|
||
"\n",
|
||
"def clean_text(text): \n",
|
||
" return re.sub(r\"\\p{P}\", \"\", str(text).lower().replace(\"-\\\\n\", \"\").replace(\"\\\\n\", \" \"))\n",
|
||
"\n",
|
||
"def train_model(data, model):\n",
|
||
" for _, row in data.iterrows():\n",
|
||
" words = nltk.word_tokenize(clean_text(row[760]))\n",
|
||
" for w1, w2 in nltk.bigrams(words, pad_left=True, pad_right=True):\n",
|
||
" if w1 and w2:\n",
|
||
" model[w2][w1] += 1\n",
|
||
" for w2 in model:\n",
|
||
" total_count = float(sum(model[w2].values()))\n",
|
||
" for w1 in model[w2]:\n",
|
||
" model[w2][w1] /= total_count\n",
|
||
"\n",
|
||
"\n",
|
||
"def predict(word, model):\n",
|
||
" predictions = dict(model[word])\n",
|
||
" most_common = dict(Counter(predictions).most_common(5))\n",
|
||
"\n",
|
||
" total_prob = 0.0\n",
|
||
" str_prediction = \"\"\n",
|
||
" for word, prob in most_common.items():\n",
|
||
" total_prob += prob\n",
|
||
" str_prediction += f\"{word}:{prob} \"\n",
|
||
"\n",
|
||
" if not total_prob:\n",
|
||
" return \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n",
|
||
"\n",
|
||
" if 1 - total_prob >= 0.01:\n",
|
||
" str_prediction += f\":{1-total_prob}\"\n",
|
||
" else:\n",
|
||
" str_prediction += f\":0.01\"\n",
|
||
"\n",
|
||
" return str_prediction\n",
|
||
"\n",
|
||
"\n",
|
||
"def predict_data(read_path, save_path, model):\n",
|
||
" data = pd.read_csv(\n",
|
||
" read_path,\n",
|
||
" sep=\"\\t\",\n",
|
||
" error_bad_lines=False,\n",
|
||
" header=None,\n",
|
||
" quoting=csv.QUOTE_NONE,\n",
|
||
" encoding=\"utf-8\"\n",
|
||
" )\n",
|
||
" with open(save_path, \"w\", encoding=\"utf-8\") as f:\n",
|
||
" for _, row in data.iterrows():\n",
|
||
" words = nltk.word_tokenize(clean_text(row[7]))\n",
|
||
" if len(words) < 3:\n",
|
||
" prediction = \"the:0.2 be:0.2 to:0.2 of:0.1 and:0.1 a:0.1 :0.1\"\n",
|
||
" else:\n",
|
||
" prediction = predict(words[-1], model)\n",
|
||
" f.write(prediction + \"\\n\")\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "e39473e2",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"with open(\"in-header.tsv\") as f:\n",
|
||
" in_cols = f.read().strip().split(\"\\t\")\n",
|
||
"\n",
|
||
"with open(\"out-header.tsv\") as f:\n",
|
||
" out_cols = f.read().strip().split(\"\\t\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "bde510c9",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"['FileId', 'Year', 'LeftContext', 'RightContext']"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"in_cols"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "0e8b31dd",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"['Word']"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"out_cols"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "7662d802",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"data = pd.read_csv(\n",
|
||
" \"train/in.tsv.xz\",\n",
|
||
" sep=\"\\t\",\n",
|
||
" on_bad_lines='skip',\n",
|
||
" header=None,\n",
|
||
" # names=in_cols,\n",
|
||
" quoting=csv.QUOTE_NONE,\n",
|
||
" encoding=\"utf-8\"\n",
|
||
")\n",
|
||
"\n",
|
||
"train_words = pd.read_csv(\n",
|
||
" \"train/expected.tsv\",\n",
|
||
" sep=\"\\t\",\n",
|
||
" on_bad_lines='skip',\n",
|
||
" header=None,\n",
|
||
" # names=out_cols,\n",
|
||
" quoting=csv.QUOTE_NONE,\n",
|
||
" encoding=\"utf-8\"\n",
|
||
")\n",
|
||
"\n",
|
||
"train_data = data[[7, 6]]\n",
|
||
"train_data = pd.concat([train_data, train_words], axis=1)\n",
|
||
"\n",
|
||
"train_data[760] = train_data[7] + train_data[0] + train_data[6]\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "c3d2cfec",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>7</th>\n",
|
||
" <th>6</th>\n",
|
||
" <th>0</th>\n",
|
||
" <th>760</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>said\\nit's all squash. The best I could get\\ni...</td>\n",
|
||
" <td>came fiom the last place to this\\nplace, and t...</td>\n",
|
||
" <td>lie</td>\n",
|
||
" <td>said\\nit's all squash. The best I could get\\ni...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>\\ninto a proper perspective with those\\nminor ...</td>\n",
|
||
" <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n",
|
||
" <td>himself</td>\n",
|
||
" <td>\\ninto a proper perspective with those\\nminor ...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>all notU\\nashore and afloat arc subjects for I...</td>\n",
|
||
" <td>\"Thera were in 1771 only aeventy-nine\\n*ub*erl...</td>\n",
|
||
" <td>of</td>\n",
|
||
" <td>all notU\\nashore and afloat arc subjects for I...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>ceucju l< d no; <o waste it nud so\\nsunk it in...</td>\n",
|
||
" <td>A gixnl man y nitereRtiiiv dii-clos-\\nur«s reg...</td>\n",
|
||
" <td>ably</td>\n",
|
||
" <td>ceucju l< d no; <o waste it nud so\\nsunk it in...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>ascertained w? OCt the COOltS of ibis\\nletale ...</td>\n",
|
||
" <td>Tin: 188UB TV THF BBABBT QABJE\\nMr. Schiffs *t...</td>\n",
|
||
" <td>j</td>\n",
|
||
" <td>ascertained w? OCt the COOltS of ibis\\nletale ...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>432017</th>\n",
|
||
" <td>\\nSam was arrested.\\nThe case excited a great ...</td>\n",
|
||
" <td>Sam Clendenin bad a fancy for Ui«\\nscience of ...</td>\n",
|
||
" <td>and</td>\n",
|
||
" <td>\\nSam was arrested.\\nThe case excited a great ...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>432018</th>\n",
|
||
" <td>through the alnp the »Uitors laapeeeed tia.»\\n...</td>\n",
|
||
" <td>Wita.htt halting the party ware dilven to the ...</td>\n",
|
||
" <td>paasliic</td>\n",
|
||
" <td>through the alnp the »Uitors laapeeeed tia.»\\n...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>432019</th>\n",
|
||
" <td>Agua Negra across the line.\\nIt was a grim pla...</td>\n",
|
||
" <td>It was the last thing that either of\\nthem exp...</td>\n",
|
||
" <td>for</td>\n",
|
||
" <td>Agua Negra across the line.\\nIt was a grim pla...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>432020</th>\n",
|
||
" <td>\\na note of Wood, Dialogue fc Co., for\\nc27,im...</td>\n",
|
||
" <td>settlement with the department.\\nIt is also sh...</td>\n",
|
||
" <td>for</td>\n",
|
||
" <td>\\na note of Wood, Dialogue fc Co., for\\nc27,im...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>432021</th>\n",
|
||
" <td>3214c;do White at 3614c: Mixed Western at\\n331...</td>\n",
|
||
" <td>Flour quotations—low extras at 1 R0®2 50;\\ncit...</td>\n",
|
||
" <td>at</td>\n",
|
||
" <td>3214c;do White at 3614c: Mixed Western at\\n331...</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>432022 rows × 4 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" 7 \\\n",
|
||
"0 said\\nit's all squash. The best I could get\\ni... \n",
|
||
"1 \\ninto a proper perspective with those\\nminor ... \n",
|
||
"2 all notU\\nashore and afloat arc subjects for I... \n",
|
||
"3 ceucju l< d no; <o waste it nud so\\nsunk it in... \n",
|
||
"4 ascertained w? OCt the COOltS of ibis\\nletale ... \n",
|
||
"... ... \n",
|
||
"432017 \\nSam was arrested.\\nThe case excited a great ... \n",
|
||
"432018 through the alnp the »Uitors laapeeeed tia.»\\n... \n",
|
||
"432019 Agua Negra across the line.\\nIt was a grim pla... \n",
|
||
"432020 \\na note of Wood, Dialogue fc Co., for\\nc27,im... \n",
|
||
"432021 3214c;do White at 3614c: Mixed Western at\\n331... \n",
|
||
"\n",
|
||
" 6 0 \\\n",
|
||
"0 came fiom the last place to this\\nplace, and t... lie \n",
|
||
"1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... himself \n",
|
||
"2 \"Thera were in 1771 only aeventy-nine\\n*ub*erl... of \n",
|
||
"3 A gixnl man y nitereRtiiiv dii-clos-\\nur«s reg... ably \n",
|
||
"4 Tin: 188UB TV THF BBABBT QABJE\\nMr. Schiffs *t... j \n",
|
||
"... ... ... \n",
|
||
"432017 Sam Clendenin bad a fancy for Ui«\\nscience of ... and \n",
|
||
"432018 Wita.htt halting the party ware dilven to the ... paasliic \n",
|
||
"432019 It was the last thing that either of\\nthem exp... for \n",
|
||
"432020 settlement with the department.\\nIt is also sh... for \n",
|
||
"432021 Flour quotations—low extras at 1 R0®2 50;\\ncit... at \n",
|
||
"\n",
|
||
" 760 \n",
|
||
"0 said\\nit's all squash. The best I could get\\ni... \n",
|
||
"1 \\ninto a proper perspective with those\\nminor ... \n",
|
||
"2 all notU\\nashore and afloat arc subjects for I... \n",
|
||
"3 ceucju l< d no; <o waste it nud so\\nsunk it in... \n",
|
||
"4 ascertained w? OCt the COOltS of ibis\\nletale ... \n",
|
||
"... ... \n",
|
||
"432017 \\nSam was arrested.\\nThe case excited a great ... \n",
|
||
"432018 through the alnp the »Uitors laapeeeed tia.»\\n... \n",
|
||
"432019 Agua Negra across the line.\\nIt was a grim pla... \n",
|
||
"432020 \\na note of Wood, Dialogue fc Co., for\\nc27,im... \n",
|
||
"432021 3214c;do White at 3614c: Mixed Western at\\n331... \n",
|
||
"\n",
|
||
"[432022 rows x 4 columns]"
|
||
]
|
||
},
|
||
"execution_count": 23,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"train_data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"id": "bd92ba07",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"\n",
|
||
"model = defaultdict(lambda: defaultdict(lambda: 0))\n",
|
||
"\n",
|
||
"train_model(train_data, model)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 37,
|
||
"id": "ad23240e",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\749044266.py:46: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
|
||
"\n",
|
||
"\n",
|
||
" data = pd.read_csv(\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"predict_data(\"dev-0/in.tsv.xz\", \"dev-0/out.tsv\", model)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 38,
|
||
"id": "195cb6cf",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\Users\\Norbert\\AppData\\Local\\Temp\\ipykernel_15436\\749044266.py:46: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
|
||
"\n",
|
||
"\n",
|
||
" data = pd.read_csv(\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"predict_data(\"test-A/in.tsv.xz\", \"test-A/out.tsv\", model)"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.2"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|