challenging-america-word-ga.../run_3.ipynb

593 lines
24 KiB
Plaintext
Raw Normal View History

2023-04-29 12:23:34 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pzHZprgFPh08",
"outputId": "0f27cdcb-57b9-4cec-85c4-dac8497403ab"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"execution_count": 1
}
],
"source": [
"import pandas as pd\n",
"import csv\n",
"import regex as re\n",
"from nltk import trigrams, word_tokenize\n",
"from collections import Counter, defaultdict\n",
"import nltk\n",
"nltk.download('punkt')"
]
},
{
"cell_type": "code",
"source": [
"from google.colab import drive\n",
"drive.mount('/content/gdrive')\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wIY8w_ZxPshw",
"outputId": "ed7fda0c-8a7b-4aa8-9109-9804cabf4d79"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount(\"/content/gdrive\", force_remount=True).\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Explicit %cd magic: a bare `cd` only works while automagic is enabled and no\n",
"# variable named `cd` exists in the namespace.\n",
"%cd '/content/gdrive/MyDrive/challenging-america-word-gap-prediction/'"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "PS4mnw-1P3cP",
"outputId": "c86f074e-5c26-4f00-eddb-d267174a4297"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"/content/gdrive/MyDrive/challenging-america-word-gap-prediction\n"
]
}
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 701
},
"id": "3r2WzoHmPh1G",
"outputId": "1853f1c3-6e28-497e-b34b-21c193fcd6e8"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-4-06713320f790>:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_data = pd.read_csv('train/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"<ipython-input-4-06713320f790>:1: FutureWarning: The warn_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_data = pd.read_csv('train/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"<ipython-input-4-06713320f790>:2: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_labels = pd.read_csv('train/expected.tsv', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"<ipython-input-4-06713320f790>:2: FutureWarning: The warn_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" train_labels = pd.read_csv('train/expected.tsv', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
" 6 \\\n",
"0 came fiom the last place to this\\nplace, and t... \n",
"1 MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine... \n",
"2 \"Thera were in 1771 only aeventy-nine\\n*ub*erl... \n",
"3 A gixnl man y nitereRtiiiv dii-clos-\\nur«s reg... \n",
"4 Tin: 188UB TV THF BBABBT QABJE\\nMr. Schiffs *t... \n",
"... ... \n",
"432017 Sam Clendenin bad a fancy for Ui«\\nscience of ... \n",
"432018 Wita.htt halting the party ware dilven to the ... \n",
"432019 It was the last thing that either of\\nthem exp... \n",
"432020 settlement with the department.\\nIt is also sh... \n",
"432021 Flour quotations—low extras at 1 R0®2 50;\\ncit... \n",
"\n",
" 7 0 \n",
"0 said\\nit's all squash. The best I could get\\ni... lie \n",
"1 \\ninto a proper perspective with those\\nminor ... himself \n",
"2 all notU\\nashore and afloat arc subjects for I... of \n",
"3 ceucju l< d no; <o waste it nud so\\nsunk it in... ably \n",
"4 ascertained w? OCt the COOltS of ibis\\nletale ... j \n",
"... ... ... \n",
"432017 \\nSam was arrested.\\nThe case excited a great ... and \n",
"432018 through the alnp the »Uitors laapeeeed tia.»\\n... paasliic \n",
"432019 Agua Negra across the line.\\nIt was a grim pla... for \n",
"432020 \\na note of Wood, Dialogue fc Co., for\\nc27,im... for \n",
"432021 3214c;do White at 3614c: Mixed Western at\\n331... at \n",
"\n",
"[432022 rows x 3 columns]"
],
"text/html": [
"\n",
" <div id=\"df-89d6ad5d-c536-4a35-8506-b69b94e55deb\">\n",
" <div class=\"colab-df-container\">\n",
" <div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>6</th>\n",
" <th>7</th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>came fiom the last place to this\\nplace, and t...</td>\n",
" <td>said\\nit's all squash. The best I could get\\ni...</td>\n",
" <td>lie</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>MB. BOOT'S POLITICAL OBEED\\nAttempt to imagine...</td>\n",
" <td>\\ninto a proper perspective with those\\nminor ...</td>\n",
" <td>himself</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>\"Thera were in 1771 only aeventy-nine\\n*ub*erl...</td>\n",
" <td>all notU\\nashore and afloat arc subjects for I...</td>\n",
" <td>of</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>A gixnl man y nitereRtiiiv dii-clos-\\nur«s reg...</td>\n",
" <td>ceucju l&lt; d no; &lt;o waste it nud so\\nsunk it in...</td>\n",
" <td>ably</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Tin: 188UB TV THF BBABBT QABJE\\nMr. Schiffs *t...</td>\n",
" <td>ascertained w? OCt the COOltS of ibis\\nletale ...</td>\n",
" <td>j</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432017</th>\n",
" <td>Sam Clendenin bad a fancy for Ui«\\nscience of ...</td>\n",
" <td>\\nSam was arrested.\\nThe case excited a great ...</td>\n",
" <td>and</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432018</th>\n",
" <td>Wita.htt halting the party ware dilven to the ...</td>\n",
" <td>through the alnp the »Uitors laapeeeed tia.»\\n...</td>\n",
" <td>paasliic</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432019</th>\n",
" <td>It was the last thing that either of\\nthem exp...</td>\n",
" <td>Agua Negra across the line.\\nIt was a grim pla...</td>\n",
" <td>for</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432020</th>\n",
" <td>settlement with the department.\\nIt is also sh...</td>\n",
" <td>\\na note of Wood, Dialogue fc Co., for\\nc27,im...</td>\n",
" <td>for</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432021</th>\n",
" <td>Flour quotations—low extras at 1 R0®2 50;\\ncit...</td>\n",
" <td>3214c;do White at 3614c: Mixed Western at\\n331...</td>\n",
" <td>at</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>432022 rows × 3 columns</p>\n",
"</div>\n",
" <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-89d6ad5d-c536-4a35-8506-b69b94e55deb')\"\n",
" title=\"Convert this dataframe to an interactive table.\"\n",
" style=\"display:none;\">\n",
" \n",
" <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
" width=\"24px\">\n",
" <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
" <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
" </svg>\n",
" </button>\n",
" \n",
" <style>\n",
" .colab-df-container {\n",
" display:flex;\n",
" flex-wrap:wrap;\n",
" gap: 12px;\n",
" }\n",
"\n",
" .colab-df-convert {\n",
" background-color: #E8F0FE;\n",
" border: none;\n",
" border-radius: 50%;\n",
" cursor: pointer;\n",
" display: none;\n",
" fill: #1967D2;\n",
" height: 32px;\n",
" padding: 0 0 0 0;\n",
" width: 32px;\n",
" }\n",
"\n",
" .colab-df-convert:hover {\n",
" background-color: #E2EBFA;\n",
" box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
" fill: #174EA6;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert {\n",
" background-color: #3B4455;\n",
" fill: #D2E3FC;\n",
" }\n",
"\n",
" [theme=dark] .colab-df-convert:hover {\n",
" background-color: #434B5C;\n",
" box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
" filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
" fill: #FFFFFF;\n",
" }\n",
" </style>\n",
"\n",
" <script>\n",
" const buttonEl =\n",
" document.querySelector('#df-89d6ad5d-c536-4a35-8506-b69b94e55deb button.colab-df-convert');\n",
" buttonEl.style.display =\n",
" google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
"\n",
" async function convertToInteractive(key) {\n",
" const element = document.querySelector('#df-89d6ad5d-c536-4a35-8506-b69b94e55deb');\n",
" const dataTable =\n",
" await google.colab.kernel.invokeFunction('convertToInteractive',\n",
" [key], {});\n",
" if (!dataTable) return;\n",
"\n",
" const docLinkHtml = 'Like what you see? Visit the ' +\n",
" '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
" + ' to learn more about interactive tables.';\n",
" element.innerHTML = '';\n",
" dataTable['output_type'] = 'display_data';\n",
" await google.colab.output.renderOutput(dataTable, element);\n",
" const docLink = document.createElement('div');\n",
" docLink.innerHTML = docLinkHtml;\n",
" element.appendChild(docLink);\n",
" }\n",
" </script>\n",
" </div>\n",
" </div>\n",
" "
]
},
"metadata": {},
"execution_count": 4
}
],
"source": [
"# `on_bad_lines='skip'` (pandas >= 1.3) replaces the deprecated\n",
"# error_bad_lines=False / warn_bad_lines=False pair and drops malformed rows silently.\n",
"# NOTE(review): a row skipped in in.tsv but not in expected.tsv would shift the\n",
"# labels relative to the contexts - confirm both files parse to the same length.\n",
"read_options = dict(sep='\\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)\n",
"train_data = pd.read_csv('train/in.tsv.xz', **read_options)\n",
"train_labels = pd.read_csv('train/expected.tsv', **read_options)\n",
"\n",
"# Keep only columns 6 and 7 of the input (the text on either side of the gap)\n",
"# and append the label frame, whose single column is named 0.\n",
"train_data = train_data[[6, 7]]\n",
"train_data = pd.concat([train_data, train_labels], axis=1)\n",
"train_data.head()\n"
]
},
{
"cell_type": "code",
"source": [
"# Train on the first 120000 rows only. .copy() makes the subset an independent\n",
"# frame, so adding columns to it later does not raise SettingWithCopyWarning.\n",
"train_data = train_data.iloc[:120000].copy()"
],
"metadata": {
"id": "ifrGODxOTuK7"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "JJTvit-qPh1L",
"outputId": "58f187c9-6561-4418-d0e0-fbaca2260b70"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-6-b31274590998>:1: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" train_data['final'] = train_data[6] + train_data[0] + train_data[7]\n"
]
}
],
"source": [
"# Join left context, gap word and right context with explicit spaces; plain\n",
"# concatenation glued the gap word onto its neighbours (e.g. 'lie' + 'said' -> 'liesaid').\n",
"# fillna('') keeps a row usable when one of the three parts was parsed as missing.\n",
"# assign() builds a new frame, which also avoids SettingWithCopyWarning on the slice.\n",
"train_data = train_data.assign(\n",
"    final=train_data[6].fillna('') + ' ' + train_data[0].fillna('') + ' ' + train_data[7].fillna('')\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "0GzBUzFkPh1M"
},
"outputs": [],
"source": [
"# Two-level table: model[context][word] starts at 0 the first time it is read,\n",
"# and an unseen context starts as an empty inner table.\n",
"model = defaultdict(lambda: defaultdict(int))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "IViVFNNzPh1O"
},
"outputs": [],
"source": [
"def clean_text(text):\n",
"    \"\"\"Normalise raw text before tokenisation.\n",
"\n",
"    Lower-cases the input, removes every hyphen that is directly followed by the\n",
"    literal two-character sequence backslash-n, turns each remaining backslash-n\n",
"    into a space, and finally strips all Unicode punctuation characters.\n",
"    \"\"\"\n",
"    lowered = text.lower()\n",
"    dehyphenated = lowered.replace('-\\\\n', '')\n",
"    single_line = dehyphenated.replace('\\\\n', ' ')\n",
"    return re.sub(r'\\p{P}', '', single_line)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "ZXkV4cLFPh1P"
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "3Y4_y97tPh1R"
},
"outputs": [],
"source": [
"# Count, for every trigram (w1, w2, w3), how often w1 appears directly before\n",
"# the pair (w2, w3). The column is iterated directly because iterrows() builds\n",
"# a Series per row, and the trigrams are left unpadded because padded ones\n",
"# (containing None) were discarded anyway.\n",
"for final_text in train_data['final']:\n",
"    words = word_tokenize(clean_text(str(final_text)))\n",
"    for w1, w2, w3 in trigrams(words):\n",
"        model[(w2, w3)][w1] += 1"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "V87WPI1PPh1S"
},
"outputs": [],
"source": [
"# Turn the raw counts stored for each (w2, w3) context into relative frequencies.\n",
"for predecessors in model.values():\n",
"    context_total = float(sum(predecessors.values()))\n",
"    for candidate in predecessors:\n",
"        predecessors[candidate] /= context_total\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"id": "TP-eEc4OPh1T"
},
"outputs": [],
"source": [
"def predict_probs(word1, word2):\n",
"    \"\"\"Return the prediction line for a gap that is followed by word1 word2.\n",
"\n",
"    The result has the form 'w1:p1 w2:p2 ... :rest': up to six of the most\n",
"    probable candidates stored in the global `model` for the context\n",
"    (word1, word2), followed by the probability mass left for all other words.\n",
"    A fixed fallback distribution is returned when the context is unknown or\n",
"    carries no probability mass.\n",
"    \"\"\"\n",
"    fallback = 'from:0.2 the:0.2 to:0.2 a:0.1 and:0.1 of:0.1 :0.1'\n",
"\n",
"    # .get() does not trigger the defaultdict factory, so unseen contexts no\n",
"    # longer leave empty entries behind in the model.\n",
"    candidates = model.get((word1, word2))\n",
"    if not candidates:\n",
"        return fallback\n",
"\n",
"    top = Counter(candidates).most_common(6)\n",
"    total_prob = 0.0\n",
"    for _, prob in top:\n",
"        total_prob += prob\n",
"    if total_prob == 0.0:\n",
"        return fallback\n",
"\n",
"    # Always leave at least 0.01 for words that are not listed. When that floor\n",
"    # kicks in, shrink the listed probabilities so the line still sums to 1.\n",
"    remaining_prob = 1 - total_prob\n",
"    scale = 1.0\n",
"    if remaining_prob < 0.01:\n",
"        remaining_prob = 0.01\n",
"        scale = (1 - remaining_prob) / total_prob\n",
"\n",
"    parts = [f'{word}:{prob * scale}' for word, prob in top]\n",
"    parts.append(f':{remaining_prob}')\n",
"    return ' '.join(parts)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "aehup5qzPh1W",
"outputId": "c4443682-95fb-43f0-c04d-726a65b4f6b9"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"<ipython-input-12-94466712d0ba>:1: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"<ipython-input-12-94466712d0ba>:1: FutureWarning: The warn_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" dev_data = pd.read_csv('dev-0/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"<ipython-input-12-94466712d0ba>:2: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" test_data = pd.read_csv('test-A/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n",
"<ipython-input-12-94466712d0ba>:2: FutureWarning: The warn_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n",
"\n",
"\n",
" test_data = pd.read_csv('test-A/in.tsv.xz', sep='\\t', error_bad_lines=False, warn_bad_lines=False, header=None, quoting=csv.QUOTE_NONE)\n"
]
}
],
"source": [
"# `on_bad_lines='skip'` (pandas >= 1.3) replaces the deprecated\n",
"# error_bad_lines=False / warn_bad_lines=False arguments.\n",
"# NOTE(review): a skipped input row would leave the matching out.tsv one line\n",
"# short - confirm that no rows are actually dropped.\n",
"read_options = dict(sep='\\t', on_bad_lines='skip', header=None, quoting=csv.QUOTE_NONE)\n",
"dev_data = pd.read_csv('dev-0/in.tsv.xz', **read_options)\n",
"test_data = pd.read_csv('test-A/in.tsv.xz', **read_options)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"id": "bTCUDesePh1X"
},
"outputs": [],
"source": [
"# Write one prediction line per dev row. Only column 7 is read: the first two\n",
"# tokens of that text are the context passed to predict_probs.\n",
"# encoding='utf-8' keeps non-ASCII words intact regardless of the platform default.\n",
"with open('dev-0/out.tsv', 'w', encoding='utf-8') as file:\n",
"    for right_context in dev_data[7]:\n",
"        words = word_tokenize(clean_text(str(right_context)))\n",
"        if len(words) < 4:\n",
"            # Too little context: fall back to a fixed distribution.\n",
"            prediction = 'from:0.2 the:0.2 to:0.2 a:0.1 and:0.1 of:0.1 :0.1'\n",
"        else:\n",
"            prediction = predict_probs(words[0], words[1])\n",
"        file.write(prediction + '\\n')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "kzg8J0hAPh1Y"
},
"outputs": [],
"source": [
"\n",
"# Write one prediction line per test row. Only column 7 is read: the first two\n",
"# tokens of that text are the context passed to predict_probs.\n",
"# encoding='utf-8' keeps non-ASCII words intact regardless of the platform default.\n",
"with open('test-A/out.tsv', 'w', encoding='utf-8') as file:\n",
"    for right_context in test_data[7]:\n",
"        words = word_tokenize(clean_text(str(right_context)))\n",
"        if len(words) < 4:\n",
"            # Too little context: fall back to a fixed distribution.\n",
"            prediction = 'from:0.2 the:0.2 to:0.2 a:0.1 and:0.1 of:0.1 :0.1'\n",
"        else:\n",
"            prediction = predict_probs(words[0], words[1])\n",
"        file.write(prediction + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "s01_AhIbPh1b"
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.2"
},
"colab": {
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 0
}