ready-made

This commit is contained in:
korne 2022-06-14 23:05:20 +02:00
commit 8e8a47dc81
19 changed files with 3276583 additions and 0 deletions

8
.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token

View File

@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,94 @@
#!/usr/bin/env python
# coding: utf-8
# In[3]:
import vowpalwabbit
import pandas as pd
import re
# In[4]:
def prediction(path_in, path_out, model, categories):
    """Predict a category label for every row of a TSV file.

    Reads `path_in` (tab-separated, no header), drops the unused column 1,
    converts each row to a Vowpal Wabbit example, predicts a numeric class
    with `model`, maps it back to its text label via `categories`, and
    writes one label per line to `path_out`.
    """
    data = pd.read_csv(path_in, header=None, sep='\t')
    data = data.drop(1, axis=1)
    data.columns = ['year', 'text']
    data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)
    # Build the inverse {class_id: label} mapping once; the original
    # rebuilt this dict inside the loop for every single example.
    label_by_id = {value: key for key, value in categories.items()}
    with open(path_out, 'w', encoding='utf-8') as file:
        for example in data['train_input']:
            predicted = model.predict(example)
            # str(...) preserves the original behavior of writing 'None'
            # for a class id that is not present in `categories`.
            file.write(str(label_by_id.get(predicted)) + '\n')
# In[5]:
def to_vowpalwabbit(row, categories):
    """Format one dataframe row as a Vowpal Wabbit example line."""
    # Normalise the headline: single line, lower case, restricted charset,
    # collapsed whitespace.
    cleaned = row['text'].replace('\n', ' ').lower().strip()
    cleaned = re.sub("[^a-zA-Z -']", '', cleaned)
    cleaned = re.sub(" +", ' ', cleaned)
    # Rows without a known category (e.g. at prediction time) get an
    # empty label; the KeyError also covers rows lacking the column.
    try:
        label = categories[row['category']]
    except KeyError:
        label = ''
    return f"{label} | year:{row['year']} text:{cleaned}\n"
# In[10]:
# Load training features (tab-separated, no header); column 1 is an unused
# date column, leaving the normalized year and the headline text.
x_train = pd.read_csv('train/in.tsv', header=None, sep='\t')
x_train = x_train.drop(1, axis=1)
x_train.columns = ['year', 'text']
y_train = pd.read_csv('train/expected.tsv', header=None, sep='\t')
y_train.columns = ['category']
# Cap training at 800k rows to keep training time manageable.
x_train = x_train[0:800000]
# Bug fix: original read `y_train(0:800000)` (a call, which is a syntax
# error); the notebook source shows the intended slice with brackets.
y_train = y_train[0:800000]
data = pd.concat([x_train, y_train], axis=1)
# In[ ]:
# Map each distinct category label to a 1-based class id, as required by
# Vowpal Wabbit's one-against-all (--oaa) multiclass mode.
categories = {}
for i, x in enumerate(data['category'].unique()):
    categories[x] = i+1
print(categories)
data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)
# Derive the class count from the data instead of hard-coding 7, so the
# script keeps working if the label set changes.
model = vowpalwabbit.Workspace(f'--oaa {len(categories)} --learning_rate 0.99')
for example in data['train_input']:
    model.learn(example)
# Write predictions for the dev and test splits.
prediction('dev-0/in.tsv', 'dev-0/out.tsv', model, categories)
prediction('test-A/in.tsv', 'test-A/out.tsv', model, categories)
prediction('test-B/in.tsv', 'test-B/out.tsv', model, categories)
# Notebook-export artifact; only works when run under IPython.
get_ipython().system('jupyter nbconvert --to script run.ipynb')

39
README.md Normal file
View File

@ -0,0 +1,39 @@
# Ireland news headlines
# Dataset source and thanks
Predict the headline category given headline text and year
Start Date: 1996-01-01 End Date: 2019-12-31
Dataset taken from https://www.kaggle.com/therohk/ireland-historical-news on 19.06.2020.
Special thanks to Rohit Kulkarni who created it.
You may find whole dataset (including the test dataset) in the link above.
The dataset in the link may be updated.
Please do not incorporate any of the data from this kaggle dataset (or others) into your submission in this gonito challenge.
## Context (from https://www.kaggle.com/therohk/ireland-historical-news )
This news dataset is a composition of 1.48 million headlines posted by the Irish Times operating within Ireland.
Created over 160 years ago, the agency can provide a long-term bird's-eye view of the happenings in Europe.
# Challenge creation
Year is normalized as follows:
```
days_in_year = 366 if is_leap else 365
normalized = d.year + ((day_of_year-1) / days_in_year)
```
train, dev, test split is 80%, 10%, 10% randomly
note that there are very similar headlines in the data
I did not make any effort to prevent one such sentence from going into the train split and a very similar one into the test split.
I used the first category in the classification task. E.g. there is "world" instead of "world.us" as in the original dataset.

1
config.txt Normal file
View File

@ -0,0 +1 @@
--metric Accuracy --precision 4 -%

149134
dev-0/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

149134
dev-0/in.tsv Normal file

File diff suppressed because it is too large Load Diff

149134
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

1
names Normal file
View File

@ -0,0 +1 @@
year_normalized date text

288
run.ipynb Normal file
View File

@ -0,0 +1,288 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "57debdd5-4760-4524-9c77-409652cfb52e",
"metadata": {},
"outputs": [],
"source": [
"import vowpalwabbit\n",
"import pandas as pd\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "48357fb3-9d6f-48d0-9869-bce7a87e3ba1",
"metadata": {},
"outputs": [],
"source": [
"def prediction(path_in, path_out, model, categories):\n",
" data = pd.read_csv(path_in, header=None, sep='\\t')\n",
" data = data.drop(1, axis=1)\n",
" data.columns = ['year', 'text']\n",
"\n",
" data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)\n",
"\n",
" with open(path_out, 'w', encoding='utf-8') as file:\n",
" for example in data['train_input']:\n",
" predicted = model.predict(example)\n",
" text_predicted = dict((value, key) for key, value in categories.items()).get(predicted)\n",
" file.write(str(text_predicted) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f47c4ff9-2078-43f7-b06c-99c9fdd2022e",
"metadata": {},
"outputs": [],
"source": [
"def to_vowpalwabbit(row, categories):\n",
" text = row['text'].replace('\\n', ' ').lower().strip()\n",
" text = re.sub(\"[^a-zA-Z -']\", '', text)\n",
" text = re.sub(\" +\", ' ', text)\n",
" year = row['year']\n",
" try:\n",
" category = categories[row['category']]\n",
" except KeyError:\n",
" category = ''\n",
"\n",
" vw = f\"{category} | year:{year} text:{text}\\n\"\n",
"\n",
" return vw"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "015a5ccb-8fe0-45cf-bf59-416ce9e59dad",
"metadata": {},
"outputs": [],
"source": [
"x_train = pd.read_csv('train/in.tsv', header=None, sep='\\t')\n",
"x_train = x_train.drop(1, axis=1)\n",
"x_train.columns = ['year', 'text']\n",
"\n",
"\n",
"y_train = pd.read_csv('train/expected.tsv', header=None, sep='\\t')\n",
"y_train.columns = ['category']\n",
"\n",
"x_train = x_train[0:800000]\n",
"y_train = y_train[0:800000]\n",
"\n",
"data = pd.concat([x_train, y_train], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d8a2c80c-3b93-410d-98e4-ac651d0933a2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2004.508197</td>\n",
" <td>Sudan claims it is disarming militias</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2008.442623</td>\n",
" <td>Bluffer's guide to Euro 2008</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2012.587432</td>\n",
" <td>Ennis tallies her highest first day total</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2009.071233</td>\n",
" <td>Sri Lanka continues to battle Tamil Tigers</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1997.345205</td>\n",
" <td>Talks today to avert new health service strike</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>799995</th>\n",
" <td>2010.876712</td>\n",
" <td>Top league stars among 135 listed online</td>\n",
" </tr>\n",
" <tr>\n",
" <th>799996</th>\n",
" <td>2000.879452</td>\n",
" <td>Cabinet to consider options for animal disposal</td>\n",
" </tr>\n",
" <tr>\n",
" <th>799997</th>\n",
" <td>2004.915068</td>\n",
" <td>Last orders for Bewley's this evening</td>\n",
" </tr>\n",
" <tr>\n",
" <th>799998</th>\n",
" <td>2014.797260</td>\n",
" <td>Toulon; Ospreys and Toulouse win Champions Cup...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>799999</th>\n",
" <td>1999.019178</td>\n",
" <td>Volatile year in store for the markets</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>800000 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" year text\n",
"0 2004.508197 Sudan claims it is disarming militias\n",
"1 2008.442623 Bluffer's guide to Euro 2008\n",
"2 2012.587432 Ennis tallies her highest first day total\n",
"3 2009.071233 Sri Lanka continues to battle Tamil Tigers\n",
"4 1997.345205 Talks today to avert new health service strike\n",
"... ... ...\n",
"799995 2010.876712 Top league stars among 135 listed online\n",
"799996 2000.879452 Cabinet to consider options for animal disposal\n",
"799997 2004.915068 Last orders for Bewley's this evening\n",
"799998 2014.797260 Toulon; Ospreys and Toulouse win Champions Cup...\n",
"799999 1999.019178 Volatile year in store for the markets\n",
"\n",
"[800000 rows x 2 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x_train"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "422673c2-4de6-446a-816c-1c35ba43c373",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'news': 1, 'sport': 2, 'opinion': 3, 'business': 4, 'culture': 5, 'lifestyle': 6, 'removed': 7}\n"
]
}
],
"source": [
"categories = {}\n",
"\n",
"for i, x in enumerate(data['category'].unique()):\n",
" categories[x] = i+1\n",
"\n",
"print(categories)\n",
" \n",
"data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)\n",
"\n",
"model = vowpalwabbit.Workspace('--oaa 7 --learning_rate 0.99')\n",
"\n",
"for example in data['train_input']:\n",
" model.learn(example)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "29f424f4-19fd-43f9-a8bf-39beb3fc408d",
"metadata": {},
"outputs": [],
"source": [
"prediction('dev-0/in.tsv', 'dev-0/out.tsv', model, categories)\n",
"prediction('test-A/in.tsv', 'test-A/out.tsv', model, categories)\n",
"prediction('test-B/in.tsv', 'test-B/out.tsv', model, categories)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ecf1726c-56ee-4476-bf88-136fa588feec",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook run.ipynb to script\n",
"[NbConvertApp] Writing 2030 bytes to run.py\n"
]
}
],
"source": [
"!jupyter nbconvert --to script run.ipynb"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e00dd4c1-7d79-4b6d-9c59-7c71640e5230",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

94
run.py Normal file
View File

@ -0,0 +1,94 @@
#!/usr/bin/env python
# coding: utf-8
# In[3]:
import vowpalwabbit
import pandas as pd
import re
# In[4]:
def prediction(path_in, path_out, model, categories):
    """Predict a category label for every row of a TSV file.

    Reads `path_in` (tab-separated, no header), drops the unused column 1,
    converts each row to a Vowpal Wabbit example, predicts a numeric class
    with `model`, maps it back to its text label via `categories`, and
    writes one label per line to `path_out`.
    """
    data = pd.read_csv(path_in, header=None, sep='\t')
    data = data.drop(1, axis=1)
    data.columns = ['year', 'text']
    data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)
    # Build the inverse {class_id: label} mapping once; the original
    # rebuilt this dict inside the loop for every single example.
    label_by_id = {value: key for key, value in categories.items()}
    with open(path_out, 'w', encoding='utf-8') as file:
        for example in data['train_input']:
            predicted = model.predict(example)
            # str(...) preserves the original behavior of writing 'None'
            # for a class id that is not present in `categories`.
            file.write(str(label_by_id.get(predicted)) + '\n')
# In[5]:
def to_vowpalwabbit(row, categories):
    """Build a Vowpal Wabbit example string from a single data row."""
    # Clean the headline text: drop newlines, lower-case, keep only the
    # allowed character set, and squeeze repeated spaces.
    headline = row['text'].replace('\n', ' ').lower().strip()
    headline = re.sub("[^a-zA-Z -']", '', headline)
    headline = re.sub(" +", ' ', headline)
    # Missing or unknown categories map to an empty label (the KeyError
    # also covers rows that have no 'category' field at all).
    try:
        class_id = categories[row['category']]
    except KeyError:
        class_id = ''
    return f"{class_id} | year:{row['year']} text:{headline}\n"
# In[10]:
# Load training features (tab-separated, no header); column 1 is an unused
# date column, leaving the normalized year and the headline text.
x_train = pd.read_csv('train/in.tsv', header=None, sep='\t')
x_train = x_train.drop(1, axis=1)
x_train.columns = ['year', 'text']
y_train = pd.read_csv('train/expected.tsv', header=None, sep='\t')
y_train.columns = ['category']
# Cap training at 800k rows to keep training time manageable.
x_train = x_train[0:800000]
# Bug fix: original read `y_train(0:800000)` (a call, which is a syntax
# error); the notebook source shows the intended slice with brackets.
y_train = y_train[0:800000]
data = pd.concat([x_train, y_train], axis=1)
# In[ ]:
# Map each distinct category label to a 1-based class id, as required by
# Vowpal Wabbit's one-against-all (--oaa) multiclass mode.
categories = {}
for i, x in enumerate(data['category'].unique()):
    categories[x] = i+1
print(categories)
data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)
# Derive the class count from the data instead of hard-coding 7, so the
# script keeps working if the label set changes.
model = vowpalwabbit.Workspace(f'--oaa {len(categories)} --learning_rate 0.99')
for example in data['train_input']:
    model.learn(example)
# Write predictions for the dev and test splits.
prediction('dev-0/in.tsv', 'dev-0/out.tsv', model, categories)
prediction('test-A/in.tsv', 'test-A/out.tsv', model, categories)
prediction('test-B/in.tsv', 'test-B/out.tsv', model, categories)
# Notebook-export artifact; only works when run under IPython.
get_ipython().system('jupyter nbconvert --to script run.ipynb')

148308
test-A/in.tsv Normal file

File diff suppressed because it is too large Load Diff

148308
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

79119
test-B/in.tsv Normal file

File diff suppressed because it is too large Load Diff

79119
test-B/out.tsv Normal file

File diff suppressed because it is too large Load Diff

1186898
train/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
train/expected.tsv.xz Normal file

Binary file not shown.

1186898
train/in.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
train/in.tsv.xz Normal file

Binary file not shown.