ready-made
This commit is contained in:
commit
8e8a47dc81
8
.gitignore
vendored
Normal file
8
.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
|
||||
*~
|
||||
*.swp
|
||||
*.bak
|
||||
*.pyc
|
||||
*.o
|
||||
.DS_Store
|
||||
.token
|
6
.ipynb_checkpoints/run-checkpoint.ipynb
Normal file
6
.ipynb_checkpoints/run-checkpoint.ipynb
Normal file
@ -0,0 +1,6 @@
|
||||
{
|
||||
"cells": [],
|
||||
"metadata": {},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
94
.ipynb_checkpoints/run-checkpoint.py
Normal file
94
.ipynb_checkpoints/run-checkpoint.py
Normal file
@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
# In[3]:
|
||||
|
||||
|
||||
import vowpalwabbit
|
||||
import pandas as pd
|
||||
import re
|
||||
|
||||
|
||||
# In[4]:
|
||||
|
||||
|
||||
def prediction(path_in, path_out, model, categories):
    """Predict a category label for every row of a TSV input and write results.

    Reads `path_in` (tab-separated, no header: year, <dropped column>, text),
    converts each row to a vowpalwabbit example via `to_vowpalwabbit`, and
    writes one predicted label (or `None` for unknown ids) per line to
    `path_out`.

    Args:
        path_in: path to the input TSV file (no header row).
        path_out: path of the output file to (over)write, UTF-8 encoded.
        model: trained vowpalwabbit Workspace; predict() returns an int id.
        categories: mapping of label text -> integer id used at training time.
    """
    data = pd.read_csv(path_in, header=None, sep='\t')
    data = data.drop(1, axis=1)
    data.columns = ['year', 'text']

    data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)

    # Hoisted out of the loop: the original rebuilt this inverse mapping for
    # every single example, costing O(len(categories)) per prediction.
    id_to_label = {value: key for key, value in categories.items()}

    with open(path_out, 'w', encoding='utf-8') as file:
        for example in data['train_input']:
            predicted = model.predict(example)
            file.write(str(id_to_label.get(predicted)) + '\n')
|
||||
|
||||
|
||||
# In[5]:
|
||||
|
||||
|
||||
def to_vowpalwabbit(row, categories):
|
||||
text = row['text'].replace('\n', ' ').lower().strip()
|
||||
text = re.sub("[^a-zA-Z -']", '', text)
|
||||
text = re.sub(" +", ' ', text)
|
||||
year = row['year']
|
||||
try:
|
||||
category = categories[row['category']]
|
||||
except KeyError:
|
||||
category = ''
|
||||
|
||||
vw = f"{category} | year:{year} text:{text}\n"
|
||||
|
||||
return vw
|
||||
|
||||
|
||||
# In[10]:
|
||||
|
||||
|
||||
# Load the training features (normalized year + headline text). Column 1 of
# the raw TSV is dropped, matching the layout used by prediction() as well.
x_train = pd.read_csv('train/in.tsv', header=None, sep='\t')
x_train = x_train.drop(1, axis=1)
x_train.columns = ['year', 'text']


y_train = pd.read_csv('train/expected.tsv', header=None, sep='\t')
y_train.columns = ['category']

# Cap at the first 800k rows to bound training time.
x_train = x_train[0:800000]
# Bug fix: was `y_train(0:800000)` — a SyntaxError (call syntax instead of a
# slice); the notebook source uses `y_train[0:800000]`, which is the intent.
y_train = y_train[0:800000]

data = pd.concat([x_train, y_train], axis=1)
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
# Assign each distinct category a 1-based integer id, as required by
# vowpalwabbit's one-against-all (--oaa) multiclass label format.
categories = {
    label: index
    for index, label in enumerate(data['category'].unique(), start=1)
}

print(categories)

# Render every training row into vowpalwabbit's text example format.
data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)

# 7 classes (--oaa 7); aggressive learning rate chosen by the author.
model = vowpalwabbit.Workspace('--oaa 7 --learning_rate 0.99')

# Online learning: feed the examples to the model one at a time.
for example in data['train_input']:
    model.learn(example)
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
# Write predictions for each evaluation split next to its inputs.
for split in ('dev-0', 'test-A', 'test-B'):
    prediction(f'{split}/in.tsv', f'{split}/out.tsv', model, categories)
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
# nbconvert translation of the notebook's `!jupyter nbconvert` shell magic;
# regenerates run.py from run.ipynb (only works when run under IPython).
get_ipython().system('jupyter nbconvert --to script run.ipynb')
|
||||
|
39
README.md
Normal file
39
README.md
Normal file
@ -0,0 +1,39 @@
|
||||
# Ireland news headlines
|
||||
|
||||
# Dataset source and thanks
|
||||
|
||||
Predict the headline category given headline text and year
|
||||
Start Date: 1996-01-01 End Date: 2019-12-31
|
||||
|
||||
|
||||
Dataset taken from https://www.kaggle.com/therohk/ireland-historical-news on 19.06.2020.
|
||||
Special thanks to Rohit Kulkarni who created it.
|
||||
|
||||
You may find whole dataset (including the test dataset) in the link above.
|
||||
The dataset in the link may be updated.
|
||||
Please do not incorporate any of the data from this Kaggle dataset (or others) into your submission in this Gonito challenge.
|
||||
|
||||
## Context (from https://www.kaggle.com/therohk/ireland-historical-news )
|
||||
|
||||
This news dataset is a composition of 1.48 million headlines posted by the Irish Times operating within Ireland.
|
||||
|
||||
Created over 160 years ago, the agency provides a long-term bird's-eye view of the happenings in Europe.
|
||||
|
||||
|
||||
# Challenge creation
|
||||
|
||||
Year is normalized as follows:
|
||||
|
||||
'''
|
||||
days_in_year = 366 if is_leap else 365
|
||||
normalized = d.year + ((day_of_year-1) / days_in_year)
|
||||
'''
|
||||
|
||||
train, dev, test split is 80%, 10%, 10% randomly
|
||||
|
||||
note that there are very similar headlines in the data
|
||||
|
||||
I did not make any effort to prevent from going one sentence like this to the train and second one to the test.
|
||||
|
||||
I used the first category in the classification task. E.g. there is "world" instead of "world.us" as in the original dataset.
|
||||
|
1
config.txt
Normal file
1
config.txt
Normal file
@ -0,0 +1 @@
|
||||
--metric Accuracy --precision 4 -%
|
149134
dev-0/expected.tsv
Normal file
149134
dev-0/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
149134
dev-0/in.tsv
Normal file
149134
dev-0/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
149134
dev-0/out.tsv
Normal file
149134
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
288
run.ipynb
Normal file
288
run.ipynb
Normal file
@ -0,0 +1,288 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "57debdd5-4760-4524-9c77-409652cfb52e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import vowpalwabbit\n",
|
||||
"import pandas as pd\n",
|
||||
"import re"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "48357fb3-9d6f-48d0-9869-bce7a87e3ba1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def prediction(path_in, path_out, model, categories):\n",
|
||||
" data = pd.read_csv(path_in, header=None, sep='\\t')\n",
|
||||
" data = data.drop(1, axis=1)\n",
|
||||
" data.columns = ['year', 'text']\n",
|
||||
"\n",
|
||||
" data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)\n",
|
||||
"\n",
|
||||
" with open(path_out, 'w', encoding='utf-8') as file:\n",
|
||||
" for example in data['train_input']:\n",
|
||||
" predicted = model.predict(example)\n",
|
||||
" text_predicted = dict((value, key) for key, value in categories.items()).get(predicted)\n",
|
||||
" file.write(str(text_predicted) + '\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "f47c4ff9-2078-43f7-b06c-99c9fdd2022e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def to_vowpalwabbit(row, categories):\n",
|
||||
" text = row['text'].replace('\\n', ' ').lower().strip()\n",
|
||||
" text = re.sub(\"[^a-zA-Z -']\", '', text)\n",
|
||||
" text = re.sub(\" +\", ' ', text)\n",
|
||||
" year = row['year']\n",
|
||||
" try:\n",
|
||||
" category = categories[row['category']]\n",
|
||||
" except KeyError:\n",
|
||||
" category = ''\n",
|
||||
"\n",
|
||||
" vw = f\"{category} | year:{year} text:{text}\\n\"\n",
|
||||
"\n",
|
||||
" return vw"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "015a5ccb-8fe0-45cf-bf59-416ce9e59dad",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"x_train = pd.read_csv('train/in.tsv', header=None, sep='\\t')\n",
|
||||
"x_train = x_train.drop(1, axis=1)\n",
|
||||
"x_train.columns = ['year', 'text']\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"y_train = pd.read_csv('train/expected.tsv', header=None, sep='\\t')\n",
|
||||
"y_train.columns = ['category']\n",
|
||||
"\n",
|
||||
"x_train = x_train[0:800000]\n",
|
||||
"y_train = y_train[0:800000]\n",
|
||||
"\n",
|
||||
"data = pd.concat([x_train, y_train], axis=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "d8a2c80c-3b93-410d-98e4-ac651d0933a2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>year</th>\n",
|
||||
" <th>text</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>2004.508197</td>\n",
|
||||
" <td>Sudan claims it is disarming militias</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2008.442623</td>\n",
|
||||
" <td>Bluffer's guide to Euro 2008</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2012.587432</td>\n",
|
||||
" <td>Ennis tallies her highest first day total</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2009.071233</td>\n",
|
||||
" <td>Sri Lanka continues to battle Tamil Tigers</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>1997.345205</td>\n",
|
||||
" <td>Talks today to avert new health service strike</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>799995</th>\n",
|
||||
" <td>2010.876712</td>\n",
|
||||
" <td>Top league stars among 135 listed online</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>799996</th>\n",
|
||||
" <td>2000.879452</td>\n",
|
||||
" <td>Cabinet to consider options for animal disposal</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>799997</th>\n",
|
||||
" <td>2004.915068</td>\n",
|
||||
" <td>Last orders for Bewley's this evening</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>799998</th>\n",
|
||||
" <td>2014.797260</td>\n",
|
||||
" <td>Toulon; Ospreys and Toulouse win Champions Cup...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>799999</th>\n",
|
||||
" <td>1999.019178</td>\n",
|
||||
" <td>Volatile year in store for the markets</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>800000 rows × 2 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" year text\n",
|
||||
"0 2004.508197 Sudan claims it is disarming militias\n",
|
||||
"1 2008.442623 Bluffer's guide to Euro 2008\n",
|
||||
"2 2012.587432 Ennis tallies her highest first day total\n",
|
||||
"3 2009.071233 Sri Lanka continues to battle Tamil Tigers\n",
|
||||
"4 1997.345205 Talks today to avert new health service strike\n",
|
||||
"... ... ...\n",
|
||||
"799995 2010.876712 Top league stars among 135 listed online\n",
|
||||
"799996 2000.879452 Cabinet to consider options for animal disposal\n",
|
||||
"799997 2004.915068 Last orders for Bewley's this evening\n",
|
||||
"799998 2014.797260 Toulon; Ospreys and Toulouse win Champions Cup...\n",
|
||||
"799999 1999.019178 Volatile year in store for the markets\n",
|
||||
"\n",
|
||||
"[800000 rows x 2 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"x_train"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "422673c2-4de6-446a-816c-1c35ba43c373",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{'news': 1, 'sport': 2, 'opinion': 3, 'business': 4, 'culture': 5, 'lifestyle': 6, 'removed': 7}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"categories = {}\n",
|
||||
"\n",
|
||||
"for i, x in enumerate(data['category'].unique()):\n",
|
||||
" categories[x] = i+1\n",
|
||||
"\n",
|
||||
"print(categories)\n",
|
||||
" \n",
|
||||
"data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)\n",
|
||||
"\n",
|
||||
"model = vowpalwabbit.Workspace('--oaa 7 --learning_rate 0.99')\n",
|
||||
"\n",
|
||||
"for example in data['train_input']:\n",
|
||||
" model.learn(example)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "29f424f4-19fd-43f9-a8bf-39beb3fc408d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"prediction('dev-0/in.tsv', 'dev-0/out.tsv', model, categories)\n",
|
||||
"prediction('test-A/in.tsv', 'test-A/out.tsv', model, categories)\n",
|
||||
"prediction('test-B/in.tsv', 'test-B/out.tsv', model, categories)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "ecf1726c-56ee-4476-bf88-136fa588feec",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[NbConvertApp] Converting notebook run.ipynb to script\n",
|
||||
"[NbConvertApp] Writing 2030 bytes to run.py\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!jupyter nbconvert --to script run.ipynb"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "e00dd4c1-7d79-4b6d-9c59-7c71640e5230",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.12"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
94
run.py
Normal file
94
run.py
Normal file
@ -0,0 +1,94 @@
|
||||
#!/usr/bin/env python
|
||||
# coding: utf-8
|
||||
|
||||
# In[3]:
|
||||
|
||||
|
||||
import vowpalwabbit
|
||||
import pandas as pd
|
||||
import re
|
||||
|
||||
|
||||
# In[4]:
|
||||
|
||||
|
||||
def prediction(path_in, path_out, model, categories):
    """Predict a category label for every row of a TSV input and write results.

    Reads `path_in` (tab-separated, no header: year, <dropped column>, text),
    converts each row to a vowpalwabbit example via `to_vowpalwabbit`, and
    writes one predicted label (or `None` for unknown ids) per line to
    `path_out`.

    Args:
        path_in: path to the input TSV file (no header row).
        path_out: path of the output file to (over)write, UTF-8 encoded.
        model: trained vowpalwabbit Workspace; predict() returns an int id.
        categories: mapping of label text -> integer id used at training time.
    """
    data = pd.read_csv(path_in, header=None, sep='\t')
    data = data.drop(1, axis=1)
    data.columns = ['year', 'text']

    data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)

    # Hoisted out of the loop: the original rebuilt this inverse mapping for
    # every single example, costing O(len(categories)) per prediction.
    id_to_label = {value: key for key, value in categories.items()}

    with open(path_out, 'w', encoding='utf-8') as file:
        for example in data['train_input']:
            predicted = model.predict(example)
            file.write(str(id_to_label.get(predicted)) + '\n')
|
||||
|
||||
|
||||
# In[5]:
|
||||
|
||||
|
||||
def to_vowpalwabbit(row, categories):
|
||||
text = row['text'].replace('\n', ' ').lower().strip()
|
||||
text = re.sub("[^a-zA-Z -']", '', text)
|
||||
text = re.sub(" +", ' ', text)
|
||||
year = row['year']
|
||||
try:
|
||||
category = categories[row['category']]
|
||||
except KeyError:
|
||||
category = ''
|
||||
|
||||
vw = f"{category} | year:{year} text:{text}\n"
|
||||
|
||||
return vw
|
||||
|
||||
|
||||
# In[10]:
|
||||
|
||||
|
||||
# Load the training features (normalized year + headline text). Column 1 of
# the raw TSV is dropped, matching the layout used by prediction() as well.
x_train = pd.read_csv('train/in.tsv', header=None, sep='\t')
x_train = x_train.drop(1, axis=1)
x_train.columns = ['year', 'text']


y_train = pd.read_csv('train/expected.tsv', header=None, sep='\t')
y_train.columns = ['category']

# Cap at the first 800k rows to bound training time.
x_train = x_train[0:800000]
# Bug fix: was `y_train(0:800000)` — a SyntaxError (call syntax instead of a
# slice); the notebook source uses `y_train[0:800000]`, which is the intent.
y_train = y_train[0:800000]

data = pd.concat([x_train, y_train], axis=1)
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
# Assign each distinct category a 1-based integer id, as required by
# vowpalwabbit's one-against-all (--oaa) multiclass label format.
categories = {
    label: index
    for index, label in enumerate(data['category'].unique(), start=1)
}

print(categories)

# Render every training row into vowpalwabbit's text example format.
data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)

# 7 classes (--oaa 7); aggressive learning rate chosen by the author.
model = vowpalwabbit.Workspace('--oaa 7 --learning_rate 0.99')

# Online learning: feed the examples to the model one at a time.
for example in data['train_input']:
    model.learn(example)
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
# Write predictions for each evaluation split next to its inputs.
for split in ('dev-0', 'test-A', 'test-B'):
    prediction(f'{split}/in.tsv', f'{split}/out.tsv', model, categories)
|
||||
|
||||
|
||||
# In[ ]:
|
||||
|
||||
|
||||
# nbconvert translation of the notebook's `!jupyter nbconvert` shell magic;
# regenerates run.py from run.ipynb (only works when run under IPython).
get_ipython().system('jupyter nbconvert --to script run.ipynb')
|
||||
|
148308
test-A/in.tsv
Normal file
148308
test-A/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
148308
test-A/out.tsv
Normal file
148308
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
79119
test-B/in.tsv
Normal file
79119
test-B/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
79119
test-B/out.tsv
Normal file
79119
test-B/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
1186898
train/expected.tsv
Normal file
1186898
train/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
train/expected.tsv.xz
Normal file
BIN
train/expected.tsv.xz
Normal file
Binary file not shown.
1186898
train/in.tsv
Normal file
1186898
train/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
train/in.tsv.xz
Normal file
BIN
train/in.tsv.xz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user