ready-made

This commit is contained in:
korne 2022-06-14 23:05:20 +02:00
commit 8e8a47dc81
19 changed files with 3276583 additions and 0 deletions

8
.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token

View File

@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,94 @@
#!/usr/bin/env python
# coding: utf-8
# In[3]:
import vowpalwabbit
import pandas as pd
import re
# In[4]:
def prediction(path_in, path_out, model, categories):
    """Predict a category label for every row of a TSV file.

    Reads `path_in` (tab-separated, no header), drops the unused column 1,
    converts each row to a Vowpal Wabbit example, predicts a numeric class
    with `model`, maps it back to its text label via `categories`, and
    writes one label per line to `path_out`.
    """
    data = pd.read_csv(path_in, header=None, sep='\t')
    data = data.drop(1, axis=1)
    data.columns = ['year', 'text']
    data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)
    # Build the inverse {class_id: label} mapping once; the original
    # rebuilt this dict inside the loop for every single example.
    label_by_id = {value: key for key, value in categories.items()}
    with open(path_out, 'w', encoding='utf-8') as file:
        for example in data['train_input']:
            predicted = model.predict(example)
            # str(...) preserves the original behavior of writing 'None'
            # for a class id that is not present in `categories`.
            file.write(str(label_by_id.get(predicted)) + '\n')
# In[5]:
def to_vowpalwabbit(row, categories):
    """Format one dataframe row as a Vowpal Wabbit example line."""
    # Normalise the headline: single line, lower case, restricted charset,
    # collapsed whitespace.
    cleaned = row['text'].replace('\n', ' ').lower().strip()
    cleaned = re.sub("[^a-zA-Z -']", '', cleaned)
    cleaned = re.sub(" +", ' ', cleaned)
    # Rows without a known category (e.g. at prediction time) get an
    # empty label; the KeyError also covers rows lacking the column.
    try:
        label = categories[row['category']]
    except KeyError:
        label = ''
    return f"{label} | year:{row['year']} text:{cleaned}\n"
# In[10]:
# Load training features (tab-separated, no header); column 1 is an unused
# date column, leaving the normalized year and the headline text.
x_train = pd.read_csv('train/in.tsv', header=None, sep='\t')
x_train = x_train.drop(1, axis=1)
x_train.columns = ['year', 'text']
y_train = pd.read_csv('train/expected.tsv', header=None, sep='\t')
y_train.columns = ['category']
# Cap training at 800k rows to keep training time manageable.
x_train = x_train[0:800000]
# Bug fix: original read `y_train(0:800000)` (a call, which is a syntax
# error); the notebook source shows the intended slice with brackets.
y_train = y_train[0:800000]
data = pd.concat([x_train, y_train], axis=1)
# In[ ]:
# Map each distinct category label to a 1-based class id, as required by
# Vowpal Wabbit's one-against-all (--oaa) multiclass mode.
categories = {}
for i, x in enumerate(data['category'].unique()):
    categories[x] = i+1
print(categories)
data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)
# Derive the class count from the data instead of hard-coding 7, so the
# script keeps working if the label set changes.
model = vowpalwabbit.Workspace(f'--oaa {len(categories)} --learning_rate 0.99')
for example in data['train_input']:
    model.learn(example)
# Write predictions for the dev and test splits.
prediction('dev-0/in.tsv', 'dev-0/out.tsv', model, categories)
prediction('test-A/in.tsv', 'test-A/out.tsv', model, categories)
prediction('test-B/in.tsv', 'test-B/out.tsv', model, categories)
# Notebook-export artifact; only works when run under IPython.
get_ipython().system('jupyter nbconvert --to script run.ipynb')

39
README.md Normal file
View File

@ -0,0 +1,39 @@
# Ireland news headlines
# Dataset source and thanks
Predict the headline category given headline text and year
Start Date: 1996-01-01 End Date: 2019-12-31
Dataset taken from https://www.kaggle.com/therohk/ireland-historical-news on 19.06.2020.
Special thanks to Rohit Kulkarni who created it.
You may find whole dataset (including the test dataset) in the link above.
The dataset in the link may be updated.
Please do not incorporate any of the data from this kaggle dataset (or others) into your submission in this gonito challenge.
## Context (from https://www.kaggle.com/therohk/ireland-historical-news )
This news dataset is a composition of 1.48 million headlines posted by the Irish Times operating within Ireland.
Created over 160 years ago, the agency can provide a long-term bird's-eye view of the happenings in Europe.
# Challenge creation
Year is normalized as follows:
```
days_in_year = 366 if is_leap else 365
normalized = d.year + ((day_of_year-1) / days_in_year)
```
train, dev, test split is 80%, 10%, 10% randomly
note that there are very similar headlines in the data
I did not make any effort to prevent one such sentence from going into the train split and a very similar one into the test split.
I used the first category in the classification task. E.g. there is "world" instead of "world.us" as in the original dataset.

1
config.txt Normal file
View File

@ -0,0 +1 @@
--metric Accuracy --precision 4 -%

149134
dev-0/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

149134
dev-0/in.tsv Normal file

File diff suppressed because it is too large Load Diff

149134
dev-0/out.tsv Normal file

File diff suppressed because it is too large Load Diff

1
names Normal file
View File

@ -0,0 +1 @@
year_normalized date text

288
run.ipynb Normal file
View File

@ -0,0 +1,288 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "57debdd5-4760-4524-9c77-409652cfb52e",
"metadata": {},
"outputs": [],
"source": [
"import vowpalwabbit\n",
"import pandas as pd\n",
"import re"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "48357fb3-9d6f-48d0-9869-bce7a87e3ba1",
"metadata": {},
"outputs": [],
"source": [
"def prediction(path_in, path_out, model, categories):\n",
" data = pd.read_csv(path_in, header=None, sep='\\t')\n",
" data = data.drop(1, axis=1)\n",
" data.columns = ['year', 'text']\n",
"\n",
" data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)\n",
"\n",
" with open(path_out, 'w', encoding='utf-8') as file:\n",
" for example in data['train_input']:\n",
" predicted = model.predict(example)\n",
" text_predicted = dict((value, key) for key, value in categories.items()).get(predicted)\n",
" file.write(str(text_predicted) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f47c4ff9-2078-43f7-b06c-99c9fdd2022e",
"metadata": {},
"outputs": [],
"source": [
"def to_vowpalwabbit(row, categories):\n",
" text = row['text'].replace('\\n', ' ').lower().strip()\n",
" text = re.sub(\"[^a-zA-Z -']\", '', text)\n",
" text = re.sub(\" +\", ' ', text)\n",
" year = row['year']\n",
" try:\n",
" category = categories[row['category']]\n",
" except KeyError:\n",
" category = ''\n",
"\n",
" vw = f\"{category} | year:{year} text:{text}\\n\"\n",
"\n",
" return vw"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "015a5ccb-8fe0-45cf-bf59-416ce9e59dad",
"metadata": {},
"outputs": [],
"source": [
"x_train = pd.read_csv('train/in.tsv', header=None, sep='\\t')\n",
"x_train = x_train.drop(1, axis=1)\n",
"x_train.columns = ['year', 'text']\n",
"\n",
"\n",
"y_train = pd.read_csv('train/expected.tsv', header=None, sep='\\t')\n",
"y_train.columns = ['category']\n",
"\n",
"x_train = x_train[0:800000]\n",
"y_train = y_train[0:800000]\n",
"\n",
"data = pd.concat([x_train, y_train], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "d8a2c80c-3b93-410d-98e4-ac651d0933a2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>year</th>\n",
" <th>text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2004.508197</td>\n",
" <td>Sudan claims it is disarming militias</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2008.442623</td>\n",
" <td>Bluffer's guide to Euro 2008</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2012.587432</td>\n",
" <td>Ennis tallies her highest first day total</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2009.071233</td>\n",
" <td>Sri Lanka continues to battle Tamil Tigers</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1997.345205</td>\n",
" <td>Talks today to avert new health service strike</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>799995</th>\n",
" <td>2010.876712</td>\n",
" <td>Top league stars among 135 listed online</td>\n",
" </tr>\n",
" <tr>\n",
" <th>799996</th>\n",
" <td>2000.879452</td>\n",
" <td>Cabinet to consider options for animal disposal</td>\n",
" </tr>\n",
" <tr>\n",
" <th>799997</th>\n",
" <td>2004.915068</td>\n",
" <td>Last orders for Bewley's this evening</td>\n",
" </tr>\n",
" <tr>\n",
" <th>799998</th>\n",
" <td>2014.797260</td>\n",
" <td>Toulon; Ospreys and Toulouse win Champions Cup...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>799999</th>\n",
" <td>1999.019178</td>\n",
" <td>Volatile year in store for the markets</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>800000 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" year text\n",
"0 2004.508197 Sudan claims it is disarming militias\n",
"1 2008.442623 Bluffer's guide to Euro 2008\n",
"2 2012.587432 Ennis tallies her highest first day total\n",
"3 2009.071233 Sri Lanka continues to battle Tamil Tigers\n",
"4 1997.345205 Talks today to avert new health service strike\n",
"... ... ...\n",
"799995 2010.876712 Top league stars among 135 listed online\n",
"799996 2000.879452 Cabinet to consider options for animal disposal\n",
"799997 2004.915068 Last orders for Bewley's this evening\n",
"799998 2014.797260 Toulon; Ospreys and Toulouse win Champions Cup...\n",
"799999 1999.019178 Volatile year in store for the markets\n",
"\n",
"[800000 rows x 2 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"x_train"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "422673c2-4de6-446a-816c-1c35ba43c373",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'news': 1, 'sport': 2, 'opinion': 3, 'business': 4, 'culture': 5, 'lifestyle': 6, 'removed': 7}\n"
]
}
],
"source": [
"categories = {}\n",
"\n",
"for i, x in enumerate(data['category'].unique()):\n",
" categories[x] = i+1\n",
"\n",
"print(categories)\n",
" \n",
"data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)\n",
"\n",
"model = vowpalwabbit.Workspace('--oaa 7 --learning_rate 0.99')\n",
"\n",
"for example in data['train_input']:\n",
" model.learn(example)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "29f424f4-19fd-43f9-a8bf-39beb3fc408d",
"metadata": {},
"outputs": [],
"source": [
"prediction('dev-0/in.tsv', 'dev-0/out.tsv', model, categories)\n",
"prediction('test-A/in.tsv', 'test-A/out.tsv', model, categories)\n",
"prediction('test-B/in.tsv', 'test-B/out.tsv', model, categories)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "ecf1726c-56ee-4476-bf88-136fa588feec",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[NbConvertApp] Converting notebook run.ipynb to script\n",
"[NbConvertApp] Writing 2030 bytes to run.py\n"
]
}
],
"source": [
"!jupyter nbconvert --to script run.ipynb"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e00dd4c1-7d79-4b6d-9c59-7c71640e5230",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

94
run.py Normal file
View File

@ -0,0 +1,94 @@
#!/usr/bin/env python
# coding: utf-8
# In[3]:
import vowpalwabbit
import pandas as pd
import re
# In[4]:
def prediction(path_in, path_out, model, categories):
    """Predict a category label for every row of a TSV file.

    Reads `path_in` (tab-separated, no header), drops the unused column 1,
    converts each row to a Vowpal Wabbit example, predicts a numeric class
    with `model`, maps it back to its text label via `categories`, and
    writes one label per line to `path_out`.
    """
    data = pd.read_csv(path_in, header=None, sep='\t')
    data = data.drop(1, axis=1)
    data.columns = ['year', 'text']
    data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)
    # Build the inverse {class_id: label} mapping once; the original
    # rebuilt this dict inside the loop for every single example.
    label_by_id = {value: key for key, value in categories.items()}
    with open(path_out, 'w', encoding='utf-8') as file:
        for example in data['train_input']:
            predicted = model.predict(example)
            # str(...) preserves the original behavior of writing 'None'
            # for a class id that is not present in `categories`.
            file.write(str(label_by_id.get(predicted)) + '\n')
# In[5]:
def to_vowpalwabbit(row, categories):
    """Build a Vowpal Wabbit example string from a single data row."""
    # Clean the headline text: drop newlines, lower-case, keep only the
    # allowed character set, and squeeze repeated spaces.
    headline = row['text'].replace('\n', ' ').lower().strip()
    headline = re.sub("[^a-zA-Z -']", '', headline)
    headline = re.sub(" +", ' ', headline)
    # Missing or unknown categories map to an empty label (the KeyError
    # also covers rows that have no 'category' field at all).
    try:
        class_id = categories[row['category']]
    except KeyError:
        class_id = ''
    return f"{class_id} | year:{row['year']} text:{headline}\n"
# In[10]:
# Load training features (tab-separated, no header); column 1 is an unused
# date column, leaving the normalized year and the headline text.
x_train = pd.read_csv('train/in.tsv', header=None, sep='\t')
x_train = x_train.drop(1, axis=1)
x_train.columns = ['year', 'text']
y_train = pd.read_csv('train/expected.tsv', header=None, sep='\t')
y_train.columns = ['category']
# Cap training at 800k rows to keep training time manageable.
x_train = x_train[0:800000]
# Bug fix: original read `y_train(0:800000)` (a call, which is a syntax
# error); the notebook source shows the intended slice with brackets.
y_train = y_train[0:800000]
data = pd.concat([x_train, y_train], axis=1)
# In[ ]:
# Map each distinct category label to a 1-based class id, as required by
# Vowpal Wabbit's one-against-all (--oaa) multiclass mode.
categories = {}
for i, x in enumerate(data['category'].unique()):
    categories[x] = i+1
print(categories)
data['train_input'] = data.apply(lambda row: to_vowpalwabbit(row, categories), axis=1)
# Derive the class count from the data instead of hard-coding 7, so the
# script keeps working if the label set changes.
model = vowpalwabbit.Workspace(f'--oaa {len(categories)} --learning_rate 0.99')
for example in data['train_input']:
    model.learn(example)
# Write predictions for the dev and test splits.
prediction('dev-0/in.tsv', 'dev-0/out.tsv', model, categories)
prediction('test-A/in.tsv', 'test-A/out.tsv', model, categories)
prediction('test-B/in.tsv', 'test-B/out.tsv', model, categories)
# Notebook-export artifact; only works when run under IPython.
get_ipython().system('jupyter nbconvert --to script run.ipynb')

148308
test-A/in.tsv Normal file

File diff suppressed because it is too large Load Diff

148308
test-A/out.tsv Normal file

File diff suppressed because it is too large Load Diff

79119
test-B/in.tsv Normal file

File diff suppressed because it is too large Load Diff

79119
test-B/out.tsv Normal file

File diff suppressed because it is too large Load Diff

1186898
train/expected.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
train/expected.tsv.xz Normal file

Binary file not shown.

1186898
train/in.tsv Normal file

File diff suppressed because it is too large Load Diff

BIN
train/in.tsv.xz Normal file

Binary file not shown.