s478874
This commit is contained in: parent ff924ca4e1, commit 3796beea7d
8  .gitignore  vendored  Normal file
@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token
39  README.md  Normal file
@ -0,0 +1,39 @@
# Ireland news headlines

Predict the headline category given the headline text and year.

Start Date: 1996-01-01 End Date: 2019-12-31

# Dataset source and thanks

Dataset taken from https://www.kaggle.com/therohk/ireland-historical-news on 19.06.2020.
Special thanks to Rohit Kulkarni who created it.

You may find the whole dataset (including the test dataset) at the link above.
The dataset at the link may be updated.
Please do not incorporate any of the data from this Kaggle dataset (or others) into your submission in this Gonito challenge.

## Context (from https://www.kaggle.com/therohk/ireland-historical-news )

This news dataset is a composition of 1.48 million headlines posted by the Irish Times operating within Ireland.
Created over 160 years ago, the agency provides a long-term bird's-eye view of the happenings in Europe.

# Challenge creation

Year is normalized as follows:
```python
days_in_year = 366 if is_leap else 365
normalized = d.year + ((day_of_year - 1) / days_in_year)
```
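
For concreteness, here is a minimal sketch of how this normalization could be computed for a single date with Python's standard `datetime` and `calendar` modules; the names `d`, `is_leap` and `day_of_year` in the snippet above are assumed to come from such a computation:

```python
import calendar
from datetime import date

def normalize_year(d: date) -> float:
    """Map a date to a fractional year, matching the formula above."""
    is_leap = calendar.isleap(d.year)
    days_in_year = 366 if is_leap else 365
    day_of_year = d.timetuple().tm_yday  # 1-based index of the day within its year
    return d.year + ((day_of_year - 1) / days_in_year)

# e.g. 1996-07-01 (a leap year): 1996 + 182/366 ≈ 1996.4973
print(normalize_year(date(1996, 7, 1)))
```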

The train, dev, test split is 80%, 10%, 10%, made randomly.

Note that there are very similar headlines in the data.
I did not make any effort to prevent one such headline from going to the train set and a nearly identical one to the test set.

I used only the first category component in the classification task, e.g. there is "world" instead of "world.us" as in the original dataset.
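
For illustration only, a sketch of these two preprocessing steps (truncating to the first category component and making a random 80/10/10 split); this is not the exact script used to build the challenge, and the Kaggle file name and column names are assumptions:

```python
import pandas as pd

# hypothetical raw Kaggle export; the file and column names are assumed for illustration
df = pd.read_csv('irishtimes-date-text.csv')

# keep only the first category component, e.g. "world.us" -> "world"
df['headline_category'] = df['headline_category'].str.split('.').str[0]

# shuffle, then cut into 80% train / 10% dev / 10% test
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)
n = len(df)
train = df.iloc[:int(0.8 * n)]
dev = df.iloc[int(0.8 * n):int(0.9 * n)]
test = df.iloc[int(0.9 * n):]
```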
1  config.txt  Normal file
@ -0,0 +1 @@
--metric Accuracy --precision 4 -%
149134  dev-0/expected.tsv  Normal file
File diff suppressed because it is too large
149134  dev-0/in.tsv  Normal file
File diff suppressed because it is too large
149134  dev-0/out.tsv  Normal file
File diff suppressed because it is too large
@ -1 +0,0 @@
Subproject commit 97421a97eea8e2421b4d8ceb985974d22058e818
57  run.py  Normal file
@ -0,0 +1,57 @@
import pandas as pd
import re
import vowpalwabbit
from sklearn.preprocessing import LabelEncoder


def format_data(data):
    # Keep only ASCII letters, spaces and dashes; flatten newlines and lowercase.
    return [
        re.sub('[^a-zA-Z -]', '', str(i[0]).replace('\n', ' ').lower().strip())
        for i in data.values
    ]


def load_train_data():
    # Column 2 of train/in.tsv holds the headline text.
    in_df = pd.read_csv('train/in.tsv',
                        delimiter='\t',
                        usecols=[2],
                        names=['text'])
    exp_df = pd.read_csv('train/expected.tsv',
                         delimiter='\t',
                         names=['class'])

    fit_list = list(exp_df['class'].unique())
    fit_list.sort()
    print("Categories: ", fit_list)

    # Vowpal Wabbit's --oaa expects labels 1..N, while LabelEncoder yields 0..N-1.
    label_enc = LabelEncoder()
    exp_df['class'] = label_enc.fit_transform(exp_df['class']) + 1

    return format_data(in_df), exp_df, label_enc


def predict(data, model):
    # Use the same example format as during training; shift labels back to 0..N-1.
    return [model.predict(f' | text:{i}\n') - 1 for i in data]


def write_res(data, path):
    with open(path, 'w') as f:
        for line in data:
            f.write(f'{line}\n')
    print(f"Data written to {path}")


def main():
    in_df, exp_df, label_enc = load_train_data()

    # One-against-all classifier over the 7 headline categories.
    wabbit = vowpalwabbit.Workspace('--oaa 7')
    for text, category in zip(in_df, exp_df['class']):
        wabbit.learn(f'{category} | text:{text}\n')

    for path in ['dev-0/', 'test-A/', 'test-B/']:
        to_predict = format_data(pd.read_csv(f'{path}in.tsv', delimiter='\t', names=['text']))
        predictions = label_enc.inverse_transform(predict(to_predict, wabbit))
        write_res(predictions, f'{path}out.tsv')


if __name__ == '__main__':
    main()
148308  test-A/in.tsv  Normal file
File diff suppressed because it is too large
148308  test-A/out.tsv  Normal file
File diff suppressed because it is too large
79119  test-B/in.tsv  Normal file
File diff suppressed because it is too large
79119  test-B/out.tsv  Normal file
File diff suppressed because it is too large
1186898  train/expected.tsv  Normal file
File diff suppressed because it is too large
BIN  train/expected.tsv.xz  Normal file
Binary file not shown.
1186898  train/in.tsv  Normal file
File diff suppressed because it is too large
BIN  train/in.tsv.xz  Normal file
Binary file not shown.