s478874
This commit is contained in: parent ff924ca4e1, commit 3796beea7d
8  .gitignore  vendored  Normal file
@ -0,0 +1,8 @@
*~
*.swp
*.bak
*.pyc
*.o
.DS_Store
.token
39  README.md  Normal file
@ -0,0 +1,39 @@
# Ireland news headlines

Predict the headline category given the headline text and year.

Start Date: 1996-01-01 End Date: 2019-12-31

# Dataset source and thanks

Dataset taken from https://www.kaggle.com/therohk/ireland-historical-news on 19.06.2020.
Special thanks to Rohit Kulkarni who created it.

You may find the whole dataset (including the test dataset) at the link above.
The dataset at the link may be updated.
Please do not incorporate any of the data from this Kaggle dataset (or others) into your submission in this Gonito challenge.

## Context (from https://www.kaggle.com/therohk/ireland-historical-news )

This news dataset is a composition of 1.48 million headlines posted by the Irish Times operating within Ireland.
Created over 160 years ago, the agency provides a long-term bird's-eye view of the happenings in Europe.

# Challenge creation

Year is normalized as follows:
```python
days_in_year = 366 if is_leap else 365
normalized = d.year + ((day_of_year - 1) / days_in_year)
```
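
For concreteness, here is a minimal sketch of how this normalization could be computed for a single date with Python's standard `datetime` and `calendar` modules; the names `d`, `is_leap` and `day_of_year` in the snippet above are assumed to come from such a computation:

```python
import calendar
from datetime import date

def normalize_year(d: date) -> float:
    """Map a date to a fractional year, matching the formula above."""
    is_leap = calendar.isleap(d.year)
    days_in_year = 366 if is_leap else 365
    day_of_year = d.timetuple().tm_yday  # 1-based index of the day within its year
    return d.year + ((day_of_year - 1) / days_in_year)

# e.g. 1996-07-01 (a leap year): 1996 + 182/366 ≈ 1996.4973
print(normalize_year(date(1996, 7, 1)))
```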

The train, dev, test split is 80%, 10%, 10%, made randomly.

Note that there are very similar headlines in the data.
I did not make any effort to prevent one such headline from going to the train set and a nearly identical one to the test set.

I used only the first category component in the classification task, e.g. there is "world" instead of "world.us" as in the original dataset.
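
For illustration only, a sketch of these two preprocessing steps (truncating to the first category component and making a random 80/10/10 split); this is not the exact script used to build the challenge, and the Kaggle file name and column names are assumptions:

```python
import pandas as pd

# hypothetical raw Kaggle export; the file and column names are assumed for illustration
df = pd.read_csv('irishtimes-date-text.csv')

# keep only the first category component, e.g. "world.us" -> "world"
df['headline_category'] = df['headline_category'].str.split('.').str[0]

# shuffle, then cut into 80% train / 10% dev / 10% test
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)
n = len(df)
train = df.iloc[:int(0.8 * n)]
dev = df.iloc[int(0.8 * n):int(0.9 * n)]
test = df.iloc[int(0.9 * n):]
```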
1  config.txt  Normal file
@ -0,0 +1 @@
--metric Accuracy --precision 4 -%
149134  dev-0/expected.tsv  Normal file
File diff suppressed because it is too large
149134  dev-0/in.tsv  Normal file
File diff suppressed because it is too large
149134  dev-0/out.tsv  Normal file
File diff suppressed because it is too large
@ -1 +0,0 @@
Subproject commit 97421a97eea8e2421b4d8ceb985974d22058e818
57  run.py  Normal file
@ -0,0 +1,57 @@
import pandas as pd
import re
import vowpalwabbit
from sklearn.preprocessing import LabelEncoder


def format_data(data):
    # Keep only ASCII letters, spaces and dashes; flatten newlines and lowercase.
    return [
        re.sub('[^a-zA-Z -]', '', str(i[0]).replace('\n', ' ').lower().strip())
        for i in data.values
    ]


def load_train_data():
    # Column 2 of train/in.tsv holds the headline text.
    in_df = pd.read_csv('train/in.tsv',
                        delimiter='\t',
                        usecols=[2],
                        names=['text'])
    exp_df = pd.read_csv('train/expected.tsv',
                         delimiter='\t',
                         names=['class'])

    fit_list = list(exp_df['class'].unique())
    fit_list.sort()
    print("Categories: ", fit_list)

    # Vowpal Wabbit's --oaa expects labels 1..N, while LabelEncoder yields 0..N-1.
    label_enc = LabelEncoder()
    exp_df['class'] = label_enc.fit_transform(exp_df['class']) + 1

    return format_data(in_df), exp_df, label_enc


def predict(data, model):
    # Use the same example format as during training; shift labels back to 0..N-1.
    return [model.predict(f' | text:{i}\n') - 1 for i in data]


def write_res(data, path):
    with open(path, 'w') as f:
        for line in data:
            f.write(f'{line}\n')
    print(f"Data written to {path}")


def main():
    in_df, exp_df, label_enc = load_train_data()

    # One-against-all classifier over the 7 headline categories.
    wabbit = vowpalwabbit.Workspace('--oaa 7')
    for text, category in zip(in_df, exp_df['class']):
        wabbit.learn(f'{category} | text:{text}\n')

    for path in ['dev-0/', 'test-A/', 'test-B/']:
        to_predict = format_data(pd.read_csv(f'{path}in.tsv', delimiter='\t', names=['text']))
        predictions = label_enc.inverse_transform(predict(to_predict, wabbit))
        write_res(predictions, f'{path}out.tsv')


if __name__ == '__main__':
    main()
148308  test-A/in.tsv  Normal file
File diff suppressed because it is too large
148308  test-A/out.tsv  Normal file
File diff suppressed because it is too large
79119  test-B/in.tsv  Normal file
File diff suppressed because it is too large
79119  test-B/out.tsv  Normal file
File diff suppressed because it is too large
1186898  train/expected.tsv  Normal file
File diff suppressed because it is too large
BIN  train/expected.tsv.xz  Normal file
Binary file not shown.
1186898  train/in.tsv  Normal file
File diff suppressed because it is too large
BIN  train/in.tsv.xz  Normal file
Binary file not shown.