s478874

2022-06-06 22:29:28 +02:00 · 2022-06-06 22:29:28 +02:00 · 3796beea7d
commit 3796beea7d
parent ff924ca4e1
17 changed files with 3276158 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,8 @@
+
+*~
+*.swp
+*.bak
+*.pyc
+*.o
+.DS_Store
+.token
--- a/README.md
+++ b/README.md
@ -0,0 +1,39 @@
+# Ireland news headlines
+
+# Dataset source and thanks
+
+Predict the headline category given the headline text and year.
+Start Date: 1996-01-01 End Date: 2019-12-31
+
+
+Dataset taken from https://www.kaggle.com/therohk/ireland-historical-news on 19.06.2020.
+Special thanks to Rohit Kulkarni who created it.
+
+You may find the whole dataset (including the test dataset) at the link above.
+The dataset in the link may be updated.
+Please do not incorporate any of the data from this kaggle dataset (or others) into your submission in this gonito challenge.
+
+## Context (from https://www.kaggle.com/therohk/ireland-historical-news )
+
+This news dataset is a composition of 1.48 million headlines posted by the Irish Times operating within Ireland.
+
+Created over 160 years ago, the agency can provide a long-term bird's-eye view of the happenings in Europe.
+
+
+# Challenge creation
+
+Year is normalized as follows:
+
+```
+    days_in_year = 366 if is_leap else 365
+    normalized = d.year + ((day_of_year-1)  / days_in_year)
+```
+
+train, dev, test split is 80%, 10%, 10% randomly
+
+note that there are very similar headlines in the data
+
+I made no effort to prevent one such headline from ending up in the train set and a very similar one in the test set.
+
+I used only the first (top-level) category in the classification task, e.g. there is "world" instead of "world.us" as in the original dataset.
+
--- a/config.txt
+++ b/config.txt
@ -0,0 +1 @@
+--metric Accuracy --precision 4 -%
--- a/dev-0/expected.tsv
+++ b/dev-0/expected.tsv
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/1
+++ b/1
@ -1 +0,0 @@
-Subproject commit 97421a97eea8e2421b4d8ceb985974d22058e818
--- a/1
+++ b/1
@ -0,0 +1 @@
+year_normalized	date	text
--- a/run.py
+++ b/run.py
@ -0,0 +1,57 @@
+import pandas as pd
+import re
+import vowpalwabbit
+from sklearn.preprocessing import LabelEncoder
+
+
+def format_data(data):
+    """Normalise the first column of ``data`` into plain lowercase text.
+
+    Each row's first value is converted to ``str``, newlines are replaced
+    by spaces, the result is lower-cased and stripped, and finally every
+    character that is not an ASCII lowercase letter, a space or a hyphen
+    is removed.
+
+    Args:
+        data: A pandas DataFrame whose first column holds the raw text.
+
+    Returns:
+        A list of cleaned strings, one per row of ``data``.
+    """
+    # The former class ``[^a-zA-z -]`` used the range ``A-z``, which also
+    # spans ``[``, ``\``, ``]``, ``^``, ``_`` and the backtick, so those
+    # characters were never removed. The text is lower-cased before it is
+    # filtered, so ``a-z`` alone covers every letter we want to keep.
+    disallowed = re.compile(r'[^a-z -]')
+    return [
+        disallowed.sub('', str(row[0]).replace('\n', ' ').lower().strip())
+        for row in data.values
+    ]
+
+def load_train_data():
+    """Load the training texts and their integer-encoded categories.
+
+    Reads the third column of ``train/in.tsv`` as the text and the single
+    column of ``train/expected.tsv`` as the category, prints the sorted
+    list of distinct categories, and replaces the category column with
+    1-based integer codes.
+
+    Returns:
+        A tuple ``(texts, exp_df, label_enc)`` where ``texts`` is the list
+        produced by ``format_data``, ``exp_df`` is the DataFrame whose
+        ``class`` column holds the 1-based codes, and ``label_enc`` is the
+        fitted ``LabelEncoder`` (its codes are 0-based, i.e. one less than
+        the values stored in ``exp_df``).
+    """
+    import csv
+
+    # With pandas' default quoting a stray double quote inside a headline
+    # starts a quoted field that can swallow the following lines, which
+    # would shift the texts relative to the labels. QUOTE_NONE keeps every
+    # physical line as exactly one row.
+    # NOTE(review): assumes the challenge TSVs are raw, unquoted text.
+    in_df = pd.read_csv('train/in.tsv',
+                        delimiter='\t',
+                        usecols=[2],
+                        names=['text'],
+                        quoting=csv.QUOTE_NONE)
+    exp_df = pd.read_csv('train/expected.tsv',
+                         delimiter='\t',
+                         names=['class'],
+                         quoting=csv.QUOTE_NONE)
+
+    print("Categories: ", sorted(exp_df['class'].unique()))
+    label_enc = LabelEncoder()
+    # Shift the encoder's 0-based codes to 1-based labels; predict()
+    # subtracts one again before the codes are decoded.
+    exp_df['class'] = label_enc.fit_transform(exp_df['class']) + 1
+
+    return format_data(in_df), exp_df, label_enc
+
+def predict(data, model):
+    """Return one prediction per item of ``data``, each reduced by one.
+
+    Args:
+        data: Iterable of already-cleaned text strings.
+        model: Object exposing ``predict(example)`` that returns a number.
+
+    Returns:
+        A list with ``model.predict(...) - 1`` for every item, in order.
+    """
+    results = []
+    for sample in data:
+        example = f' || text: {sample}\n'
+        # Labels are shifted by +1 at training time; undo that shift here.
+        results.append(model.predict(example) - 1)
+    return results
+
+def write_res(data, path):
+    """Write every item of ``data`` on its own line to the file ``path``.
+
+    Args:
+        data: Iterable of values; each is formatted with ``str.format``
+            semantics and followed by a newline.
+        path: Path of the output file; it is created or truncated.
+    """
+    with open(path, 'w') as f:
+        f.writelines(f'{line}\n' for line in data)
+    # ``path`` is the file that was just written, so report it as-is; the
+    # old message appended a second "/out.tsv" to it.
+    print(f"Data written {path}")
+
+
+def main():
+    """Train a one-against-all Vowpal Wabbit model and write predictions.
+
+    Loads the training data, feeds every (text, label) pair to the model
+    once, then reads ``in.tsv`` from ``dev-0/``, ``test-A/`` and
+    ``test-B/``, predicts a category for each row and writes the decoded
+    category names to ``out.tsv`` in the same directory.
+    """
+    import csv
+
+    in_df, exp_df, label_enc = load_train_data()
+
+    # Size the one-against-all reduction from the categories actually
+    # found in the training labels instead of hard-coding the count.
+    wabbit = vowpalwabbit.Workspace(f'--oaa {len(label_enc.classes_)}')
+    # NOTE(review): the training example is built as "| text:<headline>"
+    # while predict() builds " || text: <headline>"; confirm that Vowpal
+    # Wabbit parses both into the same features.
+    for text, category in zip(in_df, exp_df['class']):
+        wabbit.learn(f'{category} | text:{text}\n')
+
+    for path in ['dev-0/', 'test-A/', 'test-B/']:
+        # Read the text column the same way as the training file: third
+        # column only, and with quoting disabled so a stray double quote
+        # cannot merge lines and change the number of output rows.
+        raw = pd.read_csv(f'{path}in.tsv',
+                          delimiter='\t',
+                          usecols=[2],
+                          names=['text'],
+                          quoting=csv.QUOTE_NONE)
+        to_predict = format_data(raw)
+        predictions = label_enc.inverse_transform(predict(to_predict, wabbit))
+        write_res(predictions, f'{path}out.tsv')
+
+
+
+# Run the train/predict pipeline only when this file is executed as a script.
+if __name__ == '__main__':
+    main()
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv
--- a/test-B/in.tsv
+++ b/test-B/in.tsv
--- a/test-B/out.tsv
+++ b/test-B/out.tsv
--- a/train/expected.tsv
+++ b/train/expected.tsv
--- a/train/expected.tsv.xz
+++ b/train/expected.tsv.xz
--- a/train/in.tsv
+++ b/train/in.tsv
--- a/train/in.tsv.xz
+++ b/train/in.tsv.xz
				`@ -1 +0,0 @@`
				`Subproject commit 97421a97eea8e2421b4d8ceb985974d22058e818`