s478841
This commit is contained in:
commit
0f448711cf
|
@ -0,0 +1,8 @@
|
|||
|
||||
*~
|
||||
*.swp
|
||||
*.bak
|
||||
*.pyc
|
||||
*.o
|
||||
.DS_Store
|
||||
.token
|
|
@ -0,0 +1,39 @@
|
|||
# Ireland news headlines
|
||||
|
||||
# Dataset source and thanks
|
||||
|
||||
Predict the headline category given the headline text and year
|
||||
Start Date: 1996-01-01 End Date: 2019-12-31
|
||||
|
||||
|
||||
Dataset taken from https://www.kaggle.com/therohk/ireland-historical-news on 19.06.2020.
|
||||
Special thanks to Rohit Kulkarni who created it.
|
||||
|
||||
You may find whole dataset (including the test dataset) in the link above.
|
||||
The dataset in the link may be updated.
|
||||
Please, do not incorporate any of the data from this kaggle dataset (or others) into your submission in this gonito challenge.
|
||||
|
||||
## Context (from https://www.kaggle.com/therohk/ireland-historical-news )
|
||||
|
||||
This news dataset is a composition of 1.48 million headlines posted by the Irish Times operating within Ireland.
|
||||
|
||||
Created over 160 years ago, the agency provides a long-term bird's-eye view of the happenings in Europe.
|
||||
|
||||
|
||||
# Challenge creation
|
||||
|
||||
Year is normalized as follows:
|
||||
|
||||
```
days_in_year = 366 if is_leap else 365
normalized = d.year + ((day_of_year-1) / days_in_year)
```
|
||||
|
||||
train, dev, test split is 80%, 10%, 10% randomly
|
||||
|
||||
note that there are very similar headlines in the data
|
||||
|
||||
I did not make any effort to prevent one such sentence from going into the training set and a similar one into the test set.
|
||||
|
||||
I used the first-level category in the classification task, e.g. there is "world" instead of "world.us" as in the original dataset.
|
||||
|
|
@ -0,0 +1 @@
|
|||
--metric Accuracy --precision 4 -%
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,58 @@
|
|||
from cProfile import label
|
||||
import pandas as pd, vowpalwabbit
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
import re
|
||||
|
||||
|
||||
def clean_data(data):
    """Normalize raw text rows for training/inference.

    For the first column of every row in *data* (a DataFrame):
    newlines become spaces, the text is stripped and lowercased,
    every character outside letters/space/hyphen is removed, and
    runs of spaces are collapsed to one.

    Returns a list of cleaned strings, one per row.
    """
    cleaned = []
    for row in data.values:
        text = row[0].replace('\n', ' ').strip().lower()
        letters_only = re.sub('[^a-zA-Z -]', '', text)
        cleaned.append(re.sub(' +', ' ', letters_only))
    return cleaned
|
||||
|
||||
|
||||
def predict(data, model, encoder):
    """Predict decoded category labels for cleaned headline texts.

    data: iterable of cleaned text strings.
    model: trained vowpalwabbit --oaa Workspace (labels 1..K).
    encoder: fitted LabelEncoder whose labels were shifted by +1 at
        training time; the -1 here undoes that shift.

    Returns the labels decoded back to their original string form.

    Fix: the example format must match training exactly — training used
    f'{category} | text:{text}\n', but the original inference code wrote
    ' | text: {el}' (extra space after the colon), which makes VW
    tokenize examples differently at inference than at training.
    """
    raw_labels = [model.predict(f' | text:{el}\n') - 1 for el in data]
    return encoder.inverse_transform(raw_labels)
|
||||
|
||||
|
||||
def write_results(data, path):
    """Write one item of *data* per line to *path*, then report completion."""
    lines = [f'{item}\n' for item in data]
    with open(path, 'w') as out_file:
        out_file.writelines(lines)
    print(f"Data written to the file {path}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # * Loading & cleaning data
    # in.tsv: the headline text lives in the third column (index 2).
    training_data = pd.read_csv('train/in.tsv',
                                delimiter='\t',
                                usecols=[2],
                                names=['text'])
    expected = pd.read_csv('train/expected.tsv',
                           delimiter='\t',
                           names=['class'])
    cleaned_training = clean_data(training_data)

    # * Encoding the categories
    fit_list = sorted(expected['class'].unique())
    print("Categories: ", fit_list)
    label_enc = LabelEncoder()
    # VW multiclass (--oaa) labels are 1-based, hence the +1 shift;
    # predict() undoes it with -1 before inverse_transform.
    expected['class'] = label_enc.fit_transform(expected['class']) + 1

    # * Training
    # Derive the class count from the data instead of hard-coding 7,
    # so the script stays correct if the label set ever changes.
    wabbit = vowpalwabbit.Workspace(f'--oaa {len(fit_list)}')
    for text, category in zip(cleaned_training, expected['class']):
        wabbit.learn(f'{category} | text:{text}\n')

    # * Predictions
    for path in ['dev-0/', 'test-A/', 'test-B/']:
        to_predict = clean_data(
            pd.read_csv(f'{path}in.tsv', delimiter='\t', names=['text']))
        predictions = predict(to_predict, wabbit, label_enc)
        write_results(predictions, f'{path}out.tsv')
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue