s478841
This commit is contained in:
commit
0f448711cf
|
@ -0,0 +1,8 @@
|
|||
|
||||
*~
|
||||
*.swp
|
||||
*.bak
|
||||
*.pyc
|
||||
*.o
|
||||
.DS_Store
|
||||
.token
|
|
@ -0,0 +1,39 @@
|
|||
# Ireland news headlines
|
||||
|
||||
# Dataset source and thanks
|
||||
|
||||
Predict the headline category given the headline text and year
|
||||
Start Date: 1996-01-01 End Date: 2019-12-31
|
||||
|
||||
|
||||
Dataset taken from https://www.kaggle.com/therohk/ireland-historical-news on 19.06.2020.
|
||||
Special thanks to Rohit Kulkarni who created it.
|
||||
|
||||
You may find whole dataset (including the test dataset) in the link above.
|
||||
The dataset in the link may be updated.
|
||||
Please, do not incorporate any of the data from this kaggle dataset (or others) into your submission in this gonito challenge.
|
||||
|
||||
## Context (from https://www.kaggle.com/therohk/ireland-historical-news )
|
||||
|
||||
This news dataset is a composition of 1.48 million headlines posted by the Irish Times operating within Ireland.
|
||||
|
||||
Created over 160 years ago, the agency provides a long-term bird's-eye view of the happenings in Europe.
|
||||
|
||||
|
||||
# Challenge creation
|
||||
|
||||
Year is normalized as follows:
|
||||
|
||||
```
days_in_year = 366 if is_leap else 365
normalized = d.year + ((day_of_year-1) / days_in_year)
```
|
||||
|
||||
train, dev, test split is 80%, 10%, 10% randomly
|
||||
|
||||
note that there are very similar headlines in the data
|
||||
|
||||
I did not make any effort to prevent one such sentence from going into the training set and a similar one into the test set.
|
||||
|
||||
I used the first-level category in the classification task, e.g. there is "world" instead of "world.us" as in the original dataset.
|
||||
|
|
@ -0,0 +1 @@
|
|||
--metric Accuracy --precision 4 -%
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,58 @@
|
|||
from cProfile import label
|
||||
import pandas as pd, vowpalwabbit
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
import re
|
||||
|
||||
|
||||
def clean_data(data):
    """Normalize raw text rows for training/inference.

    For the first column of every row in *data* (a DataFrame):
    newlines become spaces, the text is stripped and lowercased,
    every character outside letters/space/hyphen is removed, and
    runs of spaces are collapsed to one.

    Returns a list of cleaned strings, one per row.
    """
    cleaned = []
    for row in data.values:
        text = row[0].replace('\n', ' ').strip().lower()
        letters_only = re.sub('[^a-zA-Z -]', '', text)
        cleaned.append(re.sub(' +', ' ', letters_only))
    return cleaned
|
||||
|
||||
|
||||
def predict(data, model, encoder):
    """Predict decoded category labels for cleaned headline texts.

    data: iterable of cleaned text strings.
    model: trained vowpalwabbit --oaa Workspace (labels 1..K).
    encoder: fitted LabelEncoder whose labels were shifted by +1 at
        training time; the -1 here undoes that shift.

    Returns the labels decoded back to their original string form.

    Fix: the example format must match training exactly — training used
    f'{category} | text:{text}\n', but the original inference code wrote
    ' | text: {el}' (extra space after the colon), which makes VW
    tokenize examples differently at inference than at training.
    """
    raw_labels = [model.predict(f' | text:{el}\n') - 1 for el in data]
    return encoder.inverse_transform(raw_labels)
|
||||
|
||||
|
||||
def write_results(data, path):
    """Write one item of *data* per line to *path*, then report completion."""
    lines = [f'{item}\n' for item in data]
    with open(path, 'w') as out_file:
        out_file.writelines(lines)
    print(f"Data written to the file {path}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # * Loading & cleaning data
    # in.tsv: the headline text lives in the third column (index 2).
    training_data = pd.read_csv('train/in.tsv',
                                delimiter='\t',
                                usecols=[2],
                                names=['text'])
    expected = pd.read_csv('train/expected.tsv',
                           delimiter='\t',
                           names=['class'])
    cleaned_training = clean_data(training_data)

    # * Encoding the categories
    fit_list = sorted(expected['class'].unique())
    print("Categories: ", fit_list)
    label_enc = LabelEncoder()
    # VW multiclass (--oaa) labels are 1-based, hence the +1 shift;
    # predict() undoes it with -1 before inverse_transform.
    expected['class'] = label_enc.fit_transform(expected['class']) + 1

    # * Training
    # Derive the class count from the data instead of hard-coding 7,
    # so the script stays correct if the label set ever changes.
    wabbit = vowpalwabbit.Workspace(f'--oaa {len(fit_list)}')
    for text, category in zip(cleaned_training, expected['class']):
        wabbit.learn(f'{category} | text:{text}\n')

    # * Predictions
    for path in ['dev-0/', 'test-A/', 'test-B/']:
        to_predict = clean_data(
            pd.read_csv(f'{path}in.tsv', delimiter='\t', names=['text']))
        predictions = predict(to_predict, wabbit, label_enc)
        write_results(predictions, f'{path}out.tsv')
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue