Solution 444465
This commit is contained in:
commit
44398afd35
39
README.md
Normal file
39
README.md
Normal file
@ -0,0 +1,39 @@
|
||||
# Ireland news headlines
|
||||
|
||||
# Dataset source and thanks
|
||||
|
||||
Predict the headline category given the headline text and year.
|
||||
Start Date: 1996-01-01 End Date: 2019-12-31
|
||||
|
||||
|
||||
Dataset taken from https://www.kaggle.com/therohk/ireland-historical-news on 19.06.2020.
|
||||
Special thanks to Rohit Kulkarni who created it.
|
||||
|
||||
You may find the whole dataset (including the test dataset) at the link above.
|
||||
The dataset in the link may be updated.
|
||||
Please do not incorporate any data from this Kaggle dataset (or others) into your submission to this Gonito challenge.
|
||||
|
||||
## Context (from https://www.kaggle.com/therohk/ireland-historical-news )
|
||||
|
||||
This news dataset is a composition of 1.48 million headlines posted by the Irish Times operating within Ireland.
|
||||
|
||||
Created over 160 years ago, the agency can provide a long-term bird's-eye view of the happenings in Europe.
|
||||
|
||||
|
||||
# Challenge creation
|
||||
|
||||
Year is normalized as follows:
|
||||
|
||||
'''
|
||||
days_in_year = 366 if is_leap else 365
|
||||
normalized = d.year + ((day_of_year-1) / days_in_year)
|
||||
'''
|
||||
|
||||
train, dev, test split is 80%, 10%, 10% randomly
|
||||
|
||||
note that there are very similar headlines in the data
|
||||
|
||||
I did not make any effort to prevent one such headline from ending up in the train set and a very similar one in the test set.
|
||||
|
||||
I used only the first-level category in the classification task. E.g., there is "world" instead of "world.us" as in the original dataset.
|
||||
|
1
config.txt
Normal file
1
config.txt
Normal file
@ -0,0 +1 @@
|
||||
--metric Accuracy --precision 4 -%
|
149134
dev-0/expected.tsv
Normal file
149134
dev-0/expected.tsv
Normal file
File diff suppressed because it is too large
Load Diff
149134
dev-0/in.tsv
Normal file
149134
dev-0/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
149134
dev-0/out.tsv
Normal file
149134
dev-0/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
139
main.py
Normal file
139
main.py
Normal file
@ -0,0 +1,139 @@
|
||||
import vowpalwabbit
|
||||
import lzma
|
||||
from gensim.parsing.preprocessing import remove_stopwords
|
||||
|
||||
|
||||
# NOTE(review): this constant is not referenced anywhere else in the visible
# script; confirm whether it is still needed.
VECTOR_SIZE = 100
|
||||
|
||||
|
||||
def transform_to_vw_train_format(data_x, data_y):
    """Render labelled rows as Vowpal Wabbit training lines.

    Rows of ``data_x`` are paired positionally with the labels in ``data_y``
    (pairing stops at the shorter of the two). Elements 0, 1 and 2 of each row
    are emitted as ``year_fraction``, ``date`` and ``content`` respectively.

    Returns:
        A list of newline-terminated strings, one per (row, label) pair.
    """
    return [
        f"{target} | year_fraction:{row[0]} date:{row[1]} content:{row[2]}\n"
        for row, target in zip(data_x, data_y)
    ]
|
||||
|
||||
|
||||
def transform_to_vw_test_format(data_x):
    """Render unlabelled rows as Vowpal Wabbit prediction lines.

    Elements 0, 1 and 2 of each row are emitted as ``year_fraction``,
    ``date`` and ``content`` respectively; no label precedes the ``|``.

    Returns:
        A list of newline-terminated strings, one per input row.
    """
    return [
        f"| year_fraction:{row[0]} date:{row[1]} content:{row[2]}\n"
        for row in data_x
    ]
|
||||
|
||||
|
||||
def _parse_input_line(line):
    """Split one tab-separated input line and normalize its text column.

    The third column (index 2) is lower-cased and stripped of stop words.
    """
    fields = line.strip().split('\t')
    fields[2] = remove_stopwords(fields[2].lower())
    return fields


def _load_plain_inputs(path):
    """Read an uncompressed ``in.tsv`` file into a list of parsed rows."""
    # Explicit UTF-8 keeps these files consistent with the training data,
    # which is decoded as UTF-8 regardless of the machine's locale.
    with open(path, 'r', encoding="utf-8") as handle:
        return [_parse_input_line(line) for line in handle]


def _predict_labels(model, examples, id_to_label):
    """Predict a text label for every formatted example."""
    labels = []
    for example in examples:
        # A prediction that rounds to 0 is mapped to class 1, as label ids
        # start at 1.
        class_id = round(model.predict(example)) or 1
        labels.append(id_to_label[class_id])
    return labels


def _write_lines(path, lines):
    """Write each item of ``lines`` to ``path`` followed by a newline."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(f"{line}\n" for line in lines)


def main():
    """Train a one-against-all classifier and write predictions.

    Reads ``train/in.tsv.xz`` and ``train/expected.tsv.xz``, trains a Vowpal
    Wabbit model on them, then writes predicted category names to
    ``dev-0/out.tsv``, ``test-A/out.tsv`` and ``test-B/out.tsv`` based on the
    matching ``in.tsv`` files.

    Raises:
        ValueError: if the training data contains no labels.
    """
    train_x = []
    train_y = []

    with lzma.open('train/in.tsv.xz', 'r') as f_in, \
            lzma.open('train/expected.tsv.xz', 'r') as f_expected:
        for line_in, line_expected in zip(f_in, f_expected):
            train_x.append(_parse_input_line(line_in.decode("utf-8")))
            train_y.append(line_expected.strip().decode("utf-8"))

    split_names = ("dev-0", "test-A", "test-B")
    vw_inputs = {
        name: transform_to_vw_test_format(_load_plain_inputs(f"{name}/in.tsv"))
        for name in split_names
    }

    # Sorting makes the label -> id mapping identical on every run; iterating
    # a bare set of strings yields a different order between interpreter runs.
    cat_labels = {
        label: class_id
        for class_id, label in enumerate(sorted(set(train_y)), 1)
    }
    if not cat_labels:
        raise ValueError("training data contains no labels")
    # Inverse mapping built once, instead of scanning the dict per prediction.
    id_to_label = {class_id: label for label, class_id in cat_labels.items()}

    train_ids = [cat_labels[label] for label in train_y]
    vw_format_train_data = transform_to_vw_train_format(train_x, train_ids)

    # The number of classes follows the data rather than a hard-coded count.
    vw = vowpalwabbit.Workspace(
        f"--oaa {len(cat_labels)} --ngram 3 --learning_rate=0.1"
    )

    for example in vw_format_train_data:
        vw.learn(example)

    # Predict every split before writing anything, so a failure during
    # prediction does not leave a partial set of output files behind.
    predictions = {
        name: _predict_labels(vw, vw_inputs[name], id_to_label)
        for name in split_names
    }

    for name in split_names:
        _write_lines(f"{name}/out.tsv", predictions[name])
|
||||
|
||||
|
||||
# Run the pipeline only when executed as a script, so that importing this
# module does not trigger training and file writes.
if __name__ == "__main__":
    main()
|
148308
test-A/in.tsv
Normal file
148308
test-A/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
148308
test-A/out.tsv
Normal file
148308
test-A/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
79119
test-B/in.tsv
Normal file
79119
test-B/in.tsv
Normal file
File diff suppressed because it is too large
Load Diff
79119
test-B/out.tsv
Normal file
79119
test-B/out.tsv
Normal file
File diff suppressed because it is too large
Load Diff
BIN
train/expected.tsv.xz
Normal file
BIN
train/expected.tsv.xz
Normal file
Binary file not shown.
BIN
train/in.tsv.xz
Normal file
BIN
train/in.tsv.xz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user