Solution 444465

Andrzej Preibisz 2022-05-28 19:29:06 +02:00
commit 44398afd35
13 changed files with 902436 additions and 0 deletions

39
README.md Normal file

@@ -0,0 +1,39 @@
# Ireland news headlines

Predict the headline category given the headline text and year.

Start Date: 1996-01-01 End Date: 2019-12-31

# Dataset source and thanks

Dataset taken from https://www.kaggle.com/therohk/ireland-historical-news on 19.06.2020.
Special thanks to Rohit Kulkarni who created it.
You may find the whole dataset (including the test dataset) in the link above.
The dataset in the link may be updated.
Please do not incorporate any of the data from this Kaggle dataset (or others) into your submission in this gonito challenge.

## Context (from https://www.kaggle.com/therohk/ireland-historical-news )

This news dataset is a composition of 1.48 million headlines posted by the Irish Times operating within Ireland.
Created over 160 years ago, the agency provides a long-term bird's-eye view of the happenings in Europe.

# Challenge creation

Year is normalized as follows:
```python
days_in_year = 366 if is_leap else 365
normalized = d.year + ((day_of_year - 1) / days_in_year)
```
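For concreteness, here is a minimal runnable sketch of that normalization (my own illustration, not part of the challenge code; the helper name `normalize_year` and the use of Python's `datetime` and `calendar` modules are assumptions):

```python
from calendar import isleap
from datetime import date

def normalize_year(d: date) -> float:
    """Return the year plus the fraction of the year elapsed at date d."""
    days_in_year = 366 if isleap(d.year) else 365
    day_of_year = d.timetuple().tm_yday
    return d.year + (day_of_year - 1) / days_in_year

print(normalize_year(date(1996, 1, 1)))    # 1996.0 (first day of the range)
print(normalize_year(date(2019, 12, 31)))  # ~2019.9973 (last day of the range)
```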
The train, dev, and test split is 80%, 10%, 10%, assigned randomly.
Note that there are very similar headlines in the data; I made no effort to prevent one such sentence from landing in the train set and a near-duplicate in the test set.
Only the first category segment is used as the classification label, e.g. "world" instead of "world.us" as in the original dataset.
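As an illustration of that label simplification (my own sketch, not taken from the repository; the helper name `first_category` is assumed), keeping only the segment before the first dot could look like this:

```python
def first_category(label: str) -> str:
    # "world.us" -> "world"; labels without a dot are returned unchanged.
    return label.split(".", 1)[0]

assert first_category("world.us") == "world"
assert first_category("sport") == "sport"
```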

1
config.txt Normal file

@@ -0,0 +1 @@
--metric Accuracy --precision 4 -%

149134
dev-0/expected.tsv Normal file

File diff suppressed because it is too large

149134
dev-0/in.tsv Normal file

File diff suppressed because it is too large

149134
dev-0/out.tsv Normal file

File diff suppressed because it is too large

139
main.py Normal file

@@ -0,0 +1,139 @@
import lzma

import vowpalwabbit
from gensim.parsing.preprocessing import remove_stopwords

VECTOR_SIZE = 100  # note: unused in this script


def transform_to_vw_train_format(data_x, data_y):
    # Build labelled VW examples: "<label> | <features>".
    formatted_data = []
    for line, label in zip(data_x, data_y):
        line_formatted = f"{label} | year_fraction:{line[0]} date:{line[1]} content:{line[2]}\n"
        formatted_data.append(line_formatted)
    return formatted_data


def transform_to_vw_test_format(data_x):
    # Build unlabelled VW examples, used only for prediction.
    formatted_data = []
    for line in data_x:
        line_formatted = f"| year_fraction:{line[0]} date:{line[1]} content:{line[2]}\n"
        formatted_data.append(line_formatted)
    return formatted_data


def main():
    train_x = []
    train_y = []
    test_a_x = []
    test_b_x = []
    dev_0_x = []

    # The training data is xz-compressed; read inputs and expected labels in parallel.
    with lzma.open('train/in.tsv.xz', 'r') as f, lzma.open('train/expected.tsv.xz', 'r') as f2:
        for line_in, line_expected in zip(f, f2):
            line_in = line_in.decode("utf-8").strip().split('\t')
            line_in[2] = remove_stopwords(line_in[2].lower())
            train_x.append(line_in)
            train_y.append(line_expected.decode("utf-8").strip())

    with open("test-A/in.tsv", 'r') as f:
        for line_in in f:
            line_in = line_in.strip().split('\t')
            line_in[2] = remove_stopwords(line_in[2].lower())
            test_a_x.append(line_in)

    with open("test-B/in.tsv", 'r') as f:
        for line_in in f:
            line_in = line_in.strip().split('\t')
            line_in[2] = remove_stopwords(line_in[2].lower())
            test_b_x.append(line_in)

    with open("dev-0/in.tsv", 'r') as f:
        for line_in in f:
            line_in = line_in.strip().split('\t')
            line_in[2] = remove_stopwords(line_in[2].lower())
            dev_0_x.append(line_in)

    # Map category names to the integer labels 1..N required by VW's --oaa.
    cat_labels = dict()
    for i, item in enumerate(set(train_y), 1):
        cat_labels[item] = i
    train_y = [cat_labels.get(line) for line in train_y]

    vw_format_train_data = transform_to_vw_train_format(train_x, train_y)
    test_a_vw_format = transform_to_vw_test_format(test_a_x)
    test_b_vw_format = transform_to_vw_test_format(test_b_x)
    dev_0_vw_format = transform_to_vw_test_format(dev_0_x)

    # One-against-all multiclass model over the 7 categories; --ngram 3 adds trigram features.
    vw = vowpalwabbit.Workspace("--oaa 7 --ngram 3 --learning_rate=0.1")
    for example in vw_format_train_data:
        vw.learn(example)

    predict_test_a = []
    predict_test_b = []
    predict_dev_0 = []

    # DEV-0
    for line in dev_0_vw_format:
        predict = vw.predict(line)
        if round(predict) == 0:
            predict = 1
        predict_dev_0.append(round(predict))
    predict_dev_0_text = []
    for line in predict_dev_0:
        label_key = list(cat_labels.keys())[list(cat_labels.values()).index(line)]
        predict_dev_0_text.append(label_key)

    # TEST-A
    for line in test_a_vw_format:
        predict = vw.predict(line)
        if round(predict) == 0:
            predict = 1
        predict_test_a.append(round(predict))
    predict_test_a_text = []
    for line in predict_test_a:
        label_key = list(cat_labels.keys())[list(cat_labels.values()).index(line)]
        predict_test_a_text.append(label_key)

    # TEST-B
    for line in test_b_vw_format:
        predict = vw.predict(line)
        if round(predict) == 0:
            predict = 1
        predict_test_b.append(round(predict))
    predict_test_b_text = []
    for line in predict_test_b:
        label_key = list(cat_labels.keys())[list(cat_labels.values()).index(line)]
        predict_test_b_text.append(label_key)

    # Write the predicted category names, one per input line.
    with open("dev-0/out.tsv", "w") as f:
        for line in predict_dev_0_text:
            f.write(f"{line}\n")
    with open("test-A/out.tsv", "w") as f:
        for line in predict_test_a_text:
            f.write(f"{line}\n")
    with open("test-B/out.tsv", "w") as f:
        for line in predict_test_b_text:
            f.write(f"{line}\n")


if __name__ == "__main__":
    main()
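A note on the index-to-name lookup above: `list(cat_labels.keys())[list(cat_labels.values()).index(line)]` rebuilds two lists for every prediction. A minimal alternative sketch, assuming the same `cat_labels` dictionary (the name `inv_labels` is mine, not part of main.py), precomputes the inverse mapping once:

```python
# Invert the mapping once: integer label -> category name.
inv_labels = {index: name for name, index in cat_labels.items()}

# Each prediction then maps back with a single dictionary lookup.
predict_dev_0_text = [inv_labels[p] for p in predict_dev_0]
```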

1
names Normal file

@@ -0,0 +1 @@
year_normalized date text

148308
test-A/in.tsv Normal file

File diff suppressed because it is too large

148308
test-A/out.tsv Normal file

File diff suppressed because it is too large

79119
test-B/in.tsv Normal file

File diff suppressed because it is too large

79119
test-B/out.tsv Normal file

File diff suppressed because it is too large

BIN
train/expected.tsv.xz Normal file

Binary file not shown.

BIN
train/in.tsv.xz Normal file

Binary file not shown.