ireland-news-headlines/wockyWoad.py
2022-05-28 21:16:33 +02:00

42 lines
1.3 KiB
Python

import inout as io
# Maps each category name found in the expected-labels file to a 0-based
# integer id. The training writer in the __main__ section adds 1 to these
# ids before emitting them as labels.
categories = {
    'news': 0,
    'sport': 1,
    'opinion': 2,
    'business': 3,
    'culture': 4,
    'lifestyle': 5,
    'removed': 6,
}
def trainingData(data, target):
    """Build Vowpal Wabbit-style feature strings and integer labels.

    Args:
        data: Location passed to ``io.read``. Every row it yields is
            indexed positionally: ``row[0]`` is emitted as the year and
            ``row[2]`` as the text.
        target: Location passed to ``io.read``. ``row[0]`` of every row is
            a category name that is looked up in ``categories``.

    Returns:
        A dict with two lists: ``'data'`` holds one
        `` |Text <text> |Year <year>`` string per input row, and
        ``'target'`` holds one integer category id per label row.

    Raises:
        KeyError: If a label (after newline removal) is not a key of
            ``categories``.
    """
    # Newlines would split an example across lines; ':' (feature/value
    # separator) and '|' (namespace marker) are reserved in the VW text
    # format, so all three are removed from the free text in one pass.
    strip_reserved = str.maketrans('', '', '\n:|')

    features = [
        ' |Text ' + row[2].translate(strip_reserved) + ' |Year ' + row[0]
        for row in io.read(data)
    ]
    labels = [
        categories[row[0].replace('\n', '')]
        for row in io.read(target)
    ]
    return {'data': features, 'target': labels}
def predictFuture(test):
data = io.read(test + '/in.tsv')
years = [x[0] for x in data]
text = [x[2].replace('\n', '').replace(':', '') for x in data]
data = []
for i in range(len(text)):
data.append(' |Text ' + text[i] + ' |Year ' + years[i])
with open(test + '/vw-in', 'w', encoding='utf-8') as f:
for text in data:
f.write('1' + text + '\n')
if __name__ == '__main__':
    # Training split: pair every label with its feature string and write
    # them to 'train/vw-in', shifting the 0-based category id up by one.
    ireland_news_train = trainingData('train/in.tsv.xz', 'train/expected.tsv.xz')
    labelled = zip(ireland_news_train['target'], ireland_news_train['data'])
    with open('train/vw-in', 'w', encoding='utf-8') as f:
        f.writelines(
            str(label + 1) + features + '\n'
            for label, features in labelled
        )

    # Evaluation splits: each directory gets its own 'vw-in' file.
    for split in ('dev-0', 'test-A', 'test-B'):
        predictFuture(split)