import inout as io def generateChooChoo(data, target, categories): data = [x[2].replace('\n', '').replace(':', '') for x in io.read(data)] target = [x[0].replace('\n', '') for x in io.read(target)] if categories == {}: i = 0 for x in target: if x not in categories: categories[x] = i i += 1 return {'data': data, 'target': target}, categories def predictFuture(test): data = [x[2].replace('\n', '').replace(':', '') for x in io.read(test + '/in.tsv')] with open('vw_ireland_news_' + test, 'w', encoding='utf-8') as f: for text in data: f.write('1 |text ' + text + '\n') if __name__ == '__main__': ireland_news_train, categories = generateChooChoo('train/in.tsv.xz', 'train/expected.tsv.xz', categories={}) ireland_news_dev, _ = generateChooChoo('dev-0/in.tsv', 'dev-0/expected.tsv', categories) with open('vw_ireland_news_train', 'w', encoding='utf-8') as f: for target, text in zip(ireland_news_train['target'], ireland_news_train['data']): f.write(str(categories[target] + 1) + ' |text ' + text + '\n') with open('vw_ireland_news_dev0', 'w', encoding='utf-8') as f, open('ireland_news_dev0_targets', 'w', encoding='utf-8') as f_targets: for target, text in zip(ireland_news_dev['target'], ireland_news_dev['data']): f.write('1 |text ' + text + '\n') f_targets.write(str(categories[target] + 1) + '\n') predictFuture('test-A') predictFuture('test-B')