import inout as io categories = {'news': 0, 'sport': 1, 'opinion': 2, 'business': 3, 'culture': 4, 'lifestyle': 5, 'removed': 6} def trainingData(data, target): data = io.read(data) years = [x[0] for x in data] text = [x[2].replace('\n', '').replace(':', '') for x in data] target = [categories[x[0].replace('\n', '')] for x in io.read(target)] data = [] for i in range(len(text)): data.append(' |Text ' + text[i] + ' |Year ' + years[i]) return {'data': data, 'target': target} def predictFuture(test): data = io.read(test + '/in.tsv') years = [x[0] for x in data] text = [x[2].replace('\n', '').replace(':', '') for x in data] data = [] for i in range(len(text)): data.append(' |Text ' + text[i] + ' |Year ' + years[i]) with open(test + '-in', 'w', encoding='utf-8') as f: for text in data: f.write('1' + text + '\n') if __name__ == '__main__': ireland_news_train = trainingData('train/in.tsv.xz', 'train/expected.tsv.xz') with open('train/vw-in', 'w', encoding='utf-8') as f: for target, text in zip(ireland_news_train['target'], ireland_news_train['data']): f.write(str(target + 1) + text + '\n') predictFuture('dev-0') predictFuture('test-A') predictFuture('test-B')