# Builds 'vw-in' files for the train, dev-0, test-A and test-B dataset directories.
import inout as io
|
|
|
|
# Maps each category name to the integer id used as its numeric target.
categories = {
    'news': 0,
    'sport': 1,
    'opinion': 2,
    'business': 3,
    'culture': 4,
    'lifestyle': 5,
    'removed': 6,
}
|
|
|
|
|
|
def trainingData(data, target):
    """Build the feature strings and numeric targets for the training set.

    Args:
        data: Argument handed to ``io.read``; index 0 of every row it yields
            is used as the year and index 2 as the text.
        target: Argument handed to ``io.read``; index 0 of every row it yields
            (with newlines removed) must be a key of ``categories``.

    Returns:
        A dict with two entries: ``'data'``, a list of strings of the form
        ``' |Text <text> |Year <year>'``, and ``'target'``, a list of the
        integer ids looked up in ``categories``.

    Raises:
        KeyError: If a target row names a category missing from ``categories``.
    """
    rows = io.read(data)

    # Newlines and colons are removed from the text before it is embedded
    # in the feature string.
    # NOTE(review): a literal '|' in the text is left untouched; if the output
    # is consumed as Vowpal Wabbit input it would start a new namespace there
    # -- confirm whether it should be stripped as well.
    examples = [
        ' |Text ' + row[2].replace('\n', '').replace(':', '') + ' |Year ' + row[0]
        for row in rows
    ]

    labels = [categories[row[0].replace('\n', '')] for row in io.read(target)]

    return {'data': examples, 'target': labels}
|
|
|
|
def predictFuture(test):
|
|
data = io.read(test + '/in.tsv')
|
|
years = [x[0] for x in data]
|
|
text = [x[2].replace('\n', '').replace(':', '') for x in data]
|
|
|
|
data = []
|
|
for i in range(len(text)):
|
|
data.append(' |Text ' + text[i] + ' |Year ' + years[i])
|
|
|
|
with open(test + '/vw-in', 'w', encoding='utf-8') as f:
|
|
for text in data:
|
|
f.write('1' + text + '\n')
|
|
|
|
|
|
if __name__ == '__main__':
    # Training split: features come from in.tsv.xz, categories from expected.tsv.xz.
    ireland_news_train = trainingData('train/in.tsv.xz', 'train/expected.tsv.xz')

    with open('train/vw-in', 'w', encoding='utf-8') as f:
        # The label written on each line is the category id shifted by one.
        f.writelines(
            str(label + 1) + example + '\n'
            for label, example in zip(ireland_news_train['target'],
                                      ireland_news_train['data'])
        )

    # Remaining splits only get their vw-in files written.
    for split in ('dev-0', 'test-A', 'test-B'):
        predictFuture(split)