Rocky road to Dublin
This commit is contained in:
parent
97421a97ee
commit
b3f8dd40f2
25
inout.py
Normal file
25
inout.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
import csv, lzma
|
||||||
|
|
||||||
|
# Reads the file at path `dir` (optionally .xz-compressed) and returns its lines as a list
|
||||||
|
def read(dir):
    """Read a text file and return its lines as a list.

    The file type is decided from the suffixes of the file name only, so
    directory names that happen to contain "xz" or "tsv" no longer
    influence how the file is opened or parsed.

    Args:
        dir: Path of the file to read (despite the name, this is a file
            path, not a directory). The name shadows the ``dir`` builtin
            but is kept so existing callers keep working.

    Returns:
        A list with one entry per line, newline removed. For files whose
        name ends in ``.xz`` or has a ``.tsv`` suffix, each entry is the
        list of tab-separated fields; otherwise each entry is the line
        itself as a string.
    """
    from pathlib import PurePath

    suffixes = PurePath(dir).suffixes
    is_xz = suffixes[-1:] == ['.xz']
    # Compressed inputs were always split on tabs; keep that contract.
    split_fields = is_xz or '.tsv' in suffixes

    if is_xz:
        # newline='\n' splits lines only on '\n' and leaves other characters
        # untouched, matching line iteration over the raw binary stream.
        handle = lzma.open(dir, 'rt', encoding='utf-8', newline='\n')
    else:
        handle = open(dir, encoding='utf-8')

    with handle as f:
        if split_fields:
            return [line.rstrip('\n').split('\t') for line in f]
        return [line.rstrip('\n') for line in f]
|
||||||
|
|
||||||
|
# Takes the output (a list of rows) and writes it as CSV to the file at path `dir`
|
||||||
|
def write(output, dir):
    """Write rows to a UTF-8 CSV file, replacing any existing content.

    Args:
        output: Iterable of rows; each row is an iterable of field values
            handed to ``csv.writer``.
        dir: Path of the file to create or overwrite (a file path, despite
            the parameter name).
    """
    # newline='' lets the csv module control line endings itself.
    with open(dir, mode='w', encoding='utf-8', newline='') as csv_file:
        row_writer = csv.writer(csv_file)
        for row in output:
            row_writer.writerow(row)
|
149134
ireland_news_dev0_targets
Normal file
149134
ireland_news_dev0_targets
Normal file
File diff suppressed because it is too large
Load Diff
37
rockyRoadtoDublin.py
Normal file
37
rockyRoadtoDublin.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
import inout as io
|
||||||
|
|
||||||
|
def generateChooChoo(data, target, categories):
    """Load documents and their labels and, if needed, build the label index.

    Args:
        data: Path passed to ``io.read``; the third field of every row is
            used as the document text.
        target: Path passed to ``io.read``; the first field of every row is
            used as the label.
        categories: Mapping from label to integer id. When it is an empty
            dict it is filled in place, assigning ids from 0 in order of
            first appearance; otherwise it is left untouched.

    Returns:
        A tuple ``(dataset, categories)`` where ``dataset`` is a dict with
        the keys ``'data'`` (list of texts) and ``'target'`` (list of
        labels), and ``categories`` is the same mapping object passed in.
    """
    documents = []
    for row in io.read(data):
        documents.append(row[2].replace('\n', ''))

    labels = []
    for row in io.read(target):
        labels.append(row[0].replace('\n', ''))

    if categories == {}:
        for label in labels:
            # The next free id always equals the number of labels seen so far.
            categories.setdefault(label, len(categories))

    return {'data': documents, 'target': labels}, categories
|
||||||
|
|
||||||
|
def predictFuture(test):
|
||||||
|
data = [x[2].replace('\n', '') for x in io.read(test + '/in.tsv')]
|
||||||
|
with open('vw_ireland_news_' + test, 'w', encoding='utf-8') as f:
|
||||||
|
for text in data:
|
||||||
|
f.write('1 |text ' + text + '\n')
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # The training split is loaded with an empty mapping so the label index
    # is built from it; the dev split then reuses that same index.
    ireland_news_train, categories = generateChooChoo(
        'train/in.tsv.xz', 'train/expected.tsv.xz', categories={})
    ireland_news_dev, _ = generateChooChoo(
        'dev-0/in.tsv', 'dev-0/expected.tsv', categories)

    # Training file: "<class id + 1> |text <document>" per line.
    with open('vw_ireland_news_train', 'w', encoding='utf-8') as train_file:
        train_pairs = zip(ireland_news_train['target'], ireland_news_train['data'])
        for label, document in train_pairs:
            train_file.write('{} |text {}\n'.format(categories[label] + 1, document))

    # Dev split: features go to one file (with a constant label of 1) and the
    # true class ids go to a parallel targets file, line for line.
    # NOTE(review): a dev label absent from the training labels raises
    # KeyError here, because the index is only built from the training split.
    with open('vw_ireland_news_dev0', 'w', encoding='utf-8') as dev_file, \
            open('ireland_news_dev0_targets', 'w', encoding='utf-8') as dev_targets:
        dev_pairs = zip(ireland_news_dev['target'], ireland_news_dev['data'])
        for label, document in dev_pairs:
            dev_file.write('1 |text {}\n'.format(document))
            dev_targets.write('{}\n'.format(categories[label] + 1))

    for split_name in ('test-A', 'test-B'):
        predictFuture(split_name)
|
149134
vw_ireland_news_dev0
Normal file
149134
vw_ireland_news_dev0
Normal file
File diff suppressed because it is too large
Load Diff
148308
vw_ireland_news_test-A
Normal file
148308
vw_ireland_news_test-A
Normal file
File diff suppressed because it is too large
Load Diff
79119
vw_ireland_news_test-B
Normal file
79119
vw_ireland_news_test-B
Normal file
File diff suppressed because it is too large
Load Diff
1186898
vw_ireland_news_train
Normal file
1186898
vw_ireland_news_train
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user