Rocky road to Dublin

2022-05-28 16:27:40 +02:00 · 2022-05-28 16:27:40 +02:00 · b3f8dd40f2
commit b3f8dd40f2
parent 97421a97ee
7 changed files with 1712655 additions and 0 deletions
--- a/inout.py
+++ b/inout.py
@ -0,0 +1,25 @@
+import csv, lzma
+
+# Reads a plain-text, .tsv or .xz-compressed file and returns its lines as a list
+def read(dir):
+    """Read the file at path `dir` and return its lines as a list.
+
+    The format is chosen from the suffix of the path:
+
+    * ``.xz``  -- LZMA-compressed file; every line is decoded as UTF-8 and
+      split on tabs, so each element of the result is a list of fields.
+    * ``.tsv`` -- UTF-8 text file; every line is split on tabs, so each
+      element of the result is a list of fields.
+    * anything else -- UTF-8 text file; each element is the line as a string.
+
+    The trailing newline is removed from every line.
+    """
+    path = str(dir)
+    # Decide by suffix only: a substring test on the whole path would treat
+    # e.g. 'xz_corpus/labels.txt' as a compressed file and fail to open it.
+    if path.endswith('.xz'):
+        # Binary mode + per-line decode: lines are split on b'\n' only.
+        with lzma.open(path) as f:
+            return [line.decode('utf-8').rstrip('\n').split('\t') for line in f]
+
+    with open(path, encoding='utf-8') as f:
+        if path.endswith('.tsv'):
+            return [line.rstrip('\n').split('\t') for line in f]
+        return [line.rstrip('\n') for line in f]
+
+# Writes the rows of `output` to the file at `dir` in CSV format
+def write(output, dir):
+    """Write `output` to the file at path `dir` as CSV.
+
+    output: iterable of rows, each row an iterable of field values.
+    dir:    path of the file to create or overwrite (written as UTF-8).
+    """
+    # newline='' lets the csv writer control the line endings itself.
+    with open(dir, mode='w', encoding='utf-8', newline='') as csv_file:
+        row_writer = csv.writer(csv_file)
+        for row in output:
+            row_writer.writerow(row)
--- a/149134
+++ b/149134
--- a/rockyRoadtoDublin.py
+++ b/rockyRoadtoDublin.py
@ -0,0 +1,37 @@
+import inout as io
+
+def generateChooChoo(data, target, categories):
+    """Load texts and labels and, if needed, build the label -> id mapping.
+
+    data:       path passed to io.read; the field at index 2 of every row is
+                used as the text.
+    target:     path passed to io.read; the element at index 0 of every row
+                is used as the label.
+    categories: dict mapping label -> integer id.  When it is empty it is
+                filled in place, giving each distinct label the next id
+                (0, 1, 2, ...) in order of first appearance; a non-empty
+                mapping is left untouched.
+
+    Returns a tuple ({'data': texts, 'target': labels}, categories).
+    """
+    texts = [row[2].replace('\n', '') for row in io.read(data)]
+    labels = [row[0].replace('\n', '') for row in io.read(target)]
+
+    if categories == {}:
+        for label in labels:
+            # len(categories) is the next unused id; setdefault leaves
+            # labels that were already seen unchanged.
+            categories.setdefault(label, len(categories))
+
+    return {'data': texts, 'target': labels}, categories
+
+def predictFuture(test):
+    """Write the texts of `<test>/in.tsv` to the file 'vw_ireland_news_<test>'.
+
+    test: name of the directory that contains `in.tsv`; it is also used as
+          the suffix of the output file name.
+
+    Each output line is '1 |text <text>', where <text> is the field at
+    index 2 of the corresponding input row.
+    """
+    rows = io.read(test + '/in.tsv')
+    texts = [row[2].replace('\n', '') for row in rows]
+
+    out_path = 'vw_ireland_news_' + test
+    with open(out_path, 'w', encoding='utf-8') as out:
+        out.writelines('1 |text ' + text + '\n' for text in texts)
+
+
+if __name__ == '__main__':
+    # The label -> id mapping is built from the training labels (empty dict
+    # passed in) and then reused unchanged for the dev-0 split.
+    ireland_news_train, categories = generateChooChoo('train/in.tsv.xz', 'train/expected.tsv.xz', categories={})
+    ireland_news_dev, _ = generateChooChoo('dev-0/in.tsv', 'dev-0/expected.tsv', categories)
+
+    # Training file: one '<id + 1> |text <text>' line per example.
+    with open('vw_ireland_news_train', 'w', encoding='utf-8') as train_out:
+        for label, text in zip(ireland_news_train['target'], ireland_news_train['data']):
+            train_out.write(f"{categories[label] + 1} |text {text}\n")
+
+    # Dev file: every line carries the constant label '1'; the real ids
+    # (id + 1) go to a separate targets file, one per line.
+    with open('vw_ireland_news_dev0', 'w', encoding='utf-8') as dev_out, \
+            open('ireland_news_dev0_targets', 'w', encoding='utf-8') as targets_out:
+        for label, text in zip(ireland_news_dev['target'], ireland_news_dev['data']):
+            dev_out.write(f"1 |text {text}\n")
+            targets_out.write(f"{categories[label] + 1}\n")
+
+    for split in ('test-A', 'test-B'):
+        predictFuture(split)
--- a/149134
+++ b/149134
--- a/148308
+++ b/148308
--- a/79119
+++ b/79119
--- a/1186898
+++ b/1186898