test version

2022-03-26 19:08:19 +01:00 · 2022-03-26 19:08:19 +01:00 · 774b5d8d4a
commit 774b5d8d4a
parent f3c9a87cdf
5 changed files with 35894 additions and 0 deletions
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/model.py
+++ b/model.py
@ -0,0 +1,28 @@
+import lzma
+from nltk.tokenize import word_tokenize
+
+def predict(word_before, word_after):
+    """Return the word guessed for the gap between two neighbouring tokens.
+
+    This is a constant baseline: both arguments are accepted but ignored,
+    and the result is always the string 'the'.
+    """
+    guess = 'the'
+    return guess
+
+# with open('./dev-0/in.tsv', 'w', encoding='utf-8') as file:
+#     text = lzma.open('./dev-0/in.tsv.xz').read().decode('utf-8')
+#     file.write(text)
+
+# with open('./dev-0/in.tsv', encoding='utf-8') as file_in, open('./dev-0/expected.tsv', encoding='utf-8') as file_expected, open('./dev-0/out.tsv', 'w', encoding='utf-8') as file_out:
+#     for line_in, line_expected in zip(file_in, file_expected):
+#         _, _, _, _, _, _, before, after = line_in.split('\t')
+#         before = word_tokenize(before.replace('\\n', '\n'))
+#         after = word_tokenize(after.replace('\\n', '\n'))
+#         file_out.write(predict(before[-1], after[0]) + '\n')
+
+
+# Decompress ./test-A/in.tsv.xz into a plain UTF-8 text file that is read back below.
+# Both handles are managed by the `with` statement, so the LZMA stream is closed
+# deterministically instead of being left open.
+with lzma.open('./test-A/in.tsv.xz') as compressed, open('./test-A/in.tsv', 'w', encoding='utf-8') as file:
+    file.write(compressed.read().decode('utf-8'))
+
+# Write one predicted word per input line to ./test-A/out.tsv.
+with open('./test-A/in.tsv', encoding='utf-8') as file_in, open('./test-A/out.tsv', 'w', encoding='utf-8') as file_out:
+    for line_in in file_in:
+        # Each line must have exactly eight tab-separated fields (ValueError otherwise);
+        # only the last two are used. NOTE(review): presumably these are the left and
+        # right context of the word to predict -- confirm against the data description.
+        _, _, _, _, _, _, before, after = line_in.split('\t')
+        # Turn literal backslash-n sequences into real newlines before tokenizing.
+        before = word_tokenize(before.replace('\\n', '\n'))
+        after = word_tokenize(after.replace('\\n', '\n'))
+        # A context field may tokenize to an empty list; pass '' in that case
+        # instead of failing with IndexError on before[-1] / after[0].
+        word_before = before[-1] if before else ''
+        word_after = after[0] if after else ''
+        file_out.write(predict(word_before, word_after) + '\n')
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv