test version

2022-03-26 19:08:19 +01:00 · 2022-03-26 19:08:19 +01:00 · 774b5d8d4a
commit 774b5d8d4a
parent f3c9a87cdf
5 changed files with 35894 additions and 0 deletions
--- a/dev-0/in.tsv
+++ b/dev-0/in.tsv
--- a/dev-0/out.tsv
+++ b/dev-0/out.tsv
--- a/model.py
+++ b/model.py
@@ -0,0 +1,28 @@
 import lzma
 from nltk.tokenize import word_tokenize
 def predict(word_before, word_after):
     """Guess the word that fills the gap between two context words.

     This is a constant baseline: both arguments are accepted but ignored,
     and the same answer, 'the', is returned for every gap.
     """
     baseline_guess = 'the'
     return baseline_guess
 # with open('./dev-0/in.tsv', 'w', encoding='utf-8') as file:
 #     text = lzma.open('./dev-0/in.tsv.xz').read().decode('utf-8')
 #     file.write(text)
 # with open('./dev-0/in.tsv', encoding='utf-8') as file_in, open('./dev-0/expected.tsv', encoding='utf-8') as file_expected, open('./dev-0/out.tsv', 'w', encoding='utf-8') as file_out:
 #     for line_in, line_expected in zip(file_in, file_expected):
 #         _, _, _, _, _, _, before, after = line_in.split('\t')
 #         before = word_tokenize(before.replace('\\n', '\n'))
 #         after = word_tokenize(after.replace('\\n', '\n'))
 #         file_out.write(predict(before[-1], after[0]) + '\n')
 # Unpack the compressed test-A input. The archive is read and closed before
 # in.tsv is opened for writing, so a missing or corrupt archive raises without
 # leaving a truncated in.tsv behind, and the lzma handle is never leaked.
 with lzma.open('./test-A/in.tsv.xz') as archive:
     text = archive.read().decode('utf-8')
 with open('./test-A/in.tsv', 'w', encoding='utf-8') as file:
     file.write(text)
 # Write exactly one prediction line to out.tsv for every row of in.tsv.
 with open('./test-A/in.tsv', encoding='utf-8') as file_in, open('./test-A/out.tsv', 'w', encoding='utf-8') as file_out:
     for line_in in file_in:
         # A row must have exactly eight tab-separated fields (ValueError
         # otherwise); the last two are used as the left and right context.
         _, _, _, _, _, _, before, after = line_in.split('\t')
         # The data stores line breaks as the two characters backslash + n;
         # turn them into real newlines before tokenizing.
         before = word_tokenize(before.replace('\\n', '\n'))
         after = word_tokenize(after.replace('\\n', '\n'))
         # A context column can tokenize to an empty list (empty or
         # whitespace-only text). Fall back to '' instead of raising IndexError
         # so the output keeps one line per input row.
         word_before = before[-1] if before else ''
         word_after = after[0] if after else ''
         file_out.write(predict(word_before, word_after) + '\n')
--- a/test-A/in.tsv
+++ b/test-A/in.tsv
--- a/test-A/out.tsv
+++ b/test-A/out.tsv