Add alignment cleaning script

This commit is contained in:
nlitkowski 2021-06-22 00:06:04 +02:00
parent 671a5380fd
commit db99eb23e4
2 changed files with 5637 additions and 0 deletions

5596
output/aligned_cleaned.txt Normal file

File diff suppressed because it is too large Load Diff

41
src/clean_alignment.py Normal file
View File

@@ -0,0 +1,41 @@
import os
# Encoding name intended for the text files this script handles.
ENC = "utf-8"

# Directory that every file name below is joined onto.
OUTPUT_DIR = "output"

# Inputs: the " ||| "-separated sentence pairs and the index alignments.
OUT_FILE_NAME = "out_hr.en.txt"
ALIGNED_FILE_NAME = "aligned"

# Output: one "<word> - <word>" line per alignment link.
ALIGNED_CLEAN_FILE_NAME = "aligned_cleaned.txt"
def main():
    """Turn word-alignment indices into word pairs and write them to disk.

    Reads two files from ``OUTPUT_DIR``:

    * ``OUT_FILE_NAME`` -- one sentence pair per line, the two sides
      separated by ``" ||| "``.
    * ``ALIGNED_FILE_NAME`` -- one line per sentence pair (matched by line
      position), holding whitespace-separated ``i-j`` tokens, where ``i``
      indexes a word on the left side and ``j`` a word on the right side.

    For every ``i-j`` token the two referenced words are written to
    ``ALIGNED_CLEAN_FILE_NAME`` as a ``"<left> - <right>"`` line.

    All three files are opened with the ``ENC`` encoding so the result does
    not depend on the platform's default locale encoding.

    Raises:
        OSError: if an input file cannot be opened or the output cannot be
            written.
        ValueError: if a sentence line does not contain exactly one
            ``" ||| "`` separator, or an alignment token is not two
            integers joined by a single ``-``.
        IndexError: if the alignment file has more lines than the sentence
            file, or an index points past the end of a sentence.
    """
    sentences_path = os.path.join(OUTPUT_DIR, OUT_FILE_NAME)
    alignment_path = os.path.join(OUTPUT_DIR, ALIGNED_FILE_NAME)
    cleaned_path = os.path.join(OUTPUT_DIR, ALIGNED_CLEAN_FILE_NAME)

    # Tokenise each sentence once while reading instead of re-splitting it
    # for every alignment line later on.
    sentence_words = []
    with open(sentences_path, encoding=ENC) as sentences_file:
        for line in sentences_file:
            # Strict two-way unpacking: a malformed line fails loudly.
            left, right = line.split(" ||| ")
            sentence_words.append((left.split(), right.split()))

    word_pairs = []
    with open(alignment_path, encoding=ENC) as alignment_file:
        # Alignment line N describes sentence pair N; indexing (rather than
        # zip) keeps the IndexError when the alignment file is longer.
        for line_index, line in enumerate(alignment_file):
            left_words, right_words = sentence_words[line_index]
            for token in line.split():
                left_pos, right_pos = token.split("-")
                word_pairs.append(
                    (left_words[int(left_pos)], right_words[int(right_pos)])
                )

    with open(cleaned_path, "w", encoding=ENC) as cleaned_file:
        cleaned_file.writelines(
            f"{left} - {right}\n" for left, right in word_pairs
        )
# Run the cleaning step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()