Add alignment cleaning script
This commit is contained in:
parent
671a5380fd
commit
db99eb23e4
5596
output/aligned_cleaned.txt
Normal file
5596
output/aligned_cleaned.txt
Normal file
File diff suppressed because it is too large
Load Diff
41
src/clean_alignment.py
Normal file
41
src/clean_alignment.py
Normal file
@ -0,0 +1,41 @@
|
||||
|
||||
import os
|
||||
|
||||
|
||||
# Text encoding intended for the files handled by this script.
ENC = "utf-8"
# Name of the alignment file: whitespace-separated "i-j" index links per line.
ALIGNED_FILE_NAME = "aligned"
# Name of the file that receives the resulting "word - word" lines.
ALIGNED_CLEAN_FILE_NAME = "aligned_cleaned.txt"
# Directory that holds all three files named here.
OUTPUT_DIR = "output"
# Name of the sentence-pair file: one "source ||| target" pair per line.
OUT_FILE_NAME = "out_hr.en.txt"
|
||||
|
||||
|
||||
def _read_sentence_pairs(path):
    """Return the tokenised (source_words, target_words) pairs stored in *path*.

    Every line must contain exactly one " ||| " separator; any other line
    makes the two-way unpacking raise ValueError.
    """
    pairs = []
    with open(path, encoding=ENC) as sentence_file:
        for line in sentence_file:
            source, target = line.split(" ||| ")
            # str.split() without arguments also drops the trailing newline.
            pairs.append((source.split(), target.split()))
    return pairs


def _aligned_word_pairs(sentence_pairs, alignment_lines):
    """Yield (source_word, target_word) for every "i-j" link, line by line.

    The n-th alignment line refers to the n-th entry of *sentence_pairs*.
    Raises IndexError when a link points outside its sentence or when there
    are more alignment lines than sentence pairs, and ValueError when a link
    is not of the form "<int>-<int>".
    """
    for line_no, line in enumerate(alignment_lines):
        source_words, target_words = sentence_pairs[line_no]
        for link in line.split():
            source_idx, target_idx = link.split("-")
            yield source_words[int(source_idx)], target_words[int(target_idx)]


def main():
    """Turn index-based word alignments into readable word pairs.

    Reads the sentence pairs from OUT_FILE_NAME and the alignment links from
    ALIGNED_FILE_NAME (both inside OUTPUT_DIR), resolves every "i-j" link to
    the words at those positions, and writes one "source - target" line per
    link to ALIGNED_CLEAN_FILE_NAME. All files are opened with the ENC
    encoding instead of the platform default.
    """
    sentences_path = os.path.join(OUTPUT_DIR, OUT_FILE_NAME)
    alignment_path = os.path.join(OUTPUT_DIR, ALIGNED_FILE_NAME)
    cleaned_path = os.path.join(OUTPUT_DIR, ALIGNED_CLEAN_FILE_NAME)

    sentence_pairs = _read_sentence_pairs(sentences_path)

    # Materialise all pairs before touching the output file so that malformed
    # input never leaves a half-written result behind.
    with open(alignment_path, encoding=ENC) as alignment_file:
        word_pairs = list(_aligned_word_pairs(sentence_pairs, alignment_file))

    with open(cleaned_path, "w", encoding=ENC) as out:
        out.writelines(f"{source} - {target}\n" for source, target in word_pairs)
|
||||
|
||||
|
||||
# Run the cleaning step only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user