Add alignment cleaning script
This commit is contained in:
parent
671a5380fd
commit
db99eb23e4
5596
output/aligned_cleaned.txt
Normal file
5596
output/aligned_cleaned.txt
Normal file
File diff suppressed because it is too large
Load Diff
41
src/clean_alignment.py
Normal file
41
src/clean_alignment.py
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
# Text encoding named for this script's files (presumably every file it
# reads and writes -- confirm against the open() calls in main()).
ENC = "utf-8"

# Alignment file read by main(): one line per sentence pair, each holding
# whitespace-separated "i-j" token index pairs.
ALIGNED_FILE_NAME = "aligned"

# Output file written by main(): one "left - right" word pair per line.
ALIGNED_CLEAN_FILE_NAME = "aligned_cleaned.txt"

# Directory that holds all three files above and below.
OUTPUT_DIR = "output"

# Parallel corpus read by main(): one "left ||| right" sentence pair per line.
OUT_FILE_NAME = "out_hr.en.txt"
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Turn index-based word alignments into plain word pairs.

    Reads the parallel corpus ``OUTPUT_DIR/OUT_FILE_NAME`` (one
    ``left ||| right`` sentence pair per line) and the alignment file
    ``OUTPUT_DIR/ALIGNED_FILE_NAME`` (line *n* holds whitespace-separated
    ``i-j`` entries for sentence pair *n*).  For every ``i-j`` entry the
    i-th whitespace token of the left sentence is paired with the j-th
    token of the right sentence.  All pairs are then written, in input
    order, to ``OUTPUT_DIR/ALIGNED_CLEAN_FILE_NAME`` as ``left - right``
    lines.

    Every file is opened with the ``ENC`` encoding so the result does not
    depend on the platform's default locale.

    Raises:
        ValueError: a corpus line does not split into exactly two parts on
            ``" ||| "``, or an alignment entry is not of the form
            ``int-int``.
        IndexError: the alignment file has more lines than the corpus, or
            an alignment index points past the end of its sentence.
    """
    corpus_path = os.path.join(OUTPUT_DIR, OUT_FILE_NAME)
    alignment_path = os.path.join(OUTPUT_DIR, ALIGNED_FILE_NAME)
    cleaned_path = os.path.join(OUTPUT_DIR, ALIGNED_CLEAN_FILE_NAME)

    sentence_pairs = _read_sentence_pairs(corpus_path)

    word_pairs = []
    with open(alignment_path, encoding=ENC) as alignment_file:
        for line_no, alignment_line in enumerate(alignment_file, start=1):
            if line_no > len(sentence_pairs):
                raise IndexError(
                    f"{alignment_path}: line {line_no} has no matching "
                    f"sentence pair ({corpus_path} has only "
                    f"{len(sentence_pairs)} lines)"
                )
            left, right = sentence_pairs[line_no - 1]
            word_pairs.extend(
                _aligned_word_pairs(
                    alignment_line,
                    left.split(),
                    right.split(),
                    f"{alignment_path}: line {line_no}",
                )
            )

    # The output is opened only after all input was parsed successfully, so
    # malformed input never leaves a truncated result file behind.
    with open(cleaned_path, "w", encoding=ENC) as cleaned_file:
        cleaned_file.writelines(
            f"{left_word} - {right_word}\n"
            for left_word, right_word in word_pairs
        )


def _read_sentence_pairs(path):
    """Return the ``(left, right)`` sentence pairs stored in *path*."""
    pairs = []
    with open(path, encoding=ENC) as corpus_file:
        for line_no, line in enumerate(corpus_file, start=1):
            parts = line.split(" ||| ")
            if len(parts) != 2:
                raise ValueError(
                    f"{path}: line {line_no}: expected exactly one ' ||| ' "
                    f"separator, got {line!r}"
                )
            pairs.append((parts[0], parts[1]))
    return pairs


def _aligned_word_pairs(alignment_line, left_tokens, right_tokens, location):
    """Resolve the ``i-j`` entries of one alignment line to word pairs.

    *location* is a human-readable "file: line N" prefix for error messages.
    """
    pairs = []
    for entry in alignment_line.split():
        try:
            left_idx, right_idx = (int(part) for part in entry.split("-"))
        except ValueError as err:
            raise ValueError(
                f"{location}: malformed alignment entry {entry!r} "
                f"(expected 'int-int')"
            ) from err
        # Splitting on "-" means a parsed index can never be negative, so
        # only the upper bound needs checking.
        if left_idx >= len(left_tokens) or right_idx >= len(right_tokens):
            raise IndexError(
                f"{location}: alignment entry {entry!r} is out of range for "
                f"sentences of {len(left_tokens)} and {len(right_tokens)} "
                f"tokens"
            )
        pairs.append((left_tokens[left_idx], right_tokens[right_idx]))
    return pairs
|
||||||
|
|
||||||
|
|
||||||
|
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user