2.1 KiB
2.1 KiB
import re
# Reformat 'dev-A/out.tsv' into 'dev-A/parsed.tsv': every line that carries a
# "{...}" payload (or a closing brace) is rewritten as whitespace-free fields
# joined by tabs, followed by the payload.

INPUT_PATH = 'dev-A/out.tsv'
OUTPUT_PATH = 'dev-A/parsed.tsv'

# Compiled once rather than being looked up for every line.
# Matches "{...}" (greedy: from the first "{" to the last "}" on the line) or a
# backslash-escaped closing brace.
DATA_PATTERN = re.compile(r'\{.*\}|\\\}')
# Fallback for a bare "}". The primary pattern only recognises the escaped
# form, so lines holding just "}" used to be copied through unformatted even
# though they were meant to be handled. It is consulted only when the primary
# pattern finds nothing, so lines that already matched keep the same output.
BARE_BRACE_PATTERN = re.compile(r'\}')


def format_line(line):
    """Return the reformatted version of one input line.

    The data part is the first match of "{...}" or an escaped closing brace,
    or, failing that, the first bare "}". The text before it is split on
    whitespace into segments; anything after the data part is dropped.

    * No data part: the line is returned unchanged (including its own line
      ending, if any).
    * No segments: ``data + "\\n"``.
    * One segment: the segment is duplicated, giving ``text\\ttext\\tdata\\n``.
    * Several segments: they are joined with tabs, then ``\\tdata\\n``.
    """
    match = DATA_PATTERN.search(line) or BARE_BRACE_PATTERN.search(line)
    if match is None:
        return line

    data = match.group(0)
    # split() with no argument already ignores leading/trailing whitespace.
    segments = line[:match.start()].split()
    if not segments:
        return data + '\n'
    if len(segments) == 1:
        # Duplicate the single field to conform to "text\ttext\t{data}".
        segments = segments * 2
    return '\t'.join(segments) + '\t' + data + '\n'


def main(input_path=INPUT_PATH, output_path=OUTPUT_PATH):
    """Read *input_path* line by line and write the formatted lines to *output_path*.

    Both files are opened as UTF-8 text; the output file is overwritten.
    """
    with open(input_path, 'r', encoding="utf-8") as infile, \
            open(output_path, 'w', encoding="utf-8") as outfile:
        outfile.writelines(format_line(line) for line in infile)
    # Report the file that was actually written (the old message named a
    # non-existent "output.txt").
    print("Data has been formatted and saved to " + output_path)


if __name__ == "__main__":
    main()