cnlps-caiccaic/format.ipynb
Jakub Adamski bc5869a233 solution
2023-06-11 23:57:58 +02:00

2.1 KiB

import re

# Default locations of the raw input and of the normalised result.
INPUT_PATH = 'dev-A/out.tsv'
OUTPUT_PATH = 'dev-A/parsed.tsv'

# Matches either a brace-delimited payload "{...}" (greedy: from the first "{"
# to the last "}" on the line) or a literal backslash followed by "}".
# NOTE(review): the earlier comment described the second alternative as a bare
# "}", but in a raw string `\\}` requires a preceding backslash -- confirm
# which form the input data actually contains before changing the pattern.
DATA_PATTERN = re.compile(r'({.*}|\\})')


def format_line(line: str) -> str:
    """Return `line` rewritten as tab-separated fields ending with its data part.

    The "data part" is the first match of DATA_PATTERN in the line. The text
    before it is split on whitespace into segments, and the result is built as:

    * no segments       -> "<data>\\n"
    * exactly one       -> "<seg>\\t<seg>\\t<data>\\n" (the segment is duplicated)
    * two or more       -> all segments joined by tabs, then "\\t<data>\\n"

    Anything that follows the data part on the line is discarded. A line with
    no data part is returned unchanged (including its original line ending).
    """
    match = DATA_PATTERN.search(line)
    if match is None:
        return line

    data = match.group(0)
    # split() with no arguments already ignores leading/trailing whitespace.
    segments = line[:match.start()].split()
    if len(segments) == 1:
        # Duplicate a lone segment so the row still has two text columns.
        segments = segments * 2
    return '\t'.join(segments + [data]) + '\n'


def format_file(input_path: str = INPUT_PATH, output_path: str = OUTPUT_PATH) -> None:
    """Read `input_path` line by line and write each formatted line to `output_path`.

    Both files are opened as UTF-8 text; `output_path` is overwritten.
    """
    with open(input_path, 'r', encoding="utf-8") as infile, \
            open(output_path, 'w', encoding="utf-8") as outfile:
        outfile.writelines(format_line(line) for line in infile)


if __name__ == "__main__":
    format_file()
    # Report the file that was actually written; the previous message named
    # "output.txt", which this script never creates.
    print("Data has been formatted and saved to " + OUTPUT_PATH)