cnlps-caiccaic/format.ipynb

2.3 KiB

import re

# Matches either a complete "{...}" payload (greedy: first "{" to last "}" on
# the line) or, when no such payload exists, a bare "}" followed by brace-free
# text running to the end of the line. The braces are escaped so they are
# always treated as literal characters.
_DATA_PATTERN = re.compile(r'(\{.*\}|\}[^{]*$)')


def _format_line(line, default_text):
    """Return the output row for a single input line.

    If the line contains a "{...}" segment (or a dangling "}" with no opening
    brace), the row is ``text<TAB>text<TAB>data<NEWLINE>`` where ``text`` is
    the stripped content preceding the match and ``data`` is the stripped
    match. A missing opening brace is restored, and an empty ``text`` is
    replaced by ``default_text``. Lines without any match are returned
    unchanged.
    """
    match = _DATA_PATTERN.search(line)
    if match is None:
        # Nothing that looks like a data segment: pass the line through as is.
        return line

    data = match.group(0).strip()
    if not data.startswith('{'):
        # Only the closing brace was present; restore the opening one.
        data = '{' + data

    # Fall back to the default when nothing precedes the data segment.
    text = line[:match.start()].strip() or default_text

    return f"{text}\t{text}\t{data}\n"


def main(in_path='test-A/out.tsv', out_path='test-A/parsed.tsv',
         default_text="Airconditioner"):
    """Reformat every line of ``in_path`` and write the result to ``out_path``.

    Args:
        in_path: UTF-8 text file to read, one record per line.
        out_path: UTF-8 text file to (over)write with the formatted rows.
        default_text: Text used for rows whose text segment is missing.
    """
    with open(in_path, 'r', encoding="utf-8") as infile, \
            open(out_path, 'w', encoding="utf-8") as outfile:
        for line in infile:
            outfile.write(_format_line(line, default_text))

    # Report the file that was actually written.
    print(f"Data has been formatted and saved to {out_path}")


# __name__ is "__main__" both when run as a script and inside a notebook cell,
# so the conversion still runs there; a plain import no longer touches files.
if __name__ == "__main__":
    main()
Data has been formatted and saved to output.txt