import re

# --- Configuration -----------------------------------------------------------
# Paths are relative to the notebook's working directory.
INPUT_PATH = 'test-A/out.tsv'
OUTPUT_PATH = 'test-A/parsed.tsv'

# Text used for both text columns when nothing precedes the brace payload.
DEFAULT_TEXT = "Airconditioner"

# Leftmost match of either:
#   * "{" through the last "}" on the line, or
#   * a "}" that has no "{" after it, through the end of the line.
# Compiled once because it is applied to every input line.
_PAYLOAD_RE = re.compile(r'({.*}|\}[^{]*$)')


def format_line(line, default_text=DEFAULT_TEXT):
    """Convert one raw input line into a ``text<TAB>text<TAB>data`` row.

    Args:
        line: A single line as read from the input file (a trailing newline
            is allowed).
        default_text: Replacement used when nothing but whitespace precedes
            the brace payload.

    Returns:
        The formatted row terminated by ``"\\n"``, or ``line`` unchanged when
        it contains neither a ``{...}`` span nor a trailing ``}``.
    """
    match = _PAYLOAD_RE.search(line)
    if match is None:
        # No brace payload on this line: pass it through untouched.
        return line

    data = match.group(0).strip()
    # Everything left of the payload is the text segment; fall back to the
    # default when that segment is empty.
    text = line[:match.start()].strip() or default_text

    # Only the "}" alternative can produce a payload without an opening
    # brace; prepend one so the payload starts with "{".
    if not data.startswith('{'):
        data = '{' + data

    # The text is intentionally emitted in two columns, followed by the data.
    return f"{text}\t{text}\t{data}\n"


def convert_file(input_path, output_path, default_text=DEFAULT_TEXT):
    """Rewrite every line of ``input_path`` into ``output_path``.

    Both files are opened as UTF-8 text. Each input line is passed through
    ``format_line`` and the result is written in the same order.

    Args:
        input_path: File to read.
        output_path: File to create or overwrite.
        default_text: Forwarded to ``format_line``.

    Returns:
        The number of lines written.

    Raises:
        OSError: If either file cannot be opened.
    """
    written = 0
    with open(input_path, 'r', encoding="utf-8") as infile, \
            open(output_path, 'w', encoding="utf-8") as outfile:
        for line in infile:
            outfile.write(format_line(line, default_text))
            written += 1
    return written


# In a notebook kernel __name__ is "__main__", so this runs when the cell is
# executed; the guard only keeps file I/O from firing if the code is imported.
if __name__ == "__main__":
    convert_file(INPUT_PATH, OUTPUT_PATH)
    # Report the path that was actually written.
    print(f"Data has been formatted and saved to {OUTPUT_PATH}")