"""Convert raw chat transcripts into anonymised two-column TSV dialog files.

For every (operator, user) pair listed in ``operators_and_users`` the script
reads ``./data/raw/dialog-<op_id>-<user_id>-01.txt`` and writes
``./data/dialog-<op_id>-<user_id>-01.tsv`` with the columns ``kto`` (speaker
tag) and ``treść`` (utterance text).  Real names are replaced by the tags
``OP_ANON`` / ``USER_ANON``.
"""
import csv
from typing import Iterable, Iterator, Optional, Tuple

operators_and_users = {
    # operator (id, full name)
    ("06", "Krzysztof Bojakowski"): [
        # users (id, full name) who talked to that operator
        ("01", "Mikołaj Gawron"),
        ("05", "Patryk Osiński"),
        ("09", "Bartosz Wieczorek"),
        ("07", "Sergiusz Kański"),
        ("03", "Dawid Korzępa"),
    ]
}

# Tags written to the output instead of the participants' real names.
OP_ANON = "system"
USER_ANON = "user"


def iter_turns(
    lines: Iterable[str], user_name: str, op_name: str
) -> Iterator[Tuple[Optional[str], str]]:
    """Group transcript lines into speaker turns.

    A line containing ``user_name`` or ``op_name`` is treated as a header
    announcing who speaks next; every other non-blank line is message text.
    Consecutive messages from the same speaker are merged into one turn, with
    the individual lines joined by a literal backslash-n (two characters) so
    that each turn stays on a single TSV row.

    Args:
        lines: Raw transcript lines (trailing newlines are stripped here).
        user_name: Real name of the user as it appears in header lines.
        op_name: Real name of the operator as it appears in header lines.

    Yields:
        ``(speaker_tag, text)`` pairs.  ``speaker_tag`` is ``USER_ANON``,
        ``OP_ANON``, or ``None`` for text that precedes the first header.
    """
    # Checked in this order on purpose: if a line mentions both names the
    # operator wins, exactly as in the original sequential ``if`` statements.
    speakers = ((user_name, USER_ANON), (op_name, OP_ANON))

    whos_typing = None
    parts = []
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            # Blank lines are skipped.  They used to be mistaken for EOF,
            # which truncated any transcript containing an empty line.
            continue

        # NOTE(review): headers are detected by substring match, so a message
        # that merely mentions a participant's name is treated as a header —
        # confirm against the raw transcript format.
        if user_name not in line and op_name not in line:
            parts.append(line)

        for speaker_name, speaker_tag in speakers:
            if speaker_name in line:
                # Flush only when the speaker actually changes; a repeated
                # header from the same speaker keeps accumulating text.
                if parts and whos_typing != speaker_tag:
                    yield whos_typing, "\\n".join(parts)
                    parts = []
                whos_typing = speaker_tag

    if parts:
        yield whos_typing, "\\n".join(parts)


def convert_dialog(raw_path: str, tsv_path: str, user_name: str, op_name: str) -> None:
    """Read one raw transcript and write its anonymised TSV counterpart.

    Args:
        raw_path: Path of the UTF-8 transcript to read.
        tsv_path: Path of the TSV file to create (overwritten if present).
        user_name: Real name of the user in the transcript.
        op_name: Real name of the operator in the transcript.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Open the input first so a missing transcript does not leave a
    # header-only output file behind.
    with open(raw_path, "r", encoding="utf-8") as raw_file:
        # NOTE: newline='\n' disables newline translation on write, so rows
        # end with the csv writer's default line terminator ('\r\n').
        with open(tsv_path, "w", newline='\n', encoding="utf-8") as csvfile:
            spamwriter = csv.writer(
                csvfile, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL
            )
            spamwriter.writerow(["kto", "treść"])
            for who, text in iter_turns(raw_file, user_name, op_name):
                # The utterance is wrapped in single quotes in the output.
                spamwriter.writerow([who, f"'{text}'"])


def main() -> None:
    """Convert the transcripts of every configured operator/user pair."""
    for (op_id, op_name), users in operators_and_users.items():
        for user_id, user_name in users:
            convert_dialog(
                f"./data/raw/dialog-{op_id}-{user_id}-01.txt",
                f"./data/dialog-{op_id}-{user_id}-01.tsv",
                user_name,
                op_name,
            )


if __name__ == "__main__":
    main()