# chat-restaruacja/tsv_data_gen.py
#
# (Repository web-view header, kept as a comment: 60 lines, 2.2 KiB, Python.)

import csv
# Dialog participants: each operator is mapped to the users they talked to.
# Every participant is an (id, full name) tuple. The ids are used to build the
# dialog file names (dialog-<operator id>-<user id>-01) and the full names are
# searched for in the raw transcript lines to tell who is speaking.
operators_and_users = {
    # operator
    ("06", "Krzysztof Bojakowski"): [
        # users
        ("01", "Mikołaj Gawron"),
        ("05", "Patryk Osiński"),
        ("09", "Bartosz Wieczorek"),
        ("07", "Sergiusz Kański"),
        ("03", "Dawid Korzępa"),
    ],
    ("05", "Patryk Osiński"): [
        ("02", "Anna Śmigiel"),
        ("04", "Michał Kasprowicz"),
        ("06", "Krzysztof Bojakowski"),
        ("08", "Krystian Osiński"),
    ],
}
# Anonymised speaker labels written to the "kto" column instead of real names.
OP_ANON = "system"
USER_ANON = "user"


def dialog_rows(lines, op_name, user_name):
    """Turn raw transcript lines into anonymised ``[speaker, text]`` rows.

    A line containing ``user_name`` or ``op_name`` is treated as a speaker
    marker and is not copied to the output (presumably it is the chat
    client's "name + timestamp" header -- verify against the raw files).
    Every other non-blank line is message text. Consecutive text lines that
    belong to the same speaker are merged into one row, joined with a literal
    backslash-n sequence so that each row stays on a single TSV line.

    Args:
        lines: Iterable of raw transcript lines (e.g. an open text file).
        op_name: Full name of the operator; mapped to ``OP_ANON``.
        user_name: Full name of the user; mapped to ``USER_ANON``.

    Returns:
        A list of ``[speaker, text]`` lists in transcript order. ``speaker``
        is ``USER_ANON`` or ``OP_ANON``, or ``None`` for text that appears
        before the first speaker marker.
    """
    rows = []
    whos_typing = None
    text = ""
    for raw_line in lines:
        # Git preview can't display data with " in line
        line = raw_line.strip().replace('"', "'")
        if not line:
            # A blank line is not the end of the file: skip it instead of
            # stopping, otherwise the rest of the dialog is silently lost.
            continue

        from_user = user_name in line
        from_op = op_name in line

        if not from_user and not from_op:
            text = f"{text}\\n{line}" if text else line
        if from_user:
            # Flush the pending text only when the speaker actually changes.
            if text and whos_typing != USER_ANON:
                rows.append([whos_typing, text])
                text = ""
            whos_typing = USER_ANON
        if from_op:
            if text and whos_typing != OP_ANON:
                rows.append([whos_typing, text])
                text = ""
            whos_typing = OP_ANON

    if text:
        # The last utterance has no following speaker marker to flush it.
        rows.append([whos_typing, text])
    return rows


def main():
    """Convert every raw dialog transcript into an anonymised TSV file.

    For each operator/user pair in ``operators_and_users`` this reads
    ``./data/raw/dialog-<op_id>-<user_id>-01.txt`` and writes
    ``./data/dialog-<op_id>-<user_id>-01.tsv`` with a ``kto``/``treść`` header
    followed by one row per utterance.

    Raises:
        OSError: If a raw transcript cannot be read or a TSV cannot be
            written. The raw file is read first, so a missing transcript does
            not leave a header-only TSV behind.
    """
    for (op_id, op_name), users in operators_and_users.items():
        for user_id, user_name in users:
            raw_path = f"./data/raw/dialog-{op_id}-{user_id}-01.txt"
            tsv_path = f"./data/dialog-{op_id}-{user_id}-01.tsv"

            with open(raw_path, "r", encoding="utf-8") as f:
                rows = dialog_rows(f, op_name, user_name)

            with open(tsv_path, "w", newline='\n', encoding="utf-8") as csvfile:
                spamwriter = csv.writer(csvfile, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
                spamwriter.writerow(["kto", "treść"])
                spamwriter.writerows(rows)


# Run the conversion only when executed as a script, so the parsing helper can
# be imported (e.g. by tests) without touching the file system.
if __name__ == "__main__":
    main()