REK-proj-2/data_preprocessing/people_identifier.py

78 lines
3.2 KiB
Python
Raw Normal View History

2021-06-28 20:18:14 +02:00
# Load libraries ---------------------------------------------
# ------------------------------------------------------------
class PeopleIdentifier(object):
    """Group rows that share identifier values under a common person id (pid).

    Two rows end up with the same pid when they share a value in any of the
    identifier columns, directly or through a chain of other rows. The empty
    string ``""`` is treated as "no identifier" and never links rows.
    """

    def __init__(self):
        self.id_column_names = []
        self.pid_cname = ""
        self.next_available_pid = 0
        # {"col1": {cid1: pid1, cid2: pid2}, "col2": ...}
        self.cid_to_pid = {}
        # {pid1: {"col1": {cid1, cid2, ...}, "col2": {...}, ...}, pid2: ...}
        self.pid_to_cid = {}
        self.data = None

    def add_pid(self, data, id_column_names, pid_cname):
        """Return a copy of ``data`` with an extra column holding person ids.

        Args:
            data: pandas DataFrame with one row per reservation.
            id_column_names: names of the columns holding identifier values.
            pid_cname: name of the column that receives the person id.

        Returns:
            A copy of ``data`` with ``pid_cname`` filled in; the same frame is
            also stored in ``self.data``. The input frame is not modified.
        """
        self.id_column_names = id_column_names
        self.pid_cname = pid_cname
        for cid_cname in id_column_names:
            self.cid_to_pid[cid_cname] = {}

        # One entry per row: ((column, cid) of the first non-empty identifier
        # or None, pid assigned while the row was processed).
        row_anchors = []
        for _, reservation in data.iterrows():
            pids = set()
            anchor = None
            for cid_cname in id_column_names:
                cid = reservation[cid_cname]
                if anchor is None and cid != "":
                    anchor = (cid_cname, cid)
                if cid in self.cid_to_pid[cid_cname]:
                    pids.add(self.cid_to_pid[cid_cname][cid])

            if pids:
                pid = min(pids)
                self.set_pid(pid, reservation)
                # This row connects every pid it touched: fold them into one.
                pids.discard(pid)
                if pids:
                    self.merge_pids(pids, pid)
            else:
                pid = self.next_available_pid
                self.next_available_pid += 1
                self.set_pid(pid, reservation)
            row_anchors.append((anchor, pid))

        # A pid assigned early may have been merged away later, so resolve the
        # final pid through any non-empty identifier of the row. Rows without
        # identifiers can never be merged and keep the pid they were given.
        data_pid = data.copy()
        data_pid[pid_cname] = [
            pid if anchor is None else self.cid_to_pid[anchor[0]][anchor[1]]
            for anchor, pid in row_anchors
        ]
        self.data = data_pid
        return data_pid

    def set_pid(self, pid, reservation):
        """Register every non-empty identifier of ``reservation`` under ``pid``."""
        if pid not in self.pid_to_cid:
            self.pid_to_cid[pid] = {
                cid_cname: set() for cid_cname in self.id_column_names
            }
        cids_by_column = self.pid_to_cid[pid]
        for cid_cname in self.id_column_names:
            cid = reservation[cid_cname]
            if cid != "":
                self.cid_to_pid[cid_cname][cid] = pid
                cids_by_column[cid_cname].add(cid)

    def merge_pids(self, pids_from, pid_to):
        """Move all identifiers of each pid in ``pids_from`` to ``pid_to``.

        The merged pids are removed from ``pid_to_cid``.
        """
        for pid_from in pids_from:
            for cid_cname in self.id_column_names:
                moved_cids = self.pid_to_cid[pid_from][cid_cname]
                for cid in moved_cids:
                    self.cid_to_pid[cid_cname][cid] = pid_to
                self.pid_to_cid[pid_to][cid_cname] |= moved_cids
            self.pid_to_cid.pop(pid_from)