78 lines
3.2 KiB
Python
78 lines
3.2 KiB
Python
# Load libraries ---------------------------------------------
|
|
|
|
# ------------------------------------------------------------
|
|
|
|
|
|
class PeopleIdentifier(object):
    """Group table rows into people by linking shared identifier values.

    Every row carries one value per identifier column (a "cid").  Rows that
    share a non-missing cid in the same column, directly or through a chain
    of other rows, receive the same integer person id (a "pid").
    """

    def __init__(self):
        self.id_column_names = []  # identifier columns of the latest add_pid call
        self.pid_cname = ""  # name of the pid column of the latest add_pid call
        self.next_available_pid = 0  # only ever incremented, so pids are never reused
        self.cid_to_pid = {}  # {"col1": {cid1: pid1, cid2: pid2}, "col2": ...}
        self.pid_to_cid = {}  # {pid1: {"col1": {cid1, cid2, ...}, "col2": {...}, ...}, pid2: ...}
        self.data = None  # DataFrame returned by the latest add_pid call

    @staticmethod
    def _is_missing(cid):
        """Return True when *cid* holds no identifier (None, "" or float NaN)."""
        if cid is None:
            return True
        if isinstance(cid, float):
            # NaN is the only float that differs from itself.  It must never
            # become a dict key: it would either link unrelated rows or be
            # impossible to look up again.
            return cid != cid
        return cid == ""

    def add_pid(self, data, id_column_names, pid_cname):
        """Return a copy of *data* with a person-id column appended.

        Args:
            data: table exposing ``iterrows()`` and ``copy()`` (a pandas
                DataFrame), one record per row.
            id_column_names: names of the columns holding identifiers.
            pid_cname: name of the column that receives the person id.

        Returns:
            A copy of *data* with column *pid_cname* set to each row's final
            pid.  The same object is kept in ``self.data``.
        """
        self.id_column_names = list(id_column_names)
        self.pid_cname = pid_cname

        for cid_cname in self.id_column_names:
            self.cid_to_pid[cid_cname] = {}

        assigned_pids = []  # pid each row received when it was processed
        merged_into = {}  # absorbed pid -> pid that absorbed it

        for _, reservation in data.iterrows():
            pids = set()
            for cid_cname in self.id_column_names:
                cid = reservation[cid_cname]
                if self._is_missing(cid):
                    continue
                known = self.cid_to_pid[cid_cname]
                if cid in known:
                    pids.add(known[cid])

            if pids:
                pid = min(pids)
                self.set_pid(pid, reservation)

                # This row connects every pid it matched: fold the others
                # into the smallest one.
                if len(pids) > 1:
                    pids.remove(pid)
                    self.merge_pids(pids, pid)
                    for absorbed in pids:
                        merged_into[absorbed] = pid
            else:
                pid = self.next_available_pid
                self.next_available_pid += 1
                self.set_pid(pid, reservation)

            assigned_pids.append(pid)

        # Resolve each row through the merge record instead of looking its
        # first identifier up again: that lookup fails for rows whose first
        # identifier (or every identifier) is missing.
        data_pid = data.copy()
        data_pid[pid_cname] = [
            self._resolve_pid(pid, merged_into) for pid in assigned_pids
        ]
        self.data = data_pid

        return data_pid

    @staticmethod
    def _resolve_pid(pid, merged_into):
        """Follow the merge chain from *pid* to the pid that finally absorbed it."""
        # add_pid always merges into the smallest pid, so every step strictly
        # decreases and the loop terminates.
        while pid in merged_into:
            pid = merged_into[pid]
        return pid

    def set_pid(self, pid, reservation):
        """Attach every non-missing identifier of *reservation* to *pid*.

        Args:
            pid: person id to assign.
            reservation: mapping-like row indexable by identifier column name.
        """
        person = self.pid_to_cid.setdefault(pid, {})
        for cid_cname in self.id_column_names:
            # Every person keeps a (possibly empty) set per identifier column.
            cids = person.setdefault(cid_cname, set())
            cid = reservation[cid_cname]
            if self._is_missing(cid):
                continue
            self.cid_to_pid.setdefault(cid_cname, {})[cid] = pid
            cids.add(cid)

    def merge_pids(self, pids_from, pid_to):
        """Fold the people in *pids_from* into *pid_to*.

        Each identifier owned by an absorbed pid is re-pointed to *pid_to*,
        and the absorbed pid is removed from ``pid_to_cid``.

        Args:
            pids_from: iterable of pids to absorb.
            pid_to: pid that survives; must already exist in ``pid_to_cid``.
        """
        target = self.pid_to_cid[pid_to]
        for pid_from in pids_from:
            if pid_from == pid_to:
                # Merging a person into itself must not delete it.
                continue
            absorbed = self.pid_to_cid.pop(pid_from)
            for cid_cname, cids in absorbed.items():
                lookup = self.cid_to_pid.setdefault(cid_cname, {})
                for cid in cids:
                    lookup[cid] = pid_to
                target.setdefault(cid_cname, set()).update(cids)
|