Inzynierka_Gwiazdy/machine_learning/Lib/site-packages/pandas/_libs/join.pyx

898 lines
28 KiB
Cython
Raw Normal View History

2023-09-20 19:46:58 +02:00
cimport cython
from cython cimport Py_ssize_t
import numpy as np
cimport numpy as cnp
from numpy cimport (
int64_t,
intp_t,
ndarray,
uint64_t,
)
cnp.import_array()
from pandas._libs.algos import groupsort_indexer
from pandas._libs.dtypes cimport (
numeric_object_t,
numeric_t,
)
@cython.wraparound(False)
@cython.boundscheck(False)
def inner_join(const intp_t[:] left, const intp_t[:] right,
Py_ssize_t max_groups):
cdef:
Py_ssize_t i, j, k, count = 0
intp_t[::1] left_sorter, right_sorter
intp_t[::1] left_count, right_count
intp_t[::1] left_indexer, right_indexer
intp_t lc, rc
Py_ssize_t left_pos = 0, right_pos = 0, position = 0
Py_ssize_t offset
left_sorter, left_count = groupsort_indexer(left, max_groups)
right_sorter, right_count = groupsort_indexer(right, max_groups)
with nogil:
# First pass, determine size of result set, do not use the NA group
for i in range(1, max_groups + 1):
lc = left_count[i]
rc = right_count[i]
if rc > 0 and lc > 0:
count += lc * rc
left_indexer = np.empty(count, dtype=np.intp)
right_indexer = np.empty(count, dtype=np.intp)
with nogil:
# exclude the NA group
left_pos = left_count[0]
right_pos = right_count[0]
for i in range(1, max_groups + 1):
lc = left_count[i]
rc = right_count[i]
if rc > 0 and lc > 0:
for j in range(lc):
offset = position + j * rc
for k in range(rc):
left_indexer[offset + k] = left_pos + j
right_indexer[offset + k] = right_pos + k
position += lc * rc
left_pos += lc
right_pos += rc
# Will overwrite left/right indexer with the result
_get_result_indexer(left_sorter, left_indexer)
_get_result_indexer(right_sorter, right_indexer)
return np.asarray(left_indexer), np.asarray(right_indexer)
@cython.wraparound(False)
@cython.boundscheck(False)
def left_outer_join(const intp_t[:] left, const intp_t[:] right,
Py_ssize_t max_groups, bint sort=True):
cdef:
Py_ssize_t i, j, k, count = 0
ndarray[intp_t] rev
intp_t[::1] left_count, right_count
intp_t[::1] left_sorter, right_sorter
intp_t[::1] left_indexer, right_indexer
intp_t lc, rc
Py_ssize_t left_pos = 0, right_pos = 0, position = 0
Py_ssize_t offset
left_sorter, left_count = groupsort_indexer(left, max_groups)
right_sorter, right_count = groupsort_indexer(right, max_groups)
with nogil:
# First pass, determine size of result set, do not use the NA group
for i in range(1, max_groups + 1):
lc = left_count[i]
rc = right_count[i]
if rc > 0:
count += lc * rc
else:
count += lc
left_indexer = np.empty(count, dtype=np.intp)
right_indexer = np.empty(count, dtype=np.intp)
with nogil:
# exclude the NA group
left_pos = left_count[0]
right_pos = right_count[0]
for i in range(1, max_groups + 1):
lc = left_count[i]
rc = right_count[i]
if rc == 0:
for j in range(lc):
left_indexer[position + j] = left_pos + j
right_indexer[position + j] = -1
position += lc
else:
for j in range(lc):
offset = position + j * rc
for k in range(rc):
left_indexer[offset + k] = left_pos + j
right_indexer[offset + k] = right_pos + k
position += lc * rc
left_pos += lc
right_pos += rc
# Will overwrite left/right indexer with the result
_get_result_indexer(left_sorter, left_indexer)
_get_result_indexer(right_sorter, right_indexer)
if not sort: # if not asked to sort, revert to original order
if len(left) == len(left_indexer):
# no multiple matches for any row on the left
# this is a short-cut to avoid groupsort_indexer
# otherwise, the `else` path also works in this case
rev = np.empty(len(left), dtype=np.intp)
rev.put(np.asarray(left_sorter), np.arange(len(left)))
else:
rev, _ = groupsort_indexer(left_indexer, len(left))
return np.asarray(left_indexer).take(rev), np.asarray(right_indexer).take(rev)
else:
return np.asarray(left_indexer), np.asarray(right_indexer)
@cython.wraparound(False)
@cython.boundscheck(False)
def full_outer_join(const intp_t[:] left, const intp_t[:] right,
Py_ssize_t max_groups):
cdef:
Py_ssize_t i, j, k, count = 0
intp_t[::1] left_sorter, right_sorter
intp_t[::1] left_count, right_count
intp_t[::1] left_indexer, right_indexer
intp_t lc, rc
intp_t left_pos = 0, right_pos = 0
Py_ssize_t offset, position = 0
left_sorter, left_count = groupsort_indexer(left, max_groups)
right_sorter, right_count = groupsort_indexer(right, max_groups)
with nogil:
# First pass, determine size of result set, do not use the NA group
for i in range(1, max_groups + 1):
lc = left_count[i]
rc = right_count[i]
if rc > 0 and lc > 0:
count += lc * rc
else:
count += lc + rc
left_indexer = np.empty(count, dtype=np.intp)
right_indexer = np.empty(count, dtype=np.intp)
with nogil:
# exclude the NA group
left_pos = left_count[0]
right_pos = right_count[0]
for i in range(1, max_groups + 1):
lc = left_count[i]
rc = right_count[i]
if rc == 0:
for j in range(lc):
left_indexer[position + j] = left_pos + j
right_indexer[position + j] = -1
position += lc
elif lc == 0:
for j in range(rc):
left_indexer[position + j] = -1
right_indexer[position + j] = right_pos + j
position += rc
else:
for j in range(lc):
offset = position + j * rc
for k in range(rc):
left_indexer[offset + k] = left_pos + j
right_indexer[offset + k] = right_pos + k
position += lc * rc
left_pos += lc
right_pos += rc
# Will overwrite left/right indexer with the result
_get_result_indexer(left_sorter, left_indexer)
_get_result_indexer(right_sorter, right_indexer)
return np.asarray(left_indexer), np.asarray(right_indexer)
@cython.wraparound(False)
@cython.boundscheck(False)
cdef void _get_result_indexer(intp_t[::1] sorter, intp_t[::1] indexer) nogil:
"""NOTE: overwrites indexer with the result to avoid allocating another array"""
cdef:
Py_ssize_t i, n, idx
if len(sorter) > 0:
# cython-only equivalent to
# `res = algos.take_nd(sorter, indexer, fill_value=-1)`
n = indexer.shape[0]
for i in range(n):
idx = indexer[i]
if idx == -1:
indexer[i] = -1
else:
indexer[i] = sorter[idx]
else:
# length-0 case
indexer[:] = -1
@cython.wraparound(False)
@cython.boundscheck(False)
def ffill_indexer(const intp_t[:] indexer) -> np.ndarray:
cdef:
Py_ssize_t i, n = len(indexer)
ndarray[intp_t] result
intp_t val, last_obs
result = np.empty(n, dtype=np.intp)
last_obs = -1
for i in range(n):
val = indexer[i]
if val == -1:
result[i] = last_obs
else:
result[i] = val
last_obs = val
return result
# ----------------------------------------------------------------------
# left_join_indexer, inner_join_indexer, outer_join_indexer
# ----------------------------------------------------------------------
# Joins on ordered, unique indices
# right might contain non-unique values
@cython.wraparound(False)
@cython.boundscheck(False)
def left_join_indexer_unique(
ndarray[numeric_object_t] left,
ndarray[numeric_object_t] right
):
"""
Both left and right are strictly monotonic increasing.
"""
cdef:
Py_ssize_t i, j, nleft, nright
ndarray[intp_t] indexer
numeric_object_t rval
i = 0
j = 0
nleft = len(left)
nright = len(right)
indexer = np.empty(nleft, dtype=np.intp)
while True:
if i == nleft:
break
if j == nright:
indexer[i] = -1
i += 1
continue
rval = right[j]
while i < nleft - 1 and left[i] == rval:
indexer[i] = j
i += 1
if left[i] == rval:
indexer[i] = j
i += 1
while i < nleft - 1 and left[i] == rval:
indexer[i] = j
i += 1
j += 1
elif left[i] > rval:
indexer[i] = -1
j += 1
else:
indexer[i] = -1
i += 1
return indexer
@cython.wraparound(False)
@cython.boundscheck(False)
def left_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right):
"""
Two-pass algorithm for monotonic indexes. Handles many-to-one merges.
Both left and right are monotonic increasing, but at least one of them
is non-unique (if both were unique we'd use left_join_indexer_unique).
"""
cdef:
Py_ssize_t i, j, nright, nleft, count
numeric_object_t lval, rval
ndarray[intp_t] lindexer, rindexer
ndarray[numeric_object_t] result
nleft = len(left)
nright = len(right)
# First pass is to find the size 'count' of our output indexers.
i = 0
j = 0
count = 0
if nleft > 0:
while i < nleft:
if j == nright:
count += nleft - i
break
lval = left[i]
rval = right[j]
if lval == rval:
# This block is identical across
# left_join_indexer, inner_join_indexer, outer_join_indexer
count += 1
if i < nleft - 1:
if j < nright - 1 and right[j + 1] == rval:
j += 1
else:
i += 1
if left[i] != rval:
j += 1
elif j < nright - 1:
j += 1
if lval != right[j]:
i += 1
else:
# end of the road
break
elif lval < rval:
count += 1
i += 1
else:
j += 1
# do it again now that result size is known
lindexer = np.empty(count, dtype=np.intp)
rindexer = np.empty(count, dtype=np.intp)
result = np.empty(count, dtype=left.dtype)
i = 0
j = 0
count = 0
if nleft > 0:
while i < nleft:
if j == nright:
while i < nleft:
lindexer[count] = i
rindexer[count] = -1
result[count] = left[i]
i += 1
count += 1
break
lval = left[i]
rval = right[j]
if lval == rval:
lindexer[count] = i
rindexer[count] = j
result[count] = lval
count += 1
if i < nleft - 1:
if j < nright - 1 and right[j + 1] == rval:
j += 1
else:
i += 1
if left[i] != rval:
j += 1
elif j < nright - 1:
j += 1
if lval != right[j]:
i += 1
else:
# end of the road
break
elif lval < rval:
# i.e. lval not in right; we keep for left_join_indexer
lindexer[count] = i
rindexer[count] = -1
result[count] = lval
count += 1
i += 1
else:
# i.e. rval not in left; we discard for left_join_indexer
j += 1
return result, lindexer, rindexer
@cython.wraparound(False)
@cython.boundscheck(False)
def inner_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right):
"""
Two-pass algorithm for monotonic indexes. Handles many-to-one merges.
Both left and right are monotonic increasing but not necessarily unique.
"""
cdef:
Py_ssize_t i, j, nright, nleft, count
numeric_object_t lval, rval
ndarray[intp_t] lindexer, rindexer
ndarray[numeric_object_t] result
nleft = len(left)
nright = len(right)
# First pass is to find the size 'count' of our output indexers.
i = 0
j = 0
count = 0
if nleft > 0 and nright > 0:
while True:
if i == nleft:
break
if j == nright:
break
lval = left[i]
rval = right[j]
if lval == rval:
count += 1
if i < nleft - 1:
if j < nright - 1 and right[j + 1] == rval:
j += 1
else:
i += 1
if left[i] != rval:
j += 1
elif j < nright - 1:
j += 1
if lval != right[j]:
i += 1
else:
# end of the road
break
elif lval < rval:
# i.e. lval not in right; we discard for inner_indexer
i += 1
else:
# i.e. rval not in left; we discard for inner_indexer
j += 1
# do it again now that result size is known
lindexer = np.empty(count, dtype=np.intp)
rindexer = np.empty(count, dtype=np.intp)
result = np.empty(count, dtype=left.dtype)
i = 0
j = 0
count = 0
if nleft > 0 and nright > 0:
while True:
if i == nleft:
break
if j == nright:
break
lval = left[i]
rval = right[j]
if lval == rval:
lindexer[count] = i
rindexer[count] = j
result[count] = lval
count += 1
if i < nleft - 1:
if j < nright - 1 and right[j + 1] == rval:
j += 1
else:
i += 1
if left[i] != rval:
j += 1
elif j < nright - 1:
j += 1
if lval != right[j]:
i += 1
else:
# end of the road
break
elif lval < rval:
# i.e. lval not in right; we discard for inner_indexer
i += 1
else:
# i.e. rval not in left; we discard for inner_indexer
j += 1
return result, lindexer, rindexer
@cython.wraparound(False)
@cython.boundscheck(False)
def outer_join_indexer(ndarray[numeric_object_t] left, ndarray[numeric_object_t] right):
"""
Both left and right are monotonic increasing but not necessarily unique.
"""
cdef:
Py_ssize_t i, j, nright, nleft, count
numeric_object_t lval, rval
ndarray[intp_t] lindexer, rindexer
ndarray[numeric_object_t] result
nleft = len(left)
nright = len(right)
# First pass is to find the size 'count' of our output indexers.
# count will be length of left plus the number of elements of right not in
# left (counting duplicates)
i = 0
j = 0
count = 0
if nleft == 0:
count = nright
elif nright == 0:
count = nleft
else:
while True:
if i == nleft:
count += nright - j
break
if j == nright:
count += nleft - i
break
lval = left[i]
rval = right[j]
if lval == rval:
count += 1
if i < nleft - 1:
if j < nright - 1 and right[j + 1] == rval:
j += 1
else:
i += 1
if left[i] != rval:
j += 1
elif j < nright - 1:
j += 1
if lval != right[j]:
i += 1
else:
# end of the road
break
elif lval < rval:
count += 1
i += 1
else:
count += 1
j += 1
lindexer = np.empty(count, dtype=np.intp)
rindexer = np.empty(count, dtype=np.intp)
result = np.empty(count, dtype=left.dtype)
# do it again, but populate the indexers / result
i = 0
j = 0
count = 0
if nleft == 0:
for j in range(nright):
lindexer[j] = -1
rindexer[j] = j
result[j] = right[j]
elif nright == 0:
for i in range(nleft):
lindexer[i] = i
rindexer[i] = -1
result[i] = left[i]
else:
while True:
if i == nleft:
while j < nright:
lindexer[count] = -1
rindexer[count] = j
result[count] = right[j]
count += 1
j += 1
break
if j == nright:
while i < nleft:
lindexer[count] = i
rindexer[count] = -1
result[count] = left[i]
count += 1
i += 1
break
lval = left[i]
rval = right[j]
if lval == rval:
lindexer[count] = i
rindexer[count] = j
result[count] = lval
count += 1
if i < nleft - 1:
if j < nright - 1 and right[j + 1] == rval:
j += 1
else:
i += 1
if left[i] != rval:
j += 1
elif j < nright - 1:
j += 1
if lval != right[j]:
i += 1
else:
# end of the road
break
elif lval < rval:
# i.e. lval not in right; we keep for outer_join_indexer
lindexer[count] = i
rindexer[count] = -1
result[count] = lval
count += 1
i += 1
else:
# i.e. rval not in left; we keep for outer_join_indexer
lindexer[count] = -1
rindexer[count] = j
result[count] = rval
count += 1
j += 1
return result, lindexer, rindexer
# ----------------------------------------------------------------------
# asof_join_by
# ----------------------------------------------------------------------
from pandas._libs.hashtable cimport (
HashTable,
Int64HashTable,
PyObjectHashTable,
UInt64HashTable,
)
ctypedef fused by_t:
object
int64_t
uint64_t
def asof_join_backward_on_X_by_Y(numeric_t[:] left_values,
numeric_t[:] right_values,
by_t[:] left_by_values,
by_t[:] right_by_values,
bint allow_exact_matches=True,
tolerance=None,
bint use_hashtable=True):
cdef:
Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
ndarray[intp_t] left_indexer, right_indexer
bint has_tolerance = False
numeric_t tolerance_ = 0
numeric_t diff = 0
HashTable hash_table
by_t by_value
# if we are using tolerance, set our objects
if tolerance is not None:
has_tolerance = True
tolerance_ = tolerance
left_size = len(left_values)
right_size = len(right_values)
left_indexer = np.empty(left_size, dtype=np.intp)
right_indexer = np.empty(left_size, dtype=np.intp)
if use_hashtable:
if by_t is object:
hash_table = PyObjectHashTable(right_size)
elif by_t is int64_t:
hash_table = Int64HashTable(right_size)
elif by_t is uint64_t:
hash_table = UInt64HashTable(right_size)
right_pos = 0
for left_pos in range(left_size):
# restart right_pos if it went negative in a previous iteration
if right_pos < 0:
right_pos = 0
# find last position in right whose value is less than left's
if allow_exact_matches:
while (right_pos < right_size and
right_values[right_pos] <= left_values[left_pos]):
if use_hashtable:
hash_table.set_item(right_by_values[right_pos], right_pos)
right_pos += 1
else:
while (right_pos < right_size and
right_values[right_pos] < left_values[left_pos]):
if use_hashtable:
hash_table.set_item(right_by_values[right_pos], right_pos)
right_pos += 1
right_pos -= 1
# save positions as the desired index
if use_hashtable:
by_value = left_by_values[left_pos]
found_right_pos = (hash_table.get_item(by_value)
if by_value in hash_table else -1)
else:
found_right_pos = right_pos
left_indexer[left_pos] = left_pos
right_indexer[left_pos] = found_right_pos
# if needed, verify that tolerance is met
if has_tolerance and found_right_pos != -1:
diff = left_values[left_pos] - right_values[found_right_pos]
if diff > tolerance_:
right_indexer[left_pos] = -1
return left_indexer, right_indexer
def asof_join_forward_on_X_by_Y(numeric_t[:] left_values,
numeric_t[:] right_values,
by_t[:] left_by_values,
by_t[:] right_by_values,
bint allow_exact_matches=1,
tolerance=None,
bint use_hashtable=True):
cdef:
Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos
ndarray[intp_t] left_indexer, right_indexer
bint has_tolerance = False
numeric_t tolerance_ = 0
numeric_t diff = 0
HashTable hash_table
by_t by_value
# if we are using tolerance, set our objects
if tolerance is not None:
has_tolerance = True
tolerance_ = tolerance
left_size = len(left_values)
right_size = len(right_values)
left_indexer = np.empty(left_size, dtype=np.intp)
right_indexer = np.empty(left_size, dtype=np.intp)
if use_hashtable:
if by_t is object:
hash_table = PyObjectHashTable(right_size)
elif by_t is int64_t:
hash_table = Int64HashTable(right_size)
elif by_t is uint64_t:
hash_table = UInt64HashTable(right_size)
right_pos = right_size - 1
for left_pos in range(left_size - 1, -1, -1):
# restart right_pos if it went over in a previous iteration
if right_pos == right_size:
right_pos = right_size - 1
# find first position in right whose value is greater than left's
if allow_exact_matches:
while (right_pos >= 0 and
right_values[right_pos] >= left_values[left_pos]):
if use_hashtable:
hash_table.set_item(right_by_values[right_pos], right_pos)
right_pos -= 1
else:
while (right_pos >= 0 and
right_values[right_pos] > left_values[left_pos]):
if use_hashtable:
hash_table.set_item(right_by_values[right_pos], right_pos)
right_pos -= 1
right_pos += 1
# save positions as the desired index
if use_hashtable:
by_value = left_by_values[left_pos]
found_right_pos = (hash_table.get_item(by_value)
if by_value in hash_table else -1)
else:
found_right_pos = (right_pos
if right_pos != right_size else -1)
left_indexer[left_pos] = left_pos
right_indexer[left_pos] = found_right_pos
# if needed, verify that tolerance is met
if has_tolerance and found_right_pos != -1:
diff = right_values[found_right_pos] - left_values[left_pos]
if diff > tolerance_:
right_indexer[left_pos] = -1
return left_indexer, right_indexer
def asof_join_nearest_on_X_by_Y(ndarray[numeric_t] left_values,
ndarray[numeric_t] right_values,
ndarray[by_t] left_by_values,
ndarray[by_t] right_by_values,
bint allow_exact_matches=True,
tolerance=None,
bint use_hashtable=True):
cdef:
ndarray[intp_t] bli, bri, fli, fri
ndarray[intp_t] left_indexer, right_indexer
Py_ssize_t left_size, i
numeric_t bdiff, fdiff
# search both forward and backward
# TODO(cython3):
# Bug in beta1 preventing Cython from choosing
# right specialization when one fused memview is None
# Doesn't matter what type we choose
# (nothing happens anyways since it is None)
# GH 51640
if left_by_values is not None and left_by_values.dtype != object:
by_dtype = f"{left_by_values.dtype}_t"
else:
by_dtype = object
bli, bri = asof_join_backward_on_X_by_Y[f"{left_values.dtype}_t", by_dtype](
left_values,
right_values,
left_by_values,
right_by_values,
allow_exact_matches,
tolerance,
use_hashtable
)
fli, fri = asof_join_forward_on_X_by_Y[f"{left_values.dtype}_t", by_dtype](
left_values,
right_values,
left_by_values,
right_by_values,
allow_exact_matches,
tolerance,
use_hashtable
)
# choose the smaller timestamp
left_size = len(left_values)
left_indexer = np.empty(left_size, dtype=np.intp)
right_indexer = np.empty(left_size, dtype=np.intp)
for i in range(len(bri)):
# choose timestamp from right with smaller difference
if bri[i] != -1 and fri[i] != -1:
bdiff = left_values[bli[i]] - right_values[bri[i]]
fdiff = right_values[fri[i]] - left_values[fli[i]]
right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i]
else:
right_indexer[i] = bri[i] if bri[i] != -1 else fri[i]
left_indexer[i] = bli[i]
return left_indexer, right_indexer