# NOTE: Please avoid the use of numpy.testing since NPYV intrinsics
# may be involved in their functionality.
import pytest, math
from numpy.core._simd import targets

class _Test_Utility:
    # submodule of the desired SIMD extension, e.g. targets["AVX512F"]
    npyv = None
    # the current data type suffix, e.g. 's8'
    sfx = None

    def __getattr__(self, attr):
        """
        To call NPYV intrinsics without the attribute 'npyv' and
        to auto-suffix intrinsics according to the class attribute 'sfx'
        """
        return getattr(self.npyv, attr + "_" + self.sfx)

    def _data(self, start=None, count=None, reverse=False):
        """
        Create a list of consecutive numbers according to the number of vector lanes.
        """
        if start is None:
            start = 1
        if count is None:
            count = self.nlanes
        rng = range(start, start + count)
        if reverse:
            rng = reversed(rng)
        if self._is_fp():
            return [x / 1.0 for x in rng]
        return list(rng)

    def _is_unsigned(self):
        return self.sfx[0] == 'u'

    def _is_signed(self):
        return self.sfx[0] == 's'

    def _is_fp(self):
        return self.sfx[0] == 'f'

    def _scalar_size(self):
        return int(self.sfx[1:])

    def _int_clip(self, seq):
        if self._is_fp():
            return seq
        max_int = self._int_max()
        min_int = self._int_min()
        return [min(max(v, min_int), max_int) for v in seq]

    def _int_max(self):
        if self._is_fp():
            return None
        max_u = self._to_unsigned(self.setall(-1))[0]
        if self._is_signed():
            return max_u // 2
        return max_u

    def _int_min(self):
        if self._is_fp():
            return None
        if self._is_unsigned():
            return 0
        return -(self._int_max() + 1)

    def _true_mask(self):
        max_unsig = getattr(self.npyv, "setall_u" + self.sfx[1:])(-1)
        return max_unsig[0]

    def _to_unsigned(self, vector):
        if isinstance(vector, (list, tuple)):
            return getattr(self.npyv, "load_u" + self.sfx[1:])(vector)
        else:
            sfx = vector.__name__.replace("npyv_", "")
            if sfx[0] == "b":
                cvt_intrin = "cvt_u{0}_b{0}"
            else:
                cvt_intrin = "reinterpret_u{0}_{1}"
            return getattr(self.npyv, cvt_intrin.format(sfx[1:], sfx))(vector)

    def _pinfinity(self):
        v = self.npyv.setall_u32(0x7f800000)
        return self.npyv.reinterpret_f32_u32(v)[0]

    def _ninfinity(self):
        v = self.npyv.setall_u32(0xff800000)
        return self.npyv.reinterpret_f32_u32(v)[0]

    def _nan(self):
        v = self.npyv.setall_u32(0x7fc00000)
        return self.npyv.reinterpret_f32_u32(v)[0]
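
# Note on the suffix resolution above: every attribute access on a test class
# falls through __getattr__ and is forwarded to the target submodule with the
# current lane suffix appended. A minimal sketch of how that plays out
# (illustrative only; whether the "AVX512F" key is usable depends on the build
# and the current machine):
#
#   npyv = targets["AVX512F"]
#   # with sfx = 's8', self.load(seq)  resolves to npyv.load_s8(seq)
#   #                  self.nlanes     resolves to npyv.nlanes_s8
#   #                  self.setall(-1) resolves to npyv.setall_s8(-1)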

class _SIMD_INT(_Test_Utility):
    """
    To test all integer vector types at once
    """
    def test_operators_shift(self):
        if self.sfx in ("u8", "s8"):
            return

        data_a = self._data(self._int_max() - self.nlanes)
        data_b = self._data(self._int_min(), reverse=True)
        vdata_a, vdata_b = self.load(data_a), self.load(data_b)

        for count in range(self._scalar_size()):
            # load to cast
            data_shl_a = self.load([a << count for a in data_a])
            # left shift
            shl = self.shl(vdata_a, count)
            assert shl == data_shl_a
            # load to cast
            data_shr_a = self.load([a >> count for a in data_a])
            # right shift
            shr = self.shr(vdata_a, count)
            assert shr == data_shr_a

        # shifting by zero, by the full lane width, or by an out-of-range
        # immediate constant is not applicable and illogical
        for count in range(1, self._scalar_size()):
            # load to cast
            data_shl_a = self.load([a << count for a in data_a])
            # left shift by an immediate constant
            shli = self.shli(vdata_a, count)
            assert shli == data_shl_a
            # load to cast
            data_shr_a = self.load([a >> count for a in data_a])
            # right shift by an immediate constant
            shri = self.shri(vdata_a, count)
            assert shri == data_shr_a

    def test_arithmetic_subadd_saturated(self):
        if self.sfx in ("u32", "s32", "u64", "s64"):
            return

        data_a = self._data(self._int_max() - self.nlanes)
        data_b = self._data(self._int_min(), reverse=True)
        vdata_a, vdata_b = self.load(data_a), self.load(data_b)

        data_adds = self._int_clip([a + b for a, b in zip(data_a, data_b)])
        adds = self.adds(vdata_a, vdata_b)
        assert adds == data_adds

        data_subs = self._int_clip([a - b for a, b in zip(data_a, data_b)])
        subs = self.subs(vdata_a, vdata_b)
        assert subs == data_subs
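
# Note on the saturated tests above: adds/subs are expected to clamp to the
# lane's representable range rather than wrap around. For example, assuming
# sfx is 's8' (range [-128, 127]), a saturated add of 120 and 100 must yield
# 127, which is exactly what _int_clip() models on the Python side.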

class _SIMD_FP(_Test_Utility):
    """
    To test all float vector types at once
    """
    def test_arithmetic_fused(self):
        vdata_a, vdata_b, vdata_c = [self.load(self._data())]*3
        vdata_cx2 = self.add(vdata_c, vdata_c)
        # multiply and add, a*b + c
        data_fma = self.load([a * b + c for a, b, c in zip(vdata_a, vdata_b, vdata_c)])
        fma = self.muladd(vdata_a, vdata_b, vdata_c)
        assert fma == data_fma
        # multiply and subtract, a*b - c
        fms = self.mulsub(vdata_a, vdata_b, vdata_c)
        data_fms = self.sub(data_fma, vdata_cx2)
        assert fms == data_fms
        # negate multiply and add, -(a*b) + c
        nfma = self.nmuladd(vdata_a, vdata_b, vdata_c)
        data_nfma = self.sub(vdata_cx2, data_fma)
        assert nfma == data_nfma
        # negate multiply and subtract, -(a*b) - c
        nfms = self.nmulsub(vdata_a, vdata_b, vdata_c)
        data_nfms = self.mul(data_fma, self.setall(-1))
        assert nfms == data_nfms

    def test_abs(self):
        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
        data = self._data()
        vdata = self.load(self._data())

        abs_cases = ((-0, 0), (ninf, pinf), (pinf, pinf), (nan, nan))
        for case, desired in abs_cases:
            data_abs = [desired]*self.nlanes
            vabs = self.abs(self.setall(case))
            assert vabs == pytest.approx(data_abs, nan_ok=True)

        vabs = self.abs(self.mul(vdata, self.setall(-1)))
        assert vabs == data

    def test_sqrt(self):
        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
        data = self._data()
        vdata = self.load(self._data())

        sqrt_cases = ((-0.0, -0.0), (0.0, 0.0), (-1.0, nan), (ninf, nan), (pinf, pinf))
        for case, desired in sqrt_cases:
            data_sqrt = [desired]*self.nlanes
            sqrt = self.sqrt(self.setall(case))
            assert sqrt == pytest.approx(data_sqrt, nan_ok=True)

        data_sqrt = self.load([math.sqrt(x) for x in data]) # load to truncate precision
        sqrt = self.sqrt(vdata)
        assert sqrt == data_sqrt

    def test_square(self):
        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
        data = self._data()
        vdata = self.load(self._data())
        # square
        square_cases = ((nan, nan), (pinf, pinf), (ninf, pinf))
        for case, desired in square_cases:
            data_square = [desired]*self.nlanes
            square = self.square(self.setall(case))
            assert square == pytest.approx(data_square, nan_ok=True)
        data_square = [x*x for x in data]
        square = self.square(vdata)
        assert square == data_square

    def test_reciprocal(self):
        pinf, ninf, nan = self._pinfinity(), self._ninfinity(), self._nan()
        data = self._data()
        vdata = self.load(self._data())

        recip_cases = ((nan, nan), (pinf, 0.0), (ninf, -0.0), (0.0, pinf), (-0.0, ninf))
        for case, desired in recip_cases:
            data_recip = [desired]*self.nlanes
            recip = self.recip(self.setall(case))
            assert recip == pytest.approx(data_recip, nan_ok=True)

        data_recip = self.load([1/x for x in data]) # load to truncate precision
        recip = self.recip(vdata)
        assert recip == data_recip

class _SIMD_ALL(_Test_Utility):
    """
    To test all vector types at once
    """
    def test_memory_load(self):
        data = self._data()
        # unaligned load
        load_data = self.load(data)
        assert load_data == data
        # aligned load
        loada_data = self.loada(data)
        assert loada_data == data
        # stream load
        loads_data = self.loads(data)
        assert loads_data == data
        # load lower part
        loadl = self.loadl(data)
        loadl_half = list(loadl)[:self.nlanes//2]
        data_half = data[:self.nlanes//2]
        assert loadl_half == data_half
        assert loadl != data # detect overflow

    def test_memory_store(self):
        data = self._data()
        vdata = self.load(data)
        # unaligned store
        store = [0] * self.nlanes
        self.store(store, vdata)
        assert store == data
        # aligned store
        store_a = [0] * self.nlanes
        self.storea(store_a, vdata)
        assert store_a == data
        # stream store
        store_s = [0] * self.nlanes
        self.stores(store_s, vdata)
        assert store_s == data
        # store lower part
        store_l = [0] * self.nlanes
        self.storel(store_l, vdata)
        assert store_l[:self.nlanes//2] == data[:self.nlanes//2]
        assert store_l != vdata # detect overflow
        # store higher part
        store_h = [0] * self.nlanes
        self.storeh(store_h, vdata)
        assert store_h[:self.nlanes//2] == data[self.nlanes//2:]
        assert store_h != vdata # detect overflow

    def test_memory_partial_load(self):
        if self.sfx in ("u8", "s8", "u16", "s16"):
            return

        data = self._data()
        lanes = list(range(1, self.nlanes + 1))
        lanes += [self.nlanes**2, self.nlanes**4] # test out of range
        for n in lanes:
            load_till = self.load_till(data, n, 15)
            data_till = data[:n] + [15] * (self.nlanes-n)
            assert load_till == data_till
            load_tillz = self.load_tillz(data, n)
            data_tillz = data[:n] + [0] * (self.nlanes-n)
            assert load_tillz == data_tillz

    def test_memory_partial_store(self):
        if self.sfx in ("u8", "s8", "u16", "s16"):
            return

        data = self._data()
        data_rev = self._data(reverse=True)
        vdata = self.load(data)
        lanes = list(range(1, self.nlanes + 1))
        lanes += [self.nlanes**2, self.nlanes**4]
        for n in lanes:
            data_till = data_rev.copy()
            data_till[:n] = data[:n]
            store_till = self._data(reverse=True)
            self.store_till(store_till, n, vdata)
            assert store_till == data_till

    def test_memory_noncont_load(self):
        if self.sfx in ("u8", "s8", "u16", "s16"):
            return

        for stride in range(1, 64):
            data = self._data(count=stride*self.nlanes)
            data_stride = data[::stride]
            loadn = self.loadn(data, stride)
            assert loadn == data_stride

        for stride in range(-64, 0):
            data = self._data(stride, -stride*self.nlanes)
            data_stride = self.load(data[::stride]) # cast unsigned
            loadn = self.loadn(data, stride)
            assert loadn == data_stride

    def test_memory_noncont_partial_load(self):
        if self.sfx in ("u8", "s8", "u16", "s16"):
            return

        lanes = list(range(1, self.nlanes + 1))
        lanes += [self.nlanes**2, self.nlanes**4]
        for stride in range(1, 64):
            data = self._data(count=stride*self.nlanes)
            data_stride = data[::stride]
            for n in lanes:
                data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
                loadn_till = self.loadn_till(data, stride, n, 15)
                assert loadn_till == data_stride_till
                data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
                loadn_tillz = self.loadn_tillz(data, stride, n)
                assert loadn_tillz == data_stride_tillz

        for stride in range(-64, 0):
            data = self._data(stride, -stride*self.nlanes)
            data_stride = list(self.load(data[::stride])) # cast unsigned
            for n in lanes:
                data_stride_till = data_stride[:n] + [15] * (self.nlanes-n)
                loadn_till = self.loadn_till(data, stride, n, 15)
                assert loadn_till == data_stride_till
                data_stride_tillz = data_stride[:n] + [0] * (self.nlanes-n)
                loadn_tillz = self.loadn_tillz(data, stride, n)
                assert loadn_tillz == data_stride_tillz

    def test_memory_noncont_store(self):
        if self.sfx in ("u8", "s8", "u16", "s16"):
            return

        vdata = self.load(self._data())
        for stride in range(1, 64):
            data = [15] * stride * self.nlanes
            data[::stride] = vdata
            storen = [15] * stride * self.nlanes
            storen += [127]*64
            self.storen(storen, stride, vdata)
            assert storen[:-64] == data
            assert storen[-64:] == [127]*64 # detect overflow

        for stride in range(-64, 0):
            data = [15] * -stride * self.nlanes
            data[::stride] = vdata
            storen = [127]*64
            storen += [15] * -stride * self.nlanes
            self.storen(storen, stride, vdata)
            assert storen[64:] == data
            assert storen[:64] == [127]*64 # detect overflow
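
    # Note on the non-contiguous store tests: the extra [127]*64 sentinel
    # elements appended (or prepended, for negative strides) to the output
    # buffer are asserted to be untouched afterwards, so any out-of-bounds
    # write by storen/storen_till would be detected.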

    def test_memory_noncont_partial_store(self):
        if self.sfx in ("u8", "s8", "u16", "s16"):
            return

        data = self._data()
        vdata = self.load(data)
        lanes = list(range(1, self.nlanes + 1))
        lanes += [self.nlanes**2, self.nlanes**4]
        for stride in range(1, 64):
            for n in lanes:
                data_till = [15] * stride * self.nlanes
                data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
                storen_till = [15] * stride * self.nlanes
                storen_till += [127]*64
                self.storen_till(storen_till, stride, n, vdata)
                assert storen_till[:-64] == data_till
                assert storen_till[-64:] == [127]*64 # detect overflow

        for stride in range(-64, 0):
            for n in lanes:
                data_till = [15] * -stride * self.nlanes
                data_till[::stride] = data[:n] + [15] * (self.nlanes-n)
                storen_till = [127]*64
                storen_till += [15] * -stride * self.nlanes
                self.storen_till(storen_till, stride, n, vdata)
                assert storen_till[64:] == data_till
                assert storen_till[:64] == [127]*64 # detect overflow

    def test_misc(self):
        broadcast_zero = self.zero()
        assert broadcast_zero == [0] * self.nlanes

        for i in range(1, 10):
            broadcasti = self.setall(i)
            assert broadcasti == [i] * self.nlanes

        data_a, data_b = self._data(), self._data(reverse=True)
        vdata_a, vdata_b = self.load(data_a), self.load(data_b)

        # the Python level of npyv_set_* doesn't support ignoring extra
        # specified lanes or filling non-specified lanes with zero.
        vset = self.set(*data_a)
        assert vset == data_a
        # the Python level of npyv_setf_* doesn't support ignoring extra
        # specified lanes or filling non-specified lanes with the specified scalar.
        vsetf = self.setf(10, *data_a)
        assert vsetf == data_a

        # We're testing the sanity of _simd's type-vector;
        # the reinterpret* intrinsics themselves are tested via the compiler
        # during the build of the _simd module
        sfxes = ["u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64", "f32"]
        if self.npyv.simd_f64:
            sfxes.append("f64")
        for sfx in sfxes:
            vec_name = getattr(self, "reinterpret_" + sfx)(vdata_a).__name__
            assert vec_name == "npyv_" + sfx

        # select & mask operations
        select_a = self.select(self.cmpeq(self.zero(), self.zero()), vdata_a, vdata_b)
        assert select_a == data_a
        select_b = self.select(self.cmpneq(self.zero(), self.zero()), vdata_a, vdata_b)
        assert select_b == data_b

        # the cleanup intrinsic is only used with AVX for
        # zeroing registers to avoid the AVX-SSE transition penalty,
        # so there is nothing to test here
        self.npyv.cleanup()

    def test_reorder(self):
        data_a, data_b = self._data(), self._data(reverse=True)
        vdata_a, vdata_b = self.load(data_a), self.load(data_b)
        # lower half part
        data_a_lo = data_a[:self.nlanes//2]
        data_b_lo = data_b[:self.nlanes//2]
        # higher half part
        data_a_hi = data_a[self.nlanes//2:]
        data_b_hi = data_b[self.nlanes//2:]
        # combine two lower parts
        combinel = self.combinel(vdata_a, vdata_b)
        assert combinel == data_a_lo + data_b_lo
        # combine two higher parts
        combineh = self.combineh(vdata_a, vdata_b)
        assert combineh == data_a_hi + data_b_hi
        # combine x2
        combine = self.combine(vdata_a, vdata_b)
        assert combine == (data_a_lo + data_b_lo, data_a_hi + data_b_hi)
        # zip (interleave)
        data_zipl = [v for p in zip(data_a_lo, data_b_lo) for v in p]
        data_ziph = [v for p in zip(data_a_hi, data_b_hi) for v in p]
        vzip = self.zip(vdata_a, vdata_b)
        assert vzip == (data_zipl, data_ziph)

    def test_operators_comparison(self):
        if self._is_fp():
            data_a = self._data()
        else:
            data_a = self._data(self._int_max() - self.nlanes)
        data_b = self._data(self._int_min(), reverse=True)
        vdata_a, vdata_b = self.load(data_a), self.load(data_b)

        mask_true = self._true_mask()
        def to_bool(vector):
            return [lane == mask_true for lane in vector]
        # equal
        data_eq = [a == b for a, b in zip(data_a, data_b)]
        cmpeq = to_bool(self.cmpeq(vdata_a, vdata_b))
        assert cmpeq == data_eq
        # not equal
        data_neq = [a != b for a, b in zip(data_a, data_b)]
        cmpneq = to_bool(self.cmpneq(vdata_a, vdata_b))
        assert cmpneq == data_neq
        # greater than
        data_gt = [a > b for a, b in zip(data_a, data_b)]
        cmpgt = to_bool(self.cmpgt(vdata_a, vdata_b))
        assert cmpgt == data_gt
        # greater than or equal
        data_ge = [a >= b for a, b in zip(data_a, data_b)]
        cmpge = to_bool(self.cmpge(vdata_a, vdata_b))
        assert cmpge == data_ge
        # less than
        data_lt = [a < b for a, b in zip(data_a, data_b)]
        cmplt = to_bool(self.cmplt(vdata_a, vdata_b))
        assert cmplt == data_lt
        # less than or equal
        data_le = [a <= b for a, b in zip(data_a, data_b)]
        cmple = to_bool(self.cmple(vdata_a, vdata_b))
        assert cmple == data_le
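
    # Note on the comparison tests above: NPYV comparisons return boolean
    # vectors whose "true" lanes have all bits set, which is why to_bool()
    # checks each lane against _true_mask() rather than against 1.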

    def test_operators_logical(self):
        if self._is_fp():
            data_a = self._data()
        else:
            data_a = self._data(self._int_max() - self.nlanes)
        data_b = self._data(self._int_min(), reverse=True)
        vdata_a, vdata_b = self.load(data_a), self.load(data_b)

        if self._is_fp():
            data_cast_a = self._to_unsigned(vdata_a)
            data_cast_b = self._to_unsigned(vdata_b)
            cast, cast_data = self._to_unsigned, self._to_unsigned
        else:
            data_cast_a, data_cast_b = data_a, data_b
            cast, cast_data = lambda a: a, self.load

        data_xor = cast_data([a ^ b for a, b in zip(data_cast_a, data_cast_b)])
        vxor = cast(self.xor(vdata_a, vdata_b))
        assert vxor == data_xor

        data_or = cast_data([a | b for a, b in zip(data_cast_a, data_cast_b)])
        vor = cast(getattr(self, "or")(vdata_a, vdata_b))
        assert vor == data_or

        data_and = cast_data([a & b for a, b in zip(data_cast_a, data_cast_b)])
        vand = cast(getattr(self, "and")(vdata_a, vdata_b))
        assert vand == data_and

        data_not = cast_data([~a for a in data_cast_a])
        vnot = cast(getattr(self, "not")(vdata_a))
        assert vnot == data_not

    def test_conversion_boolean(self):
        bsfx = "b" + self.sfx[1:]
        to_boolean = getattr(self.npyv, "cvt_%s_%s" % (bsfx, self.sfx))
        from_boolean = getattr(self.npyv, "cvt_%s_%s" % (self.sfx, bsfx))

        false_vb = to_boolean(self.setall(0))
        true_vb = self.cmpeq(self.setall(0), self.setall(0))
        assert false_vb != true_vb

        false_vsfx = from_boolean(false_vb)
        true_vsfx = from_boolean(true_vb)
        assert false_vsfx != true_vsfx

    def test_arithmetic_subadd(self):
        if self._is_fp():
            data_a = self._data()
        else:
            data_a = self._data(self._int_max() - self.nlanes)
        data_b = self._data(self._int_min(), reverse=True)
        vdata_a, vdata_b = self.load(data_a), self.load(data_b)

        # non-saturated
        data_add = self.load([a + b for a, b in zip(data_a, data_b)]) # load to cast
        add = self.add(vdata_a, vdata_b)
        assert add == data_add

        data_sub = self.load([a - b for a, b in zip(data_a, data_b)])
        sub = self.sub(vdata_a, vdata_b)
        assert sub == data_sub

    def test_arithmetic_mul(self):
        if self.sfx in ("u64", "s64"):
            return

        if self._is_fp():
            data_a = self._data()
        else:
            data_a = self._data(self._int_max() - self.nlanes)
        data_b = self._data(self._int_min(), reverse=True)
        vdata_a, vdata_b = self.load(data_a), self.load(data_b)

        data_mul = self.load([a * b for a, b in zip(data_a, data_b)])
        mul = self.mul(vdata_a, vdata_b)
        assert mul == data_mul

    def test_arithmetic_div(self):
        if not self._is_fp():
            return

        data_a, data_b = self._data(), self._data(reverse=True)
        vdata_a, vdata_b = self.load(data_a), self.load(data_b)

        # load to truncate f64 to precision of f32
        data_div = self.load([a / b for a, b in zip(data_a, data_b)])
        div = self.div(vdata_a, vdata_b)
        assert div == data_div
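
    # Note on the "load to truncate precision" pattern used by the float tests
    # above: the reference values are computed in double precision by Python,
    # so round-tripping them through self.load() truncates them to the lane
    # precision (e.g. f32) before they are compared with the intrinsic's output.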

    def test_arithmetic_reduce_sum(self):
        if not self._is_fp():
            return
        # reduce sum
        data = self._data()
        vdata = self.load(data)

        data_sum = sum(data)
        vsum = self.sum(vdata)
        assert vsum == data_sum

int_sfx = ("u8", "s8", "u16", "s16", "u32", "s32", "u64", "s64")
fp_sfx = ("f32", "f64")
all_sfx = int_sfx + fp_sfx
tests_registry = {
    int_sfx: _SIMD_INT,
    fp_sfx: _SIMD_FP,
    all_sfx: _SIMD_ALL
}
for target_name, npyv in targets.items():
    simd_width = npyv.simd if npyv else ''
    pretty_name = target_name.split('__') # multi-target separator
    if len(pretty_name) > 1:
        # multi-target
        pretty_name = f"({' '.join(pretty_name)})"
    else:
        pretty_name = pretty_name[0]

    skip = ""
    skip_sfx = dict()
    if not npyv:
        skip = f"target '{pretty_name}' isn't supported by current machine"
    elif not npyv.simd:
        skip = f"target '{pretty_name}' isn't supported by NPYV"
    elif not npyv.simd_f64:
        skip_sfx["f64"] = f"target '{pretty_name}' doesn't support double-precision"

    for sfxes, cls in tests_registry.items():
        for sfx in sfxes:
            skip_m = skip_sfx.get(sfx, skip)
            inhr = (cls,)
            attr = dict(npyv=targets[target_name], sfx=sfx)
            tcls = type(f"Test{cls.__name__}_{simd_width}_{target_name}_{sfx}", inhr, attr)
            if skip_m:
                pytest.mark.skip(reason=skip_m)(tcls)
            globals()[tcls.__name__] = tcls
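
# Note on the registration loop above: it generates one concrete test class per
# (target, lane suffix) pair and publishes it through globals() so pytest can
# collect it; unsupported targets or lane types are skip-marked rather than
# omitted. Assuming, for example, a build that enables AVX2 with 256-bit SIMD,
# a generated class name would look like "Test_SIMD_ALL_256_AVX2_f32".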