// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Copyright (c) 2008-2019 NVIDIA Corporation. All rights reserved. // Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved. // Copyright (c) 2001-2004 NovodeX AG. All rights reserved. #ifndef DY_ARTICULATION_SIMD_FNS_H #define DY_ARTICULATION_SIMD_FNS_H #include "DyArticulationUtils.h" namespace physx { namespace Dy { template class PodULike { PxU8 space[sizeof(T)*count]; public: PX_FORCE_INLINE operator T*() { return reinterpret_cast(space); } }; #define POD_U_LIKE(_T, _count, _alignment) PX_ALIGN_PREFIX(_alignment) PodULike<_T, _count> PX_ALIGN_SUFFIX(_alignment) class ArticulationFnsSimdBase { public: static PX_FORCE_INLINE FsInertia addInertia(const FsInertia& in1, const FsInertia& in2) { return FsInertia(M33Add(in1.ll, in2.ll), M33Add(in1.la, in2.la), M33Add(in1.aa, in2.aa)); } static PX_FORCE_INLINE FsInertia subtractInertia(const FsInertia& in1, const FsInertia& in2) { return FsInertia(M33Sub(in1.ll, in2.ll), M33Sub(in1.la, in2.la), M33Sub(in1.aa, in2.aa)); } static PX_FORCE_INLINE Vec3V axisDot(const Cm::SpatialVectorV S[3], const Cm::SpatialVectorV &v) { return V3Merge(FAdd(V3Dot(S[0].linear,v.linear), V3Dot(S[0].angular,v.angular)), FAdd(V3Dot(S[1].linear,v.linear), V3Dot(S[1].angular,v.angular)), FAdd(V3Dot(S[2].linear,v.linear), V3Dot(S[2].angular,v.angular))); } static PX_FORCE_INLINE Cm::SpatialVectorV axisMultiply(const Cm::SpatialVectorV S[3], Vec3V v) { return Cm::SpatialVectorV(V3ScaleAdd(S[0].linear, V3GetX(v), V3ScaleAdd(S[1].linear, V3GetY(v), V3Scale(S[2].linear, V3GetZ(v)))), V3ScaleAdd(S[0].angular, V3GetX(v), V3ScaleAdd(S[1].angular, V3GetY(v), V3Scale(S[2].angular, V3GetZ(v))))); } static PX_FORCE_INLINE Cm::SpatialVectorV subtract(const Cm::SpatialVectorV &a, const Cm::SpatialVectorV &b) { return Cm::SpatialVectorV(V3Sub(a.linear, b.linear), V3Sub(a.angular, b.angular)); } static PX_FORCE_INLINE Cm::SpatialVectorV add(const Cm::SpatialVectorV &a, const Cm::SpatialVectorV &b) { return Cm::SpatialVectorV(V3Add(a.linear, b.linear), V3Add(a.angular, b.angular)); } static PX_FORCE_INLINE Cm::SpatialVectorV multiply(const FsInertia &I, const Cm::SpatialVectorV &S) { return Cm::SpatialVectorV(V3Add(M33MulV3(I.ll,S.linear), M33MulV3(I.la,S.angular)), V3Add(M33TrnspsMulV3(I.la,S.linear), M33MulV3(I.aa,S.angular))); } static PX_FORCE_INLINE Cm::SpatialVectorV translateMotion(const Vec3V& p, const Cm::SpatialVectorV& v) { return Cm::SpatialVectorV(V3Add(v.linear, V3Cross(p, v.angular)), v.angular); } // translate a force resolved at position p to the origin static PX_FORCE_INLINE Cm::SpatialVectorV translateForce(const Vec3V& p, const Cm::SpatialVectorV& v) { return Cm::SpatialVectorV(v.linear, V3Add(v.angular, V3Cross(p, v.linear))); } static PX_FORCE_INLINE Mat33V invertSym33(const Mat33V &m) { Vec3V a0 = V3Cross(m.col1, m.col2); Vec3V a1 = V3Cross(m.col2, m.col0); Vec3V a2 = V3Cross(m.col0, m.col1); FloatV det = V3Dot(a0, m.col0); FloatV recipDet = FRecip(det); a1 = V3SetX(a1, V3GetY(a0)); a2 = V3Merge(V3GetZ(a0), V3GetZ(a1), V3GetZ(a2)); // make sure it's symmetric return Mat33V(V3Scale(a0, recipDet), V3Scale(a1, recipDet), V3Scale(a2, recipDet)); } static PX_FORCE_INLINE FloatV safeInvSqrt(FloatV v) { return FSqrt(FMax(FZero(), FRecip(v))); } static PX_FORCE_INLINE Mat33V invSqrt(const Mat33V& m) { // cholesky factor to // (a 0 0) // (b c 0) // (d e f) // except that a,c,f are the reciprocal sqrts rather than sqrts // PxVec3 v0 = m.column0, v1 = m.column1, v2 = m.column2; Vec3V v0 = m.col0, v1 = m.col1, v2 = m.col2; const FloatV x0 = V3GetX(v0), y1 = V3GetY(v1), z2 = V3GetZ(v2); FloatV a = safeInvSqrt(x0); // PxReal a = PxRecipSqrt(v0.x); Vec3V abd = V3Scale(v0, a); // PxReal b = v0.y*a; FloatV b = V3GetY(abd); FloatV c2 = FNegScaleSub(b, b, y1); // PxReal c = PxRecipSqrt(v1.y - b*b); FloatV c = safeInvSqrt(c2); FloatV d = V3GetZ(abd); // PxReal d = v0.z*a; FloatV e = FMul(FNegScaleSub(b, d, V3GetZ(v1)), c); // PxReal e = (v1.z-d*b) * c; FloatV f2 = FNegScaleSub(d, d, FNegScaleSub(e, e, z2)); // PxReal f = PxRecipSqrt(v2.z - d*d - e*e); FloatV f = safeInvSqrt(f2); // invert FloatV x = FMul(FMul(b,a),c), // x = -b*a*c y = FMul((FNegScaleSub(d,a, FMul(e,x))), f), // y = (-e*x-d*a)*f z = FMul(e, FMul(c,f)); // z = -e*c*f return Mat33V(V3Merge(a, FZero(), FZero()), V3Merge(FNeg(x), c, FZero()), V3Merge(y, FNeg(z), f)); } static PX_FORCE_INLINE FsInertia invertInertia(const FsInertia &I) { Mat33V aa = M33Scale(M33Add(I.aa, M33Trnsps(I.aa)), FHalf()); Mat33V ll = M33Scale(M33Add(I.ll, M33Trnsps(I.ll)), FHalf()); Mat33V AAInv = invertSym33(aa); Mat33V z = M33MulM33(M33Neg(I.la), AAInv); Mat33V S = M33Add(ll, M33MulM33(z, M33Trnsps(I.la))); Mat33V LL = invertSym33(S); Mat33V LA = M33MulM33(LL, z); Mat33V AA = M33Add(AAInv, M33MulM33(M33Trnsps(z), LA)); return FsInertia(LL, LA, AA); } static PX_NOINLINE Mat33V computeSIS(const FsInertia &I, const Cm::SpatialVectorV S[3], Cm::SpatialVectorV IS[3]) { Vec3V S0l = S[0].linear, S0a = S[0].angular; Vec3V S1l = S[1].linear, S1a = S[1].angular; Vec3V S2l = S[2].linear, S2a = S[2].angular; Vec3V IS0l = V3Add(M33MulV3(I.ll,S0l), M33MulV3(I.la,S0a)); Vec3V IS0a = V3Add(M33TrnspsMulV3(I.la,S0l), M33MulV3(I.aa,S0a)); Vec3V IS1l = V3Add(M33MulV3(I.ll,S1l), M33MulV3(I.la,S1a)); Vec3V IS1a = V3Add(M33TrnspsMulV3(I.la,S1l), M33MulV3(I.aa,S1a)); Vec3V IS2l = V3Add(M33MulV3(I.ll,S2l), M33MulV3(I.la,S2a)); Vec3V IS2a = V3Add(M33TrnspsMulV3(I.la,S2l), M33MulV3(I.aa,S2a)); // compute SIS FloatV a00 = FAdd(V3Dot(S0l, IS0l), V3Dot(S0a, IS0a)); FloatV a01 = FAdd(V3Dot(S0l, IS1l), V3Dot(S0a, IS1a)); FloatV a02 = FAdd(V3Dot(S0l, IS2l), V3Dot(S0a, IS2a)); FloatV a11 = FAdd(V3Dot(S1l, IS1l), V3Dot(S1a, IS1a)); FloatV a12 = FAdd(V3Dot(S1l, IS2l), V3Dot(S1a, IS2a)); FloatV a22 = FAdd(V3Dot(S2l, IS2l), V3Dot(S2a, IS2a)); // write IS, a useful side-effect IS[0].linear = IS0l; IS[0].angular = IS0a; IS[1].linear = IS1l; IS[1].angular = IS1a; IS[2].linear = IS2l; IS[2].angular = IS2a; return Mat33V(V3Merge(a00, a01, a02), V3Merge(a01, a11, a12), V3Merge(a02, a12, a22)); } static PX_FORCE_INLINE FsInertia multiplySubtract(const FsInertia &I, const Mat33V &D, const Cm::SpatialVectorV IS[3], Cm::SpatialVectorV DSI[3]) { // cut'n'paste, how I love ya, how I love ya Vec3V IS0l = IS[0].linear, IS0a = IS[0].angular; Vec3V IS1l = IS[1].linear, IS1a = IS[1].angular; Vec3V IS2l = IS[2].linear, IS2a = IS[2].angular; Vec3V D0 = D.col0, D1 = D.col1, D2 = D.col2; // compute IDS Vec3V DSI0l = V3ScaleAdd(IS0l, V3GetX(D0), V3ScaleAdd(IS1l, V3GetY(D0), V3Scale(IS2l, V3GetZ(D0)))); Vec3V DSI1l = V3ScaleAdd(IS0l, V3GetX(D1), V3ScaleAdd(IS1l, V3GetY(D1), V3Scale(IS2l, V3GetZ(D1)))); Vec3V DSI2l = V3ScaleAdd(IS0l, V3GetX(D2), V3ScaleAdd(IS1l, V3GetY(D2), V3Scale(IS2l, V3GetZ(D2)))); Vec3V DSI0a = V3ScaleAdd(IS0a, V3GetX(D0), V3ScaleAdd(IS1a, V3GetY(D0), V3Scale(IS2a, V3GetZ(D0)))); Vec3V DSI1a = V3ScaleAdd(IS0a, V3GetX(D1), V3ScaleAdd(IS1a, V3GetY(D1), V3Scale(IS2a, V3GetZ(D1)))); Vec3V DSI2a = V3ScaleAdd(IS0a, V3GetX(D2), V3ScaleAdd(IS1a, V3GetY(D2), V3Scale(IS2a, V3GetZ(D2)))); // compute J = I - DSI' IS. Each row of DSI' IS generates an inertia dyad Vec3V ll0 = I.ll.col0, ll1 = I.ll.col1, ll2 = I.ll.col2; Vec3V la0 = I.la.col0, la1 = I.la.col1, la2 = I.la.col2; Vec3V aa0 = I.aa.col0, aa1 = I.aa.col1, aa2 = I.aa.col2; #define SUBTRACT_DYAD(_a, _b) \ ll0 = V3NegScaleSub(_b##l, V3GetX(_a##l), ll0); la0 = V3NegScaleSub(_b##l, V3GetX(_a##a), la0); aa0 = V3NegScaleSub(_b##a, V3GetX(_a##a), aa0); \ ll1 = V3NegScaleSub(_b##l, V3GetY(_a##l), ll1); la1 = V3NegScaleSub(_b##l, V3GetY(_a##a), la1); aa1 = V3NegScaleSub(_b##a, V3GetY(_a##a), aa1); \ ll2 = V3NegScaleSub(_b##l, V3GetZ(_a##l), ll2); la2 = V3NegScaleSub(_b##l, V3GetZ(_a##a), la2); aa2 = V3NegScaleSub(_b##a, V3GetZ(_a##a), aa2); SUBTRACT_DYAD(IS0, DSI0); SUBTRACT_DYAD(IS1, DSI1); SUBTRACT_DYAD(IS2, DSI2); #undef SUBTRACT_DYAD DSI[0].linear = DSI0l; DSI[0].angular = DSI0a; DSI[1].linear = DSI1l; DSI[1].angular = DSI1a; DSI[2].linear = DSI2l; DSI[2].angular = DSI2a; return FsInertia(Mat33V(ll0, ll1, ll2), Mat33V(la0, la1, la2), Mat33V(aa0, aa1, aa2)); } static PX_FORCE_INLINE FsInertia multiplySubtract(const FsInertia &I, const Cm::SpatialVectorV S[3]) { // cut'n'paste, how I love ya, how I love ya const Vec3V S0l = S[0].linear, S0a = S[0].angular; const Vec3V S1l = S[1].linear, S1a = S[1].angular; const Vec3V S2l = S[2].linear, S2a = S[2].angular; // compute J = I - DSI' IS. Each row of DSI' IS generates an inertia dyad Vec3V ll0 = I.ll.col0, ll1 = I.ll.col1, ll2 = I.ll.col2; Vec3V la0 = I.la.col0, la1 = I.la.col1, la2 = I.la.col2; Vec3V aa0 = I.aa.col0, aa1 = I.aa.col1, aa2 = I.aa.col2; #define SUBTRACT_DYAD(_a, _b) \ ll0 = V3NegScaleSub(_b##l, V3GetX(_a##l), ll0); la0 = V3NegScaleSub(_b##l, V3GetX(_a##a), la0); aa0 = V3NegScaleSub(_b##a, V3GetX(_a##a), aa0); \ ll1 = V3NegScaleSub(_b##l, V3GetY(_a##l), ll1); la1 = V3NegScaleSub(_b##l, V3GetY(_a##a), la1); aa1 = V3NegScaleSub(_b##a, V3GetY(_a##a), aa1); \ ll2 = V3NegScaleSub(_b##l, V3GetZ(_a##l), ll2); la2 = V3NegScaleSub(_b##l, V3GetZ(_a##a), la2); aa2 = V3NegScaleSub(_b##a, V3GetZ(_a##a), aa2); SUBTRACT_DYAD(S0, S0); SUBTRACT_DYAD(S1, S1); SUBTRACT_DYAD(S2, S2); #undef SUBTRACT_DYAD return FsInertia(Mat33V(ll0, ll1, ll2), Mat33V(la0, la1, la2), Mat33V(aa0, aa1, aa2)); } static PX_FORCE_INLINE FsInertia translateInertia(Vec3V a, const FsInertia &input) { Vec3V b = V3Neg(a); Vec3V la0 = input.la.col0, la1 = input.la.col1, la2 = input.la.col2; Vec3V ll0 = input.ll.col0, ll1 = input.ll.col1, ll2 = input.ll.col2; Vec3V aa0 = input.aa.col0, aa1 = input.aa.col1, aa2 = input.aa.col2; FloatV aX = V3GetX(a), aY = V3GetY(a), aZ = V3GetZ(a); FloatV bX = V3GetX(b), bY = V3GetY(b), bZ = V3GetZ(b); FloatV Z = FZero(); // s - star matrix of a Vec3V s0 = V3Merge(Z, aZ, bY), s1 = V3Merge(bZ, Z, aX), s2 = V3Merge(aY, bX, Z); // s * la Vec3V sla0 = V3ScaleAdd(s0, V3GetX(la0), V3ScaleAdd(s1, V3GetY(la0), V3Scale(s2, V3GetZ(la0)))); Vec3V sla1 = V3ScaleAdd(s0, V3GetX(la1), V3ScaleAdd(s1, V3GetY(la1), V3Scale(s2, V3GetZ(la1)))); Vec3V sla2 = V3ScaleAdd(s0, V3GetX(la2), V3ScaleAdd(s1, V3GetY(la2), V3Scale(s2, V3GetZ(la2)))); // ll * s.transpose() (ll is symmetric) Vec3V llst0 = V3ScaleAdd(ll2, aY, V3Scale(ll1, bZ)), llst1 = V3ScaleAdd(ll0, aZ, V3Scale(ll2, bX)), llst2 = V3ScaleAdd(ll1, aX, V3Scale(ll0, bY)); // t = sla+S*llst*0.5f; Vec3V sllst0 = V3ScaleAdd(s2, V3GetZ(llst0), V3ScaleAdd(s1, V3GetY(llst0), V3Scale(s0, V3GetX(llst0)))); Vec3V sllst1 = V3ScaleAdd(s2, V3GetZ(llst1), V3ScaleAdd(s1, V3GetY(llst1), V3Scale(s0, V3GetX(llst1)))); Vec3V sllst2 = V3ScaleAdd(s2, V3GetZ(llst2), V3ScaleAdd(s1, V3GetY(llst2), V3Scale(s0, V3GetX(llst2)))); Vec3V t0 = V3ScaleAdd(sllst0, FHalf(), sla0); Vec3V t1 = V3ScaleAdd(sllst1, FHalf(), sla1); Vec3V t2 = V3ScaleAdd(sllst2, FHalf(), sla2); // t+t.transpose() Vec3V r0 = V3Add(t0, V3Merge(V3GetX(t0), V3GetX(t1), V3GetX(t2))), r1 = V3Add(t1, V3Merge(V3GetY(t0), V3GetY(t1), V3GetY(t2))), r2 = V3Add(t2, V3Merge(V3GetZ(t0), V3GetZ(t1), V3GetZ(t2))); return FsInertia(Mat33V(ll0, ll1, ll2), Mat33V(V3Add(la0, llst0), V3Add(la1, llst1), V3Add(la2, llst2)), Mat33V(V3Add(aa0, r0), V3Add(aa1, r1), V3Add(aa2, r2))); } }; template class ArticulationFnsSimd : public Base { static PX_FORCE_INLINE void axisMultiplyLowerTriangular(Cm::SpatialVectorV ES[3], const Mat33V&E, const Cm::SpatialVectorV S[3]) { const Vec3V l0 = S[0].linear, l1 = S[1].linear, l2 = S[2].linear; const Vec3V a0 = S[0].angular, a1 = S[1].angular, a2 = S[2].angular; ES[0] = Cm::SpatialVectorV(V3Scale(l0, V3GetX(E.col0)), V3Scale(a0, V3GetX(E.col0))); ES[1] = Cm::SpatialVectorV(V3ScaleAdd(l0, V3GetX(E.col1), V3Scale(l1, V3GetY(E.col1))), V3ScaleAdd(a0, V3GetX(E.col1), V3Scale(a1, V3GetY(E.col1)))); ES[2] = Cm::SpatialVectorV(V3ScaleAdd(l0, V3GetX(E.col2), V3ScaleAdd(l1, V3GetY(E.col2), V3Scale(l2, V3GetZ(E.col2)))), V3ScaleAdd(a0, V3GetX(E.col2), V3ScaleAdd(a1, V3GetY(E.col2), V3Scale(a2, V3GetZ(E.col2))))); } public: static PX_FORCE_INLINE FsInertia propagate(const FsInertia &I, const Cm::SpatialVectorV S[3], const Mat33V &load, const FloatV isf) { Cm::SpatialVectorV IS[3], ISE[3]; Mat33V D = Base::computeSIS(I, S, IS); D.col0 = V3ScaleAdd(load.col0, isf, D.col0); D.col1 = V3ScaleAdd(load.col1, isf, D.col1); D.col2 = V3ScaleAdd(load.col2, isf, D.col2); axisMultiplyLowerTriangular(ISE, Base::invSqrt(D), IS); return Base::multiplySubtract(I, ISE); } static PX_INLINE Cm::SpatialVectorV propagateImpulse(const FsRow& row, const FsJointVectors& jv, Vec3V& SZ, const Cm::SpatialVectorV& Z, const FsRowAux& aux) { PX_UNUSED(aux); SZ = V3Add(Z.angular, V3Cross(Z.linear, jv.jointOffset)); return Base::translateForce(jv.parentOffset, Z - Base::axisMultiply(row.DSI, SZ)); } static PX_INLINE Cm::SpatialVectorV propagateVelocity(const FsRow& row, const FsJointVectors& jv, const Vec3V& SZ, const Cm::SpatialVectorV& v, const FsRowAux& aux) { PX_UNUSED(aux); Cm::SpatialVectorV w = Base::translateMotion(V3Neg(jv.parentOffset), v); Vec3V DSZ = M33MulV3(row.D, SZ); Vec3V n = V3Add(Base::axisDot(row.DSI, w), DSZ); return w - Cm::SpatialVectorV(V3Cross(jv.jointOffset, n), n); } static #if !(PX_ANDROID && PX_DEBUG) // Not inlining for Android Debug to avoid "conditional branch out of range" compilation error on arm64-v8a ABI PX_FORCE_INLINE #endif Mat33V computeDriveInertia(const FsInertia &I0, const FsInertia &I1, const Cm::SpatialVectorV S[3]) { POD_U_LIKE(Cm::SpatialVectorV, 3, 16) IS, ISD, dummy; Mat33V D = Base::computeSIS(I0, S, IS); Mat33V DInv = Base::invertSym33(D); FsInertia tmp = Base::addInertia(I0, I1); tmp = Base::multiplySubtract(tmp, DInv, IS, ISD); FsInertia J = Base::invertInertia(tmp); Mat33V E = Base::computeSIS(J, ISD, dummy); return Base::invertSym33(M33Add(DInv,E)); } }; } } #endif //DY_ARTICULATION_SIMD_FNS_H