Projekt_Grafika/dependencies/physx-4.1/source/lowleveldynamics/src/DyArticulationFnsSimd.h

443 lines
16 KiB
C++

//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2019 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#ifndef DY_ARTICULATION_SIMD_FNS_H
#define DY_ARTICULATION_SIMD_FNS_H
#include "DyArticulationUtils.h"
namespace physx
{
namespace Dy
{
template <typename T, PxU32 count>
class PodULike
{
PxU8 space[sizeof(T)*count];
public:
PX_FORCE_INLINE operator T*() { return reinterpret_cast<T*>(space); }
};
#define POD_U_LIKE(_T, _count, _alignment) PX_ALIGN_PREFIX(_alignment) PodULike<_T, _count> PX_ALIGN_SUFFIX(_alignment)
class ArticulationFnsSimdBase
{
public:
static PX_FORCE_INLINE FsInertia addInertia(const FsInertia& in1, const FsInertia& in2)
{
return FsInertia(M33Add(in1.ll, in2.ll),
M33Add(in1.la, in2.la),
M33Add(in1.aa, in2.aa));
}
static PX_FORCE_INLINE FsInertia subtractInertia(const FsInertia& in1, const FsInertia& in2)
{
return FsInertia(M33Sub(in1.ll, in2.ll),
M33Sub(in1.la, in2.la),
M33Sub(in1.aa, in2.aa));
}
static PX_FORCE_INLINE Vec3V axisDot(const Cm::SpatialVectorV S[3], const Cm::SpatialVectorV &v)
{
return V3Merge(FAdd(V3Dot(S[0].linear,v.linear), V3Dot(S[0].angular,v.angular)),
FAdd(V3Dot(S[1].linear,v.linear), V3Dot(S[1].angular,v.angular)),
FAdd(V3Dot(S[2].linear,v.linear), V3Dot(S[2].angular,v.angular)));
}
static PX_FORCE_INLINE Cm::SpatialVectorV axisMultiply(const Cm::SpatialVectorV S[3], Vec3V v)
{
return Cm::SpatialVectorV(V3ScaleAdd(S[0].linear, V3GetX(v), V3ScaleAdd(S[1].linear, V3GetY(v), V3Scale(S[2].linear, V3GetZ(v)))),
V3ScaleAdd(S[0].angular, V3GetX(v), V3ScaleAdd(S[1].angular, V3GetY(v), V3Scale(S[2].angular, V3GetZ(v)))));
}
static PX_FORCE_INLINE Cm::SpatialVectorV subtract(const Cm::SpatialVectorV &a, const Cm::SpatialVectorV &b)
{
return Cm::SpatialVectorV(V3Sub(a.linear, b.linear), V3Sub(a.angular, b.angular));
}
static PX_FORCE_INLINE Cm::SpatialVectorV add(const Cm::SpatialVectorV &a, const Cm::SpatialVectorV &b)
{
return Cm::SpatialVectorV(V3Add(a.linear, b.linear), V3Add(a.angular, b.angular));
}
static PX_FORCE_INLINE Cm::SpatialVectorV multiply(const FsInertia &I, const Cm::SpatialVectorV &S)
{
return Cm::SpatialVectorV(V3Add(M33MulV3(I.ll,S.linear), M33MulV3(I.la,S.angular)),
V3Add(M33TrnspsMulV3(I.la,S.linear), M33MulV3(I.aa,S.angular)));
}
static PX_FORCE_INLINE Cm::SpatialVectorV translateMotion(const Vec3V& p, const Cm::SpatialVectorV& v)
{
return Cm::SpatialVectorV(V3Add(v.linear, V3Cross(p, v.angular)), v.angular);
}
// translate a force resolved at position p to the origin
static PX_FORCE_INLINE Cm::SpatialVectorV translateForce(const Vec3V& p, const Cm::SpatialVectorV& v)
{
return Cm::SpatialVectorV(v.linear, V3Add(v.angular, V3Cross(p, v.linear)));
}
static PX_FORCE_INLINE Mat33V invertSym33(const Mat33V &m)
{
Vec3V a0 = V3Cross(m.col1, m.col2);
Vec3V a1 = V3Cross(m.col2, m.col0);
Vec3V a2 = V3Cross(m.col0, m.col1);
FloatV det = V3Dot(a0, m.col0);
FloatV recipDet = FRecip(det);
a1 = V3SetX(a1, V3GetY(a0));
a2 = V3Merge(V3GetZ(a0), V3GetZ(a1), V3GetZ(a2)); // make sure it's symmetric
return Mat33V(V3Scale(a0, recipDet),
V3Scale(a1, recipDet),
V3Scale(a2, recipDet));
}
static PX_FORCE_INLINE FloatV safeInvSqrt(FloatV v)
{
return FSqrt(FMax(FZero(), FRecip(v)));
}
static PX_FORCE_INLINE Mat33V invSqrt(const Mat33V& m)
{
// cholesky factor to
// (a 0 0)
// (b c 0)
// (d e f)
// except that a,c,f are the reciprocal sqrts rather than sqrts
// PxVec3 v0 = m.column0, v1 = m.column1, v2 = m.column2;
Vec3V v0 = m.col0, v1 = m.col1, v2 = m.col2;
const FloatV x0 = V3GetX(v0), y1 = V3GetY(v1), z2 = V3GetZ(v2);
FloatV a = safeInvSqrt(x0); // PxReal a = PxRecipSqrt(v0.x);
Vec3V abd = V3Scale(v0, a); // PxReal b = v0.y*a;
FloatV b = V3GetY(abd);
FloatV c2 = FNegScaleSub(b, b, y1); // PxReal c = PxRecipSqrt(v1.y - b*b);
FloatV c = safeInvSqrt(c2);
FloatV d = V3GetZ(abd); // PxReal d = v0.z*a;
FloatV e = FMul(FNegScaleSub(b, d, V3GetZ(v1)), c); // PxReal e = (v1.z-d*b) * c;
FloatV f2 = FNegScaleSub(d, d, FNegScaleSub(e, e, z2)); // PxReal f = PxRecipSqrt(v2.z - d*d - e*e);
FloatV f = safeInvSqrt(f2);
// invert
FloatV x = FMul(FMul(b,a),c), // x = -b*a*c
y = FMul((FNegScaleSub(d,a, FMul(e,x))), f), // y = (-e*x-d*a)*f
z = FMul(e, FMul(c,f)); // z = -e*c*f
return Mat33V(V3Merge(a, FZero(), FZero()),
V3Merge(FNeg(x), c, FZero()),
V3Merge(y, FNeg(z), f));
}
static PX_FORCE_INLINE FsInertia invertInertia(const FsInertia &I)
{
Mat33V aa = M33Scale(M33Add(I.aa, M33Trnsps(I.aa)), FHalf());
Mat33V ll = M33Scale(M33Add(I.ll, M33Trnsps(I.ll)), FHalf());
Mat33V AAInv = invertSym33(aa);
Mat33V z = M33MulM33(M33Neg(I.la), AAInv);
Mat33V S = M33Add(ll, M33MulM33(z, M33Trnsps(I.la)));
Mat33V LL = invertSym33(S);
Mat33V LA = M33MulM33(LL, z);
Mat33V AA = M33Add(AAInv, M33MulM33(M33Trnsps(z), LA));
return FsInertia(LL, LA, AA);
}
static PX_NOINLINE Mat33V computeSIS(const FsInertia &I, const Cm::SpatialVectorV S[3], Cm::SpatialVectorV IS[3])
{
Vec3V S0l = S[0].linear, S0a = S[0].angular;
Vec3V S1l = S[1].linear, S1a = S[1].angular;
Vec3V S2l = S[2].linear, S2a = S[2].angular;
Vec3V IS0l = V3Add(M33MulV3(I.ll,S0l), M33MulV3(I.la,S0a));
Vec3V IS0a = V3Add(M33TrnspsMulV3(I.la,S0l), M33MulV3(I.aa,S0a));
Vec3V IS1l = V3Add(M33MulV3(I.ll,S1l), M33MulV3(I.la,S1a));
Vec3V IS1a = V3Add(M33TrnspsMulV3(I.la,S1l), M33MulV3(I.aa,S1a));
Vec3V IS2l = V3Add(M33MulV3(I.ll,S2l), M33MulV3(I.la,S2a));
Vec3V IS2a = V3Add(M33TrnspsMulV3(I.la,S2l), M33MulV3(I.aa,S2a));
// compute SIS
FloatV a00 = FAdd(V3Dot(S0l, IS0l), V3Dot(S0a, IS0a));
FloatV a01 = FAdd(V3Dot(S0l, IS1l), V3Dot(S0a, IS1a));
FloatV a02 = FAdd(V3Dot(S0l, IS2l), V3Dot(S0a, IS2a));
FloatV a11 = FAdd(V3Dot(S1l, IS1l), V3Dot(S1a, IS1a));
FloatV a12 = FAdd(V3Dot(S1l, IS2l), V3Dot(S1a, IS2a));
FloatV a22 = FAdd(V3Dot(S2l, IS2l), V3Dot(S2a, IS2a));
// write IS, a useful side-effect
IS[0].linear = IS0l; IS[0].angular = IS0a;
IS[1].linear = IS1l; IS[1].angular = IS1a;
IS[2].linear = IS2l; IS[2].angular = IS2a;
return Mat33V(V3Merge(a00, a01, a02),
V3Merge(a01, a11, a12),
V3Merge(a02, a12, a22));
}
static PX_FORCE_INLINE FsInertia multiplySubtract(const FsInertia &I, const Mat33V &D, const Cm::SpatialVectorV IS[3], Cm::SpatialVectorV DSI[3])
{
// cut'n'paste, how I love ya, how I love ya
Vec3V IS0l = IS[0].linear, IS0a = IS[0].angular;
Vec3V IS1l = IS[1].linear, IS1a = IS[1].angular;
Vec3V IS2l = IS[2].linear, IS2a = IS[2].angular;
Vec3V D0 = D.col0, D1 = D.col1, D2 = D.col2;
// compute IDS
Vec3V DSI0l = V3ScaleAdd(IS0l, V3GetX(D0), V3ScaleAdd(IS1l, V3GetY(D0), V3Scale(IS2l, V3GetZ(D0))));
Vec3V DSI1l = V3ScaleAdd(IS0l, V3GetX(D1), V3ScaleAdd(IS1l, V3GetY(D1), V3Scale(IS2l, V3GetZ(D1))));
Vec3V DSI2l = V3ScaleAdd(IS0l, V3GetX(D2), V3ScaleAdd(IS1l, V3GetY(D2), V3Scale(IS2l, V3GetZ(D2))));
Vec3V DSI0a = V3ScaleAdd(IS0a, V3GetX(D0), V3ScaleAdd(IS1a, V3GetY(D0), V3Scale(IS2a, V3GetZ(D0))));
Vec3V DSI1a = V3ScaleAdd(IS0a, V3GetX(D1), V3ScaleAdd(IS1a, V3GetY(D1), V3Scale(IS2a, V3GetZ(D1))));
Vec3V DSI2a = V3ScaleAdd(IS0a, V3GetX(D2), V3ScaleAdd(IS1a, V3GetY(D2), V3Scale(IS2a, V3GetZ(D2))));
// compute J = I - DSI' IS. Each row of DSI' IS generates an inertia dyad
Vec3V ll0 = I.ll.col0, ll1 = I.ll.col1, ll2 = I.ll.col2;
Vec3V la0 = I.la.col0, la1 = I.la.col1, la2 = I.la.col2;
Vec3V aa0 = I.aa.col0, aa1 = I.aa.col1, aa2 = I.aa.col2;
#define SUBTRACT_DYAD(_a, _b) \
ll0 = V3NegScaleSub(_b##l, V3GetX(_a##l), ll0); la0 = V3NegScaleSub(_b##l, V3GetX(_a##a), la0); aa0 = V3NegScaleSub(_b##a, V3GetX(_a##a), aa0); \
ll1 = V3NegScaleSub(_b##l, V3GetY(_a##l), ll1); la1 = V3NegScaleSub(_b##l, V3GetY(_a##a), la1); aa1 = V3NegScaleSub(_b##a, V3GetY(_a##a), aa1); \
ll2 = V3NegScaleSub(_b##l, V3GetZ(_a##l), ll2); la2 = V3NegScaleSub(_b##l, V3GetZ(_a##a), la2); aa2 = V3NegScaleSub(_b##a, V3GetZ(_a##a), aa2);
SUBTRACT_DYAD(IS0, DSI0);
SUBTRACT_DYAD(IS1, DSI1);
SUBTRACT_DYAD(IS2, DSI2);
#undef SUBTRACT_DYAD
DSI[0].linear = DSI0l; DSI[0].angular = DSI0a;
DSI[1].linear = DSI1l; DSI[1].angular = DSI1a;
DSI[2].linear = DSI2l; DSI[2].angular = DSI2a;
return FsInertia(Mat33V(ll0, ll1, ll2),
Mat33V(la0, la1, la2),
Mat33V(aa0, aa1, aa2));
}
static PX_FORCE_INLINE FsInertia multiplySubtract(const FsInertia &I, const Cm::SpatialVectorV S[3])
{
// cut'n'paste, how I love ya, how I love ya
const Vec3V S0l = S[0].linear, S0a = S[0].angular;
const Vec3V S1l = S[1].linear, S1a = S[1].angular;
const Vec3V S2l = S[2].linear, S2a = S[2].angular;
// compute J = I - DSI' IS. Each row of DSI' IS generates an inertia dyad
Vec3V ll0 = I.ll.col0, ll1 = I.ll.col1, ll2 = I.ll.col2;
Vec3V la0 = I.la.col0, la1 = I.la.col1, la2 = I.la.col2;
Vec3V aa0 = I.aa.col0, aa1 = I.aa.col1, aa2 = I.aa.col2;
#define SUBTRACT_DYAD(_a, _b) \
ll0 = V3NegScaleSub(_b##l, V3GetX(_a##l), ll0); la0 = V3NegScaleSub(_b##l, V3GetX(_a##a), la0); aa0 = V3NegScaleSub(_b##a, V3GetX(_a##a), aa0); \
ll1 = V3NegScaleSub(_b##l, V3GetY(_a##l), ll1); la1 = V3NegScaleSub(_b##l, V3GetY(_a##a), la1); aa1 = V3NegScaleSub(_b##a, V3GetY(_a##a), aa1); \
ll2 = V3NegScaleSub(_b##l, V3GetZ(_a##l), ll2); la2 = V3NegScaleSub(_b##l, V3GetZ(_a##a), la2); aa2 = V3NegScaleSub(_b##a, V3GetZ(_a##a), aa2);
SUBTRACT_DYAD(S0, S0);
SUBTRACT_DYAD(S1, S1);
SUBTRACT_DYAD(S2, S2);
#undef SUBTRACT_DYAD
return FsInertia(Mat33V(ll0, ll1, ll2),
Mat33V(la0, la1, la2),
Mat33V(aa0, aa1, aa2));
}
static PX_FORCE_INLINE FsInertia translateInertia(Vec3V a, const FsInertia &input)
{
Vec3V b = V3Neg(a);
Vec3V la0 = input.la.col0, la1 = input.la.col1, la2 = input.la.col2;
Vec3V ll0 = input.ll.col0, ll1 = input.ll.col1, ll2 = input.ll.col2;
Vec3V aa0 = input.aa.col0, aa1 = input.aa.col1, aa2 = input.aa.col2;
FloatV aX = V3GetX(a), aY = V3GetY(a), aZ = V3GetZ(a);
FloatV bX = V3GetX(b), bY = V3GetY(b), bZ = V3GetZ(b);
FloatV Z = FZero();
// s - star matrix of a
Vec3V s0 = V3Merge(Z, aZ, bY),
s1 = V3Merge(bZ, Z, aX),
s2 = V3Merge(aY, bX, Z);
// s * la
Vec3V sla0 = V3ScaleAdd(s0, V3GetX(la0), V3ScaleAdd(s1, V3GetY(la0), V3Scale(s2, V3GetZ(la0))));
Vec3V sla1 = V3ScaleAdd(s0, V3GetX(la1), V3ScaleAdd(s1, V3GetY(la1), V3Scale(s2, V3GetZ(la1))));
Vec3V sla2 = V3ScaleAdd(s0, V3GetX(la2), V3ScaleAdd(s1, V3GetY(la2), V3Scale(s2, V3GetZ(la2))));
// ll * s.transpose() (ll is symmetric)
Vec3V llst0 = V3ScaleAdd(ll2, aY, V3Scale(ll1, bZ)),
llst1 = V3ScaleAdd(ll0, aZ, V3Scale(ll2, bX)),
llst2 = V3ScaleAdd(ll1, aX, V3Scale(ll0, bY));
// t = sla+S*llst*0.5f;
Vec3V sllst0 = V3ScaleAdd(s2, V3GetZ(llst0), V3ScaleAdd(s1, V3GetY(llst0), V3Scale(s0, V3GetX(llst0))));
Vec3V sllst1 = V3ScaleAdd(s2, V3GetZ(llst1), V3ScaleAdd(s1, V3GetY(llst1), V3Scale(s0, V3GetX(llst1))));
Vec3V sllst2 = V3ScaleAdd(s2, V3GetZ(llst2), V3ScaleAdd(s1, V3GetY(llst2), V3Scale(s0, V3GetX(llst2))));
Vec3V t0 = V3ScaleAdd(sllst0, FHalf(), sla0);
Vec3V t1 = V3ScaleAdd(sllst1, FHalf(), sla1);
Vec3V t2 = V3ScaleAdd(sllst2, FHalf(), sla2);
// t+t.transpose()
Vec3V r0 = V3Add(t0, V3Merge(V3GetX(t0), V3GetX(t1), V3GetX(t2))),
r1 = V3Add(t1, V3Merge(V3GetY(t0), V3GetY(t1), V3GetY(t2))),
r2 = V3Add(t2, V3Merge(V3GetZ(t0), V3GetZ(t1), V3GetZ(t2)));
return FsInertia(Mat33V(ll0, ll1, ll2),
Mat33V(V3Add(la0, llst0),
V3Add(la1, llst1),
V3Add(la2, llst2)),
Mat33V(V3Add(aa0, r0),
V3Add(aa1, r1),
V3Add(aa2, r2)));
}
};
template<class Base>
class ArticulationFnsSimd : public Base
{
static PX_FORCE_INLINE void axisMultiplyLowerTriangular(Cm::SpatialVectorV ES[3], const Mat33V&E, const Cm::SpatialVectorV S[3])
{
const Vec3V l0 = S[0].linear, l1 = S[1].linear, l2 = S[2].linear;
const Vec3V a0 = S[0].angular, a1 = S[1].angular, a2 = S[2].angular;
ES[0] = Cm::SpatialVectorV(V3Scale(l0, V3GetX(E.col0)),
V3Scale(a0, V3GetX(E.col0)));
ES[1] = Cm::SpatialVectorV(V3ScaleAdd(l0, V3GetX(E.col1), V3Scale(l1, V3GetY(E.col1))),
V3ScaleAdd(a0, V3GetX(E.col1), V3Scale(a1, V3GetY(E.col1))));
ES[2] = Cm::SpatialVectorV(V3ScaleAdd(l0, V3GetX(E.col2), V3ScaleAdd(l1, V3GetY(E.col2), V3Scale(l2, V3GetZ(E.col2)))),
V3ScaleAdd(a0, V3GetX(E.col2), V3ScaleAdd(a1, V3GetY(E.col2), V3Scale(a2, V3GetZ(E.col2)))));
}
public:
static PX_FORCE_INLINE FsInertia propagate(const FsInertia &I,
const Cm::SpatialVectorV S[3],
const Mat33V &load,
const FloatV isf)
{
Cm::SpatialVectorV IS[3], ISE[3];
Mat33V D = Base::computeSIS(I, S, IS);
D.col0 = V3ScaleAdd(load.col0, isf, D.col0);
D.col1 = V3ScaleAdd(load.col1, isf, D.col1);
D.col2 = V3ScaleAdd(load.col2, isf, D.col2);
axisMultiplyLowerTriangular(ISE, Base::invSqrt(D), IS);
return Base::multiplySubtract(I, ISE);
}
static PX_INLINE Cm::SpatialVectorV propagateImpulse(const FsRow& row,
const FsJointVectors& jv,
Vec3V& SZ,
const Cm::SpatialVectorV& Z,
const FsRowAux& aux)
{
PX_UNUSED(aux);
SZ = V3Add(Z.angular, V3Cross(Z.linear, jv.jointOffset));
return Base::translateForce(jv.parentOffset, Z - Base::axisMultiply(row.DSI, SZ));
}
static PX_INLINE Cm::SpatialVectorV propagateVelocity(const FsRow& row,
const FsJointVectors& jv,
const Vec3V& SZ,
const Cm::SpatialVectorV& v,
const FsRowAux& aux)
{
PX_UNUSED(aux);
Cm::SpatialVectorV w = Base::translateMotion(V3Neg(jv.parentOffset), v);
Vec3V DSZ = M33MulV3(row.D, SZ);
Vec3V n = V3Add(Base::axisDot(row.DSI, w), DSZ);
return w - Cm::SpatialVectorV(V3Cross(jv.jointOffset, n), n);
}
static
#if !(PX_ANDROID && PX_DEBUG) // Not inlining for Android Debug to avoid "conditional branch out of range" compilation error on arm64-v8a ABI
PX_FORCE_INLINE
#endif
Mat33V computeDriveInertia(const FsInertia &I0,
const FsInertia &I1,
const Cm::SpatialVectorV S[3])
{
POD_U_LIKE(Cm::SpatialVectorV, 3, 16) IS, ISD, dummy;
Mat33V D = Base::computeSIS(I0, S, IS);
Mat33V DInv = Base::invertSym33(D);
FsInertia tmp = Base::addInertia(I0, I1);
tmp = Base::multiplySubtract(tmp, DInv, IS, ISD);
FsInertia J = Base::invertInertia(tmp);
Mat33V E = Base::computeSIS(J, ISD, dummy);
return Base::invertSym33(M33Add(DInv,E));
}
};
}
}
#endif //DY_ARTICULATION_SIMD_FNS_H