GRK-Projekt-Scena-Podwodna/dependencies/PHYSX/source/lowleveldynamics/src/DyTGSContactPrepBlock.cpp
2022-02-11 15:37:18 +01:00

3663 lines
151 KiB
C++

//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2008-2019 NVIDIA Corporation. All rights reserved.
// Copyright (c) 2004-2008 AGEIA Technologies, Inc. All rights reserved.
// Copyright (c) 2001-2004 NovodeX AG. All rights reserved.
#include "foundation/PxPreprocessor.h"
#include "PxSceneDesc.h"
#include "PsVecMath.h"
#include "PsMathUtils.h"
#include "DySolverContact.h"
#include "DySolverContact4.h"
#include "DySolverConstraintTypes.h"
#include "PxcNpWorkUnit.h"
#include "DyThreadContext.h"
#include "DyContactPrep.h"
#include "PxcNpContactPrepShared.h"
#include "PxvDynamics.h"
#include "DyCorrelationBuffer.h"
#include "DyDynamics.h"
#include "DyArticulationContactPrep.h"
#include "PxsContactManager.h"
#include "PsFoundation.h"
#include "DyTGSDynamics.h"
using namespace physx;
using namespace Gu;
#include "PsVecMath.h"
#include "PxContactModifyCallback.h"
#include "PxsMaterialManager.h"
#include "PxsMaterialCombiner.h"
#include "DyContactPrepShared.h"
#include "DySolverConstraint1D.h"
#include "DyConstraintPrep.h"
#include "CmConeLimitHelper.h"
#include "DySolverContext.h"
namespace physx
{
namespace Dy
{
inline bool ValidateVec4(const Vec4V v)
{
PX_ALIGN(16, PxVec4 vF);
Ps::aos::V4StoreA(v, &vF.x);
return vF.isFinite();
}
PX_FORCE_INLINE void QuatRotate4(const Vec4VArg qx, const Vec4VArg qy, const Vec4VArg qz, const Vec4VArg qw, const Vec4VArg vx, const Vec4VArg vy, const Vec4VArg vz,
Vec4V& rX, Vec4V& rY, Vec4V& rZ)
{
/*
const PxVec3 qv(x,y,z);
return (v*(w*w-0.5f) + (qv.cross(v))*w + qv*(qv.dot(v)))*2;
*/
const Vec4V two = V4Splat(FLoad(2.f));
const Vec4V nhalf = V4Splat(FLoad(-0.5f));
const Vec4V w2 = V4MulAdd(qw, qw, nhalf);
const Vec4V ax = V4Mul(vx, w2);
const Vec4V ay = V4Mul(vy, w2);
const Vec4V az = V4Mul(vz, w2);
const Vec4V crX = V4NegMulSub(qz, vy, V4Mul(qy, vz));
const Vec4V crY = V4NegMulSub(qx, vz, V4Mul(qz, vx));
const Vec4V crZ = V4NegMulSub(qy, vx, V4Mul(qx, vy));
const Vec4V tempX = V4MulAdd(crX, qw, ax);
const Vec4V tempY = V4MulAdd(crY, qw, ay);
const Vec4V tempZ = V4MulAdd(crZ, qw, az);
Vec4V dotuv = V4Mul(qx, vx);
dotuv = V4MulAdd(qy, vy, dotuv);
dotuv = V4MulAdd(qz, vz, dotuv);
rX = V4Mul(V4MulAdd(qx, dotuv, tempX), two);
rY = V4Mul(V4MulAdd(qy, dotuv, tempY), two);
rZ = V4Mul(V4MulAdd(qz, dotuv, tempZ), two);
}
struct SolverContactHeaderStepBlock
{
enum
{
eHAS_MAX_IMPULSE = 1 << 0,
eHAS_TARGET_VELOCITY = 1 << 1
};
PxU8 type; //Note: mType should be first as the solver expects a type in the first byte.
PxU8 numNormalConstr;
PxU8 numFrictionConstr;
PxU8 flag;
PxU8 flags[4];
//KS - used for write-back only
PxU8 numNormalConstrs[4];
PxU8 numFrictionConstrs[4];
Vec4V restitution;
Vec4V staticFriction;
Vec4V dynamicFriction;
//Technically, these mass properties could be pulled out into a new structure and shared. For multi-manifold contacts,
//this would save 64 bytes per-manifold after the cost of the first manifold
Vec4V invMass0D0;
Vec4V invMass1D1;
Vec4V angDom0;
Vec4V angDom1;
//Normal is shared between all contacts in the batch. This will save some memory!
Vec4V normalX;
Vec4V normalY;
Vec4V normalZ;
Vec4V maxPenBias;
Sc::ShapeInteraction* shapeInteraction[4]; //192 or 208
BoolV broken;
PxU8* frictionBrokenWritebackByte[4];
};
struct SolverContactPointStepBlock
{
Vec4V raXnI[3];
Vec4V rbXnI[3];
Vec4V separation;
Vec4V velMultiplier;
Vec4V targetVelocity;
Vec4V biasCoefficient;
};
//KS - technically, this friction constraint has identical data to the above contact constraint.
//We make them separate structs for clarity
struct SolverContactFrictionStepBlock
{
Vec4V normal[3];
Vec4V raXnI[3];
Vec4V rbXnI[3];
Vec4V error;
Vec4V velMultiplier;
Vec4V targetVel;
Vec4V biasCoefficient;
};
struct SolverConstraint1DHeaderStep4
{
PxU8 type; // enum SolverConstraintType - must be first byte
PxU8 pad0[3];
//These counts are the max of the 4 sets of data.
//When certain pairs have fewer constraints than others, they are padded with 0s so that no work is performed but
//calculations are still shared (afterall, they're computationally free because we're doing 4 things at a time in SIMD)
PxU32 count;
PxU8 counts[4];
PxU8 breakable[4];
Vec4V linBreakImpulse;
Vec4V angBreakImpulse;
Vec4V invMass0D0;
Vec4V invMass1D1;
Vec4V angD0;
Vec4V angD1;
Vec4V body0WorkOffset[3];
Vec4V rAWorld[3];
Vec4V rBWorld[3];
Vec4V angOrthoAxis0X[3];
Vec4V angOrthoAxis0Y[3];
Vec4V angOrthoAxis0Z[3];
Vec4V angOrthoAxis1X[3];
Vec4V angOrthoAxis1Y[3];
Vec4V angOrthoAxis1Z[3];
Vec4V angOrthoRecipResponse[3];
Vec4V angOrthoError[3];
};
PX_ALIGN_PREFIX(16)
struct SolverConstraint1DStep4
{
public:
Vec4V lin0[3]; //!< linear velocity projection (body 0)
Vec4V error; //!< constraint error term - must be scaled by biasScale. Can be adjusted at run-time
Vec4V lin1[3]; //!< linear velocity projection (body 1)
Vec4V biasScale; //!< constraint constant bias scale. Constant
Vec4V ang0[3]; //!< angular velocity projection (body 0)
Vec4V velMultiplier; //!< constraint velocity multiplier
Vec4V ang1[3]; //!< angular velocity projection (body 1)
Vec4V impulseMultiplier; //!< constraint impulse multiplier
Vec4V velTarget; //!< Scaled target velocity of the constraint drive
Vec4V minImpulse; //!< Lower bound on impulse magnitude
Vec4V maxImpulse; //!< Upper bound on impulse magnitude
Vec4V appliedForce; //!< applied force to correct velocity+bias
Vec4V maxBias;
Vec4V angularErrorScale; //Constant
PxU32 flags[4];
} PX_ALIGN_SUFFIX(16);
static void setupFinalizeSolverConstraints4Step(PxTGSSolverContactDesc* PX_RESTRICT descs, CorrelationBuffer& c,
PxU8* PX_RESTRICT workspace, const PxReal invDtF32, const PxReal invTotalDtF32, PxReal bounceThresholdF32, const PxReal solverOffsetSlopF32,
const Ps::aos::Vec4VArg invMassScale0, const Ps::aos::Vec4VArg invInertiaScale0,
const Ps::aos::Vec4VArg invMassScale1, const Ps::aos::Vec4VArg invInertiaScale1)
{
//OK, we have a workspace of pre-allocated space to store all 4 descs in. We now need to create the constraints in it
//const Vec4V ccdMaxSeparation = Ps::aos::V4LoadXYZW(descs[0].maxCCDSeparation, descs[1].maxCCDSeparation, descs[2].maxCCDSeparation, descs[3].maxCCDSeparation);
const Vec4V solverOffsetSlop = V4Load(solverOffsetSlopF32);
const Vec4V zero = V4Zero();
const BoolV bFalse = BFFFF();
const FloatV fZero = FZero();
PxU8 flags[4] = { PxU8(descs[0].hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0),
PxU8(descs[1].hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0),
PxU8(descs[2].hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0),
PxU8(descs[3].hasForceThresholds ? SolverContactHeader::eHAS_FORCE_THRESHOLDS : 0) };
bool hasMaxImpulse = descs[0].hasMaxImpulse || descs[1].hasMaxImpulse || descs[2].hasMaxImpulse || descs[3].hasMaxImpulse;
//The block is dynamic if **any** of the constraints have a non-static body B. This allows us to batch static and non-static constraints but we only get a memory/perf
//saving if all 4 are static. This simplifies the constraint partitioning such that it only needs to care about separating contacts and 1D constraints (which it already does)
bool isDynamic = false;
bool hasKinematic = false;
PxReal kinematicScale0F32[4];
PxReal kinematicScale1F32[4];
for (PxU32 a = 0; a < 4; ++a)
{
isDynamic = isDynamic || (descs[a].bodyState1 == PxSolverContactDesc::eDYNAMIC_BODY);
hasKinematic = hasKinematic || descs[a].bodyState1 == PxSolverContactDesc::eKINEMATIC_BODY;
kinematicScale0F32[a] = descs[a].body0->isKinematic ? 1.f : 0.f;
kinematicScale1F32[a] = descs[a].body1->isKinematic ? 1.f : 0.f;
}
/*BoolV kinematic0 = BLoad(isKinematic0);
BoolV kinematic1 = BLoad(isKinematic1);*/
const Vec4V kinematicScale0 = V4LoadU(kinematicScale0F32);
const Vec4V kinematicScale1 = V4LoadU(kinematicScale1F32);
const PxU32 constraintSize = sizeof(SolverContactPointStepBlock);
const PxU32 frictionSize = sizeof(SolverContactFrictionStepBlock);
PxU8* PX_RESTRICT ptr = workspace;
const Vec4V dom0 = invMassScale0;
const Vec4V dom1 = invMassScale1;
const Vec4V angDom0 = invInertiaScale0;
const Vec4V angDom1 = invInertiaScale1;
const Vec4V maxPenBias = V4Max(V4LoadXYZW(descs[0].bodyData0->penBiasClamp, descs[1].bodyData0->penBiasClamp,
descs[2].bodyData0->penBiasClamp, descs[3].bodyData0->penBiasClamp),
V4LoadXYZW(descs[0].bodyData1->penBiasClamp, descs[1].bodyData1->penBiasClamp,
descs[2].bodyData1->penBiasClamp, descs[3].bodyData1->penBiasClamp));
const Vec4V restDistance = V4LoadXYZW(descs[0].restDistance, descs[1].restDistance, descs[2].restDistance,
descs[3].restDistance);
//load up velocities
Vec4V linVel00 = V4LoadA(&descs[0].bodyData0->originalLinearVelocity.x);
Vec4V linVel10 = V4LoadA(&descs[1].bodyData0->originalLinearVelocity.x);
Vec4V linVel20 = V4LoadA(&descs[2].bodyData0->originalLinearVelocity.x);
Vec4V linVel30 = V4LoadA(&descs[3].bodyData0->originalLinearVelocity.x);
Vec4V linVel01 = V4LoadA(&descs[0].bodyData1->originalLinearVelocity.x);
Vec4V linVel11 = V4LoadA(&descs[1].bodyData1->originalLinearVelocity.x);
Vec4V linVel21 = V4LoadA(&descs[2].bodyData1->originalLinearVelocity.x);
Vec4V linVel31 = V4LoadA(&descs[3].bodyData1->originalLinearVelocity.x);
Vec4V angVel00 = V4LoadA(&descs[0].bodyData0->originalAngularVelocity.x);
Vec4V angVel10 = V4LoadA(&descs[1].bodyData0->originalAngularVelocity.x);
Vec4V angVel20 = V4LoadA(&descs[2].bodyData0->originalAngularVelocity.x);
Vec4V angVel30 = V4LoadA(&descs[3].bodyData0->originalAngularVelocity.x);
Vec4V angVel01 = V4LoadA(&descs[0].bodyData1->originalAngularVelocity.x);
Vec4V angVel11 = V4LoadA(&descs[1].bodyData1->originalAngularVelocity.x);
Vec4V angVel21 = V4LoadA(&descs[2].bodyData1->originalAngularVelocity.x);
Vec4V angVel31 = V4LoadA(&descs[3].bodyData1->originalAngularVelocity.x);
Vec4V linVelT00, linVelT10, linVelT20;
Vec4V linVelT01, linVelT11, linVelT21;
Vec4V angVelT00, angVelT10, angVelT20;
Vec4V angVelT01, angVelT11, angVelT21;
PX_TRANSPOSE_44_34(linVel00, linVel10, linVel20, linVel30, linVelT00, linVelT10, linVelT20);
PX_TRANSPOSE_44_34(linVel01, linVel11, linVel21, linVel31, linVelT01, linVelT11, linVelT21);
PX_TRANSPOSE_44_34(angVel00, angVel10, angVel20, angVel30, angVelT00, angVelT10, angVelT20);
PX_TRANSPOSE_44_34(angVel01, angVel11, angVel21, angVel31, angVelT01, angVelT11, angVelT21);
const Vec4V vrelX = V4Sub(linVelT00, linVelT01);
const Vec4V vrelY = V4Sub(linVelT10, linVelT11);
const Vec4V vrelZ = V4Sub(linVelT20, linVelT21);
//Load up masses and invInertia
const Vec4V invMass0 = V4LoadXYZW(descs[0].bodyData0->invMass, descs[1].bodyData0->invMass, descs[2].bodyData0->invMass, descs[3].bodyData0->invMass);
const Vec4V invMass1 = V4LoadXYZW(descs[0].bodyData1->invMass, descs[1].bodyData1->invMass, descs[2].bodyData1->invMass, descs[3].bodyData1->invMass);
const Vec4V invMass0D0 = V4Mul(dom0, invMass0);
const Vec4V invMass1D1 = V4Mul(dom1, invMass1);
Vec4V invInertia00X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[0].body0TxI->sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia00Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[0].body0TxI->sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia00Z = Vec4V_From_Vec3V(V3LoadU(descs[0].body0TxI->sqrtInvInertia.column2));
Vec4V invInertia10X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[1].body0TxI->sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia10Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[1].body0TxI->sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia10Z = Vec4V_From_Vec3V(V3LoadU(descs[1].body0TxI->sqrtInvInertia.column2));
Vec4V invInertia20X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[2].body0TxI->sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia20Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[2].body0TxI->sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia20Z = Vec4V_From_Vec3V(V3LoadU(descs[2].body0TxI->sqrtInvInertia.column2));
Vec4V invInertia30X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[3].body0TxI->sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia30Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[3].body0TxI->sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia30Z = Vec4V_From_Vec3V(V3LoadU(descs[3].body0TxI->sqrtInvInertia.column2));
Vec4V invInertia01X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[0].body1TxI->sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia01Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[0].body1TxI->sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia01Z = Vec4V_From_Vec3V(V3LoadU(descs[0].body1TxI->sqrtInvInertia.column2));
Vec4V invInertia11X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[1].body1TxI->sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia11Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[1].body1TxI->sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia11Z = Vec4V_From_Vec3V(V3LoadU(descs[1].body1TxI->sqrtInvInertia.column2));
Vec4V invInertia21X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[2].body1TxI->sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia21Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[2].body1TxI->sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia21Z = Vec4V_From_Vec3V(V3LoadU(descs[2].body1TxI->sqrtInvInertia.column2));
Vec4V invInertia31X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[3].body1TxI->sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia31Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(descs[3].body1TxI->sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia31Z = Vec4V_From_Vec3V(V3LoadU(descs[3].body1TxI->sqrtInvInertia.column2));
Vec4V invInertia0X0, invInertia0X1, invInertia0X2;
Vec4V invInertia0Y0, invInertia0Y1, invInertia0Y2;
Vec4V invInertia0Z0, invInertia0Z1, invInertia0Z2;
Vec4V invInertia1X0, invInertia1X1, invInertia1X2;
Vec4V invInertia1Y0, invInertia1Y1, invInertia1Y2;
Vec4V invInertia1Z0, invInertia1Z1, invInertia1Z2;
PX_TRANSPOSE_44_34(invInertia00X, invInertia10X, invInertia20X, invInertia30X, invInertia0X0, invInertia0Y0, invInertia0Z0);
PX_TRANSPOSE_44_34(invInertia00Y, invInertia10Y, invInertia20Y, invInertia30Y, invInertia0X1, invInertia0Y1, invInertia0Z1);
PX_TRANSPOSE_44_34(invInertia00Z, invInertia10Z, invInertia20Z, invInertia30Z, invInertia0X2, invInertia0Y2, invInertia0Z2);
PX_TRANSPOSE_44_34(invInertia01X, invInertia11X, invInertia21X, invInertia31X, invInertia1X0, invInertia1Y0, invInertia1Z0);
PX_TRANSPOSE_44_34(invInertia01Y, invInertia11Y, invInertia21Y, invInertia31Y, invInertia1X1, invInertia1Y1, invInertia1Z1);
PX_TRANSPOSE_44_34(invInertia01Z, invInertia11Z, invInertia21Z, invInertia31Z, invInertia1X2, invInertia1Y2, invInertia1Z2);
const FloatV invDt = FLoad(invDtF32);
const FloatV frictionBiasScale = invDt;
const FloatV invTotalDt = FLoad(invTotalDtF32);
const FloatV p8 = FLoad(0.8f);
const Vec4V p84 = V4Splat(p8);
const Vec4V bounceThreshold = V4Splat(FLoad(bounceThresholdF32));
const Vec4V invDtp8 = V4Splat(FMul(invDt, p8));
const Vec3V bodyFrame00p = V3LoadU(descs[0].bodyFrame0.p);
const Vec3V bodyFrame01p = V3LoadU(descs[1].bodyFrame0.p);
const Vec3V bodyFrame02p = V3LoadU(descs[2].bodyFrame0.p);
const Vec3V bodyFrame03p = V3LoadU(descs[3].bodyFrame0.p);
Vec4V bodyFrame00p4 = Vec4V_From_Vec3V(bodyFrame00p);
Vec4V bodyFrame01p4 = Vec4V_From_Vec3V(bodyFrame01p);
Vec4V bodyFrame02p4 = Vec4V_From_Vec3V(bodyFrame02p);
Vec4V bodyFrame03p4 = Vec4V_From_Vec3V(bodyFrame03p);
Vec4V bodyFrame0pX, bodyFrame0pY, bodyFrame0pZ;
PX_TRANSPOSE_44_34(bodyFrame00p4, bodyFrame01p4, bodyFrame02p4, bodyFrame03p4, bodyFrame0pX, bodyFrame0pY, bodyFrame0pZ);
const Vec3V bodyFrame10p = V3LoadU(descs[0].bodyFrame1.p);
const Vec3V bodyFrame11p = V3LoadU(descs[1].bodyFrame1.p);
const Vec3V bodyFrame12p = V3LoadU(descs[2].bodyFrame1.p);
const Vec3V bodyFrame13p = V3LoadU(descs[3].bodyFrame1.p);
Vec4V bodyFrame10p4 = Vec4V_From_Vec3V(bodyFrame10p);
Vec4V bodyFrame11p4 = Vec4V_From_Vec3V(bodyFrame11p);
Vec4V bodyFrame12p4 = Vec4V_From_Vec3V(bodyFrame12p);
Vec4V bodyFrame13p4 = Vec4V_From_Vec3V(bodyFrame13p);
Vec4V bodyFrame1pX, bodyFrame1pY, bodyFrame1pZ;
PX_TRANSPOSE_44_34(bodyFrame10p4, bodyFrame11p4, bodyFrame12p4, bodyFrame13p4, bodyFrame1pX, bodyFrame1pY, bodyFrame1pZ);
const QuatV bodyFrame00q = QuatVLoadU(&descs[0].bodyFrame0.q.x);
const QuatV bodyFrame01q = QuatVLoadU(&descs[1].bodyFrame0.q.x);
const QuatV bodyFrame02q = QuatVLoadU(&descs[2].bodyFrame0.q.x);
const QuatV bodyFrame03q = QuatVLoadU(&descs[3].bodyFrame0.q.x);
const QuatV bodyFrame10q = QuatVLoadU(&descs[0].bodyFrame1.q.x);
const QuatV bodyFrame11q = QuatVLoadU(&descs[1].bodyFrame1.q.x);
const QuatV bodyFrame12q = QuatVLoadU(&descs[2].bodyFrame1.q.x);
const QuatV bodyFrame13q = QuatVLoadU(&descs[3].bodyFrame1.q.x);
PxU32 frictionPatchWritebackAddrIndex0 = 0;
PxU32 frictionPatchWritebackAddrIndex1 = 0;
PxU32 frictionPatchWritebackAddrIndex2 = 0;
PxU32 frictionPatchWritebackAddrIndex3 = 0;
Ps::prefetchLine(c.contactID);
Ps::prefetchLine(c.contactID, 128);
PxU32 frictionIndex0 = 0, frictionIndex1 = 0, frictionIndex2 = 0, frictionIndex3 = 0;
//PxU32 contactIndex0 = 0, contactIndex1 = 0, contactIndex2 = 0, contactIndex3 = 0;
//OK, we iterate through all friction patch counts in the constraint patch, building up the constraint list etc.
PxU32 maxPatches = PxMax(descs[0].numFrictionPatches, PxMax(descs[1].numFrictionPatches, PxMax(descs[2].numFrictionPatches, descs[3].numFrictionPatches)));
const Vec4V p1 = V4Splat(FLoad(0.0001f));
const Vec4V orthoThreshold = V4Splat(FLoad(0.70710678f));
PxU32 contact0 = 0, contact1 = 0, contact2 = 0, contact3 = 0;
PxU32 patch0 = 0, patch1 = 0, patch2 = 0, patch3 = 0;
PxU8 flag = 0;
if (hasMaxImpulse)
flag |= SolverContactHeader4::eHAS_MAX_IMPULSE;
for (PxU32 i = 0; i<maxPatches; i++)
{
const bool hasFinished0 = i >= descs[0].numFrictionPatches;
const bool hasFinished1 = i >= descs[1].numFrictionPatches;
const bool hasFinished2 = i >= descs[2].numFrictionPatches;
const bool hasFinished3 = i >= descs[3].numFrictionPatches;
frictionIndex0 = hasFinished0 ? frictionIndex0 : descs[0].startFrictionPatchIndex + i;
frictionIndex1 = hasFinished1 ? frictionIndex1 : descs[1].startFrictionPatchIndex + i;
frictionIndex2 = hasFinished2 ? frictionIndex2 : descs[2].startFrictionPatchIndex + i;
frictionIndex3 = hasFinished3 ? frictionIndex3 : descs[3].startFrictionPatchIndex + i;
PxU32 clampedContacts0 = hasFinished0 ? 0 : c.frictionPatchContactCounts[frictionIndex0];
PxU32 clampedContacts1 = hasFinished1 ? 0 : c.frictionPatchContactCounts[frictionIndex1];
PxU32 clampedContacts2 = hasFinished2 ? 0 : c.frictionPatchContactCounts[frictionIndex2];
PxU32 clampedContacts3 = hasFinished3 ? 0 : c.frictionPatchContactCounts[frictionIndex3];
PxU32 firstPatch0 = c.correlationListHeads[frictionIndex0];
PxU32 firstPatch1 = c.correlationListHeads[frictionIndex1];
PxU32 firstPatch2 = c.correlationListHeads[frictionIndex2];
PxU32 firstPatch3 = c.correlationListHeads[frictionIndex3];
const Gu::ContactPoint* contactBase0 = descs[0].contacts + c.contactPatches[firstPatch0].start;
const Gu::ContactPoint* contactBase1 = descs[1].contacts + c.contactPatches[firstPatch1].start;
const Gu::ContactPoint* contactBase2 = descs[2].contacts + c.contactPatches[firstPatch2].start;
const Gu::ContactPoint* contactBase3 = descs[3].contacts + c.contactPatches[firstPatch3].start;
const Vec4V restitution = V4Neg(V4LoadXYZW(contactBase0->restitution, contactBase1->restitution, contactBase2->restitution,
contactBase3->restitution));
SolverContactHeaderStepBlock* PX_RESTRICT header = reinterpret_cast<SolverContactHeaderStepBlock*>(ptr);
ptr += sizeof(SolverContactHeaderStepBlock);
header->flags[0] = flags[0];
header->flags[1] = flags[1];
header->flags[2] = flags[2];
header->flags[3] = flags[3];
header->flag = flag;
PxU32 totalContacts = PxMax(clampedContacts0, PxMax(clampedContacts1, PxMax(clampedContacts2, clampedContacts3)));
Vec4V* PX_RESTRICT appliedNormalForces = reinterpret_cast<Vec4V*>(ptr);
ptr += sizeof(Vec4V)*totalContacts;
PxMemZero(appliedNormalForces, sizeof(Vec4V) * totalContacts);
header->numNormalConstr = Ps::to8(totalContacts);
header->numNormalConstrs[0] = Ps::to8(clampedContacts0);
header->numNormalConstrs[1] = Ps::to8(clampedContacts1);
header->numNormalConstrs[2] = Ps::to8(clampedContacts2);
header->numNormalConstrs[3] = Ps::to8(clampedContacts3);
//header->sqrtInvMassA = sqrtInvMass0;
//header->sqrtInvMassB = sqrtInvMass1;
header->invMass0D0 = invMass0D0;
header->invMass1D1 = invMass1D1;
header->angDom0 = angDom0;
header->angDom1 = angDom1;
header->shapeInteraction[0] = descs[0].shapeInteraction; header->shapeInteraction[1] = descs[1].shapeInteraction;
header->shapeInteraction[2] = descs[2].shapeInteraction; header->shapeInteraction[3] = descs[3].shapeInteraction;
Vec4V* maxImpulse = reinterpret_cast<Vec4V*>(ptr + constraintSize * totalContacts);
header->restitution = restitution;
Vec4V normal0 = V4LoadA(&contactBase0->normal.x);
Vec4V normal1 = V4LoadA(&contactBase1->normal.x);
Vec4V normal2 = V4LoadA(&contactBase2->normal.x);
Vec4V normal3 = V4LoadA(&contactBase3->normal.x);
Vec4V normalX, normalY, normalZ;
PX_TRANSPOSE_44_34(normal0, normal1, normal2, normal3, normalX, normalY, normalZ);
PX_ASSERT(ValidateVec4(normalX));
PX_ASSERT(ValidateVec4(normalY));
PX_ASSERT(ValidateVec4(normalZ));
header->normalX = normalX;
header->normalY = normalY;
header->normalZ = normalZ;
header->maxPenBias = maxPenBias;
const Vec4V norVel0 = V4MulAdd(normalZ, linVelT20, V4MulAdd(normalY, linVelT10, V4Mul(normalX, linVelT00)));
const Vec4V norVel1 = V4MulAdd(normalZ, linVelT21, V4MulAdd(normalY, linVelT11, V4Mul(normalX, linVelT01)));
const Vec4V relNorVel = V4Sub(norVel0, norVel1);
//For all correlation heads - need to pull this out I think
//OK, we have a counter for all our patches...
PxU32 finished = (PxU32(hasFinished0)) |
((PxU32(hasFinished1)) << 1) |
((PxU32(hasFinished2)) << 2) |
((PxU32(hasFinished3)) << 3);
CorrelationListIterator iter0(c, firstPatch0);
CorrelationListIterator iter1(c, firstPatch1);
CorrelationListIterator iter2(c, firstPatch2);
CorrelationListIterator iter3(c, firstPatch3);
//PxU32 contact0, contact1, contact2, contact3;
//PxU32 patch0, patch1, patch2, patch3;
if (!hasFinished0)
iter0.nextContact(patch0, contact0);
if (!hasFinished1)
iter1.nextContact(patch1, contact1);
if (!hasFinished2)
iter2.nextContact(patch2, contact2);
if (!hasFinished3)
iter3.nextContact(patch3, contact3);
PxU8* p = ptr;
PxU32 contactCount = 0;
PxU32 newFinished =
(PxU32(hasFinished0 || !iter0.hasNextContact())) |
((PxU32(hasFinished1 || !iter1.hasNextContact())) << 1) |
((PxU32(hasFinished2 || !iter2.hasNextContact())) << 2) |
((PxU32(hasFinished3 || !iter3.hasNextContact())) << 3);
while (finished != 0xf)
{
finished = newFinished;
++contactCount;
Ps::prefetchLine(p, 384);
Ps::prefetchLine(p, 512);
Ps::prefetchLine(p, 640);
SolverContactPointStepBlock* PX_RESTRICT solverContact = reinterpret_cast<SolverContactPointStepBlock*>(p);
p += constraintSize;
const Gu::ContactPoint& con0 = descs[0].contacts[c.contactPatches[patch0].start + contact0];
const Gu::ContactPoint& con1 = descs[1].contacts[c.contactPatches[patch1].start + contact1];
const Gu::ContactPoint& con2 = descs[2].contacts[c.contactPatches[patch2].start + contact2];
const Gu::ContactPoint& con3 = descs[3].contacts[c.contactPatches[patch3].start + contact3];
//Now we need to splice these 4 contacts into a single structure
{
Vec4V point0 = V4LoadA(&con0.point.x);
Vec4V point1 = V4LoadA(&con1.point.x);
Vec4V point2 = V4LoadA(&con2.point.x);
Vec4V point3 = V4LoadA(&con3.point.x);
Vec4V pointX, pointY, pointZ;
PX_TRANSPOSE_44_34(point0, point1, point2, point3, pointX, pointY, pointZ);
Vec4V cTargetVel0 = V4LoadA(&con0.targetVel.x);
Vec4V cTargetVel1 = V4LoadA(&con1.targetVel.x);
Vec4V cTargetVel2 = V4LoadA(&con2.targetVel.x);
Vec4V cTargetVel3 = V4LoadA(&con3.targetVel.x);
Vec4V cTargetVelX, cTargetVelY, cTargetVelZ;
PX_TRANSPOSE_44_34(cTargetVel0, cTargetVel1, cTargetVel2, cTargetVel3, cTargetVelX, cTargetVelY, cTargetVelZ);
const Vec4V separation = V4LoadXYZW(con0.separation, con1.separation, con2.separation, con3.separation);
const Vec4V cTargetNorVel = V4MulAdd(cTargetVelX, normalX, V4MulAdd(cTargetVelY, normalY, V4Mul(cTargetVelZ, normalZ)));
const Vec4V raX = V4Sub(pointX, bodyFrame0pX);
const Vec4V raY = V4Sub(pointY, bodyFrame0pY);
const Vec4V raZ = V4Sub(pointZ, bodyFrame0pZ);
const Vec4V rbX = V4Sub(pointX, bodyFrame1pX);
const Vec4V rbY = V4Sub(pointY, bodyFrame1pY);
const Vec4V rbZ = V4Sub(pointZ, bodyFrame1pZ);
/*raX = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raX)), zero, raX);
raY = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raY)), zero, raY);
raZ = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raZ)), zero, raZ);
rbX = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbX)), zero, rbX);
rbY = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbY)), zero, rbY);
rbZ = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbZ)), zero, rbZ);*/
PX_ASSERT(ValidateVec4(raX));
PX_ASSERT(ValidateVec4(raY));
PX_ASSERT(ValidateVec4(raZ));
PX_ASSERT(ValidateVec4(rbX));
PX_ASSERT(ValidateVec4(rbY));
PX_ASSERT(ValidateVec4(rbZ));
//raXn = cross(ra, normal) which = Vec3V( a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
Vec4V raXnX = V4NegMulSub(raZ, normalY, V4Mul(raY, normalZ));
Vec4V raXnY = V4NegMulSub(raX, normalZ, V4Mul(raZ, normalX));
Vec4V raXnZ = V4NegMulSub(raY, normalX, V4Mul(raX, normalY));
raXnX = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raXnX)), zero, raXnX);
raXnY = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raXnY)), zero, raXnY);
raXnZ = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raXnZ)), zero, raXnZ);
Vec4V delAngVel0X = V4Mul(invInertia0X0, raXnX);
Vec4V delAngVel0Y = V4Mul(invInertia0X1, raXnX);
Vec4V delAngVel0Z = V4Mul(invInertia0X2, raXnX);
delAngVel0X = V4MulAdd(invInertia0Y0, raXnY, delAngVel0X);
delAngVel0Y = V4MulAdd(invInertia0Y1, raXnY, delAngVel0Y);
delAngVel0Z = V4MulAdd(invInertia0Y2, raXnY, delAngVel0Z);
delAngVel0X = V4MulAdd(invInertia0Z0, raXnZ, delAngVel0X);
delAngVel0Y = V4MulAdd(invInertia0Z1, raXnZ, delAngVel0Y);
delAngVel0Z = V4MulAdd(invInertia0Z2, raXnZ, delAngVel0Z);
PX_ASSERT(ValidateVec4(delAngVel0X));
PX_ASSERT(ValidateVec4(delAngVel0Y));
PX_ASSERT(ValidateVec4(delAngVel0Z));
const Vec4V dotDelAngVel0 = V4MulAdd(delAngVel0X, delAngVel0X, V4MulAdd(delAngVel0Y, delAngVel0Y, V4Mul(delAngVel0Z, delAngVel0Z)));
const Vec4V relAngVel0 = V4MulAdd(raXnZ, angVelT20, V4MulAdd(raXnY, angVelT10, V4Mul(raXnX, angVelT00)));
Vec4V unitResponse = V4MulAdd(invMass0D0, angDom0, dotDelAngVel0);
Vec4V vrel0 = V4Add(norVel0, relAngVel0);
Vec4V vrel1 = norVel1;
Vec4V delAngVel1X = zero;
Vec4V delAngVel1Y = zero;
Vec4V delAngVel1Z = zero;
//The dynamic-only parts - need to if-statement these up. A branch here shouldn't cost us too much
if (isDynamic)
{
Vec4V rbXnX = V4NegMulSub(rbZ, normalY, V4Mul(rbY, normalZ));
Vec4V rbXnY = V4NegMulSub(rbX, normalZ, V4Mul(rbZ, normalX));
Vec4V rbXnZ = V4NegMulSub(rbY, normalX, V4Mul(rbX, normalY));
rbXnX = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbXnX)), zero, rbXnX);
rbXnY = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbXnY)), zero, rbXnY);
rbXnZ = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbXnZ)), zero, rbXnZ);
delAngVel1X = V4Mul(invInertia1X0, rbXnX);
delAngVel1Y = V4Mul(invInertia1X1, rbXnX);
delAngVel1Z = V4Mul(invInertia1X2, rbXnX);
delAngVel1X = V4MulAdd(invInertia1Y0, rbXnY, delAngVel1X);
delAngVel1Y = V4MulAdd(invInertia1Y1, rbXnY, delAngVel1Y);
delAngVel1Z = V4MulAdd(invInertia1Y2, rbXnY, delAngVel1Z);
delAngVel1X = V4MulAdd(invInertia1Z0, rbXnZ, delAngVel1X);
delAngVel1Y = V4MulAdd(invInertia1Z1, rbXnZ, delAngVel1Y);
delAngVel1Z = V4MulAdd(invInertia1Z2, rbXnZ, delAngVel1Z);
PX_ASSERT(ValidateVec4(delAngVel1X));
PX_ASSERT(ValidateVec4(delAngVel1Y));
PX_ASSERT(ValidateVec4(delAngVel1Z));
const Vec4V dotDelAngVel1 = V4MulAdd(delAngVel1X, delAngVel1X, V4MulAdd(delAngVel1Y, delAngVel1Y, V4Mul(delAngVel1Z, delAngVel1Z)));
const Vec4V relAngVel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4Mul(rbXnX, angVelT01)));
const Vec4V resp1 = V4MulAdd(dotDelAngVel1, angDom1, invMass1D1);
unitResponse = V4Add(unitResponse, resp1);
vrel1 = V4Add(vrel1, relAngVel1);
//These are for dynamic-only contacts.
}
else if (hasKinematic)
{
const Vec4V rbXnX = V4NegMulSub(rbZ, normalY, V4Mul(rbY, normalZ));
const Vec4V rbXnY = V4NegMulSub(rbX, normalZ, V4Mul(rbZ, normalX));
const Vec4V rbXnZ = V4NegMulSub(rbY, normalX, V4Mul(rbX, normalY));
const Vec4V relAngVel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4Mul(rbXnX, angVelT01)));
vrel1 = V4Add(vrel1, relAngVel1);
}
Vec4V vrel = V4Sub(vrel0, vrel1);
solverContact->rbXnI[0] = delAngVel1X;
solverContact->rbXnI[1] = delAngVel1Y;
solverContact->rbXnI[2] = delAngVel1Z;
const Vec4V velMultiplier = V4Sel(V4IsGrtr(unitResponse, zero), V4Recip(unitResponse), zero);
const Vec4V penetration = V4Sub(separation, restDistance);
//const Vec4V penInvDtPt8 = V4Max(maxPenBias, V4Scale(penetration, invDtp8));
//Vec4V scaledBias = invDtPt8;
const Vec4V penetrationInvDt = V4Scale(penetration, invTotalDt);
const BoolV isGreater2 = BAnd(BAnd(V4IsGrtr(zero, restitution), V4IsGrtr(bounceThreshold, vrel)),
V4IsGrtr(V4Neg(vrel), penetrationInvDt));
const Vec4V scaledBias = V4Neg(invDtp8);
Vec4V targetVelocity = V4NegMulSub(vrel0, kinematicScale0, V4MulAdd(vrel1, kinematicScale1, V4Sel(isGreater2, V4Mul(vrel, restitution), zero)));
//Vec4V biasedErr = V4Sel(isGreater2, targetVelocity, scaledBias);
//Vec4V biasedErr = V4Add(targetVelocity, scaledBias);
//biasedErr = V4NegMulSub(V4Sub(vrel, cTargetNorVel), velMultiplier, biasedErr);
//These values are present for static and dynamic contacts
solverContact->raXnI[0] = delAngVel0X;
solverContact->raXnI[1] = delAngVel0Y;
solverContact->raXnI[2] = delAngVel0Z;
solverContact->velMultiplier = velMultiplier;
solverContact->targetVelocity = V4Add(cTargetNorVel, targetVelocity);
solverContact->separation = penetration;
solverContact->biasCoefficient = scaledBias;
if (hasMaxImpulse)
{
maxImpulse[contactCount - 1] = V4Merge(FLoad(con0.maxImpulse), FLoad(con1.maxImpulse), FLoad(con2.maxImpulse),
FLoad(con3.maxImpulse));
}
}
if (!(finished & 0x1))
{
iter0.nextContact(patch0, contact0);
newFinished |= PxU32(!iter0.hasNextContact());
}
if (!(finished & 0x2))
{
iter1.nextContact(patch1, contact1);
newFinished |= (PxU32(!iter1.hasNextContact()) << 1);
}
if (!(finished & 0x4))
{
iter2.nextContact(patch2, contact2);
newFinished |= (PxU32(!iter2.hasNextContact()) << 2);
}
if (!(finished & 0x8))
{
iter3.nextContact(patch3, contact3);
newFinished |= (PxU32(!iter3.hasNextContact()) << 3);
}
}
ptr = p;
if (hasMaxImpulse)
{
ptr += sizeof(Vec4V) * totalContacts;
}
//OK...friction time :-)
Vec4V maxImpulseScale = V4One();
{
const FrictionPatch& frictionPatch0 = c.frictionPatches[frictionIndex0];
const FrictionPatch& frictionPatch1 = c.frictionPatches[frictionIndex1];
const FrictionPatch& frictionPatch2 = c.frictionPatches[frictionIndex2];
const FrictionPatch& frictionPatch3 = c.frictionPatches[frictionIndex3];
PxU32 anchorCount0 = frictionPatch0.anchorCount;
PxU32 anchorCount1 = frictionPatch1.anchorCount;
PxU32 anchorCount2 = frictionPatch2.anchorCount;
PxU32 anchorCount3 = frictionPatch3.anchorCount;
PxU32 clampedAnchorCount0 = hasFinished0 || (contactBase0->materialFlags & PxMaterialFlag::eDISABLE_FRICTION) ? 0 : anchorCount0;
PxU32 clampedAnchorCount1 = hasFinished1 || (contactBase1->materialFlags & PxMaterialFlag::eDISABLE_FRICTION) ? 0 : anchorCount1;
PxU32 clampedAnchorCount2 = hasFinished2 || (contactBase2->materialFlags & PxMaterialFlag::eDISABLE_FRICTION) ? 0 : anchorCount2;
PxU32 clampedAnchorCount3 = hasFinished3 || (contactBase3->materialFlags & PxMaterialFlag::eDISABLE_FRICTION) ? 0 : anchorCount3;
const PxU32 maxAnchorCount = PxMax(clampedAnchorCount0, PxMax(clampedAnchorCount1, PxMax(clampedAnchorCount2, clampedAnchorCount3)));
PX_ALIGN(16, PxReal staticFriction[4]);
PX_ALIGN(16, PxReal dynamicFriction[4]);
//for (PxU32 f = 0; f < 4; ++f)
{
PxReal coeff0 = (contactBase0->materialFlags & PxMaterialFlag::eIMPROVED_PATCH_FRICTION && clampedAnchorCount0 == 2) ? 0.5f : 1.f;
PxReal coeff1 = (contactBase1->materialFlags & PxMaterialFlag::eIMPROVED_PATCH_FRICTION && clampedAnchorCount1 == 2) ? 0.5f : 1.f;
PxReal coeff2 = (contactBase2->materialFlags & PxMaterialFlag::eIMPROVED_PATCH_FRICTION && clampedAnchorCount2 == 2) ? 0.5f : 1.f;
PxReal coeff3 = (contactBase3->materialFlags & PxMaterialFlag::eIMPROVED_PATCH_FRICTION && clampedAnchorCount3 == 2) ? 0.5f : 1.f;
staticFriction[0] = contactBase0->staticFriction * coeff0;
dynamicFriction[0] = contactBase0->dynamicFriction * coeff0;
staticFriction[1] = contactBase1->staticFriction * coeff1;
dynamicFriction[1] = contactBase1->dynamicFriction * coeff1;
staticFriction[2] = contactBase2->staticFriction * coeff2;
dynamicFriction[2] = contactBase2->dynamicFriction * coeff2;
staticFriction[3] = contactBase3->staticFriction * coeff3;
dynamicFriction[3] = contactBase3->dynamicFriction * coeff3;
}
PX_ASSERT(totalContacts == contactCount);
header->numFrictionConstr = Ps::to8(maxAnchorCount * 2);
header->numFrictionConstrs[0] = Ps::to8(clampedAnchorCount0 * 2);
header->numFrictionConstrs[1] = Ps::to8(clampedAnchorCount1 * 2);
header->numFrictionConstrs[2] = Ps::to8(clampedAnchorCount2 * 2);
header->numFrictionConstrs[3] = Ps::to8(clampedAnchorCount3 * 2);
//KS - TODO - extend this if needed
header->type = Ps::to8(DY_SC_TYPE_BLOCK_RB_CONTACT);
if (maxAnchorCount)
{
const BoolV cond = V4IsGrtr(orthoThreshold, V4Abs(normalX));
const Vec4V t0FallbackX = V4Sel(cond, zero, V4Neg(normalY));
const Vec4V t0FallbackY = V4Sel(cond, V4Neg(normalZ), normalX);
const Vec4V t0FallbackZ = V4Sel(cond, normalY, zero);
//const Vec4V dotNormalVrel = V4MulAdd(normalZ, vrelZ, V4MulAdd(normalY, vrelY, V4Mul(normalX, vrelX)));
const Vec4V vrelSubNorVelX = V4NegMulSub(normalX, relNorVel, vrelX);
const Vec4V vrelSubNorVelY = V4NegMulSub(normalY, relNorVel, vrelY);
const Vec4V vrelSubNorVelZ = V4NegMulSub(normalZ, relNorVel, vrelZ);
const Vec4V lenSqvrelSubNorVelZ = V4MulAdd(vrelSubNorVelX, vrelSubNorVelX, V4MulAdd(vrelSubNorVelY, vrelSubNorVelY, V4Mul(vrelSubNorVelZ, vrelSubNorVelZ)));
const BoolV bcon2 = V4IsGrtr(lenSqvrelSubNorVelZ, p1);
Vec4V t0X = V4Sel(bcon2, vrelSubNorVelX, t0FallbackX);
Vec4V t0Y = V4Sel(bcon2, vrelSubNorVelY, t0FallbackY);
Vec4V t0Z = V4Sel(bcon2, vrelSubNorVelZ, t0FallbackZ);
//Now normalize this...
const Vec4V recipLen = V4Rsqrt(V4MulAdd(t0Z, t0Z, V4MulAdd(t0Y, t0Y, V4Mul(t0X, t0X))));
t0X = V4Mul(t0X, recipLen);
t0Y = V4Mul(t0Y, recipLen);
t0Z = V4Mul(t0Z, recipLen);
Vec4V t1X = V4NegMulSub(normalZ, t0Y, V4Mul(normalY, t0Z));
Vec4V t1Y = V4NegMulSub(normalX, t0Z, V4Mul(normalZ, t0X));
Vec4V t1Z = V4NegMulSub(normalY, t0X, V4Mul(normalX, t0Y));
PX_ASSERT((uintptr_t(descs[0].frictionPtr) & 0xF) == 0);
PX_ASSERT((uintptr_t(descs[1].frictionPtr) & 0xF) == 0);
PX_ASSERT((uintptr_t(descs[2].frictionPtr) & 0xF) == 0);
PX_ASSERT((uintptr_t(descs[3].frictionPtr) & 0xF) == 0);
PxU8* PX_RESTRICT writeback0 = descs[0].frictionPtr + frictionPatchWritebackAddrIndex0 * sizeof(FrictionPatch);
PxU8* PX_RESTRICT writeback1 = descs[1].frictionPtr + frictionPatchWritebackAddrIndex1 * sizeof(FrictionPatch);
PxU8* PX_RESTRICT writeback2 = descs[2].frictionPtr + frictionPatchWritebackAddrIndex2 * sizeof(FrictionPatch);
PxU8* PX_RESTRICT writeback3 = descs[3].frictionPtr + frictionPatchWritebackAddrIndex3 * sizeof(FrictionPatch);
PxU32 index0 = 0, index1 = 0, index2 = 0, index3 = 0;
header->broken = bFalse;
header->frictionBrokenWritebackByte[0] = writeback0;
header->frictionBrokenWritebackByte[1] = writeback1;
header->frictionBrokenWritebackByte[2] = writeback2;
header->frictionBrokenWritebackByte[3] = writeback3;
/*header->frictionNormal[0][0] = t0X;
header->frictionNormal[0][1] = t0Y;
header->frictionNormal[0][2] = t0Z;
header->frictionNormal[1][0] = t1X;
header->frictionNormal[1][1] = t1Y;
header->frictionNormal[1][2] = t1Z;*/
Vec4V* PX_RESTRICT appliedForces = reinterpret_cast<Vec4V*>(ptr);
ptr += sizeof(Vec4V)*header->numFrictionConstr;
PxMemZero(appliedForces, sizeof(Vec4V) * header->numFrictionConstr);
for (PxU32 j = 0; j < maxAnchorCount; j++)
{
Ps::prefetchLine(ptr, 384);
Ps::prefetchLine(ptr, 512);
Ps::prefetchLine(ptr, 640);
SolverContactFrictionStepBlock* PX_RESTRICT f0 = reinterpret_cast<SolverContactFrictionStepBlock*>(ptr);
ptr += frictionSize;
SolverContactFrictionStepBlock* PX_RESTRICT f1 = reinterpret_cast<SolverContactFrictionStepBlock*>(ptr);
ptr += frictionSize;
index0 = j < clampedAnchorCount0 ? j : index0;
index1 = j < clampedAnchorCount1 ? j : index1;
index2 = j < clampedAnchorCount2 ? j : index2;
index3 = j < clampedAnchorCount3 ? j : index3;
if (j >= clampedAnchorCount0)
maxImpulseScale = V4SetX(maxImpulseScale, fZero);
if (j >= clampedAnchorCount1)
maxImpulseScale = V4SetY(maxImpulseScale, fZero);
if (j >= clampedAnchorCount2)
maxImpulseScale = V4SetZ(maxImpulseScale, fZero);
if (j >= clampedAnchorCount3)
maxImpulseScale = V4SetW(maxImpulseScale, fZero);
t0X = V4Mul(maxImpulseScale, t0X);
t0Y = V4Mul(maxImpulseScale, t0Y);
t0Z = V4Mul(maxImpulseScale, t0Z);
t1X = V4Mul(maxImpulseScale, t1X);
t1Y = V4Mul(maxImpulseScale, t1Y);
t1Z = V4Mul(maxImpulseScale, t1Z);
Vec3V body0Anchor0 = V3LoadU(frictionPatch0.body0Anchors[index0]);
Vec3V body0Anchor1 = V3LoadU(frictionPatch1.body0Anchors[index1]);
Vec3V body0Anchor2 = V3LoadU(frictionPatch2.body0Anchors[index2]);
Vec3V body0Anchor3 = V3LoadU(frictionPatch3.body0Anchors[index3]);
Vec4V ra0 = Vec4V_From_Vec3V(QuatRotate(bodyFrame00q, body0Anchor0));
Vec4V ra1 = Vec4V_From_Vec3V(QuatRotate(bodyFrame01q, body0Anchor1));
Vec4V ra2 = Vec4V_From_Vec3V(QuatRotate(bodyFrame02q, body0Anchor2));
Vec4V ra3 = Vec4V_From_Vec3V(QuatRotate(bodyFrame03q, body0Anchor3));
Vec4V raX, raY, raZ;
PX_TRANSPOSE_44_34(ra0, ra1, ra2, ra3, raX, raY, raZ);
/*raX = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raX)), zero, raX);
raY = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raY)), zero, raY);
raZ = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raZ)), zero, raZ);*/
const Vec4V raWorldX = V4Add(raX, bodyFrame0pX);
const Vec4V raWorldY = V4Add(raY, bodyFrame0pY);
const Vec4V raWorldZ = V4Add(raZ, bodyFrame0pZ);
Vec3V body1Anchor0 = V3LoadU(frictionPatch0.body1Anchors[index0]);
Vec3V body1Anchor1 = V3LoadU(frictionPatch1.body1Anchors[index1]);
Vec3V body1Anchor2 = V3LoadU(frictionPatch2.body1Anchors[index2]);
Vec3V body1Anchor3 = V3LoadU(frictionPatch3.body1Anchors[index3]);
Vec4V rb0 = Vec4V_From_Vec3V(QuatRotate(bodyFrame10q, body1Anchor0));
Vec4V rb1 = Vec4V_From_Vec3V(QuatRotate(bodyFrame11q, body1Anchor1));
Vec4V rb2 = Vec4V_From_Vec3V(QuatRotate(bodyFrame12q, body1Anchor2));
Vec4V rb3 = Vec4V_From_Vec3V(QuatRotate(bodyFrame13q, body1Anchor3));
Vec4V rbX, rbY, rbZ;
PX_TRANSPOSE_44_34(rb0, rb1, rb2, rb3, rbX, rbY, rbZ);
/*rbX = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbX)), zero, rbX);
rbY = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbY)), zero, rbY);
rbZ = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbZ)), zero, rbZ);*/
const Vec4V rbWorldX = V4Add(rbX, bodyFrame1pX);
const Vec4V rbWorldY = V4Add(rbY, bodyFrame1pY);
const Vec4V rbWorldZ = V4Add(rbZ, bodyFrame1pZ);
Vec4V errorX = V4Sub(raWorldX, rbWorldX);
Vec4V errorY = V4Sub(raWorldY, rbWorldY);
Vec4V errorZ = V4Sub(raWorldZ, rbWorldZ);
/*errorX = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(errorX)), zero, errorX);
errorY = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(errorY)), zero, errorY);
errorZ = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(errorZ)), zero, errorZ);*/
//KS - todo - get this working with per-point friction
PxU32 contactIndex0 = c.contactID[frictionIndex0][index0];
PxU32 contactIndex1 = c.contactID[frictionIndex1][index1];
PxU32 contactIndex2 = c.contactID[frictionIndex2][index2];
PxU32 contactIndex3 = c.contactID[frictionIndex3][index3];
//Ensure that the ocntact indices are valid
PX_ASSERT(contactIndex0 == 0xffff || contactIndex0 < descs[0].numContacts);
PX_ASSERT(contactIndex1 == 0xffff || contactIndex1 < descs[1].numContacts);
PX_ASSERT(contactIndex2 == 0xffff || contactIndex2 < descs[2].numContacts);
PX_ASSERT(contactIndex3 == 0xffff || contactIndex3 < descs[3].numContacts);
Vec4V targetVel0 = V4LoadA(contactIndex0 == 0xFFFF ? &contactBase0->targetVel.x : &descs[0].contacts[contactIndex0].targetVel.x);
Vec4V targetVel1 = V4LoadA(contactIndex1 == 0xFFFF ? &contactBase0->targetVel.x : &descs[1].contacts[contactIndex1].targetVel.x);
Vec4V targetVel2 = V4LoadA(contactIndex2 == 0xFFFF ? &contactBase0->targetVel.x : &descs[2].contacts[contactIndex2].targetVel.x);
Vec4V targetVel3 = V4LoadA(contactIndex3 == 0xFFFF ? &contactBase0->targetVel.x : &descs[3].contacts[contactIndex3].targetVel.x);
Vec4V targetVelX, targetVelY, targetVelZ;
PX_TRANSPOSE_44_34(targetVel0, targetVel1, targetVel2, targetVel3, targetVelX, targetVelY, targetVelZ);
{
Vec4V raXnX = V4NegMulSub(raZ, t0Y, V4Mul(raY, t0Z));
Vec4V raXnY = V4NegMulSub(raX, t0Z, V4Mul(raZ, t0X));
Vec4V raXnZ = V4NegMulSub(raY, t0X, V4Mul(raX, t0Y));
raXnX = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raXnX)), zero, raXnX);
raXnY = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raXnY)), zero, raXnY);
raXnZ = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raXnZ)), zero, raXnZ);
Vec4V delAngVel0X = V4Mul(invInertia0X0, raXnX);
Vec4V delAngVel0Y = V4Mul(invInertia0X1, raXnX);
Vec4V delAngVel0Z = V4Mul(invInertia0X2, raXnX);
delAngVel0X = V4MulAdd(invInertia0Y0, raXnY, delAngVel0X);
delAngVel0Y = V4MulAdd(invInertia0Y1, raXnY, delAngVel0Y);
delAngVel0Z = V4MulAdd(invInertia0Y2, raXnY, delAngVel0Z);
delAngVel0X = V4MulAdd(invInertia0Z0, raXnZ, delAngVel0X);
delAngVel0Y = V4MulAdd(invInertia0Z1, raXnZ, delAngVel0Y);
delAngVel0Z = V4MulAdd(invInertia0Z2, raXnZ, delAngVel0Z);
const Vec4V dotDelAngVel0 = V4MulAdd(delAngVel0Z, delAngVel0Z, V4MulAdd(delAngVel0Y, delAngVel0Y, V4Mul(delAngVel0X, delAngVel0X)));
Vec4V resp = V4MulAdd(dotDelAngVel0, angDom0, invMass0D0);
const Vec4V tVel0 = V4MulAdd(t0Z, linVelT20, V4MulAdd(t0Y, linVelT10, V4Mul(t0X, linVelT00)));
Vec4V vrel0 = V4MulAdd(raXnZ, angVelT20, V4MulAdd(raXnY, angVelT10, V4MulAdd(raXnX, angVelT00, tVel0)));
Vec4V delAngVel1X = zero;
Vec4V delAngVel1Y = zero;
Vec4V delAngVel1Z = zero;
Vec4V vrel1 = zero;
if (isDynamic)
{
Vec4V rbXnX = V4NegMulSub(rbZ, t0Y, V4Mul(rbY, t0Z));
Vec4V rbXnY = V4NegMulSub(rbX, t0Z, V4Mul(rbZ, t0X));
Vec4V rbXnZ = V4NegMulSub(rbY, t0X, V4Mul(rbX, t0Y));
rbXnX = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbXnX)), zero, rbXnX);
rbXnY = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbXnY)), zero, rbXnY);
rbXnZ = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbXnZ)), zero, rbXnZ);
delAngVel1X = V4Mul(invInertia1X0, rbXnX);
delAngVel1Y = V4Mul(invInertia1X1, rbXnX);
delAngVel1Z = V4Mul(invInertia1X2, rbXnX);
delAngVel1X = V4MulAdd(invInertia1Y0, rbXnY, delAngVel1X);
delAngVel1Y = V4MulAdd(invInertia1Y1, rbXnY, delAngVel1Y);
delAngVel1Z = V4MulAdd(invInertia1Y2, rbXnY, delAngVel1Z);
delAngVel1X = V4MulAdd(invInertia1Z0, rbXnZ, delAngVel1X);
delAngVel1Y = V4MulAdd(invInertia1Z1, rbXnZ, delAngVel1Y);
delAngVel1Z = V4MulAdd(invInertia1Z2, rbXnZ, delAngVel1Z);
const Vec4V dotDelAngVel1 = V4MulAdd(delAngVel1Z, delAngVel1Z, V4MulAdd(delAngVel1Y, delAngVel1Y, V4Mul(delAngVel1X, delAngVel1X)));
const Vec4V resp1 = V4MulAdd(dotDelAngVel1, angDom1, invMass1D1);
resp = V4Add(resp, resp1);
const Vec4V tVel1 = V4MulAdd(t0Z, linVelT21, V4MulAdd(t0Y, linVelT11, V4Mul(t0X, linVelT01)));
vrel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4MulAdd(rbXnX, angVelT01, tVel1)));
}
else if (hasKinematic)
{
const Vec4V rbXnX = V4NegMulSub(rbZ, t0Y, V4Mul(rbY, t0Z));
const Vec4V rbXnY = V4NegMulSub(rbX, t0Z, V4Mul(rbZ, t0X));
const Vec4V rbXnZ = V4NegMulSub(rbY, t0X, V4Mul(rbX, t0Y));
const Vec4V tVel1 = V4MulAdd(t0Z, linVelT21, V4MulAdd(t0Y, linVelT11, V4Mul(t0X, linVelT01)));
vrel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4MulAdd(rbXnX, angVelT01, tVel1)));
}
f0->rbXnI[0] = delAngVel1X;
f0->rbXnI[1] = delAngVel1Y;
f0->rbXnI[2] = delAngVel1Z;
const Vec4V velMultiplier = V4Mul(maxImpulseScale, V4Sel(V4IsGrtr(resp, zero), V4Div(p84, resp), zero));
Vec4V error = V4MulAdd(t0Z, errorZ, V4MulAdd(t0Y, errorY, V4Mul(t0X, errorX)));
Vec4V targetVel = V4NegMulSub(vrel0, kinematicScale0, V4MulAdd(vrel1, kinematicScale1, V4MulAdd(t0Z, targetVelZ, V4MulAdd(t0Y, targetVelY, V4Mul(t0X, targetVelX)))));
f0->normal[0] = t0X;
f0->normal[1] = t0Y;
f0->normal[2] = t0Z;
f0->raXnI[0] = delAngVel0X;
f0->raXnI[1] = delAngVel0Y;
f0->raXnI[2] = delAngVel0Z;
f0->error = error;
f0->velMultiplier = velMultiplier;
f0->biasCoefficient = V4Splat(frictionBiasScale);
f0->targetVel = targetVel;
}
{
Vec4V raXnX = V4NegMulSub(raZ, t1Y, V4Mul(raY, t1Z));
Vec4V raXnY = V4NegMulSub(raX, t1Z, V4Mul(raZ, t1X));
Vec4V raXnZ = V4NegMulSub(raY, t1X, V4Mul(raX, t1Y));
raXnX = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raXnX)), zero, raXnX);
raXnY = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raXnY)), zero, raXnY);
raXnZ = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(raXnZ)), zero, raXnZ);
Vec4V delAngVel0X = V4Mul(invInertia0X0, raXnX);
Vec4V delAngVel0Y = V4Mul(invInertia0X1, raXnX);
Vec4V delAngVel0Z = V4Mul(invInertia0X2, raXnX);
delAngVel0X = V4MulAdd(invInertia0Y0, raXnY, delAngVel0X);
delAngVel0Y = V4MulAdd(invInertia0Y1, raXnY, delAngVel0Y);
delAngVel0Z = V4MulAdd(invInertia0Y2, raXnY, delAngVel0Z);
delAngVel0X = V4MulAdd(invInertia0Z0, raXnZ, delAngVel0X);
delAngVel0Y = V4MulAdd(invInertia0Z1, raXnZ, delAngVel0Y);
delAngVel0Z = V4MulAdd(invInertia0Z2, raXnZ, delAngVel0Z);
const Vec4V dotDelAngVel0 = V4MulAdd(delAngVel0Z, delAngVel0Z, V4MulAdd(delAngVel0Y, delAngVel0Y, V4Mul(delAngVel0X, delAngVel0X)));
Vec4V resp = V4MulAdd(dotDelAngVel0, angDom0, invMass0D0);
const Vec4V tVel0 = V4MulAdd(t1Z, linVelT20, V4MulAdd(t1Y, linVelT10, V4Mul(t1X, linVelT00)));
Vec4V vrel0 = V4MulAdd(raXnZ, angVelT20, V4MulAdd(raXnY, angVelT10, V4MulAdd(raXnX, angVelT00, tVel0)));
Vec4V delAngVel1X = zero;
Vec4V delAngVel1Y = zero;
Vec4V delAngVel1Z = zero;
Vec4V vrel1 = zero;
if (isDynamic)
{
Vec4V rbXnX = V4NegMulSub(rbZ, t1Y, V4Mul(rbY, t1Z));
Vec4V rbXnY = V4NegMulSub(rbX, t1Z, V4Mul(rbZ, t1X));
Vec4V rbXnZ = V4NegMulSub(rbY, t1X, V4Mul(rbX, t1Y));
rbXnX = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbXnX)), zero, rbXnX);
rbXnY = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbXnY)), zero, rbXnY);
rbXnZ = V4Sel(V4IsGrtr(solverOffsetSlop, V4Abs(rbXnZ)), zero, rbXnZ);
delAngVel1X = V4Mul(invInertia1X0, rbXnX);
delAngVel1Y = V4Mul(invInertia1X1, rbXnX);
delAngVel1Z = V4Mul(invInertia1X2, rbXnX);
delAngVel1X = V4MulAdd(invInertia1Y0, rbXnY, delAngVel1X);
delAngVel1Y = V4MulAdd(invInertia1Y1, rbXnY, delAngVel1Y);
delAngVel1Z = V4MulAdd(invInertia1Y2, rbXnY, delAngVel1Z);
delAngVel1X = V4MulAdd(invInertia1Z0, rbXnZ, delAngVel1X);
delAngVel1Y = V4MulAdd(invInertia1Z1, rbXnZ, delAngVel1Y);
delAngVel1Z = V4MulAdd(invInertia1Z2, rbXnZ, delAngVel1Z);
const Vec4V dotDelAngVel1 = V4MulAdd(delAngVel1Z, delAngVel1Z, V4MulAdd(delAngVel1Y, delAngVel1Y, V4Mul(delAngVel1X, delAngVel1X)));
const Vec4V resp1 = V4MulAdd(dotDelAngVel1, angDom1, invMass1D1);
resp = V4Add(resp, resp1);
const Vec4V tVel1 = V4MulAdd(t1Z, linVelT21, V4MulAdd(t1Y, linVelT11, V4Mul(t1X, linVelT01)));
vrel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4MulAdd(rbXnX, angVelT01, tVel1)));
}
else if (hasKinematic)
{
const Vec4V rbXnX = V4NegMulSub(rbZ, t1Y, V4Mul(rbY, t1Z));
const Vec4V rbXnY = V4NegMulSub(rbX, t1Z, V4Mul(rbZ, t1X));
const Vec4V rbXnZ = V4NegMulSub(rbY, t1X, V4Mul(rbX, t1Y));
const Vec4V tVel1 = V4MulAdd(t1Z, linVelT21, V4MulAdd(t1Y, linVelT11, V4Mul(t1X, linVelT01)));
vrel1 = V4MulAdd(rbXnZ, angVelT21, V4MulAdd(rbXnY, angVelT11, V4MulAdd(rbXnX, angVelT01, tVel1)));
}
f1->rbXnI[0] = delAngVel1X;
f1->rbXnI[1] = delAngVel1Y;
f1->rbXnI[2] = delAngVel1Z;
const Vec4V velMultiplier = V4Mul(maxImpulseScale, V4Sel(V4IsGrtr(resp, zero), V4Div(p84, resp), zero));
Vec4V error = V4MulAdd(t1Z, errorZ, V4MulAdd(t1Y, errorY, V4Mul(t1X, errorX)));
Vec4V targetVel = V4NegMulSub(vrel0, kinematicScale0, V4MulAdd(vrel1, kinematicScale1, V4MulAdd(t1Z, targetVelZ, V4MulAdd(t1Y, targetVelY, V4Mul(t1X, targetVelX)))));
f1->normal[0] = t1X;
f1->normal[1] = t1Y;
f1->normal[2] = t1Z;
f1->raXnI[0] = delAngVel0X;
f1->raXnI[1] = delAngVel0Y;
f1->raXnI[2] = delAngVel0Z;
f1->error = error;
f1->velMultiplier = velMultiplier;
f1->targetVel = targetVel;
f1->biasCoefficient = V4Splat(frictionBiasScale);
}
}
header->dynamicFriction = V4LoadA(dynamicFriction);
header->staticFriction = V4LoadA(staticFriction);
frictionPatchWritebackAddrIndex0++;
frictionPatchWritebackAddrIndex1++;
frictionPatchWritebackAddrIndex2++;
frictionPatchWritebackAddrIndex3++;
}
}
}
}
PX_FORCE_INLINE void computeBlockStreamFrictionByteSizes(const CorrelationBuffer& c,
PxU32& _frictionPatchByteSize, PxU32& _numFrictionPatches,
PxU32 frictionPatchStartIndex, PxU32 frictionPatchEndIndex)
{
// PT: use local vars to remove LHS
PxU32 numFrictionPatches = 0;
for (PxU32 i = frictionPatchStartIndex; i < frictionPatchEndIndex; i++)
{
//Friction patches.
if (c.correlationListHeads[i] != CorrelationBuffer::LIST_END)
numFrictionPatches++;
}
PxU32 frictionPatchByteSize = numFrictionPatches * sizeof(FrictionPatch);
_numFrictionPatches = numFrictionPatches;
//16-byte alignment.
_frictionPatchByteSize = ((frictionPatchByteSize + 0x0f) & ~0x0f);
PX_ASSERT(0 == (_frictionPatchByteSize & 0x0f));
}
static bool reserveFrictionBlockStreams(const CorrelationBuffer& c, PxConstraintAllocator& constraintAllocator, PxU32 frictionPatchStartIndex, PxU32 frictionPatchEndIndex,
FrictionPatch*& _frictionPatches,
PxU32& numFrictionPatches)
{
//From frictionPatchStream we just need to reserve a single buffer.
PxU32 frictionPatchByteSize = 0;
//Compute the sizes of all the buffers.
computeBlockStreamFrictionByteSizes(c, frictionPatchByteSize, numFrictionPatches, frictionPatchStartIndex, frictionPatchEndIndex);
FrictionPatch* frictionPatches = NULL;
//If the constraint block reservation didn't fail then reserve the friction buffer too.
if (frictionPatchByteSize > 0)
{
frictionPatches = reinterpret_cast<FrictionPatch*>(constraintAllocator.reserveFrictionData(frictionPatchByteSize));
if (0 == frictionPatches || (reinterpret_cast<FrictionPatch*>(-1)) == frictionPatches)
{
if (0 == frictionPatches)
{
PX_WARN_ONCE(
"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
"Either accept dropped contacts or increase buffer size allocated for narrow phase by increasing PxSceneDesc::maxNbContactDataBlocks.");
}
else
{
PX_WARN_ONCE(
"Attempting to allocate more than 16K of friction data for a single contact pair in constraint prep. "
"Either accept dropped contacts or simplify collision geometry.");
frictionPatches = NULL;
}
}
}
_frictionPatches = frictionPatches;
//Return true if neither of the two block reservations failed.
return (0 == frictionPatchByteSize || frictionPatches);
}
//The persistent friction patch correlation/allocation will already have happenned as this is per-pair.
//This function just computes the size of the combined solve data.
void computeBlockStreamByteSizes4(PxTGSSolverContactDesc* descs,
PxU32& _solverConstraintByteSize, PxU32* _axisConstraintCount,
const CorrelationBuffer& c)
{
PX_ASSERT(0 == _solverConstraintByteSize);
PxU32 maxPatches = 0;
PxU32 maxFrictionPatches = 0;
PxU32 maxContactCount[CorrelationBuffer::MAX_FRICTION_PATCHES];
PxU32 maxFrictionCount[CorrelationBuffer::MAX_FRICTION_PATCHES];
PxMemZero(maxContactCount, sizeof(maxContactCount));
PxMemZero(maxFrictionCount, sizeof(maxFrictionCount));
bool hasMaxImpulse = false;
for (PxU32 a = 0; a < 4; ++a)
{
PxU32 axisConstraintCount = 0;
hasMaxImpulse = hasMaxImpulse || descs[a].hasMaxImpulse;
for (PxU32 i = 0; i < descs[a].numFrictionPatches; i++)
{
PxU32 ind = i + descs[a].startFrictionPatchIndex;
const FrictionPatch& frictionPatch = c.frictionPatches[ind];
const bool haveFriction = (frictionPatch.materialFlags & PxMaterialFlag::eDISABLE_FRICTION) == 0
&& frictionPatch.anchorCount != 0;
//Solver constraint data.
if (c.frictionPatchContactCounts[ind] != 0)
{
maxContactCount[i] = PxMax(c.frictionPatchContactCounts[ind], maxContactCount[i]);
axisConstraintCount += c.frictionPatchContactCounts[ind];
if (haveFriction)
{
const PxU32 fricCount = PxU32(c.frictionPatches[ind].anchorCount) * 2;
maxFrictionCount[i] = PxMax(fricCount, maxFrictionCount[i]);
axisConstraintCount += fricCount;
}
}
}
maxPatches = PxMax(descs[a].numFrictionPatches, maxPatches);
_axisConstraintCount[a] = axisConstraintCount;
}
for (PxU32 a = 0; a < maxPatches; ++a)
{
if (maxFrictionCount[a] > 0)
maxFrictionPatches++;
}
PxU32 totalContacts = 0, totalFriction = 0;
for (PxU32 a = 0; a < maxPatches; ++a)
{
totalContacts += maxContactCount[a];
totalFriction += maxFrictionCount[a];
}
//OK, we have a given number of friction patches, contact points and friction constraints so we can calculate how much memory we need
//Body 2 is considered static if it is either *not dynamic* or *kinematic*
/*bool hasDynamicBody = false;
for (PxU32 a = 0; a < 4; ++a)
{
hasDynamicBody = hasDynamicBody || ((descs[a].bodyState1 == PxSolverContactDesc::eDYNAMIC_BODY));
}
const bool isStatic = !hasDynamicBody;*/
const PxU32 headerSize = sizeof(SolverContactHeaderStepBlock) * maxPatches;
//PxU32 constraintSize = isStatic ? (sizeof(SolverContactBatchPointBase4) * totalContacts) + (sizeof(SolverContactFrictionBase4) * totalFriction) :
// (sizeof(SolverContactBatchPointDynamic4) * totalContacts) + (sizeof(SolverContactFrictionDynamic4) * totalFriction);
PxU32 constraintSize = (sizeof(SolverContactPointStepBlock) * totalContacts) + (sizeof(SolverContactFrictionStepBlock) * totalFriction);
//Space for the appliedForce buffer
constraintSize += sizeof(Vec4V)*(totalContacts + totalFriction);
//If we have max impulse, reserve a buffer for it
if (hasMaxImpulse)
constraintSize += sizeof(Ps::aos::Vec4V) * totalContacts;
_solverConstraintByteSize = ((constraintSize + headerSize + 0x0f) & ~0x0f);
PX_ASSERT(0 == (_solverConstraintByteSize & 0x0f));
}
static SolverConstraintPrepState::Enum reserveBlockStreams4(PxTGSSolverContactDesc* descs, Dy::CorrelationBuffer& c,
PxU8*& solverConstraint, PxU32* axisConstraintCount,
PxU32& solverConstraintByteSize,
PxConstraintAllocator& constraintAllocator)
{
PX_ASSERT(NULL == solverConstraint);
PX_ASSERT(0 == solverConstraintByteSize);
//Compute the sizes of all the buffers.
computeBlockStreamByteSizes4(descs,
solverConstraintByteSize, axisConstraintCount,
c);
//Reserve the buffers.
//First reserve the accumulated buffer size for the constraint block.
PxU8* constraintBlock = NULL;
const PxU32 constraintBlockByteSize = solverConstraintByteSize;
if (constraintBlockByteSize > 0)
{
if ((constraintBlockByteSize + 16u) > 16384)
return SolverConstraintPrepState::eUNBATCHABLE;
constraintBlock = constraintAllocator.reserveConstraintData(constraintBlockByteSize + 16u);
if (0 == constraintBlock || (reinterpret_cast<PxU8*>(-1)) == constraintBlock)
{
if (0 == constraintBlock)
{
PX_WARN_ONCE(
"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
"Either accept dropped contacts or increase buffer size allocated for narrow phase by increasing PxSceneDesc::maxNbContactDataBlocks.");
}
else
{
PX_WARN_ONCE(
"Attempting to allocate more than 16K of contact data for a single contact pair in constraint prep. "
"Either accept dropped contacts or simplify collision geometry.");
constraintBlock = NULL;
}
}
}
//Patch up the individual ptrs to the buffer returned by the constraint block reservation (assuming the reservation didn't fail).
if (0 == constraintBlockByteSize || constraintBlock)
{
if (solverConstraintByteSize)
{
solverConstraint = constraintBlock;
PX_ASSERT(0 == (uintptr_t(solverConstraint) & 0x0f));
}
}
return ((0 == constraintBlockByteSize || constraintBlock)) ? SolverConstraintPrepState::eSUCCESS : SolverConstraintPrepState::eOUT_OF_MEMORY;
}
SolverConstraintPrepState::Enum createFinalizeSolverContacts4Step(
Dy::CorrelationBuffer& c,
PxTGSSolverContactDesc* blockDescs,
const PxReal invDtF32,
const PxReal invTotalDtF32,
PxReal bounceThresholdF32,
PxReal frictionOffsetThreshold,
PxReal correlationDistance,
PxReal solverOffsetSlop,
PxConstraintAllocator& constraintAllocator)
{
PX_ALIGN(16, PxReal invMassScale0[4]);
PX_ALIGN(16, PxReal invMassScale1[4]);
PX_ALIGN(16, PxReal invInertiaScale0[4]);
PX_ALIGN(16, PxReal invInertiaScale1[4]);
c.frictionPatchCount = 0;
c.contactPatchCount = 0;
for (PxU32 a = 0; a < 4; ++a)
{
PxTGSSolverContactDesc& blockDesc = blockDescs[a];
invMassScale0[a] = blockDesc.invMassScales.linear0;
invMassScale1[a] = blockDesc.invMassScales.linear1;
invInertiaScale0[a] = blockDesc.invMassScales.angular0;
invInertiaScale1[a] = blockDesc.invMassScales.angular1;
blockDesc.startFrictionPatchIndex = c.frictionPatchCount;
if (!(blockDesc.disableStrongFriction))
{
bool valid = getFrictionPatches(c, blockDesc.frictionPtr, blockDesc.frictionCount,
blockDesc.bodyFrame0, blockDesc.bodyFrame1, correlationDistance);
if (!valid)
return SolverConstraintPrepState::eUNBATCHABLE;
}
//Create the contact patches
blockDesc.startContactPatchIndex = c.contactPatchCount;
if (!createContactPatches(c, blockDesc.contacts, blockDesc.numContacts, PXC_SAME_NORMAL))
return SolverConstraintPrepState::eUNBATCHABLE;
blockDesc.numContactPatches = PxU16(c.contactPatchCount - blockDesc.startContactPatchIndex);
bool overflow = correlatePatches(c, blockDesc.contacts, blockDesc.bodyFrame0, blockDesc.bodyFrame1, PXC_SAME_NORMAL,
blockDesc.startContactPatchIndex, blockDesc.startFrictionPatchIndex);
if (overflow)
return SolverConstraintPrepState::eUNBATCHABLE;
growPatches(c, blockDesc.contacts, blockDesc.bodyFrame0, blockDesc.bodyFrame1, correlationDistance, blockDesc.startFrictionPatchIndex,
frictionOffsetThreshold + blockDescs[a].restDistance);
//Remove the empty friction patches - do we actually need to do this?
for (PxU32 p = c.frictionPatchCount; p > blockDesc.startFrictionPatchIndex; --p)
{
if (c.correlationListHeads[p - 1] == 0xffff)
{
//We have an empty patch...need to bin this one...
for (PxU32 p2 = p; p2 < c.frictionPatchCount; ++p2)
{
c.correlationListHeads[p2 - 1] = c.correlationListHeads[p2];
c.frictionPatchContactCounts[p2 - 1] = c.frictionPatchContactCounts[p2];
}
c.frictionPatchCount--;
}
}
PxU32 numFricPatches = c.frictionPatchCount - blockDesc.startFrictionPatchIndex;
blockDesc.numFrictionPatches = numFricPatches;
}
FrictionPatch* frictionPatchArray[4];
PxU32 frictionPatchCounts[4];
for (PxU32 a = 0; a < 4; ++a)
{
PxTGSSolverContactDesc& blockDesc = blockDescs[a];
const bool successfulReserve = reserveFrictionBlockStreams(c, constraintAllocator, blockDesc.startFrictionPatchIndex, blockDesc.numFrictionPatches + blockDesc.startFrictionPatchIndex,
frictionPatchArray[a],
frictionPatchCounts[a]);
//KS - TODO - how can we recover if we failed to allocate this memory?
if (!successfulReserve)
{
return SolverConstraintPrepState::eOUT_OF_MEMORY;
}
}
//At this point, all the friction data has been calculated, the correlation has been done. Provided this was all successful,
//we are ready to create the batched constraints
PxU8* solverConstraint = NULL;
PxU32 solverConstraintByteSize = 0;
{
PxU32 axisConstraintCount[4];
SolverConstraintPrepState::Enum state = reserveBlockStreams4(blockDescs, c,
solverConstraint, axisConstraintCount,
solverConstraintByteSize,
constraintAllocator);
if (state != SolverConstraintPrepState::eSUCCESS)
return state;
for (PxU32 a = 0; a < 4; ++a)
{
FrictionPatch* frictionPatches = frictionPatchArray[a];
PxTGSSolverContactDesc& blockDesc = blockDescs[a];
PxSolverConstraintDesc& desc = *blockDesc.desc;
blockDesc.frictionPtr = reinterpret_cast<PxU8*>(frictionPatches);
blockDesc.frictionCount = Ps::to8(frictionPatchCounts[a]);
//Initialise friction buffer.
if (frictionPatches)
{
// PT: TODO: revisit this... not very satisfying
//const PxU32 maxSize = numFrictionPatches*sizeof(FrictionPatch);
Ps::prefetchLine(frictionPatches);
Ps::prefetchLine(frictionPatches, 128);
Ps::prefetchLine(frictionPatches, 256);
for (PxU32 i = 0; i<blockDesc.numFrictionPatches; i++)
{
if (c.correlationListHeads[blockDesc.startFrictionPatchIndex + i] != CorrelationBuffer::LIST_END)
{
//*frictionPatches++ = c.frictionPatches[blockDesc.startFrictionPatchIndex + i];
PxMemCopy(frictionPatches++, &c.frictionPatches[blockDesc.startFrictionPatchIndex + i], sizeof(FrictionPatch));
//Ps::prefetchLine(frictionPatches, 256);
}
}
}
blockDesc.axisConstraintCount += Ps::to16(axisConstraintCount[a]);
desc.constraint = solverConstraint;
desc.constraintLengthOver16 = Ps::to16(solverConstraintByteSize / 16);
desc.writeBackLengthOver4 = PxU16(blockDesc.numContacts);
desc.writeBack = blockDesc.contactForces;
}
const Vec4V iMassScale0 = V4LoadA(invMassScale0);
const Vec4V iInertiaScale0 = V4LoadA(invInertiaScale0);
const Vec4V iMassScale1 = V4LoadA(invMassScale1);
const Vec4V iInertiaScale1 = V4LoadA(invInertiaScale1);
setupFinalizeSolverConstraints4Step(blockDescs, c, solverConstraint, invDtF32, invTotalDtF32, bounceThresholdF32, solverOffsetSlop,
iMassScale0, iInertiaScale0, iMassScale1, iInertiaScale1);
PX_ASSERT((*solverConstraint == DY_SC_TYPE_BLOCK_RB_CONTACT) || (*solverConstraint == DY_SC_TYPE_BLOCK_STATIC_RB_CONTACT));
*(reinterpret_cast<PxU32*>(solverConstraint + solverConstraintByteSize)) = 0;
}
return SolverConstraintPrepState::eSUCCESS;
}
SolverConstraintPrepState::Enum createFinalizeSolverContacts4Step(
PxsContactManagerOutput** cmOutputs,
ThreadContext& threadContext,
PxTGSSolverContactDesc* blockDescs,
const PxReal invDtF32,
const PxReal invTotalDtF32,
PxReal bounceThresholdF32,
PxReal frictionOffsetThreshold,
PxReal correlationDistance,
PxReal solverOffsetSlop,
PxConstraintAllocator& constraintAllocator)
{
for (PxU32 a = 0; a < 4; ++a)
{
blockDescs[a].desc->constraintLengthOver16 = 0;
}
//PX_ASSERT(cmOutputs[0]->nbContacts && cmOutputs[1]->nbContacts && cmOutputs[2]->nbContacts && cmOutputs[3]->nbContacts);
Gu::ContactBuffer& buffer = threadContext.mContactBuffer;
buffer.count = 0;
//PxTransform idt = PxTransform(PxIdentity);
CorrelationBuffer& c = threadContext.mCorrelationBuffer;
for (PxU32 a = 0; a < 4; ++a)
{
PxTGSSolverContactDesc& blockDesc = blockDescs[a];
PxSolverConstraintDesc& desc = *blockDesc.desc;
//blockDesc.startContactIndex = buffer.count;
blockDesc.contacts = buffer.contacts + buffer.count;
Ps::prefetchLine(desc.bodyA);
Ps::prefetchLine(desc.bodyB);
//Unbatchable if we have (a) too many contacts or (b) torsional friction enabled - it just seems easier to handle this on an individual contact basis because it is expected to
//be used relatively rarely
if ((buffer.count + cmOutputs[a]->nbContacts) > 64 || (blockDesc.torsionalPatchRadius != 0.f || blockDesc.minTorsionalPatchRadius != 0.f) )
{
return SolverConstraintPrepState::eUNBATCHABLE;
}
bool hasMaxImpulse = false;
bool hasTargetVelocity = false;
//OK...do the correlation here as well...
Ps::prefetchLine(blockDescs[a].frictionPtr);
Ps::prefetchLine(blockDescs[a].frictionPtr, 64);
Ps::prefetchLine(blockDescs[a].frictionPtr, 128);
if (a < 3)
{
Ps::prefetchLine(cmOutputs[a]->contactPatches);
Ps::prefetchLine(cmOutputs[a]->contactPoints);
}
PxReal invMassScale0, invMassScale1, invInertiaScale0, invInertiaScale1;
const PxReal defaultMaxImpulse = PxMin(blockDesc.bodyData0->maxContactImpulse, blockDesc.bodyData1->maxContactImpulse);
PxU32 contactCount = extractContacts(buffer, *cmOutputs[a], hasMaxImpulse, hasTargetVelocity, invMassScale0, invMassScale1,
invInertiaScale0, invInertiaScale1, defaultMaxImpulse);
if (contactCount == 0 || hasTargetVelocity)
return SolverConstraintPrepState::eUNBATCHABLE;
blockDesc.numContacts = contactCount;
blockDesc.hasMaxImpulse = hasMaxImpulse;
blockDesc.disableStrongFriction = blockDesc.disableStrongFriction || hasTargetVelocity;
blockDesc.invMassScales.linear0 *= invMassScale0;
blockDesc.invMassScales.linear1 *= invMassScale1;
blockDesc.invMassScales.angular0 *= blockDesc.body0->isKinematic ? 0.f : invInertiaScale0;
blockDesc.invMassScales.angular1 *= blockDesc.body1->isKinematic ? 0.f : invInertiaScale1;
}
return createFinalizeSolverContacts4Step(c, blockDescs,
invDtF32, invTotalDtF32, bounceThresholdF32, frictionOffsetThreshold,
correlationDistance, solverOffsetSlop, constraintAllocator);
}
void preprocessRows(Px1DConstraint** sorted,
Px1DConstraint* rows,
PxVec4* angSqrtInvInertia0,
PxVec4* angSqrtInvInertia1,
PxU32 rowCount,
const PxMat33& sqrtInvInertia0F32,
const PxMat33& sqrtInvInertia1F32,
const PxReal invMass0,
const PxReal invMass1,
const PxConstraintInvMassScale& ims,
bool disablePreprocessing,
bool diagonalizeDrive,
bool preprocessLinear = true);
void setSolverConstantsStep(PxReal& error,
PxReal& biasScale,
PxReal& targetVel,
PxReal& maxBias,
PxReal& velMultiplier,
PxReal& impulseMultiplier,
PxReal& rcpResponse,
const Px1DConstraint& c,
PxReal normalVel,
PxReal unitResponse,
PxReal minRowResponse,
PxReal erp,
PxReal dt,
PxReal totalDt,
PxReal biasClamp,
PxReal recipdt,
PxReal recipTotalDt);
namespace
{
void setConstants(PxReal& error, PxReal& biasScale, PxReal& targetVel, PxReal& maxBias, PxReal& velMultiplier, PxReal& impulseMultiplier,
PxReal& rcpResponse, const Px1DConstraint& c, PxReal unitResponse, PxReal minRowResponse, PxReal dt, PxReal totalDt,
PxReal recipdt, PxReal recipTotalDt, const bool finished,
const PxReal lengthScale, const PxReal nv, const PxReal nv0, const PxReal nv1, const bool isKinematic0, const bool isKinematic1)
{
PX_UNUSED(dt);
if (finished)
{
error = 0.f;
biasScale = 0.f;
maxBias = 0.f;
velMultiplier = 0.f;
impulseMultiplier = 0.f;
rcpResponse = 0.f;
targetVel = 0.f;
return;
}
PxReal erp = 1.f;
PxReal linearErp = 1.f;
//PxReal biasClamp = c.flags & Px1DConstraintFlag::eANGULAR_CONSTRAINT ? 50.f : 200.f*lengthScale;
PxReal biasClamp = c.flags & Px1DConstraintFlag::eANGULAR_CONSTRAINT ? 100.f : 1000.f*lengthScale;
setSolverConstantsStep(error, biasScale, targetVel, maxBias, velMultiplier, impulseMultiplier, rcpResponse, c, nv, unitResponse,
minRowResponse, c.flags & Px1DConstraintFlag::eANGULAR_CONSTRAINT ? erp : linearErp, dt, totalDt, biasClamp, recipdt, recipTotalDt);
if(isKinematic0)
targetVel -= nv0;
if(isKinematic1)
targetVel += nv1;
}
void setOrthoData(const PxReal& ang0X, const PxReal& ang0Y, const PxReal& ang0Z, const PxReal& ang1X, const PxReal& ang1Y, const PxReal& ang1Z,
const PxReal& recipResponse, const PxReal& error, PxReal& orthoAng0X, PxReal& orthoAng0Y, PxReal& orthoAng0Z, PxReal& orthoAng1X, PxReal& orthoAng1Y, PxReal& orthoAng1Z,
PxReal& orthoRecipResponse, PxReal& orthoError, const bool disableProcessing, const PxU32 solveHint, PxU32& flags, PxU32& orthoCount, const bool finished)
{
if (!finished && !disableProcessing)
{
if (solveHint == PxConstraintSolveHint::eROTATIONAL_EQUALITY)
{
flags |= DY_SC_FLAG_ROT_EQ;
orthoAng0X = ang0X; orthoAng0Y = ang0Y; orthoAng0Z = ang0Z;
orthoAng1X = ang1X; orthoAng1Y = ang1Y; orthoAng1Z = ang1Z;
orthoRecipResponse = recipResponse;
orthoError = error;
orthoCount++;
}
else if (solveHint & PxConstraintSolveHint::eEQUALITY)
flags |= DY_SC_FLAG_ORTHO_TARGET;
}
}
}
SolverConstraintPrepState::Enum setupSolverConstraintStep4
(PxTGSSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,
const PxReal dt, const PxReal totalDt, const PxReal recipdt, const PxReal recipTotalDt, PxU32& totalRows,
PxConstraintAllocator& allocator, PxU32 maxRows, const PxReal lengthScale);
SolverConstraintPrepState::Enum setupSolverConstraintStep4
(SolverConstraintShaderPrepDesc* PX_RESTRICT constraintShaderDescs,
PxTGSSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,
const PxReal dt, const PxReal totalDt, const PxReal recipdt, const PxReal recipTotalDt, PxU32& totalRows,
PxConstraintAllocator& allocator, const PxReal lengthScale)
{
//KS - we will never get here with constraints involving articulations so we don't need to stress about those in here
totalRows = 0;
Px1DConstraint allRows[MAX_CONSTRAINT_ROWS * 4];
PxU32 numRows = 0;
PxU32 maxRows = 0;
PxU32 preppedIndex = 0;
for (PxU32 a = 0; a < 4; ++a)
{
Px1DConstraint* rows = allRows + numRows;
SolverConstraintShaderPrepDesc& shaderDesc = constraintShaderDescs[a];
PxTGSSolverConstraintPrepDesc& desc = constraintDescs[a];
if (!shaderDesc.solverPrep)
return SolverConstraintPrepState::eUNBATCHABLE;
PxMemZero(rows + preppedIndex, sizeof(Px1DConstraint)*(MAX_CONSTRAINT_ROWS));
for (PxU32 b = preppedIndex; b < MAX_CONSTRAINT_ROWS; ++b)
{
Px1DConstraint& c = rows[b];
//Px1DConstraintInit(c);
c.minImpulse = -PX_MAX_REAL;
c.maxImpulse = PX_MAX_REAL;
}
desc.invMassScales.linear0 = desc.invMassScales.linear1 = desc.invMassScales.angular0 = desc.invMassScales.angular1 = 1.f;
desc.body0WorldOffset = PxVec3(0.f);
PxU32 constraintCount = (*shaderDesc.solverPrep)(rows,
desc.body0WorldOffset,
MAX_CONSTRAINT_ROWS,
desc.invMassScales,
shaderDesc.constantBlock,
desc.bodyFrame0, desc.bodyFrame1, desc.extendedLimits, desc.cA2w, desc.cB2w);
preppedIndex = MAX_CONSTRAINT_ROWS - constraintCount;
maxRows = PxMax(constraintCount, maxRows);
if (constraintCount == 0)
return SolverConstraintPrepState::eUNBATCHABLE;
desc.rows = rows;
desc.numRows = constraintCount;
numRows += constraintCount;
if (desc.body0->isKinematic)
desc.invMassScales.angular0 = 0.f;
if (desc.body1->isKinematic)
desc.invMassScales.angular1 = 0.f;
}
return setupSolverConstraintStep4(constraintDescs, dt, totalDt, recipdt, recipTotalDt, totalRows, allocator, maxRows, lengthScale);
}
SolverConstraintPrepState::Enum setupSolverConstraintStep4
(PxTGSSolverConstraintPrepDesc* PX_RESTRICT constraintDescs,
const PxReal dt, const PxReal totalDt, const PxReal recipdt, const PxReal recipTotalDt, PxU32& totalRows,
PxConstraintAllocator& allocator, PxU32 maxRows,
const PxReal lengthScale)
{
const Vec4V zero = V4Zero();
Px1DConstraint* allSorted[MAX_CONSTRAINT_ROWS * 4];
PxU32 startIndex[4];
PX_ALIGN(16, PxVec4) angSqrtInvInertia0[MAX_CONSTRAINT_ROWS * 4];
PX_ALIGN(16, PxVec4) angSqrtInvInertia1[MAX_CONSTRAINT_ROWS * 4];
PxU32 numRows = 0;
for (PxU32 a = 0; a < 4; ++a)
{
startIndex[a] = numRows;
PxTGSSolverConstraintPrepDesc& desc = constraintDescs[a];
Px1DConstraint** sorted = allSorted + numRows;
for (PxU32 i = 0; i < desc.numRows; ++i)
{
if (desc.rows[i].flags & Px1DConstraintFlag::eANGULAR_CONSTRAINT)
{
if (desc.rows[i].solveHint == PxConstraintSolveHint::eEQUALITY)
desc.rows[i].solveHint = PxConstraintSolveHint::eROTATIONAL_EQUALITY;
else if (desc.rows[i].solveHint == PxConstraintSolveHint::eINEQUALITY)
desc.rows[i].solveHint = PxConstraintSolveHint::eROTATIONAL_INEQUALITY;
}
}
preprocessRows(sorted, desc.rows, angSqrtInvInertia0 + numRows, angSqrtInvInertia1 + numRows, desc.numRows,
desc.body0TxI->sqrtInvInertia, desc.body1TxI->sqrtInvInertia, desc.bodyData0->invMass, desc.bodyData1->invMass,
desc.invMassScales, desc.disablePreprocessing, desc.improvedSlerp, false);
numRows += desc.numRows;
}
PxU32 stride = sizeof(SolverConstraint1DStep4);
const PxU32 constraintLength = sizeof(SolverConstraint1DHeaderStep4) + stride * maxRows;
//KS - +16 is for the constraint progress counter, which needs to be the last element in the constraint (so that we
//know SPU DMAs have completed)
PxU8* ptr = allocator.reserveConstraintData(constraintLength + 16u);
if (NULL == ptr || (reinterpret_cast<PxU8*>(-1)) == ptr)
{
for (PxU32 a = 0; a < 4; ++a)
{
PxTGSSolverConstraintPrepDesc& desc = constraintDescs[a];
desc.desc->constraint = NULL;
desc.desc->constraintLengthOver16 = 0;
desc.desc->writeBack = desc.writeback;
}
if (NULL == ptr)
{
PX_WARN_ONCE(
"Reached limit set by PxSceneDesc::maxNbContactDataBlocks - ran out of buffer space for constraint prep. "
"Either accept joints detaching/exploding or increase buffer size allocated for constraint prep by increasing PxSceneDesc::maxNbContactDataBlocks.");
return SolverConstraintPrepState::eOUT_OF_MEMORY;
}
else
{
PX_WARN_ONCE(
"Attempting to allocate more than 16K of constraint data. "
"Either accept joints detaching/exploding or simplify constraints.");
ptr = NULL;
return SolverConstraintPrepState::eOUT_OF_MEMORY;
}
}
//desc.constraint = ptr;
totalRows = numRows;
const bool isKinematic00 = constraintDescs[0].body0->isKinematic;
const bool isKinematic01 = constraintDescs[0].body1->isKinematic;
const bool isKinematic10 = constraintDescs[1].body0->isKinematic;
const bool isKinematic11 = constraintDescs[1].body1->isKinematic;
const bool isKinematic20 = constraintDescs[2].body0->isKinematic;
const bool isKinematic21 = constraintDescs[2].body1->isKinematic;
const bool isKinematic30 = constraintDescs[3].body0->isKinematic;
const bool isKinematic31 = constraintDescs[3].body1->isKinematic;
for (PxU32 a = 0; a < 4; ++a)
{
PxTGSSolverConstraintPrepDesc& desc = constraintDescs[a];
desc.desc->constraint = ptr;
desc.desc->constraintLengthOver16 = PxU16(constraintLength/16);
desc.desc->writeBack = desc.writeback;
}
{
PxU8* currPtr = ptr;
SolverConstraint1DHeaderStep4* header = reinterpret_cast<SolverConstraint1DHeaderStep4*>(currPtr);
currPtr += sizeof(SolverConstraint1DHeaderStep4);
const PxTGSSolverBodyData& bd00 = *constraintDescs[0].bodyData0;
const PxTGSSolverBodyData& bd01 = *constraintDescs[1].bodyData0;
const PxTGSSolverBodyData& bd02 = *constraintDescs[2].bodyData0;
const PxTGSSolverBodyData& bd03 = *constraintDescs[3].bodyData0;
const PxTGSSolverBodyData& bd10 = *constraintDescs[0].bodyData1;
const PxTGSSolverBodyData& bd11 = *constraintDescs[1].bodyData1;
const PxTGSSolverBodyData& bd12 = *constraintDescs[2].bodyData1;
const PxTGSSolverBodyData& bd13 = *constraintDescs[3].bodyData1;
//Load up masses, invInertia, velocity etc.
const Vec4V invMassScale0 = V4LoadXYZW(constraintDescs[0].invMassScales.linear0, constraintDescs[1].invMassScales.linear0,
constraintDescs[2].invMassScales.linear0, constraintDescs[3].invMassScales.linear0);
const Vec4V invMassScale1 = V4LoadXYZW(constraintDescs[0].invMassScales.linear1, constraintDescs[1].invMassScales.linear1,
constraintDescs[2].invMassScales.linear1, constraintDescs[3].invMassScales.linear1);
const Vec4V iMass0 = V4LoadXYZW(bd00.invMass, bd01.invMass, bd02.invMass, bd03.invMass);
const Vec4V iMass1 = V4LoadXYZW(bd10.invMass, bd11.invMass, bd12.invMass, bd13.invMass);
const Vec4V invMass0 = V4Mul(iMass0, invMassScale0);
const Vec4V invMass1 = V4Mul(iMass1, invMassScale1);
const Vec4V invInertiaScale0 = V4LoadXYZW(constraintDescs[0].invMassScales.angular0, constraintDescs[1].invMassScales.angular0,
constraintDescs[2].invMassScales.angular0, constraintDescs[3].invMassScales.angular0);
const Vec4V invInertiaScale1 = V4LoadXYZW(constraintDescs[0].invMassScales.angular1, constraintDescs[1].invMassScales.angular1,
constraintDescs[2].invMassScales.angular1, constraintDescs[3].invMassScales.angular1);
//body world offsets
Vec4V workOffset0 = Vec4V_From_Vec3V(V3LoadU(constraintDescs[0].body0WorldOffset));
Vec4V workOffset1 = Vec4V_From_Vec3V(V3LoadU(constraintDescs[1].body0WorldOffset));
Vec4V workOffset2 = Vec4V_From_Vec3V(V3LoadU(constraintDescs[2].body0WorldOffset));
Vec4V workOffset3 = Vec4V_From_Vec3V(V3LoadU(constraintDescs[3].body0WorldOffset));
Vec4V workOffsetX, workOffsetY, workOffsetZ;
PX_TRANSPOSE_44_34(workOffset0, workOffset1, workOffset2, workOffset3, workOffsetX, workOffsetY, workOffsetZ);
const FloatV dtV = FLoad(totalDt);
Vec4V linBreakForce = V4LoadXYZW(constraintDescs[0].linBreakForce, constraintDescs[1].linBreakForce,
constraintDescs[2].linBreakForce, constraintDescs[3].linBreakForce);
Vec4V angBreakForce = V4LoadXYZW(constraintDescs[0].angBreakForce, constraintDescs[1].angBreakForce,
constraintDescs[2].angBreakForce, constraintDescs[3].angBreakForce);
header->breakable[0] = PxU8((constraintDescs[0].linBreakForce != PX_MAX_F32) || (constraintDescs[0].angBreakForce != PX_MAX_F32));
header->breakable[1] = PxU8((constraintDescs[1].linBreakForce != PX_MAX_F32) || (constraintDescs[1].angBreakForce != PX_MAX_F32));
header->breakable[2] = PxU8((constraintDescs[2].linBreakForce != PX_MAX_F32) || (constraintDescs[2].angBreakForce != PX_MAX_F32));
header->breakable[3] = PxU8((constraintDescs[3].linBreakForce != PX_MAX_F32) || (constraintDescs[3].angBreakForce != PX_MAX_F32));
//OK, I think that's everything loaded in
header->invMass0D0 = invMass0;
header->invMass1D1 = invMass1;
header->angD0 = invInertiaScale0;
header->angD1 = invInertiaScale1;
header->body0WorkOffset[0] = workOffsetX;
header->body0WorkOffset[1] = workOffsetY;
header->body0WorkOffset[2] = workOffsetZ;
header->count = maxRows;
header->type = DY_SC_TYPE_BLOCK_1D;
header->linBreakImpulse = V4Scale(linBreakForce, dtV);
header->angBreakImpulse = V4Scale(angBreakForce, dtV);
header->counts[0] = Ps::to8(constraintDescs[0].numRows);
header->counts[1] = Ps::to8(constraintDescs[1].numRows);
header->counts[2] = Ps::to8(constraintDescs[2].numRows);
header->counts[3] = Ps::to8(constraintDescs[3].numRows);
Vec4V ca2WX, ca2WY, ca2WZ;
Vec4V cb2WX, cb2WY, cb2WZ;
Vec4V ca2W0 = V4LoadU(&constraintDescs[0].cA2w.x);
Vec4V ca2W1 = V4LoadU(&constraintDescs[1].cA2w.x);
Vec4V ca2W2 = V4LoadU(&constraintDescs[2].cA2w.x);
Vec4V ca2W3 = V4LoadU(&constraintDescs[3].cA2w.x);
Vec4V cb2W0 = V4LoadU(&constraintDescs[0].cB2w.x);
Vec4V cb2W1 = V4LoadU(&constraintDescs[1].cB2w.x);
Vec4V cb2W2 = V4LoadU(&constraintDescs[2].cB2w.x);
Vec4V cb2W3 = V4LoadU(&constraintDescs[3].cB2w.x);
PX_TRANSPOSE_44_34(ca2W0, ca2W1, ca2W2, ca2W3, ca2WX, ca2WY, ca2WZ);
PX_TRANSPOSE_44_34(cb2W0, cb2W1, cb2W2, cb2W3, cb2WX, cb2WY, cb2WZ);
Vec4V pos00 = V4LoadA(&constraintDescs[0].body0TxI->deltaBody2World.p.x);
Vec4V pos01 = V4LoadA(&constraintDescs[0].body1TxI->deltaBody2World.p.x);
Vec4V pos10 = V4LoadA(&constraintDescs[1].body0TxI->deltaBody2World.p.x);
Vec4V pos11 = V4LoadA(&constraintDescs[1].body1TxI->deltaBody2World.p.x);
Vec4V pos20 = V4LoadA(&constraintDescs[2].body0TxI->deltaBody2World.p.x);
Vec4V pos21 = V4LoadA(&constraintDescs[2].body1TxI->deltaBody2World.p.x);
Vec4V pos30 = V4LoadA(&constraintDescs[3].body0TxI->deltaBody2World.p.x);
Vec4V pos31 = V4LoadA(&constraintDescs[3].body1TxI->deltaBody2World.p.x);
Vec4V pos0X, pos0Y, pos0Z;
Vec4V pos1X, pos1Y, pos1Z;
PX_TRANSPOSE_44_34(pos00, pos10, pos20, pos30, pos0X, pos0Y, pos0Z);
PX_TRANSPOSE_44_34(pos01, pos11, pos21, pos31, pos1X, pos1Y, pos1Z);
Vec4V linVel00 = V4LoadA(&constraintDescs[0].bodyData0->originalLinearVelocity.x);
Vec4V linVel01 = V4LoadA(&constraintDescs[0].bodyData1->originalLinearVelocity.x);
Vec4V angState00 = V4LoadA(&constraintDescs[0].bodyData0->originalAngularVelocity.x);
Vec4V angState01 = V4LoadA(&constraintDescs[0].bodyData1->originalAngularVelocity.x);
Vec4V linVel10 = V4LoadA(&constraintDescs[1].bodyData0->originalLinearVelocity.x);
Vec4V linVel11 = V4LoadA(&constraintDescs[1].bodyData1->originalLinearVelocity.x);
Vec4V angState10 = V4LoadA(&constraintDescs[1].bodyData0->originalAngularVelocity.x);
Vec4V angState11 = V4LoadA(&constraintDescs[1].bodyData1->originalAngularVelocity.x);
Vec4V linVel20 = V4LoadA(&constraintDescs[2].bodyData0->originalLinearVelocity.x);
Vec4V linVel21 = V4LoadA(&constraintDescs[2].bodyData1->originalLinearVelocity.x);
Vec4V angState20 = V4LoadA(&constraintDescs[2].bodyData0->originalAngularVelocity.x);
Vec4V angState21 = V4LoadA(&constraintDescs[2].bodyData1->originalAngularVelocity.x);
Vec4V linVel30 = V4LoadA(&constraintDescs[3].bodyData0->originalLinearVelocity.x);
Vec4V linVel31 = V4LoadA(&constraintDescs[3].bodyData1->originalLinearVelocity.x);
Vec4V angState30 = V4LoadA(&constraintDescs[3].bodyData0->originalAngularVelocity.x);
Vec4V angState31 = V4LoadA(&constraintDescs[3].bodyData1->originalAngularVelocity.x);
Vec4V linVel0T0, linVel0T1, linVel0T2;
Vec4V linVel1T0, linVel1T1, linVel1T2;
Vec4V angState0T0, angState0T1, angState0T2;
Vec4V angState1T0, angState1T1, angState1T2;
PX_TRANSPOSE_44_34(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2);
PX_TRANSPOSE_44_34(linVel01, linVel11, linVel21, linVel31, linVel1T0, linVel1T1, linVel1T2);
PX_TRANSPOSE_44_34(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2);
PX_TRANSPOSE_44_34(angState01, angState11, angState21, angState31, angState1T0, angState1T1, angState1T2);
const Vec4V raWorldX = V4Sub(ca2WX, pos0X);
const Vec4V raWorldY = V4Sub(ca2WY, pos0Y);
const Vec4V raWorldZ = V4Sub(ca2WZ, pos0Z);
const Vec4V rbWorldX = V4Sub(cb2WX, pos1X);
const Vec4V rbWorldY = V4Sub(cb2WY, pos1Y);
const Vec4V rbWorldZ = V4Sub(cb2WZ, pos1Z);
header->rAWorld[0] = raWorldX;
header->rAWorld[1] = raWorldY;
header->rAWorld[2] = raWorldZ;
header->rBWorld[0] = rbWorldX;
header->rBWorld[1] = rbWorldY;
header->rBWorld[2] = rbWorldZ;
//Now we loop over the constraints and build the results...
PxU32 index0 = 0;
PxU32 endIndex0 = constraintDescs[0].numRows - 1;
PxU32 index1 = startIndex[1];
PxU32 endIndex1 = index1 + constraintDescs[1].numRows - 1;
PxU32 index2 = startIndex[2];
PxU32 endIndex2 = index2 + constraintDescs[2].numRows - 1;
PxU32 index3 = startIndex[3];
PxU32 endIndex3 = index3 + constraintDescs[3].numRows - 1;
const Vec4V one = V4One();
const FloatV fOne = FOne();
PxU32 orthoCount0 = 0, orthoCount1 = 0, orthoCount2 = 0, orthoCount3 = 0;
for (PxU32 a = 0; a < 3; ++a)
{
header->angOrthoAxis0X[a] = V4Zero();
header->angOrthoAxis0Y[a] = V4Zero();
header->angOrthoAxis0Z[a] = V4Zero();
header->angOrthoAxis1X[a] = V4Zero();
header->angOrthoAxis1Y[a] = V4Zero();
header->angOrthoAxis1Z[a] = V4Zero();
header->angOrthoRecipResponse[a] = V4Zero();
header->angOrthoError[a] = V4Zero();
}
for (PxU32 a = 0; a < maxRows; ++a)
{
bool finished[] = { a >= constraintDescs[0].numRows, a >= constraintDescs[1].numRows, a >= constraintDescs[2].numRows, a >= constraintDescs[3].numRows };
BoolV bFinished = BLoad(finished);
SolverConstraint1DStep4* c = reinterpret_cast<SolverConstraint1DStep4*>(currPtr);
currPtr += stride;
Px1DConstraint* con0 = allSorted[index0];
Px1DConstraint* con1 = allSorted[index1];
Px1DConstraint* con2 = allSorted[index2];
Px1DConstraint* con3 = allSorted[index3];
bool angularConstraint[4] =
{
!!(con0->flags & Px1DConstraintFlag::eANGULAR_CONSTRAINT),
!!(con1->flags & Px1DConstraintFlag::eANGULAR_CONSTRAINT),
!!(con2->flags & Px1DConstraintFlag::eANGULAR_CONSTRAINT),
!!(con3->flags & Px1DConstraintFlag::eANGULAR_CONSTRAINT),
};
BoolV bAngularConstraint = BLoad(angularConstraint);
Vec4V cangDelta00 = V4LoadA(&angSqrtInvInertia0[index0].x);
Vec4V cangDelta01 = V4LoadA(&angSqrtInvInertia0[index1].x);
Vec4V cangDelta02 = V4LoadA(&angSqrtInvInertia0[index2].x);
Vec4V cangDelta03 = V4LoadA(&angSqrtInvInertia0[index3].x);
Vec4V cangDelta10 = V4LoadA(&angSqrtInvInertia1[index0].x);
Vec4V cangDelta11 = V4LoadA(&angSqrtInvInertia1[index1].x);
Vec4V cangDelta12 = V4LoadA(&angSqrtInvInertia1[index2].x);
Vec4V cangDelta13 = V4LoadA(&angSqrtInvInertia1[index3].x);
index0 = index0 == endIndex0 ? index0 : index0 + 1;
index1 = index1 == endIndex1 ? index1 : index1 + 1;
index2 = index2 == endIndex2 ? index2 : index2 + 1;
index3 = index3 == endIndex3 ? index3 : index3 + 1;
Vec4V driveScale = one;
if (con0->flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && constraintDescs[0].driveLimitsAreForces)
driveScale = V4SetX(driveScale, FMin(fOne, dtV));
if (con1->flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && constraintDescs[1].driveLimitsAreForces)
driveScale = V4SetY(driveScale, FMin(fOne, dtV));
if (con2->flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && constraintDescs[2].driveLimitsAreForces)
driveScale = V4SetZ(driveScale, FMin(fOne, dtV));
if (con3->flags&Px1DConstraintFlag::eHAS_DRIVE_LIMIT && constraintDescs[3].driveLimitsAreForces)
driveScale = V4SetW(driveScale, FMin(fOne, dtV));
Vec4V clin00 = V4LoadA(&con0->linear0.x);
Vec4V clin01 = V4LoadA(&con1->linear0.x);
Vec4V clin02 = V4LoadA(&con2->linear0.x);
Vec4V clin03 = V4LoadA(&con3->linear0.x);
Vec4V clin0X, clin0Y, clin0Z;
PX_TRANSPOSE_44_34(clin00, clin01, clin02, clin03, clin0X, clin0Y, clin0Z);
Vec4V cang00 = V4LoadA(&con0->angular0.x);
Vec4V cang01 = V4LoadA(&con1->angular0.x);
Vec4V cang02 = V4LoadA(&con2->angular0.x);
Vec4V cang03 = V4LoadA(&con3->angular0.x);
Vec4V cang0X, cang0Y, cang0Z;
PX_TRANSPOSE_44_34(cang00, cang01, cang02, cang03, cang0X, cang0Y, cang0Z);
Vec4V cang10 = V4LoadA(&con0->angular1.x);
Vec4V cang11 = V4LoadA(&con1->angular1.x);
Vec4V cang12 = V4LoadA(&con2->angular1.x);
Vec4V cang13 = V4LoadA(&con3->angular1.x);
Vec4V cang1X, cang1Y, cang1Z;
PX_TRANSPOSE_44_34(cang10, cang11, cang12, cang13, cang1X, cang1Y, cang1Z);
const Vec4V maxImpulse = V4LoadXYZW(con0->maxImpulse, con1->maxImpulse, con2->maxImpulse, con3->maxImpulse);
const Vec4V minImpulse = V4LoadXYZW(con0->minImpulse, con1->minImpulse, con2->minImpulse, con3->minImpulse);
Vec4V angDelta0X, angDelta0Y, angDelta0Z;
PX_TRANSPOSE_44_34(cangDelta00, cangDelta01, cangDelta02, cangDelta03, angDelta0X, angDelta0Y, angDelta0Z);
c->flags[0] = 0;
c->flags[1] = 0;
c->flags[2] = 0;
c->flags[3] = 0;
c->lin0[0] = V4Sel(bFinished, zero, clin0X);
c->lin0[1] = V4Sel(bFinished, zero, clin0Y);
c->lin0[2] = V4Sel(bFinished, zero, clin0Z);
c->ang0[0] = V4Sel(BAndNot(bAngularConstraint, bFinished), cang0X, zero);
c->ang0[1] = V4Sel(BAndNot(bAngularConstraint, bFinished), cang0Y, zero);
c->ang0[2] = V4Sel(BAndNot(bAngularConstraint, bFinished), cang0Z, zero);
c->angularErrorScale = V4Sel(bAngularConstraint, one, zero);
c->minImpulse = V4Mul(minImpulse, driveScale);
c->maxImpulse = V4Mul(maxImpulse, driveScale);
c->appliedForce = zero;
const Vec4V lin0MagSq = V4MulAdd(clin0Z, clin0Z, V4MulAdd(clin0Y, clin0Y, V4Mul(clin0X, clin0X)));
const Vec4V cang0DotAngDelta = V4MulAdd(angDelta0Z, angDelta0Z, V4MulAdd(angDelta0Y, angDelta0Y, V4Mul(angDelta0X, angDelta0X)));
Vec4V unitResponse = V4MulAdd(lin0MagSq, invMass0, V4Mul(cang0DotAngDelta, invInertiaScale0));
Vec4V clin10 = V4LoadA(&con0->linear1.x);
Vec4V clin11 = V4LoadA(&con1->linear1.x);
Vec4V clin12 = V4LoadA(&con2->linear1.x);
Vec4V clin13 = V4LoadA(&con3->linear1.x);
Vec4V clin1X, clin1Y, clin1Z;
PX_TRANSPOSE_44_34(clin10, clin11, clin12, clin13, clin1X, clin1Y, clin1Z);
Vec4V angDelta1X, angDelta1Y, angDelta1Z;
PX_TRANSPOSE_44_34(cangDelta10, cangDelta11, cangDelta12, cangDelta13, angDelta1X, angDelta1Y, angDelta1Z);
const Vec4V lin1MagSq = V4MulAdd(clin1Z, clin1Z, V4MulAdd(clin1Y, clin1Y, V4Mul(clin1X, clin1X)));
const Vec4V cang1DotAngDelta = V4MulAdd(angDelta1Z, angDelta1Z, V4MulAdd(angDelta1Y, angDelta1Y, V4Mul(angDelta1X, angDelta1X)));
c->lin1[0] = V4Sel(bFinished, zero, clin1X);
c->lin1[1] = V4Sel(bFinished, zero, clin1Y);
c->lin1[2] = V4Sel(bFinished, zero, clin1Z);
c->ang1[0] = V4Sel(BAndNot(bAngularConstraint, bFinished), cang1X, zero);
c->ang1[1] = V4Sel(BAndNot(bAngularConstraint, bFinished), cang1Y, zero);
c->ang1[2] = V4Sel(BAndNot(bAngularConstraint, bFinished), cang1Z, zero);
unitResponse = V4Add(unitResponse, V4MulAdd(lin1MagSq, invMass1, V4Mul(cang1DotAngDelta, invInertiaScale1)));
const Vec4V lnormalVel0 = V4MulAdd(clin0X, linVel0T0, V4MulAdd(clin0Y, linVel0T1, V4Mul(clin0Z, linVel0T2)));
const Vec4V lnormalVel1 = V4MulAdd(clin1X, linVel1T0, V4MulAdd(clin1Y, linVel1T1, V4Mul(clin1Z, linVel1T2)));
const Vec4V angVel0 = V4MulAdd(cang0X, angState0T0, V4MulAdd(cang0Y, angState0T1, V4Mul(cang0Z, angState0T2)));
const Vec4V angVel1 = V4MulAdd(angDelta1X, angState1T0, V4MulAdd(angDelta1Y, angState1T1, V4Mul(angDelta1Z, angState1T2)));
const Vec4V normalVel0 = V4Add(lnormalVel0, angVel0);
const Vec4V normalVel1 = V4Add(lnormalVel1, angVel1);
const Vec4V normalVel = V4Sub(normalVel0, normalVel1);
angDelta0X = V4Mul(angDelta0X, invInertiaScale0);
angDelta0Y = V4Mul(angDelta0Y, invInertiaScale0);
angDelta0Z = V4Mul(angDelta0Z, invInertiaScale0);
angDelta1X = V4Mul(angDelta1X, invInertiaScale1);
angDelta1Y = V4Mul(angDelta1Y, invInertiaScale1);
angDelta1Z = V4Mul(angDelta1Z, invInertiaScale1);
{
PxReal recipResponse[4];
const PxVec4& ur = reinterpret_cast<const PxVec4&>(unitResponse);
PxVec4& cBiasScale = reinterpret_cast<PxVec4&>(c->biasScale);
PxVec4& cError = reinterpret_cast<PxVec4&>(c->error);
PxVec4& cMaxBias = reinterpret_cast<PxVec4&>(c->maxBias);
PxVec4& cTargetVel = reinterpret_cast<PxVec4&>(c->velTarget);
PxVec4& cVelMultiplier = reinterpret_cast<PxVec4&>(c->velMultiplier);
PxVec4& cImpulseMultiplier = reinterpret_cast<PxVec4&>(c->impulseMultiplier);
const PxVec4& nVel = reinterpret_cast<const PxVec4&>(normalVel);
const PxVec4& nVel0 = reinterpret_cast<const PxVec4&>(normalVel0);
const PxVec4& nVel1 = reinterpret_cast<const PxVec4&>(normalVel1);
setConstants(cError.x, cBiasScale.x, cTargetVel.x, cMaxBias.x, cVelMultiplier.x, cImpulseMultiplier.x,
recipResponse[0], *con0, ur.x, constraintDescs[0].minResponseThreshold, dt, totalDt, recipdt, recipTotalDt,
a >= constraintDescs[0].numRows, lengthScale, nVel.x, nVel0.x, nVel1.x, isKinematic00, isKinematic01);
setConstants(cError.y, cBiasScale.y, cTargetVel.y, cMaxBias.y, cVelMultiplier.y, cImpulseMultiplier.y,
recipResponse[1], *con1, ur.y, constraintDescs[1].minResponseThreshold, dt, totalDt, recipdt, recipTotalDt,
a >= constraintDescs[1].numRows, lengthScale, nVel.y, nVel0.y, nVel1.y, isKinematic10, isKinematic11);
setConstants(cError.z, cBiasScale.z, cTargetVel.z, cMaxBias.z, cVelMultiplier.z, cImpulseMultiplier.z,
recipResponse[2], *con2, ur.z, constraintDescs[2].minResponseThreshold, dt, totalDt, recipdt, recipTotalDt,
a >= constraintDescs[2].numRows, lengthScale, nVel.z, nVel0.z, nVel1.z, isKinematic20, isKinematic21);
setConstants(cError.w, cBiasScale.w, cTargetVel.w, cMaxBias.w, cVelMultiplier.w, cImpulseMultiplier.w,
recipResponse[3], *con3, ur.w, constraintDescs[3].minResponseThreshold, dt, totalDt, recipdt, recipTotalDt,
a >= constraintDescs[3].numRows, lengthScale, nVel.w, nVel0.w, nVel1.w, isKinematic30, isKinematic31);
PxVec4* angOrthoAxes0X = reinterpret_cast<PxVec4*>(header->angOrthoAxis0X);
PxVec4* angOrthoAxes0Y = reinterpret_cast<PxVec4*>(header->angOrthoAxis0Y);
PxVec4* angOrthoAxes0Z = reinterpret_cast<PxVec4*>(header->angOrthoAxis0Z);
PxVec4* angOrthoAxes1X = reinterpret_cast<PxVec4*>(header->angOrthoAxis1X);
PxVec4* angOrthoAxes1Y = reinterpret_cast<PxVec4*>(header->angOrthoAxis1Y);
PxVec4* angOrthoAxes1Z = reinterpret_cast<PxVec4*>(header->angOrthoAxis1Z);
PxVec4* orthoRecipResponse = reinterpret_cast<PxVec4*>(header->angOrthoRecipResponse);
PxVec4* orthoError = reinterpret_cast<PxVec4*>(header->angOrthoError);
const PxVec4& ang0X = reinterpret_cast<const PxVec4&>(angDelta0X);
const PxVec4& ang0Y = reinterpret_cast<const PxVec4&>(angDelta0Y);
const PxVec4& ang0Z = reinterpret_cast<const PxVec4&>(angDelta0Z);
const PxVec4& ang1X = reinterpret_cast<const PxVec4&>(angDelta1X);
const PxVec4& ang1Y = reinterpret_cast<const PxVec4&>(angDelta1Y);
const PxVec4& ang1Z = reinterpret_cast<const PxVec4&>(angDelta1Z);
setOrthoData(ang0X.x, ang0Y.x, ang0Z.x, ang1X.x, ang1Y.x, ang1Z.x, recipResponse[0], cError.x, angOrthoAxes0X[orthoCount0].x,
angOrthoAxes0Y[orthoCount0].x, angOrthoAxes0Z[orthoCount0].x, angOrthoAxes1X[orthoCount0].x, angOrthoAxes1Y[orthoCount0].x,
angOrthoAxes1Z[orthoCount0].x, orthoRecipResponse[orthoCount0].x, orthoError[orthoCount0].x, constraintDescs[0].disablePreprocessing, con0->solveHint,
c->flags[0], orthoCount0, a >= constraintDescs[0].numRows);
setOrthoData(ang0X.y, ang0Y.y, ang0Z.y, ang1X.y, ang1Y.y, ang1Z.y, recipResponse[1], cError.y, angOrthoAxes0X[orthoCount1].y,
angOrthoAxes0Y[orthoCount1].y, angOrthoAxes0Z[orthoCount1].y, angOrthoAxes1X[orthoCount1].y, angOrthoAxes1Y[orthoCount1].y,
angOrthoAxes1Z[orthoCount1].y, orthoRecipResponse[orthoCount1].y, orthoError[orthoCount1].y, constraintDescs[1].disablePreprocessing, con1->solveHint,
c->flags[1], orthoCount1, a >= constraintDescs[1].numRows);
setOrthoData(ang0X.z, ang0Y.z, ang0Z.z, ang1X.z, ang1Y.z, ang1Z.z, recipResponse[2], cError.z, angOrthoAxes0X[orthoCount2].z,
angOrthoAxes0Y[orthoCount2].z, angOrthoAxes0Z[orthoCount2].z, angOrthoAxes1X[orthoCount2].z, angOrthoAxes1Y[orthoCount2].z,
angOrthoAxes1Z[orthoCount2].z, orthoRecipResponse[orthoCount2].z, orthoError[orthoCount2].z, constraintDescs[2].disablePreprocessing, con2->solveHint,
c->flags[2], orthoCount2, a >= constraintDescs[2].numRows);
setOrthoData(ang0X.w, ang0Y.w, ang0Z.w, ang1X.w, ang1Y.w, ang1Z.w, recipResponse[3], cError.w, angOrthoAxes0X[orthoCount3].w,
angOrthoAxes0Y[orthoCount3].w, angOrthoAxes0Z[orthoCount3].w, angOrthoAxes1X[orthoCount3].w, angOrthoAxes1Y[orthoCount3].w,
angOrthoAxes1Z[orthoCount3].w, orthoRecipResponse[orthoCount3].w, orthoError[orthoCount3].w, constraintDescs[3].disablePreprocessing, con3->solveHint,
c->flags[3], orthoCount3, a >= constraintDescs[3].numRows);
}
if (con0->flags & Px1DConstraintFlag::eOUTPUT_FORCE)
c->flags[0] |= DY_SC_FLAG_OUTPUT_FORCE;
if (con1->flags & Px1DConstraintFlag::eOUTPUT_FORCE)
c->flags[1] |= DY_SC_FLAG_OUTPUT_FORCE;
if (con2->flags & Px1DConstraintFlag::eOUTPUT_FORCE)
c->flags[2] |= DY_SC_FLAG_OUTPUT_FORCE;
if (con3->flags & Px1DConstraintFlag::eOUTPUT_FORCE)
c->flags[3] |= DY_SC_FLAG_OUTPUT_FORCE;
if ((con0->flags & Px1DConstraintFlag::eKEEPBIAS))
c->flags[0] |= DY_SC_FLAG_KEEP_BIAS;
if ((con1->flags & Px1DConstraintFlag::eKEEPBIAS))
c->flags[1] |= DY_SC_FLAG_KEEP_BIAS;
if ((con2->flags & Px1DConstraintFlag::eKEEPBIAS))
c->flags[2] |= DY_SC_FLAG_KEEP_BIAS;
if ((con3->flags & Px1DConstraintFlag::eKEEPBIAS))
c->flags[3] |= DY_SC_FLAG_KEEP_BIAS;
//Flag inequality constraints...
if (con0->solveHint & 1)
c->flags[0] |= DY_SC_FLAG_INEQUALITY;
if (con1->solveHint & 1)
c->flags[1] |= DY_SC_FLAG_INEQUALITY;
if (con2->solveHint & 1)
c->flags[2] |= DY_SC_FLAG_INEQUALITY;
if (con3->solveHint & 1)
c->flags[3] |= DY_SC_FLAG_INEQUALITY;
}
*(reinterpret_cast<PxU32*>(currPtr)) = 0;
*(reinterpret_cast<PxU32*>(currPtr + 4)) = 0;
}
//OK, we're ready to allocate and solve prep these constraints now :-)
return SolverConstraintPrepState::eSUCCESS;
}
void solveContact4_Block(const PxSolverConstraintDesc* PX_RESTRICT desc, const bool doFriction, const PxReal minPenetration,
const PxReal elapsedTimeF32)
{
PxTGSSolverBodyVel& b00 = *desc[0].tgsBodyA;
PxTGSSolverBodyVel& b01 = *desc[0].tgsBodyB;
PxTGSSolverBodyVel& b10 = *desc[1].tgsBodyA;
PxTGSSolverBodyVel& b11 = *desc[1].tgsBodyB;
PxTGSSolverBodyVel& b20 = *desc[2].tgsBodyA;
PxTGSSolverBodyVel& b21 = *desc[2].tgsBodyB;
PxTGSSolverBodyVel& b30 = *desc[3].tgsBodyA;
PxTGSSolverBodyVel& b31 = *desc[3].tgsBodyB;
const Vec4V minPen = V4Load(minPenetration);
const Vec4V elapsedTime = V4Load(elapsedTimeF32);
//We'll need this.
const Vec4V vZero = V4Zero();
Vec4V linVel00 = V4LoadA(&b00.linearVelocity.x);
Vec4V linVel01 = V4LoadA(&b01.linearVelocity.x);
Vec4V angState00 = V4LoadA(&b00.angularVelocity.x);
Vec4V angState01 = V4LoadA(&b01.angularVelocity.x);
Vec4V linVel10 = V4LoadA(&b10.linearVelocity.x);
Vec4V linVel11 = V4LoadA(&b11.linearVelocity.x);
Vec4V angState10 = V4LoadA(&b10.angularVelocity.x);
Vec4V angState11 = V4LoadA(&b11.angularVelocity.x);
Vec4V linVel20 = V4LoadA(&b20.linearVelocity.x);
Vec4V linVel21 = V4LoadA(&b21.linearVelocity.x);
Vec4V angState20 = V4LoadA(&b20.angularVelocity.x);
Vec4V angState21 = V4LoadA(&b21.angularVelocity.x);
Vec4V linVel30 = V4LoadA(&b30.linearVelocity.x);
Vec4V linVel31 = V4LoadA(&b31.linearVelocity.x);
Vec4V angState30 = V4LoadA(&b30.angularVelocity.x);
Vec4V angState31 = V4LoadA(&b31.angularVelocity.x);
Vec4V linVel0T0, linVel0T1, linVel0T2;
Vec4V linVel1T0, linVel1T1, linVel1T2;
Vec4V angState0T0, angState0T1, angState0T2;
Vec4V angState1T0, angState1T1, angState1T2;
PX_TRANSPOSE_44_34(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2);
PX_TRANSPOSE_44_34(linVel01, linVel11, linVel21, linVel31, linVel1T0, linVel1T1, linVel1T2);
PX_TRANSPOSE_44_34(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2);
PX_TRANSPOSE_44_34(angState01, angState11, angState21, angState31, angState1T0, angState1T1, angState1T2);
Vec4V linDelta00 = V4LoadA(&b00.deltaLinDt.x);
Vec4V linDelta01 = V4LoadA(&b01.deltaLinDt.x);
Vec4V angDelta00 = V4LoadA(&b00.deltaAngDt.x);
Vec4V angDelta01 = V4LoadA(&b01.deltaAngDt.x);
Vec4V linDelta10 = V4LoadA(&b10.deltaLinDt.x);
Vec4V linDelta11 = V4LoadA(&b11.deltaLinDt.x);
Vec4V angDelta10 = V4LoadA(&b10.deltaAngDt.x);
Vec4V angDelta11 = V4LoadA(&b11.deltaAngDt.x);
Vec4V linDelta20 = V4LoadA(&b20.deltaLinDt.x);
Vec4V linDelta21 = V4LoadA(&b21.deltaLinDt.x);
Vec4V angDelta20 = V4LoadA(&b20.deltaAngDt.x);
Vec4V angDelta21 = V4LoadA(&b21.deltaAngDt.x);
Vec4V linDelta30 = V4LoadA(&b30.deltaLinDt.x);
Vec4V linDelta31 = V4LoadA(&b31.deltaLinDt.x);
Vec4V angDelta30 = V4LoadA(&b30.deltaAngDt.x);
Vec4V angDelta31 = V4LoadA(&b31.deltaAngDt.x);
Vec4V linDelta0T0, linDelta0T1, linDelta0T2;
Vec4V linDelta1T0, linDelta1T1, linDelta1T2;
Vec4V angDelta0T0, angDelta0T1, angDelta0T2;
Vec4V angDelta1T0, angDelta1T1, angDelta1T2;
PX_TRANSPOSE_44_34(linDelta00, linDelta10, linDelta20, linDelta30, linDelta0T0, linDelta0T1, linDelta0T2);
PX_TRANSPOSE_44_34(linDelta01, linDelta11, linDelta21, linDelta31, linDelta1T0, linDelta1T1, linDelta1T2);
PX_TRANSPOSE_44_34(angDelta00, angDelta10, angDelta20, angDelta30, angDelta0T0, angDelta0T1, angDelta0T2);
PX_TRANSPOSE_44_34(angDelta01, angDelta11, angDelta21, angDelta31, angDelta1T0, angDelta1T1, angDelta1T2);
const PxU8* PX_RESTRICT last = desc[0].constraint + getConstraintLength(desc[0]);
//hopefully pointer aliasing doesn't bite.
PxU8* PX_RESTRICT currPtr = desc[0].constraint;
Vec4V vMax = V4Splat(FMax());
const PxU8* PX_RESTRICT prefetchAddress = currPtr + sizeof(SolverContactHeaderStepBlock) + sizeof(SolverContactPointStepBlock);
SolverContactHeaderStepBlock* PX_RESTRICT hdr = reinterpret_cast<SolverContactHeaderStepBlock*>(currPtr);
const Vec4V invMassA = hdr->invMass0D0;
const Vec4V invMassB = hdr->invMass1D1;
const Vec4V sumInvMass = V4Add(invMassA, invMassB);
Vec4V linDeltaX = V4Sub(linDelta0T0, linDelta1T0);
Vec4V linDeltaY = V4Sub(linDelta0T1, linDelta1T1);
Vec4V linDeltaZ = V4Sub(linDelta0T2, linDelta1T2);
while (currPtr < last)
{
hdr = reinterpret_cast<SolverContactHeaderStepBlock*>(currPtr);
PX_ASSERT(hdr->type == DY_SC_TYPE_BLOCK_RB_CONTACT);
currPtr = reinterpret_cast<PxU8*>(const_cast<SolverContactHeaderStepBlock*>(hdr) + 1);
const PxU32 numNormalConstr = hdr->numNormalConstr;
const PxU32 numFrictionConstr = hdr->numFrictionConstr;
bool hasMaxImpulse = (hdr->flag & SolverContactHeaderStepBlock::eHAS_MAX_IMPULSE) != 0;
Vec4V* appliedForces = reinterpret_cast<Vec4V*>(currPtr);
currPtr += sizeof(Vec4V)*numNormalConstr;
SolverContactPointStepBlock* PX_RESTRICT contacts = reinterpret_cast<SolverContactPointStepBlock*>(currPtr);
Vec4V* maxImpulses;
currPtr = reinterpret_cast<PxU8*>(contacts + numNormalConstr);
PxU32 maxImpulseMask = 0;
if (hasMaxImpulse)
{
maxImpulseMask = 0xFFFFFFFF;
maxImpulses = reinterpret_cast<Vec4V*>(currPtr);
currPtr += sizeof(Vec4V) * numNormalConstr;
}
else
{
maxImpulses = &vMax;
}
/*SolverFrictionSharedData4* PX_RESTRICT fd = reinterpret_cast<SolverFrictionSharedData4*>(currPtr);
if (numFrictionConstr)
currPtr += sizeof(SolverFrictionSharedData4);*/
Vec4V* frictionAppliedForce = reinterpret_cast<Vec4V*>(currPtr);
currPtr += sizeof(Vec4V)*numFrictionConstr;
const SolverContactFrictionStepBlock* PX_RESTRICT frictions = reinterpret_cast<SolverContactFrictionStepBlock*>(currPtr);
currPtr += numFrictionConstr * sizeof(SolverContactFrictionStepBlock);
Vec4V accumulatedNormalImpulse = vZero;
const Vec4V angD0 = hdr->angDom0;
const Vec4V angD1 = hdr->angDom1;
const Vec4V _normalT0 = hdr->normalX;
const Vec4V _normalT1 = hdr->normalY;
const Vec4V _normalT2 = hdr->normalZ;
Vec4V contactNormalVel1 = V4Mul(linVel0T0, _normalT0);
Vec4V contactNormalVel3 = V4Mul(linVel1T0, _normalT0);
contactNormalVel1 = V4MulAdd(linVel0T1, _normalT1, contactNormalVel1);
contactNormalVel3 = V4MulAdd(linVel1T1, _normalT1, contactNormalVel3);
contactNormalVel1 = V4MulAdd(linVel0T2, _normalT2, contactNormalVel1);
contactNormalVel3 = V4MulAdd(linVel1T2, _normalT2, contactNormalVel3);
const Vec4V maxPenBias = hdr->maxPenBias;
Vec4V relVel1 = V4Sub(contactNormalVel1, contactNormalVel3);
Vec4V deltaNormalV = V4Mul(linDeltaX, _normalT0);
deltaNormalV = V4MulAdd(linDeltaY, _normalT1, deltaNormalV);
deltaNormalV = V4MulAdd(linDeltaZ, _normalT2, deltaNormalV);
Vec4V accumDeltaF = vZero;
for (PxU32 i = 0; i<numNormalConstr; i++)
{
const SolverContactPointStepBlock& c = contacts[i];
PxU32 offset = 0;
Ps::prefetchLine(prefetchAddress, offset += 64);
Ps::prefetchLine(prefetchAddress, offset += 64);
Ps::prefetchLine(prefetchAddress, offset += 64);
prefetchAddress += offset;
const Vec4V appliedForce = appliedForces[i];
const Vec4V maxImpulse = maxImpulses[i & maxImpulseMask];
Vec4V contactNormalVel2 = V4Mul(c.raXnI[0], angState0T0);
Vec4V contactNormalVel4 = V4Mul(c.rbXnI[0], angState1T0);
contactNormalVel2 = V4MulAdd(c.raXnI[1], angState0T1, contactNormalVel2);
contactNormalVel4 = V4MulAdd(c.rbXnI[1], angState1T1, contactNormalVel4);
contactNormalVel2 = V4MulAdd(c.raXnI[2], angState0T2, contactNormalVel2);
contactNormalVel4 = V4MulAdd(c.rbXnI[2], angState1T2, contactNormalVel4);
const Vec4V normalVel = V4Add(relVel1, V4Sub(contactNormalVel2, contactNormalVel4));
Vec4V angDelta0 = V4Mul(angDelta0T0, c.raXnI[0]);
Vec4V angDelta1 = V4Mul(angDelta1T0, c.rbXnI[0]);
angDelta0 = V4MulAdd(angDelta0T1, c.raXnI[1], angDelta0);
angDelta1 = V4MulAdd(angDelta1T1, c.rbXnI[1], angDelta1);
angDelta0 = V4MulAdd(angDelta0T2, c.raXnI[2], angDelta0);
angDelta1 = V4MulAdd(angDelta1T2, c.rbXnI[2], angDelta1);
const Vec4V deltaAng = V4Sub(angDelta0, angDelta1);
const Vec4V targetVel = c.targetVelocity;
const Vec4V deltaBias = V4Sub(V4Add(deltaNormalV, deltaAng), V4Mul(targetVel, elapsedTime));
const Vec4V biasCoefficient = c.biasCoefficient;
const Vec4V sep = V4Max(minPen, V4Add(c.separation, deltaBias));
const Vec4V bias = V4Min(V4Neg(maxPenBias), V4Mul(biasCoefficient, sep));
const Vec4V velMultiplier = c.velMultiplier;
const Vec4V tVel = V4Add(bias, targetVel);
const Vec4V _deltaF = V4Max(V4Mul(V4Sub(tVel, normalVel), velMultiplier), V4Neg(appliedForce));
//Vec4V deltaF = V4NegMulSub(normalVel, c.velMultiplier, c.biasedErr);
const Vec4V newAppliedForce = V4Min(V4Add(appliedForce, _deltaF), maxImpulse);
const Vec4V deltaF = V4Sub(newAppliedForce, appliedForce);
accumDeltaF = V4Add(accumDeltaF, deltaF);
const Vec4V angDetaF0 = V4Mul(deltaF, angD0);
const Vec4V angDetaF1 = V4Mul(deltaF, angD1);
relVel1 = V4MulAdd(sumInvMass, deltaF, relVel1);
angState0T0 = V4MulAdd(c.raXnI[0], angDetaF0, angState0T0);
angState1T0 = V4NegMulSub(c.rbXnI[0], angDetaF1, angState1T0);
angState0T1 = V4MulAdd(c.raXnI[1], angDetaF0, angState0T1);
angState1T1 = V4NegMulSub(c.rbXnI[1], angDetaF1, angState1T1);
angState0T2 = V4MulAdd(c.raXnI[2], angDetaF0, angState0T2);
angState1T2 = V4NegMulSub(c.rbXnI[2], angDetaF1, angState1T2);
appliedForces[i] = newAppliedForce;
accumulatedNormalImpulse = V4Add(accumulatedNormalImpulse, newAppliedForce);
}
const Vec4V accumDeltaF_IM0 = V4Mul(accumDeltaF, invMassA);
const Vec4V accumDeltaF_IM1 = V4Mul(accumDeltaF, invMassB);
linVel0T0 = V4MulAdd(_normalT0, accumDeltaF_IM0, linVel0T0);
linVel1T0 = V4NegMulSub(_normalT0, accumDeltaF_IM1, linVel1T0);
linVel0T1 = V4MulAdd(_normalT1, accumDeltaF_IM0, linVel0T1);
linVel1T1 = V4NegMulSub(_normalT1, accumDeltaF_IM1, linVel1T1);
linVel0T2 = V4MulAdd(_normalT2, accumDeltaF_IM0, linVel0T2);
linVel1T2 = V4NegMulSub(_normalT2, accumDeltaF_IM1, linVel1T2);
if (doFriction && numFrictionConstr)
{
const Vec4V staticFric = hdr->staticFriction;
const Vec4V dynamicFric = hdr->dynamicFriction;
const Vec4V maxFrictionImpulse = V4Mul(staticFric, accumulatedNormalImpulse);
const Vec4V maxDynFrictionImpulse = V4Mul(dynamicFric, accumulatedNormalImpulse);
const Vec4V negMaxDynFrictionImpulse = V4Neg(maxDynFrictionImpulse);
//const Vec4V negMaxFrictionImpulse = V4Neg(maxFrictionImpulse);
BoolV broken = BFFFF();
for (PxU32 i = 0; i<numFrictionConstr; i++)
{
const SolverContactFrictionStepBlock& f = frictions[i];
PxU32 offset = 0;
Ps::prefetchLine(prefetchAddress, offset += 64);
Ps::prefetchLine(prefetchAddress, offset += 64);
Ps::prefetchLine(prefetchAddress, offset += 64);
Ps::prefetchLine(prefetchAddress, offset += 64);
prefetchAddress += offset;
const Vec4V appliedForce = frictionAppliedForce[i];
const Vec4V normalT0 = f.normal[0];
const Vec4V normalT1 = f.normal[1];
const Vec4V normalT2 = f.normal[2];
Vec4V normalVel1 = V4Mul(linVel0T0, normalT0);
Vec4V normalVel2 = V4Mul(f.raXnI[0], angState0T0);
Vec4V normalVel3 = V4Mul(linVel1T0, normalT0);
Vec4V normalVel4 = V4Mul(f.rbXnI[0], angState1T0);
normalVel1 = V4MulAdd(linVel0T1, normalT1, normalVel1);
normalVel2 = V4MulAdd(f.raXnI[1], angState0T1, normalVel2);
normalVel3 = V4MulAdd(linVel1T1, normalT1, normalVel3);
normalVel4 = V4MulAdd(f.rbXnI[1], angState1T1, normalVel4);
normalVel1 = V4MulAdd(linVel0T2, normalT2, normalVel1);
normalVel2 = V4MulAdd(f.raXnI[2], angState0T2, normalVel2);
normalVel3 = V4MulAdd(linVel1T2, normalT2, normalVel3);
normalVel4 = V4MulAdd(f.rbXnI[2], angState1T2, normalVel4);
const Vec4V _normalVel = V4Add(normalVel1, normalVel2);
const Vec4V __normalVel = V4Add(normalVel3, normalVel4);
// appliedForce -bias * velMultiplier - a hoisted part of the total impulse computation
const Vec4V normalVel = V4Sub(_normalVel, __normalVel);
Vec4V deltaV = V4Mul(linDeltaX, normalT0);
deltaV = V4MulAdd(linDeltaY, normalT1, deltaV);
deltaV = V4MulAdd(linDeltaZ, normalT2, deltaV);
Vec4V angDelta0 = V4Mul(angDelta0T0, f.raXnI[0]);
Vec4V angDelta1 = V4Mul(angDelta1T0, f.rbXnI[0]);
angDelta0 = V4MulAdd(angDelta0T1, f.raXnI[1], angDelta0);
angDelta1 = V4MulAdd(angDelta1T1, f.rbXnI[1], angDelta1);
angDelta0 = V4MulAdd(angDelta0T2, f.raXnI[2], angDelta0);
angDelta1 = V4MulAdd(angDelta1T2, f.rbXnI[2], angDelta1);
const Vec4V deltaAng = V4Sub(angDelta0, angDelta1);
const Vec4V deltaBias = V4Sub(V4Add(deltaV, deltaAng), V4Mul(f.targetVel, elapsedTime));
const Vec4V error = V4Add(f.error, deltaBias);
const Vec4V bias = V4Mul(error, f.biasCoefficient);
const Vec4V tmp1 = V4NegMulSub(V4Sub(bias, f.targetVel), f.velMultiplier, appliedForce);
const Vec4V totalImpulse = V4NegMulSub(normalVel, f.velMultiplier, tmp1);
const BoolV clamped = V4IsGrtr(V4Abs(totalImpulse), maxFrictionImpulse);
broken = BOr(broken, clamped);
const Vec4V newAppliedForce = V4Min(maxDynFrictionImpulse, V4Max(negMaxDynFrictionImpulse, totalImpulse));
const Vec4V deltaF = V4Sub(newAppliedForce, appliedForce);
frictionAppliedForce[i] = newAppliedForce;
const Vec4V deltaFIM0 = V4Mul(deltaF, invMassA);
const Vec4V deltaFIM1 = V4Mul(deltaF, invMassB);
const Vec4V angDetaF0 = V4Mul(deltaF, angD0);
const Vec4V angDetaF1 = V4Mul(deltaF, angD1);
linVel0T0 = V4MulAdd(normalT0, deltaFIM0, linVel0T0);
linVel1T0 = V4NegMulSub(normalT0, deltaFIM1, linVel1T0);
angState0T0 = V4MulAdd(f.raXnI[0], angDetaF0, angState0T0);
angState1T0 = V4NegMulSub(f.rbXnI[0], angDetaF1, angState1T0);
linVel0T1 = V4MulAdd(normalT1, deltaFIM0, linVel0T1);
linVel1T1 = V4NegMulSub(normalT1, deltaFIM1, linVel1T1);
angState0T1 = V4MulAdd(f.raXnI[1], angDetaF0, angState0T1);
angState1T1 = V4NegMulSub(f.rbXnI[1], angDetaF1, angState1T1);
linVel0T2 = V4MulAdd(normalT2, deltaFIM0, linVel0T2);
linVel1T2 = V4NegMulSub(normalT2, deltaFIM1, linVel1T2);
angState0T2 = V4MulAdd(f.raXnI[2], angDetaF0, angState0T2);
angState1T2 = V4NegMulSub(f.rbXnI[2], angDetaF1, angState1T2);
}
hdr->broken = broken;
}
}
PX_TRANSPOSE_34_44(linVel0T0, linVel0T1, linVel0T2, linVel00, linVel10, linVel20, linVel30);
PX_TRANSPOSE_34_44(linVel1T0, linVel1T1, linVel1T2, linVel01, linVel11, linVel21, linVel31);
PX_TRANSPOSE_34_44(angState0T0, angState0T1, angState0T2, angState00, angState10, angState20, angState30);
PX_TRANSPOSE_34_44(angState1T0, angState1T1, angState1T2, angState01, angState11, angState21, angState31);
PX_ASSERT(b00.linearVelocity.isFinite());
PX_ASSERT(b00.angularVelocity.isFinite());
PX_ASSERT(b10.linearVelocity.isFinite());
PX_ASSERT(b10.angularVelocity.isFinite());
PX_ASSERT(b20.linearVelocity.isFinite());
PX_ASSERT(b20.angularVelocity.isFinite());
PX_ASSERT(b30.linearVelocity.isFinite());
PX_ASSERT(b30.angularVelocity.isFinite());
PX_ASSERT(b01.linearVelocity.isFinite());
PX_ASSERT(b01.angularVelocity.isFinite());
PX_ASSERT(b11.linearVelocity.isFinite());
PX_ASSERT(b11.angularVelocity.isFinite());
PX_ASSERT(b21.linearVelocity.isFinite());
PX_ASSERT(b21.angularVelocity.isFinite());
PX_ASSERT(b31.linearVelocity.isFinite());
PX_ASSERT(b31.angularVelocity.isFinite());
// Write back
V4StoreA(linVel00, &b00.linearVelocity.x);
V4StoreA(angState00, &b00.angularVelocity.x);
V4StoreA(linVel10, &b10.linearVelocity.x);
V4StoreA(angState10, &b10.angularVelocity.x);
V4StoreA(linVel20, &b20.linearVelocity.x);
V4StoreA(angState20, &b20.angularVelocity.x);
V4StoreA(linVel30, &b30.linearVelocity.x);
V4StoreA(angState30, &b30.angularVelocity.x);
if (desc[0].bodyBDataIndex != 0)
{
V4StoreA(linVel01, &b01.linearVelocity.x);
V4StoreA(angState01, &b01.angularVelocity.x);
}
if (desc[1].bodyBDataIndex != 0)
{
V4StoreA(linVel11, &b11.linearVelocity.x);
V4StoreA(angState11, &b11.angularVelocity.x);
}
if (desc[2].bodyBDataIndex != 0)
{
V4StoreA(linVel21, &b21.linearVelocity.x);
V4StoreA(angState21, &b21.angularVelocity.x);
}
if (desc[3].bodyBDataIndex != 0)
{
V4StoreA(linVel31, &b31.linearVelocity.x);
V4StoreA(angState31, &b31.angularVelocity.x);
}
PX_ASSERT(b00.linearVelocity.isFinite());
PX_ASSERT(b00.angularVelocity.isFinite());
PX_ASSERT(b10.linearVelocity.isFinite());
PX_ASSERT(b10.angularVelocity.isFinite());
PX_ASSERT(b20.linearVelocity.isFinite());
PX_ASSERT(b20.angularVelocity.isFinite());
PX_ASSERT(b30.linearVelocity.isFinite());
PX_ASSERT(b30.angularVelocity.isFinite());
PX_ASSERT(b01.linearVelocity.isFinite());
PX_ASSERT(b01.angularVelocity.isFinite());
PX_ASSERT(b11.linearVelocity.isFinite());
PX_ASSERT(b11.angularVelocity.isFinite());
PX_ASSERT(b21.linearVelocity.isFinite());
PX_ASSERT(b21.angularVelocity.isFinite());
PX_ASSERT(b31.linearVelocity.isFinite());
PX_ASSERT(b31.angularVelocity.isFinite());
}
void writeBackContact4_Block(const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext* cache)
{
PX_UNUSED(cache);
const PxU8* PX_RESTRICT last = desc[0].constraint + getConstraintLength(desc[0]);
//hopefully pointer aliasing doesn't bite.
PxU8* PX_RESTRICT currPtr = desc[0].constraint;
PxReal* PX_RESTRICT vForceWriteback0 = reinterpret_cast<PxReal*>(desc[0].writeBack);
PxReal* PX_RESTRICT vForceWriteback1 = reinterpret_cast<PxReal*>(desc[1].writeBack);
PxReal* PX_RESTRICT vForceWriteback2 = reinterpret_cast<PxReal*>(desc[2].writeBack);
PxReal* PX_RESTRICT vForceWriteback3 = reinterpret_cast<PxReal*>(desc[3].writeBack);
//const PxU8 type = *desc[0].constraint;
const PxU32 contactSize = sizeof(SolverContactPointStepBlock);
const PxU32 frictionSize = sizeof(SolverContactFrictionStepBlock);
Vec4V normalForce = V4Zero();
//We'll need this.
//const Vec4V vZero = V4Zero();
bool writeBackThresholds[4] = { false, false, false, false };
while ((currPtr < last))
{
SolverContactHeaderStepBlock* PX_RESTRICT hdr = reinterpret_cast<SolverContactHeaderStepBlock*>(currPtr);
currPtr = reinterpret_cast<PxU8*>(hdr + 1);
const PxU32 numNormalConstr = hdr->numNormalConstr;
const PxU32 numFrictionConstr = hdr->numFrictionConstr;
Vec4V* PX_RESTRICT appliedForces = reinterpret_cast<Vec4V*>(currPtr);
currPtr += sizeof(Vec4V)*numNormalConstr;
//SolverContactBatchPointBase4* PX_RESTRICT contacts = (SolverContactBatchPointBase4*)currPtr;
currPtr += (numNormalConstr * contactSize);
bool hasMaxImpulse = (hdr->flag & SolverContactHeader4::eHAS_MAX_IMPULSE) != 0;
if (hasMaxImpulse)
currPtr += sizeof(Vec4V) * numNormalConstr;
currPtr += sizeof(Vec4V)*numFrictionConstr;
//SolverContactFrictionBase4* PX_RESTRICT frictions = (SolverContactFrictionBase4*)currPtr;
currPtr += (numFrictionConstr * frictionSize);
writeBackThresholds[0] = hdr->flags[0] & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
writeBackThresholds[1] = hdr->flags[1] & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
writeBackThresholds[2] = hdr->flags[2] & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
writeBackThresholds[3] = hdr->flags[3] & SolverContactHeader::eHAS_FORCE_THRESHOLDS;
for (PxU32 i = 0; i<numNormalConstr; i++)
{
//contacts = (SolverContactBatchPointBase4*)(((PxU8*)contacts) + contactSize);
const FloatV appliedForce0 = V4GetX(appliedForces[i]);
const FloatV appliedForce1 = V4GetY(appliedForces[i]);
const FloatV appliedForce2 = V4GetZ(appliedForces[i]);
const FloatV appliedForce3 = V4GetW(appliedForces[i]);
normalForce = V4Add(normalForce, appliedForces[i]);
if (vForceWriteback0 && i < hdr->numNormalConstrs[0])
FStore(appliedForce0, vForceWriteback0++);
if (vForceWriteback1 && i < hdr->numNormalConstrs[1])
FStore(appliedForce1, vForceWriteback1++);
if (vForceWriteback2 && i < hdr->numNormalConstrs[2])
FStore(appliedForce2, vForceWriteback2++);
if (vForceWriteback3 && i < hdr->numNormalConstrs[3])
FStore(appliedForce3, vForceWriteback3++);
}
if (numFrictionConstr)
{
PX_ALIGN(16, PxU32 broken[4]);
BStoreA(hdr->broken, broken);
PxU8* frictionCounts = hdr->numNormalConstrs;
for (PxU32 a = 0; a < 4; ++a)
{
if (frictionCounts[a] && broken[a])
*hdr->frictionBrokenWritebackByte[a] = 1; // PT: bad L2 miss here
}
}
}
PX_UNUSED(writeBackThresholds);
#if 0
if (cache)
{
PX_ALIGN(16, PxReal nf[4]);
V4StoreA(normalForce, nf);
Sc::ShapeInteraction** shapeInteractions = reinterpret_cast<SolverContactHeader4*>(desc[0].constraint)->shapeInteraction;
for (PxU32 a = 0; a < 4; ++a)
{
if (writeBackThresholds[a] && desc[a].linkIndexA == PxSolverConstraintDesc::NO_LINK && desc[a].linkIndexB == PxSolverConstraintDesc::NO_LINK &&
nf[a] != 0.f && (bd0[a]->reportThreshold < PX_MAX_REAL || bd1[a]->reportThreshold < PX_MAX_REAL))
{
ThresholdStreamElement elt;
elt.normalForce = nf[a];
elt.threshold = PxMin<float>(bd0[a]->reportThreshold, bd1[a]->reportThreshold);
elt.nodeIndexA = bd0[a]->nodeIndex;
elt.nodeIndexB = bd1[a]->nodeIndex;
elt.shapeInteraction = shapeInteractions[a];
Ps::order(elt.nodeIndexA, elt.nodeIndexB);
PX_ASSERT(elt.nodeIndexA < elt.nodeIndexB);
PX_ASSERT(cache.mThresholdStreamIndex < cache.mThresholdStreamLength);
cache.mThresholdStream[cache.mThresholdStreamIndex++] = elt;
}
}
}
#endif
}
void solveContact4(const PxConstraintBatchHeader& hdr, const PxSolverConstraintDesc* PX_RESTRICT desc, const PxTGSSolverBodyTxInertia* const /*txInertias*/,
const PxReal minPenetration, const PxReal elapsedTime, SolverContext& /*cache*/)
{
solveContact4_Block(desc + hdr.startIndex, true, minPenetration, elapsedTime);
}
void writeBackContact4(const PxConstraintBatchHeader& hdr, const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext* cache)
{
writeBackContact4_Block(desc + hdr.startIndex, cache);
}
PX_FORCE_INLINE Vec4V V4Dot3(const Vec4V& x0, const Vec4V& y0, const Vec4V& z0, const Vec4V& x1, const Vec4V& y1, const Vec4V& z1)
{
return V4MulAdd(x0, x1, V4MulAdd(y0, y1, V4Mul(z0, z1)));
}
void solve1DStep4(const PxSolverConstraintDesc* PX_RESTRICT desc, const PxTGSSolverBodyTxInertia* const txInertias,
const PxReal elapsedTimeF32)
{
PxU8* PX_RESTRICT bPtr = desc->constraint;
if (bPtr == NULL)
return;
const FloatV elapsedTime = FLoad(elapsedTimeF32);
PxTGSSolverBodyVel& b00 = *desc[0].tgsBodyA;
PxTGSSolverBodyVel& b01 = *desc[0].tgsBodyB;
PxTGSSolverBodyVel& b10 = *desc[1].tgsBodyA;
PxTGSSolverBodyVel& b11 = *desc[1].tgsBodyB;
PxTGSSolverBodyVel& b20 = *desc[2].tgsBodyA;
PxTGSSolverBodyVel& b21 = *desc[2].tgsBodyB;
PxTGSSolverBodyVel& b30 = *desc[3].tgsBodyA;
PxTGSSolverBodyVel& b31 = *desc[3].tgsBodyB;
const PxTGSSolverBodyTxInertia& txI00 = txInertias[desc[0].bodyADataIndex];
const PxTGSSolverBodyTxInertia& txI01 = txInertias[desc[0].bodyBDataIndex];
const PxTGSSolverBodyTxInertia& txI10 = txInertias[desc[1].bodyADataIndex];
const PxTGSSolverBodyTxInertia& txI11 = txInertias[desc[1].bodyBDataIndex];
const PxTGSSolverBodyTxInertia& txI20 = txInertias[desc[2].bodyADataIndex];
const PxTGSSolverBodyTxInertia& txI21 = txInertias[desc[2].bodyBDataIndex];
const PxTGSSolverBodyTxInertia& txI30 = txInertias[desc[3].bodyADataIndex];
const PxTGSSolverBodyTxInertia& txI31 = txInertias[desc[3].bodyBDataIndex];
Vec4V linVel00 = V4LoadA(&b00.linearVelocity.x);
Vec4V linVel01 = V4LoadA(&b01.linearVelocity.x);
Vec4V angState00 = V4LoadA(&b00.angularVelocity.x);
Vec4V angState01 = V4LoadA(&b01.angularVelocity.x);
Vec4V linVel10 = V4LoadA(&b10.linearVelocity.x);
Vec4V linVel11 = V4LoadA(&b11.linearVelocity.x);
Vec4V angState10 = V4LoadA(&b10.angularVelocity.x);
Vec4V angState11 = V4LoadA(&b11.angularVelocity.x);
Vec4V linVel20 = V4LoadA(&b20.linearVelocity.x);
Vec4V linVel21 = V4LoadA(&b21.linearVelocity.x);
Vec4V angState20 = V4LoadA(&b20.angularVelocity.x);
Vec4V angState21 = V4LoadA(&b21.angularVelocity.x);
Vec4V linVel30 = V4LoadA(&b30.linearVelocity.x);
Vec4V linVel31 = V4LoadA(&b31.linearVelocity.x);
Vec4V angState30 = V4LoadA(&b30.angularVelocity.x);
Vec4V angState31 = V4LoadA(&b31.angularVelocity.x);
Vec4V linVel0T0, linVel0T1, linVel0T2;
Vec4V linVel1T0, linVel1T1, linVel1T2;
Vec4V angState0T0, angState0T1, angState0T2;
Vec4V angState1T0, angState1T1, angState1T2;
PX_TRANSPOSE_44_34(linVel00, linVel10, linVel20, linVel30, linVel0T0, linVel0T1, linVel0T2);
PX_TRANSPOSE_44_34(linVel01, linVel11, linVel21, linVel31, linVel1T0, linVel1T1, linVel1T2);
PX_TRANSPOSE_44_34(angState00, angState10, angState20, angState30, angState0T0, angState0T1, angState0T2);
PX_TRANSPOSE_44_34(angState01, angState11, angState21, angState31, angState1T0, angState1T1, angState1T2);
Vec4V linDelta00 = V4LoadA(&b00.deltaLinDt.x);
Vec4V linDelta01 = V4LoadA(&b01.deltaLinDt.x);
Vec4V angDelta00 = V4LoadA(&b00.deltaAngDt.x);
Vec4V angDelta01 = V4LoadA(&b01.deltaAngDt.x);
Vec4V linDelta10 = V4LoadA(&b10.deltaLinDt.x);
Vec4V linDelta11 = V4LoadA(&b11.deltaLinDt.x);
Vec4V angDelta10 = V4LoadA(&b10.deltaAngDt.x);
Vec4V angDelta11 = V4LoadA(&b11.deltaAngDt.x);
Vec4V linDelta20 = V4LoadA(&b20.deltaLinDt.x);
Vec4V linDelta21 = V4LoadA(&b21.deltaLinDt.x);
Vec4V angDelta20 = V4LoadA(&b20.deltaAngDt.x);
Vec4V angDelta21 = V4LoadA(&b21.deltaAngDt.x);
Vec4V linDelta30 = V4LoadA(&b30.deltaLinDt.x);
Vec4V linDelta31 = V4LoadA(&b31.deltaLinDt.x);
Vec4V angDelta30 = V4LoadA(&b30.deltaAngDt.x);
Vec4V angDelta31 = V4LoadA(&b31.deltaAngDt.x);
Vec4V linDelta0T0, linDelta0T1, linDelta0T2;
Vec4V linDelta1T0, linDelta1T1, linDelta1T2;
Vec4V angDelta0T0, angDelta0T1, angDelta0T2;
Vec4V angDelta1T0, angDelta1T1, angDelta1T2;
PX_TRANSPOSE_44_34(linDelta00, linDelta10, linDelta20, linDelta30, linDelta0T0, linDelta0T1, linDelta0T2);
PX_TRANSPOSE_44_34(linDelta01, linDelta11, linDelta21, linDelta31, linDelta1T0, linDelta1T1, linDelta1T2);
PX_TRANSPOSE_44_34(angDelta00, angDelta10, angDelta20, angDelta30, angDelta0T0, angDelta0T1, angDelta0T2);
PX_TRANSPOSE_44_34(angDelta01, angDelta11, angDelta21, angDelta31, angDelta1T0, angDelta1T1, angDelta1T2);
const SolverConstraint1DHeaderStep4* PX_RESTRICT header = reinterpret_cast<const SolverConstraint1DHeaderStep4*>(bPtr);
SolverConstraint1DStep4* PX_RESTRICT base = reinterpret_cast<SolverConstraint1DStep4*>(bPtr + sizeof(SolverConstraint1DHeaderStep4));
Vec4V invInertia00X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI00.sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia00Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI00.sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia00Z = Vec4V_From_Vec3V(V3LoadU(txI00.sqrtInvInertia.column2));
Vec4V invInertia10X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI10.sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia10Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI10.sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia10Z = Vec4V_From_Vec3V(V3LoadU(txI10.sqrtInvInertia.column2));
Vec4V invInertia20X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI20.sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia20Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI20.sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia20Z = Vec4V_From_Vec3V(V3LoadU(txI20.sqrtInvInertia.column2));
Vec4V invInertia30X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI30.sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia30Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI30.sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia30Z = Vec4V_From_Vec3V(V3LoadU(txI30.sqrtInvInertia.column2));
Vec4V invInertia01X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI01.sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia01Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI01.sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia01Z = Vec4V_From_Vec3V(V3LoadU(txI01.sqrtInvInertia.column2));
Vec4V invInertia11X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI11.sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia11Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI11.sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia11Z = Vec4V_From_Vec3V(V3LoadU(txI11.sqrtInvInertia.column2));
Vec4V invInertia21X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI21.sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia21Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI21.sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia21Z = Vec4V_From_Vec3V(V3LoadU(txI21.sqrtInvInertia.column2));
Vec4V invInertia31X = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI31.sqrtInvInertia.column0)); // PT: safe because 'column1' follows 'column0' in PxMat33
Vec4V invInertia31Y = Vec4V_From_Vec3V(V3LoadU_SafeReadW(txI31.sqrtInvInertia.column1)); // PT: safe because 'column2' follows 'column1' in PxMat33
Vec4V invInertia31Z = Vec4V_From_Vec3V(V3LoadU(txI31.sqrtInvInertia.column2));
Vec4V invInertia0X0, invInertia0X1, invInertia0X2;
Vec4V invInertia0Y0, invInertia0Y1, invInertia0Y2;
Vec4V invInertia0Z0, invInertia0Z1, invInertia0Z2;
Vec4V invInertia1X0, invInertia1X1, invInertia1X2;
Vec4V invInertia1Y0, invInertia1Y1, invInertia1Y2;
Vec4V invInertia1Z0, invInertia1Z1, invInertia1Z2;
PX_TRANSPOSE_44_34(invInertia00X, invInertia10X, invInertia20X, invInertia30X, invInertia0X0, invInertia0Y0, invInertia0Z0);
PX_TRANSPOSE_44_34(invInertia00Y, invInertia10Y, invInertia20Y, invInertia30Y, invInertia0X1, invInertia0Y1, invInertia0Z1);
PX_TRANSPOSE_44_34(invInertia00Z, invInertia10Z, invInertia20Z, invInertia30Z, invInertia0X2, invInertia0Y2, invInertia0Z2);
PX_TRANSPOSE_44_34(invInertia01X, invInertia11X, invInertia21X, invInertia31X, invInertia1X0, invInertia1Y0, invInertia1Z0);
PX_TRANSPOSE_44_34(invInertia01Y, invInertia11Y, invInertia21Y, invInertia31Y, invInertia1X1, invInertia1Y1, invInertia1Z1);
PX_TRANSPOSE_44_34(invInertia01Z, invInertia11Z, invInertia21Z, invInertia31Z, invInertia1X2, invInertia1Y2, invInertia1Z2);
const Vec4V invInertiaScale0 = header->angD0;
const Vec4V invInertiaScale1 = header->angD1;
//KS - todo - load this a bit quicker...
Vec4V rot00 = V4LoadA(&txI00.deltaBody2World.q.x);
Vec4V rot01 = V4LoadA(&txI01.deltaBody2World.q.x);
Vec4V rot10 = V4LoadA(&txI10.deltaBody2World.q.x);
Vec4V rot11 = V4LoadA(&txI11.deltaBody2World.q.x);
Vec4V rot20 = V4LoadA(&txI20.deltaBody2World.q.x);
Vec4V rot21 = V4LoadA(&txI21.deltaBody2World.q.x);
Vec4V rot30 = V4LoadA(&txI30.deltaBody2World.q.x);
Vec4V rot31 = V4LoadA(&txI31.deltaBody2World.q.x);
Vec4V rot0X, rot0Y, rot0Z, rot0W;
Vec4V rot1X, rot1Y, rot1Z, rot1W;
PX_TRANSPOSE_44(rot00, rot10, rot20, rot30, rot0X, rot0Y, rot0Z, rot0W);
PX_TRANSPOSE_44(rot01, rot11, rot21, rot31, rot1X, rot1Y, rot1Z, rot1W);
Vec4V raX, raY, raZ;
Vec4V rbX, rbY, rbZ;
QuatRotate4(rot0X, rot0Y, rot0Z, rot0W, header->rAWorld[0], header->rAWorld[1], header->rAWorld[2], raX, raY, raZ);
QuatRotate4(rot1X, rot1Y, rot1Z, rot1W, header->rBWorld[0], header->rBWorld[1], header->rBWorld[2], rbX, rbY, rbZ);
const Vec4V raMotionX = V4Sub(V4Add(raX, linDelta0T0), header->rAWorld[0]);
const Vec4V raMotionY = V4Sub(V4Add(raY, linDelta0T1), header->rAWorld[1]);
const Vec4V raMotionZ = V4Sub(V4Add(raZ, linDelta0T2), header->rAWorld[2]);
const Vec4V rbMotionX = V4Sub(V4Add(rbX, linDelta1T0), header->rBWorld[0]);
const Vec4V rbMotionY = V4Sub(V4Add(rbY, linDelta1T1), header->rBWorld[1]);
const Vec4V rbMotionZ = V4Sub(V4Add(rbZ, linDelta1T2), header->rBWorld[2]);
const Vec4V mass0 = header->invMass0D0;
const Vec4V mass1 = header->invMass1D1;
const VecI32V orthoMask = I4Load(DY_SC_FLAG_ORTHO_TARGET);
const VecI32V limitMask = I4Load(DY_SC_FLAG_INEQUALITY);
const Vec4V zero = V4Zero();
const Vec4V one = V4One();
Vec4V error0 = V4Add(header->angOrthoError[0],
V4Sub(V4Dot3(header->angOrthoAxis0X[0], header->angOrthoAxis0Y[0], header->angOrthoAxis0Z[0], angDelta0T0, angDelta0T1, angDelta0T2),
V4Dot3(header->angOrthoAxis1X[0], header->angOrthoAxis1Y[0], header->angOrthoAxis1Z[0], angDelta1T0, angDelta1T1, angDelta1T2)));
Vec4V error1 = V4Add(header->angOrthoError[1],
V4Sub(V4Dot3(header->angOrthoAxis0X[1], header->angOrthoAxis0Y[1], header->angOrthoAxis0Z[1], angDelta0T0, angDelta0T1, angDelta0T2),
V4Dot3(header->angOrthoAxis1X[1], header->angOrthoAxis1Y[1], header->angOrthoAxis1Z[1], angDelta1T0, angDelta1T1, angDelta1T2)));
Vec4V error2 = V4Add(header->angOrthoError[2],
V4Sub(V4Dot3(header->angOrthoAxis0X[2], header->angOrthoAxis0Y[2], header->angOrthoAxis0Z[2], angDelta0T0, angDelta0T1, angDelta0T2),
V4Dot3(header->angOrthoAxis1X[2], header->angOrthoAxis1Y[2], header->angOrthoAxis1Z[2], angDelta1T0, angDelta1T1, angDelta1T2)));
for (PxU32 i = 0; i<header->count; ++i, base++)
{
Ps::prefetchLine(base + 1);
SolverConstraint1DStep4& c = *base;
const Vec4V cangVel0X = V4Add(c.ang0[0], V4NegMulSub(raZ, c.lin0[1], V4Mul(raY, c.lin0[2])));
const Vec4V cangVel0Y = V4Add(c.ang0[1], V4NegMulSub(raX, c.lin0[2], V4Mul(raZ, c.lin0[0])));
const Vec4V cangVel0Z = V4Add(c.ang0[2], V4NegMulSub(raY, c.lin0[0], V4Mul(raX, c.lin0[1])));
const Vec4V cangVel1X = V4Add(c.ang1[0], V4NegMulSub(rbZ, c.lin1[1], V4Mul(rbY, c.lin1[2])));
const Vec4V cangVel1Y = V4Add(c.ang1[1], V4NegMulSub(rbX, c.lin1[2], V4Mul(rbZ, c.lin1[0])));
const Vec4V cangVel1Z = V4Add(c.ang1[2], V4NegMulSub(rbY, c.lin1[0], V4Mul(rbX, c.lin1[1])));
const VecI32V flags = I4LoadA(reinterpret_cast<PxI32*>(c.flags));
const BoolV useOrtho = VecI32V_IsEq(VecI32V_And(flags, orthoMask), orthoMask);
const Vec4V angOrthoCoefficient = V4Sel(useOrtho, one, zero);
Vec4V delAngVel0X = V4Mul(invInertia0X0, cangVel0X);
Vec4V delAngVel0Y = V4Mul(invInertia0X1, cangVel0X);
Vec4V delAngVel0Z = V4Mul(invInertia0X2, cangVel0X);
delAngVel0X = V4MulAdd(invInertia0Y0, cangVel0Y, delAngVel0X);
delAngVel0Y = V4MulAdd(invInertia0Y1, cangVel0Y, delAngVel0Y);
delAngVel0Z = V4MulAdd(invInertia0Y2, cangVel0Y, delAngVel0Z);
delAngVel0X = V4MulAdd(invInertia0Z0, cangVel0Z, delAngVel0X);
delAngVel0Y = V4MulAdd(invInertia0Z1, cangVel0Z, delAngVel0Y);
delAngVel0Z = V4MulAdd(invInertia0Z2, cangVel0Z, delAngVel0Z);
Vec4V delAngVel1X = V4Mul(invInertia1X0, cangVel1X);
Vec4V delAngVel1Y = V4Mul(invInertia1X1, cangVel1X);
Vec4V delAngVel1Z = V4Mul(invInertia1X2, cangVel1X);
delAngVel1X = V4MulAdd(invInertia1Y0, cangVel1Y, delAngVel1X);
delAngVel1Y = V4MulAdd(invInertia1Y1, cangVel1Y, delAngVel1Y);
delAngVel1Z = V4MulAdd(invInertia1Y2, cangVel1Y, delAngVel1Z);
delAngVel1X = V4MulAdd(invInertia1Z0, cangVel1Z, delAngVel1X);
delAngVel1Y = V4MulAdd(invInertia1Z1, cangVel1Z, delAngVel1Y);
delAngVel1Z = V4MulAdd(invInertia1Z2, cangVel1Z, delAngVel1Z);
Vec4V err = c.error;
{
const Vec4V proj0 = V4Mul(V4MulAdd(header->angOrthoAxis0X[0], delAngVel0X, V4MulAdd(header->angOrthoAxis0Y[0], delAngVel0Y,
V4MulAdd(header->angOrthoAxis0Z[0], delAngVel0Z, V4MulAdd(header->angOrthoAxis1X[0], delAngVel1X,
V4MulAdd(header->angOrthoAxis1Y[0], delAngVel1Y, V4Mul(header->angOrthoAxis1Z[0], delAngVel1Z)))))), header->angOrthoRecipResponse[0]);
const Vec4V proj1 = V4Mul(V4MulAdd(header->angOrthoAxis0X[1], delAngVel0X, V4MulAdd(header->angOrthoAxis0Y[1], delAngVel0Y,
V4MulAdd(header->angOrthoAxis0Z[1], delAngVel0Z, V4MulAdd(header->angOrthoAxis1X[1], delAngVel1X,
V4MulAdd(header->angOrthoAxis1Y[1], delAngVel1Y, V4Mul(header->angOrthoAxis1Z[1], delAngVel1Z)))))), header->angOrthoRecipResponse[1]);
const Vec4V proj2 = V4Mul(V4MulAdd(header->angOrthoAxis0X[2], delAngVel0X, V4MulAdd(header->angOrthoAxis0Y[2], delAngVel0Y,
V4MulAdd(header->angOrthoAxis0Z[2], delAngVel0Z, V4MulAdd(header->angOrthoAxis1X[2], delAngVel1X,
V4MulAdd(header->angOrthoAxis1Y[2], delAngVel1Y, V4Mul(header->angOrthoAxis1Z[2], delAngVel1Z)))))), header->angOrthoRecipResponse[2]);
const Vec4V delta0X = V4MulAdd(header->angOrthoAxis0X[0], proj0, V4MulAdd(header->angOrthoAxis0X[1], proj1, V4Mul(header->angOrthoAxis0X[2], proj2)));
const Vec4V delta0Y = V4MulAdd(header->angOrthoAxis0Y[0], proj0, V4MulAdd(header->angOrthoAxis0Y[1], proj1, V4Mul(header->angOrthoAxis0Y[2], proj2)));
const Vec4V delta0Z = V4MulAdd(header->angOrthoAxis0Z[0], proj0, V4MulAdd(header->angOrthoAxis0Z[1], proj1, V4Mul(header->angOrthoAxis0Z[2], proj2)));
const Vec4V delta1X = V4MulAdd(header->angOrthoAxis1X[0], proj0, V4MulAdd(header->angOrthoAxis1X[1], proj1, V4Mul(header->angOrthoAxis1X[2], proj2)));
const Vec4V delta1Y = V4MulAdd(header->angOrthoAxis1Y[0], proj0, V4MulAdd(header->angOrthoAxis1Y[1], proj1, V4Mul(header->angOrthoAxis1Y[2], proj2)));
const Vec4V delta1Z = V4MulAdd(header->angOrthoAxis1Z[0], proj0, V4MulAdd(header->angOrthoAxis1Z[1], proj1, V4Mul(header->angOrthoAxis1Z[2], proj2)));
delAngVel0X = V4NegMulSub(delta0X, angOrthoCoefficient, delAngVel0X);
delAngVel0Y = V4NegMulSub(delta0Y, angOrthoCoefficient, delAngVel0Y);
delAngVel0Z = V4NegMulSub(delta0Z, angOrthoCoefficient, delAngVel0Z);
delAngVel1X = V4NegMulSub(delta1X, angOrthoCoefficient, delAngVel1X);
delAngVel1Y = V4NegMulSub(delta1Y, angOrthoCoefficient, delAngVel1Y);
delAngVel1Z = V4NegMulSub(delta1Z, angOrthoCoefficient, delAngVel1Z);
err = V4Sub(err, V4Mul(V4MulAdd(error0, proj0, V4MulAdd(error1, proj1, V4Mul(error2, proj2))), angOrthoCoefficient));
}
Vec4V ang0IX = V4Mul(invInertia0X0, delAngVel0X);
Vec4V ang0IY = V4Mul(invInertia0X1, delAngVel0X);
Vec4V ang0IZ = V4Mul(invInertia0X2, delAngVel0X);
ang0IX = V4MulAdd(invInertia0Y0, delAngVel0Y, ang0IX);
ang0IY = V4MulAdd(invInertia0Y1, delAngVel0Y, ang0IY);
ang0IZ = V4MulAdd(invInertia0Y2, delAngVel0Y, ang0IZ);
ang0IX = V4MulAdd(invInertia0Z0, delAngVel0Z, ang0IX);
ang0IY = V4MulAdd(invInertia0Z1, delAngVel0Z, ang0IY);
ang0IZ = V4MulAdd(invInertia0Z2, delAngVel0Z, ang0IZ);
Vec4V ang1IX = V4Mul(invInertia1X0, delAngVel1X);
Vec4V ang1IY = V4Mul(invInertia1X1, delAngVel1X);
Vec4V ang1IZ = V4Mul(invInertia1X2, delAngVel1X);
ang1IX = V4MulAdd(invInertia1Y0, delAngVel1Y, ang1IX);
ang1IY = V4MulAdd(invInertia1Y1, delAngVel1Y, ang1IY);
ang1IZ = V4MulAdd(invInertia1Y2, delAngVel1Y, ang1IZ);
ang1IX = V4MulAdd(invInertia1Z0, delAngVel1Z, ang1IX);
ang1IY = V4MulAdd(invInertia1Z1, delAngVel1Z, ang1IY);
ang1IZ = V4MulAdd(invInertia1Z2, delAngVel1Z, ang1IZ);
const Vec4V clinVel0X = c.lin0[0];
const Vec4V clinVel0Y = c.lin0[1];
const Vec4V clinVel0Z = c.lin0[2];
const Vec4V clinVel1X = c.lin1[0];
const Vec4V clinVel1Y = c.lin1[1];
const Vec4V clinVel1Z = c.lin1[2];
const Vec4V clinVel0X_ = c.lin0[0];
const Vec4V clinVel0Y_ = c.lin0[1];
const Vec4V clinVel0Z_ = c.lin0[2];
const Vec4V clinVel1X_ = c.lin1[0];
const Vec4V clinVel1Y_ = c.lin1[1];
const Vec4V clinVel1Z_ = c.lin1[2];
//KS - compute raXnI and effective mass. Unfortunately, the joints are noticeably less stable if we don't do this each
//iteration. It's skippable. If we do that, there's no need for the invInertiaTensors
const Vec4V dotDelAngVel0 = V4MulAdd(delAngVel0X, delAngVel0X, V4MulAdd(delAngVel0Y, delAngVel0Y, V4Mul(delAngVel0Z, delAngVel0Z)));
const Vec4V dotDelAngVel1 = V4MulAdd(delAngVel1X, delAngVel1X, V4MulAdd(delAngVel1Y, delAngVel1Y, V4Mul(delAngVel1Z, delAngVel1Z)));
const Vec4V dotRbXnAngDelta0 = V4MulAdd(delAngVel0X, angDelta0T0, V4MulAdd(delAngVel0Y, angDelta0T1, V4Mul(delAngVel0Z, angDelta0T2)));
const Vec4V dotRbXnAngDelta1 = V4MulAdd(delAngVel1X, angDelta1T0, V4MulAdd(delAngVel1Y, angDelta1T1, V4Mul(delAngVel1Z, angDelta1T2)));
const Vec4V dotRaMotion = V4MulAdd(clinVel0X_, raMotionX, V4MulAdd(clinVel0Y_, raMotionY, V4Mul(clinVel0Z_, raMotionZ)));
const Vec4V dotRbMotion = V4MulAdd(clinVel1X_, rbMotionX, V4MulAdd(clinVel1Y_, rbMotionY, V4Mul(clinVel1Z_, rbMotionZ)));
const Vec4V deltaAng = V4Mul(c.angularErrorScale, V4Sub(dotRbXnAngDelta0, dotRbXnAngDelta1));
const Vec4V error = V4NegScaleSub(c.velTarget, elapsedTime, V4Add(V4Add(err, V4Sub(dotRaMotion, dotRbMotion)), deltaAng));
const Vec4V dotClinVel0 = V4MulAdd(clinVel0X, clinVel0X, V4MulAdd(clinVel0Y, clinVel0Y, V4Mul(clinVel0Z, clinVel0Z)));
const Vec4V dotClinVel1 = V4MulAdd(clinVel1X, clinVel1X, V4MulAdd(clinVel1Y, clinVel1Y, V4Mul(clinVel1Z, clinVel1Z)));
const Vec4V resp0 = V4MulAdd(mass0, dotClinVel0, V4Mul(invInertiaScale0, dotDelAngVel0));
const Vec4V resp1 = V4MulAdd(mass1, dotClinVel1, V4Mul(invInertiaScale1, dotDelAngVel1));
const Vec4V response = V4Add(resp0, resp1);
const Vec4V recipResponse = V4Sel(V4IsGrtr(response, V4Zero()), V4Recip(response), V4Zero());
const Vec4V vMul = V4Mul(recipResponse, c.velMultiplier);
const BoolV isLimitConstraint = VecI32V_IsEq(VecI32V_And(flags, limitMask), limitMask);
const Vec4V minBias = V4Sel(isLimitConstraint, V4Neg(Vec4V_From_FloatV(FMax())), V4Neg(c.maxBias));
const Vec4V unclampedBias = V4Mul(error, c.biasScale);
const Vec4V bias = V4Clamp(unclampedBias, minBias, c.maxBias);
const Vec4V constant = V4Mul(recipResponse, V4Add(bias, c.velTarget));
const Vec4V normalVel0 = V4MulAdd(clinVel0X_, linVel0T0, V4MulAdd(clinVel0Y_, linVel0T1, V4Mul(clinVel0Z_, linVel0T2)));
const Vec4V normalVel1 = V4MulAdd(clinVel1X_, linVel1T0, V4MulAdd(clinVel1Y_, linVel1T1, V4Mul(clinVel1Z_, linVel1T2)));
const Vec4V angVel0 = V4MulAdd(delAngVel0X, angState0T0, V4MulAdd(delAngVel0Y, angState0T1, V4Mul(delAngVel0Z, angState0T2)));
const Vec4V angVel1 = V4MulAdd(delAngVel1X, angState1T0, V4MulAdd(delAngVel1Y, angState1T1, V4Mul(delAngVel1Z, angState1T2)));
const Vec4V normalVel = V4Add(V4Sub(normalVel0, normalVel1), V4Sub(angVel0, angVel1));
const Vec4V unclampedForce = V4MulAdd(c.impulseMultiplier, c.appliedForce, V4MulAdd(vMul, normalVel, constant));
const Vec4V clampedForce = V4Clamp(unclampedForce, c.minImpulse, c.maxImpulse);
const Vec4V deltaF = V4Sub(clampedForce, c.appliedForce);
c.appliedForce = clampedForce;
const Vec4V deltaFIM0 = V4Mul(deltaF, mass0);
const Vec4V deltaFIM1 = V4Mul(deltaF, mass1);
const Vec4V angDetaF0 = V4Mul(deltaF, invInertiaScale0);
const Vec4V angDetaF1 = V4Mul(deltaF, invInertiaScale1);
linVel0T0 = V4MulAdd(clinVel0X_, deltaFIM0, linVel0T0);
linVel1T0 = V4NegMulSub(clinVel1X_, deltaFIM1, linVel1T0);
angState0T0 = V4MulAdd(delAngVel0X, angDetaF0, angState0T0);
angState1T0 = V4NegMulSub(delAngVel1X, angDetaF1, angState1T0);
linVel0T1 = V4MulAdd(clinVel0Y_, deltaFIM0, linVel0T1);
linVel1T1 = V4NegMulSub(clinVel1Y_, deltaFIM1, linVel1T1);
angState0T1 = V4MulAdd(delAngVel0Y, angDetaF0, angState0T1);
angState1T1 = V4NegMulSub(delAngVel1Y, angDetaF1, angState1T1);
linVel0T2 = V4MulAdd(clinVel0Z_, deltaFIM0, linVel0T2);
linVel1T2 = V4NegMulSub(clinVel1Z_, deltaFIM1, linVel1T2);
angState0T2 = V4MulAdd(delAngVel0Z, angDetaF0, angState0T2);
angState1T2 = V4NegMulSub(delAngVel1Z, angDetaF1, angState1T2);
}
PX_TRANSPOSE_34_44(linVel0T0, linVel0T1, linVel0T2, linVel00, linVel10, linVel20, linVel30);
PX_TRANSPOSE_34_44(linVel1T0, linVel1T1, linVel1T2, linVel01, linVel11, linVel21, linVel31);
PX_TRANSPOSE_34_44(angState0T0, angState0T1, angState0T2, angState00, angState10, angState20, angState30);
PX_TRANSPOSE_34_44(angState1T0, angState1T1, angState1T2, angState01, angState11, angState21, angState31);
PX_ASSERT(b00.linearVelocity.isFinite());
PX_ASSERT(b00.angularVelocity.isFinite());
PX_ASSERT(b10.linearVelocity.isFinite());
PX_ASSERT(b10.angularVelocity.isFinite());
PX_ASSERT(b20.linearVelocity.isFinite());
PX_ASSERT(b20.angularVelocity.isFinite());
PX_ASSERT(b30.linearVelocity.isFinite());
PX_ASSERT(b30.angularVelocity.isFinite());
PX_ASSERT(b01.linearVelocity.isFinite());
PX_ASSERT(b01.angularVelocity.isFinite());
PX_ASSERT(b11.linearVelocity.isFinite());
PX_ASSERT(b11.angularVelocity.isFinite());
PX_ASSERT(b21.linearVelocity.isFinite());
PX_ASSERT(b21.angularVelocity.isFinite());
PX_ASSERT(b31.linearVelocity.isFinite());
PX_ASSERT(b31.angularVelocity.isFinite());
// Write back
V4StoreA(linVel00, &b00.linearVelocity.x);
V4StoreA(angState00, &b00.angularVelocity.x);
V4StoreA(linVel10, &b10.linearVelocity.x);
V4StoreA(angState10, &b10.angularVelocity.x);
V4StoreA(linVel20, &b20.linearVelocity.x);
V4StoreA(angState20, &b20.angularVelocity.x);
V4StoreA(linVel30, &b30.linearVelocity.x);
V4StoreA(angState30, &b30.angularVelocity.x);
V4StoreA(linVel01, &b01.linearVelocity.x);
V4StoreA(angState01, &b01.angularVelocity.x);
V4StoreA(linVel11, &b11.linearVelocity.x);
V4StoreA(angState11, &b11.angularVelocity.x);
V4StoreA(linVel21, &b21.linearVelocity.x);
V4StoreA(angState21, &b21.angularVelocity.x);
V4StoreA(linVel31, &b31.linearVelocity.x);
V4StoreA(angState31, &b31.angularVelocity.x);
PX_ASSERT(b00.linearVelocity.isFinite());
PX_ASSERT(b00.angularVelocity.isFinite());
PX_ASSERT(b10.linearVelocity.isFinite());
PX_ASSERT(b10.angularVelocity.isFinite());
PX_ASSERT(b20.linearVelocity.isFinite());
PX_ASSERT(b20.angularVelocity.isFinite());
PX_ASSERT(b30.linearVelocity.isFinite());
PX_ASSERT(b30.angularVelocity.isFinite());
PX_ASSERT(b01.linearVelocity.isFinite());
PX_ASSERT(b01.angularVelocity.isFinite());
PX_ASSERT(b11.linearVelocity.isFinite());
PX_ASSERT(b11.angularVelocity.isFinite());
PX_ASSERT(b21.linearVelocity.isFinite());
PX_ASSERT(b21.angularVelocity.isFinite());
PX_ASSERT(b31.linearVelocity.isFinite());
PX_ASSERT(b31.angularVelocity.isFinite());
}
void solve1D4(const PxConstraintBatchHeader& hdr, const PxSolverConstraintDesc* PX_RESTRICT desc, const PxTGSSolverBodyTxInertia* const txInertias,
const PxReal /*minPenetration*/, const PxReal elapsedTime, SolverContext& /*cache*/)
{
solve1DStep4(desc + hdr.startIndex, txInertias, elapsedTime);
}
void writeBack1D4(const PxConstraintBatchHeader& hdr, const PxSolverConstraintDesc* PX_RESTRICT desc, SolverContext* /*cache*/)
{
PX_UNUSED(hdr);
ConstraintWriteback* writeback0 = reinterpret_cast<ConstraintWriteback*>(desc[hdr.startIndex].writeBack);
ConstraintWriteback* writeback1 = reinterpret_cast<ConstraintWriteback*>(desc[hdr.startIndex + 1].writeBack);
ConstraintWriteback* writeback2 = reinterpret_cast<ConstraintWriteback*>(desc[hdr.startIndex + 2].writeBack);
ConstraintWriteback* writeback3 = reinterpret_cast<ConstraintWriteback*>(desc[hdr.startIndex + 3].writeBack);
if (writeback0 || writeback1 || writeback2 || writeback3)
{
SolverConstraint1DHeaderStep4* header = reinterpret_cast<SolverConstraint1DHeaderStep4*>(desc[hdr.startIndex].constraint);
SolverConstraint1DStep4* base = reinterpret_cast<SolverConstraint1DStep4*>(desc[hdr.startIndex].constraint + sizeof(SolverConstraint1DHeaderStep4));
const Vec4V zero = V4Zero();
Vec4V linX(zero), linY(zero), linZ(zero);
Vec4V angX(zero), angY(zero), angZ(zero);
for (PxU32 i = 0; i<header->count; i++)
{
const SolverConstraint1DStep4* c = base;
//Load in flags
const VecI32V flags = I4LoadU(reinterpret_cast<const PxI32*>(&c->flags[0]));
//Work out masks
const VecI32V mask = I4Load(DY_SC_FLAG_OUTPUT_FORCE);
const VecI32V masked = VecI32V_And(flags, mask);
const BoolV isEq = VecI32V_IsEq(masked, mask);
const Vec4V appliedForce = V4Sel(isEq, c->appliedForce, zero);
linX = V4MulAdd(c->lin0[0], appliedForce, linX);
linY = V4MulAdd(c->lin0[1], appliedForce, linY);
linZ = V4MulAdd(c->lin0[2], appliedForce, linZ);
angX = V4MulAdd(c->ang0[0], appliedForce, angX);
angY = V4MulAdd(c->ang0[1], appliedForce, angY);
angZ = V4MulAdd(c->ang0[2], appliedForce, angZ);
base++;
}
//We need to do the cross product now
angX = V4Sub(angX, V4NegMulSub(header->body0WorkOffset[0], linY, V4Mul(header->body0WorkOffset[1], linZ)));
angY = V4Sub(angY, V4NegMulSub(header->body0WorkOffset[1], linZ, V4Mul(header->body0WorkOffset[2], linX)));
angZ = V4Sub(angZ, V4NegMulSub(header->body0WorkOffset[2], linX, V4Mul(header->body0WorkOffset[0], linY)));
const Vec4V linLenSq = V4MulAdd(linZ, linZ, V4MulAdd(linY, linY, V4Mul(linX, linX)));
const Vec4V angLenSq = V4MulAdd(angZ, angZ, V4MulAdd(angY, angY, V4Mul(angX, angX)));
const Vec4V linLen = V4Sqrt(linLenSq);
const Vec4V angLen = V4Sqrt(angLenSq);
const BoolV broken = BOr(V4IsGrtr(linLen, header->linBreakImpulse), V4IsGrtr(angLen, header->angBreakImpulse));
PX_ALIGN(16, PxU32 iBroken[4]);
BStoreA(broken, iBroken);
Vec4V lin0, lin1, lin2, lin3;
Vec4V ang0, ang1, ang2, ang3;
PX_TRANSPOSE_34_44(linX, linY, linZ, lin0, lin1, lin2, lin3);
PX_TRANSPOSE_34_44(angX, angY, angZ, ang0, ang1, ang2, ang3);
if (writeback0)
{
V3StoreU(Vec3V_From_Vec4V_WUndefined(lin0), writeback0->linearImpulse);
V3StoreU(Vec3V_From_Vec4V_WUndefined(ang0), writeback0->angularImpulse);
writeback0->broken = header->breakable[0] ? PxU32(iBroken[0] != 0) : 0;
}
if (writeback1)
{
V3StoreU(Vec3V_From_Vec4V_WUndefined(lin1), writeback1->linearImpulse);
V3StoreU(Vec3V_From_Vec4V_WUndefined(ang1), writeback1->angularImpulse);
writeback1->broken = header->breakable[1] ? PxU32(iBroken[1] != 0) : 0;
}
if (writeback2)
{
V3StoreU(Vec3V_From_Vec4V_WUndefined(lin2), writeback2->linearImpulse);
V3StoreU(Vec3V_From_Vec4V_WUndefined(ang2), writeback2->angularImpulse);
writeback2->broken = header->breakable[2] ? PxU32(iBroken[2] != 0) : 0;
}
if (writeback3)
{
V3StoreU(Vec3V_From_Vec4V_WUndefined(lin3), writeback3->linearImpulse);
V3StoreU(Vec3V_From_Vec4V_WUndefined(ang3), writeback3->angularImpulse);
writeback3->broken = header->breakable[3] ? PxU32(iBroken[3] != 0) : 0;
}
}
}
void concludeContact4_Block(const PxSolverConstraintDesc* PX_RESTRICT desc)
{
PX_UNUSED(desc);
//const PxU8* PX_RESTRICT last = desc[0].constraint + getConstraintLength(desc[0]);
////hopefully pointer aliasing doesn't bite.
//PxU8* PX_RESTRICT currPtr = desc[0].constraint;
//const Vec4V zero = V4Zero();
////const PxU8 type = *desc[0].constraint;
//const PxU32 contactSize = sizeof(SolverContactPointStepBlock);
//const PxU32 frictionSize = sizeof(SolverContactFrictionStepBlock);
//while ((currPtr < last))
//{
// SolverContactHeaderStepBlock* PX_RESTRICT hdr = reinterpret_cast<SolverContactHeaderStepBlock*>(currPtr);
// currPtr = reinterpret_cast<PxU8*>(hdr + 1);
// const PxU32 numNormalConstr = hdr->numNormalConstr;
// const PxU32 numFrictionConstr = hdr->numFrictionConstr;
// //Applied forces
// currPtr += sizeof(Vec4V)*numNormalConstr;
// //SolverContactPointStepBlock* PX_RESTRICT contacts = reinterpret_cast<SolverContactPointStepBlock*>(currPtr);
// currPtr += (numNormalConstr * contactSize);
// bool hasMaxImpulse = (hdr->flag & SolverContactHeader4::eHAS_MAX_IMPULSE) != 0;
// if (hasMaxImpulse)
// currPtr += sizeof(Vec4V) * numNormalConstr;
// currPtr += sizeof(Vec4V)*numFrictionConstr;
// SolverContactFrictionStepBlock* PX_RESTRICT frictions = reinterpret_cast<SolverContactFrictionStepBlock*>(currPtr);
// currPtr += (numFrictionConstr * frictionSize);
// /*for (PxU32 i = 0; i<numNormalConstr; i++)
// {
// contacts[i].biasCoefficient = V4Sel(V4IsGrtr(contacts[i].separation, zero), contacts[i].biasCoefficient, zero);
// }*/
// for (PxU32 i = 0; i<numFrictionConstr; i++)
// {
// frictions[i].biasCoefficient = zero;
// }
//}
}
void conclude1DStep4(const PxSolverConstraintDesc* PX_RESTRICT desc)
{
PxU8* PX_RESTRICT bPtr = desc->constraint;
if (bPtr == NULL)
return;
const SolverConstraint1DHeaderStep4* PX_RESTRICT header = reinterpret_cast<const SolverConstraint1DHeaderStep4*>(bPtr);
SolverConstraint1DStep4* PX_RESTRICT base = reinterpret_cast<SolverConstraint1DStep4*>(bPtr + sizeof(SolverConstraint1DHeaderStep4));
const VecI32V mask = I4Load(DY_SC_FLAG_KEEP_BIAS);
const Vec4V zero = V4Zero();
for (PxU32 i = 0; i<header->count; ++i, base++)
{
Ps::prefetchLine(base + 1);
SolverConstraint1DStep4& c = *base;
const VecI32V flags = I4LoadA(reinterpret_cast<PxI32*>(c.flags));
const BoolV keepBias = VecI32V_IsEq(VecI32V_And(flags, mask), mask);
c.biasScale = V4Sel(keepBias, c.biasScale, zero);
}
}
void solveConcludeContact4(const PxConstraintBatchHeader& hdr, const PxSolverConstraintDesc* desc,
const PxTGSSolverBodyTxInertia* const /*txInertias*/, const PxReal elapsedTime, SolverContext& /*cache*/)
{
solveContact4_Block(desc + hdr.startIndex, true, -PX_MAX_F32, elapsedTime);
concludeContact4_Block(desc + hdr.startIndex);
}
void solveConclude1D4(const PxConstraintBatchHeader& hdr, const PxSolverConstraintDesc* desc,
const PxTGSSolverBodyTxInertia* const txInertias, const PxReal elapsedTime, SolverContext& /*cache*/)
{
solve1DStep4(desc + hdr.startIndex, txInertias, elapsedTime);
conclude1DStep4(desc + hdr.startIndex);
}
}
}