Skip to content

Commit

Permalink
update studio_util neon
Browse files Browse the repository at this point in the history
  • Loading branch information
MoeMod committed Aug 2, 2023
1 parent 43710ea commit c78b3d5
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 13 deletions.
19 changes: 19 additions & 0 deletions cl_dll/StudioModelRenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -349,9 +349,28 @@ void CStudioModelRenderer::StudioSlerpBones( vec4_t q1[], float pos1[][3], vec4_

s1 = 1.0f - s;

#if 1
switch (m_pStudioHeader->numbones & 3)
{
case 3:
QuaternionSlerp( q1[2], q2[2], s, q1[2] );
case 2:
QuaternionSlerp( q1[1], q2[1], s, q1[1] );
case 1:
QuaternionSlerp( q1[0], q2[0], s, q1[0] );
case 0:
for ( i = m_pStudioHeader->numbones & 3; i < m_pStudioHeader->numbones; i += 4 )
QuaternionSlerpX4( q1 + i, q2 + i, s, q1 + i );
}
#else
for( i = 0; i < m_pStudioHeader->numbones; i++ )
{
QuaternionSlerp( q1[i], q2[i], s, q3 );
}
#endif

for( i = 0; i < m_pStudioHeader->numbones; i++ )
{
q1[i][0] = q3[0];
q1[i][1] = q3[1];
q1[i][2] = q3[2];
Expand Down
85 changes: 72 additions & 13 deletions cl_dll/studio_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,10 @@ void CrossProduct( const float *v1, const float *v2, float *cross )
memcpy(&v1_reg, v1, sizeof(float) * 3);
memcpy(&v2_reg, v2, sizeof(float) * 3);

float32x4_t yzxy_a = vextq_f32(vextq_f32(v1_reg, v1_reg, 3), v1_reg, 2); // [aj, ak, ai, aj]
float32x4_t yzxy_b = vextq_f32(vextq_f32(v2_reg, v2_reg, 3), v2_reg, 2); // [bj, bk, bi, bj]
float32x2_t xy_a = vget_low_f32(v1_reg);
float32x2_t xy_b = vget_low_f32(v2_reg);
float32x4_t yzxy_a = vcombine_f32(vext_f32(xy_a, vget_high_f32(v1_reg), 1), xy_a); // [aj, ak, ai, aj]
float32x4_t yzxy_b = vcombine_f32(vext_f32(xy_b, vget_high_f32(v2_reg), 1), xy_b); // [bj, bk, bi, bj]
float32x4_t zxyy_a = vextq_f32(yzxy_a, yzxy_a, 1); // [ak, ai, aj, aj]
float32x4_t zxyy_b = vextq_f32(yzxy_b, yzxy_b, 1); // [bk, ai, bj, bj]
float32x4_t cross_reg = vfmsq_f32(vmulq_f32(yzxy_a, zxyy_b), zxyy_a, yzxy_b); // [ajbk-akbj, akbi-aibk, aibj-ajbi, 0]
Expand Down Expand Up @@ -265,23 +267,23 @@ void AngleQuaternion( float *angles, vec4_t quaternion )
sincos_ps(vmulq_n_f32(angles_reg, 0.5), &sr_sp_sy_0_cr_cp_cy_1.val[0], &sr_sp_sy_0_cr_cp_cy_1.val[1]);

float32x4x2_t sr_sy_cr_cy_sp_0_cp_1 = vuzpq_f32(sr_sp_sy_0_cr_cp_cy_1.val[0], sr_sp_sy_0_cr_cp_cy_1.val[1]);
float32x4_t cp_cp_cp_cp = vdupq_laneq_f32(sr_sp_sy_0_cr_cp_cy_1.val[1], 1);
float32x4_t sp_sp_sp_sp = vdupq_laneq_f32(sr_sp_sy_0_cr_cp_cy_1.val[0], 1);

float32x4_t sr_sy_cr_cy = sr_sy_cr_cy_sp_0_cp_1.val[0];
float32x4_t sy_cr_cy_sr = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 1);
float32x4_t cr_cy_sr_sy = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 2);
float32x4_t cy_sr_sy_cr = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 3);
float32x4_t sp_sp_sp_sp_signed = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(sp_sp_sp_sp), AngleQuaternion_sign2));
float32x4_t srsy_sycr_crcy_cysr = vmulq_f32(sr_sy_cr_cy, sy_cr_cy_sr);
float32x4_t sycr_crcy_cysr_srsy = vextq_f32(srsy_sycr_crcy_cysr, srsy_sycr_crcy_cysr, 1);
float32x4_t cysr_srsy_sycr_crcy = vextq_f32(srsy_sycr_crcy_cysr, srsy_sycr_crcy_cysr, 3);
float32x4_t sycr_crcy_cysr_srsy_signed = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(sycr_crcy_cysr_srsy), AngleQuaternion_sign2));

float32x4_t left = vmulq_f32(vmulq_f32(sr_sy_cr_cy, cp_cp_cp_cp), cy_sr_sy_cr);
float32x4_t left = vmulq_laneq_f32(cysr_srsy_sycr_crcy, sr_sp_sy_0_cr_cp_cy_1.val[1], 1);

float32x4_t out_reg = vfmaq_f32(left, vmulq_f32(cr_cy_sr_sy, sp_sp_sp_sp_signed), sy_cr_cy_sr);
float32x4_t out_reg = vfmaq_laneq_f32(left, sycr_crcy_cysr_srsy_signed, sr_sp_sy_0_cr_cp_cy_1.val[0], 1);
memcpy(quaternion, &out_reg, sizeof(float) * 4);
//quaternion[0] = sr * cp * cy - cr * sp * sy; // X
//quaternion[1] = sy * cp * sr + cy * sp * cr; // Y
//quaternion[2] = cr * cp * sy - sr * sp * cy; // Z
//quaternion[3] = cy * cp * cr + sy * sp * sr; // W
// A = sr * sy, B = sy * cr, C = cr * cy, D = cy * sr
//quaternion[0] = D * cp - B * sp; // X
//quaternion[1] = A * cp + C * sp; // Y
//quaternion[2] = B * cp - D * sp; // Z
//quaternion[3] = C * cp + A * sp; // W
#else
float angle;
float sr, sp, sy, cr, cp, cy;
Expand Down Expand Up @@ -400,6 +402,63 @@ void QuaternionSlerp( vec4_t p, vec4_t q, float t, vec4_t qt )
#endif
}

/*
====================
QuaternionSlerpX4

Interpolates four quaternion pairs with a shared factor t:
qt[i] = sclp[i] * p[i] + sclq[i] * q[i], for i = 0..3.
p, q and qt each address four consecutive vec4_t.
In the NEON path every input is loaded before qt is written and q is
only modified in a local copy. Without XASH_SIMD_NEON the work is
forwarded to QuaternionSlerp, one quaternion at a time.
====================
*/
void QuaternionSlerpX4( vec4_t p[4], vec4_t q[4], float t, vec4_t qt[4] )
{
#if XASH_SIMD_NEON
	// Row layout: val[i] holds quaternion i (4 floats).
	float32x4x4_t p_reg, q_reg;
	memcpy(&p_reg, p, sizeof(float) * 4 * 4);
	memcpy(&q_reg, q, sizeof(float) * 4 * 4);

	// De-interleaving load: val[k] holds component k of all four quaternions,
	// so lane i of cosom ends up as DotProduct4(p[i], q[i]).
	const float32x4x4_t p_t = vld4q_f32((const float*)&p_reg);
	const float32x4x4_t q_t = vld4q_f32((const float*)&q_reg);
	float32x4_t cosom = vmulq_f32(p_t.val[0], q_t.val[0]);
	cosom = vfmaq_f32(cosom, p_t.val[1], q_t.val[1]);
	cosom = vfmaq_f32(cosom, p_t.val[2], q_t.val[2]);
	cosom = vfmaq_f32(cosom, p_t.val[3], q_t.val[3]);

	// if (cosom < 0) q = -q, cosom = -cosom
	// The sign bit of each dot product is broadcast across the matching
	// quaternion and XORed in; the mask is an integer vector, so the
	// broadcast has to be the u32 variant.
	const uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(cosom), vdupq_n_u32(0x80000000));
	q_reg.val[0] = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(q_reg.val[0]), vdupq_laneq_u32(sign, 0)));
	q_reg.val[1] = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(q_reg.val[1]), vdupq_laneq_u32(sign, 1)));
	q_reg.val[2] = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(q_reg.val[2]), vdupq_laneq_u32(sign, 2)));
	q_reg.val[3] = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(q_reg.val[3]), vdupq_laneq_u32(sign, 3)));
	cosom = vabsq_f32(cosom);

	// Linear weights; used as-is for lanes where the quaternions are nearly parallel.
	float32x4_t sclp = vdupq_n_f32(1.0f - t);
	float32x4_t sclq = vdupq_n_f32(t);

	// if ((1.0 - cosom) > 0.00001) scl = sin(scl * omega) / sinom;
	const uint32x4_t cosom_less_then_one = vcltq_f32(cosom, vdupq_n_f32(1.0f - 0.00001f));
	const float32x4_t omega = acos_ps(cosom);
	const float32x4_t sinom = sin_ps(omega);
	// 1/sinom via reciprocal estimate plus one Newton-Raphson refinement step
	// (mathematically this is rsqrt(1 - cosom*cosom)).
	float32x4_t sinom_reciprocal = vrecpeq_f32(sinom);
	sinom_reciprocal = vmulq_f32(sinom_reciprocal, vrecpsq_f32(sinom, sinom_reciprocal));

	// vbslq_f32(mask, a, b) yields a where mask is set and b elsewhere, so the
	// spherical weights go first. Lanes with sinom ~ 0 may hold inf/NaN in the
	// spherical operand, but those lanes are never selected.
	const float32x4_t sclp_spherical = vmulq_f32(sin_ps(vmulq_f32(sclp, omega)), sinom_reciprocal);
	const float32x4_t sclq_spherical = vmulq_f32(sin_ps(vmulq_f32(sclq, omega)), sinom_reciprocal);
	sclp = vbslq_f32(cosom_less_then_one, sclp_spherical, sclp);
	sclq = vbslq_f32(cosom_less_then_one, sclq_spherical, sclq);

	// qt = (sclp * p + sclq * q);
	float32x4x4_t qt_reg;
	qt_reg.val[0] = vfmaq_laneq_f32(vmulq_laneq_f32(p_reg.val[0], sclp, 0), q_reg.val[0], sclq, 0);
	qt_reg.val[1] = vfmaq_laneq_f32(vmulq_laneq_f32(p_reg.val[1], sclp, 1), q_reg.val[1], sclq, 1);
	qt_reg.val[2] = vfmaq_laneq_f32(vmulq_laneq_f32(p_reg.val[2], sclp, 2), q_reg.val[2], sclq, 2);
	qt_reg.val[3] = vfmaq_laneq_f32(vmulq_laneq_f32(p_reg.val[3], sclp, 3), q_reg.val[3], sclq, 3);

	memcpy(qt, &qt_reg, sizeof(float) * 4 * 4);
#else
	// Scalar fallback: one QuaternionSlerp per quaternion.
	QuaternionSlerp(p[0], q[0], t, qt[0]);
	QuaternionSlerp(p[1], q[1], t, qt[1]);
	QuaternionSlerp(p[2], q[2], t, qt[2]);
	QuaternionSlerp(p[3], q[3], t, qt[3]);
#endif
}

/*
====================
QuaternionMatrix
Expand Down
1 change: 1 addition & 0 deletions cl_dll/studio_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,5 +36,6 @@ void ConcatTransforms( float in1[3][4], float in2[3][4], float out[3][4] );
// Presumably copies the 3x4 matrix `in` into `out` (implementation not visible here).
void MatrixCopy( float in[3][4], float out[3][4] );
// Presumably builds a matrix from `quaternion` (implementation not visible here).
void QuaternionMatrix( vec4_t quaternion, float (*matrix)[4] );
// Interpolates between quaternions p and q by factor t and writes the result to qt.
void QuaternionSlerp( vec4_t p, vec4_t q, float t, vec4_t qt );
// Batched QuaternionSlerp: processes four consecutive quaternions, qt[i] from p[i], q[i] with the same t.
void QuaternionSlerpX4( vec4_t p[4], vec4_t q[4], float t, vec4_t qt[4] );
// Converts `angles` into a quaternion written as X, Y, Z, W.
void AngleQuaternion( float *angles, vec4_t quaternion );
#endif // STUDIO_UTIL_H

0 comments on commit c78b3d5

Please sign in to comment.