FWGS · MoeMod · Aug 2, 2023 · Aug 3, 2023 · Aug 3, 2023 · a1batross
diff --git a/cl_dll/StudioModelRenderer.cpp b/cl_dll/StudioModelRenderer.cpp
@@ -349,9 +349,28 @@ void CStudioModelRenderer::StudioSlerpBones( vec4_t q1[], float pos1[][3], vec4_
 
 	s1 = 1.0f - s;
 
+#if 1
+	switch (m_pStudioHeader->numbones & 3)
+	{
+	case 3:
+		QuaternionSlerp( q1[2], q2[2], s, q1[2] );
+	case 2:
+		QuaternionSlerp( q1[1], q2[1], s, q1[1] );
+	case 1:
+		QuaternionSlerp( q1[0], q2[0], s, q1[0] );
+	case 0:
+		for ( i = m_pStudioHeader->numbones & 3; i < m_pStudioHeader->numbones; i += 4 )
+			QuaternionSlerpX4( q1 + i, q2 + i, s, q1 + i );
+	}
+#else
 	for( i = 0; i < m_pStudioHeader->numbones; i++ )
 	{
 		QuaternionSlerp( q1[i], q2[i], s, q3 );
+	}
+#endif
+
+	for( i = 0; i < m_pStudioHeader->numbones; i++ )
+	{
 		q1[i][0] = q3[0];
 		q1[i][1] = q3[1];
 		q1[i][2] = q3[2];

diff --git a/cl_dll/studio_util.cpp b/cl_dll/studio_util.cpp
@@ -143,8 +143,10 @@ void CrossProduct( const float *v1, const float *v2, float *cross )
 	memcpy(&v1_reg, v1, sizeof(float) * 3);
 	memcpy(&v2_reg, v2, sizeof(float) * 3);
 
-	float32x4_t yzxy_a = vextq_f32(vextq_f32(v1_reg, v1_reg, 3), v1_reg, 2); // [aj, ak, ai, aj]
-	float32x4_t yzxy_b = vextq_f32(vextq_f32(v2_reg, v2_reg, 3), v2_reg, 2); // [bj, bk, bi, bj]
+	float32x2_t xy_a = vget_low_f32(v1_reg);
+	float32x2_t xy_b = vget_low_f32(v2_reg);
+	float32x4_t yzxy_a = vcombine_f32(vext_f32(xy_a, vget_high_f32(v1_reg), 1), xy_a); // [aj, ak, ai, aj]
+	float32x4_t yzxy_b = vcombine_f32(vext_f32(xy_b, vget_high_f32(v2_reg), 1), xy_b); // [bj, bk, bi, bj]
 	float32x4_t zxyy_a = vextq_f32(yzxy_a, yzxy_a, 1); // [ak, ai, aj, aj]
 	float32x4_t zxyy_b = vextq_f32(yzxy_b, yzxy_b, 1); // [bk, ai, bj, bj]
 	float32x4_t cross_reg = vfmsq_f32(vmulq_f32(yzxy_a, zxyy_b), zxyy_a, yzxy_b); // [ajbk-akbj, akbi-aibk, aibj-ajbi, 0]
@@ -265,23 +267,23 @@ void AngleQuaternion( float *angles, vec4_t quaternion )
 	sincos_ps(vmulq_n_f32(angles_reg, 0.5), &sr_sp_sy_0_cr_cp_cy_1.val[0], &sr_sp_sy_0_cr_cp_cy_1.val[1]);
 
 	float32x4x2_t sr_sy_cr_cy_sp_0_cp_1 = vuzpq_f32(sr_sp_sy_0_cr_cp_cy_1.val[0], sr_sp_sy_0_cr_cp_cy_1.val[1]);
-	float32x4_t cp_cp_cp_cp = vdupq_laneq_f32(sr_sp_sy_0_cr_cp_cy_1.val[1], 1);
-	float32x4_t sp_sp_sp_sp = vdupq_laneq_f32(sr_sp_sy_0_cr_cp_cy_1.val[0], 1);
 
 	float32x4_t sr_sy_cr_cy = sr_sy_cr_cy_sp_0_cp_1.val[0];
 	float32x4_t sy_cr_cy_sr = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 1);
-	float32x4_t cr_cy_sr_sy = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 2);
-	float32x4_t cy_sr_sy_cr = vextq_f32(sr_sy_cr_cy_sp_0_cp_1.val[0], sr_sy_cr_cy_sp_0_cp_1.val[0], 3);
-	float32x4_t sp_sp_sp_sp_signed = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(sp_sp_sp_sp), AngleQuaternion_sign2));
+	float32x4_t srsy_sycr_crcy_cysr = vmulq_f32(sr_sy_cr_cy, sy_cr_cy_sr);
+	float32x4_t sycr_crcy_cysr_srsy = vextq_f32(srsy_sycr_crcy_cysr, srsy_sycr_crcy_cysr, 1);
+	float32x4_t cysr_srsy_sycr_crcy = vextq_f32(srsy_sycr_crcy_cysr, srsy_sycr_crcy_cysr, 3);
+	float32x4_t sycr_crcy_cysr_srsy_signed = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(sycr_crcy_cysr_srsy), AngleQuaternion_sign2));
 
-	float32x4_t left = vmulq_f32(vmulq_f32(sr_sy_cr_cy, cp_cp_cp_cp), cy_sr_sy_cr);
+	float32x4_t left = vmulq_laneq_f32(cysr_srsy_sycr_crcy, sr_sp_sy_0_cr_cp_cy_1.val[1], 1);
 
-	float32x4_t out_reg = vfmaq_f32(left, vmulq_f32(cr_cy_sr_sy, sp_sp_sp_sp_signed), sy_cr_cy_sr);
+	float32x4_t out_reg = vfmaq_laneq_f32(left, sycr_crcy_cysr_srsy_signed, sr_sp_sy_0_cr_cp_cy_1.val[0], 1);
 	memcpy(quaternion, &out_reg, sizeof(float) * 4);
-	//quaternion[0] =   sr * cp * cy - cr * sp * sy; // X
-	//quaternion[1] =   sy * cp * sr + cy * sp * cr; // Y
-	//quaternion[2] =   cr * cp * sy - sr * sp * cy; // Z
-	//quaternion[3] =   cy * cp * cr + sy * sp * sr; // W
+	// A = sr * sy, B = sy * cr, C = cr * cy, D = cy * sr
+	//quaternion[0] =   D * cp - B * sp; // X
+	//quaternion[1] =   A * cp + C * sp; // Y
+	//quaternion[2] =   B * cp - D * sp; // Z
+	//quaternion[3] =   C * cp + A * sp; // W
 #else
 	float angle;
 	float sr, sp, sy, cr, cp, cy;
@@ -400,6 +402,63 @@ void QuaternionSlerp( vec4_t p, vec4_t q, float t, vec4_t qt )
 #endif
 }
 
+/*
+====================
+QuaternionSlerpX4
+
+====================
+*/
+void QuaternionSlerpX4( vec4_t p[4], vec4_t q[4], float t, vec4_t qt[4] )
+{
+#if XASH_SIMD_NEON
+	float32x4x4_t p_reg, q_reg;
+	memcpy(&p_reg, p, sizeof(float) * 4 * 4);
+	memcpy(&q_reg, q, sizeof(float) * 4 * 4);
+
+	//float32x4_t cosom = { DotProduct(p[0], q[0]), DotProduct(p[1], q[1]), DotProduct(p[2], q[2]), DotProduct(p[3], q[3]) };
+	float32x4x4_t p_t = vld4q_f32((const float*)&p_reg);
+	float32x4x4_t q_t = vld4q_f32((const float*)&q_reg);
+	float32x4_t cosom = vmulq_f32(p_t.val[0], q_t.val[0]);
+	cosom = vfmaq_f32(cosom, p_t.val[1], q_t.val[1]);
+	cosom = vfmaq_f32(cosom, p_t.val[2], q_t.val[2]);
+	cosom = vfmaq_f32(cosom, p_t.val[3], q_t.val[3]);
+
+	// if(cosom < 0) q=-q, cosom=-cosom
+	uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(cosom), vdupq_n_u32(0x80000000));
+	q_reg.val[0] = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(q_reg.val[0]), vdupq_laneq_f32(sign, 0)));
+	q_reg.val[1] = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(q_reg.val[1]), vdupq_laneq_f32(sign, 1)));
+	q_reg.val[2] = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(q_reg.val[2]), vdupq_laneq_f32(sign, 2)));
+	q_reg.val[3] = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(q_reg.val[3]), vdupq_laneq_f32(sign, 3)));
+	cosom = vabsq_f32(cosom);
+
+	float32x4_t sclp = vdupq_n_f32(1.0f - t);
+	float32x4_t sclq = vdupq_n_f32(t);
+	// if ((1.0 - cosom) > 0.000001) scl = sin(scl * omega) / sinom;
+	uint32x4_t cosom_less_then_one = vcltq_f32(cosom, vdupq_n_f32(1.0f - 0.00001f));
+	float32x4_t omega = acos_ps(cosom);
+	// 1/sinom = rsqrt(1-cosom*cosom)
+	float32x4_t sinom = sin_ps(omega);
+	float32x4_t sinom_reciprocal = vrecpeq_f32(sinom); // vdivq_f32(vdupq_n_f32(1), sin_ps(omega));
+	sinom_reciprocal = vmulq_f32(sinom_reciprocal, vrecpsq_f32(sinom, sinom_reciprocal));
+	sclp = vbslq_f32(cosom_less_then_one, sclp, vmulq_f32(sin_ps(vmulq_f32(sclp, omega)), sinom_reciprocal));
+	sclq = vbslq_f32(cosom_less_then_one, sclq, vmulq_f32(sin_ps(vmulq_f32(sclq, omega)), sinom_reciprocal));
+
+	// qt = (sclp * p + sclq * q);
+	float32x4x4_t qt_reg;
+	qt_reg.val[0] = vfmaq_laneq_f32(vmulq_laneq_f32(p_reg.val[0], sclp, 0), q_reg.val[0], sclq, 0);
+	qt_reg.val[1] = vfmaq_laneq_f32(vmulq_laneq_f32(p_reg.val[1], sclp, 1), q_reg.val[1], sclq, 1);
+	qt_reg.val[2] = vfmaq_laneq_f32(vmulq_laneq_f32(p_reg.val[2], sclp, 2), q_reg.val[2], sclq, 2);
+	qt_reg.val[3] = vfmaq_laneq_f32(vmulq_laneq_f32(p_reg.val[3], sclp, 3), q_reg.val[3], sclq, 3);
+
+	memcpy(qt, &qt_reg, sizeof(float) * 4 * 4);
+#else
+	QuaternionSlerp(p[0], q[0], t, qt[0]);
+	QuaternionSlerp(p[1], q[1], t, qt[1]);
+	QuaternionSlerp(p[2], q[2], t, qt[2]);
+	QuaternionSlerp(p[3], q[3], t, qt[3]);
+#endif
+}
+
 /*
 ====================
 QuaternionMatrix

diff --git a/cl_dll/studio_util.h b/cl_dll/studio_util.h
@@ -36,5 +36,6 @@ void	ConcatTransforms( float in1[3][4], float in2[3][4], float out[3][4] );
 void	MatrixCopy( float in[3][4], float out[3][4] );
 void	QuaternionMatrix( vec4_t quaternion, float (*matrix)[4] );
 void	QuaternionSlerp( vec4_t p, vec4_t q, float t, vec4_t qt );
+void	QuaternionSlerpX4( vec4_t p[4], vec4_t q[4], float t, vec4_t qt[4] );
 void	AngleQuaternion( float *angles, vec4_t quaternion );
 #endif // STUDIO_UTIL_H