|
|
|
@ -44,6 +44,11 @@ namespace icbc {
|
|
|
|
|
#include <assert.h>
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if ICBC_USE_SIMD
|
|
|
|
|
#include <xmmintrin.h>
|
|
|
|
|
#include <emmintrin.h>
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
namespace icbc {
|
|
|
|
|
|
|
|
|
|
///////////////////////////////////////////////////////////////////////////////////////////////////
|
|
|
|
@ -223,11 +228,13 @@ inline bool equal(Vector3 a, Vector3 b, float epsilon) {
|
|
|
|
|
|
|
|
|
|
#if ICBC_USE_SIMD
|
|
|
|
|
|
|
|
|
|
#include <xmmintrin.h>
|
|
|
|
|
#include <emmintrin.h>
|
|
|
|
|
|
|
|
|
|
// Inlining qualifiers for functions declared with these macros.
// SIMD_INLINE is a plain inline hint.
#define SIMD_INLINE inline

// SIMD_NATIVE requests forced inlining. It is defined exactly once, selected
// by compiler, so there is no unconditional definition that a later branch
// would have to redefine.
#if defined(__GNUC__)
// GCC-compatible compilers (anything that defines __GNUC__).
// Also need "inline" to avoid "function body can be overwritten at link time" errors.
#   define SIMD_NATIVE __attribute__((always_inline)) inline
#elif defined(_MSC_VER)
#   define SIMD_NATIVE __forceinline
#else
// Unknown compiler: no forced-inline keyword is known, fall back to a plain hint.
#   define SIMD_NATIVE inline
#endif
|
|
|
|
|
|
|
|
|
|
class SimdVector
|
|
|
|
|
{
|
|
|
|
@ -2196,12 +2203,12 @@ bool ClusterFit::fastCompress3(Vector3 * start, Vector3 * end)
|
|
|
|
|
|
|
|
|
|
for (int c1 = 0; c1 <= count - c0; c1++, i++)
|
|
|
|
|
{
|
|
|
|
|
const SimdVector constants = SimdVector((const float *)&s_threeElement[i]);
|
|
|
|
|
|
|
|
|
|
const SimdVector alpha2_sum = constants.splatX();
|
|
|
|
|
const SimdVector beta2_sum = constants.splatY();
|
|
|
|
|
const SimdVector alphabeta_sum = constants.splatZ();
|
|
|
|
|
const SimdVector factor = constants.splatW();
|
|
|
|
|
const SimdVector constants = SimdVector((const float *)&s_threeElement[i]);
|
|
|
|
|
|
|
|
|
|
const SimdVector alpha2_sum = constants.splatX();
|
|
|
|
|
const SimdVector beta2_sum = constants.splatY();
|
|
|
|
|
const SimdVector alphabeta_sum = constants.splatZ();
|
|
|
|
|
const SimdVector factor = constants.splatW();
|
|
|
|
|
|
|
|
|
|
const SimdVector alphax_sum = multiplyAdd(x1, half, x0);
|
|
|
|
|
const SimdVector betax_sum = m_xsum - alphax_sum;
|
|
|
|
@ -2283,15 +2290,15 @@ bool ClusterFit::fastCompress4(Vector3 * start, Vector3 * end)
|
|
|
|
|
|
|
|
|
|
for (int c2 = 0; c2 <= 16 - c0 - c1; c2++, i++)
|
|
|
|
|
{
|
|
|
|
|
const SimdVector constants = SimdVector((const float *)&s_fourElement[i]);
|
|
|
|
|
|
|
|
|
|
const SimdVector alpha2_sum = constants.splatX();
|
|
|
|
|
const SimdVector beta2_sum = constants.splatY();
|
|
|
|
|
const SimdVector alphabeta_sum = constants.splatZ();
|
|
|
|
|
const SimdVector factor = constants.splatW();
|
|
|
|
|
|
|
|
|
|
const SimdVector alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0));
|
|
|
|
|
const SimdVector betax_sum = m_xsum - alphax_sum;
|
|
|
|
|
const SimdVector constants = SimdVector((const float *)&s_fourElement[i]);
|
|
|
|
|
|
|
|
|
|
const SimdVector alpha2_sum = constants.splatX();
|
|
|
|
|
const SimdVector beta2_sum = constants.splatY();
|
|
|
|
|
const SimdVector alphabeta_sum = constants.splatZ();
|
|
|
|
|
const SimdVector factor = constants.splatW();
|
|
|
|
|
|
|
|
|
|
const SimdVector alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0));
|
|
|
|
|
const SimdVector betax_sum = m_xsum - alphax_sum;
|
|
|
|
|
|
|
|
|
|
SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
|
|
|
|
|
SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
|
|
|
|
@ -2560,10 +2567,10 @@ bool ClusterFit::fastCompress3(Vector3 * start, Vector3 * end)
|
|
|
|
|
|
|
|
|
|
for (uint c1 = 0; c1 <= count - c0; c1++, i++)
|
|
|
|
|
{
|
|
|
|
|
float const alpha2_sum = s_threeElement[i].alpha2_sum;
|
|
|
|
|
float const beta2_sum = s_threeElement[i].beta2_sum;
|
|
|
|
|
float const alphabeta_sum = s_threeElement[i].alphabeta_sum;
|
|
|
|
|
float const factor = s_threeElement[i].factor;
|
|
|
|
|
float const alpha2_sum = s_threeElement[i].alpha2_sum;
|
|
|
|
|
float const beta2_sum = s_threeElement[i].beta2_sum;
|
|
|
|
|
float const alphabeta_sum = s_threeElement[i].alphabeta_sum;
|
|
|
|
|
float const factor = s_threeElement[i].factor;
|
|
|
|
|
|
|
|
|
|
Vector3 const alphax_sum = x0 + x1 * 0.5f;
|
|
|
|
|
Vector3 const betax_sum = m_xsum - alphax_sum;
|
|
|
|
@ -2647,11 +2654,11 @@ bool ClusterFit::fastCompress4(Vector3 * start, Vector3 * end)
|
|
|
|
|
|
|
|
|
|
for (uint c2 = 0; c2 <= count - c0 - c1; c2++, i++)
|
|
|
|
|
{
|
|
|
|
|
float const alpha2_sum = s_fourElement[i].alpha2_sum;
|
|
|
|
|
float const beta2_sum = s_fourElement[i].beta2_sum;
|
|
|
|
|
float const alphabeta_sum = s_fourElement[i].alphabeta_sum;
|
|
|
|
|
float const factor = s_fourElement[i].factor;
|
|
|
|
|
|
|
|
|
|
float const alpha2_sum = s_fourElement[i].alpha2_sum;
|
|
|
|
|
float const beta2_sum = s_fourElement[i].beta2_sum;
|
|
|
|
|
float const alphabeta_sum = s_fourElement[i].alphabeta_sum;
|
|
|
|
|
float const factor = s_fourElement[i].factor;
|
|
|
|
|
|
|
|
|
|
Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
|
|
|
|
|
Vector3 const betax_sum = m_xsum - alphax_sum;
|
|
|
|
|
|
|
|
|
|