diff --git a/src/nvtt/icbc.h b/src/nvtt/icbc.h index 56bf3b7..4abc98f 100644 --- a/src/nvtt/icbc.h +++ b/src/nvtt/icbc.h @@ -44,6 +44,11 @@ namespace icbc { #include #endif +#if ICBC_USE_SIMD +#include +#include +#endif + namespace icbc { /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -223,11 +228,13 @@ inline bool equal(Vector3 a, Vector3 b, float epsilon) { #if ICBC_USE_SIMD -#include -#include - #define SIMD_INLINE inline -#define SIMD_NATIVE __forceinline +#if __GNUC__ +// Also need "inline" to avoid "function body can be overwritten at link time" errors. +# define SIMD_NATIVE __attribute__((always_inline)) inline +#else // _MSC_VER +# define SIMD_NATIVE __forceinline +#endif class SimdVector { @@ -2196,12 +2203,12 @@ bool ClusterFit::fastCompress3(Vector3 * start, Vector3 * end) for (int c1 = 0; c1 <= count - c0; c1++, i++) { - const SimdVector constants = SimdVector((const float *)&s_threeElement[i]); - - const SimdVector alpha2_sum = constants.splatX(); - const SimdVector beta2_sum = constants.splatY(); - const SimdVector alphabeta_sum = constants.splatZ(); - const SimdVector factor = constants.splatW(); + const SimdVector constants = SimdVector((const float *)&s_threeElement[i]); + + const SimdVector alpha2_sum = constants.splatX(); + const SimdVector beta2_sum = constants.splatY(); + const SimdVector alphabeta_sum = constants.splatZ(); + const SimdVector factor = constants.splatW(); const SimdVector alphax_sum = multiplyAdd(x1, half, x0); const SimdVector betax_sum = m_xsum - alphax_sum; @@ -2283,15 +2290,15 @@ bool ClusterFit::fastCompress4(Vector3 * start, Vector3 * end) for (int c2 = 0; c2 <= 16 - c0 - c1; c2++, i++) { - const SimdVector constants = SimdVector((const float *)&s_fourElement[i]); - - const SimdVector alpha2_sum = constants.splatX(); - const SimdVector beta2_sum = constants.splatY(); - const SimdVector alphabeta_sum = constants.splatZ(); - const SimdVector factor = constants.splatW(); - - const SimdVector alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0)); - const SimdVector betax_sum = m_xsum - alphax_sum; + const SimdVector constants = SimdVector((const float *)&s_fourElement[i]); + + const SimdVector alpha2_sum = constants.splatX(); + const SimdVector beta2_sum = constants.splatY(); + const SimdVector alphabeta_sum = constants.splatZ(); + const SimdVector factor = constants.splatW(); + + const SimdVector alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0)); + const SimdVector betax_sum = m_xsum - alphax_sum; SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; @@ -2560,10 +2567,10 @@ bool ClusterFit::fastCompress3(Vector3 * start, Vector3 * end) for (uint c1 = 0; c1 <= count - c0; c1++, i++) { - float const alpha2_sum = s_threeElement[i].alpha2_sum; - float const beta2_sum = s_threeElement[i].beta2_sum; - float const alphabeta_sum = s_threeElement[i].alphabeta_sum; - float const factor = s_threeElement[i].factor; + float const alpha2_sum = s_threeElement[i].alpha2_sum; + float const beta2_sum = s_threeElement[i].beta2_sum; + float const alphabeta_sum = s_threeElement[i].alphabeta_sum; + float const factor = s_threeElement[i].factor; Vector3 const alphax_sum = x0 + x1 * 0.5f; Vector3 const betax_sum = m_xsum - alphax_sum; @@ -2647,11 +2654,11 @@ bool ClusterFit::fastCompress4(Vector3 * start, Vector3 * end) for (uint c2 = 0; c2 <= count - c0 - c1; c2++, i++) { - float const alpha2_sum = s_fourElement[i].alpha2_sum; - float const beta2_sum = s_fourElement[i].beta2_sum; - float const alphabeta_sum = s_fourElement[i].alphabeta_sum; - float const factor = s_fourElement[i].factor; - + float const alpha2_sum = s_fourElement[i].alpha2_sum; + float const beta2_sum = s_fourElement[i].beta2_sum; + float const alphabeta_sum = s_fourElement[i].alphabeta_sum; + float const factor = s_fourElement[i].factor; + Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); Vector3 const betax_sum = m_xsum - alphax_sum;