diff --git a/src/nvtt/ClusterFit.cpp b/src/nvtt/ClusterFit.cpp
index 39bbbc4..a83c68c 100644
--- a/src/nvtt/ClusterFit.cpp
+++ b/src/nvtt/ClusterFit.cpp
@@ -1,587 +1,584 @@
-/* -----------------------------------------------------------------------------
-
-    Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
-    Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
-
-    Permission is hereby granted, free of charge, to any person obtaining
-    a copy of this software and associated documentation files (the
-    "Software"), to	deal in the Software without restriction, including
-    without limitation the rights to use, copy, modify, merge, publish,
-    distribute, sublicense, and/or sell copies of the Software, and to
-    permit persons to whom the Software is furnished to do so, subject to
-    the following conditions:
-
-    The above copyright notice and this permission notice shall be included
-    in all copies or substantial portions of the Software.
-
-    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-   -------------------------------------------------------------------------- */
-
-#include "ClusterFit.h"
-#include "nvmath/Fitting.h"
-#include "nvmath/Vector.inl"
-#include "nvmath/ftoi.h"
-#include "nvimage/ColorBlock.h"
-
-#include <float.h> // FLT_MAX
-
-using namespace nv;
-
-ClusterFit::ClusterFit()
-{
-}
-
-void ClusterFit::setColorSet(const Vector3 * colors, const float * weights, int count)
-{
-    // initialise the best error
-#if NVTT_USE_SIMD
-    m_besterror = SimdVector( FLT_MAX );
-    Vector3 metric = m_metric.toVector3();
-#else
-    m_besterror = FLT_MAX;
-    Vector3 metric = m_metric;
-#endif
-
-    m_count = count;
-
-    Vector3 principal = Fit::computePrincipalComponent_PowerMethod(count, colors, weights, metric);
-    //Vector3 principal = Fit::computePrincipalComponent_EigenSolver(count, colors, weights, metric);
-
-    // build the list of values
-    int order[16];
-    float dps[16];
-    for (uint i = 0; i < m_count; ++i)
-    {
-        dps[i] = dot(colors[i], principal);
-        order[i] = i;
-    }
-
-    // stable sort
-    for (uint i = 0; i < m_count; ++i)
-    {
-        for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j)
-        {
-            swap(dps[j], dps[j - 1]);
-            swap(order[j], order[j - 1]);
-        }
-    }
-
-    // weight all the points
-#if NVTT_USE_SIMD
-    m_xxsum = SimdVector( 0.0f );
-    m_xsum = SimdVector( 0.0f );
-#else
-    m_xxsum = Vector3(0.0f);
-    m_xsum = Vector3(0.0f);
-    m_wsum = 0.0f;
-#endif
-	
-    for (uint i = 0; i < m_count; ++i)
-    {
-        int p = order[i];
-#if NVTT_USE_SIMD
-        NV_ALIGN_16 Vector4 tmp(colors[p], 1);
-        m_weighted[i] = SimdVector(tmp.component) * SimdVector(weights[p]);
-        m_xxsum += m_weighted[i] * m_weighted[i];
-        m_xsum += m_weighted[i];
-#else
-        m_weighted[i] = colors[p] * weights[p];
-        m_xxsum += m_weighted[i] * m_weighted[i];
-        m_xsum += m_weighted[i];
-        m_weights[i] = weights[p];
-        m_wsum += m_weights[i];
-#endif
-    }
-}
-
-
-
-void ClusterFit::setColorWeights(Vector4::Arg w)
-{
-#if NVTT_USE_SIMD
-    NV_ALIGN_16 Vector4 tmp(w.xyz(), 1);
-    m_metric = SimdVector(tmp.component);
-#else
-    m_metric = w.xyz();
-#endif
-    m_metricSqr = m_metric * m_metric;
-}
-
-float ClusterFit::bestError() const
-{
-#if NVTT_USE_SIMD
-    SimdVector x = m_xxsum * m_metricSqr;
-    SimdVector error = m_besterror + x.splatX() + x.splatY() + x.splatZ();
-    return error.toFloat();
-#else
-    return m_besterror + dot(m_xxsum, m_metricSqr);
-#endif
-
-}
-
-#if NVTT_USE_SIMD
-
-bool ClusterFit::compress3( Vector3 * start, Vector3 * end )
-{
-    const int count = m_count;
-    const SimdVector one = SimdVector(1.0f);
-    const SimdVector zero = SimdVector(0.0f);
-    const SimdVector half(0.5f, 0.5f, 0.5f, 0.25f);
-    const SimdVector two = SimdVector(2.0);
-    const SimdVector grid( 31.0f, 63.0f, 31.0f, 0.0f );
-    const SimdVector gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
-
-    // declare variables
-    SimdVector beststart = SimdVector( 0.0f );
-    SimdVector bestend = SimdVector( 0.0f );
-    SimdVector besterror = SimdVector( FLT_MAX );
-
-    SimdVector x0 = zero;
-
-    int b0 = 0, b1 = 0;
-
-    // check all possible clusters for this total order
-    for( int c0 = 0; c0 <= count; c0++)
-    {
-        SimdVector x1 = zero;
-
-        for( int c1 = 0; c1 <= count-c0; c1++)
-        {
-            const SimdVector x2 = m_xsum - x1 - x0;
-
-            //Vector3 alphax_sum = x0 + x1 * 0.5f;
-            //float alpha2_sum = w0 + w1 * 0.25f;
-            const SimdVector alphax_sum = multiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum
-            const SimdVector alpha2_sum = alphax_sum.splatW();
-
-            //const Vector3 betax_sum = x2 + x1 * 0.5f;
-            //const float beta2_sum = w2 + w1 * 0.25f;
-            const SimdVector betax_sum = multiplyAdd(x1, half, x2); // betax_sum, beta2_sum
-            const SimdVector beta2_sum = betax_sum.splatW();
-
-            //const float alphabeta_sum = w1 * 0.25f;
-            const SimdVector alphabeta_sum = (x1 * half).splatW(); // alphabeta_sum
-
-            // const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-            const SimdVector factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
-
-            SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
-            SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
-
-            // clamp to the grid
-            a = min( one, max( zero, a ) );
-            b = min( one, max( zero, b ) );
-            a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp;
-            b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp;
-
-            // compute the error (we skip the constant xxsum)
-            SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
-            SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
-            SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 );
-            SimdVector e4 = multiplyAdd( two, e3, e1 );
-
-            // apply the metric to the error term
-            SimdVector e5 = e4 * m_metricSqr;
-            SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
-
-            // keep the solution if it wins
-            if( compareAnyLessThan( error, besterror ) )
-            {
-                besterror = error;
-                beststart = a;
-                bestend = b;
-                b0 = c0;
-                b1 = c1;
-            }
-
-            x1 += m_weighted[c0+c1];
-        }
-
-        x0 += m_weighted[c0];
-    }
-
-    // save the block if necessary
-    if( compareAnyLessThan( besterror, m_besterror ) )
-    {
-
-        *start = beststart.toVector3();
-        *end = bestend.toVector3();
-
-        // save the error
-        m_besterror = besterror;
-
-        return true;
-    }
-
-    return false;
-}
-
-bool ClusterFit::compress4( Vector3 * start, Vector3 * end )
-{
-    const int count = m_count;
-    const SimdVector one = SimdVector(1.0f);
-    const SimdVector zero = SimdVector(0.0f);
-    const SimdVector half = SimdVector(0.5f);
-    const SimdVector two = SimdVector(2.0);
-    const SimdVector onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f );
-    const SimdVector twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f );
-    const SimdVector twonineths = SimdVector( 2.0f/9.0f );
-    const SimdVector grid( 31.0f, 63.0f, 31.0f, 0.0f );
-    const SimdVector gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
-
-    // declare variables
-    SimdVector beststart = SimdVector( 0.0f );
-    SimdVector bestend = SimdVector( 0.0f );
-    SimdVector besterror = SimdVector( FLT_MAX );
-
-    SimdVector x0 = zero;
-    int b0 = 0, b1 = 0, b2 = 0;
-
-    // check all possible clusters for this total order
-    for( int c0 = 0; c0 <= count; c0++)
-    {
-        SimdVector x1 = zero;
-
-        for( int c1 = 0; c1 <= count-c0; c1++)
-        {
-            SimdVector x2 = zero;
-
-            for( int c2 = 0; c2 <= count-c0-c1; c2++)
-            {
-                const SimdVector x3 = m_xsum - x2 - x1 - x0;
-
-                //const Vector3 alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
-                //const float alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
-                const SimdVector alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum
-                const SimdVector alpha2_sum = alphax_sum.splatW();
-
-                //const Vector3 betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f);
-                //const float beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
-                const SimdVector betax_sum = multiplyAdd(x2, twothirds, multiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum
-                const SimdVector beta2_sum = betax_sum.splatW();
-
-                //const float alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
-                const SimdVector alphabeta_sum = twonineths*( x1 + x2 ).splatW(); // alphabeta_sum
-
-                //const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-                const SimdVector factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
-
-                SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
-                SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
-
-                // clamp to the grid
-                a = min( one, max( zero, a ) );
-                b = min( one, max( zero, b ) );
-                a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp;
-                b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp;
-
-                // compute the error (we skip the constant xxsum)
-                SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
-                SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
-                SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 );
-                SimdVector e4 = multiplyAdd( two, e3, e1 );
-
-#if 1
-                // apply the metric to the error term
-                SimdVector e5 = e4 * m_metricSqr;
-                SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
-#else
-                // @@ Is there a horizontal max SIMD instruction?
-                SimdVector error = e4.splatX() + e4.splatY() + e4.splatZ();
-                error *= two;
-                error += max(max(e4.splatX(), e4.splatY()), e4.splatZ());
-                error -= min(min(e4.splatX(), e4.splatY()), e4.splatZ());
-
-#endif
-
-                // keep the solution if it wins
-                if (compareAnyLessThan(error, besterror))
-                {
-                    besterror = error;
-                    beststart = a;
-                    bestend = b;
-                    b0 = c0;
-                    b1 = c1;
-                    b2 = c2;
-                }
-
-                x2 += m_weighted[c0+c1+c2];
-            }
-
-            x1 += m_weighted[c0+c1];
-        }
-
-        x0 += m_weighted[c0];
-    }
-
-    // save the block if necessary
-    if (compareAnyLessThan(besterror, m_besterror))
-    {
-        *start = beststart.toVector3();
-        *end = bestend.toVector3();
-
-        // save the error
-        m_besterror = besterror;
-
-        return true;
-    }
-
-    return false;
-}
-
-#else
-
-inline Vector3 round565(const Vector3 & v) {
-    uint r = ftoi_trunc(v.x * 31.0f);
-    float r0 = float(((r+0) << 3) | ((r+0) >> 2));
-    float r1 = float(((r+1) << 3) | ((r+1) >> 2));
-    if (fabs(v.x - r1) < fabs(v.x - r0)) r = min(r+1, 31U);
-    r = (r << 3) | (r >> 2);
-
-    uint g = ftoi_trunc(v.y * 63.0f);
-    float g0 = float(((g+0) << 2) | ((g+0) >> 4));
-    float g1 = float(((g+1) << 2) | ((g+1) >> 4));
-    if (fabs(v.y - g1) < fabs(v.y - g0)) g = min(g+1, 63U);
-    g = (g << 2) | (g >> 4);
-
-    uint b = ftoi_trunc(v.z * 31.0f);
-    float b0 = float(((b+0) << 3) | ((b+0) >> 2));
-    float b1 = float(((b+1) << 3) | ((b+1) >> 2));
-    if (fabs(v.z - b1) < fabs(v.z - b0)) b = min(b+1, 31U);
-
-    b = (b << 3) | (b >> 2);
-
-    return Vector3(float(r)/255, float(g)/255, float(b)/255);
-}
-
-bool ClusterFit::compress3(Vector3 * start, Vector3 * end)
-{
-    const uint count = m_count;
-    const Vector3 grid( 31.0f, 63.0f, 31.0f );
-    const Vector3 gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
-
-    // declare variables
-    Vector3 beststart( 0.0f );
-    Vector3 bestend( 0.0f );
-    float besterror = FLT_MAX;
-
-    Vector3 x0(0.0f);
-    float w0 = 0.0f;
-
-    int b0 = 0, b1 = 0;
-
-    // check all possible clusters for this total order
-    for (uint c0 = 0; c0 <= count; c0++)
-    {
-        Vector3 x1(0.0f);
-        float w1 = 0.0f;
-
-        for (uint c1 = 0; c1 <= count-c0; c1++)
-        {
-            float w2 = m_wsum - w0 - w1;
-
-            // These factors could be entirely precomputed.
-            float const alpha2_sum = w0 + w1 * 0.25f;
-            float const beta2_sum = w2 + w1 * 0.25f;
-            float const alphabeta_sum = w1 * 0.25f;
-            float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-
-            Vector3 const alphax_sum = x0 + x1 * 0.5f;
-            Vector3 const betax_sum = m_xsum - alphax_sum;
-
-            Vector3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor;
-            Vector3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor;
-
-            // clamp to the grid
-            a = clamp(a, 0, 1);
-            b = clamp(b, 0, 1);
-#if 1
-            a = floor(grid * a + 0.5f) * gridrcp;
-            b = floor(grid * b + 0.5f) * gridrcp;
-#else
-
-            //int ar = ftoi_round(31 * a.x); ar = (ar << 3) | (ar >> 2); a.x = float(ar) / 255.0f;
-            //int ag = ftoi_round(63 * a.y); ar = (ag << 2) | (ag >> 4); a.y = float(ag) / 255.0f;
-            //int ab = ftoi_round(31 * a.z); ar = (ab << 3) | (ab >> 2); a.z = float(ab) / 255.0f;
-            //int br = ftoi_round(31 * b.x); br = (br << 3) | (br >> 2); b.x = float(br) / 255.0f;
-            //int bg = ftoi_round(63 * b.y); br = (bg << 2) | (bg >> 4); b.y = float(bg) / 255.0f;
-            //int bb = ftoi_round(31 * b.z); br = (bb << 3) | (bb >> 2); b.z = float(bb) / 255.0f;
-
-            /*a = floor(a * grid + 0.5f);
-            a.x = (a.x * 8 + floorf(a.x / 4)) / 255.0f;
-            a.y = (a.y * 4 + floorf(a.y / 16)) / 255.0f;
-            a.z = (a.z * 8 + floorf(a.z / 4)) / 255.0f;
-
-            b = floor(b * grid + 0.5f);
-            b.x = (b.x * 8 + floorf(b.x / 4)) / 255.0f;
-            b.y = (b.y * 4 + floorf(b.y / 16)) / 255.0f;
-            b.z = (b.z * 8 + floorf(b.z / 4)) / 255.0f;*/
-
-            a = round565(a);
-            b = round565(b);
-#endif
-
-            // compute the error
-            Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
-
-            // apply the metric to the error term
-            float error = dot(e1, m_metricSqr);
-
-            // keep the solution if it wins
-            if (error < besterror)
-            {
-                besterror = error;
-                beststart = a;
-                bestend = b;
-                b0 = c0;
-                b1 = c1;
-            }
-
-            x1 += m_weighted[c0+c1];
-            w1 += m_weights[c0+c1];
-        }
-
-        x0 += m_weighted[c0];
-        w0 += m_weights[c0];
-    }
-
-    // save the block if necessary
-    if( besterror < m_besterror )
-    {
-
-        *start = beststart;
-        *end = bestend;
-
-        // save the error
-        m_besterror = besterror;
-
-        return true;
-    }
-
-    return false;
-}
-
-bool ClusterFit::compress4(Vector3 * start, Vector3 * end)
-{
-    const uint count = m_count;
-    const Vector3 grid( 31.0f, 63.0f, 31.0f );
-    const Vector3 gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
-
-    // declare variables
-    Vector3 beststart( 0.0f );
-    Vector3 bestend( 0.0f );
-    float besterror = FLT_MAX;
-
-    Vector3 x0(0.0f);
-    float w0 = 0.0f;
-    int b0 = 0, b1 = 0, b2 = 0;
-
-    // check all possible clusters for this total order
-    for (uint c0 = 0; c0 <= count; c0++)
-    {
-        Vector3 x1(0.0f);
-        float w1 = 0.0f;
-
-        for (uint c1 = 0; c1 <= count-c0; c1++)
-        {
-            Vector3 x2(0.0f);
-            float w2 = 0.0f;
-
-            for (uint c2 = 0; c2 <= count-c0-c1; c2++)
-            {
-                float w3 = m_wsum - w0 - w1 - w2;
-
-                float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
-                float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
-                float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
-                float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
-
-                Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
-                Vector3 const betax_sum = m_xsum - alphax_sum;
-
-                Vector3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor;
-                Vector3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
-
-                // clamp to the grid
-                a = clamp(a, 0, 1);
-                b = clamp(b, 0, 1);
-#if 0
-                a = floor(a * grid + 0.5f) * gridrcp;
-                b = floor(b * grid + 0.5f) * gridrcp;
-#else
-                //int ar = ftoi_round(31 * a.x); ar = (ar << 3) | (ar >> 2); a.x = float(ar) / 255.0f;
-                //int ag = ftoi_round(63 * a.y); ar = (ag << 2) | (ag >> 4); a.y = float(ag) / 255.0f;
-                //int ab = ftoi_round(31 * a.z); ar = (ab << 3) | (ab >> 2); a.z = float(ab) / 255.0f;
-                //int br = ftoi_round(31 * b.x); br = (br << 3) | (br >> 2); b.x = float(br) / 255.0f;
-                //int bg = ftoi_round(63 * b.y); br = (bg << 2) | (bg >> 4); b.y = float(bg) / 255.0f;
-                //int bb = ftoi_round(31 * b.z); br = (bb << 3) | (bb >> 2); b.z = float(bb) / 255.0f;
-
-                /*
-                a = floor(a * grid + 0.5f);
-                a.x = (a.x * 8 + floorf(a.x / 4)) / 255.0f;
-                a.y = (a.y * 4 + floorf(a.y / 16)) / 255.0f;
-                a.z = (a.z * 8 + floorf(a.z / 4)) / 255.0f;
-
-                b = floor(b * grid + 0.5f);
-                b.x = (b.x * 8 + floorf(b.x / 4)) / 255.0f;
-                b.y = (b.y * 4 + floorf(b.y / 16)) / 255.0f;
-                b.z = (b.z * 8 + floorf(b.z / 4)) / 255.0f;
-                */
-
-                a = round565(a);
-                b = round565(b);
-#endif
-                // @@ It would be much more accurate to evaluate the error exactly. 
-
-                // compute the error
-                Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
-
-                // apply the metric to the error term
-                float error = dot( e1, m_metricSqr );
-
-                // keep the solution if it wins
-                if (error < besterror)
-                {
-                    besterror = error;
-                    beststart = a;
-                    bestend = b;
-                    b0 = c0;
-                    b1 = c1;
-                    b2 = c2;
-                }
-
-                x2 += m_weighted[c0+c1+c2];
-                w2 += m_weights[c0+c1+c2];
-            }
-
-            x1 += m_weighted[c0+c1];
-            w1 += m_weights[c0+c1];
-        }
-
-        x0 += m_weighted[c0];
-        w0 += m_weights[c0];
-    }
-
-    // save the block if necessary
-    if (besterror < m_besterror)
-    {
-        *start = beststart;
-        *end = bestend;
-
-        // save the error
-        m_besterror = besterror;
-
-        return true;
-    }
-
-    return false;
-}
-
-#endif // NVTT_USE_SIMD
+/* -----------------------------------------------------------------------------
+
+    Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+    Copyright (c) 2006 Ignacio Castano                      icastano@nvidia.com
+
+    Permission is hereby granted, free of charge, to any person obtaining
+    a copy of this software and associated documentation files (the
+    "Software"), to	deal in the Software without restriction, including
+    without limitation the rights to use, copy, modify, merge, publish,
+    distribute, sublicense, and/or sell copies of the Software, and to
+    permit persons to whom the Software is furnished to do so, subject to
+    the following conditions:
+
+    The above copyright notice and this permission notice shall be included
+    in all copies or substantial portions of the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+    OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+    IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+    CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+    SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+   -------------------------------------------------------------------------- */
+
+#include "ClusterFit.h"
+#include "nvmath/Fitting.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/ftoi.h"
+#include "nvimage/ColorBlock.h"
+
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+
+ClusterFit::ClusterFit()
+{
+}
+
+/*
+// find minimum and maximum colors based on bounding box in color space
+inline static void fit_colors_bbox(const Vector3 * colors, int count, Vector3 * restrict c0, Vector3 * restrict c1)
+{
+    *c0 = Vector3(0);
+    *c1 = Vector3(1);
+
+    for (int i = 0; i < count; i++) {
+        *c0 = max(*c0, colors[i]);
+        *c1 = min(*c1, colors[i]);
+    }
+}
+
+inline static void select_diagonal(const Vector3 * colors, int count, Vector3 * restrict c0, Vector3 * restrict c1)
+{
+    Vector3 center = (*c0 + *c1) * 0.5f;
+
+    Vector2 covariance = Vector2(0);
+    for (int i = 0; i < count; i++) {
+        Vector3 t = colors[i] - center;
+        covariance += t.xy() * t.z;
+    }
+
+    float x0 = c0->x;
+    float y0 = c0->y;
+    float x1 = c1->x;
+    float y1 = c1->y;
+
+    if (covariance.x < 0) {
+        swap(x0, x1);
+    }
+    if (covariance.y < 0) {
+        swap(y0, y1);
+    }
+
+    c0->set(x0, y0, c0->z);
+    c1->set(x1, y1, c1->z);
+}
+*/
+
+void ClusterFit::setColorSet(const Vector3 * colors, const float * weights, int count)
+{
+    // initialise the best error
+#if NVTT_USE_SIMD
+    m_besterror = SimdVector( FLT_MAX );
+    Vector3 metric = m_metric.toVector3();
+#else
+    m_besterror = FLT_MAX;
+    Vector3 metric = m_metric;
+#endif
+
+    m_count = count;
+
+    Vector3 principal = Fit::computePrincipalComponent_PowerMethod(count, colors, weights, metric);
+    //Vector3 principal = Fit::computePrincipalComponent_EigenSolver(count, colors, weights, metric);
+
+    /*// This approximation produces slightly lower quality:
+    Vector3 c0, c1;
+    fit_colors_bbox(colors, count, &c0, &c1);
+    select_diagonal(colors, count, &c0, &c1);
+    if (c0 != c1) {
+        principal = normalize(c1 - c0);
+    }*/
+
+    // build the list of values
+    int order[16];
+    float dps[16];
+    for (uint i = 0; i < m_count; ++i)
+    {
+        dps[i] = dot(colors[i], principal);
+        order[i] = i;
+    }
+
+    // stable sort
+    for (uint i = 0; i < m_count; ++i)
+    {
+        for (uint j = i; j > 0 && dps[j] < dps[j - 1]; --j)
+        {
+            swap(dps[j], dps[j - 1]);
+            swap(order[j], order[j - 1]);
+        }
+    }
+
+    // weight all the points
+#if NVTT_USE_SIMD
+    m_xxsum = SimdVector( 0.0f );
+    m_xsum = SimdVector( 0.0f );
+#else
+    m_xxsum = Vector3(0.0f);
+    m_xsum = Vector3(0.0f);
+    m_wsum = 0.0f;
+#endif
+	
+    for (uint i = 0; i < m_count; ++i)
+    {
+        int p = order[i];
+#if NVTT_USE_SIMD
+        NV_ALIGN_16 Vector4 tmp(colors[p], 1);
+        m_weighted[i] = SimdVector(tmp.component) * SimdVector(weights[p]);
+        m_xxsum += m_weighted[i] * m_weighted[i];
+        m_xsum += m_weighted[i];
+#else
+        m_weighted[i] = colors[p] * weights[p];
+        m_xxsum += m_weighted[i] * m_weighted[i];
+        m_xsum += m_weighted[i];
+        m_weights[i] = weights[p];
+        m_wsum += m_weights[i];
+#endif
+    }
+}
+
+
+
+void ClusterFit::setColorWeights(Vector4::Arg w)
+{
+#if NVTT_USE_SIMD
+    NV_ALIGN_16 Vector4 tmp(w.xyz(), 1);
+    m_metric = SimdVector(tmp.component);
+#else
+    m_metric = w.xyz();
+#endif
+    m_metricSqr = m_metric * m_metric;
+}
+
+float ClusterFit::bestError() const
+{
+#if NVTT_USE_SIMD
+    SimdVector x = m_xxsum * m_metricSqr;
+    SimdVector error = m_besterror + x.splatX() + x.splatY() + x.splatZ();
+    return error.toFloat();
+#else
+    return m_besterror + dot(m_xxsum, m_metricSqr);
+#endif
+
+}
+
+#if NVTT_USE_SIMD
+
+bool ClusterFit::compress3( Vector3 * start, Vector3 * end )
+{
+    const int count = m_count;
+    const SimdVector one = SimdVector(1.0f);
+    const SimdVector zero = SimdVector(0.0f);
+    const SimdVector half(0.5f, 0.5f, 0.5f, 0.25f);
+    const SimdVector two = SimdVector(2.0);
+    const SimdVector grid( 31.0f, 63.0f, 31.0f, 0.0f );
+    const SimdVector gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
+
+    // declare variables
+    SimdVector beststart = SimdVector( 0.0f );
+    SimdVector bestend = SimdVector( 0.0f );
+    SimdVector besterror = SimdVector( FLT_MAX );
+
+    SimdVector x0 = zero;
+
+    // check all possible clusters for this total order
+    for( int c0 = 0; c0 <= count; c0++)
+    {
+        SimdVector x1 = zero;
+
+        for( int c1 = 0; c1 <= count-c0; c1++)
+        {
+            const SimdVector x2 = m_xsum - x1 - x0;
+
+            //Vector3 alphax_sum = x0 + x1 * 0.5f;
+            //float alpha2_sum = w0 + w1 * 0.25f;
+            const SimdVector alphax_sum = multiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum
+            const SimdVector alpha2_sum = alphax_sum.splatW();
+
+            //const Vector3 betax_sum = x2 + x1 * 0.5f;
+            //const float beta2_sum = w2 + w1 * 0.25f;
+            const SimdVector betax_sum = multiplyAdd(x1, half, x2); // betax_sum, beta2_sum
+            const SimdVector beta2_sum = betax_sum.splatW();
+
+            //const float alphabeta_sum = w1 * 0.25f;
+            const SimdVector alphabeta_sum = (x1 * half).splatW(); // alphabeta_sum
+
+            // const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+            const SimdVector factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
+
+            SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+            SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+
+            // clamp to the grid
+            a = min( one, max( zero, a ) );
+            b = min( one, max( zero, b ) );
+            a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp;
+            b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp;
+
+            // compute the error (we skip the constant xxsum)
+            SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+            SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+            SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 );
+            SimdVector e4 = multiplyAdd( two, e3, e1 );
+
+            // apply the metric to the error term
+            SimdVector e5 = e4 * m_metricSqr;
+            SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
+
+            // keep the solution if it wins
+            if( compareAnyLessThan( error, besterror ) )
+            {
+                besterror = error;
+                beststart = a;
+                bestend = b;
+            }
+
+            x1 += m_weighted[c0+c1];
+        }
+
+        x0 += m_weighted[c0];
+    }
+
+    // save the block if necessary
+    if( compareAnyLessThan( besterror, m_besterror ) )
+    {
+
+        *start = beststart.toVector3();
+        *end = bestend.toVector3();
+
+        // save the error
+        m_besterror = besterror;
+
+        return true;
+    }
+
+    return false;
+}
+
+bool ClusterFit::compress4( Vector3 * start, Vector3 * end )
+{
+    const int count = m_count;
+    const SimdVector one = SimdVector(1.0f);
+    const SimdVector zero = SimdVector(0.0f);
+    const SimdVector half = SimdVector(0.5f);
+    const SimdVector two = SimdVector(2.0);
+    const SimdVector onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f );
+    const SimdVector twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f );
+    const SimdVector twonineths = SimdVector( 2.0f/9.0f );
+    const SimdVector grid( 31.0f, 63.0f, 31.0f, 0.0f );
+    const SimdVector gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
+
+    // declare variables
+    SimdVector beststart = SimdVector( 0.0f );
+    SimdVector bestend = SimdVector( 0.0f );
+    SimdVector besterror = SimdVector( FLT_MAX );
+
+    SimdVector x0 = zero;
+
+    // check all possible clusters for this total order
+    for( int c0 = 0; c0 <= count; c0++)
+    {
+        SimdVector x1 = zero;
+
+        for( int c1 = 0; c1 <= count-c0; c1++)
+        {
+            SimdVector x2 = zero;
+
+            for( int c2 = 0; c2 <= count-c0-c1; c2++)
+            {
+                const SimdVector x3 = m_xsum - x2 - x1 - x0;
+
+                //const Vector3 alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
+                //const float alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
+                const SimdVector alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum
+                const SimdVector alpha2_sum = alphax_sum.splatW();
+
+                //const Vector3 betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f);
+                //const float beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
+                const SimdVector betax_sum = multiplyAdd(x2, twothirds, multiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum
+                const SimdVector beta2_sum = betax_sum.splatW();
+
+                //const float alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
+                const SimdVector alphabeta_sum = twonineths*( x1 + x2 ).splatW(); // alphabeta_sum
+
+                //const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+                const SimdVector factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );
+
+                SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
+                SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
+
+                // clamp to the grid
+                a = min( one, max( zero, a ) );
+                b = min( one, max( zero, b ) );
+                a = truncate( multiplyAdd( grid, a, half ) ) * gridrcp;
+                b = truncate( multiplyAdd( grid, b, half ) ) * gridrcp;
+
+                // compute the error (we skip the constant xxsum)
+                // error = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+                SimdVector e1 = multiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+                SimdVector e2 = negativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+                SimdVector e3 = negativeMultiplySubtract( b, betax_sum, e2 );
+                SimdVector e4 = multiplyAdd( two, e3, e1 );
+
+                // apply the metric to the error term
+                SimdVector e5 = e4 * m_metricSqr;
+                SimdVector error = e5.splatX() + e5.splatY() + e5.splatZ();
+
+                // keep the solution if it wins
+                if (compareAnyLessThan(error, besterror))
+                {
+                    besterror = error;
+                    beststart = a;
+                    bestend = b;
+                }
+
+                x2 += m_weighted[c0+c1+c2];
+            }
+
+            x1 += m_weighted[c0+c1];
+        }
+
+        x0 += m_weighted[c0];
+    }
+
+    // save the block if necessary
+    if (compareAnyLessThan(besterror, m_besterror))
+    {
+        *start = beststart.toVector3();
+        *end = bestend.toVector3();
+
+        // save the error
+        m_besterror = besterror;
+
+        return true;
+    }
+
+    return false;
+}
+
+#else
+
+static const float midpoints5[32] = {
+    0.015686f, 0.047059f, 0.078431f, 0.111765f, 0.145098f, 0.176471f, 0.207843f, 0.241176f, 0.274510f, 0.305882f, 0.337255f, 0.370588f, 0.403922f, 0.435294f, 0.466667f, 0.5f,
+    0.533333f, 0.564706f, 0.596078f, 0.629412f, 0.662745f, 0.694118f, 0.725490f, 0.758824f, 0.792157f, 0.823529f, 0.854902f, 0.888235f, 0.921569f, 0.952941f, 0.984314f, 1.0f
+};
+
+static const float midpoints6[64] = {
+    0.007843f, 0.023529f, 0.039216f, 0.054902f, 0.070588f, 0.086275f, 0.101961f, 0.117647f, 0.133333f, 0.149020f, 0.164706f, 0.180392f, 0.196078f, 0.211765f, 0.227451f, 0.245098f,
+    0.262745f, 0.278431f, 0.294118f, 0.309804f, 0.325490f, 0.341176f, 0.356863f, 0.372549f, 0.388235f, 0.403922f, 0.419608f, 0.435294f, 0.450980f, 0.466667f, 0.482353f, 0.500000f,
+    0.517647f, 0.533333f, 0.549020f, 0.564706f, 0.580392f, 0.596078f, 0.611765f, 0.627451f, 0.643137f, 0.658824f, 0.674510f, 0.690196f, 0.705882f, 0.721569f, 0.737255f, 0.754902f,
+    0.772549f, 0.788235f, 0.803922f, 0.819608f, 0.835294f, 0.850980f, 0.866667f, 0.882353f, 0.898039f, 0.913725f, 0.929412f, 0.945098f, 0.960784f, 0.976471f, 0.992157f, 1.0f
+};
+
+// This is the ideal way to round, but it's too expensive to do this in the inner loop.
+inline Vector3 round565(const Vector3 & v) {
+    const Vector3 grid(31.0f, 63.0f, 31.0f);
+    const Vector3 gridrcp(1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f);
+
+    Vector3 q = floor(grid * v);
+    q.x += (v.x > midpoints5[int(q.x)]);
+    q.y += (v.y > midpoints6[int(q.y)]);
+    q.z += (v.z > midpoints5[int(q.z)]);
+    q *= gridrcp;
+    return q;
+}
+
+bool ClusterFit::compress3(Vector3 * start, Vector3 * end)
+{
+    const uint count = m_count;
+    const Vector3 grid( 31.0f, 63.0f, 31.0f );
+    const Vector3 gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
+
+    // declare variables
+    Vector3 beststart( 0.0f );
+    Vector3 bestend( 0.0f );
+    float besterror = FLT_MAX;
+
+    Vector3 x0(0.0f);
+    float w0 = 0.0f;
+
+    int b0 = 0, b1 = 0;
+
+    // check all possible clusters for this total order
+    for (uint c0 = 0; c0 <= count; c0++)
+    {
+        Vector3 x1(0.0f);
+        float w1 = 0.0f;
+
+        for (uint c1 = 0; c1 <= count-c0; c1++)
+        {
+            float w2 = m_wsum - w0 - w1;
+
+            // These factors could be entirely precomputed.
+            float const alpha2_sum = w0 + w1 * 0.25f;
+            float const beta2_sum = w2 + w1 * 0.25f;
+            float const alphabeta_sum = w1 * 0.25f;
+            float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+
+            Vector3 const alphax_sum = x0 + x1 * 0.5f;
+            Vector3 const betax_sum = m_xsum - alphax_sum;
+
+            Vector3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor;
+            Vector3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor;
+
+            // clamp to the grid
+            a = clamp(a, 0, 1);
+            b = clamp(b, 0, 1);
+#if 1
+            a = floor(grid * a + 0.5f) * gridrcp;
+            b = floor(grid * b + 0.5f) * gridrcp;
+#else
+            a = round565(a);
+            b = round565(b);
+#endif
+
+            // compute the error
+            Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+
+            // apply the metric to the error term
+            float error = dot(e1, m_metricSqr);
+
+            // keep the solution if it wins
+            if (error < besterror)
+            {
+                besterror = error;
+                beststart = a;
+                bestend = b;
+                b0 = c0;
+                b1 = c1;
+            }
+
+            x1 += m_weighted[c0+c1];
+            w1 += m_weights[c0+c1];
+        }
+
+        x0 += m_weighted[c0];
+        w0 += m_weights[c0];
+    }
+
+    // save the block if necessary
+    if( besterror < m_besterror )
+    {
+
+        *start = beststart;
+        *end = bestend;
+
+        // save the error
+        m_besterror = besterror;
+
+        return true;
+    }
+
+    return false;
+}
+
+bool ClusterFit::compress4(Vector3 * start, Vector3 * end)
+{
+    const uint count = m_count;
+    const Vector3 grid( 31.0f, 63.0f, 31.0f );
+    const Vector3 gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
+
+    // declare variables
+    Vector3 beststart( 0.0f );
+    Vector3 bestend( 0.0f );
+    float besterror = FLT_MAX;
+
+    Vector3 x0(0.0f);
+    float w0 = 0.0f;
+    int b0 = 0, b1 = 0, b2 = 0;
+
+    // check all possible clusters for this total order
+    for (uint c0 = 0; c0 <= count; c0++)
+    {
+        Vector3 x1(0.0f);
+        float w1 = 0.0f;
+
+        for (uint c1 = 0; c1 <= count-c0; c1++)
+        {
+            Vector3 x2(0.0f);
+            float w2 = 0.0f;
+
+            for (uint c2 = 0; c2 <= count-c0-c1; c2++)
+            {
+                float w3 = m_wsum - w0 - w1 - w2;
+
+                float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
+                float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
+                float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
+                float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
+
+                Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
+                Vector3 const betax_sum = m_xsum - alphax_sum;
+
+                Vector3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor;
+                Vector3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor;
+
+                // clamp to the grid
+                a = clamp(a, 0, 1);
+                b = clamp(b, 0, 1);
+#if 1
+                a = floor(a * grid + 0.5f) * gridrcp;
+                b = floor(b * grid + 0.5f) * gridrcp;
+#else
+                a = round565(a);
+                b = round565(b);
+#endif
+                // @@ It would be much more accurate to evaluate the error exactly. 
+
+                // compute the error
+                Vector3 e1 = a*a*alpha2_sum + b*b*beta2_sum + 2.0f*( a*b*alphabeta_sum - a*alphax_sum - b*betax_sum );
+
+                // apply the metric to the error term
+                float error = dot( e1, m_metricSqr );
+
+                // keep the solution if it wins
+                if (error < besterror)
+                {
+                    besterror = error;
+                    beststart = a;
+                    bestend = b;
+                    b0 = c0;
+                    b1 = c1;
+                    b2 = c2;
+                }
+
+                x2 += m_weighted[c0+c1+c2];
+                w2 += m_weights[c0+c1+c2];
+            }
+
+            x1 += m_weighted[c0+c1];
+            w1 += m_weights[c0+c1];
+        }
+
+        x0 += m_weighted[c0];
+        w0 += m_weights[c0];
+    }
+
+    // save the block if necessary
+    if (besterror < m_besterror)
+    {
+        *start = beststart;
+        *end = bestend;
+
+        // save the error
+        m_besterror = besterror;
+
+        return true;
+    }
+
+    return false;
+}
+
+#endif // NVTT_USE_SIMD