Merge changes from the witness.

This commit is contained in:
castano
2014-11-04 17:49:29 +00:00
parent 4cb60cc5ba
commit d019cd7080
86 changed files with 3534 additions and 882 deletions

View File

@ -39,7 +39,7 @@ namespace nv
// Build a cube centered on center and with edge = 2*dist
inline void Box::cube(const Vector3 & center, float dist)
{
setCenterExtents(center, Vector3(dist, dist, dist));
setCenterExtents(center, Vector3(dist));
}
// Build a box, given center and extents.
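The new one-argument call assumes Vector3 exposes a scalar constructor that broadcasts one value to every component; a minimal sketch of that assumed overload (not the library's actual declaration):

// Assumed broadcast constructor, so that Vector3(dist) == Vector3(dist, dist, dist).
struct Vector3 {
    float x, y, z;
    explicit Vector3(float f) : x(f), y(f), z(f) {}            // broadcast
    Vector3(float x, float y, float z) : x(x), y(y), z(z) {}
};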

View File

@ -89,6 +89,7 @@ namespace nv
uint8 b: 8;
#endif
};
uint8 component[4];
uint32 u;
};
};
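The added component[4] member gives a third view of the same 32-bit pixel, next to the named channels and the packed integer. A hypothetical usage sketch (which byte maps to which channel follows the endian-specific struct above):

Color32 c;
c.u = 0xFF336699;                          // write through the packed 32-bit view
uint8 green = c.g;                         // read a named channel
for (int i = 0; i < 4; i++) {
    c.component[i] = 255 - c.component[i]; // or address channels by index
}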

View File

@ -6,6 +6,7 @@
#include "Color.h"
#include "Vector.inl"
#include "ftoi.h"
namespace nv
@ -123,30 +124,30 @@ namespace nv
inline Color32 toColor32(const Vector4 & v)
{
Color32 color;
color.r = toU8(nv::iround(saturate(v.x) * 255));
color.g = toU8(nv::iround(saturate(v.y) * 255));
color.b = toU8(nv::iround(saturate(v.z) * 255));
color.a = toU8(nv::iround(saturate(v.w) * 255));
color.r = U8(ftoi_round(saturate(v.x) * 255));
color.g = U8(ftoi_round(saturate(v.y) * 255));
color.b = U8(ftoi_round(saturate(v.z) * 255));
color.a = U8(ftoi_round(saturate(v.w) * 255));
return color;
}
inline Color32 toColor32_from_bgra(const Vector4 & v)
{
Color32 color;
color.b = toU8(nv::iround(saturate(v.x) * 255));
color.g = toU8(nv::iround(saturate(v.y) * 255));
color.r = toU8(nv::iround(saturate(v.z) * 255));
color.a = toU8(nv::iround(saturate(v.w) * 255));
color.b = U8(ftoi_round(saturate(v.x) * 255));
color.g = U8(ftoi_round(saturate(v.y) * 255));
color.r = U8(ftoi_round(saturate(v.z) * 255));
color.a = U8(ftoi_round(saturate(v.w) * 255));
return color;
}
inline Color32 toColor32_from_argb(const Vector4 & v)
{
Color32 color;
color.a = toU8(nv::iround(saturate(v.x) * 255));
color.r = toU8(nv::iround(saturate(v.y) * 255));
color.g = toU8(nv::iround(saturate(v.z) * 255));
color.b = toU8(nv::iround(saturate(v.w) * 255));
color.a = U8(ftoi_round(saturate(v.x) * 255));
color.r = U8(ftoi_round(saturate(v.y) * 255));
color.g = U8(ftoi_round(saturate(v.z) * 255));
color.b = U8(ftoi_round(saturate(v.w) * 255));
return color;
}
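A brief usage sketch of the rewritten conversion; inputs are assumed to lie in [0,1] (saturate clamps anything outside), and the midpoint result assumes round-to-nearest-even:

Vector4 v(0.25f, 0.5f, 0.75f, 1.0f);
Color32 c = toColor32(v);
// c.r == 64 (63.75 rounds up), c.g == 128 (127.5 rounds to even),
// c.b == 191 (191.25 rounds down), c.a == 255.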

View File

@ -4,10 +4,11 @@
#include "Vector.inl"
#include "Plane.inl"
#include "nvcore/Array.inl"
#include "nvcore/Utils.h" // max, swap
#include <float.h> // FLT_MAX
#include <vector>
//#include <vector>
#include <string.h>
using namespace nv;
@ -329,7 +330,7 @@ void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R);
Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict points)
{
// Store the points in an n x n matrix
std::vector<float> Q(n*n, 0.0f);
Array<float> Q; Q.resize(n*n, 0.0f);
for (int i = 0; i < n; ++i)
{
Q[i*n+0] = points[i].x;
@ -338,8 +339,8 @@ Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict
}
// Alloc space for the SVD outputs
std::vector<float> diag(n, 0.0f);
std::vector<float> R(n*n, 0.0f);
Array<float> diag; diag.resize(n, 0.0f);
Array<float> R; R.resize(n*n, 0.0f);
ArvoSVD(n, n, &Q[0], &diag[0], &R[0]);
@ -350,7 +351,7 @@ Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict
Vector4 nv::Fit::computePrincipalComponent_SVD(int n, const Vector4 *__restrict points)
{
// Store the points in an n x n matrix
std::vector<float> Q(n*n, 0.0f);
Array<float> Q; Q.resize(n*n, 0.0f);
for (int i = 0; i < n; ++i)
{
Q[i*n+0] = points[i].x;
@ -360,8 +361,8 @@ Vector4 nv::Fit::computePrincipalComponent_SVD(int n, const Vector4 *__restrict
}
// Alloc space for the SVD outputs
std::vector<float> diag(n, 0.0f);
std::vector<float> R(n*n, 0.0f);
Array<float> diag; diag.resize(n, 0.0f);
Array<float> R; R.resize(n*n, 0.0f);
ArvoSVD(n, n, &Q[0], &diag[0], &R[0]);
@ -940,7 +941,7 @@ void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R)
float g = 0.0f;
float scale = 0.0f;
std::vector<float> temp(cols, 0.0f);
Array<float> temp; temp.resize(cols, 0.0f);
for( i = 0; i < cols; i++ )
{
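A hypothetical call sketch for the SVD-based fitting shown in this file; computePrincipalComponent_SVD returns the direction of largest variance of the point set:

// Four collinear points; the principal component should be (+/-)(1, 0, 0).
Vector3 points[4] = {
    Vector3(0, 0, 0), Vector3(1, 0, 0),
    Vector3(2, 0, 0), Vector3(3, 0, 0)
};
Vector3 axis = Fit::computePrincipalComponent_SVD(4, points);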

View File

@ -580,56 +580,56 @@ namespace nv {
void nv::half_init_tables()
{
// Init mantissa table.
mantissa_table[0] = 0;

// denormals
for (int i = 1; i < 1024; i++) {
    uint m = i << 13;
    uint e = 0;

    while ((m & 0x00800000) == 0) {
        e -= 0x00800000;
        m <<= 1;
    }
    m &= ~0x00800000;
    e += 0x38800000;
    mantissa_table[i] = m | e;
}

// normals
for (int i = 1024; i < 2048; i++) {
    mantissa_table[i] = (i - 1024) << 13;
}

// Init exponent table.
exponent_table[0] = 0;

for (int i = 1; i < 31; i++) {
    exponent_table[i] = 0x38000000 + (i << 23);
}

exponent_table[31] = 0x7f800000;
exponent_table[32] = 0x80000000;

for (int i = 33; i < 63; i++) {
    exponent_table[i] = 0xb8000000 + ((i - 32) << 23);
}

exponent_table[63] = 0xff800000;

// Init offset table.
offset_table[0] = 0;

for (int i = 1; i < 32; i++) {
    offset_table[i] = 1024;
}

offset_table[32] = 0;

for (int i = 33; i < 64; i++) {
    offset_table[i] = 1024;
}
}
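These inits follow the table-based half-to-float scheme (van der Zijp); the lookup that consumes them amounts to something like the sketch below, with fast_half_to_float in the next hunk being the real implementation:

// Sketch of the lookup fed by the three tables above.
uint32 half_to_float_bits(uint16 h) {
    return mantissa_table[offset_table[h >> 10] + (h & 0x3ff)] + exponent_table[h >> 10];
}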
@ -660,27 +660,27 @@ uint32 nv::fast_half_to_float(uint16 v)
// Mike Acton's code should be fairly easy to vectorize, and that would handle all cases too; the table method might still be faster, though.
static __declspec(align(16)) unsigned half_sign[4] = {0x00008000, 0x00008000, 0x00008000, 0x00008000};
static __declspec(align(16)) unsigned half_exponent[4] = {0x00007C00, 0x00007C00, 0x00007C00, 0x00007C00};
static __declspec(align(16)) unsigned half_mantissa[4] = {0x000003FF, 0x000003FF, 0x000003FF, 0x000003FF};
static __declspec(align(16)) unsigned half_bias_offset[4] = {0x0001C000, 0x0001C000, 0x0001C000, 0x0001C000};
__asm
{
    movaps xmm1, xmm0 // Input in xmm0
    movaps xmm2, xmm0
    andps xmm0, half_sign
    andps xmm1, half_exponent
    andps xmm2, half_mantissa
    paddd xmm1, half_bias_offset
    pslld xmm0, 16
    pslld xmm1, 13
    pslld xmm2, 13
    orps xmm1, xmm2
    orps xmm0, xmm1 // Result in xmm0
}

View File

@ -7,6 +7,10 @@
#include <float.h>
#if !NV_CC_MSVC && !NV_OS_ORBIS
#include <alloca.h>
#endif
using namespace nv;
@ -20,8 +24,7 @@ static bool ludcmp(float **a, int n, int *indx, float *d)
{
const float TINY = 1.0e-20f;
Array<float> vv; // vv stores the implicit scaling of each row.
vv.resize(n);
float * vv = (float*)alloca(sizeof(float) * n); // vv stores the implicit scaling of each row.
*d = 1.0; // No row interchanges yet.
for (int i = 0; i < n; i++) { // Loop over rows to get the implicit scaling information.
@ -149,6 +152,21 @@ bool nv::solveLU(const Matrix & A, const Vector4 & b, Vector4 * x)
return true;
}
// @@ Not tested.
Matrix nv::inverseLU(const Matrix & A)
{
Vector4 Ai[4];
solveLU(A, Vector4(1, 0, 0, 0), &Ai[0]);
solveLU(A, Vector4(0, 1, 0, 0), &Ai[1]);
solveLU(A, Vector4(0, 0, 1, 0), &Ai[2]);
solveLU(A, Vector4(0, 0, 0, 1), &Ai[3]);
return Matrix(Ai[0], Ai[1], Ai[2], Ai[3]);
}
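Each solveLU call recovers one column of the inverse (solving A x = e_i yields column i of A^-1). A hedged sanity sketch, round-tripping a vector:

Matrix A(identity);        // stand-in; any invertible matrix works here
Vector4 v(1, 2, 3, 1);
Vector4 w = transform(inverseLU(A), transform(A, v));   // expect w ~= v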
bool nv::solveLU(const Matrix3 & A, const Vector3 & b, Vector3 * x)
{
nvDebugCheck(x != NULL);
@ -184,7 +202,7 @@ bool nv::solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x)
{
nvDebugCheck(x != NULL);
*x = transform(inverse(A), b);
*x = transform(inverseCramer(A), b);
return true; // @@ Return false if determinant(A) == 0 !
}
@ -198,7 +216,7 @@ bool nv::solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x)
return false;
}
Matrix3 Ai = inverse(A);
Matrix3 Ai = inverseCramer(A);
*x = transform(Ai, b);
@ -207,6 +225,119 @@ bool nv::solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x)
// Inverse using Gaussian elimination. From Jon's code.
Matrix nv::inverse(const Matrix & m) {
Matrix A = m;
Matrix B(identity);
int i, j, k;
float max, t, det, pivot;
det = 1.0;
for (i=0; i<4; i++) { /* eliminate in column i, below diag */
max = -1.;
for (k=i; k<4; k++) /* find pivot for column i */
if (fabs(A(k, i)) > max) {
max = fabs(A(k, i));
j = k;
}
if (max<=0.) return B; /* if no nonzero pivot, PUNT */
if (j!=i) { /* swap rows i and j */
for (k=i; k<4; k++)
swap(A(i, k), A(j, k));
for (k=0; k<4; k++)
swap(B(i, k), B(j, k));
det = -det;
}
pivot = A(i, i);
det *= pivot;
for (k=i+1; k<4; k++) /* only do elems to right of pivot */
A(i, k) /= pivot;
for (k=0; k<4; k++)
B(i, k) /= pivot;
/* we know that A(i, i) will be set to 1, so don't bother to do it */
for (j=i+1; j<4; j++) { /* eliminate in rows below i */
t = A(j, i); /* we're gonna zero this guy */
for (k=i+1; k<4; k++) /* subtract scaled row i from row j */
A(j, k) -= A(i, k)*t; /* (ignore k<=i, we know they're 0) */
for (k=0; k<4; k++)
B(j, k) -= B(i, k)*t;
}
}
/*---------- backward elimination ----------*/
for (i=4-1; i>0; i--) { /* eliminate in column i, above diag */
for (j=0; j<i; j++) { /* eliminate in rows above i */
t = A(j, i); /* we're gonna zero this guy */
for (k=0; k<4; k++) /* subtract scaled row i from row j */
B(j, k) -= B(i, k)*t;
}
}
return B;
}
Matrix3 nv::inverse(const Matrix3 & m) {
Matrix3 A = m;
Matrix3 B(identity);
int i, j, k;
float max, t, det, pivot;
det = 1.0;
for (i=0; i<3; i++) { /* eliminate in column i, below diag */
max = -1.;
for (k=i; k<3; k++) /* find pivot for column i */
if (fabs(A(k, i)) > max) {
max = fabs(A(k, i));
j = k;
}
if (max<=0.) return B; /* if no nonzero pivot, PUNT */
if (j!=i) { /* swap rows i and j */
for (k=i; k<3; k++)
swap(A(i, k), A(j, k));
for (k=0; k<3; k++)
swap(B(i, k), B(j, k));
det = -det;
}
pivot = A(i, i);
det *= pivot;
for (k=i+1; k<3; k++) /* only do elems to right of pivot */
A(i, k) /= pivot;
for (k=0; k<3; k++)
B(i, k) /= pivot;
/* we know that A(i, i) will be set to 1, so don't bother to do it */
for (j=i+1; j<3; j++) { /* eliminate in rows below i */
t = A(j, i); /* we're gonna zero this guy */
for (k=i+1; k<3; k++) /* subtract scaled row i from row j */
A(j, k) -= A(i, k)*t; /* (ignore k<=i, we know they're 0) */
for (k=0; k<3; k++)
B(j, k) -= B(i, k)*t;
}
}
/*---------- backward elimination ----------*/
for (i=3-1; i>0; i--) { /* eliminate in column i, above diag */
for (j=0; j<i; j++) { /* eliminate in rows above i */
t = A(j, i); /* we're gonna zero this guy */
for (k=0; k<3; k++) /* subtract scaled row i from row j */
B(j, k) -= B(i, k)*t;
}
}
return B;
}
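Forward elimination with partial pivoting is mirrored on B, and the backward pass finishes reducing A to the identity, at which point B holds the inverse; note the early `return B` hands back a partially reduced matrix when no nonzero pivot exists. A hedged usage sketch for the Matrix3 path:

Matrix3 S(identity);
S(0, 0) = 2.0f; S(1, 1) = 4.0f; S(2, 2) = 8.0f;   // simple scaling matrix
Matrix3 Si = inverse(S);                          // expect diag(0.5, 0.25, 0.125)
Vector3 v = transform(Si, Vector3(2, 4, 8));      // expect v ~= (1, 1, 1)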
#if 0

View File

@ -83,6 +83,9 @@ namespace nv
void rotate(float theta, float v0, float v1, float v2);
float determinant() const;
void operator+=(const Matrix & m);
void operator-=(const Matrix & m);
void apply(Matrix::Arg m);
private:
@ -90,11 +93,18 @@ namespace nv
};
// Solve equation system using LU decomposition and back-substitution.
extern bool solveLU(const Matrix & m, const Vector4 & b, Vector4 * x);
extern bool solveLU(const Matrix & A, const Vector4 & b, Vector4 * x);
// Solve equation system using Cramer's inverse.
extern bool solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x);
// Compute inverse using LU decomposition.
extern Matrix inverseLU(const Matrix & m);
// Compute inverse using Gaussian elimination and partial pivoting.
extern Matrix inverse(const Matrix & m);
extern Matrix3 inverse(const Matrix3 & m);
} // nv namespace
#endif // NV_MATH_MATRIX_H

View File

@ -195,7 +195,7 @@ namespace nv
}
// Inverse using Cramer's rule.
inline Matrix3 inverse(const Matrix3 & m)
inline Matrix3 inverseCramer(const Matrix3 & m)
{
const float det = m.determinant();
if (equal(det, 0.0f, 0.0f)) {
@ -477,6 +477,25 @@ namespace nv
return m;
}
// Get inverse frustum matrix.
inline Matrix frustumInverse(float xmin, float xmax, float ymin, float ymax, float zNear, float zFar)
{
Matrix m(0.0f);
float one_doubleznear = 1.0f / (2.0f * zNear);
float one_doubleznearzfar = 1.0f / (2.0f * zNear * zFar);
m(0,0) = (xmax - xmin) * one_doubleznear;
m(0,3) = (xmax + xmin) * one_doubleznear;
m(1,1) = (ymax - ymin) * one_doubleznear;
m(1,3) = (ymax + ymin) * one_doubleznear;
m(2,3) = -1;
m(3,2) = -(zFar - zNear) * one_doubleznearzfar;
m(3,3) = (zFar + zNear) * one_doubleznearzfar;
return m;
}
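This is the closed-form inverse of the frustum matrix built elsewhere in this header, exploiting its sparsity. A hedged round-trip sketch:

Matrix P  = frustum(-1, 1, -1, 1, 0.1f, 100.0f);
Matrix Pi = frustumInverse(-1, 1, -1, 1, 0.1f, 100.0f);
Vector4 clip = transform(P, Vector4(0.5f, 0.5f, -1.0f, 1.0f));
Vector4 back = transform(Pi, clip);    // expect back ~= (0.5, 0.5, -1, 1)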
// Get infinite frustum matrix.
inline Matrix frustum(float xmin, float xmax, float ymin, float ymax, float zNear)
{
@ -510,6 +529,18 @@ namespace nv
return frustum(xmin, xmax, ymin, ymax, zNear, zFar);
}
// Get inverse perspective matrix.
inline Matrix perspectiveInverse(float fovy, float aspect, float zNear, float zFar)
{
float xmax = zNear * tan(fovy / 2);
float xmin = -xmax;
float ymax = xmax / aspect;
float ymin = -ymax;
return frustumInverse(xmin, xmax, ymin, ymax, zNear, zFar);
}
// Get infinite perspective matrix.
inline Matrix perspective(float fovy, float aspect, float zNear)
{
@ -544,7 +575,7 @@ namespace nv
}
// Inverse using Cramer's rule.
inline Matrix inverse(Matrix::Arg m)
inline Matrix inverseCramer(Matrix::Arg m)
{
Matrix r;
r.data( 0) = m.data(6)*m.data(11)*m.data(13) - m.data(7)*m.data(10)*m.data(13) + m.data(7)*m.data(9)*m.data(14) - m.data(5)*m.data(11)*m.data(14) - m.data(6)*m.data(9)*m.data(15) + m.data(5)*m.data(10)*m.data(15);
@ -622,6 +653,35 @@ namespace nv
return m;
}
inline void Matrix::operator+=(const Matrix & m)
{
for(int i = 0; i < 16; i++) {
m_data[i] += m.m_data[i];
}
}
inline void Matrix::operator-=(const Matrix & m)
{
for(int i = 0; i < 16; i++) {
m_data[i] -= m.m_data[i];
}
}
inline Matrix operator+(const Matrix & a, const Matrix & b)
{
Matrix m = a;
m += b;
return m;
}
inline Matrix operator-(const Matrix & a, const Matrix & b)
{
Matrix m = a;
m -= b;
return m;
}
} // nv namespace

View File

@ -12,13 +12,13 @@
# if NV_CPU_X86 || NV_CPU_X86_64
# define NV_USE_SSE 2
# endif
//# if defined(__SSE2__)
//# define NV_USE_SSE 2
//# elif defined(__SSE__)
//# define NV_USE_SSE 1
//# else
//# define NV_USE_SSE 0
//# endif
# if defined(__SSE2__)
# define NV_USE_SSE 2
# elif defined(__SSE__)
# define NV_USE_SSE 1
# else
# define NV_USE_SSE 0
# endif
#endif
// Internally set NV_USE_SIMD when either altivec or sse is available.

View File

@ -144,6 +144,6 @@ namespace nv
// Instead we simply have explicit casts:
template <typename T> T to(const nv::Vector2 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector2)); return T(v.x, v.y); }
template <typename T> T to(const nv::Vector3 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector3)); return T(v.x, v.y, v.z); }
template <typename T> T to(const nv::Vector4 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4)); return T(v.x, v.y, v.z, v.z); }
template <typename T> T to(const nv::Vector4 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4)); return T(v.x, v.y, v.z, v.w); }
#endif // NV_MATH_VECTOR_H
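The fix replaces a copy-pasted v.z with v.w in the fourth component. A sketch with a hypothetical target type (Float4 is illustrative, not part of the library):

struct Float4 {
    float x, y, z, w;
    Float4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
};

Float4 f = to<Float4>(Vector4(1, 2, 3, 4));   // f.w is now 4, not 3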

View File

@ -440,14 +440,17 @@ namespace nv
}
// Note, this is the area scaled by 2!
inline float triangleArea(Vector2::Arg v0, Vector2::Arg v1)
{
return (v0.x * v1.y - v0.y * v1.x); // * 0.5f;
}
inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
{
Vector2 v0 = a - c;
Vector2 v1 = b - c;
return (v0.x * v1.y - v0.y * v1.x);
return (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y); // * 0.5f;
//return triangleArea(a-c, b-c);
}
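The new three-point body expands the 2D cross product of the edge vectors (the shoelace form), avoiding the two temporaries; both forms agree. A worked check:

// a = (0,0), b = (2,0), c = (0,2):
// edge form:     v0 = a-c = (0,-2), v1 = b-c = (2,-2)  ->  0*(-2) - (-2)*2 = 4
// shoelace form: 0*0 + 0*0 + 2*2 - 2*0 - 0*0 - 0*2     =   4
// Both return twice the area; the triangle's area is 2.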
template <>
inline uint hash(const Vector2 & v, uint h)
{

src/nvmath/ftoi.h (new executable file, 256 lines added)
View File

@ -0,0 +1,256 @@
// This code is in the public domain -- castano@gmail.com
#pragma once
#ifndef NV_MATH_FTOI_H
#define NV_MATH_FTOI_H
#include "nvmath/nvmath.h"
#include <math.h>
namespace nv
{
// Optimized float to int conversions. See:
// http://cbloomrants.blogspot.com/2009/01/01-17-09-float-to-int.html
// http://www.stereopsis.com/sree/fpu2006.html
// http://assemblyrequired.crashworks.org/2009/01/12/why-you-should-never-cast-floats-to-ints/
// http://chrishecker.com/Miscellaneous_Technical_Articles#Floating_Point
union DoubleAnd64 {
uint64 i;
double d;
};
static const double floatutil_xs_doublemagic = (6755399441055744.0); // 2^52 * 1.5
static const double floatutil_xs_doublemagicdelta = (1.5e-8); // almost .5f = .5f + 1e^(number of exp bit)
static const double floatutil_xs_doublemagicroundeps = (0.5f - floatutil_xs_doublemagicdelta); // almost .5f = .5f - 1e^(number of exp bit)
NV_FORCEINLINE int ftoi_round_xs(double val, double magic) {
#if 1
DoubleAnd64 dunion;
dunion.d = val + magic;
return (int32) dunion.i; // just cast to grab the bottom bits
#else
val += magic;
return ((int*)&val)[0]; // @@ Assumes little endian.
#endif
}
NV_FORCEINLINE int ftoi_round_xs(float val) {
return ftoi_round_xs(val, floatutil_xs_doublemagic);
}
NV_FORCEINLINE int ftoi_floor_xs(float val) {
return ftoi_round_xs(val - floatutil_xs_doublemagicroundeps, floatutil_xs_doublemagic);
}
NV_FORCEINLINE int ftoi_ceil_xs(float val) {
return ftoi_round_xs(val + floatutil_xs_doublemagicroundeps, floatutil_xs_doublemagic);
}
NV_FORCEINLINE int ftoi_trunc_xs(float val) {
return (val<0) ? ftoi_ceil_xs(val) : ftoi_floor_xs(val);
}
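The magic-number trick works because adding 2^52 * 1.5 pins the double's exponent, so the addition rounds at integer granularity and the integer result lands in the low mantissa bits, where the 32-bit read picks it up. A worked trace, assuming IEEE-754 doubles with round-to-nearest:

// ftoi_round_xs(3.7f):
//   3.7 + 6755399441055744.0 = 6755399441055747.7, which rounds to 6755399441055748.0
//   (representable doubles are 1.0 apart at this magnitude);
//   the low 32 bits of that double's bit pattern hold 4, so the int32 cast returns 4 == round(3.7).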
#if NV_CPU_X86 || NV_CPU_X86_64
NV_FORCEINLINE int ftoi_round_sse(float f) {
return _mm_cvt_ss2si(_mm_set_ss(f));
}
NV_FORCEINLINE int ftoi_trunc_sse(float f) {
return _mm_cvtt_ss2si(_mm_set_ss(f));
}
#endif
#if NV_USE_SSE
NV_FORCEINLINE int ftoi_round(float val) {
return ftoi_round_sse(val);
}
NV_FORCEINLINE int ftoi_trunc(float f) {
return ftoi_trunc_sse(f);
}
// We can probably do better than this. See for example:
// http://dss.stephanierct.com/DevBlog/?p=8
NV_FORCEINLINE int ftoi_floor(float val) {
return ftoi_round(floorf(val));
}
NV_FORCEINLINE int ftoi_ceil(float val) {
return ftoi_round(ceilf(val));
}
#else
// In theory this should work with any double floating point math implementation, but it appears that MSVC produces incorrect code
// when SSE2 is targeted and fast math is enabled (/arch:SSE2 & /fp:fast). These problems go away with /fp:precise, which is the default mode.
NV_FORCEINLINE int ftoi_round(float val) {
return ftoi_round_xs(val);
}
NV_FORCEINLINE int ftoi_floor(float val) {
return ftoi_floor_xs(val);
}
NV_FORCEINLINE int ftoi_ceil(float val) {
return ftoi_ceil_xs(val);
}
NV_FORCEINLINE int ftoi_trunc(float f) {
return ftoi_trunc_xs(f);
}
#endif
inline void test_ftoi() {
// Round to nearest integer.
nvCheck(ftoi_round(0.1f) == 0);
nvCheck(ftoi_round(0.6f) == 1);
nvCheck(ftoi_round(-0.2f) == 0);
nvCheck(ftoi_round(-0.7f) == -1);
nvCheck(ftoi_round(10.1f) == 10);
nvCheck(ftoi_round(10.6f) == 11);
nvCheck(ftoi_round(-90.1f) == -90);
nvCheck(ftoi_round(-90.6f) == -91);
nvCheck(ftoi_round(0) == 0);
nvCheck(ftoi_round(1) == 1);
nvCheck(ftoi_round(-1) == -1);
nvCheck(ftoi_round(0.5f) == 0); // How are midpoints rounded? Banker's rounding (round half to even).
nvCheck(ftoi_round(1.5f) == 2);
nvCheck(ftoi_round(2.5f) == 2);
nvCheck(ftoi_round(3.5f) == 4);
nvCheck(ftoi_round(4.5f) == 4);
nvCheck(ftoi_round(-0.5f) == 0);
nvCheck(ftoi_round(-1.5f) == -2);
// Truncation (round down if > 0, round up if < 0).
nvCheck(ftoi_trunc(0.1f) == 0);
nvCheck(ftoi_trunc(0.6f) == 0);
nvCheck(ftoi_trunc(-0.2f) == 0);
nvCheck(ftoi_trunc(-0.7f) == 0); // @@ When using /arch:SSE2 in Win32, msvc produces wrong code for this one; it skips the addition.
nvCheck(ftoi_trunc(1.99f) == 1);
nvCheck(ftoi_trunc(-1.2f) == -1);
// Floor (round down).
nvCheck(ftoi_floor(0.1f) == 0);
nvCheck(ftoi_floor(0.6f) == 0);
nvCheck(ftoi_floor(-0.2f) == -1);
nvCheck(ftoi_floor(-0.7f) == -1);
nvCheck(ftoi_floor(1.99f) == 1);
nvCheck(ftoi_floor(-1.2f) == -2);
nvCheck(ftoi_floor(0) == 0);
nvCheck(ftoi_floor(1) == 1);
nvCheck(ftoi_floor(-1) == -1);
nvCheck(ftoi_floor(2) == 2);
nvCheck(ftoi_floor(-2) == -2);
// Ceil (round up).
nvCheck(ftoi_ceil(0.1f) == 1);
nvCheck(ftoi_ceil(0.6f) == 1);
nvCheck(ftoi_ceil(-0.2f) == 0);
nvCheck(ftoi_ceil(-0.7f) == 0);
nvCheck(ftoi_ceil(1.99f) == 2);
nvCheck(ftoi_ceil(-1.2f) == -1);
nvCheck(ftoi_ceil(0) == 0);
nvCheck(ftoi_ceil(1) == 1);
nvCheck(ftoi_ceil(-1) == -1);
nvCheck(ftoi_ceil(2) == 2);
nvCheck(ftoi_ceil(-2) == -2);
}
// Safe versions using standard casts.
inline int iround(float f)
{
return int(floorf(f + 0.5f));
}
inline int iround(double f)
{
return int(::floor(f + 0.5));
}
inline int ifloor(float f)
{
return int(floorf(f));
}
inline int iceil(float f)
{
return int(ceilf(f));
}
// I'm always confused about which quantizer to use. I think we should choose a quantizer based on how the values are expanded later, and that generally means using the 'exact endpoints' rule.
// Some notes from cbloom: http://cbloomrants.blogspot.com/2011/07/07-26-11-pixel-int-to-float-options.html
// Quantize a float in the [0,1] range, using exact end points or uniform bins.
inline float quantizeFloat(float x, uint bits, bool exactEndPoints = true) {
nvDebugCheck(bits <= 16);
float range = float(1 << bits);
if (exactEndPoints) {
return floorf(x * (range-1) + 0.5f) / (range-1);
}
else {
return (floorf(x * range) + 0.5f) / range;
}
}
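A worked comparison of the two modes for x = 0.3 at 2 bits (range = 4):

// exact end points: floorf(0.3f * 3 + 0.5f) / 3 = floorf(1.4f) / 3 = 1.0f / 3 ~= 0.3333  (grid 0, 1/3, 2/3, 1)
// uniform bins:     (floorf(0.3f * 4) + 0.5f) / 4 = 1.5f / 4 = 0.375                     (grid 1/8, 3/8, 5/8, 7/8)
float q0 = quantizeFloat(0.3f, 2);          // ~0.3333f
float q1 = quantizeFloat(0.3f, 2, false);   // 0.375f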
// This is the most common rounding mode:
//
//    0     1     2     3
// |___|_______|_______|___|
// 0                       1
//
// You get that if you take the unit floating point number, multiply by 'N-1', and round to nearest. That is, `i = round(f * (N-1))`.
// You reconstruct the original float by dividing by 'N-1': `f = i / (N-1)`

// With uniform bins, the reconstruction points sit at the bin centers instead:
//
//    0     1     2     3
// |_____|_____|_____|_____|
// 0                       1
/*enum BinningMode {
RoundMode_ExactEndPoints,
RoundMode_UniformBins,
};*/
template <int N>
inline uint unitFloatToFixed(float f) {
return ftoi_round(f * ((1<<N)-1));
}
inline uint8 unitFloatToFixed8(float f) {
return (uint8)unitFloatToFixed<8>(f);
}
inline uint16 unitFloatToFixed16(float f) {
return (uint16)unitFloatToFixed<16>(f);
}
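A short usage sketch; the midpoint case inherits ftoi_round's round-to-even behavior:

uint8  a = unitFloatToFixed8(1.0f);     // 255
uint8  b = unitFloatToFixed8(0.5f);     // 0.5 * 255 = 127.5 -> 128 under round-to-even
uint16 c = unitFloatToFixed16(0.25f);   // 0.25 * 65535 = 16383.75 -> 16384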
} // nv
#endif // NV_MATH_FTOI_H

View File

@ -14,6 +14,13 @@
#include <float.h> // finite, isnan
#endif
#if NV_CPU_X86 || NV_CPU_X86_64
//#include <intrin.h>
#include <xmmintrin.h>
#endif
// Function linkage
#if NVMATH_SHARED
#ifdef NVMATH_EXPORTS
@ -28,6 +35,37 @@
#define NVMATH_CLASS
#endif // NVMATH_SHARED
// Set some reasonable defaults.
#ifndef NV_USE_ALTIVEC
# define NV_USE_ALTIVEC NV_CPU_PPC
//# define NV_USE_ALTIVEC defined(__VEC__)
#endif
#ifndef NV_USE_SSE
# if NV_CPU_X86_64
// x64 always supports at least SSE2
# define NV_USE_SSE 2
# elif NV_CC_MSVC && defined(_M_IX86_FP)
// Also on x86 with the /arch:SSE flag in MSVC.
# define NV_USE_SSE _M_IX86_FP // 1=SSE, 2=SSE2
# elif defined(__SSE__)
# define NV_USE_SSE 1
# elif defined(__SSE2__)
# define NV_USE_SSE 2
# else
// Otherwise we assume no SSE.
# define NV_USE_SSE 0
# endif
#endif
// Internally set NV_USE_SIMD when either altivec or sse is available.
#if NV_USE_ALTIVEC && NV_USE_SSE
# error "Cannot enable both altivec and sse!"
#endif
#ifndef PI
#define PI float(3.1415926535897932384626433833)
#endif
@ -179,26 +217,6 @@ namespace nv
inline float cube(float f) { return f * f * f; }
inline int cube(int i) { return i * i * i; }
// @@ Float to int conversions to be optimized at some point. See:
// http://cbloomrants.blogspot.com/2009/01/01-17-09-float-to-int.html
// http://www.stereopsis.com/sree/fpu2006.html
// http://assemblyrequired.crashworks.org/2009/01/12/why-you-should-never-cast-floats-to-ints/
// http://chrishecker.com/Miscellaneous_Technical_Articles#Floating_Point
inline int iround(float f)
{
return int(floorf(f + 0.5f));
}
inline int ifloor(float f)
{
return int(floorf(f));
}
inline int iceil(float f)
{
return int(ceilf(f));
}
inline float frac(float f)
{
return f - floor(f);
@ -242,21 +260,6 @@ namespace nv
return 0;
}
// I'm always confused about which quantizer to use. I think we should choose a quantizer based on how the values are expanded later and this is generally using the 'exact endpoints' rule.
// Quantize a float in the [0,1] range, using exact end points or uniform bins.
inline float quantizeFloat(float x, uint bits, bool exactEndPoints = true) {
nvDebugCheck(bits <= 16);
float range = float(1 << bits);
if (exactEndPoints) {
return floorf(x * (range-1) + 0.5f) / (range-1);
}
else {
return (floorf(x * range) + 0.5f) / range;
}
}
union Float754 {
unsigned int raw;
float value;