From fa468b04ab991fa25b930183cb1caa9923548452 Mon Sep 17 00:00:00 2001
From: castano <castano@95f4ed2b-212e-0410-8b90-d31948207fce>
Date: Tue, 14 Feb 2012 16:31:25 +0000
Subject: [PATCH] Merge changes from The Witness.

---
 src/nvmath/Box.cpp          |  35 ++++++
 src/nvmath/Box.h            |   4 +
 src/nvmath/Box.inl          |  38 +++----
 src/nvmath/Fitting.cpp      | 218 ++++++++++++++++++++++++++++++++++--
 src/nvmath/Fitting.h        |   4 +
 src/nvmath/Half.cpp         | 124 ++++++++++++++++----
 src/nvmath/Half.h           |  13 ++-
 src/nvmath/SimdVector_SSE.h |  80 +++++++------
 src/nvmath/Vector.inl       |  55 ++++++++-
 src/nvmath/nvmath.h         |   7 +-
 10 files changed, 492 insertions(+), 86 deletions(-)

diff --git a/src/nvmath/Box.cpp b/src/nvmath/Box.cpp
index 7368f3c..48e8517 100644
--- a/src/nvmath/Box.cpp
+++ b/src/nvmath/Box.cpp
@@ -29,3 +29,38 @@ float nv::distanceSquared(const Box &box, const Vector3 &point) {
 /*bool nv::overlap(const Box &box, const Sphere &sphere) {
     return distanceSquared(box, sphere.center) < sphere.radius * sphere.radius;
 }*/
+
+
+bool nv::intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t /*= NULL*/) {
+    // Precompute these in ray structure?
+    int sdx = (id.x < 0);
+    int sdy = (id.y < 0);
+    int sdz = (id.z < 0);
+
+    float tmin = (box.corner(  sdx).x - p.x) * id.x;
+    float tmax = (box.corner(1-sdx).x - p.x) * id.x;
+    float tymin = (box.corner(  sdy).y - p.y) * id.y;
+    float tymax = (box.corner(1-sdy).y - p.y) * id.y;
+
+    if ((tmin > tymax) || (tymin > tmax)) 
+        return false;
+
+    if (tymin > tmin) tmin = tymin;
+    if (tymax < tmax) tmax = tymax;
+
+    float tzmin = (box.corner(  sdz).z - p.z) * id.z;
+    float tzmax = (box.corner(1-sdz).z - p.z) * id.z;
+
+    if ((tmin > tzmax) || (tzmin > tmax)) 
+        return false;
+
+    if (tzmin > tmin) tmin = tzmin;
+    if (tzmax < tmax) tmax = tzmax;
+
+    if (tmax < 0) 
+        return false;
+
+    if (t != NULL) *t = tmin;
+
+    return true;
+}
diff --git a/src/nvmath/Box.h b/src/nvmath/Box.h
index 74e4bf3..71fff03 100644
--- a/src/nvmath/Box.h
+++ b/src/nvmath/Box.h
@@ -74,6 +74,8 @@ namespace nv
 
         friend Stream & operator<< (Stream & s, Box & box);
 
+        const Vector3 & corner(int i) const { return (&minCorner)[i]; }
+
         Vector3 minCorner;
         Vector3 maxCorner;
     };
@@ -81,6 +83,8 @@ namespace nv
     float distanceSquared(const Box &box, const Vector3 &point);
     bool overlap(const Box &box, const Sphere &sphere);
 
+    // p is ray origin, id is inverse ray direction.
+    bool intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t);
 
 } // nv namespace
 
diff --git a/src/nvmath/Box.inl b/src/nvmath/Box.inl
index 9b69828..55d738e 100644
--- a/src/nvmath/Box.inl
+++ b/src/nvmath/Box.inl
@@ -12,51 +12,51 @@
 namespace nv
 {
     // Default ctor.
-    Box::Box() { };
+    inline Box::Box() { };
 
     // Copy ctor.
-    Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { }
+    inline Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { }
 
     // Init ctor.
-    Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { }
+    inline Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { }
 
     // Assignment operator.
-    Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; }
+    inline Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; }
 
     // Clear the bounds.
-    void Box::clearBounds()
+    inline void Box::clearBounds()
     {
         minCorner.set(FLT_MAX, FLT_MAX, FLT_MAX);
         maxCorner.set(-FLT_MAX, -FLT_MAX, -FLT_MAX);
     }
 
     // Build a cube centered on center and with edge = 2*dist
-    void Box::cube(const Vector3 & center, float dist)
+    inline void Box::cube(const Vector3 & center, float dist)
     {
         setCenterExtents(center, Vector3(dist, dist, dist));
     }
 
     // Build a box, given center and extents.
-    void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents)
+    inline void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents)
     {
         minCorner = center - extents;
         maxCorner = center + extents;
     }
 
     // Get box center.
-    Vector3 Box::center() const
+    inline Vector3 Box::center() const
     {
         return (minCorner + maxCorner) * 0.5f;
     }
 
     // Return extents of the box.
-    Vector3 Box::extents() const
+    inline Vector3 Box::extents() const
     {
         return (maxCorner - minCorner) * 0.5f;
     }
 
     // Return extents of the box.
-    float Box::extents(uint axis) const
+    inline float Box::extents(uint axis) const
     {
         nvDebugCheck(axis < 3);
         if (axis == 0) return (maxCorner.x - minCorner.x) * 0.5f;
@@ -67,55 +67,55 @@ namespace nv
     }
 
     // Add a point to this box.
-    void Box::addPointToBounds(const Vector3 & p)
+    inline void Box::addPointToBounds(const Vector3 & p)
     {
         minCorner = min(minCorner, p);
         maxCorner = max(maxCorner, p);
     }
 
     // Add a box to this box.
-    void Box::addBoxToBounds(const Box & b)
+    inline void Box::addBoxToBounds(const Box & b)
     {
         minCorner = min(minCorner, b.minCorner);
         maxCorner = max(maxCorner, b.maxCorner);
     }
 
     // Translate box.
-    void Box::translate(const Vector3 & v)
+    inline void Box::translate(const Vector3 & v)
     {
         minCorner += v;
         maxCorner += v;
     }
 
     // Scale the box.
-    void Box::scale(float s)
+    inline void Box::scale(float s)
     {
         minCorner *= s;
         maxCorner *= s;
     }
 
     // Expand the box by a fixed amount.
-    void Box::expand(float r) {
+    inline void Box::expand(float r) {
         minCorner -= Vector3(r,r,r);
         maxCorner += Vector3(r,r,r);
     }
 
     // Get the area of the box.
-    float Box::area() const
+    inline float Box::area() const
     {
         const Vector3 d = extents();
         return 8.0f * (d.x*d.y + d.x*d.z + d.y*d.z);
     }	
 
     // Get the volume of the box.
-    float Box::volume() const
+    inline float Box::volume() const
     {
         Vector3 d = extents();
         return 8.0f * (d.x * d.y * d.z);
     }
 
     // Return true if the box contains the given point.
-    bool Box::contains(const Vector3 & p) const
+    inline bool Box::contains(const Vector3 & p) const
     {
         return 
             minCorner.x < p.x && minCorner.y < p.y && minCorner.z < p.z &&
@@ -123,7 +123,7 @@ namespace nv
     }
 
     // Split the given box in 8 octants and assign the ith one to this box.
-    void Box::setOctant(const Box & box, const Vector3 & center, int i)
+    inline void Box::setOctant(const Box & box, const Vector3 & center, int i)
     {
         minCorner = box.minCorner;
         maxCorner = box.maxCorner;
diff --git a/src/nvmath/Fitting.cpp b/src/nvmath/Fitting.cpp
index 57c755a..0106ef8 100644
--- a/src/nvmath/Fitting.cpp
+++ b/src/nvmath/Fitting.cpp
@@ -159,24 +159,228 @@ Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points)
     float matrix[6];
     Vector3 centroid = computeCovariance(n, points, matrix);
 
-    if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0)
+    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
     {
         // If no plane defined, then return a horizontal plane.
         return Plane(Vector3(0, 0, 1), centroid);
     }
 
-#pragma NV_MESSAGE("TODO: need to write an eigensolver!")
+    float eigenValues[3];
+    Vector3 eigenVectors[3];
+    if (!eigenSolveSymmetric(matrix, eigenValues, eigenVectors)) {
+        // If no plane defined, then return a horizontal plane.
+        return Plane(Vector3(0, 0, 1), centroid);
+    }
 
-    // - Numerical Recipes in C is a good reference. Householder transforms followed by QL decomposition seems to be the best approach.
-    // - The one from magic-tools is now LGPL. For the 3D case it uses a cubic root solver, which is not very accurate.
-    // - Charles' Galaxy3 contains an implementation of the tridiagonalization method, but is under BPL.
+    return Plane(eigenVectors[2], centroid);
+}
+
+bool nv::Fit::isPlanar(int n, const Vector3 * points, float epsilon/*=NV_EPSILON*/)
+{
+    // compute the centroid and covariance
+    float matrix[6];
+    Vector3 centroid = computeCovariance(n, points, matrix);
 
-    //EigenSolver3 solver(matrix);
+    float eigenValues[3];
+    Vector3 eigenVectors[3];
+    if (!eigenSolveSymmetric(matrix, eigenValues, eigenVectors)) {
+        return false;
+    }
 
-    return Plane();
+    return eigenValues[2] < epsilon;
 }
 
 
+
+// Tridiagonal solver from Charles Bloom. 
+// Householder transforms followed by QL decomposition. 
+// Seems to be based on the code from Numerical Recipes in C.
+
+static void EigenSolver_Tridiagonal(double mat[3][3],double * diag,double * subd);
+static bool EigenSolver_QLAlgorithm(double mat[3][3],double * diag,double * subd);
+
+bool nv::Fit::eigenSolveSymmetric(float matrix[6], float eigenValues[3], Vector3 eigenVectors[3])
+{
+    nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL);
+
+    double subd[3];
+    double diag[3];
+    double work[3][3];
+
+    work[0][0] = matrix[0];
+    work[0][1] = work[1][0] = matrix[1];
+    work[0][2] = work[2][0] = matrix[2];
+    work[1][1] = matrix[3];
+    work[1][2] = work[2][1] = matrix[4];
+    work[2][2] = matrix[5];
+
+    EigenSolver_Tridiagonal(work, diag, subd);
+    if (!EigenSolver_QLAlgorithm(work, diag, subd))
+    {
+        for (int i = 0; i < 3; i++) {
+            eigenValues[i] = 0;
+            eigenVectors[i] = Vector3(0);
+        }
+        return false;
+    }
+
+    for (int i = 0; i < 3; i++) {
+        eigenValues[i] = (float)diag[i];
+    }
+
+    // eigenvectors are the columns; make them the rows :
+
+    for (int i=0; i < 3; i++)
+    {
+        for (int j = 0; j < 3; j++)
+        {
+            eigenVectors[j].component[i] = (float) work[i][j];
+        }
+    }
+
+    // shuffle to sort by singular value :
+    if (eigenValues[2] > eigenValues[0] && eigenValues[2] > eigenValues[1])
+    {
+        swap(eigenValues[0], eigenValues[2]);
+        swap(eigenVectors[0], eigenVectors[2]);
+    }
+    if (eigenValues[1] > eigenValues[0])
+    {
+        swap(eigenValues[0], eigenValues[1]);
+        swap(eigenVectors[0], eigenVectors[1]);
+    }
+    if (eigenValues[2] > eigenValues[1])
+    {
+        swap(eigenValues[1], eigenValues[2]);
+        swap(eigenVectors[1], eigenVectors[2]);
+    }
+
+    nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2]);
+    nvDebugCheck(eigenValues[1] >= eigenValues[2]);
+
+    return true;
+}
+
+static void EigenSolver_Tridiagonal(double mat[3][3],double * diag,double * subd)
+{
+    // Householder reduction T = Q^t M Q
+    //   Input:   
+    //     mat, symmetric 3x3 matrix M
+    //   Output:  
+    //     mat, orthogonal matrix Q
+    //     diag, diagonal entries of T
+    //     subd, subdiagonal entries of T (T is symmetric)
+    const double epsilon = 1e-08f;
+
+    double a = mat[0][0];
+    double b = mat[0][1];
+    double c = mat[0][2];
+    double d = mat[1][1];
+    double e = mat[1][2];
+    double f = mat[2][2];
+
+    diag[0] = a;
+    subd[2] = 0.f;
+    if ( fabs(c) >= epsilon )
+    {
+        const double ell = sqrt(b*b+c*c);
+        b /= ell;
+        c /= ell;
+        const double q = 2*b*e+c*(f-d);
+        diag[1] = d+c*q;
+        diag[2] = f-c*q;
+        subd[0] = ell;
+        subd[1] = e-b*q;
+        mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
+        mat[1][0] = 0; mat[1][1] = b; mat[1][2] = c;
+        mat[2][0] = 0; mat[2][1] = c; mat[2][2] = -b;
+    }
+    else
+    {
+        diag[1] = d;
+        diag[2] = f;
+        subd[0] = b;
+        subd[1] = e;
+        mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
+        mat[1][0] = 0; mat[1][1] = 1; mat[1][2] = 0;
+        mat[2][0] = 0; mat[2][1] = 0; mat[2][2] = 1;
+    }
+}
+
+static bool EigenSolver_QLAlgorithm(double mat[3][3],double * diag,double * subd)
+{
+    // QL iteration with implicit shifting to reduce matrix from tridiagonal
+    // to diagonal
+    const int maxiter = 32;
+
+    for (int ell = 0; ell < 3; ell++)
+    {
+        int iter;
+        for (iter = 0; iter < maxiter; iter++)
+        {
+            int m;
+            for (m = ell; m <= 1; m++)
+            {
+                double dd = fabs(diag[m]) + fabs(diag[m+1]);
+                if ( fabs(subd[m]) + dd == dd )
+                    break;
+            }
+            if ( m == ell )
+                break;
+
+            double g = (diag[ell+1]-diag[ell])/(2*subd[ell]);
+            double r = sqrt(g*g+1);
+            if ( g < 0 )
+                g = diag[m]-diag[ell]+subd[ell]/(g-r);
+            else
+                g = diag[m]-diag[ell]+subd[ell]/(g+r);
+            double s = 1, c = 1, p = 0;
+            for (int i = m-1; i >= ell; i--)
+            {
+                double f = s*subd[i], b = c*subd[i];
+                if ( fabs(f) >= fabs(g) )
+                {
+                    c = g/f;
+                    r = sqrt(c*c+1);
+                    subd[i+1] = f*r;
+                    c *= (s = 1/r);
+                }
+                else
+                {
+                    s = f/g;
+                    r = sqrt(s*s+1);
+                    subd[i+1] = g*r;
+                    s *= (c = 1/r);
+                }
+                g = diag[i+1]-p;
+                r = (diag[i]-g)*s+2*b*c;
+                p = s*r;
+                diag[i+1] = g+p;
+                g = c*r-b;
+
+                for (int k = 0; k < 3; k++)
+                {
+                    f = mat[k][i+1];
+                    mat[k][i+1] = s*mat[k][i]+c*f;
+                    mat[k][i] = c*mat[k][i]-s*f;
+                }
+            }
+            diag[ell] -= p;
+            subd[ell] = g;
+            subd[m] = 0;
+        }
+
+        if ( iter == maxiter )
+            // should not get here under normal circumstances
+            return false;
+    }
+
+    return true;
+}
+
+
+
+
 int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster)
 {
     // Compute principal component.
diff --git a/src/nvmath/Fitting.h b/src/nvmath/Fitting.h
index 84d397f..73fad82 100644
--- a/src/nvmath/Fitting.h
+++ b/src/nvmath/Fitting.h
@@ -23,6 +23,10 @@ namespace nv
         Vector3 computePrincipalComponent(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
 
         Plane bestPlane(int n, const Vector3 * points);
+        bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON);
+
+        bool eigenSolveSymmetric (float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]);
+
 
         // Returns number of clusters [1-4].
         int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster);
diff --git a/src/nvmath/Half.cpp b/src/nvmath/Half.cpp
index b0bd2a8..7bd9122 100644
--- a/src/nvmath/Half.cpp
+++ b/src/nvmath/Half.cpp
@@ -488,16 +488,20 @@ nv::half_to_float( uint16 h )
 }
 
 
+// @@ This code appears to be wrong.
 // @@ These tables could be smaller.
-static uint32 mantissa_table[2048];
-static uint32 exponent_table[64];
-static uint32 offset_table[64];
+namespace nv {
+    uint32 mantissa_table[2048];
+    uint32 exponent_table[64];
+    uint32 offset_table[64];
+}
 
 void nv::half_init_tables()
 {
     // Init mantissa table.
 	mantissa_table[0] = 0;
 
+    // denormals
 	for (int i = 1; i < 1024; i++) {
 		uint m = i << 13;
 		uint e = 0;
@@ -511,8 +515,9 @@ void nv::half_init_tables()
 		mantissa_table[i] = m | e;
 	}
 
+    // normals
     for (int i = 1024; i < 2048; i++) {
-		mantissa_table[i] = 0x38000000 + ((i - 1024) << 13);
+		mantissa_table[i] = (i - 1024) << 13;
     }
 
 
@@ -520,17 +525,17 @@ void nv::half_init_tables()
 	exponent_table[0] = 0;
 
     for (int i = 1; i < 31; i++) {
-		exponent_table[i] = (i << 23);
+		exponent_table[i] = 0x38000000 + (i << 23);
     }
 
-	exponent_table[31] = 0x47800000;
+	exponent_table[31] = 0x7f800000;
 	exponent_table[32] = 0x80000000;
 
     for (int i = 33; i < 63; i++) {
-		exponent_table[i] = 0x80000000 + ((i - 32) << 23);
+		exponent_table[i] = 0xb8000000 + ((i - 32) << 23);
     }
 
-	exponent_table[63] = 0xC7800000;
+	exponent_table[63] = 0xff800000;
 
 
     // Init offset table.
@@ -545,22 +550,11 @@ void nv::half_init_tables()
     for (int i = 33; i < 64; i++) {
 		offset_table[i] = 1024;
     }
-
-    /*for (int i = 0; i < 64; i++) {
-        offset_table[i] = ((i & 31) != 0) * 1024;
-    }*/
-}
-
-// Fast half to float conversion based on:
-// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
-uint32 nv::fast_half_to_float(uint16 h)
-{
-	uint exp = h >> 10;
-	return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
 }
 
 
 #if 0
+
 // Inaccurate conversion suggested at the ffmpeg mailing list:
 // http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html
 uint32 nv::fast_half_to_float(uint16 v)
@@ -609,4 +603,94 @@ __asm
 }
 
 
+#endif
+
+#if 0
+// These version computes the tables at compile time:
+// http://gamedev.stackexchange.com/questions/17326/conversion-of-a-number-from-single-precision-floating-point-representation-to-a
+
+/* This method is faster than the OpenEXR implementation (very often
+ * used, eg. in Ogre), with the additional benefit of rounding, inspired
+ * by James Tursa�s half-precision code. */
+static inline uint16_t float_to_half_branch(uint32_t x)
+{
+    uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
+    uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
+    unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
+
+    /* If zero, or denormal, or exponent underflows too much for a denormal
+     * half, return signed zero. */
+    if (e < 103)
+        return bits;
+
+    /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
+    if (e > 142)
+    {
+        bits |= 0x7c00u;
+        /* If exponent was 0xff and one mantissa bit was set, it means NaN,
+         * not Inf, so make sure we set one mantissa bit too. */
+        bits |= e == 255 && (x & 0x007fffffu);
+        return bits;
+    }
+
+    /* If exponent underflows but not too much, return a denormal */
+    if (e < 113)
+    {
+        m |= 0x0800u;
+        /* Extra rounding may overflow and set mantissa to 0 and exponent
+         * to 1, which is OK. */
+        bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
+        return bits;
+    }
+
+    bits |= ((e - 112) << 10) | (m >> 1);
+    /* Extra rounding. An overflow will set mantissa to 0 and increment
+     * the exponent, which is OK. */
+    bits += m & 1;
+    return bits;
+}
+
+/* These macros implement a finite iterator useful to build lookup
+ * tables. For instance, S64(0) will call S1(x) for all values of x
+ * between 0 and 63.
+ * Due to the exponential behaviour of the calls, the stress on the
+ * compiler may be important. */
+#define S4(x)    S1((x)),   S1((x)+1),     S1((x)+2),     S1((x)+3)
+#define S16(x)   S4((x)),   S4((x)+4),     S4((x)+8),     S4((x)+12)
+#define S64(x)   S16((x)),  S16((x)+16),   S16((x)+32),   S16((x)+48)
+#define S256(x)  S64((x)),  S64((x)+64),   S64((x)+128),  S64((x)+192)
+#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
+
+/* Lookup table-based algorithm from �Fast Half Float Conversions�
+ * by Jeroen van der Zijp, November 2008. No rounding is performed,
+ * and some NaN values may be incorrectly converted to Inf. */
+static inline uint16_t float_to_half_nobranch(uint32_t x)
+{
+    static uint16_t const basetable[512] =
+    {
+#define S1(i) (((i) < 103) ? 0x0000 : \
+               ((i) < 113) ? 0x0400 >> (113 - (i)) : \
+               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
+        S256(0),
+#undef S1
+#define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
+                         ((i) < 113) ? 0x0400 >> (113 - (i)) : \
+                         ((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
+        S256(0),
+#undef S1
+    };
+
+    static uint8_t const shifttable[512] =
+    {
+#define S1(i) (((i) < 103) ? 24 : \
+               ((i) < 113) ? 126 - (i) : \
+               ((i) < 143 || (i) == 255) ? 13 : 24)
+        S256(0), S256(0),
+#undef S1
+    };
+
+    uint16_t bits = basetable[(x >> 23) & 0x1ff];
+    bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
+    return bits;
+}
 #endif
\ No newline at end of file
diff --git a/src/nvmath/Half.h b/src/nvmath/Half.h
index f732e93..53fceb6 100644
--- a/src/nvmath/Half.h
+++ b/src/nvmath/Half.h
@@ -11,7 +11,18 @@ namespace nv {
 
     void half_init_tables();
 
-    uint32 fast_half_to_float(uint16 h);
+    extern uint32 mantissa_table[2048];
+    extern uint32 exponent_table[64];
+    extern uint32 offset_table[64];
+
+    // Fast half to float conversion based on:
+    // http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
+    inline uint32 fast_half_to_float(uint16 h)
+    {
+	    uint exp = h >> 10;
+	    return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
+    }
+
 
     inline uint16 to_half(float c) {
         union { float f; uint32 u; } f;
diff --git a/src/nvmath/SimdVector_SSE.h b/src/nvmath/SimdVector_SSE.h
index a8d52d9..c816de2 100644
--- a/src/nvmath/SimdVector_SSE.h
+++ b/src/nvmath/SimdVector_SSE.h
@@ -33,8 +33,15 @@
 #include <emmintrin.h>
 #endif
 
+// See this for ideas:
+// http://molecularmusings.wordpress.com/2011/10/18/simdifying-multi-platform-math/
+
+
 namespace nv {
 
+#define NV_SIMD_NATIVE NV_FORCEINLINE
+#define NV_SIMD_INLINE inline
+
     class SimdVector
     {
     public:
@@ -42,45 +49,47 @@ namespace nv {
 
         typedef SimdVector const& Arg;
 
-        NV_FORCEINLINE SimdVector() {}
-        NV_FORCEINLINE explicit SimdVector(float f) : vec(_mm_set1_ps(f)) {}
-        NV_FORCEINLINE explicit SimdVector(__m128 v) : vec(v) {}
-
-        NV_FORCEINLINE explicit SimdVector(NV_ALIGN_16 Vector4 v) {
-            vec = _mm_load_ps( v.component );
+        NV_SIMD_NATIVE SimdVector() {}
+        
+        NV_SIMD_NATIVE explicit SimdVector(__m128 v) : vec(v) {}
+        
+        NV_SIMD_NATIVE explicit SimdVector(float f) {
+            vec = _mm_set1_ps(f);
         }
 
-        NV_FORCEINLINE explicit SimdVector(const float * v) {
+        NV_SIMD_NATIVE explicit SimdVector(const float * v)
+        {
             vec = _mm_load_ps( v );
         }
 
-        NV_FORCEINLINE SimdVector(float x, float y, float z, float w) {
+        NV_SIMD_NATIVE SimdVector(float x, float y, float z, float w)
+        {
             vec = _mm_setr_ps( x, y, z, w );
         }
 
-        NV_FORCEINLINE SimdVector(const SimdVector & arg) : vec(arg.vec) {}
+        NV_SIMD_NATIVE SimdVector(const SimdVector & arg) : vec(arg.vec) {}
 
-        NV_FORCEINLINE SimdVector & operator=(const SimdVector & arg) {
+        NV_SIMD_NATIVE SimdVector & operator=(const SimdVector & arg)
+        {
             vec = arg.vec;
             return *this;
         }
 
-
-        float toFloat() const 
+        NV_SIMD_INLINE float toFloat() const 
         {
             NV_ALIGN_16 float f;
             _mm_store_ss(&f, vec);
             return f;
         }
 
-        Vector3 toVector3() const
+        NV_SIMD_INLINE Vector3 toVector3() const
         {
             NV_ALIGN_16 float c[4];
             _mm_store_ps( c, vec );
             return Vector3( c[0], c[1], c[2] );
         }
 
-        Vector4 toVector4() const
+        NV_SIMD_INLINE Vector4 toVector4() const
         {
             NV_ALIGN_16 float c[4];
             _mm_store_ps( c, vec );
@@ -88,57 +97,60 @@ namespace nv {
         }
 
 #define SSE_SPLAT( a ) ((a) | ((a) << 2) | ((a) << 4) | ((a) << 6))
-        NV_FORCEINLINE SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); }
-        NV_FORCEINLINE SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); }
-        NV_FORCEINLINE SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); }
-        NV_FORCEINLINE SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); }
+        NV_SIMD_NATIVE SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); }
+        NV_SIMD_NATIVE SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); }
+        NV_SIMD_NATIVE SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); }
+        NV_SIMD_NATIVE SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); }
 #undef SSE_SPLAT
 
-        NV_FORCEINLINE  SimdVector & operator+=( Arg v ) {
+        NV_SIMD_NATIVE SimdVector& operator+=( Arg v )
+        {
             vec = _mm_add_ps( vec, v.vec );
             return *this;
         }
 
-        NV_FORCEINLINE SimdVector & operator-=( Arg v ) {
+        NV_SIMD_NATIVE SimdVector& operator-=( Arg v )
+        {
             vec = _mm_sub_ps( vec, v.vec );
             return *this;
         }
 
-        NV_FORCEINLINE SimdVector & operator*=( Arg v ) {
+        NV_SIMD_NATIVE SimdVector& operator*=( Arg v )
+        {
             vec = _mm_mul_ps( vec, v.vec );
             return *this;
         }
     };
 
 
-    NV_FORCEINLINE SimdVector operator+(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_NATIVE SimdVector operator+( SimdVector::Arg left, SimdVector::Arg right  )
     {
         return SimdVector( _mm_add_ps( left.vec, right.vec ) );
     }
 
-    NV_FORCEINLINE SimdVector operator-(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_NATIVE SimdVector operator-( SimdVector::Arg left, SimdVector::Arg right  )
     {
         return SimdVector( _mm_sub_ps( left.vec, right.vec ) );
     }
 
-    NV_FORCEINLINE SimdVector operator*(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_NATIVE SimdVector operator*( SimdVector::Arg left, SimdVector::Arg right  )
     {
         return SimdVector( _mm_mul_ps( left.vec, right.vec ) );
     }
 
     // Returns a*b + c
-    NV_FORCEINLINE SimdVector multiplyAdd(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c)
+    NV_SIMD_INLINE SimdVector multiplyAdd( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
     {
         return SimdVector( _mm_add_ps( _mm_mul_ps( a.vec, b.vec ), c.vec ) );
     }
 
-    // Returns -( a*b - c ) = c - a*b
-    NV_FORCEINLINE SimdVector negativeMultiplySubtract(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c)
+    // Returns -( a*b - c )
+    NV_SIMD_INLINE SimdVector negativeMultiplySubtract( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
     {
         return SimdVector( _mm_sub_ps( c.vec, _mm_mul_ps( a.vec, b.vec ) ) );
     }
 
-    inline SimdVector reciprocal( SimdVector::Arg v )
+    NV_SIMD_INLINE SimdVector reciprocal( SimdVector::Arg v )
     {
         // get the reciprocal estimate
         __m128 estimate = _mm_rcp_ps( v.vec );
@@ -148,17 +160,17 @@ namespace nv {
         return SimdVector( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) );
     }
 
-    NV_FORCEINLINE SimdVector min(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_NATIVE SimdVector min( SimdVector::Arg left, SimdVector::Arg right )
     {
         return SimdVector( _mm_min_ps( left.vec, right.vec ) );
     }
 
-    NV_FORCEINLINE SimdVector max(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_NATIVE SimdVector max( SimdVector::Arg left, SimdVector::Arg right )
     {
         return SimdVector( _mm_max_ps( left.vec, right.vec ) );
     }
 
-    inline SimdVector truncate( SimdVector::Arg v )
+    NV_SIMD_INLINE SimdVector truncate( SimdVector::Arg v )
     {
 #if (NV_USE_SSE == 1)
         // convert to ints
@@ -179,12 +191,12 @@ namespace nv {
 #endif
     }
 
-    NV_FORCEINLINE SimdVector compareEqual(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_NATIVE SimdVector compareEqual( SimdVector::Arg left, SimdVector::Arg right )
     {
         return SimdVector( _mm_cmpeq_ps( left.vec, right.vec ) );
     }
 
-    inline SimdVector select(SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits)
+    NV_SIMD_INLINE SimdVector select( SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits )
     {
         __m128 a = _mm_andnot_ps( bits.vec, off.vec );
         __m128 b = _mm_and_ps( bits.vec, on.vec );
@@ -192,7 +204,7 @@ namespace nv {
         return SimdVector( _mm_or_ps( a, b ) );
     }
 
-    inline bool compareAnyLessThan(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_INLINE bool compareAnyLessThan( SimdVector::Arg left, SimdVector::Arg right ) 
     {
         __m128 bits = _mm_cmplt_ps( left.vec, right.vec );
         int value = _mm_movemask_ps( bits );
diff --git a/src/nvmath/Vector.inl b/src/nvmath/Vector.inl
index d2d3341..d252e4d 100644
--- a/src/nvmath/Vector.inl
+++ b/src/nvmath/Vector.inl
@@ -314,6 +314,12 @@ namespace nv
         return scale(v, 1.0f/s);
     }
 
+    inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y);
+    }
+
     inline float dot(Vector2::Arg a, Vector2::Arg b)
     {
         return a.x * b.x + a.y * b.y;
@@ -381,6 +387,16 @@ namespace nv
         return Vector2(max(a.x, b.x), max(a.y, b.y));
     }
 
+    inline Vector2 clamp(Vector2::Arg v, float min, float max)
+    {
+        return Vector2(clamp(v.x, min, max), clamp(v.y, min, max));
+    }
+
+    inline Vector2 saturate(Vector2::Arg v)
+    {
+        return Vector2(saturate(v.x), saturate(v.y));
+    }
+
     inline bool isFinite(Vector2::Arg v)
     {
         return isFinite(v.x) && isFinite(v.y);
@@ -394,6 +410,7 @@ namespace nv
         return vf;
     }
 
+    // Note, this is the area scaled by 2!
     inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
     {
 	    Vector2 v0 = a - c;
@@ -500,6 +517,16 @@ namespace nv
         return sqrtf(lengthSquared(v));
     }
 
+    inline float distance(Vector3::Arg a, Vector3::Arg b)
+    {
+        return length(a - b);
+    }
+
+    inline float distanceSquared(Vector3::Arg a, Vector3::Arg b)
+    {
+        return lengthSquared(a - b);
+    }
+
     inline float inverseLength(Vector3::Arg v)
     {
         return 1.0f / sqrtf(lengthSquared(v));
@@ -557,6 +584,11 @@ namespace nv
         return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max));
     }
 
+    inline Vector3 saturate(Vector3::Arg v)
+    {
+        return Vector3(saturate(v.x), saturate(v.y), saturate(v.z));
+    }
+
     inline Vector3 floor(Vector3::Arg v)
     {
         return Vector3(floorf(v.x), floorf(v.y), floorf(v.z));
@@ -580,6 +612,11 @@ namespace nv
         return vf;
     }
 
+    inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n)
+    {
+	    return v - (2 * dot(v, n)) * n;
+    }
+
 
 
     // Vector4
@@ -627,9 +664,15 @@ namespace nv
         return scale(v, 1.0f/s);
     }
 
-    inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
+    /*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
     {
         return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s);
+    }*/
+
+    inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w);
     }
 
     inline float dot(Vector4::Arg a, Vector4::Arg b)
@@ -699,6 +742,16 @@ namespace nv
         return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
     }
 
+    inline Vector4 clamp(Vector4::Arg v, float min, float max)
+    {
+        return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max));
+    }
+
+    inline Vector4 saturate(Vector4::Arg v)
+    {
+        return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w));
+    }
+
     inline bool isFinite(Vector4::Arg v)
     {
         return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w);
diff --git a/src/nvmath/nvmath.h b/src/nvmath/nvmath.h
index 20c64d9..798ddce 100644
--- a/src/nvmath/nvmath.h
+++ b/src/nvmath/nvmath.h
@@ -155,15 +155,14 @@ namespace nv
     }
 
 #if NV_CC_MSVC
-    inline float log2f(float x)
+    NV_FORCEINLINE float log2f(float x)
     {
         nvCheck(x >= 0);
         return logf(x) / logf(2.0f);
     }
-
-    inline float exp2f(float x)
+    NV_FORCEINLINE float exp2f(float x)
     {
-        return powf(2, x);
+        return powf(2.0f, x);
     }
 #endif