Merge changes from The Witness.

12 years ago · fa468b04ab
parent 06c170b41b
commit fa468b04ab
10 changed files with 492 additions and 86 deletions
--- a/src/nvmath/Box.cpp
+++ b/src/nvmath/Box.cpp
@ -29,3 +29,38 @@ float nv::distanceSquared(const Box &box, const Vector3 &point) {
 /*bool nv::overlap(const Box &box, const Sphere &sphere) {
    return distanceSquared(box, sphere.center) < sphere.radius * sphere.radius;
 }*/
 bool nv::intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t /*= NULL*/) {
    // Precompute these in ray structure?
    int sdx = (id.x < 0);
    int sdy = (id.y < 0);
    int sdz = (id.z < 0);
    float tmin = (box.corner(  sdx).x - p.x) * id.x;
    float tmax = (box.corner(1-sdx).x - p.x) * id.x;
    float tymin = (box.corner(  sdy).y - p.y) * id.y;
    float tymax = (box.corner(1-sdy).y - p.y) * id.y;
    if ((tmin > tymax) || (tymin > tmax)) 
        return false;
    if (tymin > tmin) tmin = tymin;
    if (tymax < tmax) tmax = tymax;
    float tzmin = (box.corner(  sdz).z - p.z) * id.z;
    float tzmax = (box.corner(1-sdz).z - p.z) * id.z;
    if ((tmin > tzmax) || (tzmin > tmax)) 
        return false;
    if (tzmin > tmin) tmin = tzmin;
    if (tzmax < tmax) tmax = tzmax;
    if (tmax < 0) 
        return false;
    if (t != NULL) *t = tmin;
    return true;
 }
--- a/src/nvmath/Box.h
+++ b/src/nvmath/Box.h
@ -74,6 +74,8 @@ namespace nv
        friend Stream & operator<< (Stream & s, Box & box);
        const Vector3 & corner(int i) const { return (&minCorner)[i]; }
        Vector3 minCorner;
        Vector3 maxCorner;
    };
@ -81,6 +83,8 @@ namespace nv
    float distanceSquared(const Box &box, const Vector3 &point);
    bool overlap(const Box &box, const Sphere &sphere);
    // p is ray origin, id is inverse ray direction.
    bool intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t);
 } // nv namespace
--- a/src/nvmath/Box.inl
+++ b/src/nvmath/Box.inl
@ -12,51 +12,51 @@
 namespace nv
 {
    // Default ctor.
-    Box::Box() { };
+    inline Box::Box() { };
    // Copy ctor.
-    Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { }
+    inline Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { }
    // Init ctor.
-    Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { }
+    inline Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { }
    // Assignment operator.
-    Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; }
+    inline Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; }
    // Clear the bounds.
-    void Box::clearBounds()
+    inline void Box::clearBounds()
    {
        minCorner.set(FLT_MAX, FLT_MAX, FLT_MAX);
        maxCorner.set(-FLT_MAX, -FLT_MAX, -FLT_MAX);
    }
    // Build a cube centered on center and with edge = 2*dist
-    void Box::cube(const Vector3 & center, float dist)
+    inline void Box::cube(const Vector3 & center, float dist)
    {
        setCenterExtents(center, Vector3(dist, dist, dist));
    }
    // Build a box, given center and extents.
-    void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents)
+    inline void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents)
    {
        minCorner = center - extents;
        maxCorner = center + extents;
    }
    // Get box center.
-    Vector3 Box::center() const
+    inline Vector3 Box::center() const
    {
        return (minCorner + maxCorner) * 0.5f;
    }
    // Return extents of the box.
-    Vector3 Box::extents() const
+    inline Vector3 Box::extents() const
    {
        return (maxCorner - minCorner) * 0.5f;
    }
    // Return extents of the box.
-    float Box::extents(uint axis) const
+    inline float Box::extents(uint axis) const
    {
        nvDebugCheck(axis < 3);
        if (axis == 0) return (maxCorner.x - minCorner.x) * 0.5f;
@ -67,55 +67,55 @@ namespace nv
    }
    // Add a point to this box.
-    void Box::addPointToBounds(const Vector3 & p)
+    inline void Box::addPointToBounds(const Vector3 & p)
    {
        minCorner = min(minCorner, p);
        maxCorner = max(maxCorner, p);
    }
    // Add a box to this box.
-    void Box::addBoxToBounds(const Box & b)
+    inline void Box::addBoxToBounds(const Box & b)
    {
        minCorner = min(minCorner, b.minCorner);
        maxCorner = max(maxCorner, b.maxCorner);
    }
    // Translate box.
-    void Box::translate(const Vector3 & v)
+    inline void Box::translate(const Vector3 & v)
    {
        minCorner += v;
        maxCorner += v;
    }
    // Scale the box.
-    void Box::scale(float s)
+    inline void Box::scale(float s)
    {
        minCorner *= s;
        maxCorner *= s;
    }
    // Expand the box by a fixed amount.
-    void Box::expand(float r) {
+    inline void Box::expand(float r) {
        minCorner -= Vector3(r,r,r);
        maxCorner += Vector3(r,r,r);
    }
    // Get the area of the box.
-    float Box::area() const
+    inline float Box::area() const
    {
        const Vector3 d = extents();
        return 8.0f * (d.x*d.y + d.x*d.z + d.y*d.z);
    }	
    // Get the volume of the box.
-    float Box::volume() const
+    inline float Box::volume() const
    {
        Vector3 d = extents();
        return 8.0f * (d.x * d.y * d.z);
    }
    // Return true if the box contains the given point.
-    bool Box::contains(const Vector3 & p) const
+    inline bool Box::contains(const Vector3 & p) const
    {
        return 
            minCorner.x < p.x && minCorner.y < p.y && minCorner.z < p.z &&
@ -123,7 +123,7 @@ namespace nv
    }
    // Split the given box in 8 octants and assign the ith one to this box.
-    void Box::setOctant(const Box & box, const Vector3 & center, int i)
+    inline void Box::setOctant(const Box & box, const Vector3 & center, int i)
    {
        minCorner = box.minCorner;
        maxCorner = box.maxCorner;
--- a/src/nvmath/Fitting.cpp
+++ b/src/nvmath/Fitting.cpp
@ -159,24 +159,228 @@ Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points)
    float matrix[6];
    Vector3 centroid = computeCovariance(n, points, matrix);
-    if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0)
+    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
    {
        // If no plane defined, then return a horizontal plane.
        return Plane(Vector3(0, 0, 1), centroid);
    }
-#pragma NV_MESSAGE("TODO: need to write an eigensolver!")
+    float eigenValues[3];
    Vector3 eigenVectors[3];
    if (!eigenSolveSymmetric(matrix, eigenValues, eigenVectors)) {
        // If no plane defined, then return a horizontal plane.
        return Plane(Vector3(0, 0, 1), centroid);
    }
-    // - Numerical Recipes in C is a good reference. Householder transforms followed by QL decomposition seems to be the best approach.
+    return Plane(eigenVectors[2], centroid);
-    // - The one from magic-tools is now LGPL. For the 3D case it uses a cubic root solver, which is not very accurate.
+}
-    // - Charles' Galaxy3 contains an implementation of the tridiagonalization method, but is under BPL.
+
 bool nv::Fit::isPlanar(int n, const Vector3 * points, float epsilon/*=NV_EPSILON*/)
 {
    // compute the centroid and covariance
    float matrix[6];
    Vector3 centroid = computeCovariance(n, points, matrix);
-    //EigenSolver3 solver(matrix);
+    float eigenValues[3];
    Vector3 eigenVectors[3];
    if (!eigenSolveSymmetric(matrix, eigenValues, eigenVectors)) {
        return false;
    }
-    return Plane();
+    return eigenValues[2] < epsilon;
 }
 // Tridiagonal solver from Charles Bloom. 
 // Householder transforms followed by QL decomposition. 
 // Seems to be based on the code from Numerical Recipes in C.
 static void EigenSolver_Tridiagonal(double mat[3][3],double * diag,double * subd);
 static bool EigenSolver_QLAlgorithm(double mat[3][3],double * diag,double * subd);
 bool nv::Fit::eigenSolveSymmetric(float matrix[6], float eigenValues[3], Vector3 eigenVectors[3])
 {
    nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL);
    double subd[3];
    double diag[3];
    double work[3][3];
    work[0][0] = matrix[0];
    work[0][1] = work[1][0] = matrix[1];
    work[0][2] = work[2][0] = matrix[2];
    work[1][1] = matrix[3];
    work[1][2] = work[2][1] = matrix[4];
    work[2][2] = matrix[5];
    EigenSolver_Tridiagonal(work, diag, subd);
    if (!EigenSolver_QLAlgorithm(work, diag, subd))
    {
        for (int i = 0; i < 3; i++) {
            eigenValues[i] = 0;
            eigenVectors[i] = Vector3(0);
        }
        return false;
    }
    for (int i = 0; i < 3; i++) {
        eigenValues[i] = (float)diag[i];
    }
    // eigenvectors are the columns; make them the rows :
    for (int i=0; i < 3; i++)
    {
        for (int j = 0; j < 3; j++)
        {
            eigenVectors[j].component[i] = (float) work[i][j];
        }
    }
    // shuffle to sort by singular value :
    if (eigenValues[2] > eigenValues[0] && eigenValues[2] > eigenValues[1])
    {
        swap(eigenValues[0], eigenValues[2]);
        swap(eigenVectors[0], eigenVectors[2]);
    }
    if (eigenValues[1] > eigenValues[0])
    {
        swap(eigenValues[0], eigenValues[1]);
        swap(eigenVectors[0], eigenVectors[1]);
    }
    if (eigenValues[2] > eigenValues[1])
    {
        swap(eigenValues[1], eigenValues[2]);
        swap(eigenVectors[1], eigenVectors[2]);
    }
    nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2]);
    nvDebugCheck(eigenValues[1] >= eigenValues[2]);
    return true;
 }
 static void EigenSolver_Tridiagonal(double mat[3][3],double * diag,double * subd)
 {
    // Householder reduction T = Q^t M Q
    //   Input:   
    //     mat, symmetric 3x3 matrix M
    //   Output:  
    //     mat, orthogonal matrix Q
    //     diag, diagonal entries of T
    //     subd, subdiagonal entries of T (T is symmetric)
    const double epsilon = 1e-08f;
    double a = mat[0][0];
    double b = mat[0][1];
    double c = mat[0][2];
    double d = mat[1][1];
    double e = mat[1][2];
    double f = mat[2][2];
    diag[0] = a;
    subd[2] = 0.f;
    if ( fabs(c) >= epsilon )
    {
        const double ell = sqrt(b*b+c*c);
        b /= ell;
        c /= ell;
        const double q = 2*b*e+c*(f-d);
        diag[1] = d+c*q;
        diag[2] = f-c*q;
        subd[0] = ell;
        subd[1] = e-b*q;
        mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
        mat[1][0] = 0; mat[1][1] = b; mat[1][2] = c;
        mat[2][0] = 0; mat[2][1] = c; mat[2][2] = -b;
    }
    else
    {
        diag[1] = d;
        diag[2] = f;
        subd[0] = b;
        subd[1] = e;
        mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
        mat[1][0] = 0; mat[1][1] = 1; mat[1][2] = 0;
        mat[2][0] = 0; mat[2][1] = 0; mat[2][2] = 1;
    }
 }
 static bool EigenSolver_QLAlgorithm(double mat[3][3],double * diag,double * subd)
 {
    // QL iteration with implicit shifting to reduce matrix from tridiagonal
    // to diagonal
    const int maxiter = 32;
    for (int ell = 0; ell < 3; ell++)
    {
        int iter;
        for (iter = 0; iter < maxiter; iter++)
        {
            int m;
            for (m = ell; m <= 1; m++)
            {
                double dd = fabs(diag[m]) + fabs(diag[m+1]);
                if ( fabs(subd[m]) + dd == dd )
                    break;
            }
            if ( m == ell )
                break;
            double g = (diag[ell+1]-diag[ell])/(2*subd[ell]);
            double r = sqrt(g*g+1);
            if ( g < 0 )
                g = diag[m]-diag[ell]+subd[ell]/(g-r);
            else
                g = diag[m]-diag[ell]+subd[ell]/(g+r);
            double s = 1, c = 1, p = 0;
            for (int i = m-1; i >= ell; i--)
            {
                double f = s*subd[i], b = c*subd[i];
                if ( fabs(f) >= fabs(g) )
                {
                    c = g/f;
                    r = sqrt(c*c+1);
                    subd[i+1] = f*r;
                    c *= (s = 1/r);
                }
                else
                {
                    s = f/g;
                    r = sqrt(s*s+1);
                    subd[i+1] = g*r;
                    s *= (c = 1/r);
                }
                g = diag[i+1]-p;
                r = (diag[i]-g)*s+2*b*c;
                p = s*r;
                diag[i+1] = g+p;
                g = c*r-b;
                for (int k = 0; k < 3; k++)
                {
                    f = mat[k][i+1];
                    mat[k][i+1] = s*mat[k][i]+c*f;
                    mat[k][i] = c*mat[k][i]-s*f;
                }
            }
            diag[ell] -= p;
            subd[ell] = g;
            subd[m] = 0;
        }
        if ( iter == maxiter )
            // should not get here under normal circumstances
            return false;
    }
    return true;
 }
 int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster)
 {
    // Compute principal component.
--- a/src/nvmath/Fitting.h
+++ b/src/nvmath/Fitting.h
@ -23,6 +23,10 @@ namespace nv
        Vector3 computePrincipalComponent(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
        Plane bestPlane(int n, const Vector3 * points);
        bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON);
        bool eigenSolveSymmetric (float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]);
        // Returns number of clusters [1-4].
        int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster);
--- a/src/nvmath/Half.cpp
+++ b/src/nvmath/Half.cpp
@ -488,16 +488,20 @@ nv::half_to_float( uint16 h )
 }
 // @@ This code appears to be wrong.
 // @@ These tables could be smaller.
-static uint32 mantissa_table[2048];
+namespace nv {
-static uint32 exponent_table[64];
+    uint32 mantissa_table[2048];
-static uint32 offset_table[64];
+    uint32 exponent_table[64];
    uint32 offset_table[64];
 }
 void nv::half_init_tables()
 {
    // Init mantissa table.
 	mantissa_table[0] = 0;
    // denormals
 	for (int i = 1; i < 1024; i++) {
 		uint m = i << 13;
 		uint e = 0;
@ -511,8 +515,9 @@ void nv::half_init_tables()
 		mantissa_table[i] = m | e;
 	}
    // normals
    for (int i = 1024; i < 2048; i++) {
-		mantissa_table[i] = 0x38000000 + ((i - 1024) << 13);
+		mantissa_table[i] = (i - 1024) << 13;
    }
@ -520,17 +525,17 @@ void nv::half_init_tables()
 	exponent_table[0] = 0;
    for (int i = 1; i < 31; i++) {
-		exponent_table[i] = (i << 23);
+		exponent_table[i] = 0x38000000 + (i << 23);
    }
-	exponent_table[31] = 0x47800000;
+	exponent_table[31] = 0x7f800000;
 	exponent_table[32] = 0x80000000;
    for (int i = 33; i < 63; i++) {
-		exponent_table[i] = 0x80000000 + ((i - 32) << 23);
+		exponent_table[i] = 0xb8000000 + ((i - 32) << 23);
    }
-	exponent_table[63] = 0xC7800000;
+	exponent_table[63] = 0xff800000;
    // Init offset table.
@ -545,22 +550,11 @@ void nv::half_init_tables()
    for (int i = 33; i < 64; i++) {
 		offset_table[i] = 1024;
    }
    /*for (int i = 0; i < 64; i++) {
        offset_table[i] = ((i & 31) != 0) * 1024;
    }*/
 }
 // Fast half to float conversion based on:
 // http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
 uint32 nv::fast_half_to_float(uint16 h)
 {
 	uint exp = h >> 10;
 	return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
 }
 #if 0
 // Inaccurate conversion suggested at the ffmpeg mailing list:
 // http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html
 uint32 nv::fast_half_to_float(uint16 v)
@ -609,4 +603,94 @@ __asm
 }
 #endif
 #if 0
 // These version computes the tables at compile time:
 // http://gamedev.stackexchange.com/questions/17326/conversion-of-a-number-from-single-precision-floating-point-representation-to-a
 /* This method is faster than the OpenEXR implementation (very often
 * used, eg. in Ogre), with the additional benefit of rounding, inspired
 * by James Tursa’s half-precision code. */
 static inline uint16_t float_to_half_branch(uint32_t x)
 {
    uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
    uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
    unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
    /* If zero, or denormal, or exponent underflows too much for a denormal
     * half, return signed zero. */
    if (e < 103)
        return bits;
    /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
    if (e > 142)
    {
        bits |= 0x7c00u;
        /* If exponent was 0xff and one mantissa bit was set, it means NaN,
         * not Inf, so make sure we set one mantissa bit too. */
        bits |= e == 255 && (x & 0x007fffffu);
        return bits;
    }
    /* If exponent underflows but not too much, return a denormal */
    if (e < 113)
    {
        m |= 0x0800u;
        /* Extra rounding may overflow and set mantissa to 0 and exponent
         * to 1, which is OK. */
        bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
        return bits;
    }
    bits |= ((e - 112) << 10) | (m >> 1);
    /* Extra rounding. An overflow will set mantissa to 0 and increment
     * the exponent, which is OK. */
    bits += m & 1;
    return bits;
 }
 /* These macros implement a finite iterator useful to build lookup
 * tables. For instance, S64(0) will call S1(x) for all values of x
 * between 0 and 63.
 * Due to the exponential behaviour of the calls, the stress on the
 * compiler may be important. */
 #define S4(x)    S1((x)),   S1((x)+1),     S1((x)+2),     S1((x)+3)
 #define S16(x)   S4((x)),   S4((x)+4),     S4((x)+8),     S4((x)+12)
 #define S64(x)   S16((x)),  S16((x)+16),   S16((x)+32),   S16((x)+48)
 #define S256(x)  S64((x)),  S64((x)+64),   S64((x)+128),  S64((x)+192)
 #define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
 /* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. No rounding is performed,
 * and some NaN values may be incorrectly converted to Inf. */
 static inline uint16_t float_to_half_nobranch(uint32_t x)
 {
    static uint16_t const basetable[512] =
    {
 #define S1(i) (((i) < 103) ? 0x0000 : \
               ((i) < 113) ? 0x0400 >> (113 - (i)) : \
               ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
        S256(0),
 #undef S1
 #define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
                         ((i) < 113) ? 0x0400 >> (113 - (i)) : \
                         ((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
        S256(0),
 #undef S1
    };
    static uint8_t const shifttable[512] =
    {
 #define S1(i) (((i) < 103) ? 24 : \
               ((i) < 113) ? 126 - (i) : \
               ((i) < 143 || (i) == 255) ? 13 : 24)
        S256(0), S256(0),
 #undef S1
    };
    uint16_t bits = basetable[(x >> 23) & 0x1ff];
    bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
    return bits;
 }
 #endif
--- a/src/nvmath/Half.h
+++ b/src/nvmath/Half.h
@ -11,7 +11,18 @@ namespace nv {
    void half_init_tables();
-    uint32 fast_half_to_float(uint16 h);
+    extern uint32 mantissa_table[2048];
    extern uint32 exponent_table[64];
    extern uint32 offset_table[64];
    // Fast half to float conversion based on:
    // http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
    inline uint32 fast_half_to_float(uint16 h)
    {
 	    uint exp = h >> 10;
 	    return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
    }
    inline uint16 to_half(float c) {
        union { float f; uint32 u; } f;
--- a/src/nvmath/SimdVector_SSE.h
+++ b/src/nvmath/SimdVector_SSE.h
@ -33,8 +33,15 @@
 #include <emmintrin.h>
 #endif
 // See this for ideas:
 // http://molecularmusings.wordpress.com/2011/10/18/simdifying-multi-platform-math/
 namespace nv {
 #define NV_SIMD_NATIVE NV_FORCEINLINE
 #define NV_SIMD_INLINE inline
    class SimdVector
    {
    public:
@ -42,45 +49,47 @@ namespace nv {
        typedef SimdVector const& Arg;
-        NV_FORCEINLINE SimdVector() {}
+        NV_SIMD_NATIVE SimdVector() {}
-        NV_FORCEINLINE explicit SimdVector(float f) : vec(_mm_set1_ps(f)) {}
+        
-        NV_FORCEINLINE explicit SimdVector(__m128 v) : vec(v) {}
+        NV_SIMD_NATIVE explicit SimdVector(__m128 v) : vec(v) {}
-
+        
-        NV_FORCEINLINE explicit SimdVector(NV_ALIGN_16 Vector4 v) {
+        NV_SIMD_NATIVE explicit SimdVector(float f) {
-            vec = _mm_load_ps( v.component );
+            vec = _mm_set1_ps(f);
        }
-        NV_FORCEINLINE explicit SimdVector(const float * v) {
+        NV_SIMD_NATIVE explicit SimdVector(const float * v)
        {
            vec = _mm_load_ps( v );
        }
-        NV_FORCEINLINE SimdVector(float x, float y, float z, float w) {
+        NV_SIMD_NATIVE SimdVector(float x, float y, float z, float w)
        {
            vec = _mm_setr_ps( x, y, z, w );
        }
-        NV_FORCEINLINE SimdVector(const SimdVector & arg) : vec(arg.vec) {}
+        NV_SIMD_NATIVE SimdVector(const SimdVector & arg) : vec(arg.vec) {}
-        NV_FORCEINLINE SimdVector & operator=(const SimdVector & arg) {
+        NV_SIMD_NATIVE SimdVector & operator=(const SimdVector & arg)
        {
            vec = arg.vec;
            return *this;
        }
-
+        NV_SIMD_INLINE float toFloat() const 
        float toFloat() const 
        {
            NV_ALIGN_16 float f;
            _mm_store_ss(&f, vec);
            return f;
        }
-        Vector3 toVector3() const
+        NV_SIMD_INLINE Vector3 toVector3() const
        {
            NV_ALIGN_16 float c[4];
            _mm_store_ps( c, vec );
            return Vector3( c[0], c[1], c[2] );
        }
-        Vector4 toVector4() const
+        NV_SIMD_INLINE Vector4 toVector4() const
        {
            NV_ALIGN_16 float c[4];
            _mm_store_ps( c, vec );
@ -88,57 +97,60 @@ namespace nv {
        }
 #define SSE_SPLAT( a ) ((a) | ((a) << 2) | ((a) << 4) | ((a) << 6))
-        NV_FORCEINLINE SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); }
+        NV_SIMD_NATIVE SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); }
-        NV_FORCEINLINE SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); }
+        NV_SIMD_NATIVE SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); }
-        NV_FORCEINLINE SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); }
+        NV_SIMD_NATIVE SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); }
-        NV_FORCEINLINE SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); }
+        NV_SIMD_NATIVE SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); }
 #undef SSE_SPLAT
-        NV_FORCEINLINE  SimdVector & operator+=( Arg v ) {
+        NV_SIMD_NATIVE SimdVector& operator+=( Arg v )
        {
            vec = _mm_add_ps( vec, v.vec );
            return *this;
        }
-        NV_FORCEINLINE SimdVector & operator-=( Arg v ) {
+        NV_SIMD_NATIVE SimdVector& operator-=( Arg v )
        {
            vec = _mm_sub_ps( vec, v.vec );
            return *this;
        }
-        NV_FORCEINLINE SimdVector & operator*=( Arg v ) {
+        NV_SIMD_NATIVE SimdVector& operator*=( Arg v )
        {
            vec = _mm_mul_ps( vec, v.vec );
            return *this;
        }
    };
-    NV_FORCEINLINE SimdVector operator+(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_NATIVE SimdVector operator+( SimdVector::Arg left, SimdVector::Arg right  )
    {
        return SimdVector( _mm_add_ps( left.vec, right.vec ) );
    }
-    NV_FORCEINLINE SimdVector operator-(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_NATIVE SimdVector operator-( SimdVector::Arg left, SimdVector::Arg right  )
    {
        return SimdVector( _mm_sub_ps( left.vec, right.vec ) );
    }
-    NV_FORCEINLINE SimdVector operator*(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_NATIVE SimdVector operator*( SimdVector::Arg left, SimdVector::Arg right  )
    {
        return SimdVector( _mm_mul_ps( left.vec, right.vec ) );
    }
    // Returns a*b + c
-    NV_FORCEINLINE SimdVector multiplyAdd(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c)
+    NV_SIMD_INLINE SimdVector multiplyAdd( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
    {
        return SimdVector( _mm_add_ps( _mm_mul_ps( a.vec, b.vec ), c.vec ) );
    }
-    // Returns -( a*b - c ) = c - a*b
+    // Returns -( a*b - c )
-    NV_FORCEINLINE SimdVector negativeMultiplySubtract(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c)
+    NV_SIMD_INLINE SimdVector negativeMultiplySubtract( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
    {
        return SimdVector( _mm_sub_ps( c.vec, _mm_mul_ps( a.vec, b.vec ) ) );
    }
-    inline SimdVector reciprocal( SimdVector::Arg v )
+    NV_SIMD_INLINE SimdVector reciprocal( SimdVector::Arg v )
    {
        // get the reciprocal estimate
        __m128 estimate = _mm_rcp_ps( v.vec );
@ -148,17 +160,17 @@ namespace nv {
        return SimdVector( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) );
    }
-    NV_FORCEINLINE SimdVector min(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_NATIVE SimdVector min( SimdVector::Arg left, SimdVector::Arg right )
    {
        return SimdVector( _mm_min_ps( left.vec, right.vec ) );
    }
-    NV_FORCEINLINE SimdVector max(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_NATIVE SimdVector max( SimdVector::Arg left, SimdVector::Arg right )
    {
        return SimdVector( _mm_max_ps( left.vec, right.vec ) );
    }
-    inline SimdVector truncate( SimdVector::Arg v )
+    NV_SIMD_INLINE SimdVector truncate( SimdVector::Arg v )
    {
 #if (NV_USE_SSE == 1)
        // convert to ints
@ -179,12 +191,12 @@ namespace nv {
 #endif
    }
-    NV_FORCEINLINE SimdVector compareEqual(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_NATIVE SimdVector compareEqual( SimdVector::Arg left, SimdVector::Arg right )
    {
        return SimdVector( _mm_cmpeq_ps( left.vec, right.vec ) );
    }
-    inline SimdVector select(SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits)
+    NV_SIMD_INLINE SimdVector select( SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits )
    {
        __m128 a = _mm_andnot_ps( bits.vec, off.vec );
        __m128 b = _mm_and_ps( bits.vec, on.vec );
@ -192,7 +204,7 @@ namespace nv {
        return SimdVector( _mm_or_ps( a, b ) );
    }
-    inline bool compareAnyLessThan(SimdVector::Arg left, SimdVector::Arg right)
+    NV_SIMD_INLINE bool compareAnyLessThan( SimdVector::Arg left, SimdVector::Arg right ) 
    {
        __m128 bits = _mm_cmplt_ps( left.vec, right.vec );
        int value = _mm_movemask_ps( bits );
--- a/src/nvmath/Vector.inl
+++ b/src/nvmath/Vector.inl
@ -314,6 +314,12 @@ namespace nv
        return scale(v, 1.0f/s);
    }
    inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t)
    {
        const float s = 1.0f - t;
        return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y);
    }
    inline float dot(Vector2::Arg a, Vector2::Arg b)
    {
        return a.x * b.x + a.y * b.y;
@ -381,6 +387,16 @@ namespace nv
        return Vector2(max(a.x, b.x), max(a.y, b.y));
    }
    inline Vector2 clamp(Vector2::Arg v, float min, float max)
    {
        return Vector2(clamp(v.x, min, max), clamp(v.y, min, max));
    }
    inline Vector2 saturate(Vector2::Arg v)
    {
        return Vector2(saturate(v.x), saturate(v.y));
    }
    inline bool isFinite(Vector2::Arg v)
    {
        return isFinite(v.x) && isFinite(v.y);
@ -394,6 +410,7 @@ namespace nv
        return vf;
    }
    // Note, this is the area scaled by 2!
    inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
    {
 	    Vector2 v0 = a - c;
@ -500,6 +517,16 @@ namespace nv
        return sqrtf(lengthSquared(v));
    }
    inline float distance(Vector3::Arg a, Vector3::Arg b)
    {
        return length(a - b);
    }
    inline float distanceSquared(Vector3::Arg a, Vector3::Arg b)
    {
        return lengthSquared(a - b);
    }
    inline float inverseLength(Vector3::Arg v)
    {
        return 1.0f / sqrtf(lengthSquared(v));
@ -557,6 +584,11 @@ namespace nv
        return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max));
    }
    inline Vector3 saturate(Vector3::Arg v)
    {
        return Vector3(saturate(v.x), saturate(v.y), saturate(v.z));
    }
    inline Vector3 floor(Vector3::Arg v)
    {
        return Vector3(floorf(v.x), floorf(v.y), floorf(v.z));
@ -580,6 +612,11 @@ namespace nv
        return vf;
    }
    inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n)
    {
 	    return v - (2 * dot(v, n)) * n;
    }
    // Vector4
@ -627,9 +664,15 @@ namespace nv
        return scale(v, 1.0f/s);
    }
-    inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
+    /*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
    {
        return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s);
    }*/
    inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t)
    {
        const float s = 1.0f - t;
        return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w);
    }
    inline float dot(Vector4::Arg a, Vector4::Arg b)
@ -699,6 +742,16 @@ namespace nv
        return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
    }
    inline Vector4 clamp(Vector4::Arg v, float min, float max)
    {
        return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max));
    }
    inline Vector4 saturate(Vector4::Arg v)
    {
        return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w));
    }
    inline bool isFinite(Vector4::Arg v)
    {
        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w);
--- a/src/nvmath/nvmath.h
+++ b/src/nvmath/nvmath.h
@ -155,15 +155,14 @@ namespace nv
    }
 #if NV_CC_MSVC
-    inline float log2f(float x)
+    NV_FORCEINLINE float log2f(float x)
    {
        nvCheck(x >= 0);
        return logf(x) / logf(2.0f);
    }
-
+    NV_FORCEINLINE float exp2f(float x)
    inline float exp2f(float x)
    {
-        return powf(2, x);
+        return powf(2.0f, x);
    }
 #endif