Merge private branch.

2010-05-27 23:19:24 +00:00
parent 51a4fe7e2d
commit c09067e477
13 changed files with 2701 additions and 2622 deletions
--- a/src/nvmath/Box.cpp
+++ b/src/nvmath/Box.cpp
@ -0,0 +1,30 @@
 // This code is in the public domain -- castanyo@yahoo.es
 #include "nvmath/Box.h"
 #include "nvmath/Sphere.h"
 using namespace nv;
 float nv::distanceSquared(const Box &box, const Vector3 &point) {
    Vector3 closest;
    if (point.x < box.minCorner.x) closest.x = box.minCorner.x;
    else if (point.x > box.maxCorner.x) closest.x = box.maxCorner.x;
    else closest.x = point.x;
    if (point.y < box.minCorner.y) closest.y = box.minCorner.y;
    else if (point.y > box.maxCorner.y) closest.y = box.maxCorner.y;
    else closest.y = point.y;
    if (point.z < box.minCorner.z) closest.z = box.minCorner.z;
    else if (point.z > box.maxCorner.z) closest.z = box.maxCorner.z;
    else closest.z = point.z;
    return lengthSquared(point - closest);
 }
 bool nv::overlap(const Box &box, const Sphere &sphere) {
    return distanceSquared(box, sphere.center) < sphere.radius * sphere.radius;
 }
--- a/src/nvmath/Box.h
+++ b/src/nvmath/Box.h
@ -1,46 +1,40 @@
 // This code is in the public domain -- castanyo@yahoo.es
 #pragma once
 #ifndef NV_MATH_BOX_H
 #define NV_MATH_BOX_H
-#include <nvmath/Vector.h>
+#include "Vector.h"
 #include <float.h> // FLT_MAX
 namespace nv
 {
-class Stream;
+    class Stream;
    class Sphere;
-/// Axis Aligned Bounding Box.
+    /// Axis Aligned Bounding Box.
-class Box
+    class Box
-{
+    {
-public:
+    public:
        /// Default ctor.
        Box() { };
        /// Copy ctor.
-	Box( const Box & b ) : m_mins(b.m_mins), m_maxs(b.m_maxs) { }
+        Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { }
        /// Init ctor.
-	Box( Vector3::Arg mins, Vector3::Arg maxs ) : m_mins(mins), m_maxs(maxs) { }
+        Box(Vector3::Arg mins, Vector3::Arg maxs) : minCorner(mins), maxCorner(maxs) { }
        // Cast operators.
        operator const float * () const { return reinterpret_cast<const float *>(this); }
 	// Min corner of the box.
 	Vector3 minCorner() const { return m_mins; }
 	Vector3 & minCorner() { return m_mins; }
 	// Max corner of the box.
 	Vector3 maxCorner() const { return m_maxs; }
 	Vector3 & maxCorner() { return m_maxs; }
        /// Clear the bounds.
        void clearBounds()
        {
-		m_mins.set(FLT_MAX, FLT_MAX, FLT_MAX);
+            minCorner.set(FLT_MAX, FLT_MAX, FLT_MAX);
-		m_maxs.set(-FLT_MAX, -FLT_MAX, -FLT_MAX);
+            maxCorner.set(-FLT_MAX, -FLT_MAX, -FLT_MAX);
        }
        /// Build a cube centered on center and with edge = 2*dist
@ -52,29 +46,29 @@ public:
        /// Build a box, given center and extents.
        void setCenterExtents(Vector3::Arg center, Vector3::Arg extents)
        {
-		m_mins = center - extents;
+            minCorner = center - extents;
-		m_maxs = center + extents;
+            maxCorner = center + extents;
        }
        /// Get box center.
        Vector3 center() const
        {
-		return (m_mins + m_maxs) * 0.5f;
+            return (minCorner + maxCorner) * 0.5f;
        }
        /// Return extents of the box.
        Vector3 extents() const
        {
-		return (m_maxs - m_mins) * 0.5f;
+            return (maxCorner - minCorner) * 0.5f;
        }
        /// Return the extent (half the size) of the box along the given axis (0 = x, 1 = y, 2 = z).
        scalar extents(uint axis) const
        {
            nvDebugCheck(axis < 3);
-		if (axis == 0) return (m_maxs.x() - m_mins.x()) * 0.5f;
+            if (axis == 0) return (maxCorner.x - minCorner.x) * 0.5f;
-		if (axis == 1) return (m_maxs.y() - m_mins.y()) * 0.5f;
+            if (axis == 1) return (maxCorner.y - minCorner.y) * 0.5f;
-		if (axis == 2) return (m_maxs.z() - m_mins.z()) * 0.5f;
+            if (axis == 2) return (maxCorner.z - minCorner.z) * 0.5f;
            nvAssume(false);
            return 0.0f;
        }
@ -82,61 +76,81 @@ public:
        /// Add a point to this box.
        void addPointToBounds(Vector3::Arg p)
        {
-		m_mins = min(m_mins, p);
+            minCorner = min(minCorner, p);
-		m_maxs = max(m_maxs, p);
+            maxCorner = max(maxCorner, p);
        }
        /// Add a box to this box.
        void addBoxToBounds(const Box & b)
        {
-		m_mins = min(m_mins, b.m_mins);
+            minCorner = min(minCorner, b.minCorner);
-		m_maxs = max(m_maxs, b.m_maxs);
+            maxCorner = max(maxCorner, b.maxCorner);
        }
        /// Translate box.
        void translate(Vector3::Arg v)
        {
-		m_mins += v;
+            minCorner += v;
-		m_maxs += v;
+            maxCorner += v;
        }
        /// Scale the box.
        void scale(float s)
        {
-		m_mins *= s;
+            minCorner *= s;
-		m_maxs *= s;
+            maxCorner *= s;
        }
        // Expand the box by a fixed amount.
        void expand(float r) {
            minCorner -= Vector3(r,r,r);
            maxCorner += Vector3(r,r,r);
        }
        /// Get the area of the box.
        float area() const
        {
            const Vector3 d = extents();
-		return 8.0f * (d.x()*d.y() + d.x()*d.z() + d.y()*d.z());
+            return 8.0f * (d.x*d.y + d.x*d.z + d.y*d.z);
        }	
        /// Get the volume of the box.
        float volume() const
        {
            Vector3 d = extents();
-		return 8.0f * (d.x() * d.y() * d.z());
+            return 8.0f * (d.x * d.y * d.z);
        }
        /// Return true if the box contains the given point.
        bool contains(Vector3::Arg p) const
        {
            return 
-			m_mins.x() < p.x() && m_mins.y() < p.y() && m_mins.z() < p.z() &&
+                minCorner.x < p.x && minCorner.y < p.y && minCorner.z < p.z &&
-			m_maxs.x() > p.x() && m_maxs.y() > p.y() && m_maxs.z() > p.z();
+                maxCorner.x > p.x && maxCorner.y > p.y && maxCorner.z > p.z;
        }
        /// Split the given box in 8 octants and assign the ith one to this box.
        void setOctant(const Box & box, Vector3::Arg center, int i)
        {
            minCorner = box.minCorner;
            maxCorner = box.maxCorner;
            if (i & 4) minCorner.x = center.x;
            else       maxCorner.x = center.x;
            if (i & 2) minCorner.y = center.y;
            else       maxCorner.y = center.y;
            if (i & 1) minCorner.z = center.z;
            else       maxCorner.z = center.z;
        }
        friend Stream & operator<< (Stream & s, Box & box);
-private:
+        Vector3 minCorner;
-
+        Vector3 maxCorner;
-	Vector3 m_mins;
+    };
 	Vector3 m_maxs;
 };
    float distanceSquared(const Box &box, const Vector3 &point);
    bool overlap(const Box &box, const Sphere &sphere);
 } // nv namespace
--- a/src/nvmath/CMakeLists.txt
+++ b/src/nvmath/CMakeLists.txt
@ -5,7 +5,7 @@ SET(MATH_SRCS
    Vector.h
    Matrix.h
    Plane.h Plane.cpp
-    Box.h
+    Box.h Box.cpp
    Color.h
    Half.h Half.cpp
    Fitting.h Fitting.cpp)
--- a/src/nvmath/Color.h
+++ b/src/nvmath/Color.h
@ -1,18 +1,19 @@
 // This code is in the public domain -- castanyo@yahoo.es
 #pragma once
 #ifndef NV_MATH_COLOR_H
 #define NV_MATH_COLOR_H
-#include <nvcore/Debug.h>
+#include "nvcore/Debug.h"
-#include <nvmath/Vector.h>
+#include "nvmath/Vector.h"
 namespace nv
 {
-/// 64 bit color stored as BGRA.
+    /// 64 bit color stored as BGRA.
-class NVMATH_CLASS Color64 
+    class NVMATH_CLASS Color64 
-{
+    {
-public:
+    public:
        Color64() { }
        Color64(const Color64 & c) : u(c.u) { }
        Color64(uint16 R, uint16 G, uint16 B, uint16 A) { setRGBA(R, G, B, A); }
@ -43,12 +44,12 @@ public:
            };
            uint64 u;
        };
-};
+    };
-/// 32 bit color stored as BGRA.
+    /// 32 bit color stored as BGRA.
-class NVMATH_CLASS Color32
+    class NVMATH_CLASS Color32
-{
+    {
-public:
+    public:
        Color32() { }
        Color32(const Color32 & c) : u(c.u) { }
        Color32(uint8 R, uint8 G, uint8 B) { setRGBA(R, G, B, 0xFF); }
@ -91,13 +92,13 @@ public:
            };
            uint32 u;
        };
-};
+    };
-/// 16 bit 565 BGR color.
+    /// 16 bit 565 BGR color.
-class NVMATH_CLASS Color16
+    class NVMATH_CLASS Color16
-{
+    {
-public:
+    public:
        Color16() { }
        Color16(const Color16 & c) : u(c.u) { }
        explicit Color16(uint16 U) : u(U) { }
@ -116,49 +117,49 @@ public:
            };
            uint16 u;
        };
-};
+    };
-/// Clamp color components.
+    /// Clamp color components.
-inline Vector3 colorClamp(Vector3::Arg c)
+    inline Vector3 colorClamp(Vector3::Arg c)
-{
+    {
-	return Vector3(clamp(c.x(), 0.0f, 1.0f), clamp(c.y(), 0.0f, 1.0f), clamp(c.z(), 0.0f, 1.0f));
+        return Vector3(clamp(c.x, 0.0f, 1.0f), clamp(c.y, 0.0f, 1.0f), clamp(c.z, 0.0f, 1.0f));
-}
+    }
-/// Clamp without allowing the hue to change.
+    /// Clamp without allowing the hue to change.
-inline Vector3 colorNormalize(Vector3::Arg c)
+    inline Vector3 colorNormalize(Vector3::Arg c)
-{
+    {
        float scale = 1.0f;
-	if (c.x() > scale) scale = c.x();
+        if (c.x > scale) scale = c.x;
-	if (c.y() > scale) scale = c.y();
+        if (c.y > scale) scale = c.y;
-	if (c.z() > scale) scale = c.z();
+        if (c.z > scale) scale = c.z;
        return c / scale;
-}
+    }
-/// Convert Color32 to Color16.
+    /// Convert Color32 to Color16.
-inline Color16 toColor16(Color32 c)
+    inline Color16 toColor16(Color32 c)
-{
+    {
        Color16 color;
        //         rrrrrggggggbbbbb
        // rrrrr000gggggg00bbbbb000
-//	color.u = (c.u >> 3) & 0x1F;
+        //	color.u = (c.u >> 3) & 0x1F;
-//	color.u |= (c.u >> 5) & 0x7E0;
+        //	color.u |= (c.u >> 5) & 0x7E0;
-//	color.u |= (c.u >> 8) & 0xF800;
+        //	color.u |= (c.u >> 8) & 0xF800;
        color.r = c.r >> 3;
        color.g = c.g >> 2;
        color.b = c.b >> 3;
        return color; 
-}
+    }
-/// Promote 16 bit color to 32 bit using regular bit expansion.
+    /// Promote 16 bit color to 32 bit using regular bit expansion.
-inline Color32 toColor32(Color16 c)
+    inline Color32 toColor32(Color16 c)
-{
+    {
        Color32 color;
-//	c.u = ((col0.u << 3) & 0xf8) | ((col0.u << 5) & 0xfc00) | ((col0.u << 8) & 0xf80000);
+        //	c.u = ((col0.u << 3) & 0xf8) | ((col0.u << 5) & 0xfc00) | ((col0.u << 8) & 0xf80000);
-//	c.u |= (c.u >> 5) & 0x070007;
+        //	c.u |= (c.u >> 5) & 0x070007;
-//	c.u |= (c.u >> 6) & 0x000300;
+        //	c.u |= (c.u >> 6) & 0x000300;
        color.b = (c.b << 3) | (c.b >> 2);
        color.g = (c.g << 2) | (c.g >> 4);
@ -166,13 +167,13 @@ inline Color32 toColor32(Color16 c)
        color.a = 0xFF;
        return color;
-}
+    }
-inline Vector4 toVector4(Color32 c)
+    inline Vector4 toVector4(Color32 c)
-{
+    {
        const float scale = 1.0f / 255.0f;
        return Vector4(c.r * scale, c.g * scale, c.b * scale, c.a * scale);
-}
+    }
 } // nv namespace
--- a/src/nvmath/Fitting.cpp
+++ b/src/nvmath/Fitting.cpp
@ -1,9 +1,7 @@
 // This code is in the public domain -- icastano@gmail.com
 #include "Fitting.h"
-
+#include "nvcore/Utils.h" // max, swap
 #include <nvcore/Algorithms.h> // max
 #include <nvcore/Containers.h> // swap
 #include <float.h> // FLT_MAX
@ -12,7 +10,7 @@ using namespace nv;
 // @@ Move to EigenSolver.h
 static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix)
 {
-	if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0)
+    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
    {
        return Vector3(zero);
    }
@ -22,9 +20,9 @@ static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matri
    Vector3 v(1, 1, 1);
    for (int i = 0; i < NUM; i++)
    {
-		float x = v.x() * matrix[0] + v.y() * matrix[1] + v.z() * matrix[2];
+        float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
-		float y = v.x() * matrix[1] + v.y() * matrix[3] + v.z() * matrix[4];
+        float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
-		float z = v.x() * matrix[2] + v.y() * matrix[4] + v.z() * matrix[5];
+        float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
        float norm = max(max(x, y), z);
@ -79,12 +77,12 @@ Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, floa
    {
        Vector3 v = points[i] - centroid;
-		covariance[0] += v.x() * v.x();
+        covariance[0] += v.x * v.x;
-		covariance[1] += v.x() * v.y();
+        covariance[1] += v.x * v.y;
-		covariance[2] += v.x() * v.z();
+        covariance[2] += v.x * v.z;
-		covariance[3] += v.y() * v.y();
+        covariance[3] += v.y * v.y;
-		covariance[4] += v.y() * v.z();
+        covariance[4] += v.y * v.z;
-		covariance[5] += v.z() * v.z();
+        covariance[5] += v.z * v.z;
    }
    return centroid;
@ -106,12 +104,12 @@ Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, cons
        Vector3 a = (points[i] - centroid) * metric;
        Vector3 b = weights[i]*a;
-		covariance[0] += a.x()*b.x();
+        covariance[0] += a.x * b.x;
-		covariance[1] += a.x()*b.y();
+        covariance[1] += a.x * b.y;
-		covariance[2] += a.x()*b.z();
+        covariance[2] += a.x * b.z;
-		covariance[3] += a.y()*b.y();
+        covariance[3] += a.y * b.y;
-		covariance[4] += a.y()*b.z();
+        covariance[4] += a.y * b.z;
-		covariance[5] += a.z()*b.z();
+        covariance[5] += a.z * b.z;
    }
    return centroid;
@ -204,7 +202,7 @@ int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float
            float mindist = FLT_MAX;
            for (int j = 0; j < 4; j++)
            {
-				float dist = length_squared((cluster[j] - points[i]) * metric);
+                float dist = lengthSquared((cluster[j] - points[i]) * metric);
                if (dist < mindist)
                {
                    mindist = dist;
--- a/src/nvmath/Fitting.h
+++ b/src/nvmath/Fitting.h
@ -1,5 +1,6 @@
 // This code is in the public domain -- icastano@gmail.com
 #pragma once
 #ifndef NV_MATH_FITTING_H
 #define NV_MATH_FITTING_H
--- a/src/nvmath/Half.cpp
+++ b/src/nvmath/Half.cpp
@ -88,6 +88,12 @@ static inline uint32 _uint32_dec( uint32 a )
    return (a - 1);
 }
 // Increment: returns a plus one.
 // NOTE(review): presumably wraps to zero at the maximum value, assuming uint32 is an unsigned 32 bit type -- confirm against its typedef.
 static inline uint32 _uint32_inc( uint32 a )
 {
    const uint32 incremented = a + 1;
    return incremented;
 }
 // Complement
 static inline uint32 _uint32_not( uint32 a )
 {
@ -97,12 +103,9 @@ static inline uint32 _uint32_not( uint32 a )
 // Negate
 static inline uint32 _uint32_neg( uint32 a )
 {
-#if NV_CC_MSVC
+#pragma warning(disable : 4146)     // unary minus operator applied to unsigned type, result still unsigned
  // prevent msvc warning.
  return ~a + 1;
 #else
    return (-a);
-#endif
+#pragma warning(default : 4146)
 }
 // Extend sign
@ -272,14 +275,33 @@ static inline uint16 _uint16_sels( uint16 test, uint16 a, uint16 b )
    return (result);
 }
 #if NV_CC_MSVC
 #include <intrin.h>
 #pragma intrinsic(_BitScanReverse)
 // Number of leading zero bits in x, computed with the MSVC _BitScanReverse intrinsic.
 // _BitScanReverse reports failure and does not produce a usable index when x is
 // zero, so that case is answered directly with 32 instead of reading an
 // indeterminate value. Declared static inline, like the other helpers in this
 // file, so the symbol is not exported from this translation unit.
 static inline uint32 _uint32_nlz( uint32 x ) {
    unsigned long index;
    if (!_BitScanReverse(&index, x)) {
        return 32;
    }
    return 31 - index;
 }
 #endif
 // Count Leading Zeros
 static inline uint32 _uint32_cntlz( uint32 x )
 {
-#ifdef __GNUC__
+#if NV_CC_GCC
    /* On PowerPC, this will map to insn: cntlzw */
    /* On Pentium, this will map to insn: clz    */
    uint32 is_x_nez_msb = _uint32_neg( x );
    uint32 nlz          = __builtin_clz( x );
-  return (nlz);
+    uint32 result       = _uint32_sels( is_x_nez_msb, nlz, 0x00000020 );
    return (result);
 #elif NV_CC_MSVC
    uint32 is_x_nez_msb = _uint32_neg( x );
    uint32 nlz          = _uint32_nlz( x );
    uint32 result       = _uint32_sels( is_x_nez_msb, nlz, 0x00000020 );
    return (result);
 #else
    const uint32 x0  = _uint32_srl(  x,  1 );
    const uint32 x1  = _uint32_or(   x,  x0 );
@ -317,8 +339,8 @@ static inline uint16 _uint16_cntlz( uint16 x )
 #ifdef __GNUC__
    /* On PowerPC, this will map to insn: cntlzw */
    /* On Pentium, this will map to insn: clz    */
-  uint32 x32   = _uint32_sll( x, 16 );
+    uint16 nlz32 = (uint16)_uint32_cntlz( (uint32)x );
-  uint16 nlz   = (uint16)__builtin_clz( x32 );
+    uint32 nlz   = _uint32_sub( nlz32, 16 );
    return (nlz);
 #else
    const uint16 x0  = _uint16_srl(  x,  1 );
@ -344,63 +366,72 @@ static inline uint16 _uint16_cntlz( uint16 x )
 }
 uint16
-half_from_float( uint32 f )
+nv::half_from_float( uint32 f )
 {
    const uint32 one                        = _uint32_li( 0x00000001 );
    const uint32 f_s_mask                   = _uint32_li( 0x80000000 );
    const uint32 f_e_mask                   = _uint32_li( 0x7f800000 );
    const uint32 f_m_mask                   = _uint32_li( 0x007fffff );
  const uint32 f_s_mask                   = _uint32_li( 0x80000000 );
  const uint32 h_e_mask                   = _uint32_li( 0x00007c00 );
  const uint32 f_e_pos                    = _uint32_li( 0x00000017 );
  const uint32 f_m_round_bit              = _uint32_li( 0x00001000 );
  const uint32 h_nan_em_min               = _uint32_li( 0x00007c01 );
  const uint32 f_h_s_pos_offset           = _uint32_li( 0x00000010 );
    const uint32 f_m_hidden_bit             = _uint32_li( 0x00800000 );
    const uint32 f_m_round_bit              = _uint32_li( 0x00001000 );
    const uint32 f_snan_mask                = _uint32_li( 0x7fc00000 );
    const uint32 f_e_pos                    = _uint32_li( 0x00000017 );
    const uint32 h_e_pos                    = _uint32_li( 0x0000000a );
    const uint32 h_e_mask                   = _uint32_li( 0x00007c00 );
    const uint32 h_snan_mask                = _uint32_li( 0x00007e00 );
    const uint32 h_e_mask_value             = _uint32_li( 0x0000001f );
    const uint32 f_h_s_pos_offset           = _uint32_li( 0x00000010 );
    const uint32 f_h_bias_offset            = _uint32_li( 0x00000070 );
    const uint32 f_h_m_pos_offset           = _uint32_li( 0x0000000d );
-  const uint32 f_h_bias_offset            = _uint32_li( 0x38000000 );
+    const uint32 h_nan_min                  = _uint32_li( 0x00007c01 );
-  const uint32 f_m_snan_mask              = _uint32_li( 0x003fffff );
+    const uint32 f_h_e_biased_flag          = _uint32_li( 0x0000008f );
  const uint16 h_snan_mask                = _uint32_li( 0x00007e00 );
  const uint32 f_e                        = _uint32_and( f, f_e_mask  );
  const uint32 f_m                        = _uint32_and( f, f_m_mask  );
    const uint32 f_s                        = _uint32_and( f,               f_s_mask         );
-  const uint32 f_e_h_bias                 = _uint32_sub( f_e,               f_h_bias_offset );
+    const uint32 f_e                        = _uint32_and( f,               f_e_mask         );
-  const uint32 f_e_h_bias_amount          = _uint32_srl( f_e_h_bias,        f_e_pos         );
+    const uint16 h_s                        = _uint32_srl( f_s,             f_h_s_pos_offset );
    const uint32 f_m                        = _uint32_and( f,               f_m_mask         );
    const uint16 f_e_amount                 = _uint32_srl( f_e,             f_e_pos          );
    const uint32 f_e_half_bias              = _uint32_sub( f_e_amount,      f_h_bias_offset  );
    const uint32 f_snan                     = _uint32_and( f,               f_snan_mask      );
    const uint32 f_m_round_mask             = _uint32_and( f_m,             f_m_round_bit    );
    const uint32 f_m_round_offset           = _uint32_sll( f_m_round_mask,  one              );
    const uint32 f_m_rounded                = _uint32_add( f_m,             f_m_round_offset );
-  const uint32 f_m_rounded_overflow       = _uint32_and( f_m_rounded,       f_m_hidden_bit    );
+    const uint32 f_m_denorm_sa              = _uint32_sub( one,             f_e_half_bias    );
  const uint32 f_m_denorm_sa              = _uint32_sub( one,               f_e_h_bias_amount );
    const uint32 f_m_with_hidden            = _uint32_or(  f_m_rounded,     f_m_hidden_bit   );
    const uint32 f_m_denorm                 = _uint32_srl( f_m_with_hidden, f_m_denorm_sa    );
  const uint32 f_em_norm_packed           = _uint32_or(  f_e_h_bias,        f_m_rounded       );
  const uint32 f_e_overflow               = _uint32_add( f_e_h_bias,        f_m_hidden_bit    );
  const uint32 h_s                        = _uint32_srl( f_s,               f_h_s_pos_offset );
  const uint32 h_m_nan                    = _uint32_srl( f_m,               f_h_m_pos_offset );
    const uint32 h_m_denorm                 = _uint32_srl( f_m_denorm,      f_h_m_pos_offset );
-  const uint32 h_em_norm                  = _uint32_srl( f_em_norm_packed,  f_h_m_pos_offset );
+    const uint32 f_m_rounded_overflow       = _uint32_and( f_m_rounded,     f_m_hidden_bit   );
-  const uint32 h_em_overflow              = _uint32_srl( f_e_overflow,      f_h_m_pos_offset );
+    const uint32 m_nan                      = _uint32_srl( f_m,             f_h_m_pos_offset );
-  const uint32 is_e_eqz_msb               = _uint32_dec(  f_e     );
+    const uint32 h_em_nan                   = _uint32_or(  h_e_mask,        m_nan            );
-  const uint32 is_m_nez_msb               = _uint32_neg(  f_m     );
+    const uint32 h_e_norm_overflow_offset   = _uint32_inc( f_e_half_bias );
-  const uint32 is_h_m_nan_nez_msb         = _uint32_neg(  h_m_nan );
+    const uint32 h_e_norm_overflow          = _uint32_sll( h_e_norm_overflow_offset, h_e_pos          );
-  const uint32 is_e_nflagged_msb          = _uint32_sub(  f_e,                 f_e_mask          );
+    const uint32 h_e_norm                   = _uint32_sll( f_e_half_bias,            h_e_pos          );
-  const uint32 is_ninf_msb                = _uint32_or(   is_e_nflagged_msb,   is_m_nez_msb      );
+    const uint32 h_m_norm                   = _uint32_srl( f_m_rounded,              f_h_m_pos_offset );
-  const uint32 is_underflow_msb           = _uint32_sub(  is_e_eqz_msb,        f_h_bias_offset   );
+    const uint32 h_em_norm                  = _uint32_or(  h_e_norm,                 h_m_norm         );
-  const uint32 is_nan_nunderflow_msb      = _uint32_or(   is_h_m_nan_nez_msb,  is_e_nflagged_msb );
+    const uint32 is_h_ndenorm_msb           = _uint32_sub( f_h_bias_offset,   f_e_amount    );
-  const uint32 is_m_snan_msb              = _uint32_sub(  f_m_snan_mask,       f_m               );
+    const uint32 is_f_e_flagged_msb         = _uint32_sub( f_h_e_biased_flag, f_e_half_bias );
-  const uint32 is_snan_msb                = _uint32_andc( is_m_snan_msb,       is_e_nflagged_msb );
+    const uint32 is_h_denorm_msb            = _uint32_not( is_h_ndenorm_msb );
-  const uint32 is_overflow_msb            = _uint32_neg(  f_m_rounded_overflow );
+    const uint32 is_f_m_eqz_msb             = _uint32_dec( f_m   );
-  const uint32 h_nan_underflow_result     = _uint32_sels( is_nan_nunderflow_msb, h_em_norm,                h_nan_em_min       );
+    const uint32 is_h_nan_eqz_msb           = _uint32_dec( m_nan );
-  const uint32 h_inf_result               = _uint32_sels( is_ninf_msb,           h_nan_underflow_result,   h_e_mask           );
+    const uint32 is_f_inf_msb               = _uint32_and( is_f_e_flagged_msb, is_f_m_eqz_msb   );
-  const uint32 h_underflow_result         = _uint32_sels( is_underflow_msb,      h_m_denorm,               h_inf_result       );
+    const uint32 is_f_nan_underflow_msb     = _uint32_and( is_f_e_flagged_msb, is_h_nan_eqz_msb );
-  const uint32 h_overflow_result          = _uint32_sels( is_overflow_msb,       h_em_overflow,            h_underflow_result );
+    const uint32 is_e_overflow_msb          = _uint32_sub( h_e_mask_value,     f_e_half_bias    );
-  const uint32 h_em_result                = _uint32_sels( is_snan_msb,           h_snan_mask,              h_overflow_result  );
+    const uint32 is_h_inf_msb               = _uint32_or(  is_e_overflow_msb,  is_f_inf_msb     );
-  const uint32 h_result                   = _uint32_or( h_em_result, h_s );
+    const uint32 is_f_nsnan_msb             = _uint32_sub( f_snan,             f_snan_mask      );
    const uint32 is_m_norm_overflow_msb     = _uint32_neg( f_m_rounded_overflow );
    const uint32 is_f_snan_msb              = _uint32_not( is_f_nsnan_msb );
    const uint32 h_em_overflow_result       = _uint32_sels( is_m_norm_overflow_msb, h_e_norm_overflow, h_em_norm                 );
    const uint32 h_em_nan_result            = _uint32_sels( is_f_e_flagged_msb,     h_em_nan,          h_em_overflow_result      );
    const uint32 h_em_nan_underflow_result  = _uint32_sels( is_f_nan_underflow_msb, h_nan_min,         h_em_nan_result           );
    const uint32 h_em_inf_result            = _uint32_sels( is_h_inf_msb,           h_e_mask,          h_em_nan_underflow_result );
    const uint32 h_em_denorm_result         = _uint32_sels( is_h_denorm_msb,        h_m_denorm,        h_em_inf_result           );
    const uint32 h_em_snan_result           = _uint32_sels( is_f_snan_msb,          h_snan_mask,       h_em_denorm_result        );
    const uint32 h_result                   = _uint32_or( h_s, h_em_snan_result );
-  return (h_result);
+    return (uint16)(h_result);
 }
 uint32 
-half_to_float( uint16 h )
+nv::half_to_float( uint16 h )
 {
    const uint32 h_e_mask              = _uint32_li( 0x00007c00 );
    const uint32 h_m_mask              = _uint32_li( 0x000003ff );
@ -447,117 +478,46 @@ half_to_float( uint16 h )
    return (f_result);
 }
-uint16
+uint32 
-half_add( uint16 x, uint16 y )
+nv::fast_half_to_float( uint16 h )
 {
-  const uint16 one                       = _uint16_li( 0x0001 );
+    const uint32 h_e_mask              = _uint32_li( 0x00007c00 );
-  const uint16 msb_to_lsb_sa             = _uint16_li( 0x000f );
+    const uint32 h_m_mask              = _uint32_li( 0x000003ff );
-  const uint16 h_s_mask                  = _uint16_li( 0x8000 );
+    const uint32 h_s_mask              = _uint32_li( 0x00008000 );
-  const uint16 h_e_mask                  = _uint16_li( 0x7c00 );
+    const uint32 h_f_s_pos_offset      = _uint32_li( 0x00000010 );
-  const uint16 h_m_mask                  = _uint16_li( 0x03ff );
+    const uint32 h_f_e_pos_offset      = _uint32_li( 0x0000000d );
-  const uint16 h_m_msb_mask              = _uint16_li( 0x2000 );
+    const uint32 h_f_bias_offset       = _uint32_li( 0x0001c000 );
-  const uint16 h_m_msb_sa                = _uint16_li( 0x000d );
+    const uint32 f_e_mask              = _uint32_li( 0x7f800000 );
-  const uint16 h_m_hidden                = _uint16_li( 0x0400 );
+    const uint32 f_m_mask              = _uint32_li( 0x007fffff );
-  const uint16 h_e_pos                   = _uint16_li( 0x000a );
+    const uint32 h_f_e_denorm_bias     = _uint32_li( 0x0000007e );
-  const uint16 h_e_bias_minus_one        = _uint16_li( 0x000e );
+    const uint32 h_f_m_denorm_sa_bias  = _uint32_li( 0x00000008 );
-  const uint16 h_m_grs_carry             = _uint16_li( 0x4000 );
+    const uint32 f_e_pos               = _uint32_li( 0x00000017 );
-  const uint16 h_m_grs_carry_pos         = _uint16_li( 0x000e );
+    const uint32 h_e_mask_minus_one    = _uint32_li( 0x00007bff );
-  const uint16 h_grs_size                = _uint16_li( 0x0003 );
+    const uint32 h_e                   = _uint32_and( h, h_e_mask );
-  const uint16 h_snan                    = _uint16_li( 0xfe00 );
+    const uint32 h_m                   = _uint32_and( h, h_m_mask );
-  const uint16 h_e_mask_minus_one        = _uint16_li( 0x7bff );
+    const uint32 h_s                   = _uint32_and( h, h_s_mask );
-  const uint16 h_grs_round_carry         = _uint16_sll( one, h_grs_size );
+    const uint32 h_e_f_bias            = _uint32_add( h_e, h_f_bias_offset );
-  const uint16 h_grs_round_mask          = _uint16_sub( h_grs_round_carry, one );
+    const uint32 h_m_nlz               = _uint32_cntlz( h_m );
-  const uint16 x_e                       = _uint16_and( x, h_e_mask );
+    const uint32 f_s                   = _uint32_sll( h_s,        h_f_s_pos_offset );
-  const uint16 y_e                       = _uint16_and( y, h_e_mask );
+    const uint32 f_e                   = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
-  const uint16 is_y_e_larger_msb         = _uint16_sub( x_e, y_e );
+    const uint32 f_m                   = _uint32_sll( h_m,        h_f_e_pos_offset );
-  const uint16 a                         = _uint16_sels( is_y_e_larger_msb, y, x);
+    const uint32 f_em                  = _uint32_or(  f_e,        f_m              );
-  const uint16 a_s                       = _uint16_and( a, h_s_mask );
+    const uint32 h_f_m_sa              = _uint32_sub( h_m_nlz,             h_f_m_denorm_sa_bias );
-  const uint16 a_e                       = _uint16_and( a, h_e_mask );
+    const uint32 f_e_denorm_unpacked   = _uint32_sub( h_f_e_denorm_bias,   h_f_m_sa             );
-  const uint16 a_m_no_hidden_bit         = _uint16_and( a, h_m_mask );
+    const uint32 h_f_m                 = _uint32_sll( h_m,                 h_f_m_sa             );
-  const uint16 a_em_no_hidden_bit        = _uint16_or( a_e, a_m_no_hidden_bit );
+    const uint32 f_m_denorm            = _uint32_and( h_f_m,               f_m_mask             );
-  const uint16 b                         = _uint16_sels( is_y_e_larger_msb, x, y);
+    const uint32 f_e_denorm            = _uint32_sll( f_e_denorm_unpacked, f_e_pos              );
-  const uint16 b_s                       = _uint16_and( b, h_s_mask );
+    const uint32 f_em_denorm           = _uint32_or(  f_e_denorm,          f_m_denorm           );
-  const uint16 b_e                       = _uint16_and( b, h_e_mask );
+    const uint32 f_em_nan              = _uint32_or(  f_e_mask,            f_m                  );
-  const uint16 b_m_no_hidden_bit         = _uint16_and( b, h_m_mask );
+    const uint32 is_e_eqz_msb          = _uint32_dec(  h_e );
-  const uint16 b_em_no_hidden_bit        = _uint16_or( b_e, b_m_no_hidden_bit );
+    const uint32 is_m_nez_msb          = _uint32_neg(  h_m );
-  const uint16 is_diff_sign_msb          = _uint16_xor( a_s, b_s );
+    const uint32 is_e_flagged_msb      = _uint32_sub(  h_e_mask_minus_one, h_e );
-  const uint16 is_a_inf_msb              = _uint16_sub( h_e_mask_minus_one, a_em_no_hidden_bit );
+    const uint32 is_zero_msb           = _uint32_andc( is_e_eqz_msb,       is_m_nez_msb );
-  const uint16 is_b_inf_msb              = _uint16_sub( h_e_mask_minus_one, b_em_no_hidden_bit );
+    const uint32 is_denorm_msb         = _uint32_and(  is_m_nez_msb,       is_e_eqz_msb );
-  const uint16 is_undenorm_msb           = _uint16_dec( a_e );
+    const uint32 is_zero               = _uint32_ext(  is_zero_msb );
-  const uint16 is_undenorm               = _uint16_ext( is_undenorm_msb );
+    const uint32 f_zero_result         = _uint32_andc( f_em, is_zero );
-  const uint16 is_both_inf_msb           = _uint16_and( is_a_inf_msb, is_b_inf_msb );
+    const uint32 f_denorm_result       = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result );
-  const uint16 is_invalid_inf_op_msb     = _uint16_and( is_both_inf_msb, b_s );
+    const uint32 f_result              = _uint32_or( f_s, f_denorm_result );
  const uint16 is_a_e_nez_msb            = _uint16_neg( a_e );
  const uint16 is_b_e_nez_msb            = _uint16_neg( b_e );
  const uint16 is_a_e_nez                = _uint16_ext( is_a_e_nez_msb );
  const uint16 is_b_e_nez                = _uint16_ext( is_b_e_nez_msb );
  const uint16 a_m_hidden_bit            = _uint16_and( is_a_e_nez, h_m_hidden );
  const uint16 b_m_hidden_bit            = _uint16_and( is_b_e_nez, h_m_hidden );
  const uint16 a_m_no_grs                = _uint16_or( a_m_no_hidden_bit, a_m_hidden_bit );
  const uint16 b_m_no_grs                = _uint16_or( b_m_no_hidden_bit, b_m_hidden_bit );
  const uint16 diff_e                    = _uint16_sub( a_e,        b_e );
  const uint16 a_e_unbias                = _uint16_sub( a_e,        h_e_bias_minus_one );
  const uint16 a_m                       = _uint16_sll( a_m_no_grs, h_grs_size );
  const uint16 a_e_biased                = _uint16_srl( a_e,        h_e_pos );
  const uint16 m_sa_unbias               = _uint16_srl( a_e_unbias, h_e_pos );
  const uint16 m_sa_default              = _uint16_srl( diff_e,     h_e_pos );
  const uint16 m_sa_unbias_mask          = _uint16_andc( is_a_e_nez_msb,   is_b_e_nez_msb );
  const uint16 m_sa                      = _uint16_sels( m_sa_unbias_mask, m_sa_unbias, m_sa_default );
  const uint16 b_m_no_sticky             = _uint16_sll( b_m_no_grs,        h_grs_size );
  const uint16 sh_m                      = _uint16_srl( b_m_no_sticky,     m_sa );
  const uint16 sticky_overflow           = _uint16_sll( one,               m_sa );
  const uint16 sticky_mask               = _uint16_dec( sticky_overflow );
  const uint16 sticky_collect            = _uint16_and( b_m_no_sticky, sticky_mask );
  const uint16 is_sticky_set_msb         = _uint16_neg( sticky_collect );
  const uint16 sticky                    = _uint16_srl( is_sticky_set_msb, msb_to_lsb_sa);
  const uint16 b_m                       = _uint16_or( sh_m, sticky );
  const uint16 is_c_m_ab_pos_msb         = _uint16_sub( b_m, a_m );
  const uint16 c_inf                     = _uint16_or( a_s, h_e_mask );
  const uint16 c_m_sum                   = _uint16_add( a_m, b_m );
  const uint16 c_m_diff_ab               = _uint16_sub( a_m, b_m );
  const uint16 c_m_diff_ba               = _uint16_sub( b_m, a_m );
  const uint16 c_m_smag_diff             = _uint16_sels( is_c_m_ab_pos_msb, c_m_diff_ab, c_m_diff_ba );
  const uint16 c_s_diff                  = _uint16_sels( is_c_m_ab_pos_msb, a_s,         b_s         );
  const uint16 c_s                       = _uint16_sels( is_diff_sign_msb,  c_s_diff,    a_s         );
  const uint16 c_m_smag_diff_nlz         = _uint16_cntlz( c_m_smag_diff );
  const uint16 diff_norm_sa              = _uint16_sub( c_m_smag_diff_nlz, one );
  const uint16 is_diff_denorm_msb        = _uint16_sub( a_e_biased, diff_norm_sa );
  const uint16 is_diff_denorm            = _uint16_ext( is_diff_denorm_msb );
  const uint16 is_a_or_b_norm_msb        = _uint16_neg( a_e_biased );
  const uint16 diff_denorm_sa            = _uint16_dec( a_e_biased );
  const uint16 c_m_diff_denorm           = _uint16_sll( c_m_smag_diff, diff_denorm_sa );
  const uint16 c_m_diff_norm             = _uint16_sll( c_m_smag_diff, diff_norm_sa );
  const uint16 c_e_diff_norm             = _uint16_sub( a_e_biased,  diff_norm_sa );
  const uint16 c_m_diff_ab_norm          = _uint16_sels( is_diff_denorm_msb, c_m_diff_denorm, c_m_diff_norm );
  const uint16 c_e_diff_ab_norm          = _uint16_andc( c_e_diff_norm, is_diff_denorm );
  const uint16 c_m_diff                  = _uint16_sels( is_a_or_b_norm_msb, c_m_diff_ab_norm, c_m_smag_diff );
  const uint16 c_e_diff                  = _uint16_sels( is_a_or_b_norm_msb, c_e_diff_ab_norm, a_e_biased    );
  const uint16 is_diff_eqz_msb           = _uint16_dec( c_m_diff );
  const uint16 is_diff_exactly_zero_msb  = _uint16_and( is_diff_sign_msb, is_diff_eqz_msb );
  const uint16 is_diff_exactly_zero      = _uint16_ext( is_diff_exactly_zero_msb );
  const uint16 c_m_added                 = _uint16_sels( is_diff_sign_msb, c_m_diff, c_m_sum );
  const uint16 c_e_added                 = _uint16_sels( is_diff_sign_msb, c_e_diff, a_e_biased );
  const uint16 c_m_carry                 = _uint16_and( c_m_added, h_m_grs_carry );
  const uint16 is_c_m_carry_msb          = _uint16_neg( c_m_carry );
  const uint16 c_e_hidden_offset         = _uint16_andsrl( c_m_added, h_m_grs_carry, h_m_grs_carry_pos );
  const uint16 c_m_sub_hidden            = _uint16_srl( c_m_added, one );
  const uint16 c_m_no_hidden             = _uint16_sels( is_c_m_carry_msb, c_m_sub_hidden, c_m_added );
  const uint16 c_e_no_hidden             = _uint16_add( c_e_added,         c_e_hidden_offset  );
  const uint16 c_m_no_hidden_msb         = _uint16_and( c_m_no_hidden,     h_m_msb_mask       );
  const uint16 undenorm_m_msb_odd        = _uint16_srl( c_m_no_hidden_msb, h_m_msb_sa         );
  const uint16 undenorm_fix_e            = _uint16_and( is_undenorm,       undenorm_m_msb_odd );
  const uint16 c_e_fixed                 = _uint16_add( c_e_no_hidden,     undenorm_fix_e     );
  const uint16 c_m_round_amount          = _uint16_and( c_m_no_hidden,     h_grs_round_mask   );
  const uint16 c_m_rounded               = _uint16_add( c_m_no_hidden,     c_m_round_amount   );
  const uint16 c_m_round_overflow        = _uint16_andsrl( c_m_rounded, h_m_grs_carry, h_m_grs_carry_pos );
  const uint16 c_e_rounded               = _uint16_add( c_e_fixed, c_m_round_overflow );
  const uint16 c_m_no_grs                = _uint16_srlm( c_m_rounded, h_grs_size,  h_m_mask );
  const uint16 c_e                       = _uint16_sll( c_e_rounded, h_e_pos );
  const uint16 c_em                      = _uint16_or( c_e, c_m_no_grs );
  const uint16 c_normal                  = _uint16_or( c_s, c_em );
  const uint16 c_inf_result              = _uint16_sels( is_a_inf_msb, c_inf, c_normal );
  const uint16 c_zero_result             = _uint16_andc( c_inf_result, is_diff_exactly_zero );
  const uint16 c_result                  = _uint16_sels( is_invalid_inf_op_msb, h_snan, c_zero_result );
-  return (c_result);
+    return (f_result);
 }
--- a/src/nvmath/Half.h
+++ b/src/nvmath/Half.h
@ -1,9 +1,17 @@
 #pragma once
 #ifndef NV_MATH_HALF_H
 #define NV_MATH_HALF_H
-#include <nvmath/nvmath.h>
+#include "nvmath.h"
-uint32 half_to_float( uint16 h );
+namespace nv {
 uint16 half_from_float( uint32 f );
-#endif /* NV_MATH_HALF_H */
+    uint32 half_to_float( uint16 h );
    uint16 half_from_float( uint32 f );
    // Does not handle NaN or infinity.
    uint32 fast_half_to_float( uint16 h );
 } // nv namespace
 #endif // NV_MATH_HALF_H
--- a/src/nvmath/Matrix.h
+++ b/src/nvmath/Matrix.h
--- a/src/nvmath/Plane.cpp
+++ b/src/nvmath/Plane.cpp
@ -14,4 +14,13 @@ namespace nv
        return Plane(newVec, ptInPlane);
    }
-}
+
    /// Computes the point shared by the three planes a, b and c.
    ///
    /// The 3x3 linear system is solved with Cramer's rule: the offset-weighted
    /// sum of the pairwise cross products of the plane vectors is divided by
    /// the scalar triple product of those vectors (the system determinant).
    /// The previous code multiplied by the determinant instead of dividing,
    /// which is only correct when the determinant happens to be +1 or -1.
    ///
    /// NOTE(review): assumes the plane equation is dot(vector(), x) == offset();
    /// confirm the sign convention against the Plane definition.
    ///
    /// When the three plane vectors are linearly dependent the determinant is
    /// zero and the returned components are not finite; callers that may pass
    /// such planes should check for that.
    Vector3 planeIntersection(Plane::Arg a, Plane::Arg b, Plane::Arg c)
    {
        const Vector3 bxc = cross(b.vector(), c.vector());
        const Vector3 axb = cross(a.vector(), b.vector());
        const Vector3 cxa = cross(c.vector(), a.vector());

        // Scalar triple product a . (b x c): determinant of the linear system.
        const float det = dot(a.vector(), bxc);

        // Scale by the reciprocal of the determinant (divide, not multiply).
        return (1.0f / det) * (
            a.offset() * bxc +
            c.offset() * axb +
            b.offset() * cxa);
    }
 } // nv namespace
--- a/src/nvmath/Plane.h
+++ b/src/nvmath/Plane.h
@ -1,10 +1,11 @@
 // This code is in the public domain -- castanyo@yahoo.es
 #pragma once
 #ifndef NV_MATH_PLANE_H
 #define NV_MATH_PLANE_H
-#include "nvmath.h"
+#include <nvmath/nvmath.h>
-#include "Vector.h"
+#include <nvmath/Vector.h>
 namespace nv
 {
@ -45,7 +46,7 @@ namespace nv
    // Assignment: copies the packed vector p from v and returns this plane.
    inline const Plane & Plane::operator=(Plane::Arg v)
    {
        p = v.p;
        return *this;
    }
    // Returns the xyz components of the packed vector p.
    inline Vector3 Plane::vector() const
    {
        return p.xyz();
    }
-	inline scalar Plane::offset() const { return p.w(); }
+    inline scalar Plane::offset() const { return p.w; }
    // Read-only access to the underlying Vector4 storage p.
    inline const Vector4 & Plane::asVector() const
    {
        return p;
    }
    // Mutable access to the underlying Vector4 storage p.
    inline Vector4 & Plane::asVector()
    {
        return p;
    }
@ -72,6 +73,9 @@ namespace nv
    Plane transformPlane(const Matrix&, Plane::Arg);
    Vector3 planeIntersection(Plane::Arg a, Plane::Arg b, Plane::Arg c);
 } // nv namespace
 #endif // NV_MATH_PLANE_H
--- a/src/nvmath/Vector.h
+++ b/src/nvmath/Vector.h
--- a/src/nvmath/nvmath.h
+++ b/src/nvmath/nvmath.h
@ -1,5 +1,6 @@
 // This code is in the public domain -- castanyo@yahoo.es
 #pragma once
 #ifndef NV_MATH_H
 #define NV_MATH_H
@ -97,6 +98,7 @@ inline float asinf_assert(const float f)
 #define asin asin_assert
 #define asinf asinf_assert
 namespace nv
 {
    // Converts an angle given in degrees to radians (degree * PI / 180).
    inline float toRadian(float degree)
    {
        return degree * (PI / 180.0f);
    }
@ -161,10 +163,11 @@ namespace nv
        return f0 * s + f1 * t;
    }
-    inline float square(float f)
+    inline float square(float f) { return f * f; }
-    {
+    inline int square(int i) { return i * i; }
-        return f * f;
+
-    }
+    /// Returns f raised to the third power (f * f * f).
+    inline float cube(float f) { return f * f * f; }
    /// Returns i raised to the third power (i * i * i).
    inline int cube(int i) { return i * i * i; }
    // @@ Float to int conversions to be optimized at some point. See:
    // http://cbloomrants.blogspot.com/2009/01/01-17-09-float-to-int.html