Merge private branch.

2010-05-27 23:19:24 +00:00 · 2010-05-27 23:19:24 +00:00 · c09067e477
commit c09067e477
parent 51a4fe7e2d
13 changed files with 2701 additions and 2622 deletions
--- a/src/nvmath/Box.cpp
+++ b/src/nvmath/Box.cpp
@ -0,0 +1,30 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#include "nvmath/Box.h"
+#include "nvmath/Sphere.h"
+
+using namespace nv;
+
+
+
+// Squared distance from 'point' to the closest point of 'box'. Each coordinate
+// of the point is clamped to the box range on that axis; a point inside the box
+// is its own closest point.
+float nv::distanceSquared(const Box &box, const Vector3 &point) {
+    const Vector3 &lo = box.minCorner;
+    const Vector3 &hi = box.maxCorner;
+
+    // Nearest point of the box: below the range -> min, above the range -> max,
+    // otherwise the coordinate is already inside the range.
+    Vector3 nearest;
+    nearest.x = (point.x < lo.x) ? lo.x : ((point.x > hi.x) ? hi.x : point.x);
+    nearest.y = (point.y < lo.y) ? lo.y : ((point.y > hi.y) ? hi.y : point.y);
+    nearest.z = (point.z < lo.z) ? lo.z : ((point.z > hi.z) ? hi.z : point.z);
+
+    // Offset from that nearest point back to the query point.
+    return lengthSquared(point - nearest);
+}
+
+bool nv::overlap(const Box &box, const Sphere &sphere) {   // strict comparison: a sphere that merely touches the box does not count
+    return sphere.radius * sphere.radius > distanceSquared(box, sphere.center);
+}
--- a/src/nvmath/Box.h
+++ b/src/nvmath/Box.h
@ -1,142 +1,156 @@
 // This code is in the public domain -- castanyo@yahoo.es

+#pragma once
 #ifndef NV_MATH_BOX_H
 #define NV_MATH_BOX_H

-#include <nvmath/Vector.h>
+#include "Vector.h"

 #include <float.h> // FLT_MAX

 namespace nv
 {
-class Stream;
+    class Stream;
+    class Sphere;

-/// Axis Aligned Bounding Box.
-class Box
-{
-public:
+    /// Axis Aligned Bounding Box.
+    class Box
+    {
+    public:

-	/// Default ctor.
-	Box() { };
+        /// Default ctor. Sets nothing explicitly; both corners are default-constructed.
+        Box() { };

-	/// Copy ctor.
-	Box( const Box & b ) : m_mins(b.m_mins), m_maxs(b.m_maxs) { }
+        /// Copy ctor. Copies both corners of b.
+        Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { }

-	/// Init ctor.
-	Box( Vector3::Arg mins, Vector3::Arg maxs ) : m_mins(mins), m_maxs(maxs) { }
+        /// Init ctor. Stores mins and maxs as given; there is no check that mins <= maxs.
+        Box(Vector3::Arg mins, Vector3::Arg maxs) : minCorner(mins), maxCorner(maxs) { }

-	// Cast operators.
-	operator const float * () const { return reinterpret_cast<const float *>(this); }
+        // Cast operators. Reinterprets the object itself as a float array. NOTE(review): presumably min xyz followed by max xyz; assumes Vector3 is three packed floats -- confirm.
+        operator const float * () const { return reinterpret_cast<const float *>(this); }

-	// Min corner of the box.
-	Vector3 minCorner() const { return m_mins; }
-	Vector3 & minCorner() { return m_mins; }
+        /// Reset to an inverted box: min corner at +FLT_MAX, max corner at -FLT_MAX.
+        void clearBounds()
+        {
+            maxCorner.set(-FLT_MAX, -FLT_MAX, -FLT_MAX);
+            minCorner.set(FLT_MAX, FLT_MAX, FLT_MAX);
+        }

-	// Max corner of the box.
-	Vector3 maxCorner() const { return m_maxs; }
-	Vector3 & maxCorner() { return m_maxs; }
+        /// Turn this box into the cube centered on center with half-edge dist (full edge 2*dist).
+        void cube(Vector3::Arg center, float dist)
+        {
+            const Vector3 halfEdge(dist, dist, dist);
+            setCenterExtents(center, halfEdge);
+        }

-	/// Clear the bounds.
-	void clearBounds()
-	{
-		m_mins.set(FLT_MAX, FLT_MAX, FLT_MAX);
-		m_maxs.set(-FLT_MAX, -FLT_MAX, -FLT_MAX);
-	}
+        /// Build a box, given center and extents (min = center - extents, max = center + extents).
+        void setCenterExtents(Vector3::Arg center, Vector3::Arg extents)
+        {
+            minCorner = center - extents;
+            maxCorner = center + extents;   // NOTE(review): arguments are read again after minCorner was written; if Vector3::Arg is a reference, avoid passing this box's own corners -- confirm
+        }

-	/// Build a cube centered on center and with edge = 2*dist
-	void cube(Vector3::Arg center, float dist)
-	{
-		setCenterExtents(center, Vector3(dist, dist, dist));
-	}
+        /// Midpoint between the two corners.
+        Vector3 center() const
+        {
+            const Vector3 cornerSum = minCorner + maxCorner;
+            return cornerSum * 0.5f;
+        }

-	/// Build a box, given center and extents.
-	void setCenterExtents(Vector3::Arg center, Vector3::Arg extents)
-	{
-		m_mins = center - extents;
-		m_maxs = center + extents;
-	}
+        /// Half of (max corner - min corner), i.e. the half-size of the box.
+        Vector3 extents() const
+        {
+            const Vector3 fullSize = maxCorner - minCorner;
+            return fullSize * 0.5f;
+        }

-	/// Get box center.
-	Vector3 center() const
-	{
-		return (m_mins + m_maxs) * 0.5f;
-	}
+        /// Half-size of the box along a single axis (0 = x, 1 = y, 2 = z).
+        scalar extents(uint axis) const
+        {
+            nvDebugCheck(axis < 3);
+            switch (axis)
+            {
+                case 0: return (maxCorner.x - minCorner.x) * 0.5f;
+                case 1: return (maxCorner.y - minCorner.y) * 0.5f;
+                case 2: return (maxCorner.z - minCorner.z) * 0.5f;
+            }
+            // Only reached when axis is out of range.
+            nvAssume(false);
+            return 0.0f;
+        }

-	/// Return extents of the box.
-	Vector3 extents() const
-	{
-		return (m_maxs - m_mins) * 0.5f;
-	}
+        /// Add a point to this box: min corner = min(minCorner, p), max corner = max(maxCorner, p).
+        void addPointToBounds(Vector3::Arg p)
+        {
+            maxCorner = max(maxCorner, p);
+            minCorner = min(minCorner, p);
+        }

-	/// Return extents of the box.
-	scalar extents(uint axis) const
-	{
-		nvDebugCheck(axis < 3);
-		if (axis == 0) return (m_maxs.x() - m_mins.x()) * 0.5f;
-		if (axis == 1) return (m_maxs.y() - m_mins.y()) * 0.5f;
-		if (axis == 2) return (m_maxs.z() - m_mins.z()) * 0.5f;
-		nvAssume(false);
-		return 0.0f;
-	}
+        /// Add a box to this box: each corner is merged with the matching corner of b.
+        void addBoxToBounds(const Box & b)
+        {
+            maxCorner = max(maxCorner, b.maxCorner);
+            minCorner = min(minCorner, b.minCorner);
+        }

-	/// Add a point to this box.
-	void addPointToBounds(Vector3::Arg p)
-	{
-		m_mins = min(m_mins, p);
-		m_maxs = max(m_maxs, p);
-	}
+        /// Translate box: adds v to both corners.
+        void translate(Vector3::Arg v)
+        {
+            minCorner += v;
+            maxCorner += v;   // NOTE(review): v is read again after minCorner changed; if Vector3::Arg is a reference, avoid passing this box's own minCorner -- confirm
+        }

-	/// Add a box to this box.
-	void addBoxToBounds(const Box & b)
-	{
-		m_mins = min(m_mins, b.m_mins);
-		m_maxs = max(m_maxs, b.m_maxs);
-	}
+        /// Multiply both corners by s, i.e. scale the box about the origin.
+        void scale(float s)
+        {
+            maxCorner *= s;
+            minCorner *= s;
+        }

-	/// Translate box.
-	void translate(Vector3::Arg v)
-	{
-		m_mins += v;
-		m_maxs += v;
-	}
+        /// Move the min corner down by r and the max corner up by r on every axis.
+        void expand(float r)
+        {
+            const Vector3 margin(r, r, r);
+            minCorner -= margin;
+            maxCorner += margin;
+        }

-	/// Scale the box.
-	void scale(float s)
-	{
-		m_mins *= s;
-		m_maxs *= s;
-	}
+        /// Surface area of the box: 8 * (ex*ey + ex*ez + ey*ez), where e = extents().
+        float area() const
+        {
+            const Vector3 half = extents();
+            const float faceSum = half.x*half.y + half.x*half.z + half.y*half.z;
+            return 8.0f * faceSum;
+        }

-	/// Get the area of the box.
-	float area() const
-	{
-		const Vector3 d = extents();
-		return 8.0f * (d.x()*d.y() + d.x()*d.z() + d.y()*d.z());
-	}	
+        /// Volume of the box: 8 * ex * ey * ez, where e = extents().
+        float volume() const
+        {
+            const Vector3 half = extents();
+            const float octant = half.x * half.y * half.z;
+            return 8.0f * octant;
+        }

-	/// Get the volume of the box.
-	float volume() const
-	{
-		Vector3 d = extents();
-		return 8.0f * (d.x() * d.y() * d.z());
-	}
-	
-	/// Return true if the box contains the given point.
-	bool contains(Vector3::Arg p) const
-	{
-		return 
-			m_mins.x() < p.x() && m_mins.y() < p.y() && m_mins.z() < p.z() &&
-			m_maxs.x() > p.x() && m_maxs.y() > p.y() && m_maxs.z() > p.z();
-	}
+        /// True when p lies strictly inside the box; a point on the boundary is not contained.
+        bool contains(Vector3::Arg p) const
+        {
+            const bool aboveMin = minCorner.x < p.x && minCorner.y < p.y && minCorner.z < p.z;
+            const bool belowMax = maxCorner.x > p.x && maxCorner.y > p.y && maxCorner.z > p.z;
+            return aboveMin && belowMax;
+        }

-	friend Stream & operator<< (Stream & s, Box & box);
+        /// Split the given box in 8 octants and assign the ith one to this box.
+        void setOctant(const Box & box, Vector3::Arg center, int i)
+        {
+            minCorner = box.minCorner;
+            maxCorner = box.maxCorner;

-private:
+            if (i & 4) minCorner.x = center.x;
+            else       maxCorner.x = center.x;
+            if (i & 2) minCorner.y = center.y;
+            else       maxCorner.y = center.y;
+            if (i & 1) minCorner.z = center.z;
+            else       maxCorner.z = center.z;
+        }

-	Vector3 m_mins;
-	Vector3 m_maxs;
-};
+        friend Stream & operator<< (Stream & s, Box & box);

+        Vector3 minCorner;
+        Vector3 maxCorner;
+    };
+
+    float distanceSquared(const Box &box, const Vector3 &point);   // squared distance from point to the closest point of box (defined in Box.cpp)
+    bool overlap(const Box &box, const Sphere &sphere);            // true when that squared distance for the sphere center is below radius^2 (defined in Box.cpp)


 } // nv namespace
--- a/src/nvmath/CMakeLists.txt
+++ b/src/nvmath/CMakeLists.txt
@ -5,7 +5,7 @@ SET(MATH_SRCS
    Vector.h
    Matrix.h
    Plane.h Plane.cpp
-    Box.h
+    Box.h Box.cpp
    Color.h
    Half.h Half.cpp
    Fitting.h Fitting.cpp)
--- a/src/nvmath/Color.h
+++ b/src/nvmath/Color.h
@ -1,178 +1,179 @@
 // This code is in the public domain -- castanyo@yahoo.es

+#pragma once
 #ifndef NV_MATH_COLOR_H
 #define NV_MATH_COLOR_H

-#include <nvcore/Debug.h>
-#include <nvmath/Vector.h>
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.h"

 namespace nv
 {

-/// 64 bit color stored as BGRA.
-class NVMATH_CLASS Color64 
-{
-public:
-	Color64() { }
-	Color64(const Color64 & c) : u(c.u) { }
-	Color64(uint16 R, uint16 G, uint16 B, uint16 A) { setRGBA(R, G, B, A); }
-	explicit Color64(uint64 U) : u(U) { }
+    /// 64 bit color stored as BGRA.
+    class NVMATH_CLASS Color64 
+    {
+    public:
+        Color64() { }
+        Color64(const Color64 & c) : u(c.u) { }
+        Color64(uint16 R, uint16 G, uint16 B, uint16 A) { setRGBA(R, G, B, A); }
+        explicit Color64(uint64 U) : u(U) { }

-	void setRGBA(uint16 R, uint16 G, uint16 B, uint16 A)
-	{
-		r = R;
-		g = G;
-		b = B;
-		a = A;
-	}
+        void setRGBA(uint16 R, uint16 G, uint16 B, uint16 A)
+        {
+            r = R;
+            g = G;
+            b = B;
+            a = A;
+        }

-	operator uint64 () const {
-		return u;
-	}
+        operator uint64 () const {
+            return u;
+        }

-	union {
-		struct {
+        union {
+            struct {
 #if NV_LITTLE_ENDIAN
-			uint16 r, a, b, g;
+                uint16 r, a, b, g;
 #else
-			uint16 a: 16;
-			uint16 r: 16;
-			uint16 g: 16;
-			uint16 b: 16;
+                uint16 a: 16;
+                uint16 r: 16;
+                uint16 g: 16;
+                uint16 b: 16;
 #endif
-		};
-		uint64 u;
-	};
-};
+            };
+            uint64 u;
+        };
+    };

-/// 32 bit color stored as BGRA.
-class NVMATH_CLASS Color32
-{
-public:
-	Color32() { }
-	Color32(const Color32 & c) : u(c.u) { }
-	Color32(uint8 R, uint8 G, uint8 B) { setRGBA(R, G, B, 0xFF); }
-	Color32(uint8 R, uint8 G, uint8 B, uint8 A) { setRGBA( R, G, B, A); }
-	//Color32(uint8 c[4]) { setRGBA(c[0], c[1], c[2], c[3]); }
-	//Color32(float R, float G, float B) { setRGBA(uint(R*255), uint(G*255), uint(B*255), 0xFF); }
-	//Color32(float R, float G, float B, float A) { setRGBA(uint(R*255), uint(G*255), uint(B*255), uint(A*255)); }
-	explicit Color32(uint32 U) : u(U) { }
+    /// 32 bit color stored as BGRA.
+    class NVMATH_CLASS Color32
+    {
+    public:
+        Color32() { }
+        Color32(const Color32 & c) : u(c.u) { }
+        Color32(uint8 R, uint8 G, uint8 B) { setRGBA(R, G, B, 0xFF); }
+        Color32(uint8 R, uint8 G, uint8 B, uint8 A) { setRGBA( R, G, B, A); }
+        //Color32(uint8 c[4]) { setRGBA(c[0], c[1], c[2], c[3]); }
+        //Color32(float R, float G, float B) { setRGBA(uint(R*255), uint(G*255), uint(B*255), 0xFF); }
+        //Color32(float R, float G, float B, float A) { setRGBA(uint(R*255), uint(G*255), uint(B*255), uint(A*255)); }
+        explicit Color32(uint32 U) : u(U) { }

-	void setRGBA(uint8 R, uint8 G, uint8 B, uint8 A)
-	{
-		r = R;
-		g = G;
-		b = B;
-		a = A;
-	}
+        void setRGBA(uint8 R, uint8 G, uint8 B, uint8 A)
+        {
+            r = R;
+            g = G;
+            b = B;
+            a = A;
+        }

-	void setBGRA(uint8 B, uint8 G, uint8 R, uint8 A = 0xFF)
-	{
-		r = R;
-		g = G;
-		b = B;
-		a = A;
-	}
-	
-	operator uint32 () const {
-		return u;
-	}
-	
-	union {
-		struct {
+        void setBGRA(uint8 B, uint8 G, uint8 R, uint8 A = 0xFF)
+        {
+            r = R;
+            g = G;
+            b = B;
+            a = A;
+        }
+
+        operator uint32 () const {
+            return u;
+        }
+
+        union {
+            struct {
 #if NV_LITTLE_ENDIAN
-			uint8 b, g, r, a;
+                uint8 b, g, r, a;
 #else
-			uint8 a: 8;
-			uint8 r: 8;
-			uint8 g: 8;
-			uint8 b: 8;
+                uint8 a: 8;
+                uint8 r: 8;
+                uint8 g: 8;
+                uint8 b: 8;
 #endif
-		};
-		uint32 u;
-	};
-};
+            };
+            uint32 u;
+        };
+    };


-/// 16 bit 565 BGR color.
-class NVMATH_CLASS Color16
-{
-public:
-	Color16() { }
-	Color16(const Color16 & c) : u(c.u) { }
-	explicit Color16(uint16 U) : u(U) { }
-	
-	union {
-		struct {
+    /// 16 bit 565 BGR color.
+    class NVMATH_CLASS Color16
+    {
+    public:
+        Color16() { }
+        Color16(const Color16 & c) : u(c.u) { }
+        explicit Color16(uint16 U) : u(U) { }
+
+        union {
+            struct {
 #if NV_LITTLE_ENDIAN
-			uint16 b : 5;
-			uint16 g : 6;
-			uint16 r : 5;
+                uint16 b : 5;
+                uint16 g : 6;
+                uint16 r : 5;
 #else
-			uint16 r : 5;
-			uint16 g : 6;
-			uint16 b : 5;
+                uint16 r : 5;
+                uint16 g : 6;
+                uint16 b : 5;
 #endif
-		};
-		uint16 u;
-	};
-};
+            };
+            uint16 u;
+        };
+    };


-/// Clamp color components.
-inline Vector3 colorClamp(Vector3::Arg c)
-{
-	return Vector3(clamp(c.x(), 0.0f, 1.0f), clamp(c.y(), 0.0f, 1.0f), clamp(c.z(), 0.0f, 1.0f));
-}
+    /// Clamp each component of c to the [0, 1] range.
+    inline Vector3 colorClamp(Vector3::Arg c)
+    {
+        const float r = clamp(c.x, 0.0f, 1.0f);
+        const float g = clamp(c.y, 0.0f, 1.0f);
+        const float b = clamp(c.z, 0.0f, 1.0f);
+        return Vector3(r, g, b);
+    }

-/// Clamp without allowing the hue to change.
-inline Vector3 colorNormalize(Vector3::Arg c)
-{
-	float scale = 1.0f;
-	if (c.x() > scale) scale = c.x();
-	if (c.y() > scale) scale = c.y();
-	if (c.z() > scale) scale = c.z();
-	return c / scale;
-}
+    /// Clamp without allowing the hue to change: divide c by its largest
+    /// component when that component exceeds 1, otherwise return c / 1.
+    inline Vector3 colorNormalize(Vector3::Arg c)
+    {
+        float largest = 1.0f;   // the divisor never drops below 1, so colors are never scaled up
+        largest = (c.x > largest) ? c.x : largest;
+        largest = (c.y > largest) ? c.y : largest;
+        largest = (c.z > largest) ? c.z : largest;
+        return c / largest;
+    }

-/// Convert Color32 to Color16.
-inline Color16 toColor16(Color32 c)
-{
-	Color16 color;
-	//         rrrrrggggggbbbbb
-	// rrrrr000gggggg00bbbbb000
-//	color.u = (c.u >> 3) & 0x1F;
-//	color.u |= (c.u >> 5) & 0x7E0;
-//	color.u |= (c.u >> 8) & 0xF800;
-	
-	color.r = c.r >> 3;
-	color.g = c.g >> 2;
-	color.b = c.b >> 3;
-	return color; 
-}
+    /// Convert Color32 to Color16 by keeping the top 5/6/5 bits of r/g/b; alpha is not used.
+    inline Color16 toColor16(Color32 c)
+    {
+        // Target bits:              rrrrrggggggbbbbb
+        // Source bits kept: rrrrr000gggggg00bbbbb000
+        Color16 packed;
+        packed.b = c.b >> 3;    // 8 -> 5 bits
+        packed.g = c.g >> 2;    // 8 -> 6 bits
+        packed.r = c.r >> 3;    // 8 -> 5 bits
+        return packed;
+    }


-/// Promote 16 bit color to 32 bit using regular bit expansion.
-inline Color32 toColor32(Color16 c)
-{
-	Color32 color;
-//	c.u = ((col0.u << 3) & 0xf8) | ((col0.u << 5) & 0xfc00) | ((col0.u << 8) & 0xf80000);
-//	c.u |= (c.u >> 5) & 0x070007;
-//	c.u |= (c.u >> 6) & 0x000300;
-	
-	color.b = (c.b << 3) | (c.b >> 2);
-	color.g = (c.g << 2) | (c.g >> 4);
-	color.r = (c.r << 3) | (c.r >> 2);
-	color.a = 0xFF;
-	
-	return color;
-}
+    /// Promote 16 bit color to 32 bit using regular bit expansion.
+    inline Color32 toColor32(Color16 c)
+    {
+        Color32 color;
+        //	c.u = ((col0.u << 3) & 0xf8) | ((col0.u << 5) & 0xfc00) | ((col0.u << 8) & 0xf80000);
+        //	c.u |= (c.u >> 5) & 0x070007;
+        //	c.u |= (c.u >> 6) & 0x000300;

-inline Vector4 toVector4(Color32 c)
-{
-	const float scale = 1.0f / 255.0f;
-	return Vector4(c.r * scale, c.g * scale, c.b * scale, c.a * scale);
-}
+        color.b = (c.b << 3) | (c.b >> 2);
+        color.g = (c.g << 2) | (c.g >> 4);
+        color.r = (c.r << 3) | (c.r >> 2);
+        color.a = 0xFF;
+
+        return color;
+    }
+
+    /// Convert an 8 bit per channel color to a Vector4 (r, g, b, a), each channel multiplied by 1/255.
+    inline Vector4 toVector4(Color32 c)
+    {
+        const float inv255 = 1.0f / 255.0f;
+        const float r = c.r * inv255;
+        const float g = c.g * inv255;
+        const float b = c.b * inv255;
+        const float a = c.a * inv255;
+        return Vector4(r, g, b, a);
+    }

 } // nv namespace

--- a/src/nvmath/Fitting.cpp
+++ b/src/nvmath/Fitting.cpp
@ -1,9 +1,7 @@
 // This code is in the public domain -- icastano@gmail.com

 #include "Fitting.h"
-
-#include <nvcore/Algorithms.h> // max
-#include <nvcore/Containers.h> // swap
+#include "nvcore/Utils.h" // max, swap

 #include <float.h> // FLT_MAX

@ -12,236 +10,236 @@ using namespace nv;
 // @@ Move to EigenSolver.h
 static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix)
 {
-	if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0)
-	{
-		return Vector3(zero);
-	}
-	
-	const int NUM = 8;
+    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
+    {
+        return Vector3(zero);
+    }

-	Vector3 v(1, 1, 1);
-	for (int i = 0; i < NUM; i++)
-	{
-		float x = v.x() * matrix[0] + v.y() * matrix[1] + v.z() * matrix[2];
-		float y = v.x() * matrix[1] + v.y() * matrix[3] + v.z() * matrix[4];
-		float z = v.x() * matrix[2] + v.y() * matrix[4] + v.z() * matrix[5];
-		
-		float norm = max(max(x, y), z);
-	
-		v = Vector3(x, y, z) / norm;
-	}
+    const int NUM = 8;

-	return v;	
+    Vector3 v(1, 1, 1);
+    // Power iteration: repeatedly multiply v by the symmetric matrix
+    // (6 floats: xx, xy, xz, yy, yz, zz) and rescale by the largest component.
+    for (int i = 0; i < NUM; i++)
+    {
+        float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
+        float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
+        float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
+
+        float norm = max(max(x, y), z);
+        if (norm == 0.0f)
+        {
+            // The scale factor collapsed to zero (e.g. the matrix maps v to the
+            // zero vector). Dividing would yield inf/NaN that propagates into
+            // the callers, so report "no direction" like the early-out above.
+            return Vector3(zero);
+        }
+
+        v = Vector3(x, y, z) / norm;
+    }
+
+    return v;	
 }


 Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points)
 {
-	Vector3 centroid(zero);
+    Vector3 centroid(zero);

-	for (int i = 0; i < n; i++)
-	{
-		centroid += points[i];
-	}
-	centroid /= float(n);
+    for (int i = 0; i < n; i++)
+    {
+        centroid += points[i];
+    }
+    centroid /= float(n);

-	return centroid;
+    return centroid;
 }

 Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
 {
-	Vector3 centroid(zero);
-	float total = 0.0f;
+    Vector3 centroid(zero);
+    float total = 0.0f;

-	for (int i = 0; i < n; i++)
-	{
-		total += weights[i];
-		centroid += weights[i]*points[i];
-	}
-	centroid /= total;
+    for (int i = 0; i < n; i++)
+    {
+        total += weights[i];
+        centroid += weights[i]*points[i];
+    }
+    centroid /= total;

-	return centroid;
+    return centroid;
 }


 Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, float *__restrict covariance)
 {
-	// compute the centroid
-	Vector3 centroid = computeCentroid(n, points);
+    // compute the centroid
+    Vector3 centroid = computeCentroid(n, points);

-	// compute covariance matrix
-	for (int i = 0; i < 6; i++)
-	{
-		covariance[i] = 0.0f;
-	}
+    // compute covariance matrix
+    for (int i = 0; i < 6; i++)
+    {
+        covariance[i] = 0.0f;
+    }

-	for (int i = 0; i < n; i++)
-	{
-		Vector3 v = points[i] - centroid;
-		
-		covariance[0] += v.x() * v.x();
-		covariance[1] += v.x() * v.y();
-		covariance[2] += v.x() * v.z();
-		covariance[3] += v.y() * v.y();
-		covariance[4] += v.y() * v.z();
-		covariance[5] += v.z() * v.z();
-	}
+    for (int i = 0; i < n; i++)
+    {
+        Vector3 v = points[i] - centroid;

-	return centroid;
+        covariance[0] += v.x * v.x;
+        covariance[1] += v.x * v.y;
+        covariance[2] += v.x * v.z;
+        covariance[3] += v.y * v.y;
+        covariance[4] += v.y * v.z;
+        covariance[5] += v.z * v.z;
+    }
+
+    return centroid;
 }

 Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, float *__restrict covariance)
 {
-	// compute the centroid
-	Vector3 centroid = computeCentroid(n, points, weights, metric);
+    // compute the centroid
+    Vector3 centroid = computeCentroid(n, points, weights, metric);

-	// compute covariance matrix
-	for (int i = 0; i < 6; i++)
-	{
-		covariance[i] = 0.0f;
-	}
+    // compute covariance matrix
+    for (int i = 0; i < 6; i++)
+    {
+        covariance[i] = 0.0f;
+    }

-	for (int i = 0; i < n; i++)
-	{
-		Vector3 a = (points[i] - centroid) * metric;
-		Vector3 b = weights[i]*a;
-		
-		covariance[0] += a.x()*b.x();
-		covariance[1] += a.x()*b.y();
-		covariance[2] += a.x()*b.z();
-		covariance[3] += a.y()*b.y();
-		covariance[4] += a.y()*b.z();
-		covariance[5] += a.z()*b.z();
-	}
+    for (int i = 0; i < n; i++)
+    {
+        Vector3 a = (points[i] - centroid) * metric;
+        Vector3 b = weights[i]*a;

-	return centroid;
+        covariance[0] += a.x * b.x;
+        covariance[1] += a.x * b.y;
+        covariance[2] += a.x * b.z;
+        covariance[3] += a.y * b.y;
+        covariance[4] += a.y * b.z;
+        covariance[5] += a.z * b.z;
+    }
+
+    return centroid;
 }

 Vector3 nv::Fit::computePrincipalComponent(int n, const Vector3 *__restrict points)
 {
-	float matrix[6];
-	computeCovariance(n, points, matrix);
+    float matrix[6];
+    computeCovariance(n, points, matrix);

-	return firstEigenVector_PowerMethod(matrix);
+    return firstEigenVector_PowerMethod(matrix);
 }

 Vector3 nv::Fit::computePrincipalComponent(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
 {
-	float matrix[6];
-	computeCovariance(n, points, weights, metric, matrix);
+    float matrix[6];
+    computeCovariance(n, points, weights, metric, matrix);

-	return firstEigenVector_PowerMethod(matrix);
+    return firstEigenVector_PowerMethod(matrix);
 }


 Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points)
 {
-	// compute the centroid and covariance
-	float matrix[6];
-	Vector3 centroid = computeCovariance(n, points, matrix);
+    // compute the centroid and covariance
+    float matrix[6];
+    Vector3 centroid = computeCovariance(n, points, matrix);

-	if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0)
-	{
-		// If no plane defined, then return a horizontal plane.
-		return Plane(Vector3(0, 0, 1), centroid);
-	}
+    if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0)
+    {
+        // If no plane defined, then return a horizontal plane.
+        return Plane(Vector3(0, 0, 1), centroid);
+    }

 #pragma message(NV_FILE_LINE "TODO: need to write an eigensolver!")

-	// - Numerical Recipes in C is a good reference. Householder transforms followed by QL decomposition seems to be the best approach.
-	// - The one from magic-tools is now LGPL. For the 3D case it uses a cubic root solver, which is not very accurate.
-	// - Charles' Galaxy3 contains an implementation of the tridiagonalization method, but is under BPL.
+    // - Numerical Recipes in C is a good reference. Householder transforms followed by QL decomposition seems to be the best approach.
+    // - The one from magic-tools is now LGPL. For the 3D case it uses a cubic root solver, which is not very accurate.
+    // - Charles' Galaxy3 contains an implementation of the tridiagonalization method, but is under BPL.

-	//EigenSolver3 solver(matrix);
+    //EigenSolver3 solver(matrix);

-	return Plane();
+    return Plane();
 }


 int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster)
 {
-	// Compute principal component.
-	float matrix[6];
-	Vector3 centroid = computeCovariance(n, points, weights, metric, matrix);
-	Vector3 principal = firstEigenVector_PowerMethod(matrix);
-	
-	// Pick initial solution.
-	int mini, maxi;
-	mini = maxi = 0;
-	
-	float mindps, maxdps;
-	mindps = maxdps = dot(points[0] - centroid, principal);
-	
-	for (int i = 1; i < n; ++i)
-	{
-		float dps = dot(points[i] - centroid, principal);
-		
-		if (dps < mindps) {
-			mindps = dps;
-			mini = i;
-		}
-		else {
-			maxdps = dps;
-			maxi = i;
-		}
-	}
+    // Compute principal component.
+    float matrix[6];
+    Vector3 centroid = computeCovariance(n, points, weights, metric, matrix);
+    Vector3 principal = firstEigenVector_PowerMethod(matrix);

-	cluster[0] = centroid + mindps * principal;
-	cluster[1] = centroid + maxdps * principal;
-	cluster[2] = (2 * cluster[0] + cluster[1]) / 3;
-	cluster[3] = (2 * cluster[1] + cluster[0]) / 3;
+    // Pick initial solution.
+    int mini, maxi;
+    mini = maxi = 0;

-	// Now we have to iteratively refine the clusters.
-	while (true)
-	{
-		Vector3 newCluster[4] = { Vector3(zero), Vector3(zero), Vector3(zero), Vector3(zero) };
-		float total[4] = {0, 0, 0, 0};
-		
-		for (int i = 0; i < n; ++i)
-		{
-			// Find nearest cluster.
-			int nearest = 0;
-			float mindist = FLT_MAX;
-			for (int j = 0; j < 4; j++)
-			{
-				float dist = length_squared((cluster[j] - points[i]) * metric);
-				if (dist < mindist)
-				{
-					mindist = dist;
-					nearest = j;
-				}
-			}
-			
-			newCluster[nearest] += weights[i] * points[i];
-			total[nearest] += weights[i];
-		}
+    float mindps, maxdps;
+    mindps = maxdps = dot(points[0] - centroid, principal);

-		for (int j = 0; j < 4; j++)
-		{
+    // Find the smallest and largest projection of the points onto the
+    // principal axis; they seed the two end clusters below.
+    for (int i = 1; i < n; ++i)
+    {
+        const float dps = dot(points[i] - centroid, principal);
+
+        if (dps < mindps) {
+            mindps = dps;
+            mini = i;
+        }
+        else if (dps > maxdps) {
+            // Only a genuinely larger projection may replace the maximum;
+            // an unconditional else would leave maxdps at the last
+            // non-minimal value instead of the largest one.
+            maxdps = dps;
+            maxi = i;
+        }
+    }
+
+    cluster[0] = centroid + mindps * principal;
+    cluster[1] = centroid + maxdps * principal;
+    cluster[2] = (2 * cluster[0] + cluster[1]) / 3;
+    cluster[3] = (2 * cluster[1] + cluster[0]) / 3;
+
+    // Now we have to iteratively refine the clusters.
+    while (true)
+    {
+        Vector3 newCluster[4] = { Vector3(zero), Vector3(zero), Vector3(zero), Vector3(zero) };
+        float total[4] = {0, 0, 0, 0};
+
+        for (int i = 0; i < n; ++i)
+        {
+            // Find nearest cluster.
+            int nearest = 0;
+            float mindist = FLT_MAX;
+            for (int j = 0; j < 4; j++)
+            {
+                float dist = lengthSquared((cluster[j] - points[i]) * metric);
+                if (dist < mindist)
+                {
+                    mindist = dist;
+                    nearest = j;
+                }
+            }
+
+            newCluster[nearest] += weights[i] * points[i];
+            total[nearest] += weights[i];
+        }
+
+        for (int j = 0; j < 4; j++)
+        {
            if (total[j] != 0)
-			    newCluster[j] /= total[j];
-		}
+                newCluster[j] /= total[j];
+        }

-		if (equal(cluster[0], newCluster[0]) && equal(cluster[1], newCluster[1]) && 
-			equal(cluster[2], newCluster[2]) && equal(cluster[3], newCluster[3]))
-		{
-			return (total[0] != 0) + (total[1] != 0) + (total[2] != 0) + (total[3] != 0);
-		}
+        if (equal(cluster[0], newCluster[0]) && equal(cluster[1], newCluster[1]) && 
+            equal(cluster[2], newCluster[2]) && equal(cluster[3], newCluster[3]))
+        {
+            return (total[0] != 0) + (total[1] != 0) + (total[2] != 0) + (total[3] != 0);
+        }

-		cluster[0] = newCluster[0];
-		cluster[1] = newCluster[1];
-		cluster[2] = newCluster[2];
-		cluster[3] = newCluster[3];
+        cluster[0] = newCluster[0];
+        cluster[1] = newCluster[1];
+        cluster[2] = newCluster[2];
+        cluster[3] = newCluster[3];

-		// Sort clusters by weight.
-		for (int i = 0; i < 4; i++)
-		{
-			for (int j = i; j > 0 && total[j] > total[j - 1]; j--)
-			{
-				swap( total[j], total[j - 1] );
-				swap( cluster[j], cluster[j - 1] );
-			}
-		}
-	}
+        // Sort clusters by weight.
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = i; j > 0 && total[j] > total[j - 1]; j--)
+            {
+                swap( total[j], total[j - 1] );
+                swap( cluster[j], cluster[j - 1] );
+            }
+        }
+    }
 }

--- a/src/nvmath/Fitting.h
+++ b/src/nvmath/Fitting.h
@ -1,5 +1,6 @@
 // This code is in the public domain -- icastano@gmail.com

+#pragma once
 #ifndef NV_MATH_FITTING_H
 #define NV_MATH_FITTING_H

@ -9,22 +10,22 @@

 namespace nv
 {
-	namespace Fit
-	{
-		Vector3 computeCentroid(int n, const Vector3 * points);
-		Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, Vector3::Arg metric);
+    namespace Fit
+    {
+        Vector3 computeCentroid(int n, const Vector3 * points);
+        Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, Vector3::Arg metric);

-		Vector3 computeCovariance(int n, const Vector3 * points, float * covariance);
-		Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, Vector3::Arg metric, float * covariance);
+        Vector3 computeCovariance(int n, const Vector3 * points, float * covariance);
+        Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, Vector3::Arg metric, float * covariance);

-		Vector3 computePrincipalComponent(int n, const Vector3 * points);
-		Vector3 computePrincipalComponent(int n, const Vector3 * points, const float * weights, Vector3::Arg metric);
+        Vector3 computePrincipalComponent(int n, const Vector3 * points);
+        Vector3 computePrincipalComponent(int n, const Vector3 * points, const float * weights, Vector3::Arg metric);

-		Plane bestPlane(int n, const Vector3 * points);
+        Plane bestPlane(int n, const Vector3 * points);

-		// Returns number of clusters [1-4].
-		int compute4Means(int n, const Vector3 * points, const float * weights, Vector3::Arg metric, Vector3 * cluster);
-	}
+        // Returns number of clusters [1-4].
+        int compute4Means(int n, const Vector3 * points, const float * weights, Vector3::Arg metric, Vector3 * cluster);
+    }

 } // nv namespace

--- a/src/nvmath/Half.cpp
+++ b/src/nvmath/Half.cpp
@ -79,235 +79,257 @@
 // Load immediate
 static inline uint32 _uint32_li( uint32 a )
 {
-  return (a);
+    return (a); // identity: the argument is handed back unchanged
 }

 // Decrement
 static inline uint32 _uint32_dec( uint32 a )
 {
-  return (a - 1);
+    return (a - 1); // wraps around: 0 becomes all ones, so for a <= 0x80000000 the MSB of the result is set only when a == 0
+}
+
+// Increment
+static inline uint32 _uint32_inc( uint32 a )
+{
+    return (a + 1); // wraps around: all ones becomes 0
 }

 // Complement
 static inline uint32 _uint32_not( uint32 a )
 {
-  return (~a);
+    return (~a); // flips every bit of a
 }

 // Negate
 static inline uint32 _uint32_neg( uint32 a )
 {
-#if NV_CC_MSVC
-  // prevent msvc warning.
-  return ~a + 1;
-#else
-  return (-a);
-#endif
+    // Two's-complement negation spelled as (~a + 1). For an unsigned operand
+    // this yields exactly the same value as -a, but it does not apply unary
+    // minus to an unsigned type, so MSVC warning C4146 is never raised and no
+    // compiler-specific "#pragma warning" is required. (An unguarded
+    // "#pragma warning" is an unknown pragma for GCC, and resetting the
+    // warning to "default" would override whatever state the including
+    // translation unit had selected.)
+    return (~a + 1);
 }

 // Extend sign
 static inline uint32 _uint32_ext( uint32 a )
 {
-  return (((int32)a)>>31);
+    return (((int32)a)>>31); // shifts the signed reinterpretation: all ones when bit 31 of a is set, otherwise 0 (right-shifting a negative value is implementation-defined before C++20)
 }

 // And
 static inline uint32 _uint32_and( uint32 a, uint32 b )
 {
-  return (a & b);
+    return (a & b); // keeps the bits that are set in both operands
 }

 // And with Complement
 static inline uint32 _uint32_andc( uint32 a, uint32 b )
 {
-  return (a & ~b);
+    return (a & ~b); // keeps the bits of a that are NOT set in b (b is the operand that gets complemented)
 }

 // Or
 static inline uint32 _uint32_or( uint32 a, uint32 b )
 {
-  return (a | b);
+    return (a | b); // keeps the bits that are set in either operand
 }

 // Shift Right Logical
 static inline uint32 _uint32_srl( uint32 a, int sa )
 {
-  return (a >> sa);
+    return (a >> sa); // zeros shift in because a is unsigned; a negative sa or one >= the operand width is undefined behaviour
 }

 // Shift Left Logical
 static inline uint32 _uint32_sll( uint32 a, int sa )
 {
-  return (a << sa);
+    return (a << sa); // bits shifted past the top are dropped; a negative sa or one >= the operand width is undefined behaviour
 }

 // Add
 static inline uint32 _uint32_add( uint32 a, uint32 b )
 {
-  return (a + b);
+    return (a + b); // unsigned addition: a carry out of the top bit is discarded
 }

 // Subtract
 static inline uint32 _uint32_sub( uint32 a, uint32 b )
 {
-  return (a - b);
+    return (a - b); // unsigned subtraction: wraps when a < b, which leaves the MSB set while the true difference is a small negative number
 }

 // Select on Sign bit
 static inline uint32 _uint32_sels( uint32 test, uint32 a, uint32 b )
 {
-  const uint32 mask   = _uint32_ext( test );
-  const uint32 sel_a  = _uint32_and(  a,     mask  );
-  const uint32 sel_b  = _uint32_andc( b,     mask  );
-  const uint32 result = _uint32_or(   sel_a, sel_b );
+    const uint32 mask   = _uint32_ext( test ); // all ones when the top bit of test is set, otherwise 0
+    const uint32 sel_a  = _uint32_and(  a,     mask  ); // a when the mask is all ones, else 0
+    const uint32 sel_b  = _uint32_andc( b,     mask  ); // b when the mask is 0, else 0
+    const uint32 result = _uint32_or(   sel_a, sel_b ); // exactly one of the two terms is non-masked

-  return (result);
+    return (result); // branch-free equivalent of: (top bit of test set) ? a : b
 }

 // Load Immediate
 static inline uint16 _uint16_li( uint16 a )
 {
-  return (a);
+    return (a); // identity: the argument is handed back unchanged
 }

 // Extend sign
 static inline uint16 _uint16_ext( uint16 a )
 {
-  return (((int16)a)>>15);
+    return (((int16)a)>>15); // shifts the signed reinterpretation: every result bit is set when bit 15 of a is set, otherwise 0 (right-shifting a negative value is implementation-defined before C++20)
 }

 // Negate
 static inline uint16 _uint16_neg( uint16 a )
 {
-  return (-a);
+    return (-a); // assuming uint16 is narrower than int, a is promoted before negation and the return conversion keeps the low bits (negation modulo the uint16 range)
 }

 // Complement
 static inline uint16 _uint16_not( uint16 a )
 {
-  return (~a);
+    return (~a); // the complement is taken on the promoted value; the return conversion discards the extra high bits
 }

 // Decrement
 static inline uint16 _uint16_dec( uint16 a )
 {
-  return (a - 1);
+    return (a - 1); // 0 becomes all ones after the conversion back to uint16
 }

 // Shift Left Logical
 static inline uint16 _uint16_sll( uint16 a, int sa )
 {
-  return (a << sa);
+    return (a << sa); // bits moved above the uint16 range are dropped by the return conversion; sa must be non-negative and smaller than the promoted operand width
 }

 // Shift Right Logical
 static inline uint16 _uint16_srl( uint16 a, int sa )
 {
-  return (a >> sa);
+    return (a >> sa); // zeros shift in because the promoted value of an unsigned 16-bit operand is never negative
 }

 // Add
 static inline uint16 _uint16_add( uint16 a, uint16 b )
 {
-  return (a + b);
+    return (a + b); // any carry beyond the uint16 range is discarded by the return conversion
 }

 // Subtract
 static inline uint16 _uint16_sub( uint16 a, uint16 b )
 {
-  return (a - b);
+    return (a - b); // wraps within the uint16 range when a < b, leaving the top bit set for small negative differences
 }

 // And
 static inline uint16 _uint16_and( uint16 a, uint16 b )
 {
-  return (a & b);
+    return (a & b); // keeps the bits that are set in both operands
 }

 // Or
 static inline uint16 _uint16_or( uint16 a, uint16 b )
 {
-  return (a | b);
+    return (a | b); // keeps the bits that are set in either operand
 }

 // Exclusive Or
 static inline uint16 _uint16_xor( uint16 a, uint16 b )
 {
-  return (a ^ b);
+    return (a ^ b); // keeps the bits in which the two operands differ
 }

 // And with Complement
 static inline uint16 _uint16_andc( uint16 a, uint16 b )
 {
-  return (a & ~b);
+    return (a & ~b); // keeps the bits of a that are NOT set in b (b is the operand that gets complemented)
 }

 // And then Shift Right Logical
 static inline uint16 _uint16_andsrl( uint16 a, uint16 b, int sa )
 {
-  return ((a & b) >> sa);
+    return ((a & b) >> sa); // the mask b is applied first, then the masked value moves right by sa
 }

 // Shift Right Logical then Mask
 static inline uint16 _uint16_srlm( uint16 a, int sa, uint16 mask )
 {
-  return ((a >> sa) & mask);
+    return ((a >> sa) & mask); // the shift happens first, then mask is applied to the shifted value
 }

 // Add then Mask
 static inline uint16 _uint16_addm( uint16 a, uint16 b, uint16 mask )
 {
-  return ((a + b) & mask);
+    return ((a + b) & mask); // the sum is formed first, then only the bits selected by mask are kept
 }


 // Select on Sign bit
 static inline uint16 _uint16_sels( uint16 test, uint16 a, uint16 b )
 {
-  const uint16 mask   = _uint16_ext( test );
-  const uint16 sel_a  = _uint16_and(  a,     mask  );
-  const uint16 sel_b  = _uint16_andc( b,     mask  );
-  const uint16 result = _uint16_or(   sel_a, sel_b );
+    const uint16 mask   = _uint16_ext( test ); // all ones when bit 15 of test is set, otherwise 0
+    const uint16 sel_a  = _uint16_and(  a,     mask  ); // a when the mask is all ones, else 0
+    const uint16 sel_b  = _uint16_andc( b,     mask  ); // b when the mask is 0, else 0
+    const uint16 result = _uint16_or(   sel_a, sel_b ); // exactly one of the two terms is non-masked

-  return (result);
+    return (result); // branch-free equivalent of: (bit 15 of test set) ? a : b
 }

+#if NV_CC_MSVC
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+
+// Number of leading zero bits of x, computed with the MSVC bit-scan intrinsic.
+// _BitScanReverse reports the position of the highest set bit; when x has no
+// set bit it returns 0 and leaves 'index' undefined, so that case is answered
+// explicitly with 32 instead of reading 'index'.
+// Declared static inline like the other helpers in this file so the symbol
+// keeps internal linkage.
+static inline uint32 _uint32_nlz( uint32 x ) {
+    unsigned long index;
+    if (!_BitScanReverse(&index, x)) {
+        return 32;
+    }
+    return (uint32)(31 - index);
+}
+#endif
+
+
 // Count Leading Zeros
 static inline uint32 _uint32_cntlz( uint32 x )
 {
-#ifdef __GNUC__
-  /* On PowerPC, this will map to insn: cntlzw */
-  /* On Pentium, this will map to insn: clz    */
-  uint32 nlz = __builtin_clz( x );
-  return (nlz);
+    // Returns the number of zero bits above the highest set bit of x, and 32
+    // when x == 0. All three implementations below produce the same value for
+    // every input.
+#if NV_CC_GCC
+    /* On PowerPC, this will map to insn: cntlzw */
+    /* On Pentium, this will map to insn: clz    */
+    // (x | -x) has its MSB set for every non-zero x. Testing -x alone misses
+    // inputs above 0x80000000 (e.g. -0xffffffff == 1), which would wrongly be
+    // reported as having 32 leading zeros.
+    uint32 is_x_nez_msb = _uint32_or( x, _uint32_neg( x ) );
+    // The builtin's result is unspecified for x == 0; the select below
+    // replaces it with 32 in that case.
+    uint32 nlz          = __builtin_clz( x );
+    uint32 result       = _uint32_sels( is_x_nez_msb, nlz, 0x00000020 );
+    return (result);
+#elif NV_CC_MSVC
+    // Same non-zero test as in the GCC branch: MSB set for any x != 0.
+    uint32 is_x_nez_msb = _uint32_or( x, _uint32_neg( x ) );
+    uint32 nlz          = _uint32_nlz( x );
+    uint32 result       = _uint32_sels( is_x_nez_msb, nlz, 0x00000020 );
+    return (result);
 #else
-  const uint32 x0  = _uint32_srl(  x,  1 );
-  const uint32 x1  = _uint32_or(   x,  x0 );
-  const uint32 x2  = _uint32_srl(  x1, 2 );
-  const uint32 x3  = _uint32_or(   x1, x2 );
-  const uint32 x4  = _uint32_srl(  x3, 4 );
-  const uint32 x5  = _uint32_or(   x3, x4 );
-  const uint32 x6  = _uint32_srl(  x5, 8 );
-  const uint32 x7  = _uint32_or(   x5, x6 );
-  const uint32 x8  = _uint32_srl(  x7, 16 );
-  const uint32 x9  = _uint32_or(   x7, x8 );
-  const uint32 xA  = _uint32_not(  x9 );
-  const uint32 xB  = _uint32_srl(  xA, 1 );
-  const uint32 xC  = _uint32_and(  xB, 0x55555555 );
-  const uint32 xD  = _uint32_sub(  xA, xC );
-  const uint32 xE  = _uint32_and(  xD, 0x33333333 );
-  const uint32 xF  = _uint32_srl(  xD, 2 );
-  const uint32 x10 = _uint32_and(  xF, 0x33333333 );
-  const uint32 x11 = _uint32_add(  xE, x10 );
-  const uint32 x12 = _uint32_srl(  x11, 4 );
-  const uint32 x13 = _uint32_add(  x11, x12 );
-  const uint32 x14 = _uint32_and(  x13, 0x0f0f0f0f );
-  const uint32 x15 = _uint32_srl(  x14, 8 );
-  const uint32 x16 = _uint32_add(  x14, x15 );
-  const uint32 x17 = _uint32_srl(  x16, 16 );
-  const uint32 x18 = _uint32_add(  x16, x17 );
-  const uint32 x19 = _uint32_and(  x18, 0x0000003f );
-  return ( x19 );
+    // Portable fallback. x0..x9 copy the highest set bit into every lower
+    // position, so x9 has ones from that bit downwards.
+    const uint32 x0  = _uint32_srl(  x,  1 );
+    const uint32 x1  = _uint32_or(   x,  x0 );
+    const uint32 x2  = _uint32_srl(  x1, 2 );
+    const uint32 x3  = _uint32_or(   x1, x2 );
+    const uint32 x4  = _uint32_srl(  x3, 4 );
+    const uint32 x5  = _uint32_or(   x3, x4 );
+    const uint32 x6  = _uint32_srl(  x5, 8 );
+    const uint32 x7  = _uint32_or(   x5, x6 );
+    const uint32 x8  = _uint32_srl(  x7, 16 );
+    const uint32 x9  = _uint32_or(   x7, x8 );
+    // After complementing, the set bits are exactly the leading zeros of x.
+    const uint32 xA  = _uint32_not(  x9 );
+    // xB..x19 count the set bits of xA by summing 2-, 4-, 8-, 16- and 32-bit
+    // groups in parallel.
+    const uint32 xB  = _uint32_srl(  xA, 1 );
+    const uint32 xC  = _uint32_and(  xB, 0x55555555 );
+    const uint32 xD  = _uint32_sub(  xA, xC );
+    const uint32 xE  = _uint32_and(  xD, 0x33333333 );
+    const uint32 xF  = _uint32_srl(  xD, 2 );
+    const uint32 x10 = _uint32_and(  xF, 0x33333333 );
+    const uint32 x11 = _uint32_add(  xE, x10 );
+    const uint32 x12 = _uint32_srl(  x11, 4 );
+    const uint32 x13 = _uint32_add(  x11, x12 );
+    const uint32 x14 = _uint32_and(  x13, 0x0f0f0f0f );
+    const uint32 x15 = _uint32_srl(  x14, 8 );
+    const uint32 x16 = _uint32_add(  x14, x15 );
+    const uint32 x17 = _uint32_srl(  x16, 16 );
+    const uint32 x18 = _uint32_add(  x16, x17 );
+    const uint32 x19 = _uint32_and(  x18, 0x0000003f );
+    return ( x19 );
 #endif
 }

@ -315,249 +337,187 @@ static inline uint32 _uint32_cntlz( uint32 x )
 static inline uint16 _uint16_cntlz( uint16 x )
 {
 #ifdef __GNUC__ // NOTE(review): _uint32_cntlz is guarded by NV_CC_GCC instead -- confirm the two guards are meant to differ
-  /* On PowerPC, this will map to insn: cntlzw */
-  /* On Pentium, this will map to insn: clz    */
-  uint32 x32   = _uint32_sll( x, 16 );
-  uint16 nlz   = (uint16)__builtin_clz( x32 );
-  return (nlz);
+    /* On PowerPC, this will map to insn: cntlzw */
+    /* On Pentium, this will map to insn: clz    */
+    uint16 nlz32 = (uint16)_uint32_cntlz( (uint32)x ); // counted over 32 bits; the widened value always has 16 extra zero bits on top
+    uint32 nlz   = _uint32_sub( nlz32, 16 ); // remove those 16 extra zeros to get the 16-bit count
+    return (nlz); // implicitly narrowed from uint32 back to uint16
 #else
-  const uint16 x0  = _uint16_srl(  x,  1 );
-  const uint16 x1  = _uint16_or(   x,  x0 );
-  const uint16 x2  = _uint16_srl(  x1, 2 );
-  const uint16 x3  = _uint16_or(   x1, x2 );
-  const uint16 x4  = _uint16_srl(  x3, 4 );
-  const uint16 x5  = _uint16_or(   x3, x4 );
-  const uint16 x6  = _uint16_srl(  x5, 8 );
-  const uint16 x7  = _uint16_or(   x5, x6 );
-  const uint16 x8  = _uint16_not(  x7 );
-  const uint16 x9  = _uint16_srlm( x8, 1, 0x5555 );
-  const uint16 xA  = _uint16_sub(  x8, x9 );
-  const uint16 xB  = _uint16_and(  xA, 0x3333 );
-  const uint16 xC  = _uint16_srlm( xA, 2, 0x3333 );
-  const uint16 xD  = _uint16_add(  xB, xC );
-  const uint16 xE  = _uint16_srl(  xD, 4 );
-  const uint16 xF  = _uint16_addm( xD, xE, 0x0f0f );
-  const uint16 x10 = _uint16_srl(  xF, 8 );
-  const uint16 x11 = _uint16_addm( xF, x10, 0x001f );
-  return ( x11 );
+    const uint16 x0  = _uint16_srl(  x,  1 ); // x0..x7: copy the highest set bit into every lower position
+    const uint16 x1  = _uint16_or(   x,  x0 );
+    const uint16 x2  = _uint16_srl(  x1, 2 );
+    const uint16 x3  = _uint16_or(   x1, x2 );
+    const uint16 x4  = _uint16_srl(  x3, 4 );
+    const uint16 x5  = _uint16_or(   x3, x4 );
+    const uint16 x6  = _uint16_srl(  x5, 8 );
+    const uint16 x7  = _uint16_or(   x5, x6 );
+    const uint16 x8  = _uint16_not(  x7 ); // the set bits of x8 are exactly the leading zeros of x
+    const uint16 x9  = _uint16_srlm( x8, 1, 0x5555 ); // x9..x11: count the set bits of x8 by summing 2-, 4-, 8- and 16-bit groups
+    const uint16 xA  = _uint16_sub(  x8, x9 );
+    const uint16 xB  = _uint16_and(  xA, 0x3333 );
+    const uint16 xC  = _uint16_srlm( xA, 2, 0x3333 );
+    const uint16 xD  = _uint16_add(  xB, xC );
+    const uint16 xE  = _uint16_srl(  xD, 4 );
+    const uint16 xF  = _uint16_addm( xD, xE, 0x0f0f );
+    const uint16 x10 = _uint16_srl(  xF, 8 );
+    const uint16 x11 = _uint16_addm( xF, x10, 0x001f ); // final count, 0..16
+    return ( x11 );
 #endif
 }

 uint16
-half_from_float( uint32 f )
+nv::half_from_float( uint32 f ) // f carries the bit pattern of a 32-bit float; the result is the bit pattern of the 16-bit half
 {
-  const uint32 one                        = _uint32_li( 0x00000001 );
-  const uint32 f_e_mask                   = _uint32_li( 0x7f800000 );
-  const uint32 f_m_mask                   = _uint32_li( 0x007fffff );
-  const uint32 f_s_mask                   = _uint32_li( 0x80000000 );
-  const uint32 h_e_mask                   = _uint32_li( 0x00007c00 );
-  const uint32 f_e_pos                    = _uint32_li( 0x00000017 );
-  const uint32 f_m_round_bit              = _uint32_li( 0x00001000 );
-  const uint32 h_nan_em_min               = _uint32_li( 0x00007c01 );
-  const uint32 f_h_s_pos_offset           = _uint32_li( 0x00000010 );
-  const uint32 f_m_hidden_bit             = _uint32_li( 0x00800000 );
-  const uint32 f_h_m_pos_offset           = _uint32_li( 0x0000000d );
-  const uint32 f_h_bias_offset            = _uint32_li( 0x38000000 );
-  const uint32 f_m_snan_mask              = _uint32_li( 0x003fffff );
-  const uint16 h_snan_mask                = _uint32_li( 0x00007e00 );
-  const uint32 f_e                        = _uint32_and( f, f_e_mask  );
-  const uint32 f_m                        = _uint32_and( f, f_m_mask  );
-  const uint32 f_s                        = _uint32_and( f, f_s_mask  );
-  const uint32 f_e_h_bias                 = _uint32_sub( f_e,               f_h_bias_offset );
-  const uint32 f_e_h_bias_amount          = _uint32_srl( f_e_h_bias,        f_e_pos         );
-  const uint32 f_m_round_mask             = _uint32_and( f_m,               f_m_round_bit     );
-  const uint32 f_m_round_offset           = _uint32_sll( f_m_round_mask,    one               );
-  const uint32 f_m_rounded                = _uint32_add( f_m,               f_m_round_offset  );
-  const uint32 f_m_rounded_overflow       = _uint32_and( f_m_rounded,       f_m_hidden_bit    );
-  const uint32 f_m_denorm_sa              = _uint32_sub( one,               f_e_h_bias_amount );
-  const uint32 f_m_with_hidden            = _uint32_or(  f_m_rounded,       f_m_hidden_bit    );
-  const uint32 f_m_denorm                 = _uint32_srl( f_m_with_hidden,   f_m_denorm_sa     );
-  const uint32 f_em_norm_packed           = _uint32_or(  f_e_h_bias,        f_m_rounded       );
-  const uint32 f_e_overflow               = _uint32_add( f_e_h_bias,        f_m_hidden_bit    );
-  const uint32 h_s                        = _uint32_srl( f_s,               f_h_s_pos_offset );
-  const uint32 h_m_nan                    = _uint32_srl( f_m,               f_h_m_pos_offset );
-  const uint32 h_m_denorm                 = _uint32_srl( f_m_denorm,        f_h_m_pos_offset );
-  const uint32 h_em_norm                  = _uint32_srl( f_em_norm_packed,  f_h_m_pos_offset );
-  const uint32 h_em_overflow              = _uint32_srl( f_e_overflow,      f_h_m_pos_offset );
-  const uint32 is_e_eqz_msb               = _uint32_dec(  f_e     );
-  const uint32 is_m_nez_msb               = _uint32_neg(  f_m     );
-  const uint32 is_h_m_nan_nez_msb         = _uint32_neg(  h_m_nan );
-  const uint32 is_e_nflagged_msb          = _uint32_sub(  f_e,                 f_e_mask          );
-  const uint32 is_ninf_msb                = _uint32_or(   is_e_nflagged_msb,   is_m_nez_msb      );
-  const uint32 is_underflow_msb           = _uint32_sub(  is_e_eqz_msb,        f_h_bias_offset   );
-  const uint32 is_nan_nunderflow_msb      = _uint32_or(   is_h_m_nan_nez_msb,  is_e_nflagged_msb );
-  const uint32 is_m_snan_msb              = _uint32_sub(  f_m_snan_mask,       f_m               );
-  const uint32 is_snan_msb                = _uint32_andc( is_m_snan_msb,       is_e_nflagged_msb );
-  const uint32 is_overflow_msb            = _uint32_neg(  f_m_rounded_overflow );
-  const uint32 h_nan_underflow_result     = _uint32_sels( is_nan_nunderflow_msb, h_em_norm,                h_nan_em_min       );
-  const uint32 h_inf_result               = _uint32_sels( is_ninf_msb,           h_nan_underflow_result,   h_e_mask           );
-  const uint32 h_underflow_result         = _uint32_sels( is_underflow_msb,      h_m_denorm,               h_inf_result       );
-  const uint32 h_overflow_result          = _uint32_sels( is_overflow_msb,       h_em_overflow,            h_underflow_result );
-  const uint32 h_em_result                = _uint32_sels( is_snan_msb,           h_snan_mask,              h_overflow_result  );
-  const uint32 h_result                   = _uint32_or( h_em_result, h_s );
+    const uint32 one                        = _uint32_li( 0x00000001 );
+    const uint32 f_s_mask                   = _uint32_li( 0x80000000 ); // float sign: bit 31
+    const uint32 f_e_mask                   = _uint32_li( 0x7f800000 ); // float exponent field: bits 30..23
+    const uint32 f_m_mask                   = _uint32_li( 0x007fffff ); // float mantissa field: bits 22..0
+    const uint32 f_m_hidden_bit             = _uint32_li( 0x00800000 ); // bit 23, directly above the mantissa field
+    const uint32 f_m_round_bit              = _uint32_li( 0x00001000 ); // bit 12: highest mantissa bit lost by the 13-bit right shift
+    const uint32 f_snan_mask                = _uint32_li( 0x7fc00000 ); // exponent field plus the top mantissa bit
+    const uint32 f_e_pos                    = _uint32_li( 0x00000017 ); // 23: bit position of the float exponent field
+    const uint32 h_e_pos                    = _uint32_li( 0x0000000a ); // 10: bit position of the half exponent field
+    const uint32 h_e_mask                   = _uint32_li( 0x00007c00 ); // half exponent field: bits 14..10
+    const uint32 h_snan_mask                = _uint32_li( 0x00007e00 ); // half exponent field plus the top mantissa bit
+    const uint32 h_e_mask_value             = _uint32_li( 0x0000001f ); // 31: h_e_mask shifted down to bit 0
+    const uint32 f_h_s_pos_offset           = _uint32_li( 0x00000010 ); // 16: distance from bit 31 to bit 15
+    const uint32 f_h_bias_offset            = _uint32_li( 0x00000070 ); // 112, subtracted from the float exponent below
+    const uint32 f_h_m_pos_offset           = _uint32_li( 0x0000000d ); // 13: 23 float mantissa bits minus 10 half mantissa bits
+    const uint32 h_nan_min                  = _uint32_li( 0x00007c01 ); // half exponent field all ones with the lowest mantissa bit set
+    const uint32 f_h_e_biased_flag          = _uint32_li( 0x0000008f ); // 143 = 0xff - 0x70
+    const uint32 f_s                        = _uint32_and( f,               f_s_mask         );
+    const uint32 f_e                        = _uint32_and( f,               f_e_mask         );
+    const uint16 h_s                        = _uint32_srl( f_s,             f_h_s_pos_offset ); // sign moved to bit 15
+    const uint32 f_m                        = _uint32_and( f,               f_m_mask         );
+    const uint16 f_e_amount                 = _uint32_srl( f_e,             f_e_pos          ); // exponent field as a number 0..255
+    const uint32 f_e_half_bias              = _uint32_sub( f_e_amount,      f_h_bias_offset  ); // wraps to a huge value when the exponent is below 0x70
+    const uint32 f_snan                     = _uint32_and( f,               f_snan_mask      );
+    const uint32 f_m_round_mask             = _uint32_and( f_m,             f_m_round_bit    );
+    const uint32 f_m_round_offset           = _uint32_sll( f_m_round_mask,  one              ); // bit 12 moved up to bit 13, the lowest bit that is kept
+    const uint32 f_m_rounded                = _uint32_add( f_m,             f_m_round_offset ); // adds one unit at bit 13 when bit 12 was set
+    const uint32 f_m_denorm_sa              = _uint32_sub( one,             f_e_half_bias    );
+    const uint32 f_m_with_hidden            = _uint32_or(  f_m_rounded,     f_m_hidden_bit   );
+    const uint32 f_m_denorm                 = _uint32_srl( f_m_with_hidden, f_m_denorm_sa    ); // NOTE(review): the shift amount is computed for every input, including ones where this value is later discarded -- confirm it cannot reach 32 or more on a path that matters
+    const uint32 h_m_denorm                 = _uint32_srl( f_m_denorm,      f_h_m_pos_offset );
+    const uint32 f_m_rounded_overflow       = _uint32_and( f_m_rounded,     f_m_hidden_bit   ); // non-zero when rounding carried out of the mantissa field
+    const uint32 m_nan                      = _uint32_srl( f_m,             f_h_m_pos_offset ); // top 10 mantissa bits, unrounded
+    const uint32 h_em_nan                   = _uint32_or(  h_e_mask,        m_nan            );
+    const uint32 h_e_norm_overflow_offset   = _uint32_inc( f_e_half_bias );
+    const uint32 h_e_norm_overflow          = _uint32_sll( h_e_norm_overflow_offset, h_e_pos          );
+    const uint32 h_e_norm                   = _uint32_sll( f_e_half_bias,            h_e_pos          );
+    const uint32 h_m_norm                   = _uint32_srl( f_m_rounded,              f_h_m_pos_offset );
+    const uint32 h_em_norm                  = _uint32_or(  h_e_norm,                 h_m_norm         );
+    const uint32 is_h_ndenorm_msb           = _uint32_sub( f_h_bias_offset,   f_e_amount    ); // MSB set when the float exponent is above 0x70
+    const uint32 is_f_e_flagged_msb         = _uint32_sub( f_h_e_biased_flag, f_e_half_bias ); // NOTE(review): for an 8-bit exponent f_e_half_bias is at most 0x8f or a wrapped value near 2^32, so this difference appears never to have its MSB set -- verify how NaN inputs with a clear top mantissa bit are handled
+    const uint32 is_h_denorm_msb            = _uint32_not( is_h_ndenorm_msb ); // MSB set when the float exponent is 0x70 or less
+    const uint32 is_f_m_eqz_msb             = _uint32_dec( f_m   ); // MSB set only when f_m == 0
+    const uint32 is_h_nan_eqz_msb           = _uint32_dec( m_nan ); // MSB set only when m_nan == 0
+    const uint32 is_f_inf_msb               = _uint32_and( is_f_e_flagged_msb, is_f_m_eqz_msb   );
+    const uint32 is_f_nan_underflow_msb     = _uint32_and( is_f_e_flagged_msb, is_h_nan_eqz_msb );
+    const uint32 is_e_overflow_msb          = _uint32_sub( h_e_mask_value,     f_e_half_bias    ); // MSB set when f_e_half_bias is 0x20 or more without having wrapped
+    const uint32 is_h_inf_msb               = _uint32_or(  is_e_overflow_msb,  is_f_inf_msb     );
+    const uint32 is_f_nsnan_msb             = _uint32_sub( f_snan,             f_snan_mask      ); // MSB set when f_snan is below f_snan_mask
+    const uint32 is_m_norm_overflow_msb     = _uint32_neg( f_m_rounded_overflow ); // MSB set when f_m_rounded_overflow is non-zero
+    const uint32 is_f_snan_msb              = _uint32_not( is_f_nsnan_msb ); // MSB set when all bits of f_snan_mask are set in f
+    const uint32 h_em_overflow_result       = _uint32_sels( is_m_norm_overflow_msb, h_e_norm_overflow, h_em_norm                 ); // each select below falls back to the previous result, so later conditions take priority over earlier ones
+    const uint32 h_em_nan_result            = _uint32_sels( is_f_e_flagged_msb,     h_em_nan,          h_em_overflow_result      );
+    const uint32 h_em_nan_underflow_result  = _uint32_sels( is_f_nan_underflow_msb, h_nan_min,         h_em_nan_result           );
+    const uint32 h_em_inf_result            = _uint32_sels( is_h_inf_msb,           h_e_mask,          h_em_nan_underflow_result );
+    const uint32 h_em_denorm_result         = _uint32_sels( is_h_denorm_msb,        h_m_denorm,        h_em_inf_result           );
+    const uint32 h_em_snan_result           = _uint32_sels( is_f_snan_msb,          h_snan_mask,       h_em_denorm_result        );
+    const uint32 h_result                   = _uint32_or( h_s, h_em_snan_result ); // sign bit combined with the selected exponent/mantissa bits

-  return (h_result);
+    return (uint16)(h_result);
 }

 uint32 
-half_to_float( uint16 h )
+nv::half_to_float( uint16 h ) // h carries the bit pattern of a 16-bit half; the result is the bit pattern of the 32-bit float
 {
-  const uint32 h_e_mask              = _uint32_li( 0x00007c00 );
-  const uint32 h_m_mask              = _uint32_li( 0x000003ff );
-  const uint32 h_s_mask              = _uint32_li( 0x00008000 );
-  const uint32 h_f_s_pos_offset      = _uint32_li( 0x00000010 );
-  const uint32 h_f_e_pos_offset      = _uint32_li( 0x0000000d );
-  const uint32 h_f_bias_offset       = _uint32_li( 0x0001c000 );
-  const uint32 f_e_mask              = _uint32_li( 0x7f800000 );
-  const uint32 f_m_mask              = _uint32_li( 0x007fffff );
-  const uint32 h_f_e_denorm_bias     = _uint32_li( 0x0000007e );
-  const uint32 h_f_m_denorm_sa_bias  = _uint32_li( 0x00000008 );
-  const uint32 f_e_pos               = _uint32_li( 0x00000017 );
-  const uint32 h_e_mask_minus_one    = _uint32_li( 0x00007bff );
-  const uint32 h_e                   = _uint32_and( h, h_e_mask );
-  const uint32 h_m                   = _uint32_and( h, h_m_mask );
-  const uint32 h_s                   = _uint32_and( h, h_s_mask );
-  const uint32 h_e_f_bias            = _uint32_add( h_e, h_f_bias_offset );
-  const uint32 h_m_nlz               = _uint32_cntlz( h_m );
-  const uint32 f_s                   = _uint32_sll( h_s,        h_f_s_pos_offset );
-  const uint32 f_e                   = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
-  const uint32 f_m                   = _uint32_sll( h_m,        h_f_e_pos_offset );
-  const uint32 f_em                  = _uint32_or(  f_e,        f_m              );
-  const uint32 h_f_m_sa              = _uint32_sub( h_m_nlz,             h_f_m_denorm_sa_bias );
-  const uint32 f_e_denorm_unpacked   = _uint32_sub( h_f_e_denorm_bias,   h_f_m_sa             );
-  const uint32 h_f_m                 = _uint32_sll( h_m,                 h_f_m_sa             );
-  const uint32 f_m_denorm            = _uint32_and( h_f_m,               f_m_mask             );
-  const uint32 f_e_denorm            = _uint32_sll( f_e_denorm_unpacked, f_e_pos              );
-  const uint32 f_em_denorm           = _uint32_or(  f_e_denorm,          f_m_denorm           );
-  const uint32 f_em_nan              = _uint32_or(  f_e_mask,            f_m                  );
-  const uint32 is_e_eqz_msb          = _uint32_dec(  h_e );
-  const uint32 is_m_nez_msb          = _uint32_neg(  h_m );
-  const uint32 is_e_flagged_msb      = _uint32_sub(  h_e_mask_minus_one, h_e );
-  const uint32 is_zero_msb           = _uint32_andc( is_e_eqz_msb,       is_m_nez_msb );
-  const uint32 is_inf_msb            = _uint32_andc( is_e_flagged_msb,   is_m_nez_msb );
-  const uint32 is_denorm_msb         = _uint32_and(  is_m_nez_msb,       is_e_eqz_msb );
-  const uint32 is_nan_msb            = _uint32_and(  is_e_flagged_msb,   is_m_nez_msb ); 
-  const uint32 is_zero               = _uint32_ext(  is_zero_msb );
-  const uint32 f_zero_result         = _uint32_andc( f_em, is_zero );
-  const uint32 f_denorm_result       = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result );
-  const uint32 f_inf_result          = _uint32_sels( is_inf_msb,    f_e_mask,    f_denorm_result );
-  const uint32 f_nan_result          = _uint32_sels( is_nan_msb,    f_em_nan,    f_inf_result    );
-  const uint32 f_result              = _uint32_or( f_s, f_nan_result );
- 
-  return (f_result);
+    const uint32 h_e_mask              = _uint32_li( 0x00007c00 ); // half exponent field: bits 14..10
+    const uint32 h_m_mask              = _uint32_li( 0x000003ff ); // half mantissa field: bits 9..0
+    const uint32 h_s_mask              = _uint32_li( 0x00008000 ); // half sign: bit 15
+    const uint32 h_f_s_pos_offset      = _uint32_li( 0x00000010 ); // 16: distance from bit 15 to bit 31
+    const uint32 h_f_e_pos_offset      = _uint32_li( 0x0000000d ); // 13: distance from the half fields to the float fields
+    const uint32 h_f_bias_offset       = _uint32_li( 0x0001c000 ); // 0x70 placed at the half exponent position (0x70 << 10)
+    const uint32 f_e_mask              = _uint32_li( 0x7f800000 ); // float exponent field: bits 30..23
+    const uint32 f_m_mask              = _uint32_li( 0x007fffff ); // float mantissa field: bits 22..0
+    const uint32 h_f_e_denorm_bias     = _uint32_li( 0x0000007e );
+    const uint32 h_f_m_denorm_sa_bias  = _uint32_li( 0x00000008 );
+    const uint32 f_e_pos               = _uint32_li( 0x00000017 ); // 23: bit position of the float exponent field
+    const uint32 h_e_mask_minus_one    = _uint32_li( 0x00007bff );
+    const uint32 h_e                   = _uint32_and( h, h_e_mask );
+    const uint32 h_m                   = _uint32_and( h, h_m_mask );
+    const uint32 h_s                   = _uint32_and( h, h_s_mask );
+    const uint32 h_e_f_bias            = _uint32_add( h_e, h_f_bias_offset ); // exponent re-biased while still at the half bit position
+    const uint32 h_m_nlz               = _uint32_cntlz( h_m ); // leading zeros of the mantissa, counted over 32 bits
+    const uint32 f_s                   = _uint32_sll( h_s,        h_f_s_pos_offset );
+    const uint32 f_e                   = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
+    const uint32 f_m                   = _uint32_sll( h_m,        h_f_e_pos_offset );
+    const uint32 f_em                  = _uint32_or(  f_e,        f_m              ); // result for inputs that match none of the special cases below
+    const uint32 h_f_m_sa              = _uint32_sub( h_m_nlz,             h_f_m_denorm_sa_bias ); // shift that moves the highest set mantissa bit to bit 23
+    const uint32 f_e_denorm_unpacked   = _uint32_sub( h_f_e_denorm_bias,   h_f_m_sa             );
+    const uint32 h_f_m                 = _uint32_sll( h_m,                 h_f_m_sa             ); // NOTE(review): for h_m == 0 the shift amount is 24 and the value is later discarded -- confirm
+    const uint32 f_m_denorm            = _uint32_and( h_f_m,               f_m_mask             ); // drops bit 23, leaving only the mantissa field
+    const uint32 f_e_denorm            = _uint32_sll( f_e_denorm_unpacked, f_e_pos              );
+    const uint32 f_em_denorm           = _uint32_or(  f_e_denorm,          f_m_denorm           );
+    const uint32 f_em_nan              = _uint32_or(  f_e_mask,            f_m                  ); // float exponent all ones, mantissa bits carried over
+    const uint32 is_e_eqz_msb          = _uint32_dec(  h_e ); // MSB set only when h_e == 0
+    const uint32 is_m_nez_msb          = _uint32_neg(  h_m ); // MSB set when h_m != 0 (h_m is at most 0x3ff)
+    const uint32 is_e_flagged_msb      = _uint32_sub(  h_e_mask_minus_one, h_e ); // MSB set only when h_e == 0x7c00, i.e. the exponent field is all ones
+    const uint32 is_zero_msb           = _uint32_andc( is_e_eqz_msb,       is_m_nez_msb ); // exponent zero and mantissa zero
+    const uint32 is_inf_msb            = _uint32_andc( is_e_flagged_msb,   is_m_nez_msb ); // exponent all ones and mantissa zero
+    const uint32 is_denorm_msb         = _uint32_and(  is_m_nez_msb,       is_e_eqz_msb ); // exponent zero and mantissa non-zero
+    const uint32 is_nan_msb            = _uint32_and(  is_e_flagged_msb,   is_m_nez_msb ); // exponent all ones and mantissa non-zero
+    const uint32 is_zero               = _uint32_ext(  is_zero_msb ); // MSB flag widened to a full-width mask
+    const uint32 f_zero_result         = _uint32_andc( f_em, is_zero ); // clears every exponent/mantissa bit for a zero input
+    const uint32 f_denorm_result       = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result ); // each select falls back to the previous result
+    const uint32 f_inf_result          = _uint32_sels( is_inf_msb,    f_e_mask,    f_denorm_result );
+    const uint32 f_nan_result          = _uint32_sels( is_nan_msb,    f_em_nan,    f_inf_result    );
+    const uint32 f_result              = _uint32_or( f_s, f_nan_result ); // sign bit combined with the selected exponent/mantissa bits
+
+    return (f_result);
 }

-uint16
-half_add( uint16 x, uint16 y )
+uint32 
+nv::fast_half_to_float( uint16 h )
 {
-  const uint16 one                       = _uint16_li( 0x0001 );
-  const uint16 msb_to_lsb_sa             = _uint16_li( 0x000f );
-  const uint16 h_s_mask                  = _uint16_li( 0x8000 );
-  const uint16 h_e_mask                  = _uint16_li( 0x7c00 );
-  const uint16 h_m_mask                  = _uint16_li( 0x03ff );
-  const uint16 h_m_msb_mask              = _uint16_li( 0x2000 );
-  const uint16 h_m_msb_sa                = _uint16_li( 0x000d );
-  const uint16 h_m_hidden                = _uint16_li( 0x0400 );
-  const uint16 h_e_pos                   = _uint16_li( 0x000a );
-  const uint16 h_e_bias_minus_one        = _uint16_li( 0x000e );
-  const uint16 h_m_grs_carry             = _uint16_li( 0x4000 );
-  const uint16 h_m_grs_carry_pos         = _uint16_li( 0x000e );
-  const uint16 h_grs_size                = _uint16_li( 0x0003 );
-  const uint16 h_snan                    = _uint16_li( 0xfe00 );
-  const uint16 h_e_mask_minus_one        = _uint16_li( 0x7bff );
-  const uint16 h_grs_round_carry         = _uint16_sll( one, h_grs_size );
-  const uint16 h_grs_round_mask          = _uint16_sub( h_grs_round_carry, one );
-  const uint16 x_e                       = _uint16_and( x, h_e_mask );
-  const uint16 y_e                       = _uint16_and( y, h_e_mask );
-  const uint16 is_y_e_larger_msb         = _uint16_sub( x_e, y_e );
-  const uint16 a                         = _uint16_sels( is_y_e_larger_msb, y, x);
-  const uint16 a_s                       = _uint16_and( a, h_s_mask );
-  const uint16 a_e                       = _uint16_and( a, h_e_mask );
-  const uint16 a_m_no_hidden_bit         = _uint16_and( a, h_m_mask );
-  const uint16 a_em_no_hidden_bit        = _uint16_or( a_e, a_m_no_hidden_bit );
-  const uint16 b                         = _uint16_sels( is_y_e_larger_msb, x, y);
-  const uint16 b_s                       = _uint16_and( b, h_s_mask );
-  const uint16 b_e                       = _uint16_and( b, h_e_mask );
-  const uint16 b_m_no_hidden_bit         = _uint16_and( b, h_m_mask );
-  const uint16 b_em_no_hidden_bit        = _uint16_or( b_e, b_m_no_hidden_bit );
-  const uint16 is_diff_sign_msb          = _uint16_xor( a_s, b_s );
-  const uint16 is_a_inf_msb              = _uint16_sub( h_e_mask_minus_one, a_em_no_hidden_bit );
-  const uint16 is_b_inf_msb              = _uint16_sub( h_e_mask_minus_one, b_em_no_hidden_bit );
-  const uint16 is_undenorm_msb           = _uint16_dec( a_e );
-  const uint16 is_undenorm               = _uint16_ext( is_undenorm_msb );
-  const uint16 is_both_inf_msb           = _uint16_and( is_a_inf_msb, is_b_inf_msb );
-  const uint16 is_invalid_inf_op_msb     = _uint16_and( is_both_inf_msb, b_s );
-  const uint16 is_a_e_nez_msb            = _uint16_neg( a_e );
-  const uint16 is_b_e_nez_msb            = _uint16_neg( b_e );
-  const uint16 is_a_e_nez                = _uint16_ext( is_a_e_nez_msb );
-  const uint16 is_b_e_nez                = _uint16_ext( is_b_e_nez_msb );
-  const uint16 a_m_hidden_bit            = _uint16_and( is_a_e_nez, h_m_hidden );
-  const uint16 b_m_hidden_bit            = _uint16_and( is_b_e_nez, h_m_hidden );
-  const uint16 a_m_no_grs                = _uint16_or( a_m_no_hidden_bit, a_m_hidden_bit );
-  const uint16 b_m_no_grs                = _uint16_or( b_m_no_hidden_bit, b_m_hidden_bit );
-  const uint16 diff_e                    = _uint16_sub( a_e,        b_e );
-  const uint16 a_e_unbias                = _uint16_sub( a_e,        h_e_bias_minus_one );
-  const uint16 a_m                       = _uint16_sll( a_m_no_grs, h_grs_size );
-  const uint16 a_e_biased                = _uint16_srl( a_e,        h_e_pos );
-  const uint16 m_sa_unbias               = _uint16_srl( a_e_unbias, h_e_pos );
-  const uint16 m_sa_default              = _uint16_srl( diff_e,     h_e_pos );
-  const uint16 m_sa_unbias_mask          = _uint16_andc( is_a_e_nez_msb,   is_b_e_nez_msb );
-  const uint16 m_sa                      = _uint16_sels( m_sa_unbias_mask, m_sa_unbias, m_sa_default );
-  const uint16 b_m_no_sticky             = _uint16_sll( b_m_no_grs,        h_grs_size );
-  const uint16 sh_m                      = _uint16_srl( b_m_no_sticky,     m_sa );
-  const uint16 sticky_overflow           = _uint16_sll( one,               m_sa );
-  const uint16 sticky_mask               = _uint16_dec( sticky_overflow );
-  const uint16 sticky_collect            = _uint16_and( b_m_no_sticky, sticky_mask );
-  const uint16 is_sticky_set_msb         = _uint16_neg( sticky_collect );
-  const uint16 sticky                    = _uint16_srl( is_sticky_set_msb, msb_to_lsb_sa);
-  const uint16 b_m                       = _uint16_or( sh_m, sticky );
-  const uint16 is_c_m_ab_pos_msb         = _uint16_sub( b_m, a_m );
-  const uint16 c_inf                     = _uint16_or( a_s, h_e_mask );
-  const uint16 c_m_sum                   = _uint16_add( a_m, b_m );
-  const uint16 c_m_diff_ab               = _uint16_sub( a_m, b_m );
-  const uint16 c_m_diff_ba               = _uint16_sub( b_m, a_m );
-  const uint16 c_m_smag_diff             = _uint16_sels( is_c_m_ab_pos_msb, c_m_diff_ab, c_m_diff_ba );
-  const uint16 c_s_diff                  = _uint16_sels( is_c_m_ab_pos_msb, a_s,         b_s         );
-  const uint16 c_s                       = _uint16_sels( is_diff_sign_msb,  c_s_diff,    a_s         );
-  const uint16 c_m_smag_diff_nlz         = _uint16_cntlz( c_m_smag_diff );
-  const uint16 diff_norm_sa              = _uint16_sub( c_m_smag_diff_nlz, one );
-  const uint16 is_diff_denorm_msb        = _uint16_sub( a_e_biased, diff_norm_sa );
-  const uint16 is_diff_denorm            = _uint16_ext( is_diff_denorm_msb );
-  const uint16 is_a_or_b_norm_msb        = _uint16_neg( a_e_biased );
-  const uint16 diff_denorm_sa            = _uint16_dec( a_e_biased );
-  const uint16 c_m_diff_denorm           = _uint16_sll( c_m_smag_diff, diff_denorm_sa );
-  const uint16 c_m_diff_norm             = _uint16_sll( c_m_smag_diff, diff_norm_sa );
-  const uint16 c_e_diff_norm             = _uint16_sub( a_e_biased,  diff_norm_sa );
-  const uint16 c_m_diff_ab_norm          = _uint16_sels( is_diff_denorm_msb, c_m_diff_denorm, c_m_diff_norm );
-  const uint16 c_e_diff_ab_norm          = _uint16_andc( c_e_diff_norm, is_diff_denorm );
-  const uint16 c_m_diff                  = _uint16_sels( is_a_or_b_norm_msb, c_m_diff_ab_norm, c_m_smag_diff );
-  const uint16 c_e_diff                  = _uint16_sels( is_a_or_b_norm_msb, c_e_diff_ab_norm, a_e_biased    );
-  const uint16 is_diff_eqz_msb           = _uint16_dec( c_m_diff );
-  const uint16 is_diff_exactly_zero_msb  = _uint16_and( is_diff_sign_msb, is_diff_eqz_msb );
-  const uint16 is_diff_exactly_zero      = _uint16_ext( is_diff_exactly_zero_msb );
-  const uint16 c_m_added                 = _uint16_sels( is_diff_sign_msb, c_m_diff, c_m_sum );
-  const uint16 c_e_added                 = _uint16_sels( is_diff_sign_msb, c_e_diff, a_e_biased );
-  const uint16 c_m_carry                 = _uint16_and( c_m_added, h_m_grs_carry );
-  const uint16 is_c_m_carry_msb          = _uint16_neg( c_m_carry );
-  const uint16 c_e_hidden_offset         = _uint16_andsrl( c_m_added, h_m_grs_carry, h_m_grs_carry_pos );
-  const uint16 c_m_sub_hidden            = _uint16_srl( c_m_added, one );
-  const uint16 c_m_no_hidden             = _uint16_sels( is_c_m_carry_msb, c_m_sub_hidden, c_m_added );
-  const uint16 c_e_no_hidden             = _uint16_add( c_e_added,         c_e_hidden_offset  );
-  const uint16 c_m_no_hidden_msb         = _uint16_and( c_m_no_hidden,     h_m_msb_mask       );
-  const uint16 undenorm_m_msb_odd        = _uint16_srl( c_m_no_hidden_msb, h_m_msb_sa         );
-  const uint16 undenorm_fix_e            = _uint16_and( is_undenorm,       undenorm_m_msb_odd );
-  const uint16 c_e_fixed                 = _uint16_add( c_e_no_hidden,     undenorm_fix_e     );
-  const uint16 c_m_round_amount          = _uint16_and( c_m_no_hidden,     h_grs_round_mask   );
-  const uint16 c_m_rounded               = _uint16_add( c_m_no_hidden,     c_m_round_amount   );
-  const uint16 c_m_round_overflow        = _uint16_andsrl( c_m_rounded, h_m_grs_carry, h_m_grs_carry_pos );
-  const uint16 c_e_rounded               = _uint16_add( c_e_fixed, c_m_round_overflow );
-  const uint16 c_m_no_grs                = _uint16_srlm( c_m_rounded, h_grs_size,  h_m_mask );
-  const uint16 c_e                       = _uint16_sll( c_e_rounded, h_e_pos );
-  const uint16 c_em                      = _uint16_or( c_e, c_m_no_grs );
-  const uint16 c_normal                  = _uint16_or( c_s, c_em );
-  const uint16 c_inf_result              = _uint16_sels( is_a_inf_msb, c_inf, c_normal );
-  const uint16 c_zero_result             = _uint16_andc( c_inf_result, is_diff_exactly_zero );
-  const uint16 c_result                  = _uint16_sels( is_invalid_inf_op_msb, h_snan, c_zero_result );
+    const uint32 h_e_mask              = _uint32_li( 0x00007c00 );
+    const uint32 h_m_mask              = _uint32_li( 0x000003ff );
+    const uint32 h_s_mask              = _uint32_li( 0x00008000 );
+    const uint32 h_f_s_pos_offset      = _uint32_li( 0x00000010 );
+    const uint32 h_f_e_pos_offset      = _uint32_li( 0x0000000d );
+    const uint32 h_f_bias_offset       = _uint32_li( 0x0001c000 );
+    const uint32 f_e_mask              = _uint32_li( 0x7f800000 );
+    const uint32 f_m_mask              = _uint32_li( 0x007fffff );
+    const uint32 h_f_e_denorm_bias     = _uint32_li( 0x0000007e );
+    const uint32 h_f_m_denorm_sa_bias  = _uint32_li( 0x00000008 );
+    const uint32 f_e_pos               = _uint32_li( 0x00000017 );
+    const uint32 h_e_mask_minus_one    = _uint32_li( 0x00007bff );
+    const uint32 h_e                   = _uint32_and( h, h_e_mask );
+    const uint32 h_m                   = _uint32_and( h, h_m_mask );
+    const uint32 h_s                   = _uint32_and( h, h_s_mask );
+    const uint32 h_e_f_bias            = _uint32_add( h_e, h_f_bias_offset );
+    const uint32 h_m_nlz               = _uint32_cntlz( h_m );
+    const uint32 f_s                   = _uint32_sll( h_s,        h_f_s_pos_offset );
+    const uint32 f_e                   = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
+    const uint32 f_m                   = _uint32_sll( h_m,        h_f_e_pos_offset );
+    const uint32 f_em                  = _uint32_or(  f_e,        f_m              );
+    const uint32 h_f_m_sa              = _uint32_sub( h_m_nlz,             h_f_m_denorm_sa_bias );
+    const uint32 f_e_denorm_unpacked   = _uint32_sub( h_f_e_denorm_bias,   h_f_m_sa             );
+    const uint32 h_f_m                 = _uint32_sll( h_m,                 h_f_m_sa             );
+    const uint32 f_m_denorm            = _uint32_and( h_f_m,               f_m_mask             );
+    const uint32 f_e_denorm            = _uint32_sll( f_e_denorm_unpacked, f_e_pos              );
+    const uint32 f_em_denorm           = _uint32_or(  f_e_denorm,          f_m_denorm           );
+    const uint32 f_em_nan              = _uint32_or(  f_e_mask,            f_m                  );
+    const uint32 is_e_eqz_msb          = _uint32_dec(  h_e );
+    const uint32 is_m_nez_msb          = _uint32_neg(  h_m );
+    const uint32 is_e_flagged_msb      = _uint32_sub(  h_e_mask_minus_one, h_e );
+    const uint32 is_zero_msb           = _uint32_andc( is_e_eqz_msb,       is_m_nez_msb );
+    const uint32 is_denorm_msb         = _uint32_and(  is_m_nez_msb,       is_e_eqz_msb );
+    const uint32 is_zero               = _uint32_ext(  is_zero_msb );
+    const uint32 f_zero_result         = _uint32_andc( f_em, is_zero );
+    const uint32 f_denorm_result       = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result );
+    const uint32 f_result              = _uint32_or( f_s, f_denorm_result );

-  return (c_result);
+    return (f_result);
 }
--- a/src/nvmath/Half.h
+++ b/src/nvmath/Half.h
@ -1,9 +1,17 @@
+#pragma once
 #ifndef NV_MATH_HALF_H
 #define NV_MATH_HALF_H

-#include <nvmath/nvmath.h>
+#include "nvmath.h"

-uint32 half_to_float( uint16 h );
-uint16 half_from_float( uint32 f );
+namespace nv {

-#endif /* NV_MATH_HALF_H */
+    // Converts the 16-bit half value `h` into a 32-bit result.
+    // NOTE(review): presumably the result is the raw IEEE-754 single-precision
+    // bit pattern rather than a numeric cast -- confirm against Half.cpp.
+    uint32 half_to_float( uint16 h );
+    // Converts the 32-bit value `f` into a 16-bit half.
+    // NOTE(review): presumably `f` is the raw bit pattern of a float -- confirm.
+    uint16 half_from_float( uint32 f );
+
+    // Variant of half_to_float with the same signature.
+    // Does not handle NaN or infinity.
+    uint32 fast_half_to_float( uint16 h );
+
+} // nv namespace
+
+#endif // NV_MATH_HALF_H
--- a/src/nvmath/Matrix.h
+++ b/src/nvmath/Matrix.h
--- a/src/nvmath/Plane.cpp
+++ b/src/nvmath/Plane.cpp
@ -5,13 +5,22 @@

 namespace nv
 {
-	Plane transformPlane(const Matrix& m, Plane::Arg p)
-	{
-		Vector3 newVec = transformVector(m, p.vector());
+    Plane transformPlane(const Matrix& m, Plane::Arg p)
+    {
+        Vector3 newVec = transformVector(m, p.vector());

-		Vector3 ptInPlane = p.offset() * p.vector();
-		ptInPlane = transformPoint(m, ptInPlane);
+        Vector3 ptInPlane = p.offset() * p.vector();
+        ptInPlane = transformPoint(m, ptInPlane);

-		return Plane(newVec, ptInPlane);
-	}
-}
+        return Plane(newVec, ptInPlane);
+    }
+
+    // Returns the point shared by the planes a, b and c, where each plane is
+    // the set of points x with dot(vector(), x) == offset().
+    //
+    // Solved with Cramer's rule:
+    //   x = (da * (b x c) + db * (c x a) + dc * (a x b)) / (a . (b x c))
+    //
+    // The three plane vectors must be linearly independent (no two planes
+    // parallel, no common line); otherwise the triple product is zero and the
+    // result is not finite.
+    Vector3 planeIntersection(Plane::Arg a, Plane::Arg b, Plane::Arg c)
+    {
+        const Vector3 bCrossC = cross(b.vector(), c.vector());
+
+        // The weighted sum has to be divided by the triple product; multiplying
+        // by it only gives the right answer when the determinant is +/-1.
+        const float invDet = 1.0f / dot(a.vector(), bCrossC);
+
+        return invDet * (
+            a.offset() * bCrossC +
+            c.offset() * cross(a.vector(), b.vector()) +
+            b.offset() * cross(c.vector(), a.vector()));
+    }
+
+} // nv namespace
--- a/src/nvmath/Plane.h
+++ b/src/nvmath/Plane.h
@ -1,77 +1,81 @@
 // This code is in the public domain -- castanyo@yahoo.es

+#pragma once
 #ifndef NV_MATH_PLANE_H
 #define NV_MATH_PLANE_H

-#include "nvmath.h"
-#include "Vector.h"
+#include <nvmath/nvmath.h>
+#include <nvmath/Vector.h>

 namespace nv
 {
-	class Matrix;
+    class Matrix;


-	class NVMATH_CLASS Plane
-	{
-	public:
-		typedef Plane const & Arg;
-		
-		Plane();
-		Plane(float x, float y, float z, float w);
-		Plane(Vector4::Arg v);
-		Plane(Vector3::Arg v, float d);
-		Plane(Vector3::Arg normal, Vector3::Arg point);
-		
-		const Plane & operator=(Plane::Arg v);
-		
-		Vector3 vector() const;
-		scalar offset() const;
-		
-		const Vector4 & asVector() const;
-		Vector4 & asVector();
-		
-		void operator*=(scalar s);
+    class NVMATH_CLASS Plane
+    {
+    public:
+        typedef Plane const & Arg;

-	private:
-		Vector4 p;
-	};
+        Plane();
+        Plane(float x, float y, float z, float w);
+        Plane(Vector4::Arg v);
+        Plane(Vector3::Arg v, float d);
+        Plane(Vector3::Arg normal, Vector3::Arg point);

-	inline Plane::Plane() {}
-	inline Plane::Plane(float x, float y, float z, float w) : p(x, y, z, w) {}
-	inline Plane::Plane(Vector4::Arg v) : p(v) {}
-	inline Plane::Plane(Vector3::Arg v, float d) : p(v, d) {}
-	inline Plane::Plane(Vector3::Arg normal, Vector3::Arg point) : p(normal, dot(normal, point)) {}
-	
-	inline const Plane & Plane::operator=(Plane::Arg v) { p = v.p; return *this; }
-	
-	inline Vector3 Plane::vector() const { return p.xyz(); }
-	inline scalar Plane::offset() const { return p.w(); }
+        const Plane & operator=(Plane::Arg v);

-	inline const Vector4 & Plane::asVector() const { return p; }
-	inline Vector4 & Plane::asVector() { return p; }
+        Vector3 vector() const;
+        scalar offset() const;

-	// Normalize plane.
-	inline Plane normalize(Plane::Arg plane, float epsilon = NV_EPSILON)
-	{
-		const float len = length(plane.vector());
-		nvDebugCheck(!isZero(len, epsilon));
-		const float inv = 1.0f / len;
-		return Plane(plane.asVector() * inv);
-	}
+        const Vector4 & asVector() const;
+        Vector4 & asVector();

-	// Get the signed distance from the given point to this plane.
-	inline float distance(Plane::Arg plane, Vector3::Arg point)
-	{
-		return dot(plane.vector(), point) - plane.offset();
-	}
+        void operator*=(scalar s);
+
+    private:
+        Vector4 p;
+    };
+
+    // Default constructor: passes no initial value to `p`.
+    inline Plane::Plane() {}
+    // Builds the plane directly from its four coefficients.
+    inline Plane::Plane(float x, float y, float z, float w) : p(x, y, z, w) {}
+    // Builds the plane from a 4-component vector holding the coefficients.
+    inline Plane::Plane(Vector4::Arg v) : p(v) {}
+    // Builds the plane from a 3-component vector `v` and a fourth coefficient `d`.
+    inline Plane::Plane(Vector3::Arg v, float d) : p(v, d) {}
+    // Builds the plane from a normal and a point; the fourth coefficient is dot(normal, point).
+    inline Plane::Plane(Vector3::Arg normal, Vector3::Arg point) : p(normal, dot(normal, point)) {}
+
+    // Copies the coefficients of `v` into this plane and returns *this as a const reference.
+    inline const Plane & Plane::operator=(Plane::Arg v) { p = v.p; return *this; }
+
+    // First three coefficients of the plane (p.xyz()).
+    inline Vector3 Plane::vector() const { return p.xyz(); }
+    // Fourth coefficient of the plane (p.w).
+    inline scalar Plane::offset() const { return p.w; }
+
+    // Read-only access to the underlying 4-component vector.
+    inline const Vector4 & Plane::asVector() const { return p; }
+    // Mutable access to the underlying 4-component vector.
+    inline Vector4 & Plane::asVector() { return p; }
+
+    // Returns a copy of `plane` whose four coefficients are scaled by the
+    // reciprocal of the length of plane.vector(). nvDebugCheck verifies that
+    // this length is not zero within `epsilon`.
+    inline Plane normalize(Plane::Arg plane, float epsilon = NV_EPSILON)
+    {
+        const float normalLength = length(plane.vector());
+        nvDebugCheck(!isZero(normalLength, epsilon));
+
+        const float reciprocal = 1.0f / normalLength;
+        return Plane(plane.asVector() * reciprocal);
+    }
+
+    // Signed distance from `point` to `plane`, evaluated as
+    // dot(plane.vector(), point) - plane.offset(). This is a true distance
+    // only when plane.vector() has unit length.
+    inline float distance(Plane::Arg plane, Vector3::Arg point)
+    {
+        const Vector3 normal = plane.vector();
+        return dot(normal, point) - plane.offset();
+    }
+
+    // Intended to scale all four plane coefficients by `s`.
+    // NOTE(review): this only works if scale() modifies `p` in place. If
+    // scale() returns the scaled vector by value, the result is discarded here
+    // and this operator has no effect -- confirm against Vector.h.
+    inline void Plane::operator*=(scalar s)
+    {
+        scale(p, s);
+    }
+
+    Plane transformPlane(const Matrix&, Plane::Arg);
+
+    Vector3 planeIntersection(Plane::Arg a, Plane::Arg b, Plane::Arg c);

-	inline void Plane::operator*=(scalar s)
-	{
-		scale(p, s);
-	}

-	Plane transformPlane(const Matrix&, Plane::Arg);
-	
 } // nv namespace

 #endif // NV_MATH_PLANE_H
--- a/src/nvmath/Vector.h
+++ b/src/nvmath/Vector.h
--- a/src/nvmath/nvmath.h
+++ b/src/nvmath/nvmath.h
@ -1,5 +1,6 @@
 // This code is in the public domain -- castanyo@yahoo.es

+#pragma once
 #ifndef NV_MATH_H
 #define NV_MATH_H

@ -28,29 +29,29 @@
 #endif // NVMATH_SHARED

 #ifndef PI
-#define PI      			float(3.1415926535897932384626433833)
+#define PI                  float(3.1415926535897932384626433833)
 #endif

-#define NV_EPSILON			(0.0001f)
-#define NV_NORMAL_EPSILON	(0.001f)
+#define NV_EPSILON          (0.0001f)
+#define NV_NORMAL_EPSILON   (0.001f)

 /*
-#define SQ(r)				((r)*(r))
+#define SQ(r)               ((r)*(r))

-#define	SIGN_BITMASK		0x80000000
+#define SIGN_BITMASK        0x80000000

 /// Integer representation of a floating-point value.
-#define IR(x)					((uint32 &)(x))
+#define IR(x)               ((uint32 &)(x))

 /// Absolute integer representation of a floating-point value
-#define AIR(x)					(IR(x) & 0x7fffffff)
+#define AIR(x)              (IR(x) & 0x7fffffff)

 /// Floating-point representation of an integer value.
-#define FR(x)					((float&)(x))
+#define FR(x)               ((float&)(x))

 /// Integer-based comparison of a floating point value.
 /// Don't use it blindly, it can be faster or slower than the FPU comparison, depends on the context.
-#define IS_NEGATIVE_FLOAT(x)	(IR(x)&SIGN_BITMASK)
+#define IS_NEGATIVE_FLOAT(x) (IR(x)&SIGN_BITMASK)
 */

 inline double sqrt_assert(const double f)
@ -97,6 +98,7 @@ inline float asinf_assert(const float f)
 #define asin asin_assert
 #define asinf asinf_assert

+
 namespace nv
 {
    inline float toRadian(float degree) { return degree * (PI / 180.0f); }
@ -121,7 +123,7 @@ namespace nv
 #elif NV_OS_LINUX
        return finitef(f);
 #else
-#	error "isFinite not supported"
+#   error "isFinite not supported"
 #endif
        //return std::isfinite (f);
        //return finite (f);
@ -136,7 +138,7 @@ namespace nv
 #elif NV_OS_LINUX
        return isnanf(f);
 #else
-#	error "isNan not supported"
+#   error "isNan not supported"
 #endif
    }

@ -161,10 +163,11 @@ namespace nv
        return f0 * s + f1 * t;
    }

-    inline float square(float f)
-    {
-        return f * f;
-    }
+    // Second power of the argument.
+    inline float square(float f) { return f * f; }
+    inline int square(int i) { return i * i; }
+
+    // Third power of the argument (three factors, not two).
+    inline float cube(float f) { return f * f * f; }
+    inline int cube(int i) { return i * i * i; }

    // @@ Float to int conversions to be optimized at some point. See:
    // http://cbloomrants.blogspot.com/2009/01/01-17-09-float-to-int.html
@ -186,10 +189,10 @@ namespace nv
        return int(ceilf(f));
    }

-    inline float frac(float f)
-    {
-        return f - floor(f);
-    }
+	// Fractional part of f, computed as f - floor(f); the result is therefore
+	// non-negative for negative inputs too (e.g. frac(-0.25f) == 0.75f).
+	// Uses floorf, like the neighbouring ceilf-based helper, so the
+	// computation stays in single precision.
+	inline float frac(float f)
+	{
+		return f - floorf(f);
+	}

 } // nv