Merge private branch.

2010-05-27 23:19:24 +00:00
parent 51a4fe7e2d
commit c09067e477
13 changed files with 2701 additions and 2622 deletions
--- a/src/nvmath/Box.cpp
+++ b/src/nvmath/Box.cpp
@ -0,0 +1,30 @@
 // This code is in the public domain -- castanyo@yahoo.es
 #include "nvmath/Box.h"
 #include "nvmath/Sphere.h"
 using namespace nv;
 float nv::distanceSquared(const Box &box, const Vector3 &point) {
    Vector3 closest;
    if (point.x < box.minCorner.x) closest.x = box.minCorner.x;
    else if (point.x > box.maxCorner.x) closest.x = box.maxCorner.x;
    else closest.x = point.x;
    if (point.y < box.minCorner.y) closest.y = box.minCorner.y;
    else if (point.y > box.maxCorner.y) closest.y = box.maxCorner.y;
    else closest.y = point.y;
    if (point.z < box.minCorner.z) closest.z = box.minCorner.z;
    else if (point.z > box.maxCorner.z) closest.z = box.maxCorner.z;
    else closest.z = point.z;
    return lengthSquared(point - closest);
 }
 bool nv::overlap(const Box &box, const Sphere &sphere) {
    return distanceSquared(box, sphere.center) < sphere.radius * sphere.radius;
 }
--- a/src/nvmath/Box.h
+++ b/src/nvmath/Box.h
@ -1,142 +1,156 @@
 // This code is in the public domain -- castanyo@yahoo.es
 #pragma once
 #ifndef NV_MATH_BOX_H
 #define NV_MATH_BOX_H
-#include <nvmath/Vector.h>
+#include "Vector.h"
 #include <float.h> // FLT_MAX
 namespace nv
 {
-class Stream;
+    class Stream;
    class Sphere;
-/// Axis Aligned Bounding Box.
+    /// Axis Aligned Bounding Box.
-class Box
+    class Box
-{
+    {
-public:
+    public:
-	/// Default ctor.
+        /// Default ctor.
-	Box() { };
+        Box() { };
-	/// Copy ctor.
+        /// Copy ctor.
-	Box( const Box & b ) : m_mins(b.m_mins), m_maxs(b.m_maxs) { }
+        Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { }
-	/// Init ctor.
+        /// Init ctor.
-	Box( Vector3::Arg mins, Vector3::Arg maxs ) : m_mins(mins), m_maxs(maxs) { }
+        Box(Vector3::Arg mins, Vector3::Arg maxs) : minCorner(mins), maxCorner(maxs) { }
-	// Cast operators.
+        // Cast operators.
-	operator const float * () const { return reinterpret_cast<const float *>(this); }
+        operator const float * () const { return reinterpret_cast<const float *>(this); }
-	// Min corner of the box.
+        /// Clear the bounds.
-	Vector3 minCorner() const { return m_mins; }
+        void clearBounds()
-	Vector3 & minCorner() { return m_mins; }
+        {
            minCorner.set(FLT_MAX, FLT_MAX, FLT_MAX);
            maxCorner.set(-FLT_MAX, -FLT_MAX, -FLT_MAX);
        }
-	// Max corner of the box.
+        /// Build a cube centered on center and with edge = 2*dist
-	Vector3 maxCorner() const { return m_maxs; }
+        void cube(Vector3::Arg center, float dist)
-	Vector3 & maxCorner() { return m_maxs; }
+        {
            setCenterExtents(center, Vector3(dist, dist, dist));
        }
-	/// Clear the bounds.
+        /// Build a box, given center and extents.
-	void clearBounds()
+        void setCenterExtents(Vector3::Arg center, Vector3::Arg extents)
-	{
+        {
-		m_mins.set(FLT_MAX, FLT_MAX, FLT_MAX);
+            minCorner = center - extents;
-		m_maxs.set(-FLT_MAX, -FLT_MAX, -FLT_MAX);
+            maxCorner = center + extents;
-	}
+        }
-	/// Build a cube centered on center and with edge = 2*dist
+        /// Get box center.
-	void cube(Vector3::Arg center, float dist)
+        Vector3 center() const
-	{
+        {
-		setCenterExtents(center, Vector3(dist, dist, dist));
+            return (minCorner + maxCorner) * 0.5f;
-	}
+        }
-	/// Build a box, given center and extents.
+        /// Return extents of the box.
-	void setCenterExtents(Vector3::Arg center, Vector3::Arg extents)
+        Vector3 extents() const
-	{
+        {
-		m_mins = center - extents;
+            return (maxCorner - minCorner) * 0.5f;
-		m_maxs = center + extents;
+        }
 	}
-	/// Get box center.
+        /// Return extents of the box.
-	Vector3 center() const
+        scalar extents(uint axis) const
-	{
+        {
-		return (m_mins + m_maxs) * 0.5f;
+            nvDebugCheck(axis < 3);
-	}
+            if (axis == 0) return (maxCorner.x - minCorner.x) * 0.5f;
            if (axis == 1) return (maxCorner.y - minCorner.y) * 0.5f;
            if (axis == 2) return (maxCorner.z - minCorner.z) * 0.5f;
            nvAssume(false);
            return 0.0f;
        }
-	/// Return extents of the box.
+        /// Add a point to this box.
-	Vector3 extents() const
+        void addPointToBounds(Vector3::Arg p)
-	{
+        {
-		return (m_maxs - m_mins) * 0.5f;
+            minCorner = min(minCorner, p);
-	}
+            maxCorner = max(maxCorner, p);
        }
-	/// Return extents of the box.
+        /// Add a box to this box.
-	scalar extents(uint axis) const
+        void addBoxToBounds(const Box & b)
-	{
+        {
-		nvDebugCheck(axis < 3);
+            minCorner = min(minCorner, b.minCorner);
-		if (axis == 0) return (m_maxs.x() - m_mins.x()) * 0.5f;
+            maxCorner = max(maxCorner, b.maxCorner);
-		if (axis == 1) return (m_maxs.y() - m_mins.y()) * 0.5f;
+        }
 		if (axis == 2) return (m_maxs.z() - m_mins.z()) * 0.5f;
 		nvAssume(false);
 		return 0.0f;
 	}
-	/// Add a point to this box.
+        /// Translate box.
-	void addPointToBounds(Vector3::Arg p)
+        void translate(Vector3::Arg v)
-	{
+        {
-		m_mins = min(m_mins, p);
+            minCorner += v;
-		m_maxs = max(m_maxs, p);
+            maxCorner += v;
-	}
+        }
-	/// Add a box to this box.
+        /// Scale the box.
-	void addBoxToBounds(const Box & b)
+        void scale(float s)
-	{
+        {
-		m_mins = min(m_mins, b.m_mins);
+            minCorner *= s;
-		m_maxs = max(m_maxs, b.m_maxs);
+            maxCorner *= s;
-	}
+        }
-	/// Translate box.
+        // Expand the box by a fixed amount.
-	void translate(Vector3::Arg v)
+        void expand(float r) {
-	{
+            minCorner -= Vector3(r,r,r);
-		m_mins += v;
+            maxCorner += Vector3(r,r,r);
-		m_maxs += v;
+        }
 	}
-	/// Scale the box.
+        /// Get the area of the box.
-	void scale(float s)
+        float area() const
-	{
+        {
-		m_mins *= s;
+            const Vector3 d = extents();
-		m_maxs *= s;
+            return 8.0f * (d.x*d.y + d.x*d.z + d.y*d.z);
-	}
+        }	
-	/// Get the area of the box.
+        /// Get the volume of the box.
-	float area() const
+        float volume() const
-	{
+        {
-		const Vector3 d = extents();
+            Vector3 d = extents();
-		return 8.0f * (d.x()*d.y() + d.x()*d.z() + d.y()*d.z());
+            return 8.0f * (d.x * d.y * d.z);
-	}	
+        }
-	/// Get the volume of the box.
+        /// Return true if the box contains the given point.
-	float volume() const
+        bool contains(Vector3::Arg p) const
-	{
+        {
-		Vector3 d = extents();
+            return 
-		return 8.0f * (d.x() * d.y() * d.z());
+                minCorner.x < p.x && minCorner.y < p.y && minCorner.z < p.z &&
-	}
+                maxCorner.x > p.x && maxCorner.y > p.y && maxCorner.z > p.z;
-	
+        }
 	/// Return true if the box contains the given point.
 	bool contains(Vector3::Arg p) const
 	{
 		return 
 			m_mins.x() < p.x() && m_mins.y() < p.y() && m_mins.z() < p.z() &&
 			m_maxs.x() > p.x() && m_maxs.y() > p.y() && m_maxs.z() > p.z();
 	}
-	friend Stream & operator<< (Stream & s, Box & box);
+        /// Split the given box in 8 octants and assign the ith one to this box.
        void setOctant(const Box & box, Vector3::Arg center, int i)
        {
            minCorner = box.minCorner;
            maxCorner = box.maxCorner;
-private:
+            if (i & 4) minCorner.x = center.x;
            else       maxCorner.x = center.x;
            if (i & 2) minCorner.y = center.y;
            else       maxCorner.y = center.y;
            if (i & 1) minCorner.z = center.z;
            else       maxCorner.z = center.z;
        }
-	Vector3 m_mins;
+        friend Stream & operator<< (Stream & s, Box & box);
 	Vector3 m_maxs;
 };
        Vector3 minCorner;
        Vector3 maxCorner;
    };
    float distanceSquared(const Box &box, const Vector3 &point);
    bool overlap(const Box &box, const Sphere &sphere);
 } // nv namespace
--- a/src/nvmath/CMakeLists.txt
+++ b/src/nvmath/CMakeLists.txt
@ -5,7 +5,7 @@ SET(MATH_SRCS
    Vector.h
    Matrix.h
    Plane.h Plane.cpp
-    Box.h
+    Box.h Box.cpp
    Color.h
    Half.h Half.cpp
    Fitting.h Fitting.cpp)
--- a/src/nvmath/Color.h
+++ b/src/nvmath/Color.h
@ -1,178 +1,179 @@
 // This code is in the public domain -- castanyo@yahoo.es
 #pragma once
 #ifndef NV_MATH_COLOR_H
 #define NV_MATH_COLOR_H
-#include <nvcore/Debug.h>
+#include "nvcore/Debug.h"
-#include <nvmath/Vector.h>
+#include "nvmath/Vector.h"
 namespace nv
 {
-/// 64 bit color stored as BGRA.
+    /// 64 bit color stored as BGRA.
-class NVMATH_CLASS Color64 
+    class NVMATH_CLASS Color64 
-{
+    {
-public:
+    public:
-	Color64() { }
+        Color64() { }
-	Color64(const Color64 & c) : u(c.u) { }
+        Color64(const Color64 & c) : u(c.u) { }
-	Color64(uint16 R, uint16 G, uint16 B, uint16 A) { setRGBA(R, G, B, A); }
+        Color64(uint16 R, uint16 G, uint16 B, uint16 A) { setRGBA(R, G, B, A); }
-	explicit Color64(uint64 U) : u(U) { }
+        explicit Color64(uint64 U) : u(U) { }
-	void setRGBA(uint16 R, uint16 G, uint16 B, uint16 A)
+        void setRGBA(uint16 R, uint16 G, uint16 B, uint16 A)
-	{
+        {
-		r = R;
+            r = R;
-		g = G;
+            g = G;
-		b = B;
+            b = B;
-		a = A;
+            a = A;
-	}
+        }
-	operator uint64 () const {
+        operator uint64 () const {
-		return u;
+            return u;
-	}
+        }
-	union {
+        union {
-		struct {
+            struct {
 #if NV_LITTLE_ENDIAN
-			uint16 r, a, b, g;
+                uint16 r, a, b, g;
 #else
-			uint16 a: 16;
+                uint16 a: 16;
-			uint16 r: 16;
+                uint16 r: 16;
-			uint16 g: 16;
+                uint16 g: 16;
-			uint16 b: 16;
+                uint16 b: 16;
 #endif
-		};
+            };
-		uint64 u;
+            uint64 u;
-	};
+        };
-};
+    };
-/// 32 bit color stored as BGRA.
+    /// 32 bit color stored as BGRA.
-class NVMATH_CLASS Color32
+    class NVMATH_CLASS Color32
-{
+    {
-public:
+    public:
-	Color32() { }
+        Color32() { }
-	Color32(const Color32 & c) : u(c.u) { }
+        Color32(const Color32 & c) : u(c.u) { }
-	Color32(uint8 R, uint8 G, uint8 B) { setRGBA(R, G, B, 0xFF); }
+        Color32(uint8 R, uint8 G, uint8 B) { setRGBA(R, G, B, 0xFF); }
-	Color32(uint8 R, uint8 G, uint8 B, uint8 A) { setRGBA( R, G, B, A); }
+        Color32(uint8 R, uint8 G, uint8 B, uint8 A) { setRGBA( R, G, B, A); }
-	//Color32(uint8 c[4]) { setRGBA(c[0], c[1], c[2], c[3]); }
+        //Color32(uint8 c[4]) { setRGBA(c[0], c[1], c[2], c[3]); }
-	//Color32(float R, float G, float B) { setRGBA(uint(R*255), uint(G*255), uint(B*255), 0xFF); }
+        //Color32(float R, float G, float B) { setRGBA(uint(R*255), uint(G*255), uint(B*255), 0xFF); }
-	//Color32(float R, float G, float B, float A) { setRGBA(uint(R*255), uint(G*255), uint(B*255), uint(A*255)); }
+        //Color32(float R, float G, float B, float A) { setRGBA(uint(R*255), uint(G*255), uint(B*255), uint(A*255)); }
-	explicit Color32(uint32 U) : u(U) { }
+        explicit Color32(uint32 U) : u(U) { }
-	void setRGBA(uint8 R, uint8 G, uint8 B, uint8 A)
+        void setRGBA(uint8 R, uint8 G, uint8 B, uint8 A)
-	{
+        {
-		r = R;
+            r = R;
-		g = G;
+            g = G;
-		b = B;
+            b = B;
-		a = A;
+            a = A;
-	}
+        }
-	void setBGRA(uint8 B, uint8 G, uint8 R, uint8 A = 0xFF)
+        void setBGRA(uint8 B, uint8 G, uint8 R, uint8 A = 0xFF)
-	{
+        {
-		r = R;
+            r = R;
-		g = G;
+            g = G;
-		b = B;
+            b = B;
-		a = A;
+            a = A;
-	}
+        }
-	
+
-	operator uint32 () const {
+        operator uint32 () const {
-		return u;
+            return u;
-	}
+        }
-	
+
-	union {
+        union {
-		struct {
+            struct {
 #if NV_LITTLE_ENDIAN
-			uint8 b, g, r, a;
+                uint8 b, g, r, a;
 #else
-			uint8 a: 8;
+                uint8 a: 8;
-			uint8 r: 8;
+                uint8 r: 8;
-			uint8 g: 8;
+                uint8 g: 8;
-			uint8 b: 8;
+                uint8 b: 8;
 #endif
-		};
+            };
-		uint32 u;
+            uint32 u;
-	};
+        };
-};
+    };
-/// 16 bit 565 BGR color.
+    /// 16 bit 565 BGR color.
-class NVMATH_CLASS Color16
+    class NVMATH_CLASS Color16
-{
+    {
-public:
+    public:
-	Color16() { }
+        Color16() { }
-	Color16(const Color16 & c) : u(c.u) { }
+        Color16(const Color16 & c) : u(c.u) { }
-	explicit Color16(uint16 U) : u(U) { }
+        explicit Color16(uint16 U) : u(U) { }
-	
+
-	union {
+        union {
-		struct {
+            struct {
 #if NV_LITTLE_ENDIAN
-			uint16 b : 5;
+                uint16 b : 5;
-			uint16 g : 6;
+                uint16 g : 6;
-			uint16 r : 5;
+                uint16 r : 5;
 #else
-			uint16 r : 5;
+                uint16 r : 5;
-			uint16 g : 6;
+                uint16 g : 6;
-			uint16 b : 5;
+                uint16 b : 5;
 #endif
-		};
+            };
-		uint16 u;
+            uint16 u;
-	};
+        };
-};
+    };
-/// Clamp color components.
+    /// Clamp color components.
-inline Vector3 colorClamp(Vector3::Arg c)
+    inline Vector3 colorClamp(Vector3::Arg c)
-{
+    {
-	return Vector3(clamp(c.x(), 0.0f, 1.0f), clamp(c.y(), 0.0f, 1.0f), clamp(c.z(), 0.0f, 1.0f));
+        return Vector3(clamp(c.x, 0.0f, 1.0f), clamp(c.y, 0.0f, 1.0f), clamp(c.z, 0.0f, 1.0f));
-}
+    }
-/// Clamp without allowing the hue to change.
+    /// Clamp without allowing the hue to change.
-inline Vector3 colorNormalize(Vector3::Arg c)
+    inline Vector3 colorNormalize(Vector3::Arg c)
-{
+    {
-	float scale = 1.0f;
+        float scale = 1.0f;
-	if (c.x() > scale) scale = c.x();
+        if (c.x > scale) scale = c.x;
-	if (c.y() > scale) scale = c.y();
+        if (c.y > scale) scale = c.y;
-	if (c.z() > scale) scale = c.z();
+        if (c.z > scale) scale = c.z;
-	return c / scale;
+        return c / scale;
-}
+    }
-/// Convert Color32 to Color16.
+    /// Convert Color32 to Color16.
-inline Color16 toColor16(Color32 c)
+    inline Color16 toColor16(Color32 c)
-{
+    {
-	Color16 color;
+        Color16 color;
-	//         rrrrrggggggbbbbb
+        //         rrrrrggggggbbbbb
-	// rrrrr000gggggg00bbbbb000
+        // rrrrr000gggggg00bbbbb000
-//	color.u = (c.u >> 3) & 0x1F;
+        //	color.u = (c.u >> 3) & 0x1F;
-//	color.u |= (c.u >> 5) & 0x7E0;
+        //	color.u |= (c.u >> 5) & 0x7E0;
-//	color.u |= (c.u >> 8) & 0xF800;
+        //	color.u |= (c.u >> 8) & 0xF800;
-	
+
-	color.r = c.r >> 3;
+        color.r = c.r >> 3;
-	color.g = c.g >> 2;
+        color.g = c.g >> 2;
-	color.b = c.b >> 3;
+        color.b = c.b >> 3;
-	return color; 
+        return color; 
-}
+    }
-/// Promote 16 bit color to 32 bit using regular bit expansion.
+    /// Promote 16 bit color to 32 bit using regular bit expansion.
-inline Color32 toColor32(Color16 c)
+    inline Color32 toColor32(Color16 c)
-{
+    {
-	Color32 color;
+        Color32 color;
-//	c.u = ((col0.u << 3) & 0xf8) | ((col0.u << 5) & 0xfc00) | ((col0.u << 8) & 0xf80000);
+        //	c.u = ((col0.u << 3) & 0xf8) | ((col0.u << 5) & 0xfc00) | ((col0.u << 8) & 0xf80000);
-//	c.u |= (c.u >> 5) & 0x070007;
+        //	c.u |= (c.u >> 5) & 0x070007;
-//	c.u |= (c.u >> 6) & 0x000300;
+        //	c.u |= (c.u >> 6) & 0x000300;
 	color.b = (c.b << 3) | (c.b >> 2);
 	color.g = (c.g << 2) | (c.g >> 4);
 	color.r = (c.r << 3) | (c.r >> 2);
 	color.a = 0xFF;
 	return color;
 }
-inline Vector4 toVector4(Color32 c)
+        color.b = (c.b << 3) | (c.b >> 2);
-{
+        color.g = (c.g << 2) | (c.g >> 4);
-	const float scale = 1.0f / 255.0f;
+        color.r = (c.r << 3) | (c.r >> 2);
-	return Vector4(c.r * scale, c.g * scale, c.b * scale, c.a * scale);
+        color.a = 0xFF;
-}
+
        return color;
    }
    inline Vector4 toVector4(Color32 c)
    {
        const float scale = 1.0f / 255.0f;
        return Vector4(c.r * scale, c.g * scale, c.b * scale, c.a * scale);
    }
 } // nv namespace
--- a/src/nvmath/Fitting.cpp
+++ b/src/nvmath/Fitting.cpp
@ -1,9 +1,7 @@
 // This code is in the public domain -- icastano@gmail.com
 #include "Fitting.h"
-
+#include "nvcore/Utils.h" // max, swap
 #include <nvcore/Algorithms.h> // max
 #include <nvcore/Containers.h> // swap
 #include <float.h> // FLT_MAX
@ -12,236 +10,236 @@ using namespace nv;
 // @@ Move to EigenSolver.h
 static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix)
 {
-	if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0)
+    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
-	{
+    {
-		return Vector3(zero);
+        return Vector3(zero);
-	}
+    }
 	const int NUM = 8;
-	Vector3 v(1, 1, 1);
+    const int NUM = 8;
 	for (int i = 0; i < NUM; i++)
 	{
 		float x = v.x() * matrix[0] + v.y() * matrix[1] + v.z() * matrix[2];
 		float y = v.x() * matrix[1] + v.y() * matrix[3] + v.z() * matrix[4];
 		float z = v.x() * matrix[2] + v.y() * matrix[4] + v.z() * matrix[5];
 		float norm = max(max(x, y), z);
 		v = Vector3(x, y, z) / norm;
 	}
-	return v;	
+    Vector3 v(1, 1, 1);
    for (int i = 0; i < NUM; i++)
    {
        float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
        float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
        float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
        float norm = max(max(x, y), z);
        v = Vector3(x, y, z) / norm;
    }
    return v;	
 }
 Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points)
 {
-	Vector3 centroid(zero);
+    Vector3 centroid(zero);
-	for (int i = 0; i < n; i++)
+    for (int i = 0; i < n; i++)
-	{
+    {
-		centroid += points[i];
+        centroid += points[i];
-	}
+    }
-	centroid /= float(n);
+    centroid /= float(n);
-	return centroid;
+    return centroid;
 }
 Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
 {
-	Vector3 centroid(zero);
+    Vector3 centroid(zero);
-	float total = 0.0f;
+    float total = 0.0f;
-	for (int i = 0; i < n; i++)
+    for (int i = 0; i < n; i++)
-	{
+    {
-		total += weights[i];
+        total += weights[i];
-		centroid += weights[i]*points[i];
+        centroid += weights[i]*points[i];
-	}
+    }
-	centroid /= total;
+    centroid /= total;
-	return centroid;
+    return centroid;
 }
 Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, float *__restrict covariance)
 {
-	// compute the centroid
+    // compute the centroid
-	Vector3 centroid = computeCentroid(n, points);
+    Vector3 centroid = computeCentroid(n, points);
-	// compute covariance matrix
+    // compute covariance matrix
-	for (int i = 0; i < 6; i++)
+    for (int i = 0; i < 6; i++)
-	{
+    {
-		covariance[i] = 0.0f;
+        covariance[i] = 0.0f;
-	}
+    }
-	for (int i = 0; i < n; i++)
+    for (int i = 0; i < n; i++)
-	{
+    {
-		Vector3 v = points[i] - centroid;
+        Vector3 v = points[i] - centroid;
 		covariance[0] += v.x() * v.x();
 		covariance[1] += v.x() * v.y();
 		covariance[2] += v.x() * v.z();
 		covariance[3] += v.y() * v.y();
 		covariance[4] += v.y() * v.z();
 		covariance[5] += v.z() * v.z();
 	}
-	return centroid;
+        covariance[0] += v.x * v.x;
        covariance[1] += v.x * v.y;
        covariance[2] += v.x * v.z;
        covariance[3] += v.y * v.y;
        covariance[4] += v.y * v.z;
        covariance[5] += v.z * v.z;
    }
    return centroid;
 }
 Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, float *__restrict covariance)
 {
-	// compute the centroid
+    // compute the centroid
-	Vector3 centroid = computeCentroid(n, points, weights, metric);
+    Vector3 centroid = computeCentroid(n, points, weights, metric);
-	// compute covariance matrix
+    // compute covariance matrix
-	for (int i = 0; i < 6; i++)
+    for (int i = 0; i < 6; i++)
-	{
+    {
-		covariance[i] = 0.0f;
+        covariance[i] = 0.0f;
-	}
+    }
-	for (int i = 0; i < n; i++)
+    for (int i = 0; i < n; i++)
-	{
+    {
-		Vector3 a = (points[i] - centroid) * metric;
+        Vector3 a = (points[i] - centroid) * metric;
-		Vector3 b = weights[i]*a;
+        Vector3 b = weights[i]*a;
 		covariance[0] += a.x()*b.x();
 		covariance[1] += a.x()*b.y();
 		covariance[2] += a.x()*b.z();
 		covariance[3] += a.y()*b.y();
 		covariance[4] += a.y()*b.z();
 		covariance[5] += a.z()*b.z();
 	}
-	return centroid;
+        covariance[0] += a.x * b.x;
        covariance[1] += a.x * b.y;
        covariance[2] += a.x * b.z;
        covariance[3] += a.y * b.y;
        covariance[4] += a.y * b.z;
        covariance[5] += a.z * b.z;
    }
    return centroid;
 }
 Vector3 nv::Fit::computePrincipalComponent(int n, const Vector3 *__restrict points)
 {
-	float matrix[6];
+    float matrix[6];
-	computeCovariance(n, points, matrix);
+    computeCovariance(n, points, matrix);
-	return firstEigenVector_PowerMethod(matrix);
+    return firstEigenVector_PowerMethod(matrix);
 }
 Vector3 nv::Fit::computePrincipalComponent(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
 {
-	float matrix[6];
+    float matrix[6];
-	computeCovariance(n, points, weights, metric, matrix);
+    computeCovariance(n, points, weights, metric, matrix);
-	return firstEigenVector_PowerMethod(matrix);
+    return firstEigenVector_PowerMethod(matrix);
 }
 Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points)
 {
-	// compute the centroid and covariance
+    // compute the centroid and covariance
-	float matrix[6];
+    float matrix[6];
-	Vector3 centroid = computeCovariance(n, points, matrix);
+    Vector3 centroid = computeCovariance(n, points, matrix);
-	if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0)
+    if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0)
-	{
+    {
-		// If no plane defined, then return a horizontal plane.
+        // If no plane defined, then return a horizontal plane.
-		return Plane(Vector3(0, 0, 1), centroid);
+        return Plane(Vector3(0, 0, 1), centroid);
-	}
+    }
 #pragma message(NV_FILE_LINE "TODO: need to write an eigensolver!")
-	// - Numerical Recipes in C is a good reference. Householder transforms followed by QL decomposition seems to be the best approach.
+    // - Numerical Recipes in C is a good reference. Householder transforms followed by QL decomposition seems to be the best approach.
-	// - The one from magic-tools is now LGPL. For the 3D case it uses a cubic root solver, which is not very accurate.
+    // - The one from magic-tools is now LGPL. For the 3D case it uses a cubic root solver, which is not very accurate.
-	// - Charles' Galaxy3 contains an implementation of the tridiagonalization method, but is under BPL.
+    // - Charles' Galaxy3 contains an implementation of the tridiagonalization method, but is under BPL.
-	//EigenSolver3 solver(matrix);
+    //EigenSolver3 solver(matrix);
-	return Plane();
+    return Plane();
 }
 int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster)
 {
-	// Compute principal component.
+    // Compute principal component.
-	float matrix[6];
+    float matrix[6];
-	Vector3 centroid = computeCovariance(n, points, weights, metric, matrix);
+    Vector3 centroid = computeCovariance(n, points, weights, metric, matrix);
-	Vector3 principal = firstEigenVector_PowerMethod(matrix);
+    Vector3 principal = firstEigenVector_PowerMethod(matrix);
 	// Pick initial solution.
 	int mini, maxi;
 	mini = maxi = 0;
 	float mindps, maxdps;
 	mindps = maxdps = dot(points[0] - centroid, principal);
 	for (int i = 1; i < n; ++i)
 	{
 		float dps = dot(points[i] - centroid, principal);
 		if (dps < mindps) {
 			mindps = dps;
 			mini = i;
 		}
 		else {
 			maxdps = dps;
 			maxi = i;
 		}
 	}
-	cluster[0] = centroid + mindps * principal;
+    // Pick initial solution.
-	cluster[1] = centroid + maxdps * principal;
+    int mini, maxi;
-	cluster[2] = (2 * cluster[0] + cluster[1]) / 3;
+    mini = maxi = 0;
 	cluster[3] = (2 * cluster[1] + cluster[0]) / 3;
-	// Now we have to iteratively refine the clusters.
+    float mindps, maxdps;
-	while (true)
+    mindps = maxdps = dot(points[0] - centroid, principal);
 	{
 		Vector3 newCluster[4] = { Vector3(zero), Vector3(zero), Vector3(zero), Vector3(zero) };
 		float total[4] = {0, 0, 0, 0};
 		for (int i = 0; i < n; ++i)
 		{
 			// Find nearest cluster.
 			int nearest = 0;
 			float mindist = FLT_MAX;
 			for (int j = 0; j < 4; j++)
 			{
 				float dist = length_squared((cluster[j] - points[i]) * metric);
 				if (dist < mindist)
 				{
 					mindist = dist;
 					nearest = j;
 				}
 			}
 			newCluster[nearest] += weights[i] * points[i];
 			total[nearest] += weights[i];
 		}
-		for (int j = 0; j < 4; j++)
+    for (int i = 1; i < n; ++i)
-		{
+    {
        float dps = dot(points[i] - centroid, principal);
        if (dps < mindps) {
            mindps = dps;
            mini = i;
        }
        else {
            maxdps = dps;
            maxi = i;
        }
    }
    cluster[0] = centroid + mindps * principal;
    cluster[1] = centroid + maxdps * principal;
    cluster[2] = (2 * cluster[0] + cluster[1]) / 3;
    cluster[3] = (2 * cluster[1] + cluster[0]) / 3;
    // Now we have to iteratively refine the clusters.
    while (true)
    {
        Vector3 newCluster[4] = { Vector3(zero), Vector3(zero), Vector3(zero), Vector3(zero) };
        float total[4] = {0, 0, 0, 0};
        for (int i = 0; i < n; ++i)
        {
            // Find nearest cluster.
            int nearest = 0;
            float mindist = FLT_MAX;
            for (int j = 0; j < 4; j++)
            {
                float dist = lengthSquared((cluster[j] - points[i]) * metric);
                if (dist < mindist)
                {
                    mindist = dist;
                    nearest = j;
                }
            }
            newCluster[nearest] += weights[i] * points[i];
            total[nearest] += weights[i];
        }
        for (int j = 0; j < 4; j++)
        {
            if (total[j] != 0)
-			    newCluster[j] /= total[j];
+                newCluster[j] /= total[j];
-		}
+        }
-		if (equal(cluster[0], newCluster[0]) && equal(cluster[1], newCluster[1]) && 
+        if (equal(cluster[0], newCluster[0]) && equal(cluster[1], newCluster[1]) && 
-			equal(cluster[2], newCluster[2]) && equal(cluster[3], newCluster[3]))
+            equal(cluster[2], newCluster[2]) && equal(cluster[3], newCluster[3]))
-		{
+        {
-			return (total[0] != 0) + (total[1] != 0) + (total[2] != 0) + (total[3] != 0);
+            return (total[0] != 0) + (total[1] != 0) + (total[2] != 0) + (total[3] != 0);
-		}
+        }
-		cluster[0] = newCluster[0];
+        cluster[0] = newCluster[0];
-		cluster[1] = newCluster[1];
+        cluster[1] = newCluster[1];
-		cluster[2] = newCluster[2];
+        cluster[2] = newCluster[2];
-		cluster[3] = newCluster[3];
+        cluster[3] = newCluster[3];
-		// Sort clusters by weight.
+        // Sort clusters by weight.
-		for (int i = 0; i < 4; i++)
+        for (int i = 0; i < 4; i++)
-		{
+        {
-			for (int j = i; j > 0 && total[j] > total[j - 1]; j--)
+            for (int j = i; j > 0 && total[j] > total[j - 1]; j--)
-			{
+            {
-				swap( total[j], total[j - 1] );
+                swap( total[j], total[j - 1] );
-				swap( cluster[j], cluster[j - 1] );
+                swap( cluster[j], cluster[j - 1] );
-			}
+            }
-		}
+        }
-	}
+    }
 }
--- a/src/nvmath/Fitting.h
+++ b/src/nvmath/Fitting.h
@ -1,5 +1,6 @@
 // This code is in the public domain -- icastano@gmail.com
 #pragma once
 #ifndef NV_MATH_FITTING_H
 #define NV_MATH_FITTING_H
@ -9,22 +10,22 @@
 namespace nv
 {
-	namespace Fit
+    namespace Fit
-	{
+    {
-		Vector3 computeCentroid(int n, const Vector3 * points);
+        Vector3 computeCentroid(int n, const Vector3 * points);
-		Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, Vector3::Arg metric);
+        Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, Vector3::Arg metric);
-		Vector3 computeCovariance(int n, const Vector3 * points, float * covariance);
+        Vector3 computeCovariance(int n, const Vector3 * points, float * covariance);
-		Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, Vector3::Arg metric, float * covariance);
+        Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, Vector3::Arg metric, float * covariance);
-		Vector3 computePrincipalComponent(int n, const Vector3 * points);
+        Vector3 computePrincipalComponent(int n, const Vector3 * points);
-		Vector3 computePrincipalComponent(int n, const Vector3 * points, const float * weights, Vector3::Arg metric);
+        Vector3 computePrincipalComponent(int n, const Vector3 * points, const float * weights, Vector3::Arg metric);
-		Plane bestPlane(int n, const Vector3 * points);
+        Plane bestPlane(int n, const Vector3 * points);
-		// Returns number of clusters [1-4].
+        // Returns number of clusters [1-4].
-		int compute4Means(int n, const Vector3 * points, const float * weights, Vector3::Arg metric, Vector3 * cluster);
+        int compute4Means(int n, const Vector3 * points, const float * weights, Vector3::Arg metric, Vector3 * cluster);
-	}
+    }
 } // nv namespace
--- a/src/nvmath/Half.cpp
+++ b/src/nvmath/Half.cpp
@ -79,235 +79,257 @@
 // Load immediate
 static inline uint32 _uint32_li( uint32 a )
 {
-  return (a);
+    return (a);
 }
 // Decrement
 static inline uint32 _uint32_dec( uint32 a )
 {
-  return (a - 1);
+    return (a - 1);
 }
 // Increment
 static inline uint32 _uint32_inc( uint32 a )
 {
  return (a + 1);
 }
 // Complement
 static inline uint32 _uint32_not( uint32 a )
 {
-  return (~a);
+    return (~a);
 }
 // Negate
 static inline uint32 _uint32_neg( uint32 a )
 {
-#if NV_CC_MSVC
+#pragma warning(disable : 4146)     // unary minus operator applied to unsigned type, result still unsigned
-  // prevent msvc warning.
+    return (-a);
-  return ~a + 1;
+#pragma warning(default : 4146)
 #else
  return (-a);
 #endif
 }
 // Extend sign
 static inline uint32 _uint32_ext( uint32 a )
 {
-  return (((int32)a)>>31);
+    return (((int32)a)>>31);
 }
 // And
 static inline uint32 _uint32_and( uint32 a, uint32 b )
 {
-  return (a & b);
+    return (a & b);
 }
 // And with Complement
 static inline uint32 _uint32_andc( uint32 a, uint32 b )
 {
-  return (a & ~b);
+    return (a & ~b);
 }
 // Or
 static inline uint32 _uint32_or( uint32 a, uint32 b )
 {
-  return (a | b);
+    return (a | b);
 }
 // Shift Right Logical
 static inline uint32 _uint32_srl( uint32 a, int sa )
 {
-  return (a >> sa);
+    return (a >> sa);
 }
 // Shift Left Logical
 static inline uint32 _uint32_sll( uint32 a, int sa )
 {
-  return (a << sa);
+    return (a << sa);
 }
 // Add
 static inline uint32 _uint32_add( uint32 a, uint32 b )
 {
-  return (a + b);
+    return (a + b);
 }
 // Subtract
 static inline uint32 _uint32_sub( uint32 a, uint32 b )
 {
-  return (a - b);
+    return (a - b);
 }
 // Select on Sign bit
 static inline uint32 _uint32_sels( uint32 test, uint32 a, uint32 b )
 {
-  const uint32 mask   = _uint32_ext( test );
+    const uint32 mask   = _uint32_ext( test );
-  const uint32 sel_a  = _uint32_and(  a,     mask  );
+    const uint32 sel_a  = _uint32_and(  a,     mask  );
-  const uint32 sel_b  = _uint32_andc( b,     mask  );
+    const uint32 sel_b  = _uint32_andc( b,     mask  );
-  const uint32 result = _uint32_or(   sel_a, sel_b );
+    const uint32 result = _uint32_or(   sel_a, sel_b );
-  return (result);
+    return (result);
 }
 // Load Immediate
 static inline uint16 _uint16_li( uint16 a )
 {
-  return (a);
+    return (a);
 }
 // Extend sign
 static inline uint16 _uint16_ext( uint16 a )
 {
-  return (((int16)a)>>15);
+    return (((int16)a)>>15);
 }
 // Negate
 static inline uint16 _uint16_neg( uint16 a )
 {
-  return (-a);
+    return (-a);
 }
 // Complement
 static inline uint16 _uint16_not( uint16 a )
 {
-  return (~a);
+    return (~a);
 }
 // Decrement
 static inline uint16 _uint16_dec( uint16 a )
 {
-  return (a - 1);
+    return (a - 1);
 }
 // Shift Left Logical
 static inline uint16 _uint16_sll( uint16 a, int sa )
 {
-  return (a << sa);
+    return (a << sa);
 }
 // Shift Right Logical
 static inline uint16 _uint16_srl( uint16 a, int sa )
 {
-  return (a >> sa);
+    return (a >> sa);
 }
 // Add
 static inline uint16 _uint16_add( uint16 a, uint16 b )
 {
-  return (a + b);
+    return (a + b);
 }
 // Subtract
 static inline uint16 _uint16_sub( uint16 a, uint16 b )
 {
-  return (a - b);
+    return (a - b);
 }
 // And
 static inline uint16 _uint16_and( uint16 a, uint16 b )
 {
-  return (a & b);
+    return (a & b);
 }
 // Or
 static inline uint16 _uint16_or( uint16 a, uint16 b )
 {
-  return (a | b);
+    return (a | b);
 }
 // Exclusive Or
 static inline uint16 _uint16_xor( uint16 a, uint16 b )
 {
-  return (a ^ b);
+    return (a ^ b);
 }
 // And with Complement
 static inline uint16 _uint16_andc( uint16 a, uint16 b )
 {
-  return (a & ~b);
+    return (a & ~b);
 }
 // And then Shift Right Logical
 static inline uint16 _uint16_andsrl( uint16 a, uint16 b, int sa )
 {
-  return ((a & b) >> sa);
+    return ((a & b) >> sa);
 }
 // Shift Right Logical then Mask
 static inline uint16 _uint16_srlm( uint16 a, int sa, uint16 mask )
 {
-  return ((a >> sa) & mask);
+    return ((a >> sa) & mask);
 }
 // Add then Mask
 static inline uint16 _uint16_addm( uint16 a, uint16 b, uint16 mask )
 {
-  return ((a + b) & mask);
+    return ((a + b) & mask);
 }
 // Select on Sign bit
 static inline uint16 _uint16_sels( uint16 test, uint16 a, uint16 b )
 {
-  const uint16 mask   = _uint16_ext( test );
+    const uint16 mask   = _uint16_ext( test );
-  const uint16 sel_a  = _uint16_and(  a,     mask  );
+    const uint16 sel_a  = _uint16_and(  a,     mask  );
-  const uint16 sel_b  = _uint16_andc( b,     mask  );
+    const uint16 sel_b  = _uint16_andc( b,     mask  );
-  const uint16 result = _uint16_or(   sel_a, sel_b );
+    const uint16 result = _uint16_or(   sel_a, sel_b );
-  return (result);
+    return (result);
 }
 #if NV_CC_MSVC
 #include <intrin.h>
 #pragma intrinsic(_BitScanReverse)
 uint32 _uint32_nlz( uint32 x ) {
    unsigned long index;
    _BitScanReverse(&index, x);
    return 31 - index;
 }
 #endif
 // Count Leading Zeros
 static inline uint32 _uint32_cntlz( uint32 x )
 {
-#ifdef __GNUC__
+#if NV_CC_GCC
-  /* On PowerPC, this will map to insn: cntlzw */
+    /* On PowerPC, this will map to insn: cntlzw */
-  /* On Pentium, this will map to insn: clz    */
+    /* On Pentium, this will map to insn: clz    */
-  uint32 nlz = __builtin_clz( x );
+    uint32 is_x_nez_msb = _uint32_neg( x );
-  return (nlz);
+    uint32 nlz          = __builtin_clz( x );
    uint32 result       = _uint32_sels( is_x_nez_msb, nlz, 0x00000020 );
    return (result);
 #elif NV_CC_MSVC
    uint32 is_x_nez_msb = _uint32_neg( x );
    uint32 nlz          = _uint32_nlz( x );
    uint32 result       = _uint32_sels( is_x_nez_msb, nlz, 0x00000020 );
    return (result);
 #else
-  const uint32 x0  = _uint32_srl(  x,  1 );
+    const uint32 x0  = _uint32_srl(  x,  1 );
-  const uint32 x1  = _uint32_or(   x,  x0 );
+    const uint32 x1  = _uint32_or(   x,  x0 );
-  const uint32 x2  = _uint32_srl(  x1, 2 );
+    const uint32 x2  = _uint32_srl(  x1, 2 );
-  const uint32 x3  = _uint32_or(   x1, x2 );
+    const uint32 x3  = _uint32_or(   x1, x2 );
-  const uint32 x4  = _uint32_srl(  x3, 4 );
+    const uint32 x4  = _uint32_srl(  x3, 4 );
-  const uint32 x5  = _uint32_or(   x3, x4 );
+    const uint32 x5  = _uint32_or(   x3, x4 );
-  const uint32 x6  = _uint32_srl(  x5, 8 );
+    const uint32 x6  = _uint32_srl(  x5, 8 );
-  const uint32 x7  = _uint32_or(   x5, x6 );
+    const uint32 x7  = _uint32_or(   x5, x6 );
-  const uint32 x8  = _uint32_srl(  x7, 16 );
+    const uint32 x8  = _uint32_srl(  x7, 16 );
-  const uint32 x9  = _uint32_or(   x7, x8 );
+    const uint32 x9  = _uint32_or(   x7, x8 );
-  const uint32 xA  = _uint32_not(  x9 );
+    const uint32 xA  = _uint32_not(  x9 );
-  const uint32 xB  = _uint32_srl(  xA, 1 );
+    const uint32 xB  = _uint32_srl(  xA, 1 );
-  const uint32 xC  = _uint32_and(  xB, 0x55555555 );
+    const uint32 xC  = _uint32_and(  xB, 0x55555555 );
-  const uint32 xD  = _uint32_sub(  xA, xC );
+    const uint32 xD  = _uint32_sub(  xA, xC );
-  const uint32 xE  = _uint32_and(  xD, 0x33333333 );
+    const uint32 xE  = _uint32_and(  xD, 0x33333333 );
-  const uint32 xF  = _uint32_srl(  xD, 2 );
+    const uint32 xF  = _uint32_srl(  xD, 2 );
-  const uint32 x10 = _uint32_and(  xF, 0x33333333 );
+    const uint32 x10 = _uint32_and(  xF, 0x33333333 );
-  const uint32 x11 = _uint32_add(  xE, x10 );
+    const uint32 x11 = _uint32_add(  xE, x10 );
-  const uint32 x12 = _uint32_srl(  x11, 4 );
+    const uint32 x12 = _uint32_srl(  x11, 4 );
-  const uint32 x13 = _uint32_add(  x11, x12 );
+    const uint32 x13 = _uint32_add(  x11, x12 );
-  const uint32 x14 = _uint32_and(  x13, 0x0f0f0f0f );
+    const uint32 x14 = _uint32_and(  x13, 0x0f0f0f0f );
-  const uint32 x15 = _uint32_srl(  x14, 8 );
+    const uint32 x15 = _uint32_srl(  x14, 8 );
-  const uint32 x16 = _uint32_add(  x14, x15 );
+    const uint32 x16 = _uint32_add(  x14, x15 );
-  const uint32 x17 = _uint32_srl(  x16, 16 );
+    const uint32 x17 = _uint32_srl(  x16, 16 );
-  const uint32 x18 = _uint32_add(  x16, x17 );
+    const uint32 x18 = _uint32_add(  x16, x17 );
-  const uint32 x19 = _uint32_and(  x18, 0x0000003f );
+    const uint32 x19 = _uint32_and(  x18, 0x0000003f );
-  return ( x19 );
+    return ( x19 );
 #endif
 }
@ -315,249 +337,187 @@ static inline uint32 _uint32_cntlz( uint32 x )
 static inline uint16 _uint16_cntlz( uint16 x )
 {
 #ifdef __GNUC__
-  /* On PowerPC, this will map to insn: cntlzw */
+    /* On PowerPC, this will map to insn: cntlzw */
-  /* On Pentium, this will map to insn: clz    */
+    /* On Pentium, this will map to insn: clz    */
-  uint32 x32   = _uint32_sll( x, 16 );
+    uint16 nlz32 = (uint16)_uint32_cntlz( (uint32)x );
-  uint16 nlz   = (uint16)__builtin_clz( x32 );
+    uint32 nlz   = _uint32_sub( nlz32, 16 );
-  return (nlz);
+    return (nlz);
 #else
-  const uint16 x0  = _uint16_srl(  x,  1 );
+    const uint16 x0  = _uint16_srl(  x,  1 );
-  const uint16 x1  = _uint16_or(   x,  x0 );
+    const uint16 x1  = _uint16_or(   x,  x0 );
-  const uint16 x2  = _uint16_srl(  x1, 2 );
+    const uint16 x2  = _uint16_srl(  x1, 2 );
-  const uint16 x3  = _uint16_or(   x1, x2 );
+    const uint16 x3  = _uint16_or(   x1, x2 );
-  const uint16 x4  = _uint16_srl(  x3, 4 );
+    const uint16 x4  = _uint16_srl(  x3, 4 );
-  const uint16 x5  = _uint16_or(   x3, x4 );
+    const uint16 x5  = _uint16_or(   x3, x4 );
-  const uint16 x6  = _uint16_srl(  x5, 8 );
+    const uint16 x6  = _uint16_srl(  x5, 8 );
-  const uint16 x7  = _uint16_or(   x5, x6 );
+    const uint16 x7  = _uint16_or(   x5, x6 );
-  const uint16 x8  = _uint16_not(  x7 );
+    const uint16 x8  = _uint16_not(  x7 );
-  const uint16 x9  = _uint16_srlm( x8, 1, 0x5555 );
+    const uint16 x9  = _uint16_srlm( x8, 1, 0x5555 );
-  const uint16 xA  = _uint16_sub(  x8, x9 );
+    const uint16 xA  = _uint16_sub(  x8, x9 );
-  const uint16 xB  = _uint16_and(  xA, 0x3333 );
+    const uint16 xB  = _uint16_and(  xA, 0x3333 );
-  const uint16 xC  = _uint16_srlm( xA, 2, 0x3333 );
+    const uint16 xC  = _uint16_srlm( xA, 2, 0x3333 );
-  const uint16 xD  = _uint16_add(  xB, xC );
+    const uint16 xD  = _uint16_add(  xB, xC );
-  const uint16 xE  = _uint16_srl(  xD, 4 );
+    const uint16 xE  = _uint16_srl(  xD, 4 );
-  const uint16 xF  = _uint16_addm( xD, xE, 0x0f0f );
+    const uint16 xF  = _uint16_addm( xD, xE, 0x0f0f );
-  const uint16 x10 = _uint16_srl(  xF, 8 );
+    const uint16 x10 = _uint16_srl(  xF, 8 );
-  const uint16 x11 = _uint16_addm( xF, x10, 0x001f );
+    const uint16 x11 = _uint16_addm( xF, x10, 0x001f );
-  return ( x11 );
+    return ( x11 );
 #endif
 }
 uint16
-half_from_float( uint32 f )
+nv::half_from_float( uint32 f )
 {
-  const uint32 one                        = _uint32_li( 0x00000001 );
+    const uint32 one                        = _uint32_li( 0x00000001 );
-  const uint32 f_e_mask                   = _uint32_li( 0x7f800000 );
+    const uint32 f_s_mask                   = _uint32_li( 0x80000000 );
-  const uint32 f_m_mask                   = _uint32_li( 0x007fffff );
+    const uint32 f_e_mask                   = _uint32_li( 0x7f800000 );
-  const uint32 f_s_mask                   = _uint32_li( 0x80000000 );
+    const uint32 f_m_mask                   = _uint32_li( 0x007fffff );
-  const uint32 h_e_mask                   = _uint32_li( 0x00007c00 );
+    const uint32 f_m_hidden_bit             = _uint32_li( 0x00800000 );
-  const uint32 f_e_pos                    = _uint32_li( 0x00000017 );
+    const uint32 f_m_round_bit              = _uint32_li( 0x00001000 );
-  const uint32 f_m_round_bit              = _uint32_li( 0x00001000 );
+    const uint32 f_snan_mask                = _uint32_li( 0x7fc00000 );
-  const uint32 h_nan_em_min               = _uint32_li( 0x00007c01 );
+    const uint32 f_e_pos                    = _uint32_li( 0x00000017 );
-  const uint32 f_h_s_pos_offset           = _uint32_li( 0x00000010 );
+    const uint32 h_e_pos                    = _uint32_li( 0x0000000a );
-  const uint32 f_m_hidden_bit             = _uint32_li( 0x00800000 );
+    const uint32 h_e_mask                   = _uint32_li( 0x00007c00 );
-  const uint32 f_h_m_pos_offset           = _uint32_li( 0x0000000d );
+    const uint32 h_snan_mask                = _uint32_li( 0x00007e00 );
-  const uint32 f_h_bias_offset            = _uint32_li( 0x38000000 );
+    const uint32 h_e_mask_value             = _uint32_li( 0x0000001f );
-  const uint32 f_m_snan_mask              = _uint32_li( 0x003fffff );
+    const uint32 f_h_s_pos_offset           = _uint32_li( 0x00000010 );
-  const uint16 h_snan_mask                = _uint32_li( 0x00007e00 );
+    const uint32 f_h_bias_offset            = _uint32_li( 0x00000070 );
-  const uint32 f_e                        = _uint32_and( f, f_e_mask  );
+    const uint32 f_h_m_pos_offset           = _uint32_li( 0x0000000d );
-  const uint32 f_m                        = _uint32_and( f, f_m_mask  );
+    const uint32 h_nan_min                  = _uint32_li( 0x00007c01 );
-  const uint32 f_s                        = _uint32_and( f, f_s_mask  );
+    const uint32 f_h_e_biased_flag          = _uint32_li( 0x0000008f );
-  const uint32 f_e_h_bias                 = _uint32_sub( f_e,               f_h_bias_offset );
+    const uint32 f_s                        = _uint32_and( f,               f_s_mask         );
-  const uint32 f_e_h_bias_amount          = _uint32_srl( f_e_h_bias,        f_e_pos         );
+    const uint32 f_e                        = _uint32_and( f,               f_e_mask         );
-  const uint32 f_m_round_mask             = _uint32_and( f_m,               f_m_round_bit     );
+    const uint16 h_s                        = _uint32_srl( f_s,             f_h_s_pos_offset );
-  const uint32 f_m_round_offset           = _uint32_sll( f_m_round_mask,    one               );
+    const uint32 f_m                        = _uint32_and( f,               f_m_mask         );
-  const uint32 f_m_rounded                = _uint32_add( f_m,               f_m_round_offset  );
+    const uint16 f_e_amount                 = _uint32_srl( f_e,             f_e_pos          );
-  const uint32 f_m_rounded_overflow       = _uint32_and( f_m_rounded,       f_m_hidden_bit    );
+    const uint32 f_e_half_bias              = _uint32_sub( f_e_amount,      f_h_bias_offset  );
-  const uint32 f_m_denorm_sa              = _uint32_sub( one,               f_e_h_bias_amount );
+    const uint32 f_snan                     = _uint32_and( f,               f_snan_mask      );
-  const uint32 f_m_with_hidden            = _uint32_or(  f_m_rounded,       f_m_hidden_bit    );
+    const uint32 f_m_round_mask             = _uint32_and( f_m,             f_m_round_bit    );
-  const uint32 f_m_denorm                 = _uint32_srl( f_m_with_hidden,   f_m_denorm_sa     );
+    const uint32 f_m_round_offset           = _uint32_sll( f_m_round_mask,  one              );
-  const uint32 f_em_norm_packed           = _uint32_or(  f_e_h_bias,        f_m_rounded       );
+    const uint32 f_m_rounded                = _uint32_add( f_m,             f_m_round_offset );
-  const uint32 f_e_overflow               = _uint32_add( f_e_h_bias,        f_m_hidden_bit    );
+    const uint32 f_m_denorm_sa              = _uint32_sub( one,             f_e_half_bias    );
-  const uint32 h_s                        = _uint32_srl( f_s,               f_h_s_pos_offset );
+    const uint32 f_m_with_hidden            = _uint32_or(  f_m_rounded,     f_m_hidden_bit   );
-  const uint32 h_m_nan                    = _uint32_srl( f_m,               f_h_m_pos_offset );
+    const uint32 f_m_denorm                 = _uint32_srl( f_m_with_hidden, f_m_denorm_sa    );
-  const uint32 h_m_denorm                 = _uint32_srl( f_m_denorm,        f_h_m_pos_offset );
+    const uint32 h_m_denorm                 = _uint32_srl( f_m_denorm,      f_h_m_pos_offset );
-  const uint32 h_em_norm                  = _uint32_srl( f_em_norm_packed,  f_h_m_pos_offset );
+    const uint32 f_m_rounded_overflow       = _uint32_and( f_m_rounded,     f_m_hidden_bit   );
-  const uint32 h_em_overflow              = _uint32_srl( f_e_overflow,      f_h_m_pos_offset );
+    const uint32 m_nan                      = _uint32_srl( f_m,             f_h_m_pos_offset );
-  const uint32 is_e_eqz_msb               = _uint32_dec(  f_e     );
+    const uint32 h_em_nan                   = _uint32_or(  h_e_mask,        m_nan            );
-  const uint32 is_m_nez_msb               = _uint32_neg(  f_m     );
+    const uint32 h_e_norm_overflow_offset   = _uint32_inc( f_e_half_bias );
-  const uint32 is_h_m_nan_nez_msb         = _uint32_neg(  h_m_nan );
+    const uint32 h_e_norm_overflow          = _uint32_sll( h_e_norm_overflow_offset, h_e_pos          );
-  const uint32 is_e_nflagged_msb          = _uint32_sub(  f_e,                 f_e_mask          );
+    const uint32 h_e_norm                   = _uint32_sll( f_e_half_bias,            h_e_pos          );
-  const uint32 is_ninf_msb                = _uint32_or(   is_e_nflagged_msb,   is_m_nez_msb      );
+    const uint32 h_m_norm                   = _uint32_srl( f_m_rounded,              f_h_m_pos_offset );
-  const uint32 is_underflow_msb           = _uint32_sub(  is_e_eqz_msb,        f_h_bias_offset   );
+    const uint32 h_em_norm                  = _uint32_or(  h_e_norm,                 h_m_norm         );
-  const uint32 is_nan_nunderflow_msb      = _uint32_or(   is_h_m_nan_nez_msb,  is_e_nflagged_msb );
+    const uint32 is_h_ndenorm_msb           = _uint32_sub( f_h_bias_offset,   f_e_amount    );
-  const uint32 is_m_snan_msb              = _uint32_sub(  f_m_snan_mask,       f_m               );
+    const uint32 is_f_e_flagged_msb         = _uint32_sub( f_h_e_biased_flag, f_e_half_bias );
-  const uint32 is_snan_msb                = _uint32_andc( is_m_snan_msb,       is_e_nflagged_msb );
+    const uint32 is_h_denorm_msb            = _uint32_not( is_h_ndenorm_msb );
-  const uint32 is_overflow_msb            = _uint32_neg(  f_m_rounded_overflow );
+    const uint32 is_f_m_eqz_msb             = _uint32_dec( f_m   );
-  const uint32 h_nan_underflow_result     = _uint32_sels( is_nan_nunderflow_msb, h_em_norm,                h_nan_em_min       );
+    const uint32 is_h_nan_eqz_msb           = _uint32_dec( m_nan );
-  const uint32 h_inf_result               = _uint32_sels( is_ninf_msb,           h_nan_underflow_result,   h_e_mask           );
+    const uint32 is_f_inf_msb               = _uint32_and( is_f_e_flagged_msb, is_f_m_eqz_msb   );
-  const uint32 h_underflow_result         = _uint32_sels( is_underflow_msb,      h_m_denorm,               h_inf_result       );
+    const uint32 is_f_nan_underflow_msb     = _uint32_and( is_f_e_flagged_msb, is_h_nan_eqz_msb );
-  const uint32 h_overflow_result          = _uint32_sels( is_overflow_msb,       h_em_overflow,            h_underflow_result );
+    const uint32 is_e_overflow_msb          = _uint32_sub( h_e_mask_value,     f_e_half_bias    );
-  const uint32 h_em_result                = _uint32_sels( is_snan_msb,           h_snan_mask,              h_overflow_result  );
+    const uint32 is_h_inf_msb               = _uint32_or(  is_e_overflow_msb,  is_f_inf_msb     );
-  const uint32 h_result                   = _uint32_or( h_em_result, h_s );
+    const uint32 is_f_nsnan_msb             = _uint32_sub( f_snan,             f_snan_mask      );
    const uint32 is_m_norm_overflow_msb     = _uint32_neg( f_m_rounded_overflow );
    const uint32 is_f_snan_msb              = _uint32_not( is_f_nsnan_msb );
    const uint32 h_em_overflow_result       = _uint32_sels( is_m_norm_overflow_msb, h_e_norm_overflow, h_em_norm                 );
    const uint32 h_em_nan_result            = _uint32_sels( is_f_e_flagged_msb,     h_em_nan,          h_em_overflow_result      );
    const uint32 h_em_nan_underflow_result  = _uint32_sels( is_f_nan_underflow_msb, h_nan_min,         h_em_nan_result           );
    const uint32 h_em_inf_result            = _uint32_sels( is_h_inf_msb,           h_e_mask,          h_em_nan_underflow_result );
    const uint32 h_em_denorm_result         = _uint32_sels( is_h_denorm_msb,        h_m_denorm,        h_em_inf_result           );
    const uint32 h_em_snan_result           = _uint32_sels( is_f_snan_msb,          h_snan_mask,       h_em_denorm_result        );
    const uint32 h_result                   = _uint32_or( h_s, h_em_snan_result );
-  return (h_result);
+    return (uint16)(h_result);
 }
 uint32 
-half_to_float( uint16 h )
+nv::half_to_float( uint16 h )
 {
-  const uint32 h_e_mask              = _uint32_li( 0x00007c00 );
+    const uint32 h_e_mask              = _uint32_li( 0x00007c00 );
-  const uint32 h_m_mask              = _uint32_li( 0x000003ff );
+    const uint32 h_m_mask              = _uint32_li( 0x000003ff );
-  const uint32 h_s_mask              = _uint32_li( 0x00008000 );
+    const uint32 h_s_mask              = _uint32_li( 0x00008000 );
-  const uint32 h_f_s_pos_offset      = _uint32_li( 0x00000010 );
+    const uint32 h_f_s_pos_offset      = _uint32_li( 0x00000010 );
-  const uint32 h_f_e_pos_offset      = _uint32_li( 0x0000000d );
+    const uint32 h_f_e_pos_offset      = _uint32_li( 0x0000000d );
-  const uint32 h_f_bias_offset       = _uint32_li( 0x0001c000 );
+    const uint32 h_f_bias_offset       = _uint32_li( 0x0001c000 );
-  const uint32 f_e_mask              = _uint32_li( 0x7f800000 );
+    const uint32 f_e_mask              = _uint32_li( 0x7f800000 );
-  const uint32 f_m_mask              = _uint32_li( 0x007fffff );
+    const uint32 f_m_mask              = _uint32_li( 0x007fffff );
-  const uint32 h_f_e_denorm_bias     = _uint32_li( 0x0000007e );
+    const uint32 h_f_e_denorm_bias     = _uint32_li( 0x0000007e );
-  const uint32 h_f_m_denorm_sa_bias  = _uint32_li( 0x00000008 );
+    const uint32 h_f_m_denorm_sa_bias  = _uint32_li( 0x00000008 );
-  const uint32 f_e_pos               = _uint32_li( 0x00000017 );
+    const uint32 f_e_pos               = _uint32_li( 0x00000017 );
-  const uint32 h_e_mask_minus_one    = _uint32_li( 0x00007bff );
+    const uint32 h_e_mask_minus_one    = _uint32_li( 0x00007bff );
-  const uint32 h_e                   = _uint32_and( h, h_e_mask );
+    const uint32 h_e                   = _uint32_and( h, h_e_mask );
-  const uint32 h_m                   = _uint32_and( h, h_m_mask );
+    const uint32 h_m                   = _uint32_and( h, h_m_mask );
-  const uint32 h_s                   = _uint32_and( h, h_s_mask );
+    const uint32 h_s                   = _uint32_and( h, h_s_mask );
-  const uint32 h_e_f_bias            = _uint32_add( h_e, h_f_bias_offset );
+    const uint32 h_e_f_bias            = _uint32_add( h_e, h_f_bias_offset );
-  const uint32 h_m_nlz               = _uint32_cntlz( h_m );
+    const uint32 h_m_nlz               = _uint32_cntlz( h_m );
-  const uint32 f_s                   = _uint32_sll( h_s,        h_f_s_pos_offset );
+    const uint32 f_s                   = _uint32_sll( h_s,        h_f_s_pos_offset );
-  const uint32 f_e                   = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
+    const uint32 f_e                   = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
-  const uint32 f_m                   = _uint32_sll( h_m,        h_f_e_pos_offset );
+    const uint32 f_m                   = _uint32_sll( h_m,        h_f_e_pos_offset );
-  const uint32 f_em                  = _uint32_or(  f_e,        f_m              );
+    const uint32 f_em                  = _uint32_or(  f_e,        f_m              );
-  const uint32 h_f_m_sa              = _uint32_sub( h_m_nlz,             h_f_m_denorm_sa_bias );
+    const uint32 h_f_m_sa              = _uint32_sub( h_m_nlz,             h_f_m_denorm_sa_bias );
-  const uint32 f_e_denorm_unpacked   = _uint32_sub( h_f_e_denorm_bias,   h_f_m_sa             );
+    const uint32 f_e_denorm_unpacked   = _uint32_sub( h_f_e_denorm_bias,   h_f_m_sa             );
-  const uint32 h_f_m                 = _uint32_sll( h_m,                 h_f_m_sa             );
+    const uint32 h_f_m                 = _uint32_sll( h_m,                 h_f_m_sa             );
-  const uint32 f_m_denorm            = _uint32_and( h_f_m,               f_m_mask             );
+    const uint32 f_m_denorm            = _uint32_and( h_f_m,               f_m_mask             );
-  const uint32 f_e_denorm            = _uint32_sll( f_e_denorm_unpacked, f_e_pos              );
+    const uint32 f_e_denorm            = _uint32_sll( f_e_denorm_unpacked, f_e_pos              );
-  const uint32 f_em_denorm           = _uint32_or(  f_e_denorm,          f_m_denorm           );
+    const uint32 f_em_denorm           = _uint32_or(  f_e_denorm,          f_m_denorm           );
-  const uint32 f_em_nan              = _uint32_or(  f_e_mask,            f_m                  );
+    const uint32 f_em_nan              = _uint32_or(  f_e_mask,            f_m                  );
-  const uint32 is_e_eqz_msb          = _uint32_dec(  h_e );
+    const uint32 is_e_eqz_msb          = _uint32_dec(  h_e );
-  const uint32 is_m_nez_msb          = _uint32_neg(  h_m );
+    const uint32 is_m_nez_msb          = _uint32_neg(  h_m );
-  const uint32 is_e_flagged_msb      = _uint32_sub(  h_e_mask_minus_one, h_e );
+    const uint32 is_e_flagged_msb      = _uint32_sub(  h_e_mask_minus_one, h_e );
-  const uint32 is_zero_msb           = _uint32_andc( is_e_eqz_msb,       is_m_nez_msb );
+    const uint32 is_zero_msb           = _uint32_andc( is_e_eqz_msb,       is_m_nez_msb );
-  const uint32 is_inf_msb            = _uint32_andc( is_e_flagged_msb,   is_m_nez_msb );
+    const uint32 is_inf_msb            = _uint32_andc( is_e_flagged_msb,   is_m_nez_msb );
-  const uint32 is_denorm_msb         = _uint32_and(  is_m_nez_msb,       is_e_eqz_msb );
+    const uint32 is_denorm_msb         = _uint32_and(  is_m_nez_msb,       is_e_eqz_msb );
-  const uint32 is_nan_msb            = _uint32_and(  is_e_flagged_msb,   is_m_nez_msb ); 
+    const uint32 is_nan_msb            = _uint32_and(  is_e_flagged_msb,   is_m_nez_msb ); 
-  const uint32 is_zero               = _uint32_ext(  is_zero_msb );
+    const uint32 is_zero               = _uint32_ext(  is_zero_msb );
-  const uint32 f_zero_result         = _uint32_andc( f_em, is_zero );
+    const uint32 f_zero_result         = _uint32_andc( f_em, is_zero );
-  const uint32 f_denorm_result       = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result );
+    const uint32 f_denorm_result       = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result );
-  const uint32 f_inf_result          = _uint32_sels( is_inf_msb,    f_e_mask,    f_denorm_result );
+    const uint32 f_inf_result          = _uint32_sels( is_inf_msb,    f_e_mask,    f_denorm_result );
-  const uint32 f_nan_result          = _uint32_sels( is_nan_msb,    f_em_nan,    f_inf_result    );
+    const uint32 f_nan_result          = _uint32_sels( is_nan_msb,    f_em_nan,    f_inf_result    );
-  const uint32 f_result              = _uint32_or( f_s, f_nan_result );
+    const uint32 f_result              = _uint32_or( f_s, f_nan_result );
- 
+
-  return (f_result);
+    return (f_result);
 }
-uint16
+uint32 
-half_add( uint16 x, uint16 y )
+nv::fast_half_to_float( uint16 h )
 {
-  const uint16 one                       = _uint16_li( 0x0001 );
+    const uint32 h_e_mask              = _uint32_li( 0x00007c00 );
-  const uint16 msb_to_lsb_sa             = _uint16_li( 0x000f );
+    const uint32 h_m_mask              = _uint32_li( 0x000003ff );
-  const uint16 h_s_mask                  = _uint16_li( 0x8000 );
+    const uint32 h_s_mask              = _uint32_li( 0x00008000 );
-  const uint16 h_e_mask                  = _uint16_li( 0x7c00 );
+    const uint32 h_f_s_pos_offset      = _uint32_li( 0x00000010 );
-  const uint16 h_m_mask                  = _uint16_li( 0x03ff );
+    const uint32 h_f_e_pos_offset      = _uint32_li( 0x0000000d );
-  const uint16 h_m_msb_mask              = _uint16_li( 0x2000 );
+    const uint32 h_f_bias_offset       = _uint32_li( 0x0001c000 );
-  const uint16 h_m_msb_sa                = _uint16_li( 0x000d );
+    const uint32 f_e_mask              = _uint32_li( 0x7f800000 );
-  const uint16 h_m_hidden                = _uint16_li( 0x0400 );
+    const uint32 f_m_mask              = _uint32_li( 0x007fffff );
-  const uint16 h_e_pos                   = _uint16_li( 0x000a );
+    const uint32 h_f_e_denorm_bias     = _uint32_li( 0x0000007e );
-  const uint16 h_e_bias_minus_one        = _uint16_li( 0x000e );
+    const uint32 h_f_m_denorm_sa_bias  = _uint32_li( 0x00000008 );
-  const uint16 h_m_grs_carry             = _uint16_li( 0x4000 );
+    const uint32 f_e_pos               = _uint32_li( 0x00000017 );
-  const uint16 h_m_grs_carry_pos         = _uint16_li( 0x000e );
+    const uint32 h_e_mask_minus_one    = _uint32_li( 0x00007bff );
-  const uint16 h_grs_size                = _uint16_li( 0x0003 );
+    const uint32 h_e                   = _uint32_and( h, h_e_mask );
-  const uint16 h_snan                    = _uint16_li( 0xfe00 );
+    const uint32 h_m                   = _uint32_and( h, h_m_mask );
-  const uint16 h_e_mask_minus_one        = _uint16_li( 0x7bff );
+    const uint32 h_s                   = _uint32_and( h, h_s_mask );
-  const uint16 h_grs_round_carry         = _uint16_sll( one, h_grs_size );
+    const uint32 h_e_f_bias            = _uint32_add( h_e, h_f_bias_offset );
-  const uint16 h_grs_round_mask          = _uint16_sub( h_grs_round_carry, one );
+    const uint32 h_m_nlz               = _uint32_cntlz( h_m );
-  const uint16 x_e                       = _uint16_and( x, h_e_mask );
+    const uint32 f_s                   = _uint32_sll( h_s,        h_f_s_pos_offset );
-  const uint16 y_e                       = _uint16_and( y, h_e_mask );
+    const uint32 f_e                   = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
-  const uint16 is_y_e_larger_msb         = _uint16_sub( x_e, y_e );
+    const uint32 f_m                   = _uint32_sll( h_m,        h_f_e_pos_offset );
-  const uint16 a                         = _uint16_sels( is_y_e_larger_msb, y, x);
+    const uint32 f_em                  = _uint32_or(  f_e,        f_m              );
-  const uint16 a_s                       = _uint16_and( a, h_s_mask );
+    const uint32 h_f_m_sa              = _uint32_sub( h_m_nlz,             h_f_m_denorm_sa_bias );
-  const uint16 a_e                       = _uint16_and( a, h_e_mask );
+    const uint32 f_e_denorm_unpacked   = _uint32_sub( h_f_e_denorm_bias,   h_f_m_sa             );
-  const uint16 a_m_no_hidden_bit         = _uint16_and( a, h_m_mask );
+    const uint32 h_f_m                 = _uint32_sll( h_m,                 h_f_m_sa             );
-  const uint16 a_em_no_hidden_bit        = _uint16_or( a_e, a_m_no_hidden_bit );
+    const uint32 f_m_denorm            = _uint32_and( h_f_m,               f_m_mask             );
-  const uint16 b                         = _uint16_sels( is_y_e_larger_msb, x, y);
+    const uint32 f_e_denorm            = _uint32_sll( f_e_denorm_unpacked, f_e_pos              );
-  const uint16 b_s                       = _uint16_and( b, h_s_mask );
+    const uint32 f_em_denorm           = _uint32_or(  f_e_denorm,          f_m_denorm           );
-  const uint16 b_e                       = _uint16_and( b, h_e_mask );
+    const uint32 f_em_nan              = _uint32_or(  f_e_mask,            f_m                  );
-  const uint16 b_m_no_hidden_bit         = _uint16_and( b, h_m_mask );
+    const uint32 is_e_eqz_msb          = _uint32_dec(  h_e );
-  const uint16 b_em_no_hidden_bit        = _uint16_or( b_e, b_m_no_hidden_bit );
+    const uint32 is_m_nez_msb          = _uint32_neg(  h_m );
-  const uint16 is_diff_sign_msb          = _uint16_xor( a_s, b_s );
+    const uint32 is_e_flagged_msb      = _uint32_sub(  h_e_mask_minus_one, h_e );
-  const uint16 is_a_inf_msb              = _uint16_sub( h_e_mask_minus_one, a_em_no_hidden_bit );
+    const uint32 is_zero_msb           = _uint32_andc( is_e_eqz_msb,       is_m_nez_msb );
-  const uint16 is_b_inf_msb              = _uint16_sub( h_e_mask_minus_one, b_em_no_hidden_bit );
+    const uint32 is_denorm_msb         = _uint32_and(  is_m_nez_msb,       is_e_eqz_msb );
-  const uint16 is_undenorm_msb           = _uint16_dec( a_e );
+    const uint32 is_zero               = _uint32_ext(  is_zero_msb );
-  const uint16 is_undenorm               = _uint16_ext( is_undenorm_msb );
+    const uint32 f_zero_result         = _uint32_andc( f_em, is_zero );
-  const uint16 is_both_inf_msb           = _uint16_and( is_a_inf_msb, is_b_inf_msb );
+    const uint32 f_denorm_result       = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result );
-  const uint16 is_invalid_inf_op_msb     = _uint16_and( is_both_inf_msb, b_s );
+    const uint32 f_result              = _uint32_or( f_s, f_denorm_result );
  const uint16 is_a_e_nez_msb            = _uint16_neg( a_e );
  const uint16 is_b_e_nez_msb            = _uint16_neg( b_e );
  const uint16 is_a_e_nez                = _uint16_ext( is_a_e_nez_msb );
  const uint16 is_b_e_nez                = _uint16_ext( is_b_e_nez_msb );
  const uint16 a_m_hidden_bit            = _uint16_and( is_a_e_nez, h_m_hidden );
  const uint16 b_m_hidden_bit            = _uint16_and( is_b_e_nez, h_m_hidden );
  const uint16 a_m_no_grs                = _uint16_or( a_m_no_hidden_bit, a_m_hidden_bit );
  const uint16 b_m_no_grs                = _uint16_or( b_m_no_hidden_bit, b_m_hidden_bit );
  const uint16 diff_e                    = _uint16_sub( a_e,        b_e );
  const uint16 a_e_unbias                = _uint16_sub( a_e,        h_e_bias_minus_one );
  const uint16 a_m                       = _uint16_sll( a_m_no_grs, h_grs_size );
  const uint16 a_e_biased                = _uint16_srl( a_e,        h_e_pos );
  const uint16 m_sa_unbias               = _uint16_srl( a_e_unbias, h_e_pos );
  const uint16 m_sa_default              = _uint16_srl( diff_e,     h_e_pos );
  const uint16 m_sa_unbias_mask          = _uint16_andc( is_a_e_nez_msb,   is_b_e_nez_msb );
  const uint16 m_sa                      = _uint16_sels( m_sa_unbias_mask, m_sa_unbias, m_sa_default );
  const uint16 b_m_no_sticky             = _uint16_sll( b_m_no_grs,        h_grs_size );
  const uint16 sh_m                      = _uint16_srl( b_m_no_sticky,     m_sa );
  const uint16 sticky_overflow           = _uint16_sll( one,               m_sa );
  const uint16 sticky_mask               = _uint16_dec( sticky_overflow );
  const uint16 sticky_collect            = _uint16_and( b_m_no_sticky, sticky_mask );
  const uint16 is_sticky_set_msb         = _uint16_neg( sticky_collect );
  const uint16 sticky                    = _uint16_srl( is_sticky_set_msb, msb_to_lsb_sa);
  const uint16 b_m                       = _uint16_or( sh_m, sticky );
  const uint16 is_c_m_ab_pos_msb         = _uint16_sub( b_m, a_m );
  const uint16 c_inf                     = _uint16_or( a_s, h_e_mask );
  const uint16 c_m_sum                   = _uint16_add( a_m, b_m );
  const uint16 c_m_diff_ab               = _uint16_sub( a_m, b_m );
  const uint16 c_m_diff_ba               = _uint16_sub( b_m, a_m );
  const uint16 c_m_smag_diff             = _uint16_sels( is_c_m_ab_pos_msb, c_m_diff_ab, c_m_diff_ba );
  const uint16 c_s_diff                  = _uint16_sels( is_c_m_ab_pos_msb, a_s,         b_s         );
  const uint16 c_s                       = _uint16_sels( is_diff_sign_msb,  c_s_diff,    a_s         );
  const uint16 c_m_smag_diff_nlz         = _uint16_cntlz( c_m_smag_diff );
  const uint16 diff_norm_sa              = _uint16_sub( c_m_smag_diff_nlz, one );
  const uint16 is_diff_denorm_msb        = _uint16_sub( a_e_biased, diff_norm_sa );
  const uint16 is_diff_denorm            = _uint16_ext( is_diff_denorm_msb );
  const uint16 is_a_or_b_norm_msb        = _uint16_neg( a_e_biased );
  const uint16 diff_denorm_sa            = _uint16_dec( a_e_biased );
  const uint16 c_m_diff_denorm           = _uint16_sll( c_m_smag_diff, diff_denorm_sa );
  const uint16 c_m_diff_norm             = _uint16_sll( c_m_smag_diff, diff_norm_sa );
  const uint16 c_e_diff_norm             = _uint16_sub( a_e_biased,  diff_norm_sa );
  const uint16 c_m_diff_ab_norm          = _uint16_sels( is_diff_denorm_msb, c_m_diff_denorm, c_m_diff_norm );
  const uint16 c_e_diff_ab_norm          = _uint16_andc( c_e_diff_norm, is_diff_denorm );
  const uint16 c_m_diff                  = _uint16_sels( is_a_or_b_norm_msb, c_m_diff_ab_norm, c_m_smag_diff );
  const uint16 c_e_diff                  = _uint16_sels( is_a_or_b_norm_msb, c_e_diff_ab_norm, a_e_biased    );
  const uint16 is_diff_eqz_msb           = _uint16_dec( c_m_diff );
  const uint16 is_diff_exactly_zero_msb  = _uint16_and( is_diff_sign_msb, is_diff_eqz_msb );
  const uint16 is_diff_exactly_zero      = _uint16_ext( is_diff_exactly_zero_msb );
  const uint16 c_m_added                 = _uint16_sels( is_diff_sign_msb, c_m_diff, c_m_sum );
  const uint16 c_e_added                 = _uint16_sels( is_diff_sign_msb, c_e_diff, a_e_biased );
  const uint16 c_m_carry                 = _uint16_and( c_m_added, h_m_grs_carry );
  const uint16 is_c_m_carry_msb          = _uint16_neg( c_m_carry );
  const uint16 c_e_hidden_offset         = _uint16_andsrl( c_m_added, h_m_grs_carry, h_m_grs_carry_pos );
  const uint16 c_m_sub_hidden            = _uint16_srl( c_m_added, one );
  const uint16 c_m_no_hidden             = _uint16_sels( is_c_m_carry_msb, c_m_sub_hidden, c_m_added );
  const uint16 c_e_no_hidden             = _uint16_add( c_e_added,         c_e_hidden_offset  );
  const uint16 c_m_no_hidden_msb         = _uint16_and( c_m_no_hidden,     h_m_msb_mask       );
  const uint16 undenorm_m_msb_odd        = _uint16_srl( c_m_no_hidden_msb, h_m_msb_sa         );
  const uint16 undenorm_fix_e            = _uint16_and( is_undenorm,       undenorm_m_msb_odd );
  const uint16 c_e_fixed                 = _uint16_add( c_e_no_hidden,     undenorm_fix_e     );
  const uint16 c_m_round_amount          = _uint16_and( c_m_no_hidden,     h_grs_round_mask   );
  const uint16 c_m_rounded               = _uint16_add( c_m_no_hidden,     c_m_round_amount   );
  const uint16 c_m_round_overflow        = _uint16_andsrl( c_m_rounded, h_m_grs_carry, h_m_grs_carry_pos );
  const uint16 c_e_rounded               = _uint16_add( c_e_fixed, c_m_round_overflow );
  const uint16 c_m_no_grs                = _uint16_srlm( c_m_rounded, h_grs_size,  h_m_mask );
  const uint16 c_e                       = _uint16_sll( c_e_rounded, h_e_pos );
  const uint16 c_em                      = _uint16_or( c_e, c_m_no_grs );
  const uint16 c_normal                  = _uint16_or( c_s, c_em );
  const uint16 c_inf_result              = _uint16_sels( is_a_inf_msb, c_inf, c_normal );
  const uint16 c_zero_result             = _uint16_andc( c_inf_result, is_diff_exactly_zero );
  const uint16 c_result                  = _uint16_sels( is_invalid_inf_op_msb, h_snan, c_zero_result );
-  return (c_result);
+    return (f_result);
 }
--- a/src/nvmath/Half.h
+++ b/src/nvmath/Half.h
@ -1,9 +1,17 @@
 #pragma once
 #ifndef NV_MATH_HALF_H
 #define NV_MATH_HALF_H
-#include <nvmath/nvmath.h>
+#include "nvmath.h"
-uint32 half_to_float( uint16 h );
+namespace nv {
 uint16 half_from_float( uint32 f );
-#endif /* NV_MATH_HALF_H */
+    uint32 half_to_float( uint16 h );
    uint16 half_from_float( uint32 f );
    // Does not handle NaN or infinity.
    uint32 fast_half_to_float( uint16 h );
 } // nv namespace
 #endif // NV_MATH_HALF_H
--- a/src/nvmath/Matrix.h
+++ b/src/nvmath/Matrix.h
--- a/src/nvmath/Plane.cpp
+++ b/src/nvmath/Plane.cpp
@ -5,13 +5,22 @@
 namespace nv
 {
-	Plane transformPlane(const Matrix& m, Plane::Arg p)
+    Plane transformPlane(const Matrix& m, Plane::Arg p)
-	{
+    {
-		Vector3 newVec = transformVector(m, p.vector());
+        Vector3 newVec = transformVector(m, p.vector());
-		Vector3 ptInPlane = p.offset() * p.vector();
+        Vector3 ptInPlane = p.offset() * p.vector();
-		ptInPlane = transformPoint(m, ptInPlane);
+        ptInPlane = transformPoint(m, ptInPlane);
-		return Plane(newVec, ptInPlane);
+        return Plane(newVec, ptInPlane);
-	}
+    }
-}
+
    Vector3 planeIntersection(Plane::Arg a, Plane::Arg b, Plane::Arg c)
    {
        return dot(a.vector(), cross(b.vector(), c.vector())) * (
            a.offset() * cross(b.vector(), c.vector()) + 
            c.offset() * cross(a.vector(), b.vector()) +
            b.offset() * cross(c.vector(), a.vector()));
    }
 } // nv namespace
--- a/src/nvmath/Plane.h
+++ b/src/nvmath/Plane.h
@ -1,77 +1,81 @@
 // This code is in the public domain -- castanyo@yahoo.es
 #pragma once
 #ifndef NV_MATH_PLANE_H
 #define NV_MATH_PLANE_H
-#include "nvmath.h"
+#include <nvmath/nvmath.h>
-#include "Vector.h"
+#include <nvmath/Vector.h>
 namespace nv
 {
-	class Matrix;
+    class Matrix;
-	class NVMATH_CLASS Plane
+    class NVMATH_CLASS Plane
-	{
+    {
-	public:
+    public:
-		typedef Plane const & Arg;
+        typedef Plane const & Arg;
 		Plane();
 		Plane(float x, float y, float z, float w);
 		Plane(Vector4::Arg v);
 		Plane(Vector3::Arg v, float d);
 		Plane(Vector3::Arg normal, Vector3::Arg point);
 		const Plane & operator=(Plane::Arg v);
 		Vector3 vector() const;
 		scalar offset() const;
 		const Vector4 & asVector() const;
 		Vector4 & asVector();
 		void operator*=(scalar s);
-	private:
+        Plane();
-		Vector4 p;
+        Plane(float x, float y, float z, float w);
-	};
+        Plane(Vector4::Arg v);
        Plane(Vector3::Arg v, float d);
        Plane(Vector3::Arg normal, Vector3::Arg point);
-	inline Plane::Plane() {}
+        const Plane & operator=(Plane::Arg v);
 	inline Plane::Plane(float x, float y, float z, float w) : p(x, y, z, w) {}
 	inline Plane::Plane(Vector4::Arg v) : p(v) {}
 	inline Plane::Plane(Vector3::Arg v, float d) : p(v, d) {}
 	inline Plane::Plane(Vector3::Arg normal, Vector3::Arg point) : p(normal, dot(normal, point)) {}
 	inline const Plane & Plane::operator=(Plane::Arg v) { p = v.p; return *this; }
 	inline Vector3 Plane::vector() const { return p.xyz(); }
 	inline scalar Plane::offset() const { return p.w(); }
-	inline const Vector4 & Plane::asVector() const { return p; }
+        Vector3 vector() const;
-	inline Vector4 & Plane::asVector() { return p; }
+        scalar offset() const;
-	// Normalize plane.
+        const Vector4 & asVector() const;
-	inline Plane normalize(Plane::Arg plane, float epsilon = NV_EPSILON)
+        Vector4 & asVector();
 	{
 		const float len = length(plane.vector());
 		nvDebugCheck(!isZero(len, epsilon));
 		const float inv = 1.0f / len;
 		return Plane(plane.asVector() * inv);
 	}
-	// Get the signed distance from the given point to this plane.
+        void operator*=(scalar s);
-	inline float distance(Plane::Arg plane, Vector3::Arg point)
+
-	{
+    private:
-		return dot(plane.vector(), point) - plane.offset();
+        Vector4 p;
-	}
+    };
    inline Plane::Plane() {}
    inline Plane::Plane(float x, float y, float z, float w) : p(x, y, z, w) {}
    inline Plane::Plane(Vector4::Arg v) : p(v) {}
    inline Plane::Plane(Vector3::Arg v, float d) : p(v, d) {}
    inline Plane::Plane(Vector3::Arg normal, Vector3::Arg point) : p(normal, dot(normal, point)) {}
    inline const Plane & Plane::operator=(Plane::Arg v) { p = v.p; return *this; }
    inline Vector3 Plane::vector() const { return p.xyz(); }
    inline scalar Plane::offset() const { return p.w; }
    inline const Vector4 & Plane::asVector() const { return p; }
    inline Vector4 & Plane::asVector() { return p; }
    // Normalize plane.
    inline Plane normalize(Plane::Arg plane, float epsilon = NV_EPSILON)
    {
        const float len = length(plane.vector());
        nvDebugCheck(!isZero(len, epsilon));
        const float inv = 1.0f / len;
        return Plane(plane.asVector() * inv);
    }
    // Get the signed distance from the given point to this plane.
    inline float distance(Plane::Arg plane, Vector3::Arg point)
    {
        return dot(plane.vector(), point) - plane.offset();
    }
    inline void Plane::operator*=(scalar s)
    {
        scale(p, s);
    }
    Plane transformPlane(const Matrix&, Plane::Arg);
    Vector3 planeIntersection(Plane::Arg a, Plane::Arg b, Plane::Arg c);
 	inline void Plane::operator*=(scalar s)
 	{
 		scale(p, s);
 	}
 	Plane transformPlane(const Matrix&, Plane::Arg);
 } // nv namespace
 #endif // NV_MATH_PLANE_H
--- a/src/nvmath/Vector.h
+++ b/src/nvmath/Vector.h
--- a/src/nvmath/nvmath.h
+++ b/src/nvmath/nvmath.h
@ -1,5 +1,6 @@
 // This code is in the public domain -- castanyo@yahoo.es
 #pragma once
 #ifndef NV_MATH_H
 #define NV_MATH_H
@ -28,29 +29,29 @@
 #endif // NVMATH_SHARED
 #ifndef PI
-#define PI      			float(3.1415926535897932384626433833)
+#define PI                  float(3.1415926535897932384626433833)
 #endif
-#define NV_EPSILON			(0.0001f)
+#define NV_EPSILON          (0.0001f)
-#define NV_NORMAL_EPSILON	(0.001f)
+#define NV_NORMAL_EPSILON   (0.001f)
 /*
-#define SQ(r)				((r)*(r))
+#define SQ(r)               ((r)*(r))
-#define	SIGN_BITMASK		0x80000000
+#define SIGN_BITMASK        0x80000000
 /// Integer representation of a floating-point value.
-#define IR(x)					((uint32 &)(x))
+#define IR(x)               ((uint32 &)(x))
 /// Absolute integer representation of a floating-point value
-#define AIR(x)					(IR(x) & 0x7fffffff)
+#define AIR(x)              (IR(x) & 0x7fffffff)
 /// Floating-point representation of an integer value.
-#define FR(x)					((float&)(x))
+#define FR(x)               ((float&)(x))
 /// Integer-based comparison of a floating point value.
 /// Don't use it blindly, it can be faster or slower than the FPU comparison, depends on the context.
-#define IS_NEGATIVE_FLOAT(x)	(IR(x)&SIGN_BITMASK)
+#define IS_NEGATIVE_FLOAT(x) (IR(x)&SIGN_BITMASK)
 */
 inline double sqrt_assert(const double f)
@ -97,6 +98,7 @@ inline float asinf_assert(const float f)
 #define asin asin_assert
 #define asinf asinf_assert
 namespace nv
 {
    inline float toRadian(float degree) { return degree * (PI / 180.0f); }
@ -121,7 +123,7 @@ namespace nv
 #elif NV_OS_LINUX
        return finitef(f);
 #else
-#	error "isFinite not supported"
+#   error "isFinite not supported"
 #endif
        //return std::isfinite (f);
        //return finite (f);
@ -136,7 +138,7 @@ namespace nv
 #elif NV_OS_LINUX
        return isnanf(f);
 #else
-#	error "isNan not supported"
+#   error "isNan not supported"
 #endif
    }
@ -161,10 +163,11 @@ namespace nv
        return f0 * s + f1 * t;
    }
-    inline float square(float f)
+    inline float square(float f) { return f * f; }
-    {
+    inline int square(int i) { return i * i; }
-        return f * f;
+
-    }
+    inline float cube(float f) { return f * f; }
    inline int cube(int i) { return i * i; }
    // @@ Float to int conversions to be optimized at some point. See:
    // http://cbloomrants.blogspot.com/2009/01/01-17-09-float-to-int.html
@ -186,10 +189,10 @@ namespace nv
        return int(ceilf(f));
    }
-    inline float frac(float f)
+	inline float frac(float f)
-    {
+	{
-        return f - floor(f);
+		return f - floor(f);
-    }
+	}
 } // nv