Merge changes from the witness.

2011-09-27 17:48:46 +00:00
parent 9c0658edca
commit 3c0ab2d3f3
47 changed files with 1811 additions and 186 deletions
--- a/src/nvmath/Half.cpp
+++ b/src/nvmath/Half.cpp
@ -487,46 +487,126 @@ nv::half_to_float( uint16 h )
    return (f_result);
 }

-uint32 
-nv::fast_half_to_float( uint16 h )
-{
-    const uint32 h_e_mask              = _uint32_li( 0x00007c00 );
-    const uint32 h_m_mask              = _uint32_li( 0x000003ff );
-    const uint32 h_s_mask              = _uint32_li( 0x00008000 );
-    const uint32 h_f_s_pos_offset      = _uint32_li( 0x00000010 );
-    const uint32 h_f_e_pos_offset      = _uint32_li( 0x0000000d );
-    const uint32 h_f_bias_offset       = _uint32_li( 0x0001c000 );
-    const uint32 f_e_mask              = _uint32_li( 0x7f800000 );
-    const uint32 f_m_mask              = _uint32_li( 0x007fffff );
-    const uint32 h_f_e_denorm_bias     = _uint32_li( 0x0000007e );
-    const uint32 h_f_m_denorm_sa_bias  = _uint32_li( 0x00000008 );
-    const uint32 f_e_pos               = _uint32_li( 0x00000017 );
-    const uint32 h_e_mask_minus_one    = _uint32_li( 0x00007bff );
-    const uint32 h_e                   = _uint32_and( h, h_e_mask );
-    const uint32 h_m                   = _uint32_and( h, h_m_mask );
-    const uint32 h_s                   = _uint32_and( h, h_s_mask );
-    const uint32 h_e_f_bias            = _uint32_add( h_e, h_f_bias_offset );
-    const uint32 h_m_nlz               = _uint32_cntlz( h_m );
-    const uint32 f_s                   = _uint32_sll( h_s,        h_f_s_pos_offset );
-    const uint32 f_e                   = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
-    const uint32 f_m                   = _uint32_sll( h_m,        h_f_e_pos_offset );
-    const uint32 f_em                  = _uint32_or(  f_e,        f_m              );
-    const uint32 h_f_m_sa              = _uint32_sub( h_m_nlz,             h_f_m_denorm_sa_bias );
-    const uint32 f_e_denorm_unpacked   = _uint32_sub( h_f_e_denorm_bias,   h_f_m_sa             );
-    const uint32 h_f_m                 = _uint32_sll( h_m,                 h_f_m_sa             );
-    const uint32 f_m_denorm            = _uint32_and( h_f_m,               f_m_mask             );
-    const uint32 f_e_denorm            = _uint32_sll( f_e_denorm_unpacked, f_e_pos              );
-    const uint32 f_em_denorm           = _uint32_or(  f_e_denorm,          f_m_denorm           );
-    const uint32 f_em_nan              = _uint32_or(  f_e_mask,            f_m                  );
-    const uint32 is_e_eqz_msb          = _uint32_dec(  h_e );
-    const uint32 is_m_nez_msb          = _uint32_neg(  h_m );
-    const uint32 is_e_flagged_msb      = _uint32_sub(  h_e_mask_minus_one, h_e );
-    const uint32 is_zero_msb           = _uint32_andc( is_e_eqz_msb,       is_m_nez_msb );
-    const uint32 is_denorm_msb         = _uint32_and(  is_m_nez_msb,       is_e_eqz_msb );
-    const uint32 is_zero               = _uint32_ext(  is_zero_msb );
-    const uint32 f_zero_result         = _uint32_andc( f_em, is_zero );
-    const uint32 f_denorm_result       = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result );
-    const uint32 f_result              = _uint32_or( f_s, f_denorm_result );

-    return (f_result);
+// @@ These tables could be smaller.
+static uint32 mantissa_table[2048];
+static uint32 exponent_table[64];
+static uint32 offset_table[64];
+
+// Fills the lookup tables that fast_half_to_float indexes:
+//  - mantissa_table: mantissa bits (with an exponent adjustment folded in),
+//  - exponent_table: sign/exponent bits for each 6-bit sign+exponent value of the half,
+//  - offset_table:   0 or 1024, selecting the lower or upper half of mantissa_table.
+void nv::half_init_tables()
+{
+    // Entries 0..1023 serve halves whose exponent field is zero. Entry 0 stays zero.
+    mantissa_table[0] = 0;
+
+    for (uint idx = 1; idx < 1024; ++idx) {
+        uint mantissa = idx << 13;
+        uint exponent = 0x38800000;
+
+        // Shift left until bit 23 is set, lowering the exponent one step per shift.
+        for (; (mantissa & 0x00800000) == 0; mantissa <<= 1) {
+            exponent -= 0x00800000;
+        }
+
+        // Clear bit 23 and merge in the adjusted exponent.
+        mantissa_table[idx] = (mantissa & ~0x00800000) | exponent;
+    }
+
+    // Entries 1024..2047 serve halves with a non-zero exponent field: the mantissa is
+    // simply moved into position on top of a fixed bias.
+    for (uint idx = 0; idx < 1024; ++idx) {
+        mantissa_table[1024 + idx] = 0x38000000 + (idx << 13);
+    }
+
+    // Exponent table. Indices 0..31 are the positive values, 32..63 the same values
+    // with the sign bit set. Exponent 31 gets a dedicated constant.
+    for (uint e = 0; e < 32; ++e) {
+        uint bits = e << 23;
+        if (e == 31) {
+            bits = 0x47800000;
+        }
+        exponent_table[e] = bits;
+        exponent_table[e + 32] = 0x80000000 | bits;
+    }
+
+    // Offset table: a zero exponent field (either sign) selects the lower half of
+    // mantissa_table, everything else the upper half.
+    for (uint e = 0; e < 64; ++e) {
+        offset_table[e] = ((e & 31) != 0) ? 1024 : 0;
+    }
 }
+
+// Table-driven half to float conversion, based on:
+// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
+// Takes the 16-bit half pattern and returns the sum of the mantissa and exponent table
+// entries selected by it. The tables are only written by half_init_tables().
+uint32 nv::fast_half_to_float(uint16 h)
+{
+    const uint sign_exponent = h >> 10;   // Top 6 bits: sign + 5-bit exponent.
+    const uint mantissa = h & 0x3ff;      // Low 10 bits.
+    const uint mantissa_index = offset_table[sign_exponent] + mantissa;
+    return mantissa_table[mantissa_index] + exponent_table[sign_exponent];
+}
+
+
+#if 0
+// Inaccurate conversion suggested at the ffmpeg mailing list:
+// http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html
+uint32 nv::fast_half_to_float(uint16 v)
+{
+    if (v & 0x8000) return 0;
+    uint exp = v >> 10;
+    if (!exp) return (v>>9)&1;
+    if (exp >= 15) return 0xffff;
+    v <<= 6;
+    return (v+(1<<16)) >> (15-exp);
+}
+
+#endif
+
+#if 0
+
+// Some more from a gamedev thread:
+// http://www.devmaster.net/forums/showthread.php?t=10924
+
+// I believe it does not handle specials either.
+
+// Mike Acton's code should be fairly easy to vectorize and that would handle all cases too, the table method might still be faster, though.
+
+
+static __declspec(align(16)) unsigned half_sign[4]	  = {0x00008000, 0x00008000, 0x00008000, 0x00008000};
+static __declspec(align(16)) unsigned half_exponent[4]	  = {0x00007C00, 0x00007C00, 0x00007C00, 0x00007C00};
+static __declspec(align(16)) unsigned half_mantissa[4]	  = {0x000003FF, 0x000003FF, 0x000003FF, 0x000003FF};
+static __declspec(align(16)) unsigned half_bias_offset[4] = {0x0001C000, 0x0001C000, 0x0001C000, 0x0001C000};
+
+__asm
+{
+	movaps	xmm1, xmm0  // Input in xmm0
+	movaps	xmm2, xmm0
+
+	andps	xmm0, half_sign
+	andps	xmm1, half_exponent
+	andps	xmm2, half_mantissa
+	paddd	xmm1, half_bias_offset
+
+	pslld	xmm0, 16
+	pslld	xmm1, 13
+	pslld	xmm2, 13
+
+	orps	xmm1, xmm2
+	orps	xmm0, xmm1  // Result in xmm0
+}
+
+
+#endif
--- a/src/nvmath/Half.h
+++ b/src/nvmath/Half.h
@ -9,8 +9,9 @@ namespace nv {
    uint32 half_to_float( uint16 h );
    uint16 half_from_float( uint32 f );

-    // Does not handle NaN or infinity.
-    uint32 fast_half_to_float( uint16 h );
+    // Fills the lookup tables read by fast_half_to_float. Call it before the first
+    // fast_half_to_float call: the tables are file-level statics that are only written here.
+    void half_init_tables();
+
+    // Table-driven half -> float conversion. Takes the 16-bit half pattern and returns a
+    // 32-bit pattern built from the lookup tables. Requires half_init_tables() to have run.
+    uint32 fast_half_to_float(uint16 h);

    inline uint16 to_half(float c) {
        union { float f; uint32 u; } f;
--- a/src/nvmath/Matrix.h
+++ b/src/nvmath/Matrix.h
@ -9,15 +9,14 @@

 namespace nv
 {
-    enum zero_t { zero };
    enum identity_t { identity };

    class NVMATH_CLASS Matrix3
    {
    public:
        Matrix3();
-        Matrix3(zero_t);
-        Matrix3(identity_t);
+        explicit Matrix3(float f);
+        explicit Matrix3(identity_t);
        Matrix3(const Matrix3 & m);
        Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2);

@ -41,10 +40,10 @@ namespace nv

    inline Matrix3::Matrix3() {}
    
-    inline Matrix3::Matrix3(zero_t)
+    inline Matrix3::Matrix3(float f)
    {
        for(int i = 0; i < 9; i++) {
-            m_data[i] = 0.0f;
+            m_data[i] = f;
        }
    }

@ -204,11 +203,11 @@ namespace nv
        typedef Matrix const & Arg;

        Matrix();
-        Matrix(zero_t);
-        Matrix(identity_t);
+        explicit Matrix(float f);
+        explicit Matrix(identity_t);
        Matrix(const Matrix & m);
        Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);
-        Matrix(const scalar m[]);	// m is assumed to contain 16 elements
+        //explicit Matrix(const scalar m[]);	// m is assumed to contain 16 elements

        scalar data(uint idx) const;
        scalar & data(uint idx);
@ -237,7 +236,7 @@ namespace nv
    {
    }

-    inline Matrix::Matrix(zero_t)
+    // Constructs a matrix with all 16 elements set to f, matching Matrix3(float).
+    inline Matrix::Matrix(float f)
    {
        for(int i = 0; i < 16; i++) {
-            m_data[i] = 0.0f;
+            m_data[i] = f;  // Previously hard-coded to 0.0f, which silently ignored f.
@ -268,12 +267,12 @@ namespace nv
        m_data[12] = v3.x; m_data[13] = v3.y; m_data[14] = v3.z; m_data[15] = v3.w;
    }

-    inline Matrix::Matrix(const scalar m[])
+    /*inline Matrix::Matrix(const scalar m[])
    {
        for(int i = 0; i < 16; i++) {
            m_data[i] = m[i];
        }
-    }
+    }*/


    // Accessors
@ -456,7 +455,7 @@ namespace nv
    /// Get frustum matrix.
    inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear, scalar zFar)
    {
-        Matrix m(zero);
+        Matrix m(0.0f);

        scalar doubleznear = 2.0f * zNear;
        scalar one_deltax = 1.0f / (xmax - xmin);
@ -477,7 +476,7 @@ namespace nv
    /// Get infinite frustum matrix.
    inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear)
    {
-        Matrix m(zero);
+        Matrix m(0.0f);

        scalar doubleznear = 2.0f * zNear;
        scalar one_deltax = 1.0f / (xmax - xmin);
--- a/src/nvmath/Vector.h
+++ b/src/nvmath/Vector.h
@ -100,6 +100,7 @@ namespace nv
        explicit Vector4(scalar x);
        Vector4(scalar x, scalar y, scalar z, scalar w);
        Vector4(Vector2::Arg v, scalar z, scalar w);
+        Vector4(Vector2::Arg v, Vector2::Arg u);
        Vector4(Vector3::Arg v, scalar w);
        Vector4(Vector4::Arg v);
        //	Vector4(const Quaternion & v);
@ -107,6 +108,7 @@ namespace nv
        const Vector4 & operator=(Vector4::Arg v);

        Vector2 xy() const;
+        Vector2 zw() const;
        Vector3 xyz() const;

        const scalar * ptr() const;
@ -290,6 +292,7 @@ namespace nv
    inline Vector4::Vector4(scalar f) : x(f), y(f), z(f), w(f) {}
    inline Vector4::Vector4(scalar x, scalar y, scalar z, scalar w) : x(x), y(y), z(z), w(w) {}
    inline Vector4::Vector4(Vector2::Arg v, scalar z, scalar w) : x(v.x), y(v.y), z(z), w(w) {}
+    // Builds (v.x, v.y, u.x, u.y) from two 2D vectors.
+    inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {}
    inline Vector4::Vector4(Vector3::Arg v, scalar w) : x(v.x), y(v.y), z(v.z), w(w) {}
    inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {}

@ -307,6 +310,11 @@ namespace nv
        return Vector2(x, y);
    }

+    // Returns the z and w components packed into a Vector2.
+    inline Vector2 Vector4::zw() const
+    {
+        const Vector2 tail(z, w);
+        return tail;
+    }
+
    inline Vector3 Vector4::xyz() const
    {
        return Vector3(x, y, z);
@ -469,6 +477,14 @@ namespace nv
        return scale(v, 1.0f / l);
    }

+    // Branchless normalization from Andy Firth. A tiny bias is added to the length so the
+    // divisor is never exactly zero when length(v) is 0. All other error checking is omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector2 normalizeFast(Vector2::Arg v)
+    {
+        const float biased_length = 1.0e-037f + length(v);
+        return scale(v, 1.0f / biased_length);
+    }

    inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON)
    {
@ -498,6 +514,14 @@ namespace nv
        return vf;
    }

+    // Returns the 2D cross product of (a - c) and (b - c). Note that no 0.5 factor is
+    // applied, so the value is twice the signed area of triangle (a, b, c).
+    inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
+    {
+        const Vector2 edge_ac = a - c;
+        const Vector2 edge_bc = b - c;
+        return (edge_ac.x * edge_bc.y - edge_ac.y * edge_bc.x);
+    }
+

    // Vector3

@ -570,10 +594,10 @@ namespace nv
        return scale(v, 1.0f/s);
    }

-    inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, scalar s)
+    /*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, scalar s)
    {
        return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s);
-    }
+    }*/

    inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, scalar t)
    {
@ -624,6 +648,15 @@ namespace nv
        return scale(v, 1.0f / l);
    }

+    // Branchless normalization from Andy Firth. A tiny bias is added to the length so the
+    // divisor is never exactly zero when length(v) is 0. All other error checking is omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector3 normalizeFast(Vector3::Arg v)
+    {
+        const float biased_length = 1.0e-037f + length(v);
+        return scale(v, 1.0f / biased_length);
+    }
+
    inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON)
    {
        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon);
@ -762,6 +795,15 @@ namespace nv
        return scale(v, 1.0f / l);
    }

+    // Branchless normalization from Andy Firth. A tiny bias is added to the length so the
+    // divisor is never exactly zero when length(v) is 0. All other error checking is omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector4 normalizeFast(Vector4::Arg v)
+    {
+        const float biased_length = 1.0e-037f + length(v);
+        return scale(v, 1.0f / biased_length);
+    }
+
    inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON)
    {
        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon);
--- a/src/nvmath/nvmath.h
+++ b/src/nvmath/nvmath.h
@ -4,8 +4,9 @@
 #ifndef NV_MATH_H
 #define NV_MATH_H

-#include <nvcore/nvcore.h>
-#include <nvcore/Debug.h>
+#include "nvcore/nvcore.h"
+#include "nvcore/Debug.h"
+#include "nvcore/Utils.h" // clamp

 #include <math.h>
 #include <limits.h> // INT_MAX
@ -194,7 +195,7 @@ namespace nv
        return f - floor(f);
    }

-    inline float fround(float f)
+    inline float fround(float f)    // @@ rename floatRound
    {
        // @@ Do something better.
        return float(iround(f));
@ -210,6 +211,29 @@ namespace nv
        }
    }

+    // Clamps f to the range [0, 1] using clamp().
+    inline float saturate(float f)
+    {
+        const float lower = 0.0f;
+        const float upper = 1.0f;
+        return clamp(f, lower, upper);
+    }
+
+    // Maps x linearly so that edge0 -> 0 and edge1 -> 1, then saturates the result.
+    // edge0 == edge1 makes the divisor zero; no guard is applied here.
+    inline float linearstep(float edge0, float edge1, float x)
+    {
+        const float span = edge1 - edge0;
+        const float t = (x - edge0) / span;
+        return saturate(t);
+    }
+
+    // Smooth interpolation between edge0 and edge1: linearstep() followed by the
+    // cubic t*t*(3 - 2*t).
+    inline float smoothstep(float edge0, float edge1, float x)
+    {
+        const float t = linearstep(edge0, edge1, x);
+        return t*t*(3 - 2*t);
+    }
+
+    // Returns 1 for positive a, -1 for negative a, and 0 otherwise (including +/-0).
+    inline int sign(float a)
+    {
+        const int is_positive = (a > 0.0f) ? 1 : 0;
+        const int is_negative = (a < 0.0f) ? 1 : 0;
+        return is_positive - is_negative;
+    }
+
 } // nv

 #endif // NV_MATH_H