Merge changes from the witness.

This commit is contained in:
castano
2011-09-27 17:48:46 +00:00
parent 9c0658edca
commit 3c0ab2d3f3
47 changed files with 1811 additions and 186 deletions

View File

@ -487,46 +487,126 @@ nv::half_to_float( uint16 h )
return (f_result);
}
// Half -> float bit-pattern conversion written as straight-line integer code
// (no `if` statements). The result is assembled from the normal, zero and
// denormal cases and then OR-ed with the relocated sign bit.
// NOTE(review): the _uint32_* helpers are defined outside this excerpt; the
// comments below assume each one does what its name suggests -- confirm.
// NOTE(review): f_em_nan and is_e_flagged_msb are computed but never feed
// f_result, so an all-ones half exponent gets no special treatment here.
uint32
nv::fast_half_to_float( uint16 h )
{
// Field masks of the 16-bit input: exponent = bits 10..14, mantissa = bits 0..9, sign = bit 15.
const uint32 h_e_mask = _uint32_li( 0x00007c00 );
const uint32 h_m_mask = _uint32_li( 0x000003ff );
const uint32 h_s_mask = _uint32_li( 0x00008000 );
// Shift amounts: 16 moves the sign from bit 15 to bit 31, 13 moves exponent/mantissa into float position.
const uint32 h_f_s_pos_offset = _uint32_li( 0x00000010 );
const uint32 h_f_e_pos_offset = _uint32_li( 0x0000000d );
// 0x1c000 == 112 << 10: value added to the still-unshifted half exponent field.
const uint32 h_f_bias_offset = _uint32_li( 0x0001c000 );
const uint32 f_e_mask = _uint32_li( 0x7f800000 );
const uint32 f_m_mask = _uint32_li( 0x007fffff );
const uint32 h_f_e_denorm_bias = _uint32_li( 0x0000007e );
const uint32 h_f_m_denorm_sa_bias = _uint32_li( 0x00000008 );
const uint32 f_e_pos = _uint32_li( 0x00000017 );
const uint32 h_e_mask_minus_one = _uint32_li( 0x00007bff );
// Split the input into its three fields (each left in its original bit position).
const uint32 h_e = _uint32_and( h, h_e_mask );
const uint32 h_m = _uint32_and( h, h_m_mask );
const uint32 h_s = _uint32_and( h, h_s_mask );
const uint32 h_e_f_bias = _uint32_add( h_e, h_f_bias_offset );
// Leading-zero count of the mantissa; used below to derive the denormal shift amount.
const uint32 h_m_nlz = _uint32_cntlz( h_m );
// Normal-number path: relocate sign, biased exponent and mantissa, then merge exponent|mantissa.
const uint32 f_s = _uint32_sll( h_s, h_f_s_pos_offset );
const uint32 f_e = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
const uint32 f_m = _uint32_sll( h_m, h_f_e_pos_offset );
const uint32 f_em = _uint32_or( f_e, f_m );
// Denormal path: shift the mantissa by (nlz - 8) and derive the exponent as 0x7e minus that shift.
const uint32 h_f_m_sa = _uint32_sub( h_m_nlz, h_f_m_denorm_sa_bias );
const uint32 f_e_denorm_unpacked = _uint32_sub( h_f_e_denorm_bias, h_f_m_sa );
const uint32 h_f_m = _uint32_sll( h_m, h_f_m_sa );
const uint32 f_m_denorm = _uint32_and( h_f_m, f_m_mask );
const uint32 f_e_denorm = _uint32_sll( f_e_denorm_unpacked, f_e_pos );
const uint32 f_em_denorm = _uint32_or( f_e_denorm, f_m_denorm );
// Unused below (see NOTE at the top).
const uint32 f_em_nan = _uint32_or( f_e_mask, f_m );
// Classification flags; the "_msb" suffix suggests the condition is carried in the top bit -- confirm.
const uint32 is_e_eqz_msb = _uint32_dec( h_e );
const uint32 is_m_nez_msb = _uint32_neg( h_m );
// Unused below (see NOTE at the top).
const uint32 is_e_flagged_msb = _uint32_sub( h_e_mask_minus_one, h_e );
const uint32 is_zero_msb = _uint32_andc( is_e_eqz_msb, is_m_nez_msb );
const uint32 is_denorm_msb = _uint32_and( is_m_nez_msb, is_e_eqz_msb );
const uint32 is_zero = _uint32_ext( is_zero_msb );
// Select: mask the normal result with the zero flag, pick the denormal result when flagged, then add the sign.
const uint32 f_zero_result = _uint32_andc( f_em, is_zero );
const uint32 f_denorm_result = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result );
const uint32 f_result = _uint32_or( f_s, f_denorm_result );
return (f_result);
// Lookup tables for the table-driven nv::fast_half_to_float().
// They are filled by nv::half_init_tables(); until then they hold the zeros of
// static storage.
// @@ These tables could be smaller.
// Indexed by offset_table[h >> 10] + (h & 0x3ff): entries 0..1023 are built by
// the normalization loop, entries 1024..2047 by a plain shift.
static uint32 mantissa_table[2048];
// Indexed by h >> 10 (the top six bits of the half).
static uint32 exponent_table[64];
// Indexed by h >> 10; holds either 0 or 1024, selecting a half of mantissa_table.
static uint32 offset_table[64];
void nv::half_init_tables()
{
// Init mantissa table.
mantissa_table[0] = 0;
for (int i = 1; i < 1024; i++) {
uint m = i << 13;
uint e = 0;
while ((m & 0x00800000) == 0) {
e -= 0x00800000;
m <<= 1;
}
m &= ~0x00800000;
e += 0x38800000;
mantissa_table[i] = m | e;
}
for (int i = 1024; i < 2048; i++) {
mantissa_table[i] = 0x38000000 + ((i - 1024) << 13);
}
// Init exponent table.
exponent_table[0] = 0;
for (int i = 1; i < 31; i++) {
exponent_table[i] = (i << 23);
}
exponent_table[31] = 0x47800000;
exponent_table[32] = 0x80000000;
for (int i = 33; i < 63; i++) {
exponent_table[i] = 0x80000000 + ((i - 32) << 23);
}
exponent_table[63] = 0xC7800000;
// Init offset table.
offset_table[0] = 0;
for (int i = 1; i < 32; i++) {
offset_table[i] = 1024;
}
offset_table[32] = 0;
for (int i = 33; i < 64; i++) {
offset_table[i] = 1024;
}
/*for (int i = 0; i < 64; i++) {
offset_table[i] = ((i & 31) != 0) * 1024;
}*/
}
// Table-driven half -> float conversion, following the method described in:
// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
// Reads mantissa_table, exponent_table and offset_table, which are filled by
// nv::half_init_tables().
uint32 nv::fast_half_to_float(uint16 h)
{
    const uint top_bits = h >> 10;    // everything above the 10 mantissa bits
    const uint low_bits = h & 0x3ff;  // the 10 mantissa bits
    return mantissa_table[offset_table[top_bits] + low_bits] + exponent_table[top_bits];
}
#if 0
// Inaccurate conversion suggested at the ffmpeg mailing list:
// http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html
// Disabled reference code.
// NOTE(review): the largest value returned is 0xffff, so the result looks like
// a 16-bit fixed-point number rather than a float bit pattern -- confirm
// against the linked thread before enabling.
uint32 nv::fast_half_to_float(uint16 v)
{
// Inputs with the sign bit set map to 0.
if (v & 0x8000) return 0;
// Bit 15 is clear at this point, so this is just the exponent field.
uint exp = v >> 10;
// Zero exponent: the result is the top mantissa bit (bit 9).
if (!exp) return (v>>9)&1;
// Exponent fields of 15 and above all map to 0xffff.
if (exp >= 15) return 0xffff;
// Assuming uint16 is a 16-bit type, the store back into v discards the exponent bits.
v <<= 6;
// Add a one at bit 16, then scale down by (15 - exp).
return (v+(1<<16)) >> (15-exp);
}
#endif
#if 0
// Some more from a gamedev thread:
// http://www.devmaster.net/forums/showthread.php?t=10924
// I believe it does not handle specials either.
// Mike Acton's code should be fairly easy to vectorize and that would handle all cases too, the table method might still be faster, though.
static __declspec(align(16)) unsigned half_sign[4] = {0x00008000, 0x00008000, 0x00008000, 0x00008000};
static __declspec(align(16)) unsigned half_exponent[4] = {0x00007C00, 0x00007C00, 0x00007C00, 0x00007C00};
static __declspec(align(16)) unsigned half_mantissa[4] = {0x000003FF, 0x000003FF, 0x000003FF, 0x000003FF};
static __declspec(align(16)) unsigned half_bias_offset[4] = {0x0001C000, 0x0001C000, 0x0001C000, 0x0001C000};
__asm
{
movaps xmm1, xmm0 // Input in xmm0
movaps xmm2, xmm0
andps xmm0, half_sign
andps xmm1, half_exponent
andps xmm2, half_mantissa
paddd xmm1, half_bias_offset
pslld xmm0, 16
pslld xmm1, 13
pslld xmm2, 13
orps xmm1, xmm2
orps xmm0, xmm1 // Result in xmm0
}
#endif

View File

@ -9,8 +9,9 @@ namespace nv {
uint32 half_to_float( uint16 h );
uint16 half_from_float( uint32 f );
// Does not handle NaN or infinity.
uint32 fast_half_to_float( uint16 h );
void half_init_tables();
uint32 fast_half_to_float(uint16 h);
inline uint16 to_half(float c) {
union { float f; uint32 u; } f;

View File

@ -9,15 +9,14 @@
namespace nv
{
enum zero_t { zero };
enum identity_t { identity };
class NVMATH_CLASS Matrix3
{
public:
Matrix3();
Matrix3(zero_t);
Matrix3(identity_t);
explicit Matrix3(float f);
explicit Matrix3(identity_t);
Matrix3(const Matrix3 & m);
Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2);
@ -41,10 +40,10 @@ namespace nv
// Default constructor: the body is empty, so no element is assigned here.
inline Matrix3::Matrix3() {}
inline Matrix3::Matrix3(zero_t)
inline Matrix3::Matrix3(float f)
{
for(int i = 0; i < 9; i++) {
m_data[i] = 0.0f;
m_data[i] = f;
}
}
@ -204,11 +203,11 @@ namespace nv
typedef Matrix const & Arg;
Matrix();
Matrix(zero_t);
Matrix(identity_t);
explicit Matrix(float f);
explicit Matrix(identity_t);
Matrix(const Matrix & m);
Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);
Matrix(const scalar m[]); // m is assumed to contain 16 elements
//explicit Matrix(const scalar m[]); // m is assumed to contain 16 elements
scalar data(uint idx) const;
scalar & data(uint idx);
@ -237,7 +236,7 @@ namespace nv
{
}
inline Matrix::Matrix(zero_t)
inline Matrix::Matrix(float f)
{
for(int i = 0; i < 16; i++) {
m_data[i] = 0.0f;
@ -268,12 +267,12 @@ namespace nv
m_data[12] = v3.x; m_data[13] = v3.y; m_data[14] = v3.z; m_data[15] = v3.w;
}
inline Matrix::Matrix(const scalar m[])
/*inline Matrix::Matrix(const scalar m[])
{
for(int i = 0; i < 16; i++) {
m_data[i] = m[i];
}
}
}*/
// Accessors
@ -456,7 +455,7 @@ namespace nv
/// Get frustum matrix.
inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear, scalar zFar)
{
Matrix m(zero);
Matrix m(0.0f);
scalar doubleznear = 2.0f * zNear;
scalar one_deltax = 1.0f / (xmax - xmin);
@ -477,7 +476,7 @@ namespace nv
/// Get infinite frustum matrix.
inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear)
{
Matrix m(zero);
Matrix m(0.0f);
scalar doubleznear = 2.0f * zNear;
scalar one_deltax = 1.0f / (xmax - xmin);

View File

@ -100,6 +100,7 @@ namespace nv
explicit Vector4(scalar x);
Vector4(scalar x, scalar y, scalar z, scalar w);
Vector4(Vector2::Arg v, scalar z, scalar w);
Vector4(Vector2::Arg v, Vector2::Arg u);
Vector4(Vector3::Arg v, scalar w);
Vector4(Vector4::Arg v);
// Vector4(const Quaternion & v);
@ -107,6 +108,7 @@ namespace nv
const Vector4 & operator=(Vector4::Arg v);
Vector2 xy() const;
Vector2 zw() const;
Vector3 xyz() const;
const scalar * ptr() const;
@ -290,6 +292,7 @@ namespace nv
// Broadcast: all four components are set to f.
inline Vector4::Vector4(scalar f) : x(f), y(f), z(f), w(f) {}
// Component-wise construction; the parameters shadow the members of the same name.
inline Vector4::Vector4(scalar x, scalar y, scalar z, scalar w) : x(x), y(y), z(z), w(w) {}
// x and y come from v; z and w are given explicitly.
inline Vector4::Vector4(Vector2::Arg v, scalar z, scalar w) : x(v.x), y(v.y), z(z), w(w) {}
// x and y come from v; z and w come from u.x and u.y.
inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {}
// x, y and z come from v; w is given explicitly.
inline Vector4::Vector4(Vector3::Arg v, scalar w) : x(v.x), y(v.y), z(v.z), w(w) {}
// Copy construction, component by component.
inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
@ -307,6 +310,11 @@ namespace nv
return Vector2(x, y);
}
// Returns the last two components (z, w) as a Vector2.
inline Vector2 Vector4::zw() const
{
    Vector2 tail(z, w);
    return tail;
}
inline Vector3 Vector4::xyz() const
{
return Vector3(x, y, z);
@ -469,6 +477,14 @@ namespace nv
return scale(v, 1.0f / l);
}
// Branch-free normalization, credited to Andy Firth:
// http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
// A tiny constant is added to the length before dividing, so the reciprocal is
// taken of a non-zero value without testing the input. No other checks are made.
inline Vector2 normalizeFast(Vector2::Arg v)
{
    const float tiny = 1.0e-037f;
    const float inv_len = 1.0f / (tiny + length(v));
    return scale(v, inv_len);
}
inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON)
{
@ -498,6 +514,14 @@ namespace nv
return vf;
}
// Returns the 2D cross product of the edges (a - c) and (b - c).
// No 1/2 factor is applied, so the magnitude is twice the geometric area of
// triangle (a, b, c); the sign flips when the winding of the points flips.
inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
{
    const Vector2 ca = a - c;
    const Vector2 cb = b - c;
    return ca.x * cb.y - ca.y * cb.x;
}
// Vector3
@ -570,10 +594,10 @@ namespace nv
return scale(v, 1.0f/s);
}
inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, scalar s)
/*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, scalar s)
{
return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s);
}
}*/
inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, scalar t)
{
@ -624,6 +648,15 @@ namespace nv
return scale(v, 1.0f / l);
}
// Branch-free normalization, credited to Andy Firth:
// http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
// A tiny constant is added to the length before dividing, so the reciprocal is
// taken of a non-zero value without testing the input. No other checks are made.
inline Vector3 normalizeFast(Vector3::Arg v)
{
    const float tiny = 1.0e-037f;
    const float inv_len = 1.0f / (tiny + length(v));
    return scale(v, inv_len);
}
inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON)
{
return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon);
@ -762,6 +795,15 @@ namespace nv
return scale(v, 1.0f / l);
}
// Branch-free normalization, credited to Andy Firth:
// http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
// A tiny constant is added to the length before dividing, so the reciprocal is
// taken of a non-zero value without testing the input. No other checks are made.
inline Vector4 normalizeFast(Vector4::Arg v)
{
    const float tiny = 1.0e-037f;
    const float inv_len = 1.0f / (tiny + length(v));
    return scale(v, inv_len);
}
inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON)
{
return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon);

View File

@ -4,8 +4,9 @@
#ifndef NV_MATH_H
#define NV_MATH_H
#include <nvcore/nvcore.h>
#include <nvcore/Debug.h>
#include "nvcore/nvcore.h"
#include "nvcore/Debug.h"
#include "nvcore/Utils.h" // clamp
#include <math.h>
#include <limits.h> // INT_MAX
@ -194,7 +195,7 @@ namespace nv
return f - floor(f);
}
inline float fround(float f)
inline float fround(float f) // @@ rename floatRound
{
// @@ Do something better.
return float(iround(f));
@ -210,6 +211,29 @@ namespace nv
}
}
// Clamps f to the closed range [0, 1] via clamp().
inline float saturate(float f)
{
    const float lower = 0.0f;
    const float upper = 1.0f;
    return clamp(f, lower, upper);
}
// Maps x linearly so that edge0 -> 0 and edge1 -> 1, then saturates the
// result. The division is not guarded against edge0 == edge1.
inline float linearstep(float edge0, float edge1, float x)
{
    const float t = (x - edge0) / (edge1 - edge0);
    return saturate(t);
}
// Cubic ramp between edge0 and edge1: runs x through linearstep() and then
// evaluates t^2 * (3 - 2t) on the result.
inline float smoothstep(float edge0, float edge1, float x)
{
    const float t = linearstep(edge0, edge1, x);
    return t * t * (3 - 2 * t);
}
// Returns 1 for positive input, -1 for negative input and 0 otherwise
// (which covers both zeros and any value that compares neither above nor
// below zero).
inline int sign(float a)
{
    const int is_positive = (a > 0.0f) ? 1 : 0;
    const int is_negative = (a < 0.0f) ? 1 : 0;
    return is_positive - is_negative;
}
} // nv
#endif // NV_MATH_H