Merge changes from the witness.
This commit is contained in:
@ -487,46 +487,126 @@ nv::half_to_float( uint16 h )
|
||||
return (f_result);
|
||||
}
|
||||
|
||||
uint32
|
||||
nv::fast_half_to_float( uint16 h )
|
||||
{
|
||||
const uint32 h_e_mask = _uint32_li( 0x00007c00 );
|
||||
const uint32 h_m_mask = _uint32_li( 0x000003ff );
|
||||
const uint32 h_s_mask = _uint32_li( 0x00008000 );
|
||||
const uint32 h_f_s_pos_offset = _uint32_li( 0x00000010 );
|
||||
const uint32 h_f_e_pos_offset = _uint32_li( 0x0000000d );
|
||||
const uint32 h_f_bias_offset = _uint32_li( 0x0001c000 );
|
||||
const uint32 f_e_mask = _uint32_li( 0x7f800000 );
|
||||
const uint32 f_m_mask = _uint32_li( 0x007fffff );
|
||||
const uint32 h_f_e_denorm_bias = _uint32_li( 0x0000007e );
|
||||
const uint32 h_f_m_denorm_sa_bias = _uint32_li( 0x00000008 );
|
||||
const uint32 f_e_pos = _uint32_li( 0x00000017 );
|
||||
const uint32 h_e_mask_minus_one = _uint32_li( 0x00007bff );
|
||||
const uint32 h_e = _uint32_and( h, h_e_mask );
|
||||
const uint32 h_m = _uint32_and( h, h_m_mask );
|
||||
const uint32 h_s = _uint32_and( h, h_s_mask );
|
||||
const uint32 h_e_f_bias = _uint32_add( h_e, h_f_bias_offset );
|
||||
const uint32 h_m_nlz = _uint32_cntlz( h_m );
|
||||
const uint32 f_s = _uint32_sll( h_s, h_f_s_pos_offset );
|
||||
const uint32 f_e = _uint32_sll( h_e_f_bias, h_f_e_pos_offset );
|
||||
const uint32 f_m = _uint32_sll( h_m, h_f_e_pos_offset );
|
||||
const uint32 f_em = _uint32_or( f_e, f_m );
|
||||
const uint32 h_f_m_sa = _uint32_sub( h_m_nlz, h_f_m_denorm_sa_bias );
|
||||
const uint32 f_e_denorm_unpacked = _uint32_sub( h_f_e_denorm_bias, h_f_m_sa );
|
||||
const uint32 h_f_m = _uint32_sll( h_m, h_f_m_sa );
|
||||
const uint32 f_m_denorm = _uint32_and( h_f_m, f_m_mask );
|
||||
const uint32 f_e_denorm = _uint32_sll( f_e_denorm_unpacked, f_e_pos );
|
||||
const uint32 f_em_denorm = _uint32_or( f_e_denorm, f_m_denorm );
|
||||
const uint32 f_em_nan = _uint32_or( f_e_mask, f_m );
|
||||
const uint32 is_e_eqz_msb = _uint32_dec( h_e );
|
||||
const uint32 is_m_nez_msb = _uint32_neg( h_m );
|
||||
const uint32 is_e_flagged_msb = _uint32_sub( h_e_mask_minus_one, h_e );
|
||||
const uint32 is_zero_msb = _uint32_andc( is_e_eqz_msb, is_m_nez_msb );
|
||||
const uint32 is_denorm_msb = _uint32_and( is_m_nez_msb, is_e_eqz_msb );
|
||||
const uint32 is_zero = _uint32_ext( is_zero_msb );
|
||||
const uint32 f_zero_result = _uint32_andc( f_em, is_zero );
|
||||
const uint32 f_denorm_result = _uint32_sels( is_denorm_msb, f_em_denorm, f_zero_result );
|
||||
const uint32 f_result = _uint32_or( f_s, f_denorm_result );
|
||||
|
||||
return (f_result);
|
||||
// @@ These tables could be smaller.
|
||||
static uint32 mantissa_table[2048];
|
||||
static uint32 exponent_table[64];
|
||||
static uint32 offset_table[64];
|
||||
|
||||
// Fills the three static lookup tables (mantissa_table, exponent_table,
// offset_table) that the table-based fast_half_to_float() reads.
// Takes no arguments and returns nothing; every value written is a constant
// that does not depend on any input.
void nv::half_init_tables()
|
||||
{
|
||||
// Init mantissa table.
|
||||
mantissa_table[0] = 0;
|
||||
|
||||
// Entries 1..1023 are the ones reached when offset_table yields 0.
// Each starts from i << 13 and is shifted left until bit 23 (0x00800000) is set;
// i >= 1 guarantees a set bit exists, so the inner loop terminates.
for (int i = 1; i < 1024; i++) {
|
||||
uint m = i << 13;
|
||||
uint e = 0;
|
||||
|
||||
while ((m & 0x00800000) == 0) {
|
||||
// e is unsigned, so this subtraction wraps; the += below brings it back into range.
e -= 0x00800000;
|
||||
m <<= 1;
|
||||
}
|
||||
// Drop bit 23 again, keeping only the bits below it.
m &= ~0x00800000;
|
||||
// 0x38800000 == 113 << 23.
e += 0x38800000;
|
||||
mantissa_table[i] = m | e;
|
||||
}
|
||||
|
||||
// Entries 1024..2047 are the ones reached when offset_table yields 1024:
// a fixed base of 0x38000000 (112 << 23) plus the 10-bit index shifted up by 13.
for (int i = 1024; i < 2048; i++) {
|
||||
mantissa_table[i] = 0x38000000 + ((i - 1024) << 13);
|
||||
}
|
||||
|
||||
|
||||
// Init exponent table.
|
||||
exponent_table[0] = 0;
|
||||
|
||||
for (int i = 1; i < 31; i++) {
|
||||
exponent_table[i] = (i << 23);
|
||||
}
|
||||
|
||||
// Index 31 gets a dedicated value: 0x47800000 plus the 0x38000000 base of
// mantissa_table[1024..] sums to 0x7F800000 (bits 23..30 all set).
exponent_table[31] = 0x47800000;
|
||||
// Indices 32..63 repeat the pattern of 0..31 with bit 31 (0x80000000) added.
exponent_table[32] = 0x80000000;
|
||||
|
||||
for (int i = 33; i < 63; i++) {
|
||||
exponent_table[i] = 0x80000000 + ((i - 32) << 23);
|
||||
}
|
||||
|
||||
exponent_table[63] = 0xC7800000;
|
||||
|
||||
|
||||
// Init offset table.
// Only indices 0 and 32 map to offset 0; every other index maps to 1024.
|
||||
offset_table[0] = 0;
|
||||
|
||||
for (int i = 1; i < 32; i++) {
|
||||
offset_table[i] = 1024;
|
||||
}
|
||||
|
||||
offset_table[32] = 0;
|
||||
|
||||
for (int i = 33; i < 64; i++) {
|
||||
offset_table[i] = 1024;
|
||||
}
|
||||
|
||||
// The disabled loop below is a compact form that produces the same 64 offset values.
/*for (int i = 0; i < 64; i++) {
|
||||
offset_table[i] = ((i & 31) != 0) * 1024;
|
||||
}*/
|
||||
}
|
||||
|
||||
// Fast half to float conversion based on:
|
||||
// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
|
||||
// Converts the half bit pattern h to a 32-bit pattern with two table lookups and one add.
// The tables are static arrays that half_init_tables() fills in, so that function
// has to run before the first call here; until then the lookups read
// zero-initialized storage.
// h: input bits. h >> 10 selects the exponent_table / offset_table entry
//    (0..63, assuming uint16 is 16 bits wide) and h & 0x3ff selects the
//    mantissa_table entry within the chosen offset.
// Returns the sum of the selected mantissa_table and exponent_table entries.
uint32 nv::fast_half_to_float(uint16 h)
|
||||
{
|
||||
uint exp = h >> 10;
|
||||
return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
|
||||
}
|
||||
|
||||
|
||||
#if 0
|
||||
// Inaccurate conversion suggested at the ffmpeg mailing list:
|
||||
// http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html
|
||||
uint32 nv::fast_half_to_float(uint16 v)
|
||||
{
|
||||
if (v & 0x8000) return 0;
|
||||
uint exp = v >> 10;
|
||||
if (!exp) return (v>>9)&1;
|
||||
if (exp >= 15) return 0xffff;
|
||||
v <<= 6;
|
||||
return (v+(1<<16)) >> (15-exp);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
|
||||
// Some more from a gamedev thread:
|
||||
// http://www.devmaster.net/forums/showthread.php?t=10924
|
||||
|
||||
// I believe it does not handle specials either.
|
||||
|
||||
// Mike Acton's code should be fairly easy to vectorize, and that would handle all cases too; the table method might still be faster, though.
|
||||
|
||||
|
||||
static __declspec(align(16)) unsigned half_sign[4] = {0x00008000, 0x00008000, 0x00008000, 0x00008000};
|
||||
static __declspec(align(16)) unsigned half_exponent[4] = {0x00007C00, 0x00007C00, 0x00007C00, 0x00007C00};
|
||||
static __declspec(align(16)) unsigned half_mantissa[4] = {0x000003FF, 0x000003FF, 0x000003FF, 0x000003FF};
|
||||
static __declspec(align(16)) unsigned half_bias_offset[4] = {0x0001C000, 0x0001C000, 0x0001C000, 0x0001C000};
|
||||
|
||||
__asm
|
||||
{
|
||||
movaps xmm1, xmm0 // Input in xmm0
|
||||
movaps xmm2, xmm0
|
||||
|
||||
andps xmm0, half_sign
|
||||
andps xmm1, half_exponent
|
||||
andps xmm2, half_mantissa
|
||||
paddd xmm1, half_bias_offset
|
||||
|
||||
pslld xmm0, 16
|
||||
pslld xmm1, 13
|
||||
pslld xmm2, 13
|
||||
|
||||
orps xmm1, xmm2
|
||||
orps xmm0, xmm1 // Result in xmm0
|
||||
}
|
||||
|
||||
|
||||
#endif
|
@ -9,8 +9,9 @@ namespace nv {
|
||||
uint32 half_to_float( uint16 h );
|
||||
uint16 half_from_float( uint32 f );
|
||||
|
||||
// Does not handle NaN or infinity.
|
||||
uint32 fast_half_to_float( uint16 h );
|
||||
void half_init_tables();
|
||||
|
||||
uint32 fast_half_to_float(uint16 h);
|
||||
|
||||
inline uint16 to_half(float c) {
|
||||
union { float f; uint32 u; } f;
|
||||
|
@ -9,15 +9,14 @@
|
||||
|
||||
namespace nv
|
||||
{
|
||||
enum zero_t { zero };
|
||||
enum identity_t { identity };
|
||||
|
||||
class NVMATH_CLASS Matrix3
|
||||
{
|
||||
public:
|
||||
Matrix3();
|
||||
Matrix3(zero_t);
|
||||
Matrix3(identity_t);
|
||||
explicit Matrix3(float f);
|
||||
explicit Matrix3(identity_t);
|
||||
Matrix3(const Matrix3 & m);
|
||||
Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2);
|
||||
|
||||
@ -41,10 +40,10 @@ namespace nv
|
||||
|
||||
inline Matrix3::Matrix3() {}
|
||||
|
||||
inline Matrix3::Matrix3(zero_t)
|
||||
inline Matrix3::Matrix3(float f)
|
||||
{
|
||||
for(int i = 0; i < 9; i++) {
|
||||
m_data[i] = 0.0f;
|
||||
m_data[i] = f;
|
||||
}
|
||||
}
|
||||
|
||||
@ -204,11 +203,11 @@ namespace nv
|
||||
typedef Matrix const & Arg;
|
||||
|
||||
Matrix();
|
||||
Matrix(zero_t);
|
||||
Matrix(identity_t);
|
||||
explicit Matrix(float f);
|
||||
explicit Matrix(identity_t);
|
||||
Matrix(const Matrix & m);
|
||||
Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);
|
||||
Matrix(const scalar m[]); // m is assumed to contain 16 elements
|
||||
//explicit Matrix(const scalar m[]); // m is assumed to contain 16 elements
|
||||
|
||||
scalar data(uint idx) const;
|
||||
scalar & data(uint idx);
|
||||
@ -237,7 +236,7 @@ namespace nv
|
||||
{
|
||||
}
|
||||
|
||||
inline Matrix::Matrix(zero_t)
|
||||
inline Matrix::Matrix(float f)
|
||||
{
|
||||
for(int i = 0; i < 16; i++) {
|
||||
m_data[i] = 0.0f;
|
||||
@ -268,12 +267,12 @@ namespace nv
|
||||
m_data[12] = v3.x; m_data[13] = v3.y; m_data[14] = v3.z; m_data[15] = v3.w;
|
||||
}
|
||||
|
||||
inline Matrix::Matrix(const scalar m[])
|
||||
/*inline Matrix::Matrix(const scalar m[])
|
||||
{
|
||||
for(int i = 0; i < 16; i++) {
|
||||
m_data[i] = m[i];
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
||||
|
||||
// Accessors
|
||||
@ -456,7 +455,7 @@ namespace nv
|
||||
/// Get frustum matrix.
|
||||
inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear, scalar zFar)
|
||||
{
|
||||
Matrix m(zero);
|
||||
Matrix m(0.0f);
|
||||
|
||||
scalar doubleznear = 2.0f * zNear;
|
||||
scalar one_deltax = 1.0f / (xmax - xmin);
|
||||
@ -477,7 +476,7 @@ namespace nv
|
||||
/// Get infinite frustum matrix.
|
||||
inline Matrix frustum(scalar xmin, scalar xmax, scalar ymin, scalar ymax, scalar zNear)
|
||||
{
|
||||
Matrix m(zero);
|
||||
Matrix m(0.0f);
|
||||
|
||||
scalar doubleznear = 2.0f * zNear;
|
||||
scalar one_deltax = 1.0f / (xmax - xmin);
|
||||
|
@ -100,6 +100,7 @@ namespace nv
|
||||
explicit Vector4(scalar x);
|
||||
Vector4(scalar x, scalar y, scalar z, scalar w);
|
||||
Vector4(Vector2::Arg v, scalar z, scalar w);
|
||||
Vector4(Vector2::Arg v, Vector2::Arg u);
|
||||
Vector4(Vector3::Arg v, scalar w);
|
||||
Vector4(Vector4::Arg v);
|
||||
// Vector4(const Quaternion & v);
|
||||
@ -107,6 +108,7 @@ namespace nv
|
||||
const Vector4 & operator=(Vector4::Arg v);
|
||||
|
||||
Vector2 xy() const;
|
||||
Vector2 zw() const;
|
||||
Vector3 xyz() const;
|
||||
|
||||
const scalar * ptr() const;
|
||||
@ -290,6 +292,7 @@ namespace nv
|
||||
inline Vector4::Vector4(scalar f) : x(f), y(f), z(f), w(f) {}
|
||||
inline Vector4::Vector4(scalar x, scalar y, scalar z, scalar w) : x(x), y(y), z(z), w(w) {}
|
||||
inline Vector4::Vector4(Vector2::Arg v, scalar z, scalar w) : x(v.x), y(v.y), z(z), w(w) {}
|
||||
inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {}
|
||||
inline Vector4::Vector4(Vector3::Arg v, scalar w) : x(v.x), y(v.y), z(v.z), w(w) {}
|
||||
inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
|
||||
|
||||
@ -307,6 +310,11 @@ namespace nv
|
||||
return Vector2(x, y);
|
||||
}
|
||||
|
||||
// Returns a Vector2 built from this vector's z and w members, in that order.
inline Vector2 Vector4::zw() const
|
||||
{
|
||||
return Vector2(z, w);
|
||||
}
|
||||
|
||||
inline Vector3 Vector4::xyz() const
|
||||
{
|
||||
return Vector3(x, y, z);
|
||||
@ -469,6 +477,14 @@ namespace nv
|
||||
return scale(v, 1.0f / l);
|
||||
}
|
||||
|
||||
// Safe, branchless normalization from Andy Firth. All error checking omitted.
|
||||
// http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
|
||||
// Returns v scaled by 1 / (length(v) + 1.0e-37f). No test on the length is made.
// v: vector to normalize.
// NOTE(review): length() and scale() are defined elsewhere; presumably length()
// is non-negative, so the added constant keeps the divisor above zero — confirm.
inline Vector2 normalizeFast(Vector2::Arg v)
|
||||
{
|
||||
// Tiny bias added to the length so a zero-length v does not divide by exactly zero.
const float very_small_float = 1.0e-037f;
|
||||
float l = very_small_float + length(v);
|
||||
return scale(v, 1.0f / l);
|
||||
}
|
||||
|
||||
inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON)
|
||||
{
|
||||
@ -498,6 +514,14 @@ namespace nv
|
||||
return vf;
|
||||
}
|
||||
|
||||
// Returns the 2D cross product of the edge vectors (a - c) and (b - c).
// a, b, c: the triangle's vertices.
// The result is signed: swapping a and b negates it.
// NOTE(review): no 0.5 factor is applied, so the value is twice the signed
// triangle area — confirm that callers expect this.
inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
|
||||
{
|
||||
Vector2 v0 = a - c;
|
||||
Vector2 v1 = b - c;
|
||||
|
||||
return (v0.x * v1.y - v0.y * v1.x);
|
||||
}
|
||||
|
||||
|
||||
// Vector3
|
||||
|
||||
@ -570,10 +594,10 @@ namespace nv
|
||||
return scale(v, 1.0f/s);
|
||||
}
|
||||
|
||||
inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, scalar s)
|
||||
/*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, scalar s)
|
||||
{
|
||||
return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s);
|
||||
}
|
||||
}*/
|
||||
|
||||
inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, scalar t)
|
||||
{
|
||||
@ -624,6 +648,15 @@ namespace nv
|
||||
return scale(v, 1.0f / l);
|
||||
}
|
||||
|
||||
// Safe, branchless normalization from Andy Firth. All error checking omitted.
|
||||
// http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
|
||||
// Returns v scaled by 1 / (length(v) + 1.0e-37f). No test on the length is made.
// v: vector to normalize.
// NOTE(review): length() and scale() are defined elsewhere; presumably length()
// is non-negative, so the added constant keeps the divisor above zero — confirm.
inline Vector3 normalizeFast(Vector3::Arg v)
|
||||
{
|
||||
// Tiny bias added to the length so a zero-length v does not divide by exactly zero.
const float very_small_float = 1.0e-037f;
|
||||
float l = very_small_float + length(v);
|
||||
return scale(v, 1.0f / l);
|
||||
}
|
||||
|
||||
inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON)
|
||||
{
|
||||
return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon);
|
||||
@ -762,6 +795,15 @@ namespace nv
|
||||
return scale(v, 1.0f / l);
|
||||
}
|
||||
|
||||
// Safe, branchless normalization from Andy Firth. All error checking omitted.
|
||||
// http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
|
||||
// Returns v scaled by 1 / (length(v) + 1.0e-37f). No test on the length is made.
// v: vector to normalize.
// NOTE(review): length() and scale() are defined elsewhere; presumably length()
// is non-negative, so the added constant keeps the divisor above zero — confirm.
inline Vector4 normalizeFast(Vector4::Arg v)
|
||||
{
|
||||
// Tiny bias added to the length so a zero-length v does not divide by exactly zero.
const float very_small_float = 1.0e-037f;
|
||||
float l = very_small_float + length(v);
|
||||
return scale(v, 1.0f / l);
|
||||
}
|
||||
|
||||
inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON)
|
||||
{
|
||||
return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon);
|
||||
|
@ -4,8 +4,9 @@
|
||||
#ifndef NV_MATH_H
|
||||
#define NV_MATH_H
|
||||
|
||||
#include <nvcore/nvcore.h>
|
||||
#include <nvcore/Debug.h>
|
||||
#include "nvcore/nvcore.h"
|
||||
#include "nvcore/Debug.h"
|
||||
#include "nvcore/Utils.h" // clamp
|
||||
|
||||
#include <math.h>
|
||||
#include <limits.h> // INT_MAX
|
||||
@ -194,7 +195,7 @@ namespace nv
|
||||
return f - floor(f);
|
||||
}
|
||||
|
||||
inline float fround(float f)
|
||||
inline float fround(float f) // @@ rename floatRound
|
||||
{
|
||||
// @@ Do something better.
|
||||
return float(iround(f));
|
||||
@ -210,6 +211,29 @@ namespace nv
|
||||
}
|
||||
}
|
||||
|
||||
// Returns f limited to the interval [0, 1] by passing it to clamp() with bounds 0.0f and 1.0f.
inline float saturate(float f) {
|
||||
return clamp(f, 0.0f, 1.0f);
|
||||
}
|
||||
|
||||
// Maps x linearly so that edge0 becomes 0 and edge1 becomes 1, then saturates the result.
// edge0, edge1: the input values mapped to 0 and 1 respectively.
// x: value to remap.
// NOTE(review): there is no guard for edge0 == edge1; in that case the
// expression divides by zero before saturate() is applied.
inline float linearstep(float edge0, float edge1, float x) {
|
||||
// Scale, bias and saturate x to 0..1 range
|
||||
return saturate((x - edge0) / (edge1 - edge0));
|
||||
}
|
||||
|
||||
// Remaps x with linearstep(edge0, edge1, x) and then applies the cubic 3t^2 - 2t^3 to the result t.
// edge0, edge1, x: forwarded unchanged to linearstep().
inline float smoothstep(float edge0, float edge1, float x) {
|
||||
// The parameter x is reused to hold the remapped value.
x = linearstep(edge0, edge1, x);
|
||||
|
||||
// Evaluate polynomial
|
||||
return x*x*(3 - 2*x);
|
||||
}
|
||||
|
||||
// Returns 1 when a > 0, -1 when a < 0, and 0 otherwise.
// The final case covers zero and any value for which both comparisons are false (such as NaN).
inline int sign(float a)
|
||||
{
|
||||
if (a > 0.0f) return 1;
|
||||
if (a < 0.0f) return -1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // nv
|
||||
|
||||
#endif // NV_MATH_H
|
||||
|
Reference in New Issue
Block a user