Merge changes from the witness.

This commit is contained in:
castano
2014-11-04 17:49:29 +00:00
parent 4cb60cc5ba
commit d019cd7080
86 changed files with 3534 additions and 882 deletions

View File

@ -39,7 +39,7 @@ namespace nv
// Build a cube centered on center and with edge = 2*dist
inline void Box::cube(const Vector3 & center, float dist)
{
setCenterExtents(center, Vector3(dist, dist, dist));
setCenterExtents(center, Vector3(dist));
}
// Build a box, given center and extents.
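The new one-argument call assumes Vector3 exposes a scalar constructor that broadcasts one value to every component; a minimal sketch of that assumed overload (not the library's actual declaration):

// Assumed broadcast constructor, so that Vector3(dist) == Vector3(dist, dist, dist).
struct Vector3 {
    float x, y, z;
    explicit Vector3(float f) : x(f), y(f), z(f) {}            // broadcast
    Vector3(float x, float y, float z) : x(x), y(y), z(z) {}
};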

View File

@ -89,6 +89,7 @@ namespace nv
uint8 b: 8;
#endif
};
uint8 component[4];
uint32 u;
};
};
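The added component[4] member gives a third view of the same 32-bit pixel, next to the named channels and the packed integer. A hypothetical usage sketch (which byte maps to which channel follows the endian-specific struct above):

Color32 c;
c.u = 0xFF336699;                          // write through the packed 32-bit view
uint8 green = c.g;                         // read a named channel
for (int i = 0; i < 4; i++) {
    c.component[i] = 255 - c.component[i]; // or address channels by index
}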

View File

@ -6,6 +6,7 @@
#include "Color.h"
#include "Vector.inl"
#include "ftoi.h"
namespace nv
@ -123,30 +124,30 @@ namespace nv
inline Color32 toColor32(const Vector4 & v)
{
Color32 color;
color.r = toU8(nv::iround(saturate(v.x) * 255));
color.g = toU8(nv::iround(saturate(v.y) * 255));
color.b = toU8(nv::iround(saturate(v.z) * 255));
color.a = toU8(nv::iround(saturate(v.w) * 255));
color.r = U8(ftoi_round(saturate(v.x) * 255));
color.g = U8(ftoi_round(saturate(v.y) * 255));
color.b = U8(ftoi_round(saturate(v.z) * 255));
color.a = U8(ftoi_round(saturate(v.w) * 255));
return color;
}
inline Color32 toColor32_from_bgra(const Vector4 & v)
{
Color32 color;
color.b = toU8(nv::iround(saturate(v.x) * 255));
color.g = toU8(nv::iround(saturate(v.y) * 255));
color.r = toU8(nv::iround(saturate(v.z) * 255));
color.a = toU8(nv::iround(saturate(v.w) * 255));
color.b = U8(ftoi_round(saturate(v.x) * 255));
color.g = U8(ftoi_round(saturate(v.y) * 255));
color.r = U8(ftoi_round(saturate(v.z) * 255));
color.a = U8(ftoi_round(saturate(v.w) * 255));
return color;
}
inline Color32 toColor32_from_argb(const Vector4 & v)
{
Color32 color;
color.a = toU8(nv::iround(saturate(v.x) * 255));
color.r = toU8(nv::iround(saturate(v.y) * 255));
color.g = toU8(nv::iround(saturate(v.z) * 255));
color.b = toU8(nv::iround(saturate(v.w) * 255));
color.a = U8(ftoi_round(saturate(v.x) * 255));
color.r = U8(ftoi_round(saturate(v.y) * 255));
color.g = U8(ftoi_round(saturate(v.z) * 255));
color.b = U8(ftoi_round(saturate(v.w) * 255));
return color;
}
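A brief usage sketch of the rewritten conversion; inputs are assumed to lie in [0,1] (saturate clamps anything outside), and the midpoint result assumes round-to-nearest-even:

Vector4 v(0.25f, 0.5f, 0.75f, 1.0f);
Color32 c = toColor32(v);
// c.r == 64 (63.75 rounds up), c.g == 128 (127.5 rounds to even),
// c.b == 191 (191.25 rounds down), c.a == 255.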

View File

@ -4,10 +4,11 @@
#include "Vector.inl"
#include "Plane.inl"
#include "nvcore/Array.inl"
#include "nvcore/Utils.h" // max, swap
#include <float.h> // FLT_MAX
#include <vector>
//#include <vector>
#include <string.h>
using namespace nv;
@ -329,7 +330,7 @@ void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R);
Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict points)
{
// Store the points in an n x n matrix
std::vector<float> Q(n*n, 0.0f);
Array<float> Q; Q.resize(n*n, 0.0f);
for (int i = 0; i < n; ++i)
{
Q[i*n+0] = points[i].x;
@ -338,8 +339,8 @@ Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict
}
// Alloc space for the SVD outputs
std::vector<float> diag(n, 0.0f);
std::vector<float> R(n*n, 0.0f);
Array<float> diag; diag.resize(n, 0.0f);
Array<float> R; R.resize(n*n, 0.0f);
ArvoSVD(n, n, &Q[0], &diag[0], &R[0]);
@ -350,7 +351,7 @@ Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict
Vector4 nv::Fit::computePrincipalComponent_SVD(int n, const Vector4 *__restrict points)
{
// Store the points in an n x n matrix
std::vector<float> Q(n*n, 0.0f);
Array<float> Q; Q.resize(n*n, 0.0f);
for (int i = 0; i < n; ++i)
{
Q[i*n+0] = points[i].x;
@ -360,8 +361,8 @@ Vector4 nv::Fit::computePrincipalComponent_SVD(int n, const Vector4 *__restrict
}
// Alloc space for the SVD outputs
std::vector<float> diag(n, 0.0f);
std::vector<float> R(n*n, 0.0f);
Array<float> diag; diag.resize(n, 0.0f);
Array<float> R; R.resize(n*n, 0.0f);
ArvoSVD(n, n, &Q[0], &diag[0], &R[0]);
@ -940,7 +941,7 @@ void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R)
float g = 0.0f;
float scale = 0.0f;
std::vector<float> temp(cols, 0.0f);
Array<float> temp; temp.resize(cols, 0.0f);
for( i = 0; i < cols; i++ )
{
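A hypothetical call sketch for the SVD-based fitting shown in this file; computePrincipalComponent_SVD returns the direction of largest variance of the point set:

// Four collinear points; the principal component should be (+/-)(1, 0, 0).
Vector3 points[4] = {
    Vector3(0, 0, 0), Vector3(1, 0, 0),
    Vector3(2, 0, 0), Vector3(3, 0, 0)
};
Vector3 axis = Fit::computePrincipalComponent_SVD(4, points);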

View File

@ -580,56 +580,56 @@ namespace nv {
void nv::half_init_tables()
{
// Init mantissa table.
mantissa_table[0] = 0;

// denormals
for (int i = 1; i < 1024; i++) {
    uint m = i << 13;
    uint e = 0;

    while ((m & 0x00800000) == 0) {
        e -= 0x00800000;
        m <<= 1;
    }
    m &= ~0x00800000;
    e += 0x38800000;
    mantissa_table[i] = m | e;
}

// normals
for (int i = 1024; i < 2048; i++) {
    mantissa_table[i] = (i - 1024) << 13;
}

// Init exponent table.
exponent_table[0] = 0;

for (int i = 1; i < 31; i++) {
    exponent_table[i] = 0x38000000 + (i << 23);
}

exponent_table[31] = 0x7f800000;
exponent_table[32] = 0x80000000;

for (int i = 33; i < 63; i++) {
    exponent_table[i] = 0xb8000000 + ((i - 32) << 23);
}

exponent_table[63] = 0xff800000;

// Init offset table.
offset_table[0] = 0;

for (int i = 1; i < 32; i++) {
    offset_table[i] = 1024;
}

offset_table[32] = 0;

for (int i = 33; i < 64; i++) {
    offset_table[i] = 1024;
}
}
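These inits follow the table-based half-to-float scheme (van der Zijp); the lookup that consumes them amounts to something like the sketch below, with fast_half_to_float in the next hunk being the real implementation:

// Sketch of the lookup fed by the three tables above.
uint32 half_to_float_bits(uint16 h) {
    return mantissa_table[offset_table[h >> 10] + (h & 0x3ff)] + exponent_table[h >> 10];
}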
@ -660,27 +660,27 @@ uint32 nv::fast_half_to_float(uint16 v)
// Mike Acton's code should be fairly easy to vectorize, and that would handle all cases too; the table method might still be faster, though.
static __declspec(align(16)) unsigned half_sign[4] = {0x00008000, 0x00008000, 0x00008000, 0x00008000};
static __declspec(align(16)) unsigned half_exponent[4] = {0x00007C00, 0x00007C00, 0x00007C00, 0x00007C00};
static __declspec(align(16)) unsigned half_mantissa[4] = {0x000003FF, 0x000003FF, 0x000003FF, 0x000003FF};
static __declspec(align(16)) unsigned half_bias_offset[4] = {0x0001C000, 0x0001C000, 0x0001C000, 0x0001C000};
__asm
{
    movaps xmm1, xmm0 // Input in xmm0
    movaps xmm2, xmm0
    andps xmm0, half_sign
    andps xmm1, half_exponent
    andps xmm2, half_mantissa
    paddd xmm1, half_bias_offset
    pslld xmm0, 16
    pslld xmm1, 13
    pslld xmm2, 13
    orps xmm1, xmm2
    orps xmm0, xmm1 // Result in xmm0
}

View File

@ -7,6 +7,10 @@
#include <float.h>
#if !NV_CC_MSVC && !NV_OS_ORBIS
#include <alloca.h>
#endif
using namespace nv;
@ -20,8 +24,7 @@ static bool ludcmp(float **a, int n, int *indx, float *d)
{
const float TINY = 1.0e-20f;
Array<float> vv; // vv stores the implicit scaling of each row.
vv.resize(n);
float * vv = (float*)alloca(sizeof(float) * n); // vv stores the implicit scaling of each row.
*d = 1.0; // No row interchanges yet.
for (int i = 0; i < n; i++) { // Loop over rows to get the implicit scaling information.
@ -149,6 +152,21 @@ bool nv::solveLU(const Matrix & A, const Vector4 & b, Vector4 * x)
return true;
}
// @@ Not tested.
Matrix nv::inverseLU(const Matrix & A)
{
Vector4 Ai[4];
solveLU(A, Vector4(1, 0, 0, 0), &Ai[0]);
solveLU(A, Vector4(0, 1, 0, 0), &Ai[1]);
solveLU(A, Vector4(0, 0, 1, 0), &Ai[2]);
solveLU(A, Vector4(0, 0, 0, 1), &Ai[3]);
return Matrix(Ai[0], Ai[1], Ai[2], Ai[3]);
}
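Each solveLU call recovers one column of the inverse (solving A x = e_i yields column i of A^-1). A hedged sanity sketch, round-tripping a vector:

Matrix A(identity);        // stand-in; any invertible matrix works here
Vector4 v(1, 2, 3, 1);
Vector4 w = transform(inverseLU(A), transform(A, v));   // expect w ~= v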
bool nv::solveLU(const Matrix3 & A, const Vector3 & b, Vector3 * x)
{
nvDebugCheck(x != NULL);
@ -184,7 +202,7 @@ bool nv::solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x)
{
nvDebugCheck(x != NULL);
*x = transform(inverse(A), b);
*x = transform(inverseCramer(A), b);
return true; // @@ Return false if determinant(A) == 0 !
}
@ -198,7 +216,7 @@ bool nv::solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x)
return false;
}
Matrix3 Ai = inverse(A);
Matrix3 Ai = inverseCramer(A);
*x = transform(Ai, b);
@ -207,6 +225,119 @@ bool nv::solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x)
// Inverse using Gaussian elimination. From Jon's code.
Matrix nv::inverse(const Matrix & m) {
Matrix A = m;
Matrix B(identity);
int i, j, k;
float max, t, det, pivot;
det = 1.0;
for (i=0; i<4; i++) { /* eliminate in column i, below diag */
max = -1.;
for (k=i; k<4; k++) /* find pivot for column i */
if (fabs(A(k, i)) > max) {
max = fabs(A(k, i));
j = k;
}
if (max<=0.) return B; /* if no nonzero pivot, PUNT */
if (j!=i) { /* swap rows i and j */
for (k=i; k<4; k++)
swap(A(i, k), A(j, k));
for (k=0; k<4; k++)
swap(B(i, k), B(j, k));
det = -det;
}
pivot = A(i, i);
det *= pivot;
for (k=i+1; k<4; k++) /* only do elems to right of pivot */
A(i, k) /= pivot;
for (k=0; k<4; k++)
B(i, k) /= pivot;
/* we know that A(i, i) will be set to 1, so don't bother to do it */
for (j=i+1; j<4; j++) { /* eliminate in rows below i */
t = A(j, i); /* we're gonna zero this guy */
for (k=i+1; k<4; k++) /* subtract scaled row i from row j */
A(j, k) -= A(i, k)*t; /* (ignore k<=i, we know they're 0) */
for (k=0; k<4; k++)
B(j, k) -= B(i, k)*t;
}
}
/*---------- backward elimination ----------*/
for (i=4-1; i>0; i--) { /* eliminate in column i, above diag */
for (j=0; j<i; j++) { /* eliminate in rows above i */
t = A(j, i); /* we're gonna zero this guy */
for (k=0; k<4; k++) /* subtract scaled row i from row j */
B(j, k) -= B(i, k)*t;
}
}
return B;
}
Matrix3 nv::inverse(const Matrix3 & m) {
Matrix3 A = m;
Matrix3 B(identity);
int i, j, k;
float max, t, det, pivot;
det = 1.0;
for (i=0; i<3; i++) { /* eliminate in column i, below diag */
max = -1.;
for (k=i; k<3; k++) /* find pivot for column i */
if (fabs(A(k, i)) > max) {
max = fabs(A(k, i));
j = k;
}
if (max<=0.) return B; /* if no nonzero pivot, PUNT */
if (j!=i) { /* swap rows i and j */
for (k=i; k<3; k++)
swap(A(i, k), A(j, k));
for (k=0; k<3; k++)
swap(B(i, k), B(j, k));
det = -det;
}
pivot = A(i, i);
det *= pivot;
for (k=i+1; k<3; k++) /* only do elems to right of pivot */
A(i, k) /= pivot;
for (k=0; k<3; k++)
B(i, k) /= pivot;
/* we know that A(i, i) will be set to 1, so don't bother to do it */
for (j=i+1; j<3; j++) { /* eliminate in rows below i */
t = A(j, i); /* we're gonna zero this guy */
for (k=i+1; k<3; k++) /* subtract scaled row i from row j */
A(j, k) -= A(i, k)*t; /* (ignore k<=i, we know they're 0) */
for (k=0; k<3; k++)
B(j, k) -= B(i, k)*t;
}
}
/*---------- backward elimination ----------*/
for (i=3-1; i>0; i--) { /* eliminate in column i, above diag */
for (j=0; j<i; j++) { /* eliminate in rows above i */
t = A(j, i); /* we're gonna zero this guy */
for (k=0; k<3; k++) /* subtract scaled row i from row j */
B(j, k) -= B(i, k)*t;
}
}
return B;
}
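Forward elimination with partial pivoting is mirrored on B, and the backward pass finishes reducing A to the identity, at which point B holds the inverse; note the early `return B` hands back a partially reduced matrix when no nonzero pivot exists. A hedged usage sketch for the Matrix3 path:

Matrix3 S(identity);
S(0, 0) = 2.0f; S(1, 1) = 4.0f; S(2, 2) = 8.0f;   // simple scaling matrix
Matrix3 Si = inverse(S);                          // expect diag(0.5, 0.25, 0.125)
Vector3 v = transform(Si, Vector3(2, 4, 8));      // expect v ~= (1, 1, 1)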
#if 0

View File

@ -83,6 +83,9 @@ namespace nv
void rotate(float theta, float v0, float v1, float v2);
float determinant() const;
void operator+=(const Matrix & m);
void operator-=(const Matrix & m);
void apply(Matrix::Arg m);
private:
@ -90,11 +93,18 @@ namespace nv
};
// Solve equation system using LU decomposition and back-substitution.
extern bool solveLU(const Matrix & m, const Vector4 & b, Vector4 * x);
extern bool solveLU(const Matrix & A, const Vector4 & b, Vector4 * x);
// Solve equation system using Cramer's inverse.
extern bool solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x);
// Compute inverse using LU decomposition.
extern Matrix inverseLU(const Matrix & m);
// Compute inverse using Gaussian elimination and partial pivoting.
extern Matrix inverse(const Matrix & m);
extern Matrix3 inverse(const Matrix3 & m);
} // nv namespace
#endif // NV_MATH_MATRIX_H

View File

@ -195,7 +195,7 @@ namespace nv
}
// Inverse using Cramer's rule.
inline Matrix3 inverse(const Matrix3 & m)
inline Matrix3 inverseCramer(const Matrix3 & m)
{
const float det = m.determinant();
if (equal(det, 0.0f, 0.0f)) {
@ -477,6 +477,25 @@ namespace nv
return m;
}
// Get inverse frustum matrix.
inline Matrix frustumInverse(float xmin, float xmax, float ymin, float ymax, float zNear, float zFar)
{
Matrix m(0.0f);
float one_doubleznear = 1.0f / (2.0f * zNear);
float one_doubleznearzfar = 1.0f / (2.0f * zNear * zFar);
m(0,0) = (xmax - xmin) * one_doubleznear;
m(0,3) = (xmax + xmin) * one_doubleznear;
m(1,1) = (ymax - ymin) * one_doubleznear;
m(1,3) = (ymax + ymin) * one_doubleznear;
m(2,3) = -1;
m(3,2) = -(zFar - zNear) * one_doubleznearzfar;
m(3,3) = (zFar + zNear) * one_doubleznearzfar;
return m;
}
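This is the closed-form inverse of the frustum matrix built elsewhere in this header, exploiting its sparsity. A hedged round-trip sketch:

Matrix P  = frustum(-1, 1, -1, 1, 0.1f, 100.0f);
Matrix Pi = frustumInverse(-1, 1, -1, 1, 0.1f, 100.0f);
Vector4 clip = transform(P, Vector4(0.5f, 0.5f, -1.0f, 1.0f));
Vector4 back = transform(Pi, clip);    // expect back ~= (0.5, 0.5, -1, 1)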
// Get infinite frustum matrix.
inline Matrix frustum(float xmin, float xmax, float ymin, float ymax, float zNear)
{
@ -510,6 +529,18 @@ namespace nv
return frustum(xmin, xmax, ymin, ymax, zNear, zFar);
}
// Get inverse perspective matrix.
inline Matrix perspectiveInverse(float fovy, float aspect, float zNear, float zFar)
{
float xmax = zNear * tan(fovy / 2);
float xmin = -xmax;
float ymax = xmax / aspect;
float ymin = -ymax;
return frustumInverse(xmin, xmax, ymin, ymax, zNear, zFar);
}
// Get infinite perspective matrix.
inline Matrix perspective(float fovy, float aspect, float zNear)
{
@ -544,7 +575,7 @@ namespace nv
}
// Inverse using Cramer's rule.
inline Matrix inverse(Matrix::Arg m)
inline Matrix inverseCramer(Matrix::Arg m)
{
Matrix r;
r.data( 0) = m.data(6)*m.data(11)*m.data(13) - m.data(7)*m.data(10)*m.data(13) + m.data(7)*m.data(9)*m.data(14) - m.data(5)*m.data(11)*m.data(14) - m.data(6)*m.data(9)*m.data(15) + m.data(5)*m.data(10)*m.data(15);
@ -622,6 +653,35 @@ namespace nv
return m;
}
inline void Matrix::operator+=(const Matrix & m)
{
for(int i = 0; i < 16; i++) {
m_data[i] += m.m_data[i];
}
}
inline void Matrix::operator-=(const Matrix & m)
{
for(int i = 0; i < 16; i++) {
m_data[i] -= m.m_data[i];
}
}
inline Matrix operator+(const Matrix & a, const Matrix & b)
{
Matrix m = a;
m += b;
return m;
}
inline Matrix operator-(const Matrix & a, const Matrix & b)
{
Matrix m = a;
m -= b;
return m;
}
} // nv namespace

View File

@ -12,13 +12,13 @@
# if NV_CPU_X86 || NV_CPU_X86_64
# define NV_USE_SSE 2
# endif
//# if defined(__SSE2__)
//# define NV_USE_SSE 2
//# elif defined(__SSE__)
//# define NV_USE_SSE 1
//# else
//# define NV_USE_SSE 0
//# endif
# if defined(__SSE2__)
# define NV_USE_SSE 2
# elif defined(__SSE__)
# define NV_USE_SSE 1
# else
# define NV_USE_SSE 0
# endif
#endif
// Internally set NV_USE_SIMD when either altivec or sse is available.

View File

@ -144,6 +144,6 @@ namespace nv
// Instead we simply have explicit casts:
template <typename T> T to(const nv::Vector2 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector2)); return T(v.x, v.y); }
template <typename T> T to(const nv::Vector3 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector3)); return T(v.x, v.y, v.z); }
template <typename T> T to(const nv::Vector4 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4)); return T(v.x, v.y, v.z, v.z); }
template <typename T> T to(const nv::Vector4 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4)); return T(v.x, v.y, v.z, v.w); }
#endif // NV_MATH_VECTOR_H
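The fix replaces a copy-pasted v.z with v.w in the fourth component. A sketch with a hypothetical target type (Float4 is illustrative, not part of the library):

struct Float4 {
    float x, y, z, w;
    Float4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
};

Float4 f = to<Float4>(Vector4(1, 2, 3, 4));   // f.w is now 4, not 3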

View File

@ -440,14 +440,17 @@ namespace nv
}
// Note, this is the area scaled by 2!
inline float triangleArea(Vector2::Arg v0, Vector2::Arg v1)
{
return (v0.x * v1.y - v0.y * v1.x); // * 0.5f;
}
inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
{
Vector2 v0 = a - c;
Vector2 v1 = b - c;
return (v0.x * v1.y - v0.y * v1.x);
return (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y); // * 0.5f;
//return triangleArea(a-c, b-c);
}
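The new three-point body expands the 2D cross product of the edge vectors (the shoelace form), avoiding the two temporaries; both forms agree. A worked check:

// a = (0,0), b = (2,0), c = (0,2):
// edge form:     v0 = a-c = (0,-2), v1 = b-c = (2,-2)  ->  0*(-2) - (-2)*2 = 4
// shoelace form: 0*0 + 0*0 + 2*2 - 2*0 - 0*0 - 0*2     =   4
// Both return twice the area; the triangle's area is 2.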
template <>
inline uint hash(const Vector2 & v, uint h)
{

src/nvmath/ftoi.h (new executable file, 256 lines added)
View File

@ -0,0 +1,256 @@
// This code is in the public domain -- castano@gmail.com
#pragma once
#ifndef NV_MATH_FTOI_H
#define NV_MATH_FTOI_H
#include "nvmath/nvmath.h"
#include <math.h>
namespace nv
{
// Optimized float to int conversions. See:
// http://cbloomrants.blogspot.com/2009/01/01-17-09-float-to-int.html
// http://www.stereopsis.com/sree/fpu2006.html
// http://assemblyrequired.crashworks.org/2009/01/12/why-you-should-never-cast-floats-to-ints/
// http://chrishecker.com/Miscellaneous_Technical_Articles#Floating_Point
union DoubleAnd64 {
uint64 i;
double d;
};
static const double floatutil_xs_doublemagic = (6755399441055744.0); // 2^52 * 1.5
static const double floatutil_xs_doublemagicdelta = (1.5e-8); // almost .5f = .5f + 1e^(number of exp bit)
static const double floatutil_xs_doublemagicroundeps = (0.5f - floatutil_xs_doublemagicdelta); // almost .5f = .5f - 1e^(number of exp bit)
NV_FORCEINLINE int ftoi_round_xs(double val, double magic) {
#if 1
DoubleAnd64 dunion;
dunion.d = val + magic;
return (int32) dunion.i; // just cast to grab the bottom bits
#else
val += magic;
return ((int*)&val)[0]; // @@ Assumes little endian.
#endif
}
NV_FORCEINLINE int ftoi_round_xs(float val) {
return ftoi_round_xs(val, floatutil_xs_doublemagic);
}
NV_FORCEINLINE int ftoi_floor_xs(float val) {
return ftoi_round_xs(val - floatutil_xs_doublemagicroundeps, floatutil_xs_doublemagic);
}
NV_FORCEINLINE int ftoi_ceil_xs(float val) {
return ftoi_round_xs(val + floatutil_xs_doublemagicroundeps, floatutil_xs_doublemagic);
}
NV_FORCEINLINE int ftoi_trunc_xs(float val) {
return (val<0) ? ftoi_ceil_xs(val) : ftoi_floor_xs(val);
}
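The magic-number trick works because adding 2^52 * 1.5 pins the double's exponent, so the addition rounds at integer granularity and the integer result lands in the low mantissa bits, where the 32-bit read picks it up. A worked trace, assuming IEEE-754 doubles with round-to-nearest:

// ftoi_round_xs(3.7f):
//   3.7 + 6755399441055744.0 = 6755399441055747.7, which rounds to 6755399441055748.0
//   (representable doubles are 1.0 apart at this magnitude);
//   the low 32 bits of that double's bit pattern hold 4, so the int32 cast returns 4 == round(3.7).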
#if NV_CPU_X86 || NV_CPU_X86_64
NV_FORCEINLINE int ftoi_round_sse(float f) {
return _mm_cvt_ss2si(_mm_set_ss(f));
}
NV_FORCEINLINE int ftoi_trunc_sse(float f) {
return _mm_cvtt_ss2si(_mm_set_ss(f));
}
#endif
#if NV_USE_SSE
NV_FORCEINLINE int ftoi_round(float val) {
return ftoi_round_sse(val);
}
NV_FORCEINLINE int ftoi_trunc(float f) {
return ftoi_trunc_sse(f);
}
// We can probably do better than this. See for example:
// http://dss.stephanierct.com/DevBlog/?p=8
NV_FORCEINLINE int ftoi_floor(float val) {
return ftoi_round(floorf(val));
}
NV_FORCEINLINE int ftoi_ceil(float val) {
return ftoi_round(ceilf(val));
}
#else
// In theory this should work with any double floating point math implementation, but it appears that MSVC produces incorrect code
// when SSE2 is targeted and fast math is enabled (/arch:SSE2 & /fp:fast). These problems go away with /fp:precise, which is the default mode.
NV_FORCEINLINE int ftoi_round(float val) {
return ftoi_round_xs(val);
}
NV_FORCEINLINE int ftoi_floor(float val) {
return ftoi_floor_xs(val);
}
NV_FORCEINLINE int ftoi_ceil(float val) {
return ftoi_ceil_xs(val);
}
NV_FORCEINLINE int ftoi_trunc(float f) {
return ftoi_trunc_xs(f);
}
#endif
inline void test_ftoi() {
// Round to nearest integer.
nvCheck(ftoi_round(0.1f) == 0);
nvCheck(ftoi_round(0.6f) == 1);
nvCheck(ftoi_round(-0.2f) == 0);
nvCheck(ftoi_round(-0.7f) == -1);
nvCheck(ftoi_round(10.1f) == 10);
nvCheck(ftoi_round(10.6f) == 11);
nvCheck(ftoi_round(-90.1f) == -90);
nvCheck(ftoi_round(-90.6f) == -91);
nvCheck(ftoi_round(0) == 0);
nvCheck(ftoi_round(1) == 1);
nvCheck(ftoi_round(-1) == -1);
nvCheck(ftoi_round(0.5f) == 0); // How are midpoints rounded? Banker's rounding (round half to even).
nvCheck(ftoi_round(1.5f) == 2);
nvCheck(ftoi_round(2.5f) == 2);
nvCheck(ftoi_round(3.5f) == 4);
nvCheck(ftoi_round(4.5f) == 4);
nvCheck(ftoi_round(-0.5f) == 0);
nvCheck(ftoi_round(-1.5f) == -2);
// Truncation (round down if > 0, round up if < 0).
nvCheck(ftoi_trunc(0.1f) == 0);
nvCheck(ftoi_trunc(0.6f) == 0);
nvCheck(ftoi_trunc(-0.2f) == 0);
nvCheck(ftoi_trunc(-0.7f) == 0); // @@ When using /arch:SSE2 in Win32, msvc produces wrong code for this one; it skips the addition.
nvCheck(ftoi_trunc(1.99f) == 1);
nvCheck(ftoi_trunc(-1.2f) == -1);
// Floor (round down).
nvCheck(ftoi_floor(0.1f) == 0);
nvCheck(ftoi_floor(0.6f) == 0);
nvCheck(ftoi_floor(-0.2f) == -1);
nvCheck(ftoi_floor(-0.7f) == -1);
nvCheck(ftoi_floor(1.99f) == 1);
nvCheck(ftoi_floor(-1.2f) == -2);
nvCheck(ftoi_floor(0) == 0);
nvCheck(ftoi_floor(1) == 1);
nvCheck(ftoi_floor(-1) == -1);
nvCheck(ftoi_floor(2) == 2);
nvCheck(ftoi_floor(-2) == -2);
// Ceil (round up).
nvCheck(ftoi_ceil(0.1f) == 1);
nvCheck(ftoi_ceil(0.6f) == 1);
nvCheck(ftoi_ceil(-0.2f) == 0);
nvCheck(ftoi_ceil(-0.7f) == 0);
nvCheck(ftoi_ceil(1.99f) == 2);
nvCheck(ftoi_ceil(-1.2f) == -1);
nvCheck(ftoi_ceil(0) == 0);
nvCheck(ftoi_ceil(1) == 1);
nvCheck(ftoi_ceil(-1) == -1);
nvCheck(ftoi_ceil(2) == 2);
nvCheck(ftoi_ceil(-2) == -2);
}
// Safe versions using standard casts.
inline int iround(float f)
{
return int(floorf(f + 0.5f));
}
inline int iround(double f)
{
return int(::floor(f + 0.5));
}
inline int ifloor(float f)
{
return int(floorf(f));
}
inline int iceil(float f)
{
return int(ceilf(f));
}
// I'm always confused about which quantizer to use. I think we should choose a quantizer based on how the values are expanded later, and that generally means using the 'exact endpoints' rule.
// Some notes from cbloom: http://cbloomrants.blogspot.com/2011/07/07-26-11-pixel-int-to-float-options.html
// Quantize a float in the [0,1] range, using exact end points or uniform bins.
inline float quantizeFloat(float x, uint bits, bool exactEndPoints = true) {
nvDebugCheck(bits <= 16);
float range = float(1 << bits);
if (exactEndPoints) {
return floorf(x * (range-1) + 0.5f) / (range-1);
}
else {
return (floorf(x * range) + 0.5f) / range;
}
}
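A worked comparison of the two modes for x = 0.3 at 2 bits (range = 4):

// exact end points: floorf(0.3f * 3 + 0.5f) / 3 = floorf(1.4f) / 3 = 1.0f / 3 ~= 0.3333  (grid 0, 1/3, 2/3, 1)
// uniform bins:     (floorf(0.3f * 4) + 0.5f) / 4 = 1.5f / 4 = 0.375                     (grid 1/8, 3/8, 5/8, 7/8)
float q0 = quantizeFloat(0.3f, 2);          // ~0.3333f
float q1 = quantizeFloat(0.3f, 2, false);   // 0.375f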
// This is the most common rounding mode:
//
//    0     1     2     3
// |___|_______|_______|___|
// 0                       1
//
// You get that if you take the unit floating point number, multiply by 'N-1', and round to nearest. That is, `i = round(f * (N-1))`.
// You reconstruct the original float by dividing by 'N-1': `f = i / (N-1)`

// With uniform bins, the reconstruction points sit at the bin centers instead:
//
//    0     1     2     3
// |_____|_____|_____|_____|
// 0                       1
/*enum BinningMode {
RoundMode_ExactEndPoints,
RoundMode_UniformBins,
};*/
template <int N>
inline uint unitFloatToFixed(float f) {
return ftoi_round(f * ((1<<N)-1));
}
inline uint8 unitFloatToFixed8(float f) {
return (uint8)unitFloatToFixed<8>(f);
}
inline uint16 unitFloatToFixed16(float f) {
return (uint16)unitFloatToFixed<16>(f);
}
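A short usage sketch; the midpoint case inherits ftoi_round's round-to-even behavior:

uint8  a = unitFloatToFixed8(1.0f);     // 255
uint8  b = unitFloatToFixed8(0.5f);     // 0.5 * 255 = 127.5 -> 128 under round-to-even
uint16 c = unitFloatToFixed16(0.25f);   // 0.25 * 65535 = 16383.75 -> 16384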
} // nv
#endif // NV_MATH_FTOI_H

View File

@ -14,6 +14,13 @@
#include <float.h> // finite, isnan
#endif
#if NV_CPU_X86 || NV_CPU_X86_64
//#include <intrin.h>
#include <xmmintrin.h>
#endif
// Function linkage
#if NVMATH_SHARED
#ifdef NVMATH_EXPORTS
@ -28,6 +35,37 @@
#define NVMATH_CLASS
#endif // NVMATH_SHARED
// Set some reasonable defaults.
#ifndef NV_USE_ALTIVEC
# define NV_USE_ALTIVEC NV_CPU_PPC
//# define NV_USE_ALTIVEC defined(__VEC__)
#endif
#ifndef NV_USE_SSE
# if NV_CPU_X86_64
// x64 always supports at least SSE2
# define NV_USE_SSE 2
# elif NV_CC_MSVC && defined(_M_IX86_FP)
// Also on x86 with the /arch:SSE flag in MSVC.
# define NV_USE_SSE _M_IX86_FP // 1=SSE, 2=SSE2
# elif defined(__SSE__)
# define NV_USE_SSE 1
# elif defined(__SSE2__)
# define NV_USE_SSE 2
# else
// Otherwise we assume no SSE.
# define NV_USE_SSE 0
# endif
#endif
// Internally set NV_USE_SIMD when either altivec or sse is available.
#if NV_USE_ALTIVEC && NV_USE_SSE
# error "Cannot enable both altivec and sse!"
#endif
#ifndef PI
#define PI float(3.1415926535897932384626433833)
#endif
@ -179,26 +217,6 @@ namespace nv
inline float cube(float f) { return f * f * f; }
inline int cube(int i) { return i * i * i; }
// @@ Float to int conversions to be optimized at some point. See:
// http://cbloomrants.blogspot.com/2009/01/01-17-09-float-to-int.html
// http://www.stereopsis.com/sree/fpu2006.html
// http://assemblyrequired.crashworks.org/2009/01/12/why-you-should-never-cast-floats-to-ints/
// http://chrishecker.com/Miscellaneous_Technical_Articles#Floating_Point
inline int iround(float f)
{
return int(floorf(f + 0.5f));
}
inline int ifloor(float f)
{
return int(floorf(f));
}
inline int iceil(float f)
{
return int(ceilf(f));
}
inline float frac(float f)
{
return f - floor(f);
@ -242,21 +260,6 @@ namespace nv
return 0;
}
// I'm always confused about which quantizer to use. I think we should choose a quantizer based on how the values are expanded later and this is generally using the 'exact endpoints' rule.
// Quantize a float in the [0,1] range, using exact end points or uniform bins.
inline float quantizeFloat(float x, uint bits, bool exactEndPoints = true) {
nvDebugCheck(bits <= 16);
float range = float(1 << bits);
if (exactEndPoints) {
return floorf(x * (range-1) + 0.5f) / (range-1);
}
else {
return (floorf(x * range) + 0.5f) / range;
}
}
union Float754 {
unsigned int raw;
float value;