Merge changes from The Witness.
This commit is contained in:
parent
06c170b41b
commit
fa468b04ab
@ -29,3 +29,38 @@ float nv::distanceSquared(const Box &box, const Vector3 &point) {
|
||||
/*bool nv::overlap(const Box &box, const Sphere &sphere) {
|
||||
return distanceSquared(box, sphere.center) < sphere.radius * sphere.radius;
|
||||
}*/
|
||||
|
||||
|
||||
bool nv::intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t /*= NULL*/) {
|
||||
// Precompute these in ray structure?
|
||||
int sdx = (id.x < 0);
|
||||
int sdy = (id.y < 0);
|
||||
int sdz = (id.z < 0);
|
||||
|
||||
float tmin = (box.corner( sdx).x - p.x) * id.x;
|
||||
float tmax = (box.corner(1-sdx).x - p.x) * id.x;
|
||||
float tymin = (box.corner( sdy).y - p.y) * id.y;
|
||||
float tymax = (box.corner(1-sdy).y - p.y) * id.y;
|
||||
|
||||
if ((tmin > tymax) || (tymin > tmax))
|
||||
return false;
|
||||
|
||||
if (tymin > tmin) tmin = tymin;
|
||||
if (tymax < tmax) tmax = tymax;
|
||||
|
||||
float tzmin = (box.corner( sdz).z - p.z) * id.z;
|
||||
float tzmax = (box.corner(1-sdz).z - p.z) * id.z;
|
||||
|
||||
if ((tmin > tzmax) || (tzmin > tmax))
|
||||
return false;
|
||||
|
||||
if (tzmin > tmin) tmin = tzmin;
|
||||
if (tzmax < tmax) tmax = tzmax;
|
||||
|
||||
if (tmax < 0)
|
||||
return false;
|
||||
|
||||
if (t != NULL) *t = tmin;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -74,6 +74,8 @@ namespace nv
|
||||
|
||||
friend Stream & operator<< (Stream & s, Box & box);
|
||||
|
||||
const Vector3 & corner(int i) const { return (&minCorner)[i]; }
|
||||
|
||||
Vector3 minCorner;
|
||||
Vector3 maxCorner;
|
||||
};
|
||||
@ -81,6 +83,8 @@ namespace nv
|
||||
float distanceSquared(const Box &box, const Vector3 &point);
|
||||
bool overlap(const Box &box, const Sphere &sphere);
|
||||
|
||||
// p is ray origin, id is inverse ray direction.
|
||||
bool intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t);
|
||||
|
||||
} // nv namespace
|
||||
|
||||
|
@ -12,51 +12,51 @@
|
||||
namespace nv
|
||||
{
|
||||
// Default ctor.
|
||||
Box::Box() { };
|
||||
inline Box::Box() { };
|
||||
|
||||
// Copy ctor.
|
||||
Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { }
|
||||
inline Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { }
|
||||
|
||||
// Init ctor.
|
||||
Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { }
|
||||
inline Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { }
|
||||
|
||||
// Assignment operator.
|
||||
Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; }
|
||||
inline Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; }
|
||||
|
||||
// Clear the bounds.
|
||||
void Box::clearBounds()
|
||||
inline void Box::clearBounds()
|
||||
{
|
||||
minCorner.set(FLT_MAX, FLT_MAX, FLT_MAX);
|
||||
maxCorner.set(-FLT_MAX, -FLT_MAX, -FLT_MAX);
|
||||
}
|
||||
|
||||
// Build a cube centered on center and with edge = 2*dist
|
||||
void Box::cube(const Vector3 & center, float dist)
|
||||
inline void Box::cube(const Vector3 & center, float dist)
|
||||
{
|
||||
setCenterExtents(center, Vector3(dist, dist, dist));
|
||||
}
|
||||
|
||||
// Build a box, given center and extents.
|
||||
void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents)
|
||||
inline void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents)
|
||||
{
|
||||
minCorner = center - extents;
|
||||
maxCorner = center + extents;
|
||||
}
|
||||
|
||||
// Get box center.
|
||||
Vector3 Box::center() const
|
||||
inline Vector3 Box::center() const
|
||||
{
|
||||
return (minCorner + maxCorner) * 0.5f;
|
||||
}
|
||||
|
||||
// Return extents of the box.
|
||||
Vector3 Box::extents() const
|
||||
inline Vector3 Box::extents() const
|
||||
{
|
||||
return (maxCorner - minCorner) * 0.5f;
|
||||
}
|
||||
|
||||
// Return extents of the box.
|
||||
float Box::extents(uint axis) const
|
||||
inline float Box::extents(uint axis) const
|
||||
{
|
||||
nvDebugCheck(axis < 3);
|
||||
if (axis == 0) return (maxCorner.x - minCorner.x) * 0.5f;
|
||||
@ -67,55 +67,55 @@ namespace nv
|
||||
}
|
||||
|
||||
// Add a point to this box.
|
||||
void Box::addPointToBounds(const Vector3 & p)
|
||||
inline void Box::addPointToBounds(const Vector3 & p)
|
||||
{
|
||||
minCorner = min(minCorner, p);
|
||||
maxCorner = max(maxCorner, p);
|
||||
}
|
||||
|
||||
// Add a box to this box.
|
||||
void Box::addBoxToBounds(const Box & b)
|
||||
inline void Box::addBoxToBounds(const Box & b)
|
||||
{
|
||||
minCorner = min(minCorner, b.minCorner);
|
||||
maxCorner = max(maxCorner, b.maxCorner);
|
||||
}
|
||||
|
||||
// Translate box.
|
||||
void Box::translate(const Vector3 & v)
|
||||
inline void Box::translate(const Vector3 & v)
|
||||
{
|
||||
minCorner += v;
|
||||
maxCorner += v;
|
||||
}
|
||||
|
||||
// Scale the box.
|
||||
void Box::scale(float s)
|
||||
inline void Box::scale(float s)
|
||||
{
|
||||
minCorner *= s;
|
||||
maxCorner *= s;
|
||||
}
|
||||
|
||||
// Expand the box by a fixed amount.
|
||||
void Box::expand(float r) {
|
||||
inline void Box::expand(float r) {
|
||||
minCorner -= Vector3(r,r,r);
|
||||
maxCorner += Vector3(r,r,r);
|
||||
}
|
||||
|
||||
// Get the area of the box.
|
||||
float Box::area() const
|
||||
inline float Box::area() const
|
||||
{
|
||||
const Vector3 d = extents();
|
||||
return 8.0f * (d.x*d.y + d.x*d.z + d.y*d.z);
|
||||
}
|
||||
|
||||
// Get the volume of the box.
|
||||
float Box::volume() const
|
||||
inline float Box::volume() const
|
||||
{
|
||||
Vector3 d = extents();
|
||||
return 8.0f * (d.x * d.y * d.z);
|
||||
}
|
||||
|
||||
// Return true if the box contains the given point.
|
||||
bool Box::contains(const Vector3 & p) const
|
||||
inline bool Box::contains(const Vector3 & p) const
|
||||
{
|
||||
return
|
||||
minCorner.x < p.x && minCorner.y < p.y && minCorner.z < p.z &&
|
||||
@ -123,7 +123,7 @@ namespace nv
|
||||
}
|
||||
|
||||
// Split the given box in 8 octants and assign the ith one to this box.
|
||||
void Box::setOctant(const Box & box, const Vector3 & center, int i)
|
||||
inline void Box::setOctant(const Box & box, const Vector3 & center, int i)
|
||||
{
|
||||
minCorner = box.minCorner;
|
||||
maxCorner = box.maxCorner;
|
||||
|
@ -159,23 +159,227 @@ Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points)
|
||||
float matrix[6];
|
||||
Vector3 centroid = computeCovariance(n, points, matrix);
|
||||
|
||||
if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0)
|
||||
if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
|
||||
{
|
||||
// If no plane defined, then return a horizontal plane.
|
||||
return Plane(Vector3(0, 0, 1), centroid);
|
||||
}
|
||||
|
||||
#pragma NV_MESSAGE("TODO: need to write an eigensolver!")
|
||||
float eigenValues[3];
|
||||
Vector3 eigenVectors[3];
|
||||
if (!eigenSolveSymmetric(matrix, eigenValues, eigenVectors)) {
|
||||
// If no plane defined, then return a horizontal plane.
|
||||
return Plane(Vector3(0, 0, 1), centroid);
|
||||
}
|
||||
|
||||
// - Numerical Recipes in C is a good reference. Householder transforms followed by QL decomposition seems to be the best approach.
|
||||
// - The one from magic-tools is now LGPL. For the 3D case it uses a cubic root solver, which is not very accurate.
|
||||
// - Charles' Galaxy3 contains an implementation of the tridiagonalization method, but is under BPL.
|
||||
|
||||
//EigenSolver3 solver(matrix);
|
||||
|
||||
return Plane();
|
||||
return Plane(eigenVectors[2], centroid);
|
||||
}
|
||||
|
||||
bool nv::Fit::isPlanar(int n, const Vector3 * points, float epsilon/*=NV_EPSILON*/)
|
||||
{
|
||||
// compute the centroid and covariance
|
||||
float matrix[6];
|
||||
Vector3 centroid = computeCovariance(n, points, matrix);
|
||||
|
||||
float eigenValues[3];
|
||||
Vector3 eigenVectors[3];
|
||||
if (!eigenSolveSymmetric(matrix, eigenValues, eigenVectors)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return eigenValues[2] < epsilon;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Tridiagonal solver from Charles Bloom.
|
||||
// Householder transforms followed by QL decomposition.
|
||||
// Seems to be based on the code from Numerical Recipes in C.
|
||||
|
||||
static void EigenSolver_Tridiagonal(double mat[3][3],double * diag,double * subd);
|
||||
static bool EigenSolver_QLAlgorithm(double mat[3][3],double * diag,double * subd);
|
||||
|
||||
bool nv::Fit::eigenSolveSymmetric(float matrix[6], float eigenValues[3], Vector3 eigenVectors[3])
|
||||
{
|
||||
nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL);
|
||||
|
||||
double subd[3];
|
||||
double diag[3];
|
||||
double work[3][3];
|
||||
|
||||
work[0][0] = matrix[0];
|
||||
work[0][1] = work[1][0] = matrix[1];
|
||||
work[0][2] = work[2][0] = matrix[2];
|
||||
work[1][1] = matrix[3];
|
||||
work[1][2] = work[2][1] = matrix[4];
|
||||
work[2][2] = matrix[5];
|
||||
|
||||
EigenSolver_Tridiagonal(work, diag, subd);
|
||||
if (!EigenSolver_QLAlgorithm(work, diag, subd))
|
||||
{
|
||||
for (int i = 0; i < 3; i++) {
|
||||
eigenValues[i] = 0;
|
||||
eigenVectors[i] = Vector3(0);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
eigenValues[i] = (float)diag[i];
|
||||
}
|
||||
|
||||
// eigenvectors are the columns; make them the rows :
|
||||
|
||||
for (int i=0; i < 3; i++)
|
||||
{
|
||||
for (int j = 0; j < 3; j++)
|
||||
{
|
||||
eigenVectors[j].component[i] = (float) work[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
// shuffle to sort by singular value :
|
||||
if (eigenValues[2] > eigenValues[0] && eigenValues[2] > eigenValues[1])
|
||||
{
|
||||
swap(eigenValues[0], eigenValues[2]);
|
||||
swap(eigenVectors[0], eigenVectors[2]);
|
||||
}
|
||||
if (eigenValues[1] > eigenValues[0])
|
||||
{
|
||||
swap(eigenValues[0], eigenValues[1]);
|
||||
swap(eigenVectors[0], eigenVectors[1]);
|
||||
}
|
||||
if (eigenValues[2] > eigenValues[1])
|
||||
{
|
||||
swap(eigenValues[1], eigenValues[2]);
|
||||
swap(eigenVectors[1], eigenVectors[2]);
|
||||
}
|
||||
|
||||
nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2]);
|
||||
nvDebugCheck(eigenValues[1] >= eigenValues[2]);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void EigenSolver_Tridiagonal(double mat[3][3],double * diag,double * subd)
|
||||
{
|
||||
// Householder reduction T = Q^t M Q
|
||||
// Input:
|
||||
// mat, symmetric 3x3 matrix M
|
||||
// Output:
|
||||
// mat, orthogonal matrix Q
|
||||
// diag, diagonal entries of T
|
||||
// subd, subdiagonal entries of T (T is symmetric)
|
||||
const double epsilon = 1e-08f;
|
||||
|
||||
double a = mat[0][0];
|
||||
double b = mat[0][1];
|
||||
double c = mat[0][2];
|
||||
double d = mat[1][1];
|
||||
double e = mat[1][2];
|
||||
double f = mat[2][2];
|
||||
|
||||
diag[0] = a;
|
||||
subd[2] = 0.f;
|
||||
if ( fabs(c) >= epsilon )
|
||||
{
|
||||
const double ell = sqrt(b*b+c*c);
|
||||
b /= ell;
|
||||
c /= ell;
|
||||
const double q = 2*b*e+c*(f-d);
|
||||
diag[1] = d+c*q;
|
||||
diag[2] = f-c*q;
|
||||
subd[0] = ell;
|
||||
subd[1] = e-b*q;
|
||||
mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
|
||||
mat[1][0] = 0; mat[1][1] = b; mat[1][2] = c;
|
||||
mat[2][0] = 0; mat[2][1] = c; mat[2][2] = -b;
|
||||
}
|
||||
else
|
||||
{
|
||||
diag[1] = d;
|
||||
diag[2] = f;
|
||||
subd[0] = b;
|
||||
subd[1] = e;
|
||||
mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
|
||||
mat[1][0] = 0; mat[1][1] = 1; mat[1][2] = 0;
|
||||
mat[2][0] = 0; mat[2][1] = 0; mat[2][2] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
static bool EigenSolver_QLAlgorithm(double mat[3][3],double * diag,double * subd)
|
||||
{
|
||||
// QL iteration with implicit shifting to reduce matrix from tridiagonal
|
||||
// to diagonal
|
||||
const int maxiter = 32;
|
||||
|
||||
for (int ell = 0; ell < 3; ell++)
|
||||
{
|
||||
int iter;
|
||||
for (iter = 0; iter < maxiter; iter++)
|
||||
{
|
||||
int m;
|
||||
for (m = ell; m <= 1; m++)
|
||||
{
|
||||
double dd = fabs(diag[m]) + fabs(diag[m+1]);
|
||||
if ( fabs(subd[m]) + dd == dd )
|
||||
break;
|
||||
}
|
||||
if ( m == ell )
|
||||
break;
|
||||
|
||||
double g = (diag[ell+1]-diag[ell])/(2*subd[ell]);
|
||||
double r = sqrt(g*g+1);
|
||||
if ( g < 0 )
|
||||
g = diag[m]-diag[ell]+subd[ell]/(g-r);
|
||||
else
|
||||
g = diag[m]-diag[ell]+subd[ell]/(g+r);
|
||||
double s = 1, c = 1, p = 0;
|
||||
for (int i = m-1; i >= ell; i--)
|
||||
{
|
||||
double f = s*subd[i], b = c*subd[i];
|
||||
if ( fabs(f) >= fabs(g) )
|
||||
{
|
||||
c = g/f;
|
||||
r = sqrt(c*c+1);
|
||||
subd[i+1] = f*r;
|
||||
c *= (s = 1/r);
|
||||
}
|
||||
else
|
||||
{
|
||||
s = f/g;
|
||||
r = sqrt(s*s+1);
|
||||
subd[i+1] = g*r;
|
||||
s *= (c = 1/r);
|
||||
}
|
||||
g = diag[i+1]-p;
|
||||
r = (diag[i]-g)*s+2*b*c;
|
||||
p = s*r;
|
||||
diag[i+1] = g+p;
|
||||
g = c*r-b;
|
||||
|
||||
for (int k = 0; k < 3; k++)
|
||||
{
|
||||
f = mat[k][i+1];
|
||||
mat[k][i+1] = s*mat[k][i]+c*f;
|
||||
mat[k][i] = c*mat[k][i]-s*f;
|
||||
}
|
||||
}
|
||||
diag[ell] -= p;
|
||||
subd[ell] = g;
|
||||
subd[m] = 0;
|
||||
}
|
||||
|
||||
if ( iter == maxiter )
|
||||
// should not get here under normal circumstances
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster)
|
||||
{
|
||||
|
@ -23,6 +23,10 @@ namespace nv
|
||||
Vector3 computePrincipalComponent(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
|
||||
|
||||
Plane bestPlane(int n, const Vector3 * points);
|
||||
bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON);
|
||||
|
||||
bool eigenSolveSymmetric (float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]);
|
||||
|
||||
|
||||
// Returns number of clusters [1-4].
|
||||
int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster);
|
||||
|
@ -488,16 +488,20 @@ nv::half_to_float( uint16 h )
|
||||
}
|
||||
|
||||
|
||||
// @@ This code appears to be wrong.
|
||||
// @@ These tables could be smaller.
|
||||
static uint32 mantissa_table[2048];
|
||||
static uint32 exponent_table[64];
|
||||
static uint32 offset_table[64];
|
||||
namespace nv {
|
||||
uint32 mantissa_table[2048];
|
||||
uint32 exponent_table[64];
|
||||
uint32 offset_table[64];
|
||||
}
|
||||
|
||||
void nv::half_init_tables()
|
||||
{
|
||||
// Init mantissa table.
|
||||
mantissa_table[0] = 0;
|
||||
|
||||
// denormals
|
||||
for (int i = 1; i < 1024; i++) {
|
||||
uint m = i << 13;
|
||||
uint e = 0;
|
||||
@ -511,8 +515,9 @@ void nv::half_init_tables()
|
||||
mantissa_table[i] = m | e;
|
||||
}
|
||||
|
||||
// normals
|
||||
for (int i = 1024; i < 2048; i++) {
|
||||
mantissa_table[i] = 0x38000000 + ((i - 1024) << 13);
|
||||
mantissa_table[i] = (i - 1024) << 13;
|
||||
}
|
||||
|
||||
|
||||
@ -520,17 +525,17 @@ void nv::half_init_tables()
|
||||
exponent_table[0] = 0;
|
||||
|
||||
for (int i = 1; i < 31; i++) {
|
||||
exponent_table[i] = (i << 23);
|
||||
exponent_table[i] = 0x38000000 + (i << 23);
|
||||
}
|
||||
|
||||
exponent_table[31] = 0x47800000;
|
||||
exponent_table[31] = 0x7f800000;
|
||||
exponent_table[32] = 0x80000000;
|
||||
|
||||
for (int i = 33; i < 63; i++) {
|
||||
exponent_table[i] = 0x80000000 + ((i - 32) << 23);
|
||||
exponent_table[i] = 0xb8000000 + ((i - 32) << 23);
|
||||
}
|
||||
|
||||
exponent_table[63] = 0xC7800000;
|
||||
exponent_table[63] = 0xff800000;
|
||||
|
||||
|
||||
// Init offset table.
|
||||
@ -545,22 +550,11 @@ void nv::half_init_tables()
|
||||
for (int i = 33; i < 64; i++) {
|
||||
offset_table[i] = 1024;
|
||||
}
|
||||
|
||||
/*for (int i = 0; i < 64; i++) {
|
||||
offset_table[i] = ((i & 31) != 0) * 1024;
|
||||
}*/
|
||||
}
|
||||
|
||||
// Fast half to float conversion based on:
|
||||
// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
|
||||
uint32 nv::fast_half_to_float(uint16 h)
|
||||
{
|
||||
uint exp = h >> 10;
|
||||
return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
|
||||
}
|
||||
|
||||
|
||||
#if 0
|
||||
|
||||
// Inaccurate conversion suggested at the ffmpeg mailing list:
|
||||
// http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html
|
||||
uint32 nv::fast_half_to_float(uint16 v)
|
||||
@ -609,4 +603,94 @@ __asm
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
// These version computes the tables at compile time:
|
||||
// http://gamedev.stackexchange.com/questions/17326/conversion-of-a-number-from-single-precision-floating-point-representation-to-a
|
||||
|
||||
/* This method is faster than the OpenEXR implementation (very often
|
||||
* used, eg. in Ogre), with the additional benefit of rounding, inspired
|
||||
* by James Tursa’s half-precision code. */
|
||||
static inline uint16_t float_to_half_branch(uint32_t x)
|
||||
{
|
||||
uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
|
||||
uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
|
||||
unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
|
||||
|
||||
/* If zero, or denormal, or exponent underflows too much for a denormal
|
||||
* half, return signed zero. */
|
||||
if (e < 103)
|
||||
return bits;
|
||||
|
||||
/* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
|
||||
if (e > 142)
|
||||
{
|
||||
bits |= 0x7c00u;
|
||||
/* If exponent was 0xff and one mantissa bit was set, it means NaN,
|
||||
* not Inf, so make sure we set one mantissa bit too. */
|
||||
bits |= e == 255 && (x & 0x007fffffu);
|
||||
return bits;
|
||||
}
|
||||
|
||||
/* If exponent underflows but not too much, return a denormal */
|
||||
if (e < 113)
|
||||
{
|
||||
m |= 0x0800u;
|
||||
/* Extra rounding may overflow and set mantissa to 0 and exponent
|
||||
* to 1, which is OK. */
|
||||
bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
|
||||
return bits;
|
||||
}
|
||||
|
||||
bits |= ((e - 112) << 10) | (m >> 1);
|
||||
/* Extra rounding. An overflow will set mantissa to 0 and increment
|
||||
* the exponent, which is OK. */
|
||||
bits += m & 1;
|
||||
return bits;
|
||||
}
|
||||
|
||||
/* These macros implement a finite iterator useful to build lookup
|
||||
* tables. For instance, S64(0) will call S1(x) for all values of x
|
||||
* between 0 and 63.
|
||||
* Due to the exponential behaviour of the calls, the stress on the
|
||||
* compiler may be important. */
|
||||
#define S4(x) S1((x)), S1((x)+1), S1((x)+2), S1((x)+3)
|
||||
#define S16(x) S4((x)), S4((x)+4), S4((x)+8), S4((x)+12)
|
||||
#define S64(x) S16((x)), S16((x)+16), S16((x)+32), S16((x)+48)
|
||||
#define S256(x) S64((x)), S64((x)+64), S64((x)+128), S64((x)+192)
|
||||
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
|
||||
|
||||
/* Lookup table-based algorithm from “Fast Half Float Conversions”
|
||||
* by Jeroen van der Zijp, November 2008. No rounding is performed,
|
||||
* and some NaN values may be incorrectly converted to Inf. */
|
||||
static inline uint16_t float_to_half_nobranch(uint32_t x)
|
||||
{
|
||||
static uint16_t const basetable[512] =
|
||||
{
|
||||
#define S1(i) (((i) < 103) ? 0x0000 : \
|
||||
((i) < 113) ? 0x0400 >> (113 - (i)) : \
|
||||
((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
|
||||
S256(0),
|
||||
#undef S1
|
||||
#define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
|
||||
((i) < 113) ? 0x0400 >> (113 - (i)) : \
|
||||
((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
|
||||
S256(0),
|
||||
#undef S1
|
||||
};
|
||||
|
||||
static uint8_t const shifttable[512] =
|
||||
{
|
||||
#define S1(i) (((i) < 103) ? 24 : \
|
||||
((i) < 113) ? 126 - (i) : \
|
||||
((i) < 143 || (i) == 255) ? 13 : 24)
|
||||
S256(0), S256(0),
|
||||
#undef S1
|
||||
};
|
||||
|
||||
uint16_t bits = basetable[(x >> 23) & 0x1ff];
|
||||
bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
|
||||
return bits;
|
||||
}
|
||||
#endif
|
@ -11,7 +11,18 @@ namespace nv {
|
||||
|
||||
void half_init_tables();
|
||||
|
||||
uint32 fast_half_to_float(uint16 h);
|
||||
extern uint32 mantissa_table[2048];
|
||||
extern uint32 exponent_table[64];
|
||||
extern uint32 offset_table[64];
|
||||
|
||||
// Fast half to float conversion based on:
|
||||
// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
|
||||
inline uint32 fast_half_to_float(uint16 h)
|
||||
{
|
||||
uint exp = h >> 10;
|
||||
return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
|
||||
}
|
||||
|
||||
|
||||
inline uint16 to_half(float c) {
|
||||
union { float f; uint32 u; } f;
|
||||
|
@ -33,8 +33,15 @@
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
// See this for ideas:
|
||||
// http://molecularmusings.wordpress.com/2011/10/18/simdifying-multi-platform-math/
|
||||
|
||||
|
||||
namespace nv {
|
||||
|
||||
#define NV_SIMD_NATIVE NV_FORCEINLINE
|
||||
#define NV_SIMD_INLINE inline
|
||||
|
||||
class SimdVector
|
||||
{
|
||||
public:
|
||||
@ -42,45 +49,47 @@ namespace nv {
|
||||
|
||||
typedef SimdVector const& Arg;
|
||||
|
||||
NV_FORCEINLINE SimdVector() {}
|
||||
NV_FORCEINLINE explicit SimdVector(float f) : vec(_mm_set1_ps(f)) {}
|
||||
NV_FORCEINLINE explicit SimdVector(__m128 v) : vec(v) {}
|
||||
|
||||
NV_FORCEINLINE explicit SimdVector(NV_ALIGN_16 Vector4 v) {
|
||||
vec = _mm_load_ps( v.component );
|
||||
NV_SIMD_NATIVE SimdVector() {}
|
||||
|
||||
NV_SIMD_NATIVE explicit SimdVector(__m128 v) : vec(v) {}
|
||||
|
||||
NV_SIMD_NATIVE explicit SimdVector(float f) {
|
||||
vec = _mm_set1_ps(f);
|
||||
}
|
||||
|
||||
NV_FORCEINLINE explicit SimdVector(const float * v) {
|
||||
NV_SIMD_NATIVE explicit SimdVector(const float * v)
|
||||
{
|
||||
vec = _mm_load_ps( v );
|
||||
}
|
||||
|
||||
NV_FORCEINLINE SimdVector(float x, float y, float z, float w) {
|
||||
NV_SIMD_NATIVE SimdVector(float x, float y, float z, float w)
|
||||
{
|
||||
vec = _mm_setr_ps( x, y, z, w );
|
||||
}
|
||||
|
||||
NV_FORCEINLINE SimdVector(const SimdVector & arg) : vec(arg.vec) {}
|
||||
NV_SIMD_NATIVE SimdVector(const SimdVector & arg) : vec(arg.vec) {}
|
||||
|
||||
NV_FORCEINLINE SimdVector & operator=(const SimdVector & arg) {
|
||||
NV_SIMD_NATIVE SimdVector & operator=(const SimdVector & arg)
|
||||
{
|
||||
vec = arg.vec;
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
float toFloat() const
|
||||
NV_SIMD_INLINE float toFloat() const
|
||||
{
|
||||
NV_ALIGN_16 float f;
|
||||
_mm_store_ss(&f, vec);
|
||||
return f;
|
||||
}
|
||||
|
||||
Vector3 toVector3() const
|
||||
NV_SIMD_INLINE Vector3 toVector3() const
|
||||
{
|
||||
NV_ALIGN_16 float c[4];
|
||||
_mm_store_ps( c, vec );
|
||||
return Vector3( c[0], c[1], c[2] );
|
||||
}
|
||||
|
||||
Vector4 toVector4() const
|
||||
NV_SIMD_INLINE Vector4 toVector4() const
|
||||
{
|
||||
NV_ALIGN_16 float c[4];
|
||||
_mm_store_ps( c, vec );
|
||||
@ -88,57 +97,60 @@ namespace nv {
|
||||
}
|
||||
|
||||
#define SSE_SPLAT( a ) ((a) | ((a) << 2) | ((a) << 4) | ((a) << 6))
|
||||
NV_FORCEINLINE SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); }
|
||||
NV_FORCEINLINE SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); }
|
||||
NV_FORCEINLINE SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); }
|
||||
NV_FORCEINLINE SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); }
|
||||
NV_SIMD_NATIVE SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); }
|
||||
NV_SIMD_NATIVE SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); }
|
||||
NV_SIMD_NATIVE SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); }
|
||||
NV_SIMD_NATIVE SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); }
|
||||
#undef SSE_SPLAT
|
||||
|
||||
NV_FORCEINLINE SimdVector & operator+=( Arg v ) {
|
||||
NV_SIMD_NATIVE SimdVector& operator+=( Arg v )
|
||||
{
|
||||
vec = _mm_add_ps( vec, v.vec );
|
||||
return *this;
|
||||
}
|
||||
|
||||
NV_FORCEINLINE SimdVector & operator-=( Arg v ) {
|
||||
NV_SIMD_NATIVE SimdVector& operator-=( Arg v )
|
||||
{
|
||||
vec = _mm_sub_ps( vec, v.vec );
|
||||
return *this;
|
||||
}
|
||||
|
||||
NV_FORCEINLINE SimdVector & operator*=( Arg v ) {
|
||||
NV_SIMD_NATIVE SimdVector& operator*=( Arg v )
|
||||
{
|
||||
vec = _mm_mul_ps( vec, v.vec );
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
NV_FORCEINLINE SimdVector operator+(SimdVector::Arg left, SimdVector::Arg right)
|
||||
NV_SIMD_NATIVE SimdVector operator+( SimdVector::Arg left, SimdVector::Arg right )
|
||||
{
|
||||
return SimdVector( _mm_add_ps( left.vec, right.vec ) );
|
||||
}
|
||||
|
||||
NV_FORCEINLINE SimdVector operator-(SimdVector::Arg left, SimdVector::Arg right)
|
||||
NV_SIMD_NATIVE SimdVector operator-( SimdVector::Arg left, SimdVector::Arg right )
|
||||
{
|
||||
return SimdVector( _mm_sub_ps( left.vec, right.vec ) );
|
||||
}
|
||||
|
||||
NV_FORCEINLINE SimdVector operator*(SimdVector::Arg left, SimdVector::Arg right)
|
||||
NV_SIMD_NATIVE SimdVector operator*( SimdVector::Arg left, SimdVector::Arg right )
|
||||
{
|
||||
return SimdVector( _mm_mul_ps( left.vec, right.vec ) );
|
||||
}
|
||||
|
||||
// Returns a*b + c
|
||||
NV_FORCEINLINE SimdVector multiplyAdd(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c)
|
||||
NV_SIMD_INLINE SimdVector multiplyAdd( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
|
||||
{
|
||||
return SimdVector( _mm_add_ps( _mm_mul_ps( a.vec, b.vec ), c.vec ) );
|
||||
}
|
||||
|
||||
// Returns -( a*b - c ) = c - a*b
|
||||
NV_FORCEINLINE SimdVector negativeMultiplySubtract(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c)
|
||||
// Returns -( a*b - c )
|
||||
NV_SIMD_INLINE SimdVector negativeMultiplySubtract( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
|
||||
{
|
||||
return SimdVector( _mm_sub_ps( c.vec, _mm_mul_ps( a.vec, b.vec ) ) );
|
||||
}
|
||||
|
||||
inline SimdVector reciprocal( SimdVector::Arg v )
|
||||
NV_SIMD_INLINE SimdVector reciprocal( SimdVector::Arg v )
|
||||
{
|
||||
// get the reciprocal estimate
|
||||
__m128 estimate = _mm_rcp_ps( v.vec );
|
||||
@ -148,17 +160,17 @@ namespace nv {
|
||||
return SimdVector( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) );
|
||||
}
|
||||
|
||||
NV_FORCEINLINE SimdVector min(SimdVector::Arg left, SimdVector::Arg right)
|
||||
NV_SIMD_NATIVE SimdVector min( SimdVector::Arg left, SimdVector::Arg right )
|
||||
{
|
||||
return SimdVector( _mm_min_ps( left.vec, right.vec ) );
|
||||
}
|
||||
|
||||
NV_FORCEINLINE SimdVector max(SimdVector::Arg left, SimdVector::Arg right)
|
||||
NV_SIMD_NATIVE SimdVector max( SimdVector::Arg left, SimdVector::Arg right )
|
||||
{
|
||||
return SimdVector( _mm_max_ps( left.vec, right.vec ) );
|
||||
}
|
||||
|
||||
inline SimdVector truncate( SimdVector::Arg v )
|
||||
NV_SIMD_INLINE SimdVector truncate( SimdVector::Arg v )
|
||||
{
|
||||
#if (NV_USE_SSE == 1)
|
||||
// convert to ints
|
||||
@ -179,12 +191,12 @@ namespace nv {
|
||||
#endif
|
||||
}
|
||||
|
||||
NV_FORCEINLINE SimdVector compareEqual(SimdVector::Arg left, SimdVector::Arg right)
|
||||
NV_SIMD_NATIVE SimdVector compareEqual( SimdVector::Arg left, SimdVector::Arg right )
|
||||
{
|
||||
return SimdVector( _mm_cmpeq_ps( left.vec, right.vec ) );
|
||||
}
|
||||
|
||||
inline SimdVector select(SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits)
|
||||
NV_SIMD_INLINE SimdVector select( SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits )
|
||||
{
|
||||
__m128 a = _mm_andnot_ps( bits.vec, off.vec );
|
||||
__m128 b = _mm_and_ps( bits.vec, on.vec );
|
||||
@ -192,7 +204,7 @@ namespace nv {
|
||||
return SimdVector( _mm_or_ps( a, b ) );
|
||||
}
|
||||
|
||||
inline bool compareAnyLessThan(SimdVector::Arg left, SimdVector::Arg right)
|
||||
NV_SIMD_INLINE bool compareAnyLessThan( SimdVector::Arg left, SimdVector::Arg right )
|
||||
{
|
||||
__m128 bits = _mm_cmplt_ps( left.vec, right.vec );
|
||||
int value = _mm_movemask_ps( bits );
|
||||
|
@ -314,6 +314,12 @@ namespace nv
|
||||
return scale(v, 1.0f/s);
|
||||
}
|
||||
|
||||
inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t)
|
||||
{
|
||||
const float s = 1.0f - t;
|
||||
return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y);
|
||||
}
|
||||
|
||||
inline float dot(Vector2::Arg a, Vector2::Arg b)
|
||||
{
|
||||
return a.x * b.x + a.y * b.y;
|
||||
@ -381,6 +387,16 @@ namespace nv
|
||||
return Vector2(max(a.x, b.x), max(a.y, b.y));
|
||||
}
|
||||
|
||||
inline Vector2 clamp(Vector2::Arg v, float min, float max)
|
||||
{
|
||||
return Vector2(clamp(v.x, min, max), clamp(v.y, min, max));
|
||||
}
|
||||
|
||||
inline Vector2 saturate(Vector2::Arg v)
|
||||
{
|
||||
return Vector2(saturate(v.x), saturate(v.y));
|
||||
}
|
||||
|
||||
inline bool isFinite(Vector2::Arg v)
|
||||
{
|
||||
return isFinite(v.x) && isFinite(v.y);
|
||||
@ -394,6 +410,7 @@ namespace nv
|
||||
return vf;
|
||||
}
|
||||
|
||||
// Note, this is the area scaled by 2!
|
||||
inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
|
||||
{
|
||||
Vector2 v0 = a - c;
|
||||
@ -500,6 +517,16 @@ namespace nv
|
||||
return sqrtf(lengthSquared(v));
|
||||
}
|
||||
|
||||
inline float distance(Vector3::Arg a, Vector3::Arg b)
|
||||
{
|
||||
return length(a - b);
|
||||
}
|
||||
|
||||
inline float distanceSquared(Vector3::Arg a, Vector3::Arg b)
|
||||
{
|
||||
return lengthSquared(a - b);
|
||||
}
|
||||
|
||||
inline float inverseLength(Vector3::Arg v)
|
||||
{
|
||||
return 1.0f / sqrtf(lengthSquared(v));
|
||||
@ -557,6 +584,11 @@ namespace nv
|
||||
return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max));
|
||||
}
|
||||
|
||||
inline Vector3 saturate(Vector3::Arg v)
|
||||
{
|
||||
return Vector3(saturate(v.x), saturate(v.y), saturate(v.z));
|
||||
}
|
||||
|
||||
inline Vector3 floor(Vector3::Arg v)
|
||||
{
|
||||
return Vector3(floorf(v.x), floorf(v.y), floorf(v.z));
|
||||
@ -580,6 +612,11 @@ namespace nv
|
||||
return vf;
|
||||
}
|
||||
|
||||
inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n)
|
||||
{
|
||||
return v - (2 * dot(v, n)) * n;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Vector4
|
||||
@ -627,9 +664,15 @@ namespace nv
|
||||
return scale(v, 1.0f/s);
|
||||
}
|
||||
|
||||
inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
|
||||
/*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
|
||||
{
|
||||
return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s);
|
||||
}*/
|
||||
|
||||
inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t)
|
||||
{
|
||||
const float s = 1.0f - t;
|
||||
return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w);
|
||||
}
|
||||
|
||||
inline float dot(Vector4::Arg a, Vector4::Arg b)
|
||||
@ -699,6 +742,16 @@ namespace nv
|
||||
return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
|
||||
}
|
||||
|
||||
inline Vector4 clamp(Vector4::Arg v, float min, float max)
|
||||
{
|
||||
return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max));
|
||||
}
|
||||
|
||||
inline Vector4 saturate(Vector4::Arg v)
|
||||
{
|
||||
return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w));
|
||||
}
|
||||
|
||||
inline bool isFinite(Vector4::Arg v)
|
||||
{
|
||||
return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w);
|
||||
|
@ -155,15 +155,14 @@ namespace nv
|
||||
}
|
||||
|
||||
#if NV_CC_MSVC
|
||||
inline float log2f(float x)
|
||||
NV_FORCEINLINE float log2f(float x)
|
||||
{
|
||||
nvCheck(x >= 0);
|
||||
return logf(x) / logf(2.0f);
|
||||
}
|
||||
|
||||
inline float exp2f(float x)
|
||||
NV_FORCEINLINE float exp2f(float x)
|
||||
{
|
||||
return powf(2, x);
|
||||
return powf(2.0f, x);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user