From fa468b04ab991fa25b930183cb1caa9923548452 Mon Sep 17 00:00:00 2001 From: castano Date: Tue, 14 Feb 2012 16:31:25 +0000 Subject: [PATCH] Merge changes from The Witness. --- src/nvmath/Box.cpp | 35 ++++++ src/nvmath/Box.h | 4 + src/nvmath/Box.inl | 38 +++---- src/nvmath/Fitting.cpp | 218 ++++++++++++++++++++++++++++++++++-- src/nvmath/Fitting.h | 4 + src/nvmath/Half.cpp | 124 ++++++++++++++++---- src/nvmath/Half.h | 13 ++- src/nvmath/SimdVector_SSE.h | 80 +++++++------ src/nvmath/Vector.inl | 55 ++++++++- src/nvmath/nvmath.h | 7 +- 10 files changed, 492 insertions(+), 86 deletions(-) diff --git a/src/nvmath/Box.cpp b/src/nvmath/Box.cpp index 7368f3c..48e8517 100644 --- a/src/nvmath/Box.cpp +++ b/src/nvmath/Box.cpp @@ -29,3 +29,38 @@ float nv::distanceSquared(const Box &box, const Vector3 &point) { /*bool nv::overlap(const Box &box, const Sphere &sphere) { return distanceSquared(box, sphere.center) < sphere.radius * sphere.radius; }*/ + + +bool nv::intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t /*= NULL*/) { + // Precompute these in ray structure? + int sdx = (id.x < 0); + int sdy = (id.y < 0); + int sdz = (id.z < 0); + + float tmin = (box.corner( sdx).x - p.x) * id.x; + float tmax = (box.corner(1-sdx).x - p.x) * id.x; + float tymin = (box.corner( sdy).y - p.y) * id.y; + float tymax = (box.corner(1-sdy).y - p.y) * id.y; + + if ((tmin > tymax) || (tymin > tmax)) + return false; + + if (tymin > tmin) tmin = tymin; + if (tymax < tmax) tmax = tymax; + + float tzmin = (box.corner( sdz).z - p.z) * id.z; + float tzmax = (box.corner(1-sdz).z - p.z) * id.z; + + if ((tmin > tzmax) || (tzmin > tmax)) + return false; + + if (tzmin > tmin) tmin = tzmin; + if (tzmax < tmax) tmax = tzmax; + + if (tmax < 0) + return false; + + if (t != NULL) *t = tmin; + + return true; +} diff --git a/src/nvmath/Box.h b/src/nvmath/Box.h index 74e4bf3..71fff03 100644 --- a/src/nvmath/Box.h +++ b/src/nvmath/Box.h @@ -74,6 +74,8 @@ namespace nv friend Stream & operator<< (Stream & s, Box & box); + const Vector3 & corner(int i) const { return (&minCorner)[i]; } + Vector3 minCorner; Vector3 maxCorner; }; @@ -81,6 +83,8 @@ namespace nv float distanceSquared(const Box &box, const Vector3 &point); bool overlap(const Box &box, const Sphere &sphere); + // p is ray origin, id is inverse ray direction. + bool intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t); } // nv namespace diff --git a/src/nvmath/Box.inl b/src/nvmath/Box.inl index 9b69828..55d738e 100644 --- a/src/nvmath/Box.inl +++ b/src/nvmath/Box.inl @@ -12,51 +12,51 @@ namespace nv { // Default ctor. - Box::Box() { }; + inline Box::Box() { }; // Copy ctor. - Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { } + inline Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { } // Init ctor. - Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { } + inline Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { } // Assignment operator. - Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; } + inline Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; } // Clear the bounds. - void Box::clearBounds() + inline void Box::clearBounds() { minCorner.set(FLT_MAX, FLT_MAX, FLT_MAX); maxCorner.set(-FLT_MAX, -FLT_MAX, -FLT_MAX); } // Build a cube centered on center and with edge = 2*dist - void Box::cube(const Vector3 & center, float dist) + inline void Box::cube(const Vector3 & center, float dist) { setCenterExtents(center, Vector3(dist, dist, dist)); } // Build a box, given center and extents. - void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents) + inline void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents) { minCorner = center - extents; maxCorner = center + extents; } // Get box center. - Vector3 Box::center() const + inline Vector3 Box::center() const { return (minCorner + maxCorner) * 0.5f; } // Return extents of the box. - Vector3 Box::extents() const + inline Vector3 Box::extents() const { return (maxCorner - minCorner) * 0.5f; } // Return extents of the box. - float Box::extents(uint axis) const + inline float Box::extents(uint axis) const { nvDebugCheck(axis < 3); if (axis == 0) return (maxCorner.x - minCorner.x) * 0.5f; @@ -67,55 +67,55 @@ namespace nv } // Add a point to this box. - void Box::addPointToBounds(const Vector3 & p) + inline void Box::addPointToBounds(const Vector3 & p) { minCorner = min(minCorner, p); maxCorner = max(maxCorner, p); } // Add a box to this box. - void Box::addBoxToBounds(const Box & b) + inline void Box::addBoxToBounds(const Box & b) { minCorner = min(minCorner, b.minCorner); maxCorner = max(maxCorner, b.maxCorner); } // Translate box. - void Box::translate(const Vector3 & v) + inline void Box::translate(const Vector3 & v) { minCorner += v; maxCorner += v; } // Scale the box. - void Box::scale(float s) + inline void Box::scale(float s) { minCorner *= s; maxCorner *= s; } // Expand the box by a fixed amount. - void Box::expand(float r) { + inline void Box::expand(float r) { minCorner -= Vector3(r,r,r); maxCorner += Vector3(r,r,r); } // Get the area of the box. - float Box::area() const + inline float Box::area() const { const Vector3 d = extents(); return 8.0f * (d.x*d.y + d.x*d.z + d.y*d.z); } // Get the volume of the box. - float Box::volume() const + inline float Box::volume() const { Vector3 d = extents(); return 8.0f * (d.x * d.y * d.z); } // Return true if the box contains the given point. - bool Box::contains(const Vector3 & p) const + inline bool Box::contains(const Vector3 & p) const { return minCorner.x < p.x && minCorner.y < p.y && minCorner.z < p.z && @@ -123,7 +123,7 @@ namespace nv } // Split the given box in 8 octants and assign the ith one to this box. - void Box::setOctant(const Box & box, const Vector3 & center, int i) + inline void Box::setOctant(const Box & box, const Vector3 & center, int i) { minCorner = box.minCorner; maxCorner = box.maxCorner; diff --git a/src/nvmath/Fitting.cpp b/src/nvmath/Fitting.cpp index 57c755a..0106ef8 100644 --- a/src/nvmath/Fitting.cpp +++ b/src/nvmath/Fitting.cpp @@ -159,24 +159,228 @@ Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points) float matrix[6]; Vector3 centroid = computeCovariance(n, points, matrix); - if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0) + if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0) { // If no plane defined, then return a horizontal plane. return Plane(Vector3(0, 0, 1), centroid); } -#pragma NV_MESSAGE("TODO: need to write an eigensolver!") + float eigenValues[3]; + Vector3 eigenVectors[3]; + if (!eigenSolveSymmetric(matrix, eigenValues, eigenVectors)) { + // If no plane defined, then return a horizontal plane. + return Plane(Vector3(0, 0, 1), centroid); + } - // - Numerical Recipes in C is a good reference. Householder transforms followed by QL decomposition seems to be the best approach. - // - The one from magic-tools is now LGPL. For the 3D case it uses a cubic root solver, which is not very accurate. - // - Charles' Galaxy3 contains an implementation of the tridiagonalization method, but is under BPL. + return Plane(eigenVectors[2], centroid); +} + +bool nv::Fit::isPlanar(int n, const Vector3 * points, float epsilon/*=NV_EPSILON*/) +{ + // compute the centroid and covariance + float matrix[6]; + Vector3 centroid = computeCovariance(n, points, matrix); - //EigenSolver3 solver(matrix); + float eigenValues[3]; + Vector3 eigenVectors[3]; + if (!eigenSolveSymmetric(matrix, eigenValues, eigenVectors)) { + return false; + } - return Plane(); + return eigenValues[2] < epsilon; } + +// Tridiagonal solver from Charles Bloom. +// Householder transforms followed by QL decomposition. +// Seems to be based on the code from Numerical Recipes in C. + +static void EigenSolver_Tridiagonal(double mat[3][3],double * diag,double * subd); +static bool EigenSolver_QLAlgorithm(double mat[3][3],double * diag,double * subd); + +bool nv::Fit::eigenSolveSymmetric(float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]) +{ + nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL); + + double subd[3]; + double diag[3]; + double work[3][3]; + + work[0][0] = matrix[0]; + work[0][1] = work[1][0] = matrix[1]; + work[0][2] = work[2][0] = matrix[2]; + work[1][1] = matrix[3]; + work[1][2] = work[2][1] = matrix[4]; + work[2][2] = matrix[5]; + + EigenSolver_Tridiagonal(work, diag, subd); + if (!EigenSolver_QLAlgorithm(work, diag, subd)) + { + for (int i = 0; i < 3; i++) { + eigenValues[i] = 0; + eigenVectors[i] = Vector3(0); + } + return false; + } + + for (int i = 0; i < 3; i++) { + eigenValues[i] = (float)diag[i]; + } + + // eigenvectors are the columns; make them the rows : + + for (int i=0; i < 3; i++) + { + for (int j = 0; j < 3; j++) + { + eigenVectors[j].component[i] = (float) work[i][j]; + } + } + + // shuffle to sort by singular value : + if (eigenValues[2] > eigenValues[0] && eigenValues[2] > eigenValues[1]) + { + swap(eigenValues[0], eigenValues[2]); + swap(eigenVectors[0], eigenVectors[2]); + } + if (eigenValues[1] > eigenValues[0]) + { + swap(eigenValues[0], eigenValues[1]); + swap(eigenVectors[0], eigenVectors[1]); + } + if (eigenValues[2] > eigenValues[1]) + { + swap(eigenValues[1], eigenValues[2]); + swap(eigenVectors[1], eigenVectors[2]); + } + + nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2]); + nvDebugCheck(eigenValues[1] >= eigenValues[2]); + + return true; +} + +static void EigenSolver_Tridiagonal(double mat[3][3],double * diag,double * subd) +{ + // Householder reduction T = Q^t M Q + // Input: + // mat, symmetric 3x3 matrix M + // Output: + // mat, orthogonal matrix Q + // diag, diagonal entries of T + // subd, subdiagonal entries of T (T is symmetric) + const double epsilon = 1e-08f; + + double a = mat[0][0]; + double b = mat[0][1]; + double c = mat[0][2]; + double d = mat[1][1]; + double e = mat[1][2]; + double f = mat[2][2]; + + diag[0] = a; + subd[2] = 0.f; + if ( fabs(c) >= epsilon ) + { + const double ell = sqrt(b*b+c*c); + b /= ell; + c /= ell; + const double q = 2*b*e+c*(f-d); + diag[1] = d+c*q; + diag[2] = f-c*q; + subd[0] = ell; + subd[1] = e-b*q; + mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0; + mat[1][0] = 0; mat[1][1] = b; mat[1][2] = c; + mat[2][0] = 0; mat[2][1] = c; mat[2][2] = -b; + } + else + { + diag[1] = d; + diag[2] = f; + subd[0] = b; + subd[1] = e; + mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0; + mat[1][0] = 0; mat[1][1] = 1; mat[1][2] = 0; + mat[2][0] = 0; mat[2][1] = 0; mat[2][2] = 1; + } +} + +static bool EigenSolver_QLAlgorithm(double mat[3][3],double * diag,double * subd) +{ + // QL iteration with implicit shifting to reduce matrix from tridiagonal + // to diagonal + const int maxiter = 32; + + for (int ell = 0; ell < 3; ell++) + { + int iter; + for (iter = 0; iter < maxiter; iter++) + { + int m; + for (m = ell; m <= 1; m++) + { + double dd = fabs(diag[m]) + fabs(diag[m+1]); + if ( fabs(subd[m]) + dd == dd ) + break; + } + if ( m == ell ) + break; + + double g = (diag[ell+1]-diag[ell])/(2*subd[ell]); + double r = sqrt(g*g+1); + if ( g < 0 ) + g = diag[m]-diag[ell]+subd[ell]/(g-r); + else + g = diag[m]-diag[ell]+subd[ell]/(g+r); + double s = 1, c = 1, p = 0; + for (int i = m-1; i >= ell; i--) + { + double f = s*subd[i], b = c*subd[i]; + if ( fabs(f) >= fabs(g) ) + { + c = g/f; + r = sqrt(c*c+1); + subd[i+1] = f*r; + c *= (s = 1/r); + } + else + { + s = f/g; + r = sqrt(s*s+1); + subd[i+1] = g*r; + s *= (c = 1/r); + } + g = diag[i+1]-p; + r = (diag[i]-g)*s+2*b*c; + p = s*r; + diag[i+1] = g+p; + g = c*r-b; + + for (int k = 0; k < 3; k++) + { + f = mat[k][i+1]; + mat[k][i+1] = s*mat[k][i]+c*f; + mat[k][i] = c*mat[k][i]-s*f; + } + } + diag[ell] -= p; + subd[ell] = g; + subd[m] = 0; + } + + if ( iter == maxiter ) + // should not get here under normal circumstances + return false; + } + + return true; +} + + + + int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster) { // Compute principal component. diff --git a/src/nvmath/Fitting.h b/src/nvmath/Fitting.h index 84d397f..73fad82 100644 --- a/src/nvmath/Fitting.h +++ b/src/nvmath/Fitting.h @@ -23,6 +23,10 @@ namespace nv Vector3 computePrincipalComponent(int n, const Vector3 * points, const float * weights, const Vector3 & metric); Plane bestPlane(int n, const Vector3 * points); + bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON); + + bool eigenSolveSymmetric (float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]); + // Returns number of clusters [1-4]. int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster); diff --git a/src/nvmath/Half.cpp b/src/nvmath/Half.cpp index b0bd2a8..7bd9122 100644 --- a/src/nvmath/Half.cpp +++ b/src/nvmath/Half.cpp @@ -488,16 +488,20 @@ nv::half_to_float( uint16 h ) } +// @@ This code appears to be wrong. // @@ These tables could be smaller. -static uint32 mantissa_table[2048]; -static uint32 exponent_table[64]; -static uint32 offset_table[64]; +namespace nv { + uint32 mantissa_table[2048]; + uint32 exponent_table[64]; + uint32 offset_table[64]; +} void nv::half_init_tables() { // Init mantissa table. mantissa_table[0] = 0; + // denormals for (int i = 1; i < 1024; i++) { uint m = i << 13; uint e = 0; @@ -511,8 +515,9 @@ void nv::half_init_tables() mantissa_table[i] = m | e; } + // normals for (int i = 1024; i < 2048; i++) { - mantissa_table[i] = 0x38000000 + ((i - 1024) << 13); + mantissa_table[i] = (i - 1024) << 13; } @@ -520,17 +525,17 @@ void nv::half_init_tables() exponent_table[0] = 0; for (int i = 1; i < 31; i++) { - exponent_table[i] = (i << 23); + exponent_table[i] = 0x38000000 + (i << 23); } - exponent_table[31] = 0x47800000; + exponent_table[31] = 0x7f800000; exponent_table[32] = 0x80000000; for (int i = 33; i < 63; i++) { - exponent_table[i] = 0x80000000 + ((i - 32) << 23); + exponent_table[i] = 0xb8000000 + ((i - 32) << 23); } - exponent_table[63] = 0xC7800000; + exponent_table[63] = 0xff800000; // Init offset table. @@ -545,22 +550,11 @@ void nv::half_init_tables() for (int i = 33; i < 64; i++) { offset_table[i] = 1024; } - - /*for (int i = 0; i < 64; i++) { - offset_table[i] = ((i & 31) != 0) * 1024; - }*/ -} - -// Fast half to float conversion based on: -// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf -uint32 nv::fast_half_to_float(uint16 h) -{ - uint exp = h >> 10; - return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp]; } #if 0 + // Inaccurate conversion suggested at the ffmpeg mailing list: // http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html uint32 nv::fast_half_to_float(uint16 v) @@ -609,4 +603,94 @@ __asm } +#endif + +#if 0 +// These version computes the tables at compile time: +// http://gamedev.stackexchange.com/questions/17326/conversion-of-a-number-from-single-precision-floating-point-representation-to-a + +/* This method is faster than the OpenEXR implementation (very often + * used, eg. in Ogre), with the additional benefit of rounding, inspired + * by James Tursa’s half-precision code. */ +static inline uint16_t float_to_half_branch(uint32_t x) +{ + uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */ + uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */ + unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */ + + /* If zero, or denormal, or exponent underflows too much for a denormal + * half, return signed zero. */ + if (e < 103) + return bits; + + /* If NaN, return NaN. If Inf or exponent overflow, return Inf. */ + if (e > 142) + { + bits |= 0x7c00u; + /* If exponent was 0xff and one mantissa bit was set, it means NaN, + * not Inf, so make sure we set one mantissa bit too. */ + bits |= e == 255 && (x & 0x007fffffu); + return bits; + } + + /* If exponent underflows but not too much, return a denormal */ + if (e < 113) + { + m |= 0x0800u; + /* Extra rounding may overflow and set mantissa to 0 and exponent + * to 1, which is OK. */ + bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1); + return bits; + } + + bits |= ((e - 112) << 10) | (m >> 1); + /* Extra rounding. An overflow will set mantissa to 0 and increment + * the exponent, which is OK. */ + bits += m & 1; + return bits; +} + +/* These macros implement a finite iterator useful to build lookup + * tables. For instance, S64(0) will call S1(x) for all values of x + * between 0 and 63. + * Due to the exponential behaviour of the calls, the stress on the + * compiler may be important. */ +#define S4(x) S1((x)), S1((x)+1), S1((x)+2), S1((x)+3) +#define S16(x) S4((x)), S4((x)+4), S4((x)+8), S4((x)+12) +#define S64(x) S16((x)), S16((x)+16), S16((x)+32), S16((x)+48) +#define S256(x) S64((x)), S64((x)+64), S64((x)+128), S64((x)+192) +#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768) + +/* Lookup table-based algorithm from “Fast Half Float Conversions” + * by Jeroen van der Zijp, November 2008. No rounding is performed, + * and some NaN values may be incorrectly converted to Inf. */ +static inline uint16_t float_to_half_nobranch(uint32_t x) +{ + static uint16_t const basetable[512] = + { +#define S1(i) (((i) < 103) ? 0x0000 : \ + ((i) < 113) ? 0x0400 >> (113 - (i)) : \ + ((i) < 143) ? ((i) - 112) << 10 : 0x7c00) + S256(0), +#undef S1 +#define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \ + ((i) < 113) ? 0x0400 >> (113 - (i)) : \ + ((i) < 143) ? ((i) - 112) << 10 : 0x7c00)) + S256(0), +#undef S1 + }; + + static uint8_t const shifttable[512] = + { +#define S1(i) (((i) < 103) ? 24 : \ + ((i) < 113) ? 126 - (i) : \ + ((i) < 143 || (i) == 255) ? 13 : 24) + S256(0), S256(0), +#undef S1 + }; + + uint16_t bits = basetable[(x >> 23) & 0x1ff]; + bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff]; + return bits; +} #endif \ No newline at end of file diff --git a/src/nvmath/Half.h b/src/nvmath/Half.h index f732e93..53fceb6 100644 --- a/src/nvmath/Half.h +++ b/src/nvmath/Half.h @@ -11,7 +11,18 @@ namespace nv { void half_init_tables(); - uint32 fast_half_to_float(uint16 h); + extern uint32 mantissa_table[2048]; + extern uint32 exponent_table[64]; + extern uint32 offset_table[64]; + + // Fast half to float conversion based on: + // http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf + inline uint32 fast_half_to_float(uint16 h) + { + uint exp = h >> 10; + return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp]; + } + inline uint16 to_half(float c) { union { float f; uint32 u; } f; diff --git a/src/nvmath/SimdVector_SSE.h b/src/nvmath/SimdVector_SSE.h index a8d52d9..c816de2 100644 --- a/src/nvmath/SimdVector_SSE.h +++ b/src/nvmath/SimdVector_SSE.h @@ -33,8 +33,15 @@ #include #endif +// See this for ideas: +// http://molecularmusings.wordpress.com/2011/10/18/simdifying-multi-platform-math/ + + namespace nv { +#define NV_SIMD_NATIVE NV_FORCEINLINE +#define NV_SIMD_INLINE inline + class SimdVector { public: @@ -42,45 +49,47 @@ namespace nv { typedef SimdVector const& Arg; - NV_FORCEINLINE SimdVector() {} - NV_FORCEINLINE explicit SimdVector(float f) : vec(_mm_set1_ps(f)) {} - NV_FORCEINLINE explicit SimdVector(__m128 v) : vec(v) {} - - NV_FORCEINLINE explicit SimdVector(NV_ALIGN_16 Vector4 v) { - vec = _mm_load_ps( v.component ); + NV_SIMD_NATIVE SimdVector() {} + + NV_SIMD_NATIVE explicit SimdVector(__m128 v) : vec(v) {} + + NV_SIMD_NATIVE explicit SimdVector(float f) { + vec = _mm_set1_ps(f); } - NV_FORCEINLINE explicit SimdVector(const float * v) { + NV_SIMD_NATIVE explicit SimdVector(const float * v) + { vec = _mm_load_ps( v ); } - NV_FORCEINLINE SimdVector(float x, float y, float z, float w) { + NV_SIMD_NATIVE SimdVector(float x, float y, float z, float w) + { vec = _mm_setr_ps( x, y, z, w ); } - NV_FORCEINLINE SimdVector(const SimdVector & arg) : vec(arg.vec) {} + NV_SIMD_NATIVE SimdVector(const SimdVector & arg) : vec(arg.vec) {} - NV_FORCEINLINE SimdVector & operator=(const SimdVector & arg) { + NV_SIMD_NATIVE SimdVector & operator=(const SimdVector & arg) + { vec = arg.vec; return *this; } - - float toFloat() const + NV_SIMD_INLINE float toFloat() const { NV_ALIGN_16 float f; _mm_store_ss(&f, vec); return f; } - Vector3 toVector3() const + NV_SIMD_INLINE Vector3 toVector3() const { NV_ALIGN_16 float c[4]; _mm_store_ps( c, vec ); return Vector3( c[0], c[1], c[2] ); } - Vector4 toVector4() const + NV_SIMD_INLINE Vector4 toVector4() const { NV_ALIGN_16 float c[4]; _mm_store_ps( c, vec ); @@ -88,57 +97,60 @@ namespace nv { } #define SSE_SPLAT( a ) ((a) | ((a) << 2) | ((a) << 4) | ((a) << 6)) - NV_FORCEINLINE SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); } - NV_FORCEINLINE SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); } - NV_FORCEINLINE SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); } - NV_FORCEINLINE SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); } + NV_SIMD_NATIVE SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); } + NV_SIMD_NATIVE SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); } + NV_SIMD_NATIVE SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); } + NV_SIMD_NATIVE SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); } #undef SSE_SPLAT - NV_FORCEINLINE SimdVector & operator+=( Arg v ) { + NV_SIMD_NATIVE SimdVector& operator+=( Arg v ) + { vec = _mm_add_ps( vec, v.vec ); return *this; } - NV_FORCEINLINE SimdVector & operator-=( Arg v ) { + NV_SIMD_NATIVE SimdVector& operator-=( Arg v ) + { vec = _mm_sub_ps( vec, v.vec ); return *this; } - NV_FORCEINLINE SimdVector & operator*=( Arg v ) { + NV_SIMD_NATIVE SimdVector& operator*=( Arg v ) + { vec = _mm_mul_ps( vec, v.vec ); return *this; } }; - NV_FORCEINLINE SimdVector operator+(SimdVector::Arg left, SimdVector::Arg right) + NV_SIMD_NATIVE SimdVector operator+( SimdVector::Arg left, SimdVector::Arg right ) { return SimdVector( _mm_add_ps( left.vec, right.vec ) ); } - NV_FORCEINLINE SimdVector operator-(SimdVector::Arg left, SimdVector::Arg right) + NV_SIMD_NATIVE SimdVector operator-( SimdVector::Arg left, SimdVector::Arg right ) { return SimdVector( _mm_sub_ps( left.vec, right.vec ) ); } - NV_FORCEINLINE SimdVector operator*(SimdVector::Arg left, SimdVector::Arg right) + NV_SIMD_NATIVE SimdVector operator*( SimdVector::Arg left, SimdVector::Arg right ) { return SimdVector( _mm_mul_ps( left.vec, right.vec ) ); } // Returns a*b + c - NV_FORCEINLINE SimdVector multiplyAdd(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c) + NV_SIMD_INLINE SimdVector multiplyAdd( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c ) { return SimdVector( _mm_add_ps( _mm_mul_ps( a.vec, b.vec ), c.vec ) ); } - // Returns -( a*b - c ) = c - a*b - NV_FORCEINLINE SimdVector negativeMultiplySubtract(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c) + // Returns -( a*b - c ) + NV_SIMD_INLINE SimdVector negativeMultiplySubtract( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c ) { return SimdVector( _mm_sub_ps( c.vec, _mm_mul_ps( a.vec, b.vec ) ) ); } - inline SimdVector reciprocal( SimdVector::Arg v ) + NV_SIMD_INLINE SimdVector reciprocal( SimdVector::Arg v ) { // get the reciprocal estimate __m128 estimate = _mm_rcp_ps( v.vec ); @@ -148,17 +160,17 @@ namespace nv { return SimdVector( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) ); } - NV_FORCEINLINE SimdVector min(SimdVector::Arg left, SimdVector::Arg right) + NV_SIMD_NATIVE SimdVector min( SimdVector::Arg left, SimdVector::Arg right ) { return SimdVector( _mm_min_ps( left.vec, right.vec ) ); } - NV_FORCEINLINE SimdVector max(SimdVector::Arg left, SimdVector::Arg right) + NV_SIMD_NATIVE SimdVector max( SimdVector::Arg left, SimdVector::Arg right ) { return SimdVector( _mm_max_ps( left.vec, right.vec ) ); } - inline SimdVector truncate( SimdVector::Arg v ) + NV_SIMD_INLINE SimdVector truncate( SimdVector::Arg v ) { #if (NV_USE_SSE == 1) // convert to ints @@ -179,12 +191,12 @@ namespace nv { #endif } - NV_FORCEINLINE SimdVector compareEqual(SimdVector::Arg left, SimdVector::Arg right) + NV_SIMD_NATIVE SimdVector compareEqual( SimdVector::Arg left, SimdVector::Arg right ) { return SimdVector( _mm_cmpeq_ps( left.vec, right.vec ) ); } - inline SimdVector select(SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits) + NV_SIMD_INLINE SimdVector select( SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits ) { __m128 a = _mm_andnot_ps( bits.vec, off.vec ); __m128 b = _mm_and_ps( bits.vec, on.vec ); @@ -192,7 +204,7 @@ namespace nv { return SimdVector( _mm_or_ps( a, b ) ); } - inline bool compareAnyLessThan(SimdVector::Arg left, SimdVector::Arg right) + NV_SIMD_INLINE bool compareAnyLessThan( SimdVector::Arg left, SimdVector::Arg right ) { __m128 bits = _mm_cmplt_ps( left.vec, right.vec ); int value = _mm_movemask_ps( bits ); diff --git a/src/nvmath/Vector.inl b/src/nvmath/Vector.inl index d2d3341..d252e4d 100644 --- a/src/nvmath/Vector.inl +++ b/src/nvmath/Vector.inl @@ -314,6 +314,12 @@ namespace nv return scale(v, 1.0f/s); } + inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t) + { + const float s = 1.0f - t; + return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y); + } + inline float dot(Vector2::Arg a, Vector2::Arg b) { return a.x * b.x + a.y * b.y; @@ -381,6 +387,16 @@ namespace nv return Vector2(max(a.x, b.x), max(a.y, b.y)); } + inline Vector2 clamp(Vector2::Arg v, float min, float max) + { + return Vector2(clamp(v.x, min, max), clamp(v.y, min, max)); + } + + inline Vector2 saturate(Vector2::Arg v) + { + return Vector2(saturate(v.x), saturate(v.y)); + } + inline bool isFinite(Vector2::Arg v) { return isFinite(v.x) && isFinite(v.y); @@ -394,6 +410,7 @@ namespace nv return vf; } + // Note, this is the area scaled by 2! inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c) { Vector2 v0 = a - c; @@ -500,6 +517,16 @@ namespace nv return sqrtf(lengthSquared(v)); } + inline float distance(Vector3::Arg a, Vector3::Arg b) + { + return length(a - b); + } + + inline float distanceSquared(Vector3::Arg a, Vector3::Arg b) + { + return lengthSquared(a - b); + } + inline float inverseLength(Vector3::Arg v) { return 1.0f / sqrtf(lengthSquared(v)); @@ -557,6 +584,11 @@ namespace nv return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max)); } + inline Vector3 saturate(Vector3::Arg v) + { + return Vector3(saturate(v.x), saturate(v.y), saturate(v.z)); + } + inline Vector3 floor(Vector3::Arg v) { return Vector3(floorf(v.x), floorf(v.y), floorf(v.z)); @@ -580,6 +612,11 @@ namespace nv return vf; } + inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n) + { + return v - (2 * dot(v, n)) * n; + } + // Vector4 @@ -627,9 +664,15 @@ namespace nv return scale(v, 1.0f/s); } - inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s) + /*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s) { return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s); + }*/ + + inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t) + { + const float s = 1.0f - t; + return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w); } inline float dot(Vector4::Arg a, Vector4::Arg b) @@ -699,6 +742,16 @@ namespace nv return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); } + inline Vector4 clamp(Vector4::Arg v, float min, float max) + { + return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max)); + } + + inline Vector4 saturate(Vector4::Arg v) + { + return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w)); + } + inline bool isFinite(Vector4::Arg v) { return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w); diff --git a/src/nvmath/nvmath.h b/src/nvmath/nvmath.h index 20c64d9..798ddce 100644 --- a/src/nvmath/nvmath.h +++ b/src/nvmath/nvmath.h @@ -155,15 +155,14 @@ namespace nv } #if NV_CC_MSVC - inline float log2f(float x) + NV_FORCEINLINE float log2f(float x) { nvCheck(x >= 0); return logf(x) / logf(2.0f); } - - inline float exp2f(float x) + NV_FORCEINLINE float exp2f(float x) { - return powf(2, x); + return powf(2.0f, x); } #endif