Merge changes from The Witness.

pull/216/head
castano 12 years ago
parent 06c170b41b
commit fa468b04ab

@ -29,3 +29,38 @@ float nv::distanceSquared(const Box &box, const Vector3 &point) {
/*bool nv::overlap(const Box &box, const Sphere &sphere) {
return distanceSquared(box, sphere.center) < sphere.radius * sphere.radius;
}*/
bool nv::intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t /*= NULL*/) {
// Precompute these in ray structure?
int sdx = (id.x < 0);
int sdy = (id.y < 0);
int sdz = (id.z < 0);
float tmin = (box.corner( sdx).x - p.x) * id.x;
float tmax = (box.corner(1-sdx).x - p.x) * id.x;
float tymin = (box.corner( sdy).y - p.y) * id.y;
float tymax = (box.corner(1-sdy).y - p.y) * id.y;
if ((tmin > tymax) || (tymin > tmax))
return false;
if (tymin > tmin) tmin = tymin;
if (tymax < tmax) tmax = tymax;
float tzmin = (box.corner( sdz).z - p.z) * id.z;
float tzmax = (box.corner(1-sdz).z - p.z) * id.z;
if ((tmin > tzmax) || (tzmin > tmax))
return false;
if (tzmin > tmin) tmin = tzmin;
if (tzmax < tmax) tmax = tzmax;
if (tmax < 0)
return false;
if (t != NULL) *t = tmin;
return true;
}

@ -74,6 +74,8 @@ namespace nv
friend Stream & operator<< (Stream & s, Box & box);
const Vector3 & corner(int i) const { return (&minCorner)[i]; }
Vector3 minCorner;
Vector3 maxCorner;
};
@ -81,6 +83,8 @@ namespace nv
float distanceSquared(const Box &box, const Vector3 &point);
bool overlap(const Box &box, const Sphere &sphere);
// p is ray origin, id is inverse ray direction.
bool intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t);
} // nv namespace

@ -12,51 +12,51 @@
namespace nv
{
// Default ctor.
Box::Box() { };
inline Box::Box() { };
// Copy ctor.
Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { }
inline Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { }
// Init ctor.
Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { }
inline Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { }
// Assignment operator.
Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; }
inline Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; }
// Clear the bounds.
void Box::clearBounds()
inline void Box::clearBounds()
{
minCorner.set(FLT_MAX, FLT_MAX, FLT_MAX);
maxCorner.set(-FLT_MAX, -FLT_MAX, -FLT_MAX);
}
// Build a cube centered on center and with edge = 2*dist
void Box::cube(const Vector3 & center, float dist)
inline void Box::cube(const Vector3 & center, float dist)
{
setCenterExtents(center, Vector3(dist, dist, dist));
}
// Build a box, given center and extents.
void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents)
inline void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents)
{
minCorner = center - extents;
maxCorner = center + extents;
}
// Get box center.
Vector3 Box::center() const
inline Vector3 Box::center() const
{
return (minCorner + maxCorner) * 0.5f;
}
// Return extents of the box.
Vector3 Box::extents() const
inline Vector3 Box::extents() const
{
return (maxCorner - minCorner) * 0.5f;
}
// Return extents of the box.
float Box::extents(uint axis) const
inline float Box::extents(uint axis) const
{
nvDebugCheck(axis < 3);
if (axis == 0) return (maxCorner.x - minCorner.x) * 0.5f;
@ -67,55 +67,55 @@ namespace nv
}
// Add a point to this box.
void Box::addPointToBounds(const Vector3 & p)
inline void Box::addPointToBounds(const Vector3 & p)
{
minCorner = min(minCorner, p);
maxCorner = max(maxCorner, p);
}
// Add a box to this box.
void Box::addBoxToBounds(const Box & b)
inline void Box::addBoxToBounds(const Box & b)
{
minCorner = min(minCorner, b.minCorner);
maxCorner = max(maxCorner, b.maxCorner);
}
// Translate box.
void Box::translate(const Vector3 & v)
inline void Box::translate(const Vector3 & v)
{
minCorner += v;
maxCorner += v;
}
// Scale the box.
void Box::scale(float s)
inline void Box::scale(float s)
{
minCorner *= s;
maxCorner *= s;
}
// Expand the box by a fixed amount.
void Box::expand(float r) {
inline void Box::expand(float r) {
minCorner -= Vector3(r,r,r);
maxCorner += Vector3(r,r,r);
}
// Get the area of the box.
float Box::area() const
inline float Box::area() const
{
const Vector3 d = extents();
return 8.0f * (d.x*d.y + d.x*d.z + d.y*d.z);
}
// Get the volume of the box.
float Box::volume() const
inline float Box::volume() const
{
Vector3 d = extents();
return 8.0f * (d.x * d.y * d.z);
}
// Return true if the box contains the given point.
bool Box::contains(const Vector3 & p) const
inline bool Box::contains(const Vector3 & p) const
{
return
minCorner.x < p.x && minCorner.y < p.y && minCorner.z < p.z &&
@ -123,7 +123,7 @@ namespace nv
}
// Split the given box in 8 octants and assign the ith one to this box.
void Box::setOctant(const Box & box, const Vector3 & center, int i)
inline void Box::setOctant(const Box & box, const Vector3 & center, int i)
{
minCorner = box.minCorner;
maxCorner = box.maxCorner;

@ -159,24 +159,228 @@ Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points)
float matrix[6];
Vector3 centroid = computeCovariance(n, points, matrix);
if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0)
if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
{
// If no plane defined, then return a horizontal plane.
return Plane(Vector3(0, 0, 1), centroid);
}
#pragma NV_MESSAGE("TODO: need to write an eigensolver!")
float eigenValues[3];
Vector3 eigenVectors[3];
if (!eigenSolveSymmetric(matrix, eigenValues, eigenVectors)) {
// If no plane defined, then return a horizontal plane.
return Plane(Vector3(0, 0, 1), centroid);
}
// - Numerical Recipes in C is a good reference. Householder transforms followed by QL decomposition seems to be the best approach.
// - The one from magic-tools is now LGPL. For the 3D case it uses a cubic root solver, which is not very accurate.
// - Charles' Galaxy3 contains an implementation of the tridiagonalization method, but is under BPL.
return Plane(eigenVectors[2], centroid);
}
bool nv::Fit::isPlanar(int n, const Vector3 * points, float epsilon/*=NV_EPSILON*/)
{
// compute the centroid and covariance
float matrix[6];
Vector3 centroid = computeCovariance(n, points, matrix);
//EigenSolver3 solver(matrix);
float eigenValues[3];
Vector3 eigenVectors[3];
if (!eigenSolveSymmetric(matrix, eigenValues, eigenVectors)) {
return false;
}
return Plane();
return eigenValues[2] < epsilon;
}
// Tridiagonal solver from Charles Bloom.
// Householder transforms followed by QL decomposition.
// Seems to be based on the code from Numerical Recipes in C.
static void EigenSolver_Tridiagonal(double mat[3][3],double * diag,double * subd);
static bool EigenSolver_QLAlgorithm(double mat[3][3],double * diag,double * subd);
bool nv::Fit::eigenSolveSymmetric(float matrix[6], float eigenValues[3], Vector3 eigenVectors[3])
{
nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL);
double subd[3];
double diag[3];
double work[3][3];
work[0][0] = matrix[0];
work[0][1] = work[1][0] = matrix[1];
work[0][2] = work[2][0] = matrix[2];
work[1][1] = matrix[3];
work[1][2] = work[2][1] = matrix[4];
work[2][2] = matrix[5];
EigenSolver_Tridiagonal(work, diag, subd);
if (!EigenSolver_QLAlgorithm(work, diag, subd))
{
for (int i = 0; i < 3; i++) {
eigenValues[i] = 0;
eigenVectors[i] = Vector3(0);
}
return false;
}
for (int i = 0; i < 3; i++) {
eigenValues[i] = (float)diag[i];
}
// eigenvectors are the columns; make them the rows :
for (int i=0; i < 3; i++)
{
for (int j = 0; j < 3; j++)
{
eigenVectors[j].component[i] = (float) work[i][j];
}
}
// shuffle to sort by singular value :
if (eigenValues[2] > eigenValues[0] && eigenValues[2] > eigenValues[1])
{
swap(eigenValues[0], eigenValues[2]);
swap(eigenVectors[0], eigenVectors[2]);
}
if (eigenValues[1] > eigenValues[0])
{
swap(eigenValues[0], eigenValues[1]);
swap(eigenVectors[0], eigenVectors[1]);
}
if (eigenValues[2] > eigenValues[1])
{
swap(eigenValues[1], eigenValues[2]);
swap(eigenVectors[1], eigenVectors[2]);
}
nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2]);
nvDebugCheck(eigenValues[1] >= eigenValues[2]);
return true;
}
static void EigenSolver_Tridiagonal(double mat[3][3],double * diag,double * subd)
{
// Householder reduction T = Q^t M Q
// Input:
// mat, symmetric 3x3 matrix M
// Output:
// mat, orthogonal matrix Q
// diag, diagonal entries of T
// subd, subdiagonal entries of T (T is symmetric)
const double epsilon = 1e-08f;
double a = mat[0][0];
double b = mat[0][1];
double c = mat[0][2];
double d = mat[1][1];
double e = mat[1][2];
double f = mat[2][2];
diag[0] = a;
subd[2] = 0.f;
if ( fabs(c) >= epsilon )
{
const double ell = sqrt(b*b+c*c);
b /= ell;
c /= ell;
const double q = 2*b*e+c*(f-d);
diag[1] = d+c*q;
diag[2] = f-c*q;
subd[0] = ell;
subd[1] = e-b*q;
mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
mat[1][0] = 0; mat[1][1] = b; mat[1][2] = c;
mat[2][0] = 0; mat[2][1] = c; mat[2][2] = -b;
}
else
{
diag[1] = d;
diag[2] = f;
subd[0] = b;
subd[1] = e;
mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
mat[1][0] = 0; mat[1][1] = 1; mat[1][2] = 0;
mat[2][0] = 0; mat[2][1] = 0; mat[2][2] = 1;
}
}
static bool EigenSolver_QLAlgorithm(double mat[3][3],double * diag,double * subd)
{
// QL iteration with implicit shifting to reduce matrix from tridiagonal
// to diagonal
const int maxiter = 32;
for (int ell = 0; ell < 3; ell++)
{
int iter;
for (iter = 0; iter < maxiter; iter++)
{
int m;
for (m = ell; m <= 1; m++)
{
double dd = fabs(diag[m]) + fabs(diag[m+1]);
if ( fabs(subd[m]) + dd == dd )
break;
}
if ( m == ell )
break;
double g = (diag[ell+1]-diag[ell])/(2*subd[ell]);
double r = sqrt(g*g+1);
if ( g < 0 )
g = diag[m]-diag[ell]+subd[ell]/(g-r);
else
g = diag[m]-diag[ell]+subd[ell]/(g+r);
double s = 1, c = 1, p = 0;
for (int i = m-1; i >= ell; i--)
{
double f = s*subd[i], b = c*subd[i];
if ( fabs(f) >= fabs(g) )
{
c = g/f;
r = sqrt(c*c+1);
subd[i+1] = f*r;
c *= (s = 1/r);
}
else
{
s = f/g;
r = sqrt(s*s+1);
subd[i+1] = g*r;
s *= (c = 1/r);
}
g = diag[i+1]-p;
r = (diag[i]-g)*s+2*b*c;
p = s*r;
diag[i+1] = g+p;
g = c*r-b;
for (int k = 0; k < 3; k++)
{
f = mat[k][i+1];
mat[k][i+1] = s*mat[k][i]+c*f;
mat[k][i] = c*mat[k][i]-s*f;
}
}
diag[ell] -= p;
subd[ell] = g;
subd[m] = 0;
}
if ( iter == maxiter )
// should not get here under normal circumstances
return false;
}
return true;
}
int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster)
{
// Compute principal component.

@ -23,6 +23,10 @@ namespace nv
Vector3 computePrincipalComponent(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
Plane bestPlane(int n, const Vector3 * points);
bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON);
bool eigenSolveSymmetric (float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]);
// Returns number of clusters [1-4].
int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster);

@ -488,16 +488,20 @@ nv::half_to_float( uint16 h )
}
// @@ This code appears to be wrong.
// @@ These tables could be smaller.
static uint32 mantissa_table[2048];
static uint32 exponent_table[64];
static uint32 offset_table[64];
namespace nv {
uint32 mantissa_table[2048];
uint32 exponent_table[64];
uint32 offset_table[64];
}
void nv::half_init_tables()
{
// Init mantissa table.
mantissa_table[0] = 0;
// denormals
for (int i = 1; i < 1024; i++) {
uint m = i << 13;
uint e = 0;
@ -511,8 +515,9 @@ void nv::half_init_tables()
mantissa_table[i] = m | e;
}
// normals
for (int i = 1024; i < 2048; i++) {
mantissa_table[i] = 0x38000000 + ((i - 1024) << 13);
mantissa_table[i] = (i - 1024) << 13;
}
@ -520,17 +525,17 @@ void nv::half_init_tables()
exponent_table[0] = 0;
for (int i = 1; i < 31; i++) {
exponent_table[i] = (i << 23);
exponent_table[i] = 0x38000000 + (i << 23);
}
exponent_table[31] = 0x47800000;
exponent_table[31] = 0x7f800000;
exponent_table[32] = 0x80000000;
for (int i = 33; i < 63; i++) {
exponent_table[i] = 0x80000000 + ((i - 32) << 23);
exponent_table[i] = 0xb8000000 + ((i - 32) << 23);
}
exponent_table[63] = 0xC7800000;
exponent_table[63] = 0xff800000;
// Init offset table.
@ -545,22 +550,11 @@ void nv::half_init_tables()
for (int i = 33; i < 64; i++) {
offset_table[i] = 1024;
}
/*for (int i = 0; i < 64; i++) {
offset_table[i] = ((i & 31) != 0) * 1024;
}*/
}
// Fast half to float conversion based on:
// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
uint32 nv::fast_half_to_float(uint16 h)
{
uint exp = h >> 10;
return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
}
#if 0
// Inaccurate conversion suggested at the ffmpeg mailing list:
// http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html
uint32 nv::fast_half_to_float(uint16 v)
@ -609,4 +603,94 @@ __asm
}
#endif
#if 0
// These version computes the tables at compile time:
// http://gamedev.stackexchange.com/questions/17326/conversion-of-a-number-from-single-precision-floating-point-representation-to-a
/* This method is faster than the OpenEXR implementation (very often
* used, eg. in Ogre), with the additional benefit of rounding, inspired
* by James Tursas half-precision code. */
static inline uint16_t float_to_half_branch(uint32_t x)
{
uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
/* If zero, or denormal, or exponent underflows too much for a denormal
* half, return signed zero. */
if (e < 103)
return bits;
/* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
if (e > 142)
{
bits |= 0x7c00u;
/* If exponent was 0xff and one mantissa bit was set, it means NaN,
* not Inf, so make sure we set one mantissa bit too. */
bits |= e == 255 && (x & 0x007fffffu);
return bits;
}
/* If exponent underflows but not too much, return a denormal */
if (e < 113)
{
m |= 0x0800u;
/* Extra rounding may overflow and set mantissa to 0 and exponent
* to 1, which is OK. */
bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
return bits;
}
bits |= ((e - 112) << 10) | (m >> 1);
/* Extra rounding. An overflow will set mantissa to 0 and increment
* the exponent, which is OK. */
bits += m & 1;
return bits;
}
/* These macros implement a finite iterator useful to build lookup
* tables. For instance, S64(0) will call S1(x) for all values of x
* between 0 and 63.
* Due to the exponential behaviour of the calls, the stress on the
* compiler may be important. */
#define S4(x) S1((x)), S1((x)+1), S1((x)+2), S1((x)+3)
#define S16(x) S4((x)), S4((x)+4), S4((x)+8), S4((x)+12)
#define S64(x) S16((x)), S16((x)+16), S16((x)+32), S16((x)+48)
#define S256(x) S64((x)), S64((x)+64), S64((x)+128), S64((x)+192)
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
/* Lookup table-based algorithm from “Fast Half Float Conversions”
* by Jeroen van der Zijp, November 2008. No rounding is performed,
* and some NaN values may be incorrectly converted to Inf. */
static inline uint16_t float_to_half_nobranch(uint32_t x)
{
static uint16_t const basetable[512] =
{
#define S1(i) (((i) < 103) ? 0x0000 : \
((i) < 113) ? 0x0400 >> (113 - (i)) : \
((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
S256(0),
#undef S1
#define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
((i) < 113) ? 0x0400 >> (113 - (i)) : \
((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
S256(0),
#undef S1
};
static uint8_t const shifttable[512] =
{
#define S1(i) (((i) < 103) ? 24 : \
((i) < 113) ? 126 - (i) : \
((i) < 143 || (i) == 255) ? 13 : 24)
S256(0), S256(0),
#undef S1
};
uint16_t bits = basetable[(x >> 23) & 0x1ff];
bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
return bits;
}
#endif

@ -11,7 +11,18 @@ namespace nv {
void half_init_tables();
uint32 fast_half_to_float(uint16 h);
extern uint32 mantissa_table[2048];
extern uint32 exponent_table[64];
extern uint32 offset_table[64];
// Fast half to float conversion based on:
// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
inline uint32 fast_half_to_float(uint16 h)
{
uint exp = h >> 10;
return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
}
inline uint16 to_half(float c) {
union { float f; uint32 u; } f;

@ -33,8 +33,15 @@
#include <emmintrin.h>
#endif
// See this for ideas:
// http://molecularmusings.wordpress.com/2011/10/18/simdifying-multi-platform-math/
namespace nv {
#define NV_SIMD_NATIVE NV_FORCEINLINE
#define NV_SIMD_INLINE inline
class SimdVector
{
public:
@ -42,45 +49,47 @@ namespace nv {
typedef SimdVector const& Arg;
NV_FORCEINLINE SimdVector() {}
NV_FORCEINLINE explicit SimdVector(float f) : vec(_mm_set1_ps(f)) {}
NV_FORCEINLINE explicit SimdVector(__m128 v) : vec(v) {}
NV_FORCEINLINE explicit SimdVector(NV_ALIGN_16 Vector4 v) {
vec = _mm_load_ps( v.component );
NV_SIMD_NATIVE SimdVector() {}
NV_SIMD_NATIVE explicit SimdVector(__m128 v) : vec(v) {}
NV_SIMD_NATIVE explicit SimdVector(float f) {
vec = _mm_set1_ps(f);
}
NV_FORCEINLINE explicit SimdVector(const float * v) {
NV_SIMD_NATIVE explicit SimdVector(const float * v)
{
vec = _mm_load_ps( v );
}
NV_FORCEINLINE SimdVector(float x, float y, float z, float w) {
NV_SIMD_NATIVE SimdVector(float x, float y, float z, float w)
{
vec = _mm_setr_ps( x, y, z, w );
}
NV_FORCEINLINE SimdVector(const SimdVector & arg) : vec(arg.vec) {}
NV_SIMD_NATIVE SimdVector(const SimdVector & arg) : vec(arg.vec) {}
NV_FORCEINLINE SimdVector & operator=(const SimdVector & arg) {
NV_SIMD_NATIVE SimdVector & operator=(const SimdVector & arg)
{
vec = arg.vec;
return *this;
}
float toFloat() const
NV_SIMD_INLINE float toFloat() const
{
NV_ALIGN_16 float f;
_mm_store_ss(&f, vec);
return f;
}
Vector3 toVector3() const
NV_SIMD_INLINE Vector3 toVector3() const
{
NV_ALIGN_16 float c[4];
_mm_store_ps( c, vec );
return Vector3( c[0], c[1], c[2] );
}
Vector4 toVector4() const
NV_SIMD_INLINE Vector4 toVector4() const
{
NV_ALIGN_16 float c[4];
_mm_store_ps( c, vec );
@ -88,57 +97,60 @@ namespace nv {
}
#define SSE_SPLAT( a ) ((a) | ((a) << 2) | ((a) << 4) | ((a) << 6))
NV_FORCEINLINE SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); }
NV_FORCEINLINE SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); }
NV_FORCEINLINE SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); }
NV_FORCEINLINE SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); }
NV_SIMD_NATIVE SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); }
NV_SIMD_NATIVE SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); }
NV_SIMD_NATIVE SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); }
NV_SIMD_NATIVE SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); }
#undef SSE_SPLAT
NV_FORCEINLINE SimdVector & operator+=( Arg v ) {
NV_SIMD_NATIVE SimdVector& operator+=( Arg v )
{
vec = _mm_add_ps( vec, v.vec );
return *this;
}
NV_FORCEINLINE SimdVector & operator-=( Arg v ) {
NV_SIMD_NATIVE SimdVector& operator-=( Arg v )
{
vec = _mm_sub_ps( vec, v.vec );
return *this;
}
NV_FORCEINLINE SimdVector & operator*=( Arg v ) {
NV_SIMD_NATIVE SimdVector& operator*=( Arg v )
{
vec = _mm_mul_ps( vec, v.vec );
return *this;
}
};
NV_FORCEINLINE SimdVector operator+(SimdVector::Arg left, SimdVector::Arg right)
NV_SIMD_NATIVE SimdVector operator+( SimdVector::Arg left, SimdVector::Arg right )
{
return SimdVector( _mm_add_ps( left.vec, right.vec ) );
}
NV_FORCEINLINE SimdVector operator-(SimdVector::Arg left, SimdVector::Arg right)
NV_SIMD_NATIVE SimdVector operator-( SimdVector::Arg left, SimdVector::Arg right )
{
return SimdVector( _mm_sub_ps( left.vec, right.vec ) );
}
NV_FORCEINLINE SimdVector operator*(SimdVector::Arg left, SimdVector::Arg right)
NV_SIMD_NATIVE SimdVector operator*( SimdVector::Arg left, SimdVector::Arg right )
{
return SimdVector( _mm_mul_ps( left.vec, right.vec ) );
}
// Returns a*b + c
NV_FORCEINLINE SimdVector multiplyAdd(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c)
NV_SIMD_INLINE SimdVector multiplyAdd( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
{
return SimdVector( _mm_add_ps( _mm_mul_ps( a.vec, b.vec ), c.vec ) );
}
// Returns -( a*b - c ) = c - a*b
NV_FORCEINLINE SimdVector negativeMultiplySubtract(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c)
// Returns -( a*b - c )
NV_SIMD_INLINE SimdVector negativeMultiplySubtract( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
{
return SimdVector( _mm_sub_ps( c.vec, _mm_mul_ps( a.vec, b.vec ) ) );
}
inline SimdVector reciprocal( SimdVector::Arg v )
NV_SIMD_INLINE SimdVector reciprocal( SimdVector::Arg v )
{
// get the reciprocal estimate
__m128 estimate = _mm_rcp_ps( v.vec );
@ -148,17 +160,17 @@ namespace nv {
return SimdVector( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) );
}
NV_FORCEINLINE SimdVector min(SimdVector::Arg left, SimdVector::Arg right)
NV_SIMD_NATIVE SimdVector min( SimdVector::Arg left, SimdVector::Arg right )
{
return SimdVector( _mm_min_ps( left.vec, right.vec ) );
}
NV_FORCEINLINE SimdVector max(SimdVector::Arg left, SimdVector::Arg right)
NV_SIMD_NATIVE SimdVector max( SimdVector::Arg left, SimdVector::Arg right )
{
return SimdVector( _mm_max_ps( left.vec, right.vec ) );
}
inline SimdVector truncate( SimdVector::Arg v )
NV_SIMD_INLINE SimdVector truncate( SimdVector::Arg v )
{
#if (NV_USE_SSE == 1)
// convert to ints
@ -179,12 +191,12 @@ namespace nv {
#endif
}
NV_FORCEINLINE SimdVector compareEqual(SimdVector::Arg left, SimdVector::Arg right)
NV_SIMD_NATIVE SimdVector compareEqual( SimdVector::Arg left, SimdVector::Arg right )
{
return SimdVector( _mm_cmpeq_ps( left.vec, right.vec ) );
}
inline SimdVector select(SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits)
NV_SIMD_INLINE SimdVector select( SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits )
{
__m128 a = _mm_andnot_ps( bits.vec, off.vec );
__m128 b = _mm_and_ps( bits.vec, on.vec );
@ -192,7 +204,7 @@ namespace nv {
return SimdVector( _mm_or_ps( a, b ) );
}
inline bool compareAnyLessThan(SimdVector::Arg left, SimdVector::Arg right)
NV_SIMD_INLINE bool compareAnyLessThan( SimdVector::Arg left, SimdVector::Arg right )
{
__m128 bits = _mm_cmplt_ps( left.vec, right.vec );
int value = _mm_movemask_ps( bits );

@ -314,6 +314,12 @@ namespace nv
return scale(v, 1.0f/s);
}
inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t)
{
const float s = 1.0f - t;
return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y);
}
inline float dot(Vector2::Arg a, Vector2::Arg b)
{
return a.x * b.x + a.y * b.y;
@ -381,6 +387,16 @@ namespace nv
return Vector2(max(a.x, b.x), max(a.y, b.y));
}
inline Vector2 clamp(Vector2::Arg v, float min, float max)
{
return Vector2(clamp(v.x, min, max), clamp(v.y, min, max));
}
inline Vector2 saturate(Vector2::Arg v)
{
return Vector2(saturate(v.x), saturate(v.y));
}
inline bool isFinite(Vector2::Arg v)
{
return isFinite(v.x) && isFinite(v.y);
@ -394,6 +410,7 @@ namespace nv
return vf;
}
// Note, this is the area scaled by 2!
inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
{
Vector2 v0 = a - c;
@ -500,6 +517,16 @@ namespace nv
return sqrtf(lengthSquared(v));
}
inline float distance(Vector3::Arg a, Vector3::Arg b)
{
return length(a - b);
}
inline float distanceSquared(Vector3::Arg a, Vector3::Arg b)
{
return lengthSquared(a - b);
}
inline float inverseLength(Vector3::Arg v)
{
return 1.0f / sqrtf(lengthSquared(v));
@ -557,6 +584,11 @@ namespace nv
return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max));
}
inline Vector3 saturate(Vector3::Arg v)
{
return Vector3(saturate(v.x), saturate(v.y), saturate(v.z));
}
inline Vector3 floor(Vector3::Arg v)
{
return Vector3(floorf(v.x), floorf(v.y), floorf(v.z));
@ -580,6 +612,11 @@ namespace nv
return vf;
}
inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n)
{
return v - (2 * dot(v, n)) * n;
}
// Vector4
@ -627,9 +664,15 @@ namespace nv
return scale(v, 1.0f/s);
}
inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
/*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
{
return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s);
}*/
inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t)
{
const float s = 1.0f - t;
return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w);
}
inline float dot(Vector4::Arg a, Vector4::Arg b)
@ -699,6 +742,16 @@ namespace nv
return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
}
inline Vector4 clamp(Vector4::Arg v, float min, float max)
{
return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max));
}
inline Vector4 saturate(Vector4::Arg v)
{
return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w));
}
inline bool isFinite(Vector4::Arg v)
{
return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w);

@ -155,15 +155,14 @@ namespace nv
}
#if NV_CC_MSVC
inline float log2f(float x)
NV_FORCEINLINE float log2f(float x)
{
nvCheck(x >= 0);
return logf(x) / logf(2.0f);
}
inline float exp2f(float x)
NV_FORCEINLINE float exp2f(float x)
{
return powf(2, x);
return powf(2.0f, x);
}
#endif

Loading…
Cancel
Save