Merge changes from The Witness.

pull/216/head
castano 12 years ago
parent 06c170b41b
commit fa468b04ab

@ -29,3 +29,38 @@ float nv::distanceSquared(const Box &box, const Vector3 &point) {
/*bool nv::overlap(const Box &box, const Sphere &sphere) { /*bool nv::overlap(const Box &box, const Sphere &sphere) {
return distanceSquared(box, sphere.center) < sphere.radius * sphere.radius; return distanceSquared(box, sphere.center) < sphere.radius * sphere.radius;
}*/ }*/
bool nv::intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t /*= NULL*/) {
// Precompute these in ray structure?
int sdx = (id.x < 0);
int sdy = (id.y < 0);
int sdz = (id.z < 0);
float tmin = (box.corner( sdx).x - p.x) * id.x;
float tmax = (box.corner(1-sdx).x - p.x) * id.x;
float tymin = (box.corner( sdy).y - p.y) * id.y;
float tymax = (box.corner(1-sdy).y - p.y) * id.y;
if ((tmin > tymax) || (tymin > tmax))
return false;
if (tymin > tmin) tmin = tymin;
if (tymax < tmax) tmax = tymax;
float tzmin = (box.corner( sdz).z - p.z) * id.z;
float tzmax = (box.corner(1-sdz).z - p.z) * id.z;
if ((tmin > tzmax) || (tzmin > tmax))
return false;
if (tzmin > tmin) tmin = tzmin;
if (tzmax < tmax) tmax = tzmax;
if (tmax < 0)
return false;
if (t != NULL) *t = tmin;
return true;
}

@ -74,6 +74,8 @@ namespace nv
friend Stream & operator<< (Stream & s, Box & box); friend Stream & operator<< (Stream & s, Box & box);
const Vector3 & corner(int i) const { return (&minCorner)[i]; }
Vector3 minCorner; Vector3 minCorner;
Vector3 maxCorner; Vector3 maxCorner;
}; };
@ -81,6 +83,8 @@ namespace nv
float distanceSquared(const Box &box, const Vector3 &point); float distanceSquared(const Box &box, const Vector3 &point);
bool overlap(const Box &box, const Sphere &sphere); bool overlap(const Box &box, const Sphere &sphere);
// p is ray origin, id is inverse ray direction.
bool intersect(const Box & box, const Vector3 & p, const Vector3 & id, float * t);
} // nv namespace } // nv namespace

@ -12,51 +12,51 @@
namespace nv namespace nv
{ {
// Default ctor. // Default ctor.
Box::Box() { }; inline Box::Box() { };
// Copy ctor. // Copy ctor.
Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { } inline Box::Box(const Box & b) : minCorner(b.minCorner), maxCorner(b.maxCorner) { }
// Init ctor. // Init ctor.
Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { } inline Box::Box(const Vector3 & mins, const Vector3 & maxs) : minCorner(mins), maxCorner(maxs) { }
// Assignment operator. // Assignment operator.
Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; } inline Box & Box::operator=(const Box & b) { minCorner = b.minCorner; maxCorner = b.maxCorner; return *this; }
// Clear the bounds. // Clear the bounds.
void Box::clearBounds() inline void Box::clearBounds()
{ {
minCorner.set(FLT_MAX, FLT_MAX, FLT_MAX); minCorner.set(FLT_MAX, FLT_MAX, FLT_MAX);
maxCorner.set(-FLT_MAX, -FLT_MAX, -FLT_MAX); maxCorner.set(-FLT_MAX, -FLT_MAX, -FLT_MAX);
} }
// Build a cube centered on center and with edge = 2*dist // Build a cube centered on center and with edge = 2*dist
void Box::cube(const Vector3 & center, float dist) inline void Box::cube(const Vector3 & center, float dist)
{ {
setCenterExtents(center, Vector3(dist, dist, dist)); setCenterExtents(center, Vector3(dist, dist, dist));
} }
// Build a box, given center and extents. // Build a box, given center and extents.
void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents) inline void Box::setCenterExtents(const Vector3 & center, const Vector3 & extents)
{ {
minCorner = center - extents; minCorner = center - extents;
maxCorner = center + extents; maxCorner = center + extents;
} }
// Get box center. // Get box center.
Vector3 Box::center() const inline Vector3 Box::center() const
{ {
return (minCorner + maxCorner) * 0.5f; return (minCorner + maxCorner) * 0.5f;
} }
// Return extents of the box. // Return extents of the box.
Vector3 Box::extents() const inline Vector3 Box::extents() const
{ {
return (maxCorner - minCorner) * 0.5f; return (maxCorner - minCorner) * 0.5f;
} }
// Return extents of the box. // Return extents of the box.
float Box::extents(uint axis) const inline float Box::extents(uint axis) const
{ {
nvDebugCheck(axis < 3); nvDebugCheck(axis < 3);
if (axis == 0) return (maxCorner.x - minCorner.x) * 0.5f; if (axis == 0) return (maxCorner.x - minCorner.x) * 0.5f;
@ -67,55 +67,55 @@ namespace nv
} }
// Add a point to this box. // Add a point to this box.
void Box::addPointToBounds(const Vector3 & p) inline void Box::addPointToBounds(const Vector3 & p)
{ {
minCorner = min(minCorner, p); minCorner = min(minCorner, p);
maxCorner = max(maxCorner, p); maxCorner = max(maxCorner, p);
} }
// Add a box to this box. // Add a box to this box.
void Box::addBoxToBounds(const Box & b) inline void Box::addBoxToBounds(const Box & b)
{ {
minCorner = min(minCorner, b.minCorner); minCorner = min(minCorner, b.minCorner);
maxCorner = max(maxCorner, b.maxCorner); maxCorner = max(maxCorner, b.maxCorner);
} }
// Translate box. // Translate box.
void Box::translate(const Vector3 & v) inline void Box::translate(const Vector3 & v)
{ {
minCorner += v; minCorner += v;
maxCorner += v; maxCorner += v;
} }
// Scale the box. // Scale the box.
void Box::scale(float s) inline void Box::scale(float s)
{ {
minCorner *= s; minCorner *= s;
maxCorner *= s; maxCorner *= s;
} }
// Expand the box by a fixed amount. // Expand the box by a fixed amount.
void Box::expand(float r) { inline void Box::expand(float r) {
minCorner -= Vector3(r,r,r); minCorner -= Vector3(r,r,r);
maxCorner += Vector3(r,r,r); maxCorner += Vector3(r,r,r);
} }
// Get the area of the box. // Get the area of the box.
float Box::area() const inline float Box::area() const
{ {
const Vector3 d = extents(); const Vector3 d = extents();
return 8.0f * (d.x*d.y + d.x*d.z + d.y*d.z); return 8.0f * (d.x*d.y + d.x*d.z + d.y*d.z);
} }
// Get the volume of the box. // Get the volume of the box.
float Box::volume() const inline float Box::volume() const
{ {
Vector3 d = extents(); Vector3 d = extents();
return 8.0f * (d.x * d.y * d.z); return 8.0f * (d.x * d.y * d.z);
} }
// Return true if the box contains the given point. // Return true if the box contains the given point.
bool Box::contains(const Vector3 & p) const inline bool Box::contains(const Vector3 & p) const
{ {
return return
minCorner.x < p.x && minCorner.y < p.y && minCorner.z < p.z && minCorner.x < p.x && minCorner.y < p.y && minCorner.z < p.z &&
@ -123,7 +123,7 @@ namespace nv
} }
// Split the given box in 8 octants and assign the ith one to this box. // Split the given box in 8 octants and assign the ith one to this box.
void Box::setOctant(const Box & box, const Vector3 & center, int i) inline void Box::setOctant(const Box & box, const Vector3 & center, int i)
{ {
minCorner = box.minCorner; minCorner = box.minCorner;
maxCorner = box.maxCorner; maxCorner = box.maxCorner;

@ -159,24 +159,228 @@ Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points)
float matrix[6]; float matrix[6];
Vector3 centroid = computeCovariance(n, points, matrix); Vector3 centroid = computeCovariance(n, points, matrix);
if (matrix[0] == 0 || matrix[3] == 0 || matrix[5] == 0) if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
{ {
// If no plane defined, then return a horizontal plane. // If no plane defined, then return a horizontal plane.
return Plane(Vector3(0, 0, 1), centroid); return Plane(Vector3(0, 0, 1), centroid);
} }
#pragma NV_MESSAGE("TODO: need to write an eigensolver!") float eigenValues[3];
Vector3 eigenVectors[3];
if (!eigenSolveSymmetric(matrix, eigenValues, eigenVectors)) {
// If no plane defined, then return a horizontal plane.
return Plane(Vector3(0, 0, 1), centroid);
}
// - Numerical Recipes in C is a good reference. Householder transforms followed by QL decomposition seems to be the best approach. return Plane(eigenVectors[2], centroid);
// - The one from magic-tools is now LGPL. For the 3D case it uses a cubic root solver, which is not very accurate. }
// - Charles' Galaxy3 contains an implementation of the tridiagonalization method, but is under BPL.
bool nv::Fit::isPlanar(int n, const Vector3 * points, float epsilon/*=NV_EPSILON*/)
{
// compute the centroid and covariance
float matrix[6];
Vector3 centroid = computeCovariance(n, points, matrix);
//EigenSolver3 solver(matrix); float eigenValues[3];
Vector3 eigenVectors[3];
if (!eigenSolveSymmetric(matrix, eigenValues, eigenVectors)) {
return false;
}
return Plane(); return eigenValues[2] < epsilon;
} }
// Tridiagonal solver from Charles Bloom.
// Householder transforms followed by QL decomposition.
// Seems to be based on the code from Numerical Recipes in C.
static void EigenSolver_Tridiagonal(double mat[3][3],double * diag,double * subd);
static bool EigenSolver_QLAlgorithm(double mat[3][3],double * diag,double * subd);
bool nv::Fit::eigenSolveSymmetric(float matrix[6], float eigenValues[3], Vector3 eigenVectors[3])
{
nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL);
double subd[3];
double diag[3];
double work[3][3];
work[0][0] = matrix[0];
work[0][1] = work[1][0] = matrix[1];
work[0][2] = work[2][0] = matrix[2];
work[1][1] = matrix[3];
work[1][2] = work[2][1] = matrix[4];
work[2][2] = matrix[5];
EigenSolver_Tridiagonal(work, diag, subd);
if (!EigenSolver_QLAlgorithm(work, diag, subd))
{
for (int i = 0; i < 3; i++) {
eigenValues[i] = 0;
eigenVectors[i] = Vector3(0);
}
return false;
}
for (int i = 0; i < 3; i++) {
eigenValues[i] = (float)diag[i];
}
// eigenvectors are the columns; make them the rows :
for (int i=0; i < 3; i++)
{
for (int j = 0; j < 3; j++)
{
eigenVectors[j].component[i] = (float) work[i][j];
}
}
// shuffle to sort by singular value :
if (eigenValues[2] > eigenValues[0] && eigenValues[2] > eigenValues[1])
{
swap(eigenValues[0], eigenValues[2]);
swap(eigenVectors[0], eigenVectors[2]);
}
if (eigenValues[1] > eigenValues[0])
{
swap(eigenValues[0], eigenValues[1]);
swap(eigenVectors[0], eigenVectors[1]);
}
if (eigenValues[2] > eigenValues[1])
{
swap(eigenValues[1], eigenValues[2]);
swap(eigenVectors[1], eigenVectors[2]);
}
nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2]);
nvDebugCheck(eigenValues[1] >= eigenValues[2]);
return true;
}
static void EigenSolver_Tridiagonal(double mat[3][3],double * diag,double * subd)
{
// Householder reduction T = Q^t M Q
// Input:
// mat, symmetric 3x3 matrix M
// Output:
// mat, orthogonal matrix Q
// diag, diagonal entries of T
// subd, subdiagonal entries of T (T is symmetric)
const double epsilon = 1e-08f;
double a = mat[0][0];
double b = mat[0][1];
double c = mat[0][2];
double d = mat[1][1];
double e = mat[1][2];
double f = mat[2][2];
diag[0] = a;
subd[2] = 0.f;
if ( fabs(c) >= epsilon )
{
const double ell = sqrt(b*b+c*c);
b /= ell;
c /= ell;
const double q = 2*b*e+c*(f-d);
diag[1] = d+c*q;
diag[2] = f-c*q;
subd[0] = ell;
subd[1] = e-b*q;
mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
mat[1][0] = 0; mat[1][1] = b; mat[1][2] = c;
mat[2][0] = 0; mat[2][1] = c; mat[2][2] = -b;
}
else
{
diag[1] = d;
diag[2] = f;
subd[0] = b;
subd[1] = e;
mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
mat[1][0] = 0; mat[1][1] = 1; mat[1][2] = 0;
mat[2][0] = 0; mat[2][1] = 0; mat[2][2] = 1;
}
}
static bool EigenSolver_QLAlgorithm(double mat[3][3],double * diag,double * subd)
{
// QL iteration with implicit shifting to reduce matrix from tridiagonal
// to diagonal
const int maxiter = 32;
for (int ell = 0; ell < 3; ell++)
{
int iter;
for (iter = 0; iter < maxiter; iter++)
{
int m;
for (m = ell; m <= 1; m++)
{
double dd = fabs(diag[m]) + fabs(diag[m+1]);
if ( fabs(subd[m]) + dd == dd )
break;
}
if ( m == ell )
break;
double g = (diag[ell+1]-diag[ell])/(2*subd[ell]);
double r = sqrt(g*g+1);
if ( g < 0 )
g = diag[m]-diag[ell]+subd[ell]/(g-r);
else
g = diag[m]-diag[ell]+subd[ell]/(g+r);
double s = 1, c = 1, p = 0;
for (int i = m-1; i >= ell; i--)
{
double f = s*subd[i], b = c*subd[i];
if ( fabs(f) >= fabs(g) )
{
c = g/f;
r = sqrt(c*c+1);
subd[i+1] = f*r;
c *= (s = 1/r);
}
else
{
s = f/g;
r = sqrt(s*s+1);
subd[i+1] = g*r;
s *= (c = 1/r);
}
g = diag[i+1]-p;
r = (diag[i]-g)*s+2*b*c;
p = s*r;
diag[i+1] = g+p;
g = c*r-b;
for (int k = 0; k < 3; k++)
{
f = mat[k][i+1];
mat[k][i+1] = s*mat[k][i]+c*f;
mat[k][i] = c*mat[k][i]-s*f;
}
}
diag[ell] -= p;
subd[ell] = g;
subd[m] = 0;
}
if ( iter == maxiter )
// should not get here under normal circumstances
return false;
}
return true;
}
int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster) int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster)
{ {
// Compute principal component. // Compute principal component.

@ -23,6 +23,10 @@ namespace nv
Vector3 computePrincipalComponent(int n, const Vector3 * points, const float * weights, const Vector3 & metric); Vector3 computePrincipalComponent(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
Plane bestPlane(int n, const Vector3 * points); Plane bestPlane(int n, const Vector3 * points);
bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON);
bool eigenSolveSymmetric (float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]);
// Returns number of clusters [1-4]. // Returns number of clusters [1-4].
int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster); int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster);

@ -488,16 +488,20 @@ nv::half_to_float( uint16 h )
} }
// @@ This code appears to be wrong.
// @@ These tables could be smaller. // @@ These tables could be smaller.
static uint32 mantissa_table[2048]; namespace nv {
static uint32 exponent_table[64]; uint32 mantissa_table[2048];
static uint32 offset_table[64]; uint32 exponent_table[64];
uint32 offset_table[64];
}
void nv::half_init_tables() void nv::half_init_tables()
{ {
// Init mantissa table. // Init mantissa table.
mantissa_table[0] = 0; mantissa_table[0] = 0;
// denormals
for (int i = 1; i < 1024; i++) { for (int i = 1; i < 1024; i++) {
uint m = i << 13; uint m = i << 13;
uint e = 0; uint e = 0;
@ -511,8 +515,9 @@ void nv::half_init_tables()
mantissa_table[i] = m | e; mantissa_table[i] = m | e;
} }
// normals
for (int i = 1024; i < 2048; i++) { for (int i = 1024; i < 2048; i++) {
mantissa_table[i] = 0x38000000 + ((i - 1024) << 13); mantissa_table[i] = (i - 1024) << 13;
} }
@ -520,17 +525,17 @@ void nv::half_init_tables()
exponent_table[0] = 0; exponent_table[0] = 0;
for (int i = 1; i < 31; i++) { for (int i = 1; i < 31; i++) {
exponent_table[i] = (i << 23); exponent_table[i] = 0x38000000 + (i << 23);
} }
exponent_table[31] = 0x47800000; exponent_table[31] = 0x7f800000;
exponent_table[32] = 0x80000000; exponent_table[32] = 0x80000000;
for (int i = 33; i < 63; i++) { for (int i = 33; i < 63; i++) {
exponent_table[i] = 0x80000000 + ((i - 32) << 23); exponent_table[i] = 0xb8000000 + ((i - 32) << 23);
} }
exponent_table[63] = 0xC7800000; exponent_table[63] = 0xff800000;
// Init offset table. // Init offset table.
@ -545,22 +550,11 @@ void nv::half_init_tables()
for (int i = 33; i < 64; i++) { for (int i = 33; i < 64; i++) {
offset_table[i] = 1024; offset_table[i] = 1024;
} }
/*for (int i = 0; i < 64; i++) {
offset_table[i] = ((i & 31) != 0) * 1024;
}*/
}
// Fast half to float conversion based on:
// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
uint32 nv::fast_half_to_float(uint16 h)
{
uint exp = h >> 10;
return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
} }
#if 0 #if 0
// Inaccurate conversion suggested at the ffmpeg mailing list: // Inaccurate conversion suggested at the ffmpeg mailing list:
// http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html // http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/2009-July/068949.html
uint32 nv::fast_half_to_float(uint16 v) uint32 nv::fast_half_to_float(uint16 v)
@ -609,4 +603,94 @@ __asm
} }
#endif
#if 0
// These version computes the tables at compile time:
// http://gamedev.stackexchange.com/questions/17326/conversion-of-a-number-from-single-precision-floating-point-representation-to-a
/* This method is faster than the OpenEXR implementation (very often
* used, eg. in Ogre), with the additional benefit of rounding, inspired
* by James Tursas half-precision code. */
static inline uint16_t float_to_half_branch(uint32_t x)
{
uint16_t bits = (x >> 16) & 0x8000; /* Get the sign */
uint16_t m = (x >> 12) & 0x07ff; /* Keep one extra bit for rounding */
unsigned int e = (x >> 23) & 0xff; /* Using int is faster here */
/* If zero, or denormal, or exponent underflows too much for a denormal
* half, return signed zero. */
if (e < 103)
return bits;
/* If NaN, return NaN. If Inf or exponent overflow, return Inf. */
if (e > 142)
{
bits |= 0x7c00u;
/* If exponent was 0xff and one mantissa bit was set, it means NaN,
* not Inf, so make sure we set one mantissa bit too. */
bits |= e == 255 && (x & 0x007fffffu);
return bits;
}
/* If exponent underflows but not too much, return a denormal */
if (e < 113)
{
m |= 0x0800u;
/* Extra rounding may overflow and set mantissa to 0 and exponent
* to 1, which is OK. */
bits |= (m >> (114 - e)) + ((m >> (113 - e)) & 1);
return bits;
}
bits |= ((e - 112) << 10) | (m >> 1);
/* Extra rounding. An overflow will set mantissa to 0 and increment
* the exponent, which is OK. */
bits += m & 1;
return bits;
}
/* These macros implement a finite iterator useful to build lookup
* tables. For instance, S64(0) will call S1(x) for all values of x
* between 0 and 63.
* Due to the exponential behaviour of the calls, the stress on the
* compiler may be important. */
#define S4(x) S1((x)), S1((x)+1), S1((x)+2), S1((x)+3)
#define S16(x) S4((x)), S4((x)+4), S4((x)+8), S4((x)+12)
#define S64(x) S16((x)), S16((x)+16), S16((x)+32), S16((x)+48)
#define S256(x) S64((x)), S64((x)+64), S64((x)+128), S64((x)+192)
#define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)
/* Lookup table-based algorithm from “Fast Half Float Conversions”
* by Jeroen van der Zijp, November 2008. No rounding is performed,
* and some NaN values may be incorrectly converted to Inf. */
static inline uint16_t float_to_half_nobranch(uint32_t x)
{
static uint16_t const basetable[512] =
{
#define S1(i) (((i) < 103) ? 0x0000 : \
((i) < 113) ? 0x0400 >> (113 - (i)) : \
((i) < 143) ? ((i) - 112) << 10 : 0x7c00)
S256(0),
#undef S1
#define S1(i) (0x8000 | (((i) < 103) ? 0x0000 : \
((i) < 113) ? 0x0400 >> (113 - (i)) : \
((i) < 143) ? ((i) - 112) << 10 : 0x7c00))
S256(0),
#undef S1
};
static uint8_t const shifttable[512] =
{
#define S1(i) (((i) < 103) ? 24 : \
((i) < 113) ? 126 - (i) : \
((i) < 143 || (i) == 255) ? 13 : 24)
S256(0), S256(0),
#undef S1
};
uint16_t bits = basetable[(x >> 23) & 0x1ff];
bits |= (x & 0x007fffff) >> shifttable[(x >> 23) & 0x1ff];
return bits;
}
#endif #endif

@ -11,7 +11,18 @@ namespace nv {
void half_init_tables(); void half_init_tables();
uint32 fast_half_to_float(uint16 h); extern uint32 mantissa_table[2048];
extern uint32 exponent_table[64];
extern uint32 offset_table[64];
// Fast half to float conversion based on:
// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
inline uint32 fast_half_to_float(uint16 h)
{
uint exp = h >> 10;
return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
}
inline uint16 to_half(float c) { inline uint16 to_half(float c) {
union { float f; uint32 u; } f; union { float f; uint32 u; } f;

@ -33,8 +33,15 @@
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
// See this for ideas:
// http://molecularmusings.wordpress.com/2011/10/18/simdifying-multi-platform-math/
namespace nv { namespace nv {
#define NV_SIMD_NATIVE NV_FORCEINLINE
#define NV_SIMD_INLINE inline
class SimdVector class SimdVector
{ {
public: public:
@ -42,45 +49,47 @@ namespace nv {
typedef SimdVector const& Arg; typedef SimdVector const& Arg;
NV_FORCEINLINE SimdVector() {} NV_SIMD_NATIVE SimdVector() {}
NV_FORCEINLINE explicit SimdVector(float f) : vec(_mm_set1_ps(f)) {}
NV_FORCEINLINE explicit SimdVector(__m128 v) : vec(v) {} NV_SIMD_NATIVE explicit SimdVector(__m128 v) : vec(v) {}
NV_FORCEINLINE explicit SimdVector(NV_ALIGN_16 Vector4 v) { NV_SIMD_NATIVE explicit SimdVector(float f) {
vec = _mm_load_ps( v.component ); vec = _mm_set1_ps(f);
} }
NV_FORCEINLINE explicit SimdVector(const float * v) { NV_SIMD_NATIVE explicit SimdVector(const float * v)
{
vec = _mm_load_ps( v ); vec = _mm_load_ps( v );
} }
NV_FORCEINLINE SimdVector(float x, float y, float z, float w) { NV_SIMD_NATIVE SimdVector(float x, float y, float z, float w)
{
vec = _mm_setr_ps( x, y, z, w ); vec = _mm_setr_ps( x, y, z, w );
} }
NV_FORCEINLINE SimdVector(const SimdVector & arg) : vec(arg.vec) {} NV_SIMD_NATIVE SimdVector(const SimdVector & arg) : vec(arg.vec) {}
NV_FORCEINLINE SimdVector & operator=(const SimdVector & arg) { NV_SIMD_NATIVE SimdVector & operator=(const SimdVector & arg)
{
vec = arg.vec; vec = arg.vec;
return *this; return *this;
} }
NV_SIMD_INLINE float toFloat() const
float toFloat() const
{ {
NV_ALIGN_16 float f; NV_ALIGN_16 float f;
_mm_store_ss(&f, vec); _mm_store_ss(&f, vec);
return f; return f;
} }
Vector3 toVector3() const NV_SIMD_INLINE Vector3 toVector3() const
{ {
NV_ALIGN_16 float c[4]; NV_ALIGN_16 float c[4];
_mm_store_ps( c, vec ); _mm_store_ps( c, vec );
return Vector3( c[0], c[1], c[2] ); return Vector3( c[0], c[1], c[2] );
} }
Vector4 toVector4() const NV_SIMD_INLINE Vector4 toVector4() const
{ {
NV_ALIGN_16 float c[4]; NV_ALIGN_16 float c[4];
_mm_store_ps( c, vec ); _mm_store_ps( c, vec );
@ -88,57 +97,60 @@ namespace nv {
} }
#define SSE_SPLAT( a ) ((a) | ((a) << 2) | ((a) << 4) | ((a) << 6)) #define SSE_SPLAT( a ) ((a) | ((a) << 2) | ((a) << 4) | ((a) << 6))
NV_FORCEINLINE SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); } NV_SIMD_NATIVE SimdVector splatX() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 0 ) ) ); }
NV_FORCEINLINE SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); } NV_SIMD_NATIVE SimdVector splatY() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 1 ) ) ); }
NV_FORCEINLINE SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); } NV_SIMD_NATIVE SimdVector splatZ() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 2 ) ) ); }
NV_FORCEINLINE SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); } NV_SIMD_NATIVE SimdVector splatW() const { return SimdVector( _mm_shuffle_ps( vec, vec, SSE_SPLAT( 3 ) ) ); }
#undef SSE_SPLAT #undef SSE_SPLAT
NV_FORCEINLINE SimdVector & operator+=( Arg v ) { NV_SIMD_NATIVE SimdVector& operator+=( Arg v )
{
vec = _mm_add_ps( vec, v.vec ); vec = _mm_add_ps( vec, v.vec );
return *this; return *this;
} }
NV_FORCEINLINE SimdVector & operator-=( Arg v ) { NV_SIMD_NATIVE SimdVector& operator-=( Arg v )
{
vec = _mm_sub_ps( vec, v.vec ); vec = _mm_sub_ps( vec, v.vec );
return *this; return *this;
} }
NV_FORCEINLINE SimdVector & operator*=( Arg v ) { NV_SIMD_NATIVE SimdVector& operator*=( Arg v )
{
vec = _mm_mul_ps( vec, v.vec ); vec = _mm_mul_ps( vec, v.vec );
return *this; return *this;
} }
}; };
NV_FORCEINLINE SimdVector operator+(SimdVector::Arg left, SimdVector::Arg right) NV_SIMD_NATIVE SimdVector operator+( SimdVector::Arg left, SimdVector::Arg right )
{ {
return SimdVector( _mm_add_ps( left.vec, right.vec ) ); return SimdVector( _mm_add_ps( left.vec, right.vec ) );
} }
NV_FORCEINLINE SimdVector operator-(SimdVector::Arg left, SimdVector::Arg right) NV_SIMD_NATIVE SimdVector operator-( SimdVector::Arg left, SimdVector::Arg right )
{ {
return SimdVector( _mm_sub_ps( left.vec, right.vec ) ); return SimdVector( _mm_sub_ps( left.vec, right.vec ) );
} }
NV_FORCEINLINE SimdVector operator*(SimdVector::Arg left, SimdVector::Arg right) NV_SIMD_NATIVE SimdVector operator*( SimdVector::Arg left, SimdVector::Arg right )
{ {
return SimdVector( _mm_mul_ps( left.vec, right.vec ) ); return SimdVector( _mm_mul_ps( left.vec, right.vec ) );
} }
// Returns a*b + c // Returns a*b + c
NV_FORCEINLINE SimdVector multiplyAdd(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c) NV_SIMD_INLINE SimdVector multiplyAdd( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
{ {
return SimdVector( _mm_add_ps( _mm_mul_ps( a.vec, b.vec ), c.vec ) ); return SimdVector( _mm_add_ps( _mm_mul_ps( a.vec, b.vec ), c.vec ) );
} }
// Returns -( a*b - c ) = c - a*b // Returns -( a*b - c )
NV_FORCEINLINE SimdVector negativeMultiplySubtract(SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c) NV_SIMD_INLINE SimdVector negativeMultiplySubtract( SimdVector::Arg a, SimdVector::Arg b, SimdVector::Arg c )
{ {
return SimdVector( _mm_sub_ps( c.vec, _mm_mul_ps( a.vec, b.vec ) ) ); return SimdVector( _mm_sub_ps( c.vec, _mm_mul_ps( a.vec, b.vec ) ) );
} }
inline SimdVector reciprocal( SimdVector::Arg v ) NV_SIMD_INLINE SimdVector reciprocal( SimdVector::Arg v )
{ {
// get the reciprocal estimate // get the reciprocal estimate
__m128 estimate = _mm_rcp_ps( v.vec ); __m128 estimate = _mm_rcp_ps( v.vec );
@ -148,17 +160,17 @@ namespace nv {
return SimdVector( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) ); return SimdVector( _mm_add_ps( _mm_mul_ps( diff, estimate ), estimate ) );
} }
NV_FORCEINLINE SimdVector min(SimdVector::Arg left, SimdVector::Arg right) NV_SIMD_NATIVE SimdVector min( SimdVector::Arg left, SimdVector::Arg right )
{ {
return SimdVector( _mm_min_ps( left.vec, right.vec ) ); return SimdVector( _mm_min_ps( left.vec, right.vec ) );
} }
NV_FORCEINLINE SimdVector max(SimdVector::Arg left, SimdVector::Arg right) NV_SIMD_NATIVE SimdVector max( SimdVector::Arg left, SimdVector::Arg right )
{ {
return SimdVector( _mm_max_ps( left.vec, right.vec ) ); return SimdVector( _mm_max_ps( left.vec, right.vec ) );
} }
inline SimdVector truncate( SimdVector::Arg v ) NV_SIMD_INLINE SimdVector truncate( SimdVector::Arg v )
{ {
#if (NV_USE_SSE == 1) #if (NV_USE_SSE == 1)
// convert to ints // convert to ints
@ -179,12 +191,12 @@ namespace nv {
#endif #endif
} }
NV_FORCEINLINE SimdVector compareEqual(SimdVector::Arg left, SimdVector::Arg right) NV_SIMD_NATIVE SimdVector compareEqual( SimdVector::Arg left, SimdVector::Arg right )
{ {
return SimdVector( _mm_cmpeq_ps( left.vec, right.vec ) ); return SimdVector( _mm_cmpeq_ps( left.vec, right.vec ) );
} }
inline SimdVector select(SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits) NV_SIMD_INLINE SimdVector select( SimdVector::Arg off, SimdVector::Arg on, SimdVector::Arg bits )
{ {
__m128 a = _mm_andnot_ps( bits.vec, off.vec ); __m128 a = _mm_andnot_ps( bits.vec, off.vec );
__m128 b = _mm_and_ps( bits.vec, on.vec ); __m128 b = _mm_and_ps( bits.vec, on.vec );
@ -192,7 +204,7 @@ namespace nv {
return SimdVector( _mm_or_ps( a, b ) ); return SimdVector( _mm_or_ps( a, b ) );
} }
inline bool compareAnyLessThan(SimdVector::Arg left, SimdVector::Arg right) NV_SIMD_INLINE bool compareAnyLessThan( SimdVector::Arg left, SimdVector::Arg right )
{ {
__m128 bits = _mm_cmplt_ps( left.vec, right.vec ); __m128 bits = _mm_cmplt_ps( left.vec, right.vec );
int value = _mm_movemask_ps( bits ); int value = _mm_movemask_ps( bits );

@ -314,6 +314,12 @@ namespace nv
return scale(v, 1.0f/s); return scale(v, 1.0f/s);
} }
inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t)
{
const float s = 1.0f - t;
return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y);
}
inline float dot(Vector2::Arg a, Vector2::Arg b) inline float dot(Vector2::Arg a, Vector2::Arg b)
{ {
return a.x * b.x + a.y * b.y; return a.x * b.x + a.y * b.y;
@ -381,6 +387,16 @@ namespace nv
return Vector2(max(a.x, b.x), max(a.y, b.y)); return Vector2(max(a.x, b.x), max(a.y, b.y));
} }
inline Vector2 clamp(Vector2::Arg v, float min, float max)
{
return Vector2(clamp(v.x, min, max), clamp(v.y, min, max));
}
inline Vector2 saturate(Vector2::Arg v)
{
return Vector2(saturate(v.x), saturate(v.y));
}
inline bool isFinite(Vector2::Arg v) inline bool isFinite(Vector2::Arg v)
{ {
return isFinite(v.x) && isFinite(v.y); return isFinite(v.x) && isFinite(v.y);
@ -394,6 +410,7 @@ namespace nv
return vf; return vf;
} }
// Note, this is the area scaled by 2!
inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c) inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
{ {
Vector2 v0 = a - c; Vector2 v0 = a - c;
@ -500,6 +517,16 @@ namespace nv
return sqrtf(lengthSquared(v)); return sqrtf(lengthSquared(v));
} }
inline float distance(Vector3::Arg a, Vector3::Arg b)
{
return length(a - b);
}
inline float distanceSquared(Vector3::Arg a, Vector3::Arg b)
{
return lengthSquared(a - b);
}
inline float inverseLength(Vector3::Arg v) inline float inverseLength(Vector3::Arg v)
{ {
return 1.0f / sqrtf(lengthSquared(v)); return 1.0f / sqrtf(lengthSquared(v));
@ -557,6 +584,11 @@ namespace nv
return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max)); return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max));
} }
inline Vector3 saturate(Vector3::Arg v)
{
return Vector3(saturate(v.x), saturate(v.y), saturate(v.z));
}
inline Vector3 floor(Vector3::Arg v) inline Vector3 floor(Vector3::Arg v)
{ {
return Vector3(floorf(v.x), floorf(v.y), floorf(v.z)); return Vector3(floorf(v.x), floorf(v.y), floorf(v.z));
@ -580,6 +612,11 @@ namespace nv
return vf; return vf;
} }
inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n)
{
return v - (2 * dot(v, n)) * n;
}
// Vector4 // Vector4
@ -627,9 +664,15 @@ namespace nv
return scale(v, 1.0f/s); return scale(v, 1.0f/s);
} }
inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s) /*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
{ {
return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s); return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s);
}*/
inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t)
{
const float s = 1.0f - t;
return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w);
} }
inline float dot(Vector4::Arg a, Vector4::Arg b) inline float dot(Vector4::Arg a, Vector4::Arg b)
@ -699,6 +742,16 @@ namespace nv
return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w)); return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
} }
inline Vector4 clamp(Vector4::Arg v, float min, float max)
{
return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max));
}
inline Vector4 saturate(Vector4::Arg v)
{
return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w));
}
inline bool isFinite(Vector4::Arg v) inline bool isFinite(Vector4::Arg v)
{ {
return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w); return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w);

@ -155,15 +155,14 @@ namespace nv
} }
#if NV_CC_MSVC #if NV_CC_MSVC
inline float log2f(float x) NV_FORCEINLINE float log2f(float x)
{ {
nvCheck(x >= 0); nvCheck(x >= 0);
return logf(x) / logf(2.0f); return logf(x) / logf(2.0f);
} }
NV_FORCEINLINE float exp2f(float x)
inline float exp2f(float x)
{ {
return powf(2, x); return powf(2.0f, x);
} }
#endif #endif

Loading…
Cancel
Save