Merge changes from The Witness.
This commit is contained in:
@ -76,6 +76,10 @@
|
||||
#include "Half.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#if NV_CC_GNUC
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
// Load immediate
|
||||
static inline uint32 _uint32_li( uint32 a )
|
||||
{
|
||||
@ -488,10 +492,79 @@ nv::half_to_float( uint16 h )
|
||||
}
|
||||
|
||||
|
||||
// @@ This code appears to be wrong.
|
||||
// Converts four half-precision values to single precision with SSE2 integer ops.
//
// h:       four 32-bit lanes, each carrying one 16-bit half in its low 16 bits.
//          Bits above bit 15 of a lane are masked off and do not affect the result.
// returns: the four converted floats, lane for lane.
//
// The per-lane constants are materialized with _mm_set1_epi32 rather than
// function-local `__declspec(align(16))` arrays accessed through pointer casts:
// that attribute is MSVC-specific, whereas the SSE header include in this file is
// guarded by NV_CC_GNUC. The sequence of operations is otherwise unchanged.
static __m128 half_to_float4_SSE2(__m128i h)
{
    const __m128i mask_nosign      = _mm_set1_epi32(0x7fff);
    const __m128i mask_justsign    = _mm_set1_epi32(0x8000);
    const __m128i mask_shifted_exp = _mm_set1_epi32(0x7c00 << 13);
    const __m128i expadjust_normal = _mm_set1_epi32((127 - 15) << 23);
    const __m128i expadjust_infnan = _mm_set1_epi32((128 - 16) << 23);
    const __m128i expadjust_denorm = _mm_set1_epi32(1 << 23);
    const __m128  magic_denorm     = _mm_castsi128_ps(_mm_set1_epi32(113 << 23));

    // Split sign from exponent+mantissa.
    const __m128i expmant  = _mm_and_si128(mask_nosign, h);
    const __m128i justsign = _mm_and_si128(h, mask_justsign);

    // Move exponent+mantissa into float position and rebias the exponent (15 -> 127).
    const __m128i shifted  = _mm_slli_epi32(expmant, 13);
    const __m128i adjusted = _mm_add_epi32(expadjust_normal, shifted);
    const __m128i justexp  = _mm_and_si128(shifted, mask_shifted_exp);

    // Per-lane all-ones masks: exponent field all set (Inf/NaN) or all clear (zero/denormal).
    const __m128i zero       = _mm_setzero_si128();
    const __m128i b_isinfnan = _mm_cmpeq_epi32(mask_shifted_exp, justexp);
    const __m128i b_isdenorm = _mm_cmpeq_epi32(zero, justexp);

    // Inf/NaN lanes get an extra exponent bump so the float exponent field becomes 255.
    const __m128i adj_infnan = _mm_and_si128(b_isinfnan, expadjust_infnan);
    const __m128i adjusted2  = _mm_add_epi32(adjusted, adj_infnan);

    // Zero/denormal lanes: raise the exponent by one, then subtract the float whose
    // bits are 113 << 23, which leaves just the contribution of the mantissa bits.
    const __m128i den1 = _mm_add_epi32(expadjust_denorm, adjusted2);
    const __m128  den2 = _mm_sub_ps(_mm_castsi128_ps(den1), magic_denorm);

    // Select the denormal result where b_isdenorm is set, the normal/Inf/NaN result elsewhere.
    const __m128 denorm_mask = _mm_castsi128_ps(b_isdenorm);
    const __m128 adjusted3   = _mm_and_ps(den2, denorm_mask);
    const __m128 adjusted4   = _mm_andnot_ps(denorm_mask, _mm_castsi128_ps(adjusted2));
    const __m128 adjusted5   = _mm_or_ps(adjusted3, adjusted4);

    // Put the sign bit back at bit 31.
    const __m128i sign  = _mm_slli_epi32(justsign, 16);
    const __m128 result = _mm_or_ps(adjusted5, _mm_castsi128_ps(sign));

    // ~21 SSE2 ops.
    return result;
}
|
||||
|
||||
|
||||
void nv::half_to_float_array(const uint16 * vin, float * vout, int count) {
|
||||
nvDebugCheck((intptr_t(vin) & 15) == 0);
|
||||
nvDebugCheck((intptr_t(vout) & 15) == 0);
|
||||
nvDebugCheck((count & 7) == 0);
|
||||
|
||||
__m128i zero = _mm_setzero_si128();
|
||||
|
||||
for (int i = 0; i < count; i += 8)
|
||||
{
|
||||
__m128i in = _mm_loadu_si128((const __m128i *)(vin + i));
|
||||
__m128i a = _mm_unpacklo_epi16(in, zero);
|
||||
__m128i b = _mm_unpackhi_epi16(in, zero);
|
||||
|
||||
__m128 outa = half_to_float4_SSE2(a);
|
||||
_mm_storeu_ps((float *)(vout + i), outa);
|
||||
|
||||
__m128 outb = half_to_float4_SSE2(b);
|
||||
_mm_storeu_ps((float *)(vout + i + 4), outb);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// @@ These tables could be smaller.
|
||||
namespace nv {
|
||||
uint32 mantissa_table[2048];
|
||||
uint32 mantissa_table[2048] = { 0xDEADBEEF };
|
||||
uint32 exponent_table[64];
|
||||
uint32 offset_table[64];
|
||||
}
|
||||
|
@ -9,6 +9,9 @@ namespace nv {
|
||||
uint32 half_to_float( uint16 h );
|
||||
uint16 half_from_float( uint32 f );
|
||||
|
||||
// vin and vout must be 16-byte aligned; count must be a multiple of 8.
|
||||
void half_to_float_array(const uint16 * vin, float * vout, int count);
|
||||
|
||||
void half_init_tables();
|
||||
|
||||
extern uint32 mantissa_table[2048];
|
||||
@ -19,6 +22,7 @@ namespace nv {
|
||||
// http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
|
||||
// Table-driven half -> float conversion: returns the 32-bit float bit pattern for h.
// The top 6 bits of h select the offset/exponent table entries; the low 10 bits
// index into the mantissa table relative to that offset.
inline uint32 fast_half_to_float(uint16 h)
{
    nvDebugCheck(mantissa_table[0] == 0);  // Make sure table was initialized.

    const uint topBits = h >> 10;
    const uint lowBits = h & 0x3ff;

    const uint32 mantissaPart = mantissa_table[offset_table[topBits] + lowBits];
    return mantissaPart + exponent_table[topBits];
}
|
||||
|
@ -62,6 +62,7 @@ namespace nv
|
||||
Matrix();
|
||||
explicit Matrix(float f);
|
||||
explicit Matrix(identity_t);
|
||||
Matrix(const Matrix3 & m);
|
||||
Matrix(const Matrix & m);
|
||||
Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);
|
||||
//explicit Matrix(const float m[]); // m is assumed to contain 16 elements
|
||||
|
@ -250,6 +250,19 @@ namespace nv
|
||||
}
|
||||
}
|
||||
|
||||
// Builds a 4x4 matrix from a 3x3 one: the upper-left 3x3 block is copied from m,
// and every element of the last row and last column is set to 0.
// NOTE(review): element (3,3) is zeroed as well, so the result does not carry a 1
// in the bottom-right corner -- confirm this is what callers expect.
inline Matrix::Matrix(const Matrix3 & m)
{
    // Upper-left 3x3 block.
    for (int row = 0; row < 3; ++row) {
        for (int col = 0; col < 3; ++col) {
            (*this)(row, col) = m.get(row, col);
        }
    }

    // Fourth row and fourth column.
    for (int k = 0; k < 4; ++k) {
        (*this)(3, k) = 0;
        (*this)(k, 3) = 0;
    }
}
|
||||
|
||||
inline Matrix::Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3)
|
||||
{
|
||||
m_data[ 0] = v0.x; m_data[ 1] = v0.y; m_data[ 2] = v0.z; m_data[ 3] = v0.w;
|
||||
|
Reference in New Issue
Block a user