Try to optimize color rounding and expansion.
Detect CUDA 2.1 properly.
This commit is contained in:
parent
fa53ddcecd
commit
2ad15489bb
@ -305,13 +305,23 @@ __device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sum
|
|||||||
////////////////////////////////////////////////////////////////////////////////
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
inline __device__ float3 roundAndExpand565(float3 v, ushort * w)
|
inline __device__ float3 roundAndExpand565(float3 v, ushort * w)
|
||||||
{
|
{
|
||||||
|
#if 0
|
||||||
|
uint x = __float2uint_rn(__saturatef(v.x) * 31.0f);
|
||||||
|
uint y = __float2uint_rn(__saturatef(v.y) * 63.0f);
|
||||||
|
uint z = __float2uint_rn(__saturatef(v.z) * 31.0f);
|
||||||
|
*w = (x << 11) | (y << 5) | z;
|
||||||
|
v.x = __uint2float_rn(x) * 1.0f / 31.0f;
|
||||||
|
v.y = __uint2float_rn(y) * 1.0f / 63.0f;
|
||||||
|
v.z = __uint2float_rn(z) * 1.0f / 31.0f;
|
||||||
|
#else
|
||||||
v.x = rintf(__saturatef(v.x) * 31.0f);
|
v.x = rintf(__saturatef(v.x) * 31.0f);
|
||||||
v.y = rintf(__saturatef(v.y) * 63.0f);
|
v.y = rintf(__saturatef(v.y) * 63.0f);
|
||||||
v.z = rintf(__saturatef(v.z) * 31.0f);
|
v.z = rintf(__saturatef(v.z) * 31.0f);
|
||||||
*w = ((ushort)v.x << 11) | ((ushort)v.y << 5) | (ushort)v.z;
|
*w = ((ushort)v.x << 11) | ((ushort)v.y << 5) | (ushort)v.z;
|
||||||
v.x *= 0.03227752766457f; // approximate integer bit expansion.
|
v.x *= 1.0f / 31.0f;
|
||||||
v.y *= 0.01583151765563f;
|
v.y *= 1.0f / 63.0f;
|
||||||
v.z *= 0.03227752766457f;
|
v.z *= 1.0f / 31.0f;
|
||||||
|
#endif
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -320,8 +330,8 @@ inline __device__ float2 roundAndExpand56(float2 v, ushort * w)
|
|||||||
v.x = rintf(__saturatef(v.x) * 31.0f);
|
v.x = rintf(__saturatef(v.x) * 31.0f);
|
||||||
v.y = rintf(__saturatef(v.y) * 63.0f);
|
v.y = rintf(__saturatef(v.y) * 63.0f);
|
||||||
*w = ((ushort)v.x << 11) | ((ushort)v.y << 5);
|
*w = ((ushort)v.x << 11) | ((ushort)v.y << 5);
|
||||||
v.x *= 0.03227752766457f; // approximate integer bit expansion.
|
v.x *= 1.0f / 31.0f;
|
||||||
v.y *= 0.01583151765563f;
|
v.y *= 1.0f / 63.0f;
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -69,6 +69,7 @@ static bool isWow32()
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
static bool isCudaDriverAvailable(uint version)
|
static bool isCudaDriverAvailable(uint version)
|
||||||
{
|
{
|
||||||
@ -91,7 +92,7 @@ static bool isCudaDriverAvailable(uint version)
|
|||||||
|
|
||||||
if (version >= 2010)
|
if (version >= 2010)
|
||||||
{
|
{
|
||||||
void * address = nvcuda.bindSymbol("cuLoadDataEx");
|
void * address = nvcuda.bindSymbol("cuModuleLoadDataEx");
|
||||||
if (address == NULL) return false;
|
if (address == NULL) return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user