// Patch summary (explanatory preamble placed before the first "diff --git" header;
// the patch text below is left exactly as found).
//
// NOTE(review): the original line breaks of this patch appear to have been
// collapsed into single spaces, so it presumably will not apply as-is; confirm
// against the original commit before using it.
//
// src/nvtt/cuda/CompressKernel.cu
//   * roundAndExpand565(float3 v, ushort * w):
//       - The active path still saturates each channel, scales by 31/63/31,
//         rounds with rintf and packs the result into *w as
//         (x << 11) | (y << 5) | z.
//       - The expansion factors change from the literals 0.03227752766457f and
//         0.01583151765563f (commented "approximate integer bit expansion") to
//         1.0f / 31.0f and 1.0f / 63.0f. The old literals are not equal to 1/31
//         and 1/63, so the returned float3 values change slightly.
//       - A disabled "#if 0" variant is added that rounds through
//         __float2uint_rn / __uint2float_rn; the existing code moves under #else.
//   * roundAndExpand56(float2 v, ushort * w):
//       - Same factor replacement for the two channels (1.0f / 31.0f and
//         1.0f / 63.0f); the packing of *w is untouched context.
//
// src/nvtt/cuda/CudaUtils.cpp
//   * An "#include" line is added ahead of isCudaDriverAvailable(uint version).
//     NOTE(review): the header name is missing in this text (possibly lost
//     during extraction); recover it from the original commit.
//   * In the "version >= 2010" check, the symbol passed to nvcuda.bindSymbol
//     changes from "cuLoadDataEx" to "cuModuleLoadDataEx"; a NULL result still
//     makes the function return false.
//     NOTE(review): cuModuleLoadDataEx looks like the CUDA driver API entry
//     point that was intended; verify against the driver API reference.
//
diff --git a/src/nvtt/cuda/CompressKernel.cu b/src/nvtt/cuda/CompressKernel.cu index 9711014..f2846f8 100644 --- a/src/nvtt/cuda/CompressKernel.cu +++ b/src/nvtt/cuda/CompressKernel.cu @@ -305,13 +305,23 @@ __device__ void loadColorBlock(const uint * image, float2 colors[16], float2 sum //////////////////////////////////////////////////////////////////////////////// inline __device__ float3 roundAndExpand565(float3 v, ushort * w) { +#if 0 + uint x = __float2uint_rn(__saturatef(v.x) * 31.0f); + uint y = __float2uint_rn(__saturatef(v.y) * 63.0f); + uint z = __float2uint_rn(__saturatef(v.z) * 31.0f); + *w = (x << 11) | (y << 5) | z; + v.x = __uint2float_rn(x) * 1.0f / 31.0f; + v.y = __uint2float_rn(y) * 1.0f / 63.0f; + v.z = __uint2float_rn(z) * 1.0f / 31.0f; +#else v.x = rintf(__saturatef(v.x) * 31.0f); v.y = rintf(__saturatef(v.y) * 63.0f); v.z = rintf(__saturatef(v.z) * 31.0f); *w = ((ushort)v.x << 11) | ((ushort)v.y << 5) | (ushort)v.z; - v.x *= 0.03227752766457f; // approximate integer bit expansion. - v.y *= 0.01583151765563f; - v.z *= 0.03227752766457f; + v.x *= 1.0f / 31.0f; + v.y *= 1.0f / 63.0f; + v.z *= 1.0f / 31.0f; +#endif return v; } @@ -320,8 +330,8 @@ inline __device__ float2 roundAndExpand56(float2 v, ushort * w) v.x = rintf(__saturatef(v.x) * 31.0f); v.y = rintf(__saturatef(v.y) * 63.0f); *w = ((ushort)v.x << 11) | ((ushort)v.y << 5); - v.x *= 0.03227752766457f; // approximate integer bit expansion. 
- v.y *= 0.01583151765563f; + v.x *= 1.0f / 31.0f; + v.y *= 1.0f / 63.0f; return v; } diff --git a/src/nvtt/cuda/CudaUtils.cpp b/src/nvtt/cuda/CudaUtils.cpp index ee9460a..7c70240 100644 --- a/src/nvtt/cuda/CudaUtils.cpp +++ b/src/nvtt/cuda/CudaUtils.cpp @@ -69,6 +69,7 @@ static bool isWow32() #endif +#include static bool isCudaDriverAvailable(uint version) { @@ -91,7 +92,7 @@ static bool isCudaDriverAvailable(uint version) if (version >= 2010) { - void * address = nvcuda.bindSymbol("cuLoadDataEx"); + void * address = nvcuda.bindSymbol("cuModuleLoadDataEx"); if (address == NULL) return false; }