Add proper credits.

Tag 2.0.1
Add post build command to copy header files.
2008-02-28 22:52:00 +00:00 · 2008-02-28 22:37:49 +00:00 · 2008-02-28 22:07:08 +00:00 · 2008-02-28 21:45:46 +00:00 · 2008-02-28 21:45:26 +00:00 · 2008-02-28 21:45:04 +00:00
21 changed files with 1459 additions and 1289 deletions
--- a/6
+++ b/6
@ -1,3 +1,9 @@
+NVIDIA Texture Tools version 2.0.1
+ * Fix memory leaks.
+ * Pre-allocate device memory for CUDA compressor.
+ * Add single color compressor. Thanks to Amir Ebrahimi.
+ * Better CUDA error checking.
+
 NVIDIA Texture Tools version 2.0.0
 * Fixed PSNR formula in nvimgdiff.
 * Added support for arbitrary RGB formats.
--- a/NVIDIA_Texture_Tools_README.txt
+++ b/NVIDIA_Texture_Tools_README.txt
@ -2,7 +2,7 @@
 --------------------------------------------------------------------------------
 NVIDIA Texture Tools
 README.txt
-Version 2.0.0
+Version 2.0
 --------------------------------------------------------------------------------
 --------------------------------------------------------------------------------

--- a/2
+++ b/2
@ -1 +1 @@
-2.0.0
+2.0.1
--- a/project/vc8/nvcore/nvcore.vcproj
+++ b/project/vc8/nvcore/nvcore.vcproj
@ -327,6 +327,10 @@
 				RelativePath="..\..\..\src\nvcore\nvcore.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\src\nvcore\Ptr.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\src\nvcore\StrLib.h"
 				>
--- a/project/vc8/nvtt/nvtt.rc
+++ b/project/vc8/nvtt/nvtt.rc
@ -53,8 +53,8 @@ END
 //

 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 2,0,0,0
- PRODUCTVERSION 2,0,0,0
+ FILEVERSION 2,0,1,0
+ PRODUCTVERSION 2,0,1,0
 FILEFLAGSMASK 0x17L
 #ifdef _DEBUG
 FILEFLAGS 0x1L
@ -71,12 +71,12 @@ BEGIN
        BEGIN
            VALUE "CompanyName", "NVIDIA Corporation"
            VALUE "FileDescription", "NVIDIA Texture Tools Dynamic Link Library"
-            VALUE "FileVersion", "2, 0, 0, 0"
+            VALUE "FileVersion", "2, 0, 1, 0"
            VALUE "InternalName", "nvtt"
            VALUE "LegalCopyright", "Copyright (C) 2007"
            VALUE "OriginalFilename", "nvtt.dll"
            VALUE "ProductName", "NVIDIA Texture Tools Dynamic Link Library"
-            VALUE "ProductVersion", "2, 0, 0, 0"
+            VALUE "ProductVersion", "2, 0, 1, 0"
        END
    END
    BLOCK "VarFileInfo"
--- a/project/vc8/nvtt/nvtt.vcproj
+++ b/project/vc8/nvtt/nvtt.vcproj
@ -96,6 +96,8 @@
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
+				Description="Copying header files..."
+				CommandLine="xcopy /y /f /i $(SolutionDir)\..\..\src\nvtt\nvtt*.h $(SolutionDir)\$(ConfigurationName).$(PlatformName)\include\"
 			/>
 		</Configuration>
 		<Configuration
@ -258,6 +260,8 @@
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
+				Description="Copying header files..."
+				CommandLine="xcopy /y /f /i $(SolutionDir)\..\..\src\nvtt\nvtt*.h $(SolutionDir)\$(ConfigurationName).$(PlatformName)\include\"
 			/>
 		</Configuration>
 		<Configuration
@ -420,6 +424,8 @@
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
+				Description="Copying header files..."
+				CommandLine="xcopy /y /f /i $(SolutionDir)\..\..\src\nvtt\nvtt*.h $(SolutionDir)\$(ConfigurationName).$(PlatformName)\include\"
 			/>
 		</Configuration>
 		<Configuration
@ -578,6 +584,8 @@
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
+				Description="Copying header files..."
+				CommandLine="xcopy /y /f /i $(SolutionDir)\..\..\src\nvtt\nvtt*.h $(SolutionDir)\$(ConfigurationName).$(PlatformName)\include\"
 			/>
 		</Configuration>
 		<Configuration
--- a/src/nvimage/ColorBlock.cpp
+++ b/src/nvimage/ColorBlock.cpp
@ -110,6 +110,19 @@ void ColorBlock::splatY()
 	}
 }

+/// Returns true if the block has a single color.
+bool ColorBlock::isSingleColor() const
+{
+	for(int i = 1; i < 16; i++)
+	{
+		if (m_color[0] != m_color[i])
+		{
+			return false;
+		}
+	}
+	
+	return true;
+}

 /// Count number of unique colors in this color block.
 uint ColorBlock::countUniqueColors() const
--- a/src/nvimage/ColorBlock.h
+++ b/src/nvimage/ColorBlock.h
@ -24,6 +24,7 @@ namespace nv
 		void splatX();
 		void splatY();
 		
+		bool isSingleColor() const;
 		uint countUniqueColors() const;
 		Color32 averageColor() const;
 		bool hasAlpha() const;
--- a/src/nvtt/CompressDXT.cpp
+++ b/src/nvtt/CompressDXT.cpp
@ -69,7 +69,14 @@ void nv::fastCompressDXT1(const Image * image, const OutputOptions::Private & ou
 		for (uint x = 0; x < w; x += 4) {
 			rgba.init(image, x, y);
 			
-			QuickCompress::compressDXT1(rgba, &block);
+			if (rgba.isSingleColor())
+			{
+				QuickCompress::compressDXT1(rgba.color(0), &block);
+			}
+			else
+			{
+				QuickCompress::compressDXT1(rgba, &block);
+			}
 			
 			if (outputOptions.outputHandler != NULL) {
 				outputOptions.outputHandler->writeData(&block, sizeof(block));
@ -214,10 +221,16 @@ void nv::compressDXT1(const Image * image, const OutputOptions::Private & output
 			
 			rgba.init(image, x, y);
 			
-			// Compress color.
-			squish::ColourSet colours((uint8 *)rgba.colors(), 0);
-			fit.SetColourSet(&colours, squish::kDxt1);
-			fit.Compress(&block);
+			if (rgba.isSingleColor())
+			{
+				QuickCompress::compressDXT1(rgba.color(0), &block);
+			}
+			else
+			{
+				squish::ColourSet colours((uint8 *)rgba.colors(), 0);
+				fit.SetColourSet(&colours, squish::kDxt1);
+				fit.Compress(&block);
+			}
 			
 			if (outputOptions.outputHandler != NULL) {
 				outputOptions.outputHandler->writeData(&block, sizeof(block));
--- a/src/nvtt/Compressor.cpp
+++ b/src/nvtt/Compressor.cpp
@ -205,16 +205,25 @@ namespace nvtt

 Compressor::Compressor() : m(*new Compressor::Private())
 {
+	// CUDA initialization.
 	m.cudaSupported = cuda::isHardwarePresent();
 	m.cudaEnabled = m.cudaSupported;

-	// @@ Do CUDA initialization here.
+	if (m.cudaEnabled)
+	{
+		m.cuda = new CudaCompressor();

+		if (!m.cuda->isValid())
+		{
+			m.cudaEnabled = false;
+			m.cuda = NULL;
+		}
+	}
 }

 Compressor::~Compressor()
 {
-	// @@ Free CUDA resources here.
+	delete &m;
 }


@ -225,6 +234,17 @@ void Compressor::enableCudaAcceleration(bool enable)
 	{
 		m.cudaEnabled = enable;
 	}
+
+	if (m.cudaEnabled && m.cuda == NULL)
+	{
+		m.cuda = new CudaCompressor();
+		
+		if (!m.cuda->isValid())
+		{
+			m.cudaEnabled = false;
+			m.cuda = NULL;
+		}
+	}
 }

 /// Check if CUDA acceleration is enabled.
@ -670,7 +690,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
 			if (cudaEnabled)
 			{
 				nvDebugCheck(cudaSupported);
-				cudaCompressDXT1(image, outputOptions, compressionOptions);
+				cuda->compressDXT1(image, outputOptions, compressionOptions);
 			}
 			else
 			{
@ -708,7 +728,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
 			if (cudaEnabled)
 			{
 				nvDebugCheck(cudaSupported);
-				cudaCompressDXT3(image, outputOptions, compressionOptions);
+				cuda->compressDXT3(image, outputOptions, compressionOptions);
 			}
 			else
 			{
@ -727,7 +747,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
 			if (cudaEnabled)
 			{
 				nvDebugCheck(cudaSupported);
-				cudaCompressDXT5(image, outputOptions, compressionOptions);
+				cuda->compressDXT5(image, outputOptions, compressionOptions);
 			}
 			else
 			{
--- a/src/nvtt/Compressor.h
+++ b/src/nvtt/Compressor.h
@ -24,6 +24,10 @@
 #ifndef NV_TT_COMPRESSOR_H
 #define NV_TT_COMPRESSOR_H

+#include <nvcore/Ptr.h>
+
+#include <nvtt/cuda/CudaCompressDXT.h>
+
 #include "nvtt.h"

 namespace nv
@ -63,6 +67,9 @@ namespace nvtt

 		bool cudaSupported;
 		bool cudaEnabled;
+
+		nv::AutoPtr<nv::CudaCompressor> cuda;
+
 	};

 } // nvtt namespace
--- a/src/nvtt/QuickCompressDXT.cpp
+++ b/src/nvtt/QuickCompressDXT.cpp
@ -353,12 +353,18 @@ static void optimizeAlpha8(const ColorBlock & rgba, AlphaBlockDXT5 * block)
 void QuickCompress::compressDXT1(Color32 c, BlockDXT1 * dxtBlock)
 {
 	dxtBlock->col0.r = OMatch5[c.r][0];
-	dxtBlock->col0.g = OMatch5[c.g][0];
+	dxtBlock->col0.g = OMatch6[c.g][0];
 	dxtBlock->col0.b = OMatch5[c.b][0];
 	dxtBlock->col1.r = OMatch5[c.r][1];
-	dxtBlock->col1.g = OMatch5[c.g][1];
+	dxtBlock->col1.g = OMatch6[c.g][1];
 	dxtBlock->col1.b = OMatch5[c.b][1];
 	dxtBlock->indices = 0xaaaaaaaa;
+
+	if (dxtBlock->col0.u < dxtBlock->col1.u)
+	{
+		swap(dxtBlock->col0.u, dxtBlock->col1.u);
+		dxtBlock->indices ^= 0x55555555;
+	}
 }

 void QuickCompress::compressDXT1(const ColorBlock & rgba, BlockDXT1 * dxtBlock)
--- a/src/nvtt/SingleColorLookup.h
+++ b/src/nvtt/SingleColorLookup.h
@ -48,7 +48,12 @@ void initTables()
 };
 */

-const static uint8 OMatch5[256][2] =
+#if __CUDACC__
+__constant__ unsigned short
+#else
+const static uint8 
+#endif
+OMatch5[256][2] =
 {
 	{0x00, 0x00},
 	{0x00, 0x00},
@ -308,7 +313,12 @@ const static uint8 OMatch5[256][2] =
 	{0x1F, 0x1F},
 };

-const static uint8 OMatch6[256][2] =
+#if __CUDACC__
+__constant__ unsigned short
+#else
+const static uint8
+#endif
+OMatch6[256][2] =
 {
 	{0x00, 0x00},
 	{0x00, 0x01},
--- a/src/nvtt/cuda/Bitmaps.h
+++ b/src/nvtt/cuda/Bitmaps.h
@ -122,7 +122,7 @@ static void doPrecomputation()
 */


-const static uint bitmaps[992] =
+const static uint s_bitmapTable[992] =
 {
 	0x80000000,
 	0x40000000,
--- a/src/nvtt/cuda/CompressKernel.cu
+++ b/src/nvtt/cuda/CompressKernel.cu
@ -28,6 +28,8 @@

 #include "CudaMath.h"

+#include "../SingleColorLookup.h"
+
 #define NUM_THREADS 64		// Number of threads per block.

 #if __DEVICE_EMULATION__
@ -60,6 +62,7 @@ __device__ void sortColors(const float * values, int * cmp)
 {
 	int tid = threadIdx.x;

+#if 1
 	cmp[tid] = (values[0] < values[tid]);
 	cmp[tid] += (values[1] < values[tid]);
 	cmp[tid] += (values[2] < values[tid]);
@ -93,13 +96,30 @@ __device__ void sortColors(const float * values, int * cmp)
 	if (tid > 12 && cmp[tid] == cmp[12]) ++cmp[tid];
 	if (tid > 13 && cmp[tid] == cmp[13]) ++cmp[tid];
 	if (tid > 14 && cmp[tid] == cmp[14]) ++cmp[tid];
+#else
+
+	cmp[tid] = 0;
+
+	#pragma unroll
+	for (int i = 0; i < 16; i++)
+	{
+		cmp[tid] += (values[i] < values[tid]);
+	}
+
+	// Resolve elements with the same index.
+	#pragma unroll
+	for (int i = 0; i < 15; i++)
+	{
+		if (tid > 0 && cmp[tid] == cmp[i]) ++cmp[tid];
+	}
+#endif
 }


 ////////////////////////////////////////////////////////////////////////////////
 // Load color block to shared mem
 ////////////////////////////////////////////////////////////////////////////////
-__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16])
+__device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sums[16], int xrefs[16], int * sameColor)
 {
 	const int bid = blockIdx.x;
 	const int idx = threadIdx.x;
@ -124,6 +144,8 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 		colorSums(colors, sums);
 		float3 axis = bestFitLine(colors, sums[0], kColorMetric);
 		
+		*sameColor = (axis == make_float3(0, 0, 0));
+		
 		dps[idx] = dot(colors[idx], axis);
 		
 #if __DEVICE_EMULATION__
@ -187,7 +209,7 @@ __device__ void loadColorBlock(const uint * image, float3 colors[16], float3 sum
 ////////////////////////////////////////////////////////////////////////////////
 // Round color to RGB565 and expand
 ////////////////////////////////////////////////////////////////////////////////
-inline __device__ float3 roundAndExpand(float3 v, ushort * w)
+inline __device__ float3 roundAndExpand565(float3 v, ushort * w)
 {
 	v.x = rintf(__saturatef(v.x) * 31.0f);
 	v.y = rintf(__saturatef(v.y) * 63.0f);
@ -234,8 +256,8 @@ __device__ float evalPermutation4(const float3 * colors, uint permutation, ushor
 	float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
 	
 	// Round a, b to the closest 5-6-5 color and expand...
-	a = roundAndExpand(a, start);
-	b = roundAndExpand(b, end);
+	a = roundAndExpand565(a, start);
+	b = roundAndExpand565(b, end);

 	// compute the error
 	float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
@ -274,8 +296,8 @@ __device__ float evalPermutation3(const float3 * colors, uint permutation, ushor
 	float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
 	
 	// Round a, b to the closest 5-6-5 color and expand...
-	a = roundAndExpand(a, start);
-	b = roundAndExpand(b, end);
+	a = roundAndExpand565(a, start);
+	b = roundAndExpand565(b, end);

 	// compute the error
 	float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
@ -315,8 +337,8 @@ __device__ float evalPermutation4(const float3 * colors, float3 color_sum, uint
 	float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
 	
 	// Round a, b to the closest 5-6-5 color and expand...
-	a = roundAndExpand(a, start);
-	b = roundAndExpand(b, end);
+	a = roundAndExpand565(a, start);
+	b = roundAndExpand565(b, end);

 	// compute the error
 	float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
@ -351,8 +373,8 @@ __device__ float evalPermutation3(const float3 * colors, float3 color_sum, uint
 	float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
 	
 	// Round a, b to the closest 5-6-5 color and expand...
-	a = roundAndExpand(a, start);
-	b = roundAndExpand(b, end);
+	a = roundAndExpand565(a, start);
+	b = roundAndExpand565(b, end);

 	// compute the error
 	float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
@ -391,8 +413,8 @@ __device__ float evalPermutation4(const float3 * colors, const float * weights,
 	float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;
 	
 	// Round a, b to the closest 5-6-5 color and expand...
-	a = roundAndExpand(a, start);
-	b = roundAndExpand(b, end);
+	a = roundAndExpand565(a, start);
+	b = roundAndExpand565(b, end);

 	// compute the error
 	float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
@ -432,8 +454,8 @@ __device__ float evalPermutation3(const float3 * colors, const float * weights,
 	float3 b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;

 	// Round a, b to the closest 5-6-5 color and expand...
-	a = roundAndExpand(a, start);
-	b = roundAndExpand(b, end);
+	a = roundAndExpand565(a, start);
+	b = roundAndExpand565(b, end);

 	// compute the error
 	float3 e = a * a * alpha2_sum + b * b * beta2_sum + 2.0f * (a * b * alphabeta_sum - a * alphax_sum - b * betax_sum);
@ -715,22 +737,50 @@ __device__ void saveBlockDXT1(ushort start, ushort end, uint permutation, int xr
 	result[bid].y = indices;
 }

+__device__ void saveSingleColorBlockDXT1(float3 color, uint2 * result)
+{
+	const int bid = blockIdx.x;

+	int r = color.x * 255;
+	int g = color.y * 255;
+	int b = color.z * 255;
+
+	ushort color0 = (OMatch5[r][0] << 11) | (OMatch6[g][0] << 5) | OMatch5[b][0];
+	ushort color1 = (OMatch5[r][1] << 11) | (OMatch6[g][1] << 5) | OMatch5[b][1];
+
+	if (color0 < color1)
+	{
+		result[bid].x = (color0 << 16) | color1;
+		result[bid].y = 0xffffffff;
+	}
+	else
+	{
+		result[bid].x = (color1 << 16) | color0;
+		result[bid].y = 0xaaaaaaaa;
+	}
+}


 ////////////////////////////////////////////////////////////////////////////////
 // Compress color block
 ////////////////////////////////////////////////////////////////////////////////
-__global__ void compress(const uint * permutations, const uint * image, uint2 * result)
+__global__ void compressDXT1(const uint * permutations, const uint * image, uint2 * result)
 {
 	__shared__ float3 colors[16];
 	__shared__ float3 sums[16];
 	__shared__ int xrefs[16];
+	__shared__ int sameColor;
 	
-	loadColorBlock(image, colors, sums, xrefs);
+	loadColorBlock(image, colors, sums, xrefs, &sameColor);

 	__syncthreads();

+	if (sameColor)
+	{
+		if (threadIdx.x == 0) saveSingleColorBlockDXT1(colors[0], result);
+		return;
+	}
+
 	ushort bestStart, bestEnd;
 	uint bestPermutation;

@ -749,7 +799,7 @@ __global__ void compress(const uint * permutations, const uint * image, uint2 *
 }


-__global__ void compressWeighted(const uint * permutations, const uint * image, uint2 * result)
+__global__ void compressWeightedDXT1(const uint * permutations, const uint * image, uint2 * result)
 {
 	__shared__ float3 colors[16];
 	__shared__ float3 sums[16];
@ -845,8 +895,8 @@ __device__ void optimizeAlpha8(const float alphas[16], uchar & a0, uchar & a1)
 	float a = (alphax_sum * beta2_sum - betax_sum * alphabeta_sum) * factor;
 	float b = (betax_sum * alpha2_sum - alphax_sum * alphabeta_sum) * factor;

-	a0 = roundAndExpand(a);
-	a1 = roundAndExpand(b);
+	a0 = roundAndExpand8(a);
+	a1 = roundAndExpand8(b);
 }
 */
 /*
@ -978,12 +1028,12 @@ extern "C" void setupCompressKernel(const float weights[3])
 // Launch kernel
 ////////////////////////////////////////////////////////////////////////////////

-extern "C" void compressKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
+extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
 {
-	compress<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
+	compressDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
 }

-extern "C" void compressWeightedKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
+extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps)
 {
-	compressWeighted<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
+	compressWeightedDXT1<<<blockNum, NUM_THREADS>>>(d_bitmaps, d_data, (uint2 *)d_result);
 }
--- a/src/nvtt/cuda/CudaCompressDXT.cpp
+++ b/src/nvtt/cuda/CudaCompressDXT.cpp
@ -48,29 +48,14 @@ using namespace nvtt;

 #if defined HAVE_CUDA

+#define MAX_BLOCKS 8192U // 32768, 65535
+
+
 extern "C" void setupCompressKernel(const float weights[3]);
-extern "C" void compressKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
-extern "C" void compressWeightedKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
+extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
+extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);

-#include "Bitmaps.h"
-
-// @@ Store this pointer in CompressionOptions. Allocate in ctor, free in dtor.
-static uint * d_bitmaps = NULL;
-
-static void doPrecomputation()
-{
-	if (d_bitmaps != NULL) {
-		return;
-	}
-
-    // Upload bitmaps.
-    cudaMalloc((void**) &d_bitmaps, 992 * sizeof(uint));
-    cudaMemcpy(d_bitmaps, bitmaps, 992 * sizeof(uint), cudaMemcpyHostToDevice);
-
-	// @@ Check for errors.
-
-	// @@ Free allocated memory.
-}
+#include "Bitmaps.h"	// @@ Rename to BitmapTable.h

 // Convert linear image to block linear.
 static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
@ -92,53 +77,80 @@ static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
 	}
 }

-#endif // defined HAVE_CUDA
+#endif
+
+
+CudaCompressor::CudaCompressor() : m_bitmapTable(NULL), m_data(NULL), m_result(NULL)
+{
+#if defined HAVE_CUDA
+    // Allocate and upload bitmaps.
+    cudaMalloc((void**) &m_bitmapTable, 992 * sizeof(uint));
+	if (m_bitmapTable != NULL)
+	{
+		cudaMemcpy(m_bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice);
+	}
+
+	// Allocate scratch buffers.
+    cudaMalloc((void**) &m_data, MAX_BLOCKS * 64U);
+    cudaMalloc((void**) &m_result, MAX_BLOCKS * 8U);
+#endif
+}
+
+CudaCompressor::~CudaCompressor()
+{
+#if defined HAVE_CUDA
+	// Free device mem allocations.
+	cudaFree(m_data);
+	cudaFree(m_result);
+	cudaFree(m_bitmapTable);
+#endif
+}
+
+bool CudaCompressor::isValid() const
+{
+#if defined HAVE_CUDA
+	if (cudaGetLastError() != cudaSuccess)
+   	{
+		return false;
+	}
+#endif
+	return m_data != NULL && m_result != NULL && m_bitmapTable != NULL;
+}

 // @@ This code is very repetitive and needs to be cleaned up.


 /// Compress image using CUDA.
-void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
+void CudaCompressor::compressDXT1(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
 {
 	nvDebugCheck(cuda::isHardwarePresent());
 #if defined HAVE_CUDA

-	doPrecomputation();
-
 	// Image size in blocks.
 	const uint w = (image->width() + 3) / 4;
 	const uint h = (image->height() + 3) / 4;

 	uint imageSize = w * h * 16 * sizeof(Color32);
    uint * blockLinearImage = (uint *) malloc(imageSize);
-	convertToBlockLinear(image, blockLinearImage);	// @@ Do this on the GPU!
+	convertToBlockLinear(image, blockLinearImage);	// @@ Do this in parallel with the GPU, or in the GPU!

 	const uint blockNum = w * h;
 	const uint compressedSize = blockNum * 8;
-	const uint blockMax = 32768; // 49152, 65535

 	clock_t start = clock();

-    // Allocate image in device memory.
-    uint * d_data = NULL;
-    cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
-
-	// Allocate result.
-    uint * d_result = NULL;
-    cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
-
 	setupCompressKernel(compressionOptions.colorWeight.ptr());
 	
 	// TODO: Add support for multiple GPUs.
 	uint bn = 0;
 	while(bn != blockNum)
 	{
-		uint count = min(blockNum - bn, blockMax);
+		uint count = min(blockNum - bn, MAX_BLOCKS);

-	    cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

 		// Launch kernel.
-		compressKernel(count, d_data, d_result, d_bitmaps);
+		compressKernelDXT1(count, m_data, m_result, m_bitmapTable);

 		// Check for errors.
 		cudaError_t err = cudaGetLastError();
@ -153,7 +165,7 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou
 		}

 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);

 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@ -168,8 +180,6 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou
 	printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);

 	free(blockLinearImage);
-	cudaFree(d_data);
-	cudaFree(d_result);

 #else
 	if (outputOptions.errorHandler != NULL)
@ -181,13 +191,11 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou


 /// Compress image using CUDA.
-void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
+void CudaCompressor::compressDXT3(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
 {
 	nvDebugCheck(cuda::isHardwarePresent());
 #if defined HAVE_CUDA

-	doPrecomputation();
-
 	// Image size in blocks.
 	const uint w = (image->width() + 3) / 4;
 	const uint h = (image->height() + 3) / 4;
@ -198,18 +206,9 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou

 	const uint blockNum = w * h;
 	const uint compressedSize = blockNum * 8;
-	const uint blockMax = 32768; // 49152, 65535
-
-    // Allocate image in device memory.
-    uint * d_data = NULL;
-    cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
-
-	// Allocate result.
-    uint * d_result = NULL;
-    cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));

 	AlphaBlockDXT3 * alphaBlocks = NULL;
-	alphaBlocks = (AlphaBlockDXT3 *)malloc(min(compressedSize, blockMax * 8U));
+	alphaBlocks = (AlphaBlockDXT3 *)malloc(min(compressedSize, MAX_BLOCKS * 8U));

 	setupCompressKernel(compressionOptions.colorWeight.ptr());
 	
@ -218,12 +217,12 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou
 	uint bn = 0;
 	while(bn != blockNum)
 	{
-		uint count = min(blockNum - bn, blockMax);
+		uint count = min(blockNum - bn, MAX_BLOCKS);

-	    cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

 		// Launch kernel.
-		compressWeightedKernel(count, d_data, d_result, d_bitmaps);
+		compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);

 		// Compress alpha in parallel with the GPU.
 		for (uint i = 0; i < count; i++)
@ -245,7 +244,7 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou
 		}

 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);

 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@ -265,8 +264,6 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou

 	free(alphaBlocks);
 	free(blockLinearImage);
-	cudaFree(d_data);
-	cudaFree(d_result);

 #else
 	if (outputOptions.errorHandler != NULL)
@ -278,13 +275,11 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou


 /// Compress image using CUDA.
-void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
+void CudaCompressor::compressDXT5(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
 {
 	nvDebugCheck(cuda::isHardwarePresent());
 #if defined HAVE_CUDA

-	doPrecomputation();
-
 	// Image size in blocks.
 	const uint w = (image->width() + 3) / 4;
 	const uint h = (image->height() + 3) / 4;
@ -295,18 +290,9 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou

 	const uint blockNum = w * h;
 	const uint compressedSize = blockNum * 8;
-	const uint blockMax = 32768; // 49152, 65535
-
-    // Allocate image in device memory.
-    uint * d_data = NULL;
-    cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
-
-	// Allocate result.
-    uint * d_result = NULL;
-    cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));

 	AlphaBlockDXT5 * alphaBlocks = NULL;
-	alphaBlocks = (AlphaBlockDXT5 *)malloc(min(compressedSize, blockMax * 8U));
+	alphaBlocks = (AlphaBlockDXT5 *)malloc(min(compressedSize, MAX_BLOCKS * 8U));

 	setupCompressKernel(compressionOptions.colorWeight.ptr());
 	
@ -315,12 +301,12 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou
 	uint bn = 0;
 	while(bn != blockNum)
 	{
-		uint count = min(blockNum - bn, blockMax);
+		uint count = min(blockNum - bn, MAX_BLOCKS);

-	    cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

 		// Launch kernel.
-		compressWeightedKernel(count, d_data, d_result, d_bitmaps);
+		compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);

 		// Compress alpha in parallel with the GPU.
 		for (uint i = 0; i < count; i++)
@ -342,7 +328,7 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou
 		}

 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);

 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@ -362,8 +348,6 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou

 	free(alphaBlocks);
 	free(blockLinearImage);
-	cudaFree(d_data);
-	cudaFree(d_result);

 #else
 	if (outputOptions.errorHandler != NULL)
@ -375,7 +359,7 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou



-#if defined HAVE_CUDA
+#if 0

 class Task
 {
@ -469,7 +453,7 @@ public:
 		cudaMemcpy(d_blockLinearImage, blockLinearImage, blockCount * 64, cudaMemcpyHostToDevice);
 		
 		// Launch kernel.
-		compressKernel(blockCount, d_blockLinearImage, d_compressedImage, d_bitmaps);
+		compressKernelDXT1(blockCount, d_blockLinearImage, d_compressedImage, d_bitmaps);
 		
 		// Check for errors.
 		cudaError_t err = cudaGetLastError();
@ -511,8 +495,6 @@ private:
 	
 };

-#endif // defined HAVE_CUDA
-

 void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
 {
@ -523,9 +505,7 @@ void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private &
 	const uint blockNum = ((w + 3) / 4) * ((h + 3) / 4);
 	const uint blockMax = 32768; // 49152, 65535
 		
-	doPrecomputation();
-	
-	setupCompressKernel(compressionOptions.colorWeight.ptr());
+	setupCompressKernelDXT1(compressionOptions.colorWeight.ptr());

 	ColorBlock rgba;
 	Task task(min(blockNum, blockMax));
@ -559,4 +539,4 @@ void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private &
 #endif
 }

-
+#endif // 0
--- a/src/nvtt/cuda/CudaCompressDXT.h
+++ b/src/nvtt/cuda/CudaCompressDXT.h
@ -31,11 +31,24 @@ namespace nv
 {
 	class Image;

-	void cudaCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
-	void cudaCompressDXT3(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
-	void cudaCompressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	class CudaCompressor
+	{
+	public:
+		CudaCompressor();
+		~CudaCompressor();

-	void cudaCompressDXT1_2(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+		bool isValid() const;
+
+		void compressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+		void compressDXT3(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+		void compressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+
+	private:
+
+		uint * m_bitmapTable;
+		uint * m_data;
+		uint * m_result;
+	};

 } // nv namespace

--- a/src/nvtt/cuda/CudaMath.h
+++ b/src/nvtt/cuda/CudaMath.h
@ -82,6 +82,10 @@ inline __device__ __host__ void operator /=(float3 & b, float f)
    b.z *= inv;
 }

+inline __device__ __host__ bool operator ==(float3 a, float3 b)
+{
+	return a.x == b.x && a.y == b.y && a.z == b.z;
+}

 inline __device__ __host__ float dot(float3 a, float3 b)
 {
@ -131,15 +135,37 @@ inline __device__ __host__ float3 firstEigenVector( float matrix[6] )
 		float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
 		float m = max(max(x, y), z);        
 		float iv = 1.0f / m;
-		#if __DEVICE_EMULATION__
 		if (m == 0.0f) iv = 0.0f;
-		#endif
 		v = make_float3(x*iv, y*iv, z*iv);
 	}

 	return v;
 }

+inline __device__ bool singleColor(const float3 * colors)
+{
+#if __DEVICE_EMULATION__
+	bool sameColor = false;
+	for (int i = 0; i < 16; i++)
+	{
+		sameColor &= (colors[idx] == colors[0]);
+	}
+	return sameColor;
+#else
+	__shared__ int sameColor[16];
+	
+	const int idx = threadIdx.x;
+	
+	sameColor[idx] = (colors[idx] == colors[0]);
+	sameColor[idx] &= sameColor[idx^8];
+	sameColor[idx] &= sameColor[idx^4];
+	sameColor[idx] &= sameColor[idx^2];
+	sameColor[idx] &= sameColor[idx^1];
+	
+	return sameColor[0];
+#endif
+}
+
 inline __device__ void colorSums(const float3 * colors, float3 * sums)
 {
 #if __DEVICE_EMULATION__
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@ -49,6 +49,14 @@

 #define NVTT_VERSION 200

+#define NVTT_DECLARE_PIMPL(Class) \
+	private: \
+		Class(const Class &); \
+		void operator=(const Class &); \
+	public: \
+		struct Private; \
+		Private & m
+

 // Public interface.
 namespace nvtt
@ -89,6 +97,8 @@ namespace nvtt
 	/// Compression options. This class describes the desired compression format and other compression settings.
 	struct CompressionOptions
 	{
+		NVTT_DECLARE_PIMPL(CompressionOptions);
+
 		NVTT_API CompressionOptions();
 		NVTT_API ~CompressionOptions();
 		
@ -104,10 +114,6 @@ namespace nvtt
 		NVTT_API void setPixelFormat(unsigned int bitcount, unsigned int rmask, unsigned int gmask, unsigned int bmask, unsigned int amask);

 		NVTT_API void setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold = 127);
-
-	//private:
-		struct Private;
-		Private & m;
 	};


@ -170,6 +176,8 @@ namespace nvtt
 	/// Input options. Specify format and layout of the input texture.
 	struct InputOptions
 	{
+		NVTT_DECLARE_PIMPL(InputOptions);
+
 		NVTT_API InputOptions();
 		NVTT_API ~InputOptions();
 		
@ -214,10 +222,6 @@ namespace nvtt
 		// Set resizing options.
 		NVTT_API void setMaxExtents(int d);
 		NVTT_API void setRoundMode(RoundMode mode);
-		
-	//private:
-		struct Private;
-		Private & m;
 	};
 	
 	
@ -258,6 +262,8 @@ namespace nvtt
 	/// the compressor to the user.
 	struct OutputOptions
 	{
+		NVTT_DECLARE_PIMPL(OutputOptions);
+
 		NVTT_API OutputOptions();
 		NVTT_API ~OutputOptions();
 		
@ -269,16 +275,14 @@ namespace nvtt
 		NVTT_API void setOutputHandler(OutputHandler * outputHandler);
 		NVTT_API void setErrorHandler(ErrorHandler * errorHandler);
 		NVTT_API void setOutputHeader(bool outputHeader);
-		
-	//private:
-		struct Private;
-		Private & m;
 	};


 	/// Texture compressor.
 	struct Compressor
 	{
+		NVTT_DECLARE_PIMPL(Compressor);
+
 		NVTT_API Compressor();
 		NVTT_API ~Compressor();

@ -290,10 +294,6 @@ namespace nvtt
 		
 		// Estimate the size of compressing the input with the given options.
 		NVTT_API int estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions) const;
-
-	//private:
-		struct Private;
-		Private & m;
 	};
 	
 	
--- a/src/nvtt/squish/simd_ve.h
+++ b/src/nvtt/squish/simd_ve.h
@ -50,6 +50,16 @@ public:
 		return *this;
 	}
 	
+	Vec4( const float * v )
+	{
+		union { vector float v; float c[4]; } u;
+		u.c[0] = v[0];
+		u.c[1] = v[1];
+		u.c[2] = v[2];
+		u.c[3] = v[3];
+		m_v = u.v;
+	}
+
 	Vec4( float x, float y, float z, float w )
 	{
 		union { vector float v; float c[4]; } u;
--- a/src/nvtt/tools/imgdiff.cpp
+++ b/src/nvtt/tools/imgdiff.cpp
@ -130,10 +130,13 @@ struct NormalError

 	void done()
 	{
-		ade /= samples;
-		mse /= samples * 3;
-		rmse = sqrt(mse);
-		psnr = (rmse == 0) ? 999.0f : 20.0f * log10(255.0f / rmse);
+		if (samples)
+		{
+			ade /= samples;
+			mse /= samples * 3;
+			rmse = sqrt(mse);
+			psnr = (rmse == 0) ? 999.0f : 20.0f * log10(255.0f / rmse);
+		}
 	}

 	void print()
Author	SHA1	Message	Date
castano	fa4a7b9af7	Add proper credits.	2008-02-28 22:52:00 +00:00
castano	f111d23637	Tag 2.0.1	2008-02-28 22:37:49 +00:00
castano	bce983f39e	Add post build command to copy header files.	2008-02-28 22:07:08 +00:00
castano	ff93ad41cb	Fix end of lines.	2008-02-28 21:45:46 +00:00
castano	56c7771100	Fix end of lines.	2008-02-28 21:45:26 +00:00
castano	ccced843e3	Use smaller allocations to prevent errors. Check for allocation errors.	2008-02-28 21:45:04 +00:00
castano	dafe2b8841	Hide copy ctor and operator to prevent compiler warnings. Wrap pimpl using NVTT_DECLARE_PIMPL macro.	2008-02-28 21:14:40 +00:00
castano	e3e7fcb226	Check cuda errors to find out whether the cuda context initialization succeeded.	2008-02-28 17:52:32 +00:00
castano	970395fba8	Fix osx build.	2008-02-28 17:02:29 +00:00
castano	8a24a93e2f	Disable CUDA when memory allocations fail.	2008-02-28 16:06:27 +00:00
 @ -1 +1 @@
 .0.0
 .0.1