Add proper credits.

Tag 2.0.1
Add post build command to copy header files.
2008-02-28 22:52:00 +00:00 · 2008-02-28 22:37:49 +00:00 · 2008-02-28 22:07:08 +00:00 · 2008-02-28 21:45:46 +00:00 · 2008-02-28 21:45:26 +00:00 · 2008-02-28 21:45:04 +00:00
21 changed files with 1459 additions and 1289 deletions
--- a/6
+++ b/6
@ -1,3 +1,9 @@
+NVIDIA Texture Tools version 2.0.1
+ * Fix memory leaks.
+ * Pre-allocate device memory for CUDA compressor.
+ * Add single color compressor. Thanks to Amir Ebrahimi.
+ * Better CUDA error checking.
+
 NVIDIA Texture Tools version 2.0.0
 * Fixed PSNR formula in nvimgdiff.
 * Added support for arbitrary RGB formats.
--- a/NVIDIA_Texture_Tools_README.txt
+++ b/NVIDIA_Texture_Tools_README.txt
@ -2,7 +2,7 @@
 --------------------------------------------------------------------------------
 NVIDIA Texture Tools
 README.txt
-Version 2.0.0
+Version 2.0
 --------------------------------------------------------------------------------
 --------------------------------------------------------------------------------

--- a/2
+++ b/2
@ -1 +1 @@
-2.0.0
+2.0.1
--- a/project/vc8/nvcore/nvcore.vcproj
+++ b/project/vc8/nvcore/nvcore.vcproj
@ -327,6 +327,10 @@
 				RelativePath="..\..\..\src\nvcore\nvcore.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\..\src\nvcore\Ptr.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\..\src\nvcore\StrLib.h"
 				>
--- a/project/vc8/nvtt/nvtt.rc
+++ b/project/vc8/nvtt/nvtt.rc
@ -53,8 +53,8 @@ END
 //

 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 2,0,0,0
- PRODUCTVERSION 2,0,0,0
+ FILEVERSION 2,0,1,0
+ PRODUCTVERSION 2,0,1,0
 FILEFLAGSMASK 0x17L
 #ifdef _DEBUG
 FILEFLAGS 0x1L
@ -71,12 +71,12 @@ BEGIN
        BEGIN
            VALUE "CompanyName", "NVIDIA Corporation"
            VALUE "FileDescription", "NVIDIA Texture Tools Dynamic Link Library"
-            VALUE "FileVersion", "2, 0, 0, 0"
+            VALUE "FileVersion", "2, 0, 1, 0"
            VALUE "InternalName", "nvtt"
            VALUE "LegalCopyright", "Copyright (C) 2007"
            VALUE "OriginalFilename", "nvtt.dll"
            VALUE "ProductName", "NVIDIA Texture Tools Dynamic Link Library"
-            VALUE "ProductVersion", "2, 0, 0, 0"
+            VALUE "ProductVersion", "2, 0, 1, 0"
        END
    END
    BLOCK "VarFileInfo"
--- a/project/vc8/nvtt/nvtt.vcproj
+++ b/project/vc8/nvtt/nvtt.vcproj
@ -96,6 +96,8 @@
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
+				Description="Copying header files..."
+				CommandLine="xcopy /y /f /i $(SolutionDir)\..\..\src\nvtt\nvtt*.h $(SolutionDir)\$(ConfigurationName).$(PlatformName)\include\"
 			/>
 		</Configuration>
 		<Configuration
@ -258,6 +260,8 @@
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
+				Description="Copying header files..."
+				CommandLine="xcopy /y /f /i $(SolutionDir)\..\..\src\nvtt\nvtt*.h $(SolutionDir)\$(ConfigurationName).$(PlatformName)\include\"
 			/>
 		</Configuration>
 		<Configuration
@ -420,6 +424,8 @@
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
+				Description="Copying header files..."
+				CommandLine="xcopy /y /f /i $(SolutionDir)\..\..\src\nvtt\nvtt*.h $(SolutionDir)\$(ConfigurationName).$(PlatformName)\include\"
 			/>
 		</Configuration>
 		<Configuration
@ -578,6 +584,8 @@
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
+				Description="Copying header files..."
+				CommandLine="xcopy /y /f /i $(SolutionDir)\..\..\src\nvtt\nvtt*.h $(SolutionDir)\$(ConfigurationName).$(PlatformName)\include\"
 			/>
 		</Configuration>
 		<Configuration
--- a/src/nvimage/ColorBlock.cpp
+++ b/src/nvimage/ColorBlock.cpp
@ -110,6 +110,19 @@ void ColorBlock::splatY()
 	}
 }

+/// Check whether all 16 entries of the block hold the same color.
+/// Each entry is compared against the first one; the scan stops at the first mismatch.
+bool ColorBlock::isSingleColor() const
+{
+	int index = 1;
+	while (index < 16)
+	{
+		if (m_color[0] != m_color[index]) return false;
+		++index;
+	}
+
+	return true;
+}

 /// Count number of unique colors in this color block.
 uint ColorBlock::countUniqueColors() const
--- a/src/nvimage/ColorBlock.h
+++ b/src/nvimage/ColorBlock.h
@ -24,6 +24,7 @@ namespace nv
 		void splatX();
 		void splatY();
 		
+		bool isSingleColor() const;
 		uint countUniqueColors() const;
 		Color32 averageColor() const;
 		bool hasAlpha() const;
--- a/src/nvtt/CompressDXT.cpp
+++ b/src/nvtt/CompressDXT.cpp
@ -69,7 +69,14 @@ void nv::fastCompressDXT1(const Image * image, const OutputOptions::Private & ou
 		for (uint x = 0; x < w; x += 4) {
 			rgba.init(image, x, y);
 			
-			QuickCompress::compressDXT1(rgba, &block);
+			if (rgba.isSingleColor())
+			{
+				QuickCompress::compressDXT1(rgba.color(0), &block);
+			}
+			else
+			{
+				QuickCompress::compressDXT1(rgba, &block);
+			}
 			
 			if (outputOptions.outputHandler != NULL) {
 				outputOptions.outputHandler->writeData(&block, sizeof(block));
@ -214,10 +221,16 @@ void nv::compressDXT1(const Image * image, const OutputOptions::Private & output
 			
 			rgba.init(image, x, y);
 			
-			// Compress color.
-			squish::ColourSet colours((uint8 *)rgba.colors(), 0);
-			fit.SetColourSet(&colours, squish::kDxt1);
-			fit.Compress(&block);
+			if (rgba.isSingleColor())
+			{
+				QuickCompress::compressDXT1(rgba.color(0), &block);
+			}
+			else
+			{
+				squish::ColourSet colours((uint8 *)rgba.colors(), 0);
+				fit.SetColourSet(&colours, squish::kDxt1);
+				fit.Compress(&block);
+			}
 			
 			if (outputOptions.outputHandler != NULL) {
 				outputOptions.outputHandler->writeData(&block, sizeof(block));
--- a/src/nvtt/Compressor.cpp
+++ b/src/nvtt/Compressor.cpp
@ -205,16 +205,25 @@ namespace nvtt

+/// Create the compressor together with its private state. CUDA acceleration is
+/// enabled only when hardware is reported present and the CudaCompressor created
+/// here reports itself valid.
 Compressor::Compressor() : m(*new Compressor::Private())
 {
+	// CUDA initialization.
 	m.cudaSupported = cuda::isHardwarePresent();
 	m.cudaEnabled = m.cudaSupported;

-	// @@ Do CUDA initialization here.
+	if (m.cudaEnabled)
+	{
+		m.cuda = new CudaCompressor();

+		if (!m.cuda->isValid())
+		{
+			// Disable CUDA for this compressor.
+			// NOTE(review): assumes assigning NULL to the AutoPtr releases the CudaCompressor -- confirm in nvcore/Ptr.h.
+			m.cudaEnabled = false;
+			m.cuda = NULL;
+		}
+	}
 }

 Compressor::~Compressor()
 {
-	// @@ Free CUDA resources here.
+	delete &m;
 }


@ -225,6 +234,17 @@ void Compressor::enableCudaAcceleration(bool enable)
 	{
 		m.cudaEnabled = enable;
 	}
+
+	if (m.cudaEnabled && m.cuda == NULL)
+	{
+		m.cuda = new CudaCompressor();
+		
+		if (!m.cuda->isValid())
+		{
+			m.cudaEnabled = false;
+			m.cuda = NULL;
+		}
+	}
 }

 /// Check if CUDA acceleration is enabled.
@ -670,7 +690,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
 			if (cudaEnabled)
 			{
 				nvDebugCheck(cudaSupported);
-				cudaCompressDXT1(image, outputOptions, compressionOptions);
+				cuda->compressDXT1(image, outputOptions, compressionOptions);
 			}
 			else
 			{
@ -708,7 +728,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
 			if (cudaEnabled)
 			{
 				nvDebugCheck(cudaSupported);
-				cudaCompressDXT3(image, outputOptions, compressionOptions);
+				cuda->compressDXT3(image, outputOptions, compressionOptions);
 			}
 			else
 			{
@ -727,7 +747,7 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const Compressio
 			if (cudaEnabled)
 			{
 				nvDebugCheck(cudaSupported);
-				cudaCompressDXT5(image, outputOptions, compressionOptions);
+				cuda->compressDXT5(image, outputOptions, compressionOptions);
 			}
 			else
 			{
--- a/src/nvtt/Compressor.h
+++ b/src/nvtt/Compressor.h
@ -24,6 +24,10 @@
 #ifndef NV_TT_COMPRESSOR_H
 #define NV_TT_COMPRESSOR_H

+#include <nvcore/Ptr.h>
+
+#include <nvtt/cuda/CudaCompressDXT.h>
+
 #include "nvtt.h"

 namespace nv
@ -63,6 +67,9 @@ namespace nvtt

 		bool cudaSupported;
 		bool cudaEnabled;
+
+		nv::AutoPtr<nv::CudaCompressor> cuda;
+
 	};

 } // nvtt namespace
--- a/src/nvtt/QuickCompressDXT.cpp
+++ b/src/nvtt/QuickCompressDXT.cpp
@ -353,12 +353,18 @@ static void optimizeAlpha8(const ColorBlock & rgba, AlphaBlockDXT5 * block)
+/// Encode a block that consists of the single color 'c'.
+/// Both endpoints are read from the OMatch lookup tables, indexed by the 8-bit
+/// channel value: OMatch5 for red and blue, OMatch6 for green. Column 0 of a table
+/// entry goes to col0 and column 1 to col1.
 void QuickCompress::compressDXT1(Color32 c, BlockDXT1 * dxtBlock)
 {
 	dxtBlock->col0.r = OMatch5[c.r][0];
-	dxtBlock->col0.g = OMatch5[c.g][0];
+	dxtBlock->col0.g = OMatch6[c.g][0];
 	dxtBlock->col0.b = OMatch5[c.b][0];
 	dxtBlock->col1.r = OMatch5[c.r][1];
-	dxtBlock->col1.g = OMatch5[c.g][1];
+	dxtBlock->col1.g = OMatch6[c.g][1];
 	dxtBlock->col1.b = OMatch5[c.b][1];
+	// 0xaaaaaaaa is the bit pattern 10 repeated, i.e. every 2-bit index is set to 2.
 	dxtBlock->indices = 0xaaaaaaaa;
+
+	// Keep col0 >= col1. After swapping the endpoints the low bit of every index is
+	// flipped (2 becomes 3).
+	// NOTE(review): presumably this keeps the block in DXT1 four-color mode while
+	// selecting the same interpolated color -- confirm against the DXT1 format description.
+	if (dxtBlock->col0.u < dxtBlock->col1.u)
+	{
+		swap(dxtBlock->col0.u, dxtBlock->col1.u);
+		dxtBlock->indices ^= 0x55555555;
+	}
 }

 void QuickCompress::compressDXT1(const ColorBlock & rgba, BlockDXT1 * dxtBlock)
--- a/src/nvtt/SingleColorLookup.h
+++ b/src/nvtt/SingleColorLookup.h
@ -48,7 +48,12 @@ void initTables()
 };
 */

-const static uint8 OMatch5[256][2] =
+#if __CUDACC__
+__constant__ unsigned short
+#else
+const static uint8 
+#endif
+OMatch5[256][2] =
 {
 	{0x00, 0x00},
 	{0x00, 0x00},
@ -308,7 +313,12 @@ const static uint8 OMatch5[256][2] =
 	{0x1F, 0x1F},
 };

-const static uint8 OMatch6[256][2] =
+#if __CUDACC__
+__constant__ unsigned short
+#else
+const static uint8
+#endif
+OMatch6[256][2] =
 {
 	{0x00, 0x00},
 	{0x00, 0x01},
--- a/src/nvtt/cuda/Bitmaps.h
+++ b/src/nvtt/cuda/Bitmaps.h
@ -122,7 +122,7 @@ static void doPrecomputation()
 */


-const static uint bitmaps[992] =
+const static uint s_bitmapTable[992] =
 {
 	0x80000000,
 	0x40000000,
--- a/src/nvtt/cuda/CompressKernel.cu
+++ b/src/nvtt/cuda/CompressKernel.cu
--- a/src/nvtt/cuda/CudaCompressDXT.cpp
+++ b/src/nvtt/cuda/CudaCompressDXT.cpp
@ -48,29 +48,14 @@ using namespace nvtt;

 #if defined HAVE_CUDA

+#define MAX_BLOCKS 8192U // 32768, 65535
+
+
 extern "C" void setupCompressKernel(const float weights[3]);
-extern "C" void compressKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
-extern "C" void compressWeightedKernel(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
+extern "C" void compressKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);
+extern "C" void compressWeightedKernelDXT1(uint blockNum, uint * d_data, uint * d_result, uint * d_bitmaps);

-#include "Bitmaps.h"
-
-// @@ Store this pointer in CompressionOptions. Allocate in ctor, free in dtor.
-static uint * d_bitmaps = NULL;
-
-static void doPrecomputation()
-{
-	if (d_bitmaps != NULL) {
-		return;
-	}
-
-    // Upload bitmaps.
-    cudaMalloc((void**) &d_bitmaps, 992 * sizeof(uint));
-    cudaMemcpy(d_bitmaps, bitmaps, 992 * sizeof(uint), cudaMemcpyHostToDevice);
-
-	// @@ Check for errors.
-
-	// @@ Free allocated memory.
-}
+#include "Bitmaps.h"	// @@ Rename to BitmapTable.h

 // Convert linear image to block linear.
 static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
@ -92,53 +77,80 @@ static void convertToBlockLinear(const Image * image, uint * blockLinearImage)
 	}
 }

-#endif // defined HAVE_CUDA
+#endif
+
+
+/// Allocate the device buffers used by the compress methods.
+/// All three pointers start as NULL. When HAVE_CUDA is not defined nothing is
+/// allocated and they stay NULL. Return codes of the CUDA calls are not inspected
+/// here; isValid() is the way to find out whether construction succeeded.
+CudaCompressor::CudaCompressor() : m_bitmapTable(NULL), m_data(NULL), m_result(NULL)
+{
+#if defined HAVE_CUDA
+    // Allocate and upload bitmaps.
+    cudaMalloc((void**) &m_bitmapTable, 992 * sizeof(uint));
+	if (m_bitmapTable != NULL)
+	{
+		cudaMemcpy(m_bitmapTable, s_bitmapTable, 992 * sizeof(uint), cudaMemcpyHostToDevice);
+	}
+
+	// Allocate scratch buffers.
+	// Sized for MAX_BLOCKS blocks per kernel launch: 64 bytes of input and 8 bytes of output per block.
+    cudaMalloc((void**) &m_data, MAX_BLOCKS * 64U);
+    cudaMalloc((void**) &m_result, MAX_BLOCKS * 8U);
+#endif
+}
+
+/// Release the device buffers held by this compressor.
+/// Does nothing when HAVE_CUDA is not defined.
+CudaCompressor::~CudaCompressor()
+{
+#if defined HAVE_CUDA
+	// Hand every device allocation back to the CUDA runtime.
+	uint * const deviceBuffers[3] = { m_data, m_result, m_bitmapTable };
+	for (int i = 0; i < 3; i++)
+	{
+		cudaFree(deviceBuffers[i]);
+	}
+#endif
+}
+
+/// Report whether the compressor can be used.
+/// Returns false when the CUDA runtime reports a pending error or when any of the
+/// three device buffers is NULL. Without HAVE_CUDA the buffers are never allocated,
+/// so the result is false.
+bool CudaCompressor::isValid() const
+{
+#if defined HAVE_CUDA
+	// NOTE(review): per the CUDA runtime documentation cudaGetLastError() also clears
+	// the stored error, so this check is presumably not repeatable and may pick up an
+	// error left by an unrelated earlier CUDA call -- confirm.
+	if (cudaGetLastError() != cudaSuccess)
+   	{
+		return false;
+	}
+#endif
+	return m_data != NULL && m_result != NULL && m_bitmapTable != NULL;
+}

 // @@ This code is very repetitive and needs to be cleaned up.


 /// Compress image using CUDA.
-void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
+void CudaCompressor::compressDXT1(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
 {
 	nvDebugCheck(cuda::isHardwarePresent());
 #if defined HAVE_CUDA

-	doPrecomputation();
-
 	// Image size in blocks.
 	const uint w = (image->width() + 3) / 4;
 	const uint h = (image->height() + 3) / 4;

 	uint imageSize = w * h * 16 * sizeof(Color32);
    uint * blockLinearImage = (uint *) malloc(imageSize);
-	convertToBlockLinear(image, blockLinearImage);	// @@ Do this on the GPU!
+	convertToBlockLinear(image, blockLinearImage);	// @@ Do this in parallel with the GPU, or in the GPU!

 	const uint blockNum = w * h;
 	const uint compressedSize = blockNum * 8;
-	const uint blockMax = 32768; // 49152, 65535

 	clock_t start = clock();

-    // Allocate image in device memory.
-    uint * d_data = NULL;
-    cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
-
-	// Allocate result.
-    uint * d_result = NULL;
-    cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));
-
 	setupCompressKernel(compressionOptions.colorWeight.ptr());
 	
 	// TODO: Add support for multiple GPUs.
 	uint bn = 0;
 	while(bn != blockNum)
 	{
-		uint count = min(blockNum - bn, blockMax);
+		uint count = min(blockNum - bn, MAX_BLOCKS);

-	    cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

 		// Launch kernel.
-		compressKernel(count, d_data, d_result, d_bitmaps);
+		compressKernelDXT1(count, m_data, m_result, m_bitmapTable);

 		// Check for errors.
 		cudaError_t err = cudaGetLastError();
@ -153,7 +165,7 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou
 		}

 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);

 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@ -168,8 +180,6 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou
 	printf("\rCUDA time taken: %.3f seconds\n", float(end-start) / CLOCKS_PER_SEC);

 	free(blockLinearImage);
-	cudaFree(d_data);
-	cudaFree(d_result);

 #else
 	if (outputOptions.errorHandler != NULL)
@ -181,13 +191,11 @@ void nv::cudaCompressDXT1(const Image * image, const OutputOptions::Private & ou


 /// Compress image using CUDA.
-void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
+void CudaCompressor::compressDXT3(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
 {
 	nvDebugCheck(cuda::isHardwarePresent());
 #if defined HAVE_CUDA

-	doPrecomputation();
-
 	// Image size in blocks.
 	const uint w = (image->width() + 3) / 4;
 	const uint h = (image->height() + 3) / 4;
@ -198,18 +206,9 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou

 	const uint blockNum = w * h;
 	const uint compressedSize = blockNum * 8;
-	const uint blockMax = 32768; // 49152, 65535
-
-    // Allocate image in device memory.
-    uint * d_data = NULL;
-    cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
-
-	// Allocate result.
-    uint * d_result = NULL;
-    cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));

 	AlphaBlockDXT3 * alphaBlocks = NULL;
-	alphaBlocks = (AlphaBlockDXT3 *)malloc(min(compressedSize, blockMax * 8U));
+	alphaBlocks = (AlphaBlockDXT3 *)malloc(min(compressedSize, MAX_BLOCKS * 8U));

 	setupCompressKernel(compressionOptions.colorWeight.ptr());
 	
@ -218,12 +217,12 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou
 	uint bn = 0;
 	while(bn != blockNum)
 	{
-		uint count = min(blockNum - bn, blockMax);
+		uint count = min(blockNum - bn, MAX_BLOCKS);

-	    cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

 		// Launch kernel.
-		compressWeightedKernel(count, d_data, d_result, d_bitmaps);
+		compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);

 		// Compress alpha in parallel with the GPU.
 		for (uint i = 0; i < count; i++)
@ -245,7 +244,7 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou
 		}

 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);

 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@ -265,8 +264,6 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou

 	free(alphaBlocks);
 	free(blockLinearImage);
-	cudaFree(d_data);
-	cudaFree(d_result);

 #else
 	if (outputOptions.errorHandler != NULL)
@ -278,13 +275,11 @@ void nv::cudaCompressDXT3(const Image * image, const OutputOptions::Private & ou


 /// Compress image using CUDA.
-void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
+void CudaCompressor::compressDXT5(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
 {
 	nvDebugCheck(cuda::isHardwarePresent());
 #if defined HAVE_CUDA

-	doPrecomputation();
-
 	// Image size in blocks.
 	const uint w = (image->width() + 3) / 4;
 	const uint h = (image->height() + 3) / 4;
@ -295,18 +290,9 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou

 	const uint blockNum = w * h;
 	const uint compressedSize = blockNum * 8;
-	const uint blockMax = 32768; // 49152, 65535
-
-    // Allocate image in device memory.
-    uint * d_data = NULL;
-    cudaMalloc((void**) &d_data, min(imageSize, blockMax * 64U));
-
-	// Allocate result.
-    uint * d_result = NULL;
-    cudaMalloc((void**) &d_result, min(compressedSize, blockMax * 8U));

 	AlphaBlockDXT5 * alphaBlocks = NULL;
-	alphaBlocks = (AlphaBlockDXT5 *)malloc(min(compressedSize, blockMax * 8U));
+	alphaBlocks = (AlphaBlockDXT5 *)malloc(min(compressedSize, MAX_BLOCKS * 8U));

 	setupCompressKernel(compressionOptions.colorWeight.ptr());
 	
@ -315,12 +301,12 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou
 	uint bn = 0;
 	while(bn != blockNum)
 	{
-		uint count = min(blockNum - bn, blockMax);
+		uint count = min(blockNum - bn, MAX_BLOCKS);

-	    cudaMemcpy(d_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);
+	    cudaMemcpy(m_data, blockLinearImage + bn * 16, count * 64, cudaMemcpyHostToDevice);

 		// Launch kernel.
-		compressWeightedKernel(count, d_data, d_result, d_bitmaps);
+		compressWeightedKernelDXT1(count, m_data, m_result, m_bitmapTable);

 		// Compress alpha in parallel with the GPU.
 		for (uint i = 0; i < count; i++)
@ -342,7 +328,7 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou
 		}

 		// Copy result to host, overwrite swizzled image.
-		cudaMemcpy(blockLinearImage, d_result, count * 8, cudaMemcpyDeviceToHost);
+		cudaMemcpy(blockLinearImage, m_result, count * 8, cudaMemcpyDeviceToHost);

 		// Output result.
 		if (outputOptions.outputHandler != NULL)
@ -362,8 +348,6 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou

 	free(alphaBlocks);
 	free(blockLinearImage);
-	cudaFree(d_data);
-	cudaFree(d_result);

 #else
 	if (outputOptions.errorHandler != NULL)
@ -375,7 +359,7 @@ void nv::cudaCompressDXT5(const Image * image, const OutputOptions::Private & ou



-#if defined HAVE_CUDA
+#if 0

 class Task
 {
@ -469,7 +453,7 @@ public:
 		cudaMemcpy(d_blockLinearImage, blockLinearImage, blockCount * 64, cudaMemcpyHostToDevice);
 		
 		// Launch kernel.
-		compressKernel(blockCount, d_blockLinearImage, d_compressedImage, d_bitmaps);
+		compressKernelDXT1(blockCount, d_blockLinearImage, d_compressedImage, d_bitmaps);
 		
 		// Check for errors.
 		cudaError_t err = cudaGetLastError();
@ -511,8 +495,6 @@ private:
 	
 };

-#endif // defined HAVE_CUDA
-

 void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private & outputOptions, const CompressionOptions::Private & compressionOptions)
 {
@ -522,10 +504,8 @@ void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private &
 	
 	const uint blockNum = ((w + 3) / 4) * ((h + 3) / 4);
 	const uint blockMax = 32768; // 49152, 65535
-	
-	doPrecomputation();
-	
-	setupCompressKernel(compressionOptions.colorWeight.ptr());
+		
+	setupCompressKernelDXT1(compressionOptions.colorWeight.ptr());

 	ColorBlock rgba;
 	Task task(min(blockNum, blockMax));
@ -559,4 +539,4 @@ void nv::cudaCompressDXT1_2(const Image * image, const OutputOptions::Private &
 #endif
 }

-
+#endif // 0
--- a/src/nvtt/cuda/CudaCompressDXT.h
+++ b/src/nvtt/cuda/CudaCompressDXT.h
@ -31,11 +31,24 @@ namespace nv
 {
 	class Image;

-	void cudaCompressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
-	void cudaCompressDXT3(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
-	void cudaCompressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+	/// DXT compressor that runs the color compression on the GPU.
+	/// The object owns three device buffers: they are allocated in the constructor
+	/// and freed in the destructor. Check isValid() after construction before
+	/// calling any of the compress methods.
+	class CudaCompressor
+	{
+	public:
+		CudaCompressor();
+		~CudaCompressor();

-	void cudaCompressDXT1_2(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+		/// True when no CUDA error is pending and all device buffers are allocated.
+		bool isValid() const;
+
+		void compressDXT1(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+		void compressDXT3(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+		void compressDXT5(const Image * image, const nvtt::OutputOptions::Private & outputOptions, const nvtt::CompressionOptions::Private & compressionOptions);
+
+	private:
+
+		// Not copyable: a copy would share the raw device pointers below and both
+		// destructors would free them. Declared private and left undefined.
+		CudaCompressor(const CudaCompressor &);
+		void operator=(const CudaCompressor &);
+
+		uint * m_bitmapTable;	// Device copy of the bitmap table.
+		uint * m_data;		// Device scratch buffer for input blocks.
+		uint * m_result;	// Device scratch buffer for compressed blocks.
+	};

 } // nv namespace

--- a/src/nvtt/cuda/CudaMath.h
+++ b/src/nvtt/cuda/CudaMath.h
@ -1,221 +1,247 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-// Math functions and operators to be used with vector types.
-
-#ifndef CUDAMATH_H
-#define CUDAMATH_H
-
-#include <float.h>
-
-
-inline __device__ __host__ float3 operator *(float3 a, float3 b)
-{
-    return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
-}
-
-inline __device__ __host__ float3 operator *(float f, float3 v)
-{
-    return make_float3(v.x*f, v.y*f, v.z*f);
-}
-
-inline __device__ __host__ float3 operator *(float3 v, float f)
-{
-    return make_float3(v.x*f, v.y*f, v.z*f);
-}
-
-inline __device__ __host__ float3 operator +(float3 a, float3 b)
-{
-    return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
-}
-
-inline __device__ __host__ void operator +=(float3 & b, float3 a)
-{
-    b.x += a.x;
-    b.y += a.y;
-    b.z += a.z;
-}
-
-inline __device__ __host__ float3 operator -(float3 a, float3 b)
-{
-    return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
-}
-
-inline __device__ __host__ void operator -=(float3 & b, float3 a)
-{
-    b.x -= a.x;
-    b.y -= a.y;
-    b.z -= a.z;
-}
-
-inline __device__ __host__ float3 operator /(float3 v, float f)
-{
-    float inv = 1.0f / f;
-    return v * inv;
-}
-
-inline __device__ __host__ void operator /=(float3 & b, float f)
-{
-    float inv = 1.0f / f;
-    b.x *= inv;
-    b.y *= inv;
-    b.z *= inv;
-}
-
-
-inline __device__ __host__ float dot(float3 a, float3 b)
-{
-    return a.x * b.x + a.y * b.y + a.z * b.z;
-}
-
-inline __device__ __host__ float dot(float4 a, float4 b)
-{
-    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
-}
-
-inline __device__ __host__ float clamp(float f, float a, float b)
-{
-    return max(a, min(f, b));
-}
-
-inline __device__ __host__ float3 clamp(float3 v, float a, float b)
-{
-    return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
-}
-
-inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
-{
-    return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
-}
-
-
-inline __device__ __host__ float3 normalize(float3 v)
-{
-    float len = 1.0f / sqrtf(dot(v, v));
-    return make_float3(v.x * len, v.y * len, v.z * len);
-}
-
-
-
-
-// Use power method to find the first eigenvector.
-// http://www.miislita.com/information-retrieval-tutorial/matrix-tutorial-3-eigenvalues-eigenvectors.html
-inline __device__ __host__ float3 firstEigenVector( float matrix[6] )
-{
-	// 8 iterations seems to be more than enough.
-
-	float3 v = make_float3(1.0f, 1.0f, 1.0f);
-	for(int i = 0; i < 8; i++) {
-		float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
-		float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
-		float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
-		float m = max(max(x, y), z);        
-		float iv = 1.0f / m;
-		#if __DEVICE_EMULATION__
-		if (m == 0.0f) iv = 0.0f;
-		#endif
-		v = make_float3(x*iv, y*iv, z*iv);
-	}
-
-	return v;
-}
-
-inline __device__ void colorSums(const float3 * colors, float3 * sums)
-{
-#if __DEVICE_EMULATION__
-	float3 color_sum = make_float3(0.0f, 0.0f, 0.0f);
-	for (int i = 0; i < 16; i++)
-	{
-		color_sum += colors[i];
-	}
-
-	for (int i = 0; i < 16; i++)
-	{
-		sums[i] = color_sum;
-	}
-#else
-
-	const int idx = threadIdx.x;
-
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+// Math functions and operators to be used with vector types.
+
+#ifndef CUDAMATH_H
+#define CUDAMATH_H
+
+#include <float.h>
+
+
+inline __device__ __host__ float3 operator *(float3 a, float3 b)
+{
+    return make_float3(a.x*b.x, a.y*b.y, a.z*b.z);
+}
+
+inline __device__ __host__ float3 operator *(float f, float3 v)
+{
+    return make_float3(v.x*f, v.y*f, v.z*f);
+}
+
+inline __device__ __host__ float3 operator *(float3 v, float f)
+{
+    return make_float3(v.x*f, v.y*f, v.z*f);
+}
+
+inline __device__ __host__ float3 operator +(float3 a, float3 b)
+{
+    return make_float3(a.x+b.x, a.y+b.y, a.z+b.z);
+}
+
+inline __device__ __host__ void operator +=(float3 & b, float3 a)
+{
+    b.x += a.x;
+    b.y += a.y;
+    b.z += a.z;
+}
+
+inline __device__ __host__ float3 operator -(float3 a, float3 b)
+{
+    return make_float3(a.x-b.x, a.y-b.y, a.z-b.z);
+}
+
+inline __device__ __host__ void operator -=(float3 & b, float3 a)
+{
+    b.x -= a.x;
+    b.y -= a.y;
+    b.z -= a.z;
+}
+
+inline __device__ __host__ float3 operator /(float3 v, float f)
+{
+    float inv = 1.0f / f;
+    return v * inv;
+}
+
+inline __device__ __host__ void operator /=(float3 & b, float f)
+{
+    float inv = 1.0f / f;
+    b.x *= inv;
+    b.y *= inv;
+    b.z *= inv;
+}
+
+inline __device__ __host__ bool operator ==(float3 a, float3 b)
+{
+	return a.x == b.x && a.y == b.y && a.z == b.z;
+}
+
+inline __device__ __host__ float dot(float3 a, float3 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+
+inline __device__ __host__ float dot(float4 a, float4 b)
+{
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+inline __device__ __host__ float clamp(float f, float a, float b)
+{
+    return max(a, min(f, b));
+}
+
+inline __device__ __host__ float3 clamp(float3 v, float a, float b)
+{
+    return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+
+inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b)
+{
+    return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+
+
+inline __device__ __host__ float3 normalize(float3 v)
+{
+    float len = 1.0f / sqrtf(dot(v, v));
+    return make_float3(v.x * len, v.y * len, v.z * len);
+}
+
+
+
+
+// Use power method to find the first eigenvector.
+// http://www.miislita.com/information-retrieval-tutorial/matrix-tutorial-3-eigenvalues-eigenvectors.html
+//
+// 'matrix' is a symmetric 3x3 matrix stored as its six unique entries in the order
+// xx, xy, xz, yy, yz, zz (the layout bestFitLine fills in for the covariance).
+// The returned vector is not unit length: after every iteration it is divided by
+// its largest component.
+inline __device__ __host__ float3 firstEigenVector( float matrix[6] )
+{
+	// 8 iterations seems to be more than enough.
+
+	float3 v = make_float3(1.0f, 1.0f, 1.0f);
+	for(int i = 0; i < 8; i++) {
+		// Multiply the current estimate by the symmetric matrix.
+		float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
+		float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
+		float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
+		float m = max(max(x, y), z);        
+		float iv = 1.0f / m;
+		// Avoid scaling by infinity when the largest component is exactly zero; the estimate becomes the zero vector instead.
+		if (m == 0.0f) iv = 0.0f;
+		v = make_float3(x*iv, y*iv, z*iv);
+	}
+
+	return v;
+}
+
+/// Returns true when all 16 colors of the block are exactly equal.
+/// 'colors' points to the 16 colors of the block; each one is compared with colors[0].
+inline __device__ bool singleColor(const float3 * colors)
+{
+#if __DEVICE_EMULATION__
+	// Serial version. The accumulator has to start as true: it is only ever AND-ed,
+	// so starting from false would report "not single color" for every block.
+	bool sameColor = true;
+	for (int i = 0; i < 16; i++)
+	{
+		sameColor &= (colors[i] == colors[0]);
+	}
+	return sameColor;
+#else
+	__shared__ int sameColor[16];
+
+	const int idx = threadIdx.x;
+
+	// Each thread stores its own comparison result, then folds in the flags of the
+	// threads whose index differs in one bit (8, 4, 2, 1), so that sameColor[0]
+	// ends up as the AND of all 16 flags.
+	// NOTE(review): there is no explicit synchronization between these steps;
+	// presumably the 16 threads are expected to execute in lockstep -- confirm.
+	sameColor[idx] = (colors[idx] == colors[0]);
+	sameColor[idx] &= sameColor[idx^8];
+	sameColor[idx] &= sameColor[idx^4];
+	sameColor[idx] &= sameColor[idx^2];
+	sameColor[idx] &= sameColor[idx^1];
+
+	return sameColor[0];
+#endif
+}
+
+inline __device__ void colorSums(const float3 * colors, float3 * sums)
+{
+#if __DEVICE_EMULATION__
+	float3 color_sum = make_float3(0.0f, 0.0f, 0.0f);
+	for (int i = 0; i < 16; i++)
+	{
+		color_sum += colors[i];
+	}
+
+	for (int i = 0; i < 16; i++)
+	{
+		sums[i] = color_sum;
+	}
+#else
+
+	const int idx = threadIdx.x;
+
 	sums[idx] = colors[idx];
 	sums[idx] += sums[idx^8];
 	sums[idx] += sums[idx^4];
 	sums[idx] += sums[idx^2];
 	sums[idx] += sums[idx^1];
-
-#endif
-}
-
-inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum, float3 colorMetric)
-{
-	// Compute covariance matrix of the given colors.
-#if __DEVICE_EMULATION__
-	float covariance[6] = {0, 0, 0, 0, 0, 0};
-	for (int i = 0; i < 16; i++)
-	{
-		float3 a = (colors[i] - color_sum * (1.0f / 16.0f)) * colorMetric;
-		covariance[0] += a.x * a.x;
-		covariance[1] += a.x * a.y;
-		covariance[2] += a.x * a.z;
-		covariance[3] += a.y * a.y;
-		covariance[4] += a.y * a.z;
-		covariance[5] += a.z * a.z;
-	}
-#else
-
-	const int idx = threadIdx.x;
-
-	float3 diff = (colors[idx] - color_sum * (1.0f / 16.0f)) * colorMetric;
-
-	// @@ Eliminate two-way bank conflicts here.
-	// @@ It seems that doing that and unrolling the reduction doesn't help...
-	__shared__ float covariance[16*6];
-
-	covariance[6 * idx + 0] = diff.x * diff.x;    // 0, 6, 12, 2, 8, 14, 4, 10, 0
-	covariance[6 * idx + 1] = diff.x * diff.y;
-	covariance[6 * idx + 2] = diff.x * diff.z;
-	covariance[6 * idx + 3] = diff.y * diff.y;
-	covariance[6 * idx + 4] = diff.y * diff.z;
-	covariance[6 * idx + 5] = diff.z * diff.z;
-
-	for(int d = 8; d > 0; d >>= 1)
-	{
-		if (idx < d)
-		{
-			covariance[6 * idx + 0] += covariance[6 * (idx+d) + 0];
-			covariance[6 * idx + 1] += covariance[6 * (idx+d) + 1];
-			covariance[6 * idx + 2] += covariance[6 * (idx+d) + 2];
-			covariance[6 * idx + 3] += covariance[6 * (idx+d) + 3];
-			covariance[6 * idx + 4] += covariance[6 * (idx+d) + 4];
-			covariance[6 * idx + 5] += covariance[6 * (idx+d) + 5];
-		}
-	}
-
-#endif
-
-	// Compute first eigen vector.
-	return firstEigenVector(covariance);
-}
-
-
-#endif // CUDAMATH_H
+
+#endif
+}
+
+inline __device__ float3 bestFitLine(const float3 * colors, float3 color_sum, float3 colorMetric)
+{
+	// Compute covariance matrix of the given colors.
+#if __DEVICE_EMULATION__
+	float covariance[6] = {0, 0, 0, 0, 0, 0};
+	for (int i = 0; i < 16; i++)
+	{
+		float3 a = (colors[i] - color_sum * (1.0f / 16.0f)) * colorMetric;
+		covariance[0] += a.x * a.x;
+		covariance[1] += a.x * a.y;
+		covariance[2] += a.x * a.z;
+		covariance[3] += a.y * a.y;
+		covariance[4] += a.y * a.z;
+		covariance[5] += a.z * a.z;
+	}
+#else
+
+	const int idx = threadIdx.x;
+
+	float3 diff = (colors[idx] - color_sum * (1.0f / 16.0f)) * colorMetric;
+
+	// @@ Eliminate two-way bank conflicts here.
+	// @@ It seems that doing that and unrolling the reduction doesn't help...
+	__shared__ float covariance[16*6];
+
+	covariance[6 * idx + 0] = diff.x * diff.x;    // 0, 6, 12, 2, 8, 14, 4, 10, 0
+	covariance[6 * idx + 1] = diff.x * diff.y;
+	covariance[6 * idx + 2] = diff.x * diff.z;
+	covariance[6 * idx + 3] = diff.y * diff.y;
+	covariance[6 * idx + 4] = diff.y * diff.z;
+	covariance[6 * idx + 5] = diff.z * diff.z;
+
+	for(int d = 8; d > 0; d >>= 1)
+	{
+		if (idx < d)
+		{
+			covariance[6 * idx + 0] += covariance[6 * (idx+d) + 0];
+			covariance[6 * idx + 1] += covariance[6 * (idx+d) + 1];
+			covariance[6 * idx + 2] += covariance[6 * (idx+d) + 2];
+			covariance[6 * idx + 3] += covariance[6 * (idx+d) + 3];
+			covariance[6 * idx + 4] += covariance[6 * (idx+d) + 4];
+			covariance[6 * idx + 5] += covariance[6 * (idx+d) + 5];
+		}
+	}
+
+#endif
+
+	// Compute first eigen vector.
+	return firstEigenVector(covariance);
+}
+
+
+#endif // CUDAMATH_H
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@ -49,6 +49,14 @@

 #define NVTT_VERSION 200

+// Declares the pimpl boilerplate shared by the public nvtt classes:
+//  - a private copy constructor and assignment operator (only declared here),
+//    which hides copying from users of the class;
+//  - a public forward-declared nested struct `Private` and a reference `m` to it.
+// The expansion ends without a semicolon on purpose; the use site supplies it,
+// e.g. NVTT_DECLARE_PIMPL(CompressionOptions);
+// The access level is left at `public:` for the members that follow.
+#define NVTT_DECLARE_PIMPL(Class) \
+	private: \
+		Class(const Class &); \
+		void operator=(const Class &); \
+	public: \
+		struct Private; \
+		Private & m
+

 // Public interface.
 namespace nvtt
@ -89,6 +97,8 @@ namespace nvtt
 	/// Compression options. This class describes the desired compression format and other compression settings.
 	struct CompressionOptions
 	{
+		NVTT_DECLARE_PIMPL(CompressionOptions);
+
 		NVTT_API CompressionOptions();
 		NVTT_API ~CompressionOptions();
 		
@ -104,10 +114,6 @@ namespace nvtt
 		NVTT_API void setPixelFormat(unsigned int bitcount, unsigned int rmask, unsigned int gmask, unsigned int bmask, unsigned int amask);

 		NVTT_API void setQuantization(bool colorDithering, bool alphaDithering, bool binaryAlpha, int alphaThreshold = 127);
-
-	//private:
-		struct Private;
-		Private & m;
 	};


@ -170,6 +176,8 @@ namespace nvtt
 	/// Input options. Specify format and layout of the input texture.
 	struct InputOptions
 	{
+		NVTT_DECLARE_PIMPL(InputOptions);
+
 		NVTT_API InputOptions();
 		NVTT_API ~InputOptions();
 		
@ -214,10 +222,6 @@ namespace nvtt
 		// Set resizing options.
 		NVTT_API void setMaxExtents(int d);
 		NVTT_API void setRoundMode(RoundMode mode);
-		
-	//private:
-		struct Private;
-		Private & m;
 	};
 	
 	
@ -258,6 +262,8 @@ namespace nvtt
 	/// the compressor to the user.
 	struct OutputOptions
 	{
+		NVTT_DECLARE_PIMPL(OutputOptions);
+
 		NVTT_API OutputOptions();
 		NVTT_API ~OutputOptions();
 		
@ -269,16 +275,14 @@ namespace nvtt
 		NVTT_API void setOutputHandler(OutputHandler * outputHandler);
 		NVTT_API void setErrorHandler(ErrorHandler * errorHandler);
 		NVTT_API void setOutputHeader(bool outputHeader);
-		
-	//private:
-		struct Private;
-		Private & m;
 	};


 	/// Texture compressor.
 	struct Compressor
 	{
+		NVTT_DECLARE_PIMPL(Compressor);
+
 		NVTT_API Compressor();
 		NVTT_API ~Compressor();

@ -290,10 +294,6 @@ namespace nvtt
 		
 		// Estimate the size of compressing the input with the given options.
 		NVTT_API int estimateSize(const InputOptions & inputOptions, const CompressionOptions & compressionOptions) const;
-
-	//private:
-		struct Private;
-		Private & m;
 	};
 	
 	
--- a/src/nvtt/squish/simd_ve.h
+++ b/src/nvtt/squish/simd_ve.h
@ -50,6 +50,16 @@ public:
 		return *this;
 	}
 	
+	// Build the vector from four consecutive floats, v[0] through v[3].
+	Vec4( const float * v )
+	{
+		// Fill the lanes through a float view of the vector, then copy it out.
+		union { vector float packed; float lanes[4]; } bits;
+		for( int i = 0; i < 4; ++i )
+			bits.lanes[i] = v[i];
+		m_v = bits.packed;
+	}
+
 	Vec4( float x, float y, float z, float w )
 	{
 		union { vector float v; float c[4]; } u;
--- a/src/nvtt/tools/imgdiff.cpp
+++ b/src/nvtt/tools/imgdiff.cpp
@ -130,10 +130,13 @@ struct NormalError

 	void done()
 	{
-		ade /= samples;
-		mse /= samples * 3;
-		rmse = sqrt(mse);
-		psnr = (rmse == 0) ? 999.0f : 20.0f * log10(255.0f / rmse);
+		// Normalize only when at least one sample was counted, so the
+		// divisions below never divide by zero.
+		// NOTE(review): when samples == 0, rmse and psnr are not assigned
+		// here. Confirm they are initialized elsewhere.
+		if (samples)
+		{
+			ade /= samples;
+			mse /= samples * 3;
+			rmse = sqrt(mse);
+			// 999 is reported in place of the unbounded PSNR of a zero error.
+			psnr = (rmse == 0) ? 999.0f : 20.0f * log10(255.0f / rmse);
+		}
 	}

 	void print()
Author	SHA1	Message	Date
castano	fa4a7b9af7	Add proper credits.	2008-02-28 22:52:00 +00:00
castano	f111d23637	Tag 2.0.1	2008-02-28 22:37:49 +00:00
castano	bce983f39e	Add post build command to copy header files.	2008-02-28 22:07:08 +00:00
castano	ff93ad41cb	Fix end of lines.	2008-02-28 21:45:46 +00:00
castano	56c7771100	Fix end of lines.	2008-02-28 21:45:26 +00:00
castano	ccced843e3	Use smaller allocations to prevent errors. Check for allocation errors.	2008-02-28 21:45:04 +00:00
castano	dafe2b8841	Hide copy ctor and operator to prevent compiler warnings. Wrap pimpl using NVTT_DECLARE_PIMPL macro.	2008-02-28 21:14:40 +00:00
castano	e3e7fcb226	Check cuda errors to find out whether the cuda context initialization succeeded.	2008-02-28 17:52:32 +00:00
castano	970395fba8	Fix osx build.	2008-02-28 17:02:29 +00:00
castano	8a24a93e2f	Disable CUDA when memory allocations fail.	2008-02-28 16:06:27 +00:00
@ -1 +1 @@
-2.0.0
+2.0.1