From 93625c7de8b95680a49196c6e35c9eb5f7eb1bcc Mon Sep 17 00:00:00 2001
From: castano <castano@95f4ed2b-212e-0410-8b90-d31948207fce>
Date: Wed, 4 Nov 2009 06:16:03 +0000
Subject: [PATCH] Do not fail when the process is already using CUDA. Attempt
 to use the selected CUDA device. Stricter device selection.

---
 src/nvtt/Compressor.cpp     | 156 +++++------
 src/nvtt/Compressor.h       |   2 +
 src/nvtt/cuda/CudaUtils.cpp | 539 ++++++++++++++++++++----------------
 src/nvtt/cuda/CudaUtils.h   |   6 +-
 src/nvtt/nvtt.h             |   2 +-
 src/nvtt/tools/compress.cpp |   5 +
 6 files changed, 390 insertions(+), 320 deletions(-)
diff --git a/src/nvtt/Compressor.cpp b/src/nvtt/Compressor.cpp
index 9613fec..6631b91 100644
--- a/src/nvtt/Compressor.cpp
+++ b/src/nvtt/Compressor.cpp
@@ -53,7 +53,7 @@ using namespace nvtt;
 
 namespace
 {
-	
+
 	static int blockSize(Format format)
 	{
 		if (format == Format_DXT1 || format == Format_DXT1a) {
@@ -121,7 +121,7 @@ namespace nvtt
 			m_fixedImage = NULL;
 			m_floatImage = image;
 		}
-		
+
 
 		// Convert linear float image to fixed image ready for compression.
 		void toFixedImage(const InputOptions::Private & inputOptions)
@@ -153,7 +153,7 @@ namespace nvtt
 				if (inputOptions.isNormalMap)
 				{
 					// Expand normals to [-1, 1] range.
-				//	floatImage->expandNormals(0);
+					//	floatImage->expandNormals(0);
 				}
 				else if (inputOptions.inputGamma != 1.0f)
 				{
@@ -193,7 +193,7 @@ namespace nvtt
 			return m_fixedImage.ptr();
 		}
 
-		
+
 	private:
 		const Image * m_inputImage;
 		AutoPtr<Image> m_fixedImage;
@@ -207,28 +207,16 @@ Compressor::Compressor() : m(*new Compressor::Private())
 {
 	// CUDA initialization.
 	m.cudaSupported = cuda::isHardwarePresent();
-	m.cudaEnabled = m.cudaSupported;
-
-	if (m.cudaEnabled)
-	{
-		// Select fastest CUDA device.
-		int device = cuda::getFastestDevice();
-		cuda::setDevice(device);
-		
-		m.cuda = new CudaCompressor();
+	m.cudaEnabled = false;
+	m.cudaDevice = -1;
 
-		if (!m.cuda->isValid())
-		{
-			m.cudaEnabled = false;
-			m.cuda = NULL;
-		}
-	}
+	enableCudaAcceleration(m.cudaSupported);
 }
 
 Compressor::~Compressor()
 {
+	enableCudaAcceleration(false);
 	delete &m;
-	cuda::exit();
 }
 
 
@@ -237,21 +225,33 @@ void Compressor::enableCudaAcceleration(bool enable)
 {
 	if (m.cudaSupported)
 	{
-		m.cudaEnabled = enable;
-	}
-
-	if (m.cudaEnabled && m.cuda == NULL)
-	{
-		// Select fastest CUDA device.
-		int device = cuda::getFastestDevice();
-		cuda::setDevice(device);
-		
-		m.cuda = new CudaCompressor();
-		
-		if (!m.cuda->isValid())
+		if (m.cudaEnabled && !enable)
 		{
 			m.cudaEnabled = false;
 			m.cuda = NULL;
+
+			if (m.cudaDevice != -1)
+			{
+				// Exit device.
+				cuda::exitDevice();
+			}
+		}
+		else if (!m.cudaEnabled && enable)
+		{
+			// Init the CUDA device. This may set m.cudaDevice to -1 if CUDA was already initialized by the app.
+			m.cudaEnabled = cuda::initDevice(&m.cudaDevice);
+
+			if (m.cudaEnabled)
+			{
+				// Create compressor if initialization succeeds.
+				m.cuda = new CudaCompressor();
+
+				// But cleanup if failed.
+				if (!m.cuda->isValid())
+				{
+					enableCudaAcceleration(false);
+				}
+			}
 		}
 	}
 }
@@ -292,9 +292,9 @@ bool Compressor::Private::compress(const InputOptions::Private & inputOptions, c
 		if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_FileOpen);
 		return false;
 	}
-	
+
 	inputOptions.computeTargetExtents();
-	
+
 	// Output DDS header.
 	if (!outputHeader(inputOptions, compressionOptions, outputOptions))
 	{
@@ -310,7 +310,7 @@ bool Compressor::Private::compress(const InputOptions::Private & inputOptions, c
 	}
 
 	outputOptions.closeFile();
-	
+
 	return true;
 }
 
@@ -325,15 +325,15 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
 	}
 
 	DDSHeader header;
-	
+
 	header.setWidth(inputOptions.targetWidth);
 	header.setHeight(inputOptions.targetHeight);
-	
+
 	int mipmapCount = inputOptions.realMipmapCount();
 	nvDebugCheck(mipmapCount > 0);
-	
+
 	header.setMipmapCount(mipmapCount);
-	
+
 	if (inputOptions.textureType == TextureType_2D) {
 		header.setTexture2D();
 	}
@@ -341,10 +341,10 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
 		header.setTextureCube();
 	}		
 	/*else if (inputOptions.textureType == TextureType_3D) {
-		header.setTexture3D();
-		header.setDepth(inputOptions.targetDepth);
+	header.setTexture3D();
+	header.setDepth(inputOptions.targetDepth);
 	}*/
-	
+
 	if (compressionOptions.format == Format_RGBA)
 	{
 		header.setPitch(computePitch(inputOptions.targetWidth, compressionOptions.bitcount));
@@ -353,7 +353,7 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
 	else
 	{
 		header.setLinearSize(computeImageSize(inputOptions.targetWidth, inputOptions.targetHeight, inputOptions.targetDepth, compressionOptions.bitcount, compressionOptions.format));
-		
+
 		if (compressionOptions.format == Format_DXT1 || compressionOptions.format == Format_DXT1a) {
 			header.setFourCC('D', 'X', 'T', '1');
 			if (inputOptions.isNormalMap) header.setNormalFlag(true);
@@ -376,10 +376,10 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
 			if (inputOptions.isNormalMap) header.setNormalFlag(true);
 		}
 	}
-	
+
 	// Swap bytes if necessary.
 	header.swapBytes();
-	
+
 	uint headerSize = 128;
 	if (header.hasDX10Header())
 	{
@@ -392,7 +392,7 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
 	{
 		outputOptions.errorHandler->error(Error_FileWrite);
 	}
-	
+
 	return writeSucceed;
 }
 
@@ -428,7 +428,7 @@ bool Compressor::Private::compressMipmaps(uint f, const InputOptions::Private &
 				return false;
 			}
 		}
-		
+
 		quantizeMipmap(mipmap, compressionOptions);
 
 		compressMipmap(mipmap, inputOptions, compressionOptions, outputOptions);
@@ -438,7 +438,7 @@ bool Compressor::Private::compressMipmaps(uint f, const InputOptions::Private &
 		h = max(1U, h / 2);
 		d = max(1U, d / 2);
 	}
-	
+
 	return true;
 }
 
@@ -489,7 +489,7 @@ int Compressor::Private::findExactMipmap(const InputOptions::Private & inputOpti
 	{
 		int idx = f * inputOptions.mipmapCount + m;
 		const InputOptions::Private::InputImage & inputImage = inputOptions.images[idx];
-		
+
 		if (inputImage.width == int(w) && inputImage.height == int(h) && inputImage.depth == int(d))
 		{
 			if (inputImage.data != NULL)
@@ -544,7 +544,7 @@ void Compressor::Private::downsampleMipmap(Mipmap & mipmap, const InputOptions::
 	mipmap.toFloatImage(inputOptions);
 
 	const FloatImage * floatImage = mipmap.asFloatImage();
-	
+
 	if (inputOptions.mipmapFilter == MipmapFilter_Box)
 	{
 		// Use fast downsample.
@@ -562,7 +562,7 @@ void Compressor::Private::downsampleMipmap(Mipmap & mipmap, const InputOptions::
 		filter.setParameters(inputOptions.kaiserAlpha, inputOptions.kaiserStretch);
 		mipmap.setImage(floatImage->downSample(filter, (FloatImage::WrapMode)inputOptions.wrapMode));
 	}
-	
+
 	// Normalize mipmap.
 	if ((inputOptions.isNormalMap || inputOptions.convertToNormalMap) && inputOptions.normalizeMipmaps)
 	{
@@ -590,7 +590,7 @@ void Compressor::Private::processInputImage(Mipmap & mipmap, const InputOptions:
 	if (inputOptions.convertToNormalMap)
 	{
 		mipmap.toFixedImage(inputOptions);
-		
+
 		Vector4 heightScale = inputOptions.heightFactors;
 		mipmap.setImage(createNormalMap(mipmap.asFixedImage(), (FloatImage::WrapMode)inputOptions.wrapMode, heightScale, inputOptions.bumpFrequencyScale));
 	}
@@ -715,29 +715,29 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
 #endif
 
 #if defined(HAVE_ATITC)
-		if (compressionOptions.externalCompressor == "ati")
-		{
-			atiCompressDXT1(image, outputOptions);
-		}
-		else
-#endif
-		if (compressionOptions.quality == Quality_Fastest)
-		{
-			fast.compressDXT1(outputOptions);
-		}
-		else
-		{
-			if (useCuda)
+			if (compressionOptions.externalCompressor == "ati")
 			{
-				nvDebugCheck(cudaSupported);
-				cuda->setImage(image, inputOptions.alphaMode);
-				cuda->compressDXT1(compressionOptions, outputOptions);
+				atiCompressDXT1(image, outputOptions);
 			}
 			else
-			{
-				slow.compressDXT1(compressionOptions, outputOptions);
-			}
-		}
+#endif
+				if (compressionOptions.quality == Quality_Fastest)
+				{
+					fast.compressDXT1(outputOptions);
+				}
+				else
+				{
+					if (useCuda)
+					{
+						nvDebugCheck(cudaSupported);
+						cuda->setImage(image, inputOptions.alphaMode);
+						cuda->compressDXT1(compressionOptions, outputOptions);
+					}
+					else
+					{
+						slow.compressDXT1(compressionOptions, outputOptions);
+					}
+				}
 	}
 	else if (compressionOptions.format == Format_DXT1a)
 	{
@@ -828,27 +828,27 @@ int Compressor::Private::estimateSize(const InputOptions::Private & inputOptions
 	const uint bitCount = compressionOptions.bitcount;
 
 	inputOptions.computeTargetExtents();
-	
+
 	uint mipmapCount = inputOptions.realMipmapCount();
-	
+
 	int size = 0;
-	
+
 	for (uint f = 0; f < inputOptions.faceCount; f++)
 	{
 		uint w = inputOptions.targetWidth;
 		uint h = inputOptions.targetHeight;
 		uint d = inputOptions.targetDepth;
-		
+
 		for (uint m = 0; m < mipmapCount; m++)
 		{
 			size += computeImageSize(w, h, d, bitCount, format);
-			
+
 			// Compute extents of next mipmap:
 			w = max(1U, w / 2);
 			h = max(1U, h / 2);
 			d = max(1U, d / 2);
 		}
 	}
-	
+
 	return size;
 }
diff --git a/src/nvtt/Compressor.h b/src/nvtt/Compressor.h
index 55b9563..8737e29 100644
--- a/src/nvtt/Compressor.h
+++ b/src/nvtt/Compressor.h
@@ -63,10 +63,12 @@ namespace nvtt
 		bool compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
 
 
+
 	public:
 
 		bool cudaSupported;
 		bool cudaEnabled;
+		int cudaDevice;
 
 		nv::AutoPtr<nv::CudaCompressor> cuda;
 
diff --git a/src/nvtt/cuda/CudaUtils.cpp b/src/nvtt/cuda/CudaUtils.cpp
index 7bb2b09..3cca04d 100644
--- a/src/nvtt/cuda/CudaUtils.cpp
+++ b/src/nvtt/cuda/CudaUtils.cpp
@@ -1,239 +1,300 @@
-// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
-// 
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
-// 
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
-// 
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-
-#include <nvcore/Debug.h>
-#include <nvcore/Library.h>
-#include "CudaUtils.h"
-
-#if defined HAVE_CUDA
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#endif
-
-using namespace nv;
-using namespace cuda;
-
-/* @@ Move this to win32 utils or somewhere else.
-#if NV_OS_WIN32
-
-#define WINDOWS_LEAN_AND_MEAN
-#include <windows.h>
-
-static bool isWindowsVista()
-{
-	OSVERSIONINFO osvi;
-	osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
-
-	::GetVersionEx(&osvi);
-	return osvi.dwMajorVersion >= 6;
-}
-
-
-typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);
-
-static bool isWow32()
-{
-	LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandle("kernel32"), "IsWow64Process");
-
-	BOOL bIsWow64 = FALSE;
- 
-	if (NULL != fnIsWow64Process)
-	{
-		if (!fnIsWow64Process(GetCurrentProcess(), &bIsWow64))
-		{
-			// Assume 32 bits.
-			return true;
-		}
-	}
-
-	return !bIsWow64;
-}
-
-#endif
-*/
-
-
-static bool isCudaDriverAvailable(int version)
-{
-#if defined HAVE_CUDA
-#if NV_OS_WIN32
-	Library nvcuda("nvcuda.dll");
-#else
-	Library nvcuda(NV_LIBRARY_NAME(cuda));
-#endif
-	
-	if (!nvcuda.isValid())
-	{
-		nvDebug("*** CUDA driver not found.\n");
-		return false;
-	}
-	
-	if (version >= 2000)
-	{
-		void * address = nvcuda.bindSymbol("cuStreamCreate");
-		if (address == NULL) {
-			nvDebug("*** CUDA driver version < 2.0.\n");
-			return false;
-		}
-	}
-
-	if (version >= 2010)
-	{
-		void * address = nvcuda.bindSymbol("cuModuleLoadDataEx");
-		if (address == NULL) {
-			nvDebug("*** CUDA driver version < 2.1.\n");
-			return false;
-		}
-	}
-	
-	if (version >= 2020)
-	{
-		typedef CUresult (CUDAAPI * PFCU_DRIVERGETVERSION)(int * version);
-
-		PFCU_DRIVERGETVERSION driverGetVersion = (PFCU_DRIVERGETVERSION)nvcuda.bindSymbol("cuDriverGetVersion");
-		if (driverGetVersion == NULL) {
-			nvDebug("*** CUDA driver version < 2.2.\n");
-			return false;
-		}
-
-		int driverVersion;
-		CUresult err = driverGetVersion(&driverVersion);
-		if (err != CUDA_SUCCESS) {
-			nvDebug("*** Error querying driver version: '%s'.\n", cudaGetErrorString((cudaError_t)err));
-			return false;
-		}
-
-		return driverVersion >= version;
-	}
-#endif // HAVE_CUDA
-
-	return true;
-}
-
-
-/// Determine if CUDA is available.
-bool nv::cuda::isHardwarePresent()
-{
-#if defined HAVE_CUDA
-	// Make sure that CUDA driver matches CUDA runtime.
-	if (!isCudaDriverAvailable(CUDART_VERSION))
-	{
-		nvDebug("CUDA driver not available for CUDA runtime %d\n", CUDART_VERSION);
-		return false;
-	}
-
-	int count = deviceCount();
-	if (count == 1)
-	{
-		// Make sure it's not an emulation device.
-		cudaDeviceProp deviceProp;
-		cudaGetDeviceProperties(&deviceProp, 0);
-
-		// deviceProp.name != Device Emulation (CPU)
-		if (deviceProp.major == -1 || deviceProp.minor == -1)
-		{
-			return false;
-		}
-	}
-
-	// @@ Make sure that warp size == 32
-
-	return count > 0;
-#else
-	return false;
-#endif
-}
-
-/// Get number of CUDA enabled devices.
-int nv::cuda::deviceCount()
-{
-#if defined HAVE_CUDA
-	int gpuCount = 0;
-
-	cudaError_t result = cudaGetDeviceCount(&gpuCount);
-
-	if (result == cudaSuccess)
-	{
-		return gpuCount;
-	}
-#endif
-	return 0;
-}
-
-int nv::cuda::getFastestDevice()
-{
-	int max_gflops_device = 0;
-#if defined HAVE_CUDA
-	int max_gflops = 0;
-
-	const int device_count = deviceCount();
-	int current_device = 0;
-	while (current_device < device_count)
-	{
-		cudaDeviceProp device_properties;
-		cudaGetDeviceProperties(&device_properties, current_device);
-		int gflops = device_properties.multiProcessorCount * device_properties.clockRate;
-
-		if (device_properties.major != -1 && device_properties.minor != -1)
-		{
-			if( gflops > max_gflops )
-			{
-				max_gflops = gflops;
-				max_gflops_device = current_device;
-			}
-		}
-		
-		current_device++;
-	}
-#endif
-	return max_gflops_device;
-}
-
-
-/// Activate the given devices.
-bool nv::cuda::setDevice(int i)
-{
-	nvCheck(i < deviceCount());
-#if defined HAVE_CUDA
-	cudaError_t result = cudaSetDevice(i);
-
-	if (result != cudaSuccess) {
-		nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result));
-	}
-
-	return result == cudaSuccess;
-#else
-	return false;
-#endif
-}
-
-void nv::cuda::exit()
-{
-#if defined HAVE_CUDA
-	cudaError_t result = cudaThreadExit();
-
-	if (result != cudaSuccess) {
-		nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result));
-	}
-#endif
-}
+// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
+// 
+// Permission is hereby granted, free of charge, to any person
+// obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without
+// restriction, including without limitation the rights to use,
+// copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following
+// conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+
+#include <nvcore/Debug.h>
+#include <nvcore/Library.h>
+#include "CudaUtils.h"
+
+#if defined HAVE_CUDA
+#include <cuda.h>
+#include <cuda_runtime_api.h>
+#endif
+
+using namespace nv;
+using namespace cuda;
+
+/* @@ Move this to win32 utils or somewhere else.
+#if NV_OS_WIN32
+
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+
+static bool isWindowsVista()
+{
+OSVERSIONINFO osvi;
+osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+
+::GetVersionEx(&osvi);
+return osvi.dwMajorVersion >= 6;
+}
+
+
+typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);
+
+static bool isWow32()
+{
+LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandle("kernel32"), "IsWow64Process");
+
+BOOL bIsWow64 = FALSE;
+
+if (NULL != fnIsWow64Process)
+{
+if (!fnIsWow64Process(GetCurrentProcess(), &bIsWow64))
+{
+// Assume 32 bits.
+return true;
+}
+}
+
+return !bIsWow64;
+}
+
+#endif
+*/
+
+
+// Check that the CUDA driver library loads and is recent enough for runtime `version` (e.g. 2020 = 2.2).
+// Drivers lacking cuDriverGetVersion are classified by probing for entry points added in 2.0 and 2.1.
+static bool isCudaDriverAvailable(int version)
+{
+#if defined HAVE_CUDA
+#if NV_OS_WIN32
+	Library nvcuda("nvcuda.dll");
+#else
+	Library nvcuda(NV_LIBRARY_NAME(cuda));
+#endif
+
+	if (!nvcuda.isValid())
+	{
+		nvDebug("*** CUDA driver not found.\n");
+		return false;
+	}
+
+	if (version >= 2000)
+	{
+		if (nvcuda.bindSymbol("cuStreamCreate") == NULL) {
+			nvDebug("*** CUDA driver version < 2.0.\n");
+			return false;
+		}
+	}
+
+	if (version >= 2010)
+	{
+		if (nvcuda.bindSymbol("cuModuleLoadDataEx") == NULL) {
+			nvDebug("*** CUDA driver version < 2.1.\n");
+			return false;
+		}
+	}
+
+	if (version >= 2020)
+	{
+		typedef CUresult (CUDAAPI * PFCU_DRIVERGETVERSION)(int * version);
+		PFCU_DRIVERGETVERSION driverGetVersion = (PFCU_DRIVERGETVERSION)nvcuda.bindSymbol("cuDriverGetVersion");
+		if (driverGetVersion == NULL) {
+			nvDebug("*** CUDA driver version < 2.2.\n");
+			return false;
+		}
+
+		int driverVersion = 0;
+		CUresult err = driverGetVersion(&driverVersion);
+		if (err != CUDA_SUCCESS) {
+			// err is a driver API code; cudaGetErrorString expects runtime codes, so log the raw value.
+			nvDebug("*** Error querying driver version: CUresult %d.\n", (int)err);
+			return false;
+		}
+
+		return driverVersion >= version;
+	}
+#endif // HAVE_CUDA
+
+	return true;
+}
+
+
+/// Determine if CUDA is available: the driver must match the runtime this library was
+/// built against and at least one device must pass isValidDevice().
+bool nv::cuda::isHardwarePresent()
+{
+#if defined HAVE_CUDA
+	// Make sure that CUDA driver matches CUDA runtime.
+	if (!isCudaDriverAvailable(CUDART_VERSION))
+	{
+		nvDebug("CUDA driver not available for CUDA runtime %d\n", CUDART_VERSION);
+		return false;
+	}
+
+	// Apply the same per-device requirements that device selection uses, so that this
+	// function never reports hardware that initDevice() would refuse (e.g. a lone
+	// emulation device).
+	const int count = deviceCount();
+	for (int i = 0; i < count; i++)
+	{
+		if (isValidDevice(i))
+		{
+			return true;
+		}
+	}
+
+	// @@ Make sure that warp size == 32
+
+	// @@ Make sure available GPU is faster than the CPU.
+
+	// No usable device found.
+	return false;
+#else
+	return false;
+#endif
+}
+
+/// Get number of CUDA enabled devices; 0 when the query fails or CUDA support is compiled out.
+int nv::cuda::deviceCount()
+{
+#if defined HAVE_CUDA
+	int numDevices = 0;
+	if (cudaGetDeviceCount(&numDevices) != cudaSuccess)
+	{
+		// Ignore whatever the failed call left in numDevices.
+		return 0;
+	}
+	return numDevices;
+#else
+	return 0;
+#endif
+}
+
+
+// Make sure device meets requirements (and that its properties can be queried at all):
+// - Not an emulation device.
+// @@ Not enforced yet: not an integrated device? faster than CPU.
+bool nv::cuda::isValidDevice(int i)
+{
+#if defined HAVE_CUDA
+	cudaDeviceProp device_properties;
+	if (cudaGetDeviceProperties(&device_properties, i) != cudaSuccess) {
+		// Bad index or runtime failure: device_properties may not have been filled in, do not read it.
+		return false;
+	}
+
+	if (device_properties.major == -1 || device_properties.minor == -1) {
+		// Emulation device.
+		return false;
+	}
+
+#if CUDART_VERSION >= 2030 // 2.3
+	/*if (device_properties.integrated)
+	{
+		// Integrated devices.
+		return false;
+	}*/
+#endif
+
+	return true;
+#else
+	return false;
+#endif
+}
+
+// Returns the index of the valid device with the highest throughput estimate, or -1 if there is none.
+int nv::cuda::getFastestDevice()
+{
+	int bestDevice = -1;
+#if defined HAVE_CUDA
+	// Score devices by multiprocessor count times clock rate; only strictly positive scores can win.
+	int bestScore = 0;
+
+	for (int dev = 0, count = deviceCount(); dev < count; ++dev)
+	{
+		if (!isValidDevice(dev)) continue;
+
+		cudaDeviceProp props;
+		cudaGetDeviceProperties(&props, dev);
+
+		const int score = props.multiProcessorCount * props.clockRate;
+		if (score > bestScore)
+		{
+			bestScore = score;
+			bestDevice = dev;
+		}
+	}
+#endif
+	return bestDevice;
+}
+
+
+/// Select and activate the fastest valid CUDA device; *device_ptr gets its index, or -1 when there is nothing to release.
+bool nv::cuda::initDevice(int * device_ptr)
+{
+	nvDebugCheck(device_ptr != NULL);
+	*device_ptr = -1; // Nothing to clean up unless a device is activated below.
+#if defined HAVE_CUDA
+#if CUDART_VERSION >= 2030 // 2.3
+	// Ask the runtime to yield while waiting, to play nice with other threads. The result is deliberately
+	// ignored: an already active CUDA context is detected by cudaSetDevice below.
+	(void)cudaSetDeviceFlags(cudaDeviceScheduleYield);
+#endif
+
+	const int fastestDevice = getFastestDevice();
+	if (fastestDevice == -1)
+	{
+		// No valid device available.
+		return false;
+	}
+
+	// Select CUDA device.
+	cudaError_t result = cudaSetDevice(fastestDevice);
+	if (result == cudaErrorSetOnActiveProcess)
+	{
+		// CUDA was already initialized by the application: use whatever device it selected, but do not take
+		// ownership of it (*device_ptr stays -1 so that the caller never exits a device it did not init).
+		int activeDevice = -1;
+		result = cudaGetDevice(&activeDevice);
+		if (result != cudaSuccess)
+		{
+			nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result));
+			return false;
+		}
+		return isValidDevice(activeDevice);
+	}
+	else if (result != cudaSuccess)
+	{
+		nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result));
+		return false;
+	}
+
+	*device_ptr = fastestDevice;
+	return true;
+#else
+	return false;
+#endif
+}
+
+// Calls cudaThreadExit() and reports a failure through nvDebug. No-op without HAVE_CUDA.
+void nv::cuda::exitDevice()
+{
+#if defined HAVE_CUDA
+	const cudaError_t status = cudaThreadExit();
+	if (status != cudaSuccess) {
+		nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(status));
+	}
+#endif
+}
diff --git a/src/nvtt/cuda/CudaUtils.h b/src/nvtt/cuda/CudaUtils.h
index 376bbe1..7128b4d 100644
--- a/src/nvtt/cuda/CudaUtils.h
+++ b/src/nvtt/cuda/CudaUtils.h
@@ -32,8 +32,10 @@ namespace nv
 		bool isHardwarePresent();
 		int deviceCount();
 		int getFastestDevice();
-		bool setDevice(int i);
-		void exit();
+		bool isValidDevice(int i);
+
+		bool initDevice(int * device_ptr);
+		void exitDevice();
 	};
 	
 } // nv namespace
diff --git a/src/nvtt/nvtt.h b/src/nvtt/nvtt.h
index 698e533..b5d2e72 100644
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@@ -194,7 +194,7 @@ namespace nvtt
 		// Describe the format of the input.
 		NVTT_API void setFormat(InputFormat format);
 		
-		// Set the way the input alpha channel is interpreted. @@ Not implemented!
+		// Set the way the input alpha channel is interpreted.
 		NVTT_API void setAlphaMode(AlphaMode alphaMode);
 		
 		// Set gamma settings.
diff --git a/src/nvtt/tools/compress.cpp b/src/nvtt/tools/compress.cpp
index ea87ac3..a27d4fb 100644
--- a/src/nvtt/tools/compress.cpp
+++ b/src/nvtt/tools/compress.cpp
@@ -33,6 +33,9 @@
 
 #include <time.h> // clock
 
+#include <c:\CUDA\include\cuda_runtime.h>
+#pragma comment(lib, "c:\\CUDA\\lib\\cudart.lib")
+
 //#define WINDOWS_LEAN_AND_MEAN
 //#include <windows.h> // TIMER
 
@@ -413,6 +416,8 @@ int main(int argc, char *argv[])
 		return EXIT_FAILURE;
 	}
 
+	cudaSetDevice(0);
+
 	nvtt::Compressor compressor;
 	compressor.enableCudaAcceleration(!nocuda);