Do not fail when the process is already using CUDA.

Attempt to use the selected CUDA device.
More strict device selection.
2.0
castano 15 years ago
parent 6e9feef6f4
commit 93625c7de8
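
The scenario this change targets, roughly: a host application that has already initialized CUDA for its own work should still be able to turn on CUDA acceleration in the compressor, and the compressor should adopt the device the application selected instead of failing or re-selecting one. Below is a minimal sketch of that usage, assuming the public nvtt API of this branch (Compressor::enableCudaAcceleration / isCudaAccelerationEnabled) and an assumed nvtt/nvtt.h header path; it is an illustration, not part of the commit.

    #include <cuda_runtime_api.h>
    #include <nvtt/nvtt.h>   // header path assumed

    int main()
    {
        cudaSetDevice(0);                        // the application already picked a device

        nvtt::Compressor compressor;             // should no longer fail in this situation
        compressor.enableCudaAcceleration(true);

        // Acceleration stays enabled if the already-active device is usable.
        return compressor.isCudaAccelerationEnabled() ? 0 : 1;
    }
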

@@ -53,7 +53,7 @@ using namespace nvtt;

namespace
{
    static int blockSize(Format format)
    {
        if (format == Format_DXT1 || format == Format_DXT1a) {
@@ -121,7 +121,7 @@ namespace nvtt
            m_fixedImage = NULL;
            m_floatImage = image;
        }

        // Convert linear float image to fixed image ready for compression.
        void toFixedImage(const InputOptions::Private & inputOptions)
@@ -153,7 +153,7 @@ namespace nvtt
            if (inputOptions.isNormalMap)
            {
                // Expand normals to [-1, 1] range.
                //floatImage->expandNormals(0);
            }
            else if (inputOptions.inputGamma != 1.0f)
            {
@@ -193,7 +193,7 @@ namespace nvtt
            return m_fixedImage.ptr();
        }

    private:
        const Image * m_inputImage;
        AutoPtr<Image> m_fixedImage;
@@ -207,28 +207,16 @@ Compressor::Compressor() : m(*new Compressor::Private())
{
    // CUDA initialization.
    m.cudaSupported = cuda::isHardwarePresent();
-   m.cudaEnabled = m.cudaSupported;
-
-   if (m.cudaEnabled)
-   {
-       // Select fastest CUDA device.
-       int device = cuda::getFastestDevice();
-       cuda::setDevice(device);
-
-       m.cuda = new CudaCompressor();
-
-       if (!m.cuda->isValid())
-       {
-           m.cudaEnabled = false;
-           m.cuda = NULL;
-       }
-   }
+   m.cudaEnabled = false;
+   m.cudaDevice = -1;
+
+   enableCudaAcceleration(m.cudaSupported);
}

Compressor::~Compressor()
{
+   enableCudaAcceleration(false);
    delete &m;
-   cuda::exit();
}
@@ -237,21 +225,33 @@ void Compressor::enableCudaAcceleration(bool enable)
{
    if (m.cudaSupported)
    {
-       m.cudaEnabled = enable;
-   }
-
-   if (m.cudaEnabled && m.cuda == NULL)
-   {
-       // Select fastest CUDA device.
-       int device = cuda::getFastestDevice();
-       cuda::setDevice(device);
-
-       m.cuda = new CudaCompressor();
-
-       if (!m.cuda->isValid())
+       if (m.cudaEnabled && !enable)
        {
            m.cudaEnabled = false;
            m.cuda = NULL;
+
+           if (m.cudaDevice != -1)
+           {
+               // Exit device.
+               cuda::exitDevice();
+           }
+       }
+       else if (!m.cudaEnabled && enable)
+       {
+           // Init the CUDA device. This may return -1 if CUDA was already initialized by the app.
+           m.cudaEnabled = cuda::initDevice(&m.cudaDevice);
+
+           if (m.cudaEnabled)
+           {
+               // Create compressor if initialization succeeds.
+               m.cuda = new CudaCompressor();
+
+               // But cleanup if failed.
+               if (!m.cuda->isValid())
+               {
+                   enableCudaAcceleration(false);
+               }
+           }
        }
    }
}
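
In rough terms, the rewritten method now owns the whole enable/disable lifecycle: enabling goes through cuda::initDevice(), which records in m.cudaDevice which device (if any) the compressor itself initialized, and disabling tears the device down only when the compressor owns it. A hedged sketch of the resulting behavior from the caller's side, using only the public API touched by this commit:

    nvtt::Compressor compressor;               // constructor calls enableCudaAcceleration(m.cudaSupported)

    compressor.enableCudaAcceleration(false);  // frees the CudaCompressor; calls cuda::exitDevice() only if m.cudaDevice != -1
    compressor.enableCudaAcceleration(true);   // re-runs cuda::initDevice() and recreates the CudaCompressor
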
@@ -292,9 +292,9 @@ bool Compressor::Private::compress(const InputOptions::Private & inputOptions, c
        if (outputOptions.errorHandler) outputOptions.errorHandler->error(Error_FileOpen);
        return false;
    }

    inputOptions.computeTargetExtents();

    // Output DDS header.
    if (!outputHeader(inputOptions, compressionOptions, outputOptions))
    {
@@ -310,7 +310,7 @@ bool Compressor::Private::compress(const InputOptions::Private & inputOptions, c
    }

    outputOptions.closeFile();

    return true;
}
@@ -325,15 +325,15 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
    }

    DDSHeader header;

    header.setWidth(inputOptions.targetWidth);
    header.setHeight(inputOptions.targetHeight);

    int mipmapCount = inputOptions.realMipmapCount();
    nvDebugCheck(mipmapCount > 0);

    header.setMipmapCount(mipmapCount);

    if (inputOptions.textureType == TextureType_2D) {
        header.setTexture2D();
    }
@@ -341,10 +341,10 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
        header.setTextureCube();
    }
    /*else if (inputOptions.textureType == TextureType_3D) {
        header.setTexture3D();
        header.setDepth(inputOptions.targetDepth);
    }*/

    if (compressionOptions.format == Format_RGBA)
    {
        header.setPitch(computePitch(inputOptions.targetWidth, compressionOptions.bitcount));
@@ -353,7 +353,7 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
    else
    {
        header.setLinearSize(computeImageSize(inputOptions.targetWidth, inputOptions.targetHeight, inputOptions.targetDepth, compressionOptions.bitcount, compressionOptions.format));

        if (compressionOptions.format == Format_DXT1 || compressionOptions.format == Format_DXT1a) {
            header.setFourCC('D', 'X', 'T', '1');
            if (inputOptions.isNormalMap) header.setNormalFlag(true);
@@ -376,10 +376,10 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
            if (inputOptions.isNormalMap) header.setNormalFlag(true);
        }
    }

    // Swap bytes if necessary.
    header.swapBytes();

    uint headerSize = 128;
    if (header.hasDX10Header())
    {
@@ -392,7 +392,7 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
    {
        outputOptions.errorHandler->error(Error_FileWrite);
    }

    return writeSucceed;
}
@@ -428,7 +428,7 @@ bool Compressor::Private::compressMipmaps(uint f, const InputOptions::Private &
                return false;
            }
        }

        quantizeMipmap(mipmap, compressionOptions);

        compressMipmap(mipmap, inputOptions, compressionOptions, outputOptions);
@@ -438,7 +438,7 @@ bool Compressor::Private::compressMipmaps(uint f, const InputOptions::Private &
        h = max(1U, h / 2);
        d = max(1U, d / 2);
    }

    return true;
}
@@ -489,7 +489,7 @@ int Compressor::Private::findExactMipmap(const InputOptions::Private & inputOpti
    {
        int idx = f * inputOptions.mipmapCount + m;
        const InputOptions::Private::InputImage & inputImage = inputOptions.images[idx];

        if (inputImage.width == int(w) && inputImage.height == int(h) && inputImage.depth == int(d))
        {
            if (inputImage.data != NULL)
@@ -544,7 +544,7 @@ void Compressor::Private::downsampleMipmap(Mipmap & mipmap, const InputOptions::
    mipmap.toFloatImage(inputOptions);

    const FloatImage * floatImage = mipmap.asFloatImage();

    if (inputOptions.mipmapFilter == MipmapFilter_Box)
    {
        // Use fast downsample.
@@ -562,7 +562,7 @@ void Compressor::Private::downsampleMipmap(Mipmap & mipmap, const InputOptions::
        filter.setParameters(inputOptions.kaiserAlpha, inputOptions.kaiserStretch);
        mipmap.setImage(floatImage->downSample(filter, (FloatImage::WrapMode)inputOptions.wrapMode));
    }

    // Normalize mipmap.
    if ((inputOptions.isNormalMap || inputOptions.convertToNormalMap) && inputOptions.normalizeMipmaps)
    {
@@ -590,7 +590,7 @@ void Compressor::Private::processInputImage(Mipmap & mipmap, const InputOptions:
    if (inputOptions.convertToNormalMap)
    {
        mipmap.toFixedImage(inputOptions);

        Vector4 heightScale = inputOptions.heightFactors;
        mipmap.setImage(createNormalMap(mipmap.asFixedImage(), (FloatImage::WrapMode)inputOptions.wrapMode, heightScale, inputOptions.bumpFrequencyScale));
    }
@@ -715,29 +715,29 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
#endif

#if defined(HAVE_ATITC)
        if (compressionOptions.externalCompressor == "ati")
        {
            atiCompressDXT1(image, outputOptions);
        }
        else
#endif
        if (compressionOptions.quality == Quality_Fastest)
        {
            fast.compressDXT1(outputOptions);
        }
        else
        {
            if (useCuda)
            {
                nvDebugCheck(cudaSupported);
                cuda->setImage(image, inputOptions.alphaMode);
                cuda->compressDXT1(compressionOptions, outputOptions);
            }
            else
            {
                slow.compressDXT1(compressionOptions, outputOptions);
            }
        }
    }
    else if (compressionOptions.format == Format_DXT1a)
    {
@@ -828,27 +828,27 @@ int Compressor::Private::estimateSize(const InputOptions::Private & inputOptions
    const uint bitCount = compressionOptions.bitcount;

    inputOptions.computeTargetExtents();

    uint mipmapCount = inputOptions.realMipmapCount();

    int size = 0;
    for (uint f = 0; f < inputOptions.faceCount; f++)
    {
        uint w = inputOptions.targetWidth;
        uint h = inputOptions.targetHeight;
        uint d = inputOptions.targetDepth;

        for (uint m = 0; m < mipmapCount; m++)
        {
            size += computeImageSize(w, h, d, bitCount, format);

            // Compute extents of next mipmap:
            w = max(1U, w / 2);
            h = max(1U, h / 2);
            d = max(1U, d / 2);
        }
    }

    return size;
}

@@ -63,10 +63,12 @@ namespace nvtt
        bool compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;

    public:
        bool cudaSupported;
        bool cudaEnabled;
+       int cudaDevice;

        nv::AutoPtr<nv::CudaCompressor> cuda;

@@ -1,239 +1,300 @@
// Copyright NVIDIA Corporation 2007 -- Ignacio Castano <icastano@nvidia.com>
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.

#include <nvcore/Debug.h>
#include <nvcore/Library.h>

#include "CudaUtils.h"

#if defined HAVE_CUDA
#include <cuda.h>
#include <cuda_runtime_api.h>
#endif

using namespace nv;
using namespace cuda;
/* @@ Move this to win32 utils or somewhere else.
#if NV_OS_WIN32

#define WINDOWS_LEAN_AND_MEAN
#include <windows.h>

static bool isWindowsVista()
{
    OSVERSIONINFO osvi;
    osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);

    ::GetVersionEx(&osvi);

    return osvi.dwMajorVersion >= 6;
}

typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);

static bool isWow32()
{
    LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandle("kernel32"), "IsWow64Process");

    BOOL bIsWow64 = FALSE;

    if (NULL != fnIsWow64Process)
    {
        if (!fnIsWow64Process(GetCurrentProcess(), &bIsWow64))
        {
            // Assume 32 bits.
            return true;
        }
    }

    return !bIsWow64;
}

#endif
*/
static bool isCudaDriverAvailable(int version)
{
#if defined HAVE_CUDA
#if NV_OS_WIN32
    Library nvcuda("nvcuda.dll");
#else
    Library nvcuda(NV_LIBRARY_NAME(cuda));
#endif

    if (!nvcuda.isValid())
    {
        nvDebug("*** CUDA driver not found.\n");
        return false;
    }

    if (version >= 2000)
    {
        void * address = nvcuda.bindSymbol("cuStreamCreate");
        if (address == NULL) {
            nvDebug("*** CUDA driver version < 2.0.\n");
            return false;
        }
    }

    if (version >= 2010)
    {
        void * address = nvcuda.bindSymbol("cuModuleLoadDataEx");
        if (address == NULL) {
            nvDebug("*** CUDA driver version < 2.1.\n");
            return false;
        }
    }

    if (version >= 2020)
    {
        typedef CUresult (CUDAAPI * PFCU_DRIVERGETVERSION)(int * version);

        PFCU_DRIVERGETVERSION driverGetVersion = (PFCU_DRIVERGETVERSION)nvcuda.bindSymbol("cuDriverGetVersion");
        if (driverGetVersion == NULL) {
            nvDebug("*** CUDA driver version < 2.2.\n");
            return false;
        }

        int driverVersion;
        CUresult err = driverGetVersion(&driverVersion);
        if (err != CUDA_SUCCESS) {
            nvDebug("*** Error querying driver version: '%s'.\n", cudaGetErrorString((cudaError_t)err));
            return false;
        }

        return driverVersion >= version;
    }
#endif // HAVE_CUDA

    return true;
}
/// Determine if CUDA is available.
bool nv::cuda::isHardwarePresent()
{
#if defined HAVE_CUDA
    // Make sure that CUDA driver matches CUDA runtime.
    if (!isCudaDriverAvailable(CUDART_VERSION))
    {
        nvDebug("CUDA driver not available for CUDA runtime %d\n", CUDART_VERSION);
        return false;
    }

    int count = deviceCount();
    if (count == 1)
    {
        // Make sure it's not an emulation device.
        cudaDeviceProp deviceProp;
        cudaGetDeviceProperties(&deviceProp, 0);

        // deviceProp.name != Device Emulation (CPU)
        if (deviceProp.major == -1 || deviceProp.minor == -1)
        {
            return false;
        }
    }
    // @@ Make sure that warp size == 32
+   // @@ Make sure available GPU is faster than the CPU.

    return count > 0;
#else
    return false;
#endif
}

/// Get number of CUDA enabled devices.
int nv::cuda::deviceCount()
{
#if defined HAVE_CUDA
    int gpuCount = 0;

    cudaError_t result = cudaGetDeviceCount(&gpuCount);

    if (result == cudaSuccess)
    {
        return gpuCount;
    }
#endif
    return 0;
}

+// Make sure device meets requirements:
+// - Not an emulation device.
+// - Not an integrated device?
+// - Faster than CPU.
+bool nv::cuda::isValidDevice(int i)
+{
+#if defined HAVE_CUDA
+   cudaDeviceProp device_properties;
+   cudaGetDeviceProperties(&device_properties, i);
+   int gflops = device_properties.multiProcessorCount * device_properties.clockRate;
+
+   if (device_properties.major == -1 || device_properties.minor == -1) {
+       // Emulation device.
+       return false;
+   }
+
+#if CUDART_VERSION >= 2030 // 2.3
+   /*if (device_properties.integrated)
+   {
+       // Integrated devices.
+       return false;
+   }*/
+#endif
+
+   return true;
+#else
+   return false;
+#endif
+}

int nv::cuda::getFastestDevice()
{
-   int max_gflops_device = 0;
+   int max_gflops_device = -1;
#if defined HAVE_CUDA
    int max_gflops = 0;

    const int device_count = deviceCount();
-   int current_device = 0;
-   while (current_device < device_count)
+   for (int i = 0; i < device_count; i++)
    {
-       cudaDeviceProp device_properties;
-       cudaGetDeviceProperties(&device_properties, current_device);
-       int gflops = device_properties.multiProcessorCount * device_properties.clockRate;
-
-       if (device_properties.major != -1 && device_properties.minor != -1)
+       if (isValidDevice(i))
        {
-           if( gflops > max_gflops )
+           cudaDeviceProp device_properties;
+           cudaGetDeviceProperties(&device_properties, i);
+           int gflops = device_properties.multiProcessorCount * device_properties.clockRate;
+
+           if (gflops > max_gflops)
            {
                max_gflops = gflops;
-               max_gflops_device = current_device;
+               max_gflops_device = i;
            }
        }
-
-       current_device++;
    }
#endif
    return max_gflops_device;
}

/// Activate the given devices.
-bool nv::cuda::setDevice(int i)
+bool nv::cuda::initDevice(int * device_ptr)
{
-   nvCheck(i < deviceCount());
+   nvDebugCheck(device_ptr != NULL);
#if defined HAVE_CUDA
-   cudaError_t result = cudaSetDevice(i);
-
-   if (result != cudaSuccess) {
-       nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result));
-   }
-
-   return result == cudaSuccess;
+#if CUDART_VERSION >= 2030 // 2.3
+   // Set device flags to yield in order to play nice with other threads and to find out if CUDA was already active.
+   cudaError_t resul = cudaSetDeviceFlags(cudaDeviceScheduleYield);
+#endif
+
+   int device = getFastestDevice();
+
+   if (device == -1)
+   {
+       // No device is fast enough.
+       *device_ptr = -1;
+       return false;
+   }
+
+   // Select CUDA device.
+   cudaError_t result = cudaSetDevice(device);
+
+   if (result == cudaErrorSetOnActiveProcess)
+   {
+       int device;
+       result = cudaGetDevice(&device);
+
+       *device_ptr = -1;               // No device to cleanup.
+       return isValidDevice(device);   // Return true if device is valid.
+   }
+   else if (result != cudaSuccess)
+   {
+       nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result));
+       *device_ptr = -1;
+       return false;
+   }
+
+   *device_ptr = device;
+   return true;
#else
    return false;
#endif
}

-void nv::cuda::exit()
+void nv::cuda::exitDevice()
{
#if defined HAVE_CUDA
    cudaError_t result = cudaThreadExit();

    if (result != cudaSuccess) {
        nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result));
    }
#endif
}
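
A small standalone sketch of the detection that initDevice() leans on, assuming the CUDA 2.x runtime behavior this file targets: once a context is active in the process, a further cudaSetDevice() reports cudaErrorSetOnActiveProcess, and cudaGetDevice() tells us which device the application is already using. Illustrative only, not part of the commit.

    #include <cuda_runtime_api.h>
    #include <cstdio>

    int main()
    {
        cudaSetDevice(0);       // application selects a device
        cudaFree(0);            // common idiom to force context creation

        // A later attempt to select a device fails with a distinctive error code.
        cudaError_t err = cudaSetDevice(0);
        if (err == cudaErrorSetOnActiveProcess)
        {
            int device = -1;
            cudaGetDevice(&device);
            printf("CUDA already active on device %d, adopting it\n", device);
        }
        return 0;
    }
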

@@ -32,8 +32,10 @@ namespace nv
        bool isHardwarePresent();
        int deviceCount();
        int getFastestDevice();
-       bool setDevice(int i);
-       void exit();
+       bool isValidDevice(int i);
+
+       bool initDevice(int * device_ptr);
+       void exitDevice();
    };

} // nv namespace
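
The renamed entry points are meant to be used in pairs, mirroring what Compressor::enableCudaAcceleration() does above. A short usage sketch against this header (illustrative):

    int device = -1;
    if (nv::cuda::initDevice(&device))   // true if a device was set up, or if the app's active device is valid
    {
        // ... run CUDA work ...

        if (device != -1)                // only tear down a device that we initialized ourselves
        {
            nv::cuda::exitDevice();
        }
    }
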

@@ -194,7 +194,7 @@ namespace nvtt
        // Describe the format of the input.
        NVTT_API void setFormat(InputFormat format);

-       // Set the way the input alpha channel is interpreted. @@ Not implemented!
+       // Set the way the input alpha channel is interpreted.
        NVTT_API void setAlphaMode(AlphaMode alphaMode);

        // Set gamma settings.

@@ -33,6 +33,9 @@
#include <time.h> // clock

+#include <c:\CUDA\include\cuda_runtime.h>
+#pragma comment(lib, "c:\\CUDA\\lib\\cudart.lib")
+
//#define WINDOWS_LEAN_AND_MEAN
//#include <windows.h> // TIMER

@@ -413,6 +416,8 @@ int main(int argc, char *argv[])
        return EXIT_FAILURE;
    }

+   cudaSetDevice(0);
+
    nvtt::Compressor compressor;
    compressor.enableCudaAcceleration(!nocuda);
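
The unconditional cudaSetDevice(0) added to the tool presumably exercises the new code path in which the process already owns a CUDA device before the compressor is created, matching the first line of the commit message.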
