Do not fail when the process is already using CUDA.

Attempt to use the selected CUDA device. Stricter device selection.
2009-11-04 06:16:03 +00:00
parent 6e9feef6f4
commit 93625c7de8
6 changed files with 390 additions and 320 deletions
--- a/src/nvtt/Compressor.cpp
+++ b/src/nvtt/Compressor.cpp
@ -153,7 +153,7 @@ namespace nvtt
 				if (inputOptions.isNormalMap)
 				{
 					// Expand normals to [-1, 1] range.
-				//	floatImage->expandNormals(0);
+					//	floatImage->expandNormals(0);
 				}
 				else if (inputOptions.inputGamma != 1.0f)
 				{
@ -207,28 +207,16 @@ Compressor::Compressor() : m(*new Compressor::Private())
 {
 	// CUDA initialization.
 	m.cudaSupported = cuda::isHardwarePresent();
-	m.cudaEnabled = m.cudaSupported;
+	m.cudaEnabled = false;
 	m.cudaDevice = -1;
-	if (m.cudaEnabled)
+	enableCudaAcceleration(m.cudaSupported);
 	{
 		// Select fastest CUDA device.
 		int device = cuda::getFastestDevice();
 		cuda::setDevice(device);
 		m.cuda = new CudaCompressor();
 		if (!m.cuda->isValid())
 		{
 			m.cudaEnabled = false;
 			m.cuda = NULL;
 		}
 	}
 }
 Compressor::~Compressor()
 {
 	enableCudaAcceleration(false);
 	delete &m;
 	cuda::exit();
 }
@ -237,21 +225,33 @@ void Compressor::enableCudaAcceleration(bool enable)
 {
 	if (m.cudaSupported)
 	{
-		m.cudaEnabled = enable;
+		if (m.cudaEnabled && !enable)
 	}
 	if (m.cudaEnabled && m.cuda == NULL)
 	{
 		// Select fastest CUDA device.
 		int device = cuda::getFastestDevice();
 		cuda::setDevice(device);
 		m.cuda = new CudaCompressor();
 		if (!m.cuda->isValid())
 		{
 			m.cudaEnabled = false;
 			m.cuda = NULL;
 			if (m.cudaDevice != -1)
 			{
 				// Exit device.
 				cuda::exitDevice();
 			}
 		}
 		else if (!m.cudaEnabled && enable)
 		{
 			// Init the CUDA device. This may return -1 if CUDA was already initialized by the app.
 			m.cudaEnabled = cuda::initDevice(&m.cudaDevice);
 			if (m.cudaEnabled)
 			{
 				// Create compressor if initialization succeeds.
 				m.cuda = new CudaCompressor();
 				// But cleanup if failed.
 				if (!m.cuda->isValid())
 				{
 					enableCudaAcceleration(false);
 				}
 			}
 		}
 	}
 }
@ -341,8 +341,8 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
 		header.setTextureCube();
 	}		
 	/*else if (inputOptions.textureType == TextureType_3D) {
-		header.setTexture3D();
+	header.setTexture3D();
-		header.setDepth(inputOptions.targetDepth);
+	header.setDepth(inputOptions.targetDepth);
 	}*/
 	if (compressionOptions.format == Format_RGBA)
@ -715,29 +715,29 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
 #endif
 #if defined(HAVE_ATITC)
-		if (compressionOptions.externalCompressor == "ati")
+			if (compressionOptions.externalCompressor == "ati")
 		{
 			atiCompressDXT1(image, outputOptions);
 		}
 		else
 #endif
 		if (compressionOptions.quality == Quality_Fastest)
 		{
 			fast.compressDXT1(outputOptions);
 		}
 		else
 		{
 			if (useCuda)
 			{
-				nvDebugCheck(cudaSupported);
+				atiCompressDXT1(image, outputOptions);
 				cuda->setImage(image, inputOptions.alphaMode);
 				cuda->compressDXT1(compressionOptions, outputOptions);
 			}
 			else
-			{
+#endif
-				slow.compressDXT1(compressionOptions, outputOptions);
+				if (compressionOptions.quality == Quality_Fastest)
-			}
+				{
-		}
+					fast.compressDXT1(outputOptions);
 				}
 				else
 				{
 					if (useCuda)
 					{
 						nvDebugCheck(cudaSupported);
 						cuda->setImage(image, inputOptions.alphaMode);
 						cuda->compressDXT1(compressionOptions, outputOptions);
 					}
 					else
 					{
 						slow.compressDXT1(compressionOptions, outputOptions);
 					}
 				}
 	}
 	else if (compressionOptions.format == Format_DXT1a)
 	{
--- a/src/nvtt/Compressor.h
+++ b/src/nvtt/Compressor.h
@ -63,10 +63,12 @@ namespace nvtt
 		bool compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;
 	public:
 		bool cudaSupported;
 		bool cudaEnabled;
 		int cudaDevice;
 		nv::AutoPtr<nv::CudaCompressor> cuda;
--- a/src/nvtt/cuda/CudaUtils.cpp
+++ b/src/nvtt/cuda/CudaUtils.cpp
@ -41,11 +41,11 @@ using namespace cuda;
 static bool isWindowsVista()
 {
-	OSVERSIONINFO osvi;
+OSVERSIONINFO osvi;
-	osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
-	::GetVersionEx(&osvi);
+::GetVersionEx(&osvi);
-	return osvi.dwMajorVersion >= 6;
+return osvi.dwMajorVersion >= 6;
 }
@ -53,20 +53,20 @@ typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);
 static bool isWow32()
 {
-	LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandle("kernel32"), "IsWow64Process");
+LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandle("kernel32"), "IsWow64Process");
-	BOOL bIsWow64 = FALSE;
+BOOL bIsWow64 = FALSE;
-	if (NULL != fnIsWow64Process)
+if (NULL != fnIsWow64Process)
-	{
+{
-		if (!fnIsWow64Process(GetCurrentProcess(), &bIsWow64))
+if (!fnIsWow64Process(GetCurrentProcess(), &bIsWow64))
-		{
+{
-			// Assume 32 bits.
+// Assume 32 bits.
-			return true;
+return true;
-		}
+}
-	}
+}
-	return !bIsWow64;
+return !bIsWow64;
 }
 #endif
@ -158,6 +158,8 @@ bool nv::cuda::isHardwarePresent()
 	// @@ Make sure that warp size == 32
 	// @@ Make sure available GPU is faster than the CPU.
 	return count > 0;
 #else
 	return false;
@ -180,30 +182,59 @@ int nv::cuda::deviceCount()
 	return 0;
 }
 // Check whether CUDA device `i` meets the requirements:
 // - Not an emulation device (reported with major == -1 or minor == -1).
 // - Not an integrated device? (check currently disabled)
 // - Faster than CPU. (not checked yet)
 // Returns false if the device properties cannot be queried, and always
 // returns false when built without HAVE_CUDA.
 bool nv::cuda::isValidDevice(int i)
 {
 #if defined HAVE_CUDA
 	cudaDeviceProp device_properties;
 	cudaError_t result = cudaGetDeviceProperties(&device_properties, i);
 	if (result != cudaSuccess) {
 		// The properties struct is not filled in on failure; do not inspect it.
 		nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result));
 		return false;
 	}
 	if (device_properties.major == -1 || device_properties.minor == -1) {
 		// Emulation device.
 		return false;
 	}
 #if CUDART_VERSION >= 2030 // 2.3
 	/*if (device_properties.integrated)
 	{
 		// Integrated devices.
 		return false;
 	}*/
 #endif
 	return true;
 #else
 	return false;
 #endif
 }
 int nv::cuda::getFastestDevice()
 {
-	int max_gflops_device = 0;
+	int max_gflops_device = -1;
 #if defined HAVE_CUDA
 	int max_gflops = 0;
 	const int device_count = deviceCount();
-	int current_device = 0;
+	for (int i = 0; i < device_count; i++)
 	while (current_device < device_count)
 	{
-		cudaDeviceProp device_properties;
+		if (isValidDevice(i))
 		cudaGetDeviceProperties(&device_properties, current_device);
 		int gflops = device_properties.multiProcessorCount * device_properties.clockRate;
 		if (device_properties.major != -1 && device_properties.minor != -1)
 		{
-			if( gflops > max_gflops )
+			cudaDeviceProp device_properties;
 			cudaGetDeviceProperties(&device_properties, i);
 			int gflops = device_properties.multiProcessorCount * device_properties.clockRate;
 			if (gflops > max_gflops)
 			{
 				max_gflops = gflops;
-				max_gflops_device = current_device;
+				max_gflops_device = i;
 			}
 		}
 		current_device++;
 	}
 #endif
 	return max_gflops_device;
@ -211,23 +242,53 @@ int nv::cuda::getFastestDevice()
 /// Activate the given devices.
-bool nv::cuda::setDevice(int i)
+bool nv::cuda::initDevice(int * device_ptr)
 {
-	nvCheck(i < deviceCount());
+	nvDebugCheck(device_ptr != NULL);
 #if defined HAVE_CUDA
 	cudaError_t result = cudaSetDevice(i);
-	if (result != cudaSuccess) {
+#if CUDART_VERSION >= 2030 // 2.3
-		nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result));
+
 	// Set device flags to yield in order to play nice with other threads and to find out if CUDA was already active.
 	cudaError_t resul = cudaSetDeviceFlags(cudaDeviceScheduleYield);
 #endif
 	int device = getFastestDevice();
 	if (device == -1)
 	{
 		// No device is fast enough.
 		*device_ptr = -1;
 		return false;
 	}
-	return result == cudaSuccess;
+	// Select CUDA device.
 	cudaError_t result = cudaSetDevice(device);
 	if (result == cudaErrorSetOnActiveProcess)
 	{
 		int device;
 		result = cudaGetDevice(&device);
 		*device_ptr = -1;  // No device to cleanup.
 		return isValidDevice(device); // Return true if device is valid.
 	}
 	else if (result != cudaSuccess)
 	{
 		nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result));
 		*device_ptr = -1;
 		return false;
 	}
 	*device_ptr = device;
 	return true;
 #else
 	return false;
 #endif
 }
-void nv::cuda::exit()
+void nv::cuda::exitDevice()
 {
 #if defined HAVE_CUDA
 	cudaError_t result = cudaThreadExit();
--- a/src/nvtt/cuda/CudaUtils.h
+++ b/src/nvtt/cuda/CudaUtils.h
@ -32,8 +32,10 @@ namespace nv
 		bool isHardwarePresent();
 		int deviceCount();
 		int getFastestDevice();
-		bool setDevice(int i);
+		bool isValidDevice(int i);
-		void exit();
+
 		bool initDevice(int * device_ptr);
 		void exitDevice();
 	};
 } // nv namespace
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@ -194,7 +194,7 @@ namespace nvtt
 		// Describe the format of the input.
 		NVTT_API void setFormat(InputFormat format);
-		// Set the way the input alpha channel is interpreted. @@ Not implemented!
+		// Set the way the input alpha channel is interpreted.
 		NVTT_API void setAlphaMode(AlphaMode alphaMode);
 		// Set gamma settings.
--- a/src/nvtt/tools/compress.cpp
+++ b/src/nvtt/tools/compress.cpp
@ -33,6 +33,9 @@
 #include <time.h> // clock
 #include <c:\CUDA\include\cuda_runtime.h>
 #pragma comment(lib, "c:\\CUDA\\lib\\cudart.lib")
 //#define WINDOWS_LEAN_AND_MEAN
 //#include <windows.h> // TIMER
@ -413,6 +416,8 @@ int main(int argc, char *argv[])
 		return EXIT_FAILURE;
 	}
 	cudaSetDevice(0);
 	nvtt::Compressor compressor;
 	compressor.enableCudaAcceleration(!nocuda);