Do not fail when the process is already using CUDA.

Attempt to use the CUDA device already selected by the process. Apply stricter device selection.
2009-11-04 06:16:03 +00:00
parent 6e9feef6f4
commit 93625c7de8
6 changed files with 390 additions and 320 deletions
--- a/src/nvtt/Compressor.cpp
+++ b/src/nvtt/Compressor.cpp
@ -153,7 +153,7 @@ namespace nvtt
 				if (inputOptions.isNormalMap)
 				{
 					// Expand normals to [-1, 1] range.
-				//	floatImage->expandNormals(0);
+					//	floatImage->expandNormals(0);
 				}
 				else if (inputOptions.inputGamma != 1.0f)
 				{
@ -207,28 +207,16 @@ Compressor::Compressor() : m(*new Compressor::Private())
 {
 	// CUDA initialization.
 	m.cudaSupported = cuda::isHardwarePresent();
-	m.cudaEnabled = m.cudaSupported;
+	m.cudaEnabled = false;
+	m.cudaDevice = -1;

-	if (m.cudaEnabled)
-	{
-		// Select fastest CUDA device.
-		int device = cuda::getFastestDevice();
-		cuda::setDevice(device);
-		
-		m.cuda = new CudaCompressor();
-
-		if (!m.cuda->isValid())
-		{
-			m.cudaEnabled = false;
-			m.cuda = NULL;
-		}
-	}
+	enableCudaAcceleration(m.cudaSupported);
 }

 Compressor::~Compressor()
 {
+	enableCudaAcceleration(false);
 	delete &m;
-	cuda::exit();
 }


@ -237,21 +225,33 @@ void Compressor::enableCudaAcceleration(bool enable)
 {
 	if (m.cudaSupported)
 	{
-		m.cudaEnabled = enable;
-	}
-
-	if (m.cudaEnabled && m.cuda == NULL)
-	{
-		// Select fastest CUDA device.
-		int device = cuda::getFastestDevice();
-		cuda::setDevice(device);
-		
-		m.cuda = new CudaCompressor();
-		
-		if (!m.cuda->isValid())
+		if (m.cudaEnabled && !enable)
 		{
 			m.cudaEnabled = false;
 			m.cuda = NULL;
+
+			if (m.cudaDevice != -1)
+			{
+				// Exit device.
+				cuda::exitDevice();
+			}
+		}
+		else if (!m.cudaEnabled && enable)
+		{
+			// Init the CUDA device. This may return -1 if CUDA was already initialized by the app.
+			m.cudaEnabled = cuda::initDevice(&m.cudaDevice);
+
+			if (m.cudaEnabled)
+			{
+				// Create compressor if initialization succeeds.
+				m.cuda = new CudaCompressor();
+
+				// But cleanup if failed.
+				if (!m.cuda->isValid())
+				{
+					enableCudaAcceleration(false);
+				}
+			}
 		}
 	}
 }
@ -341,8 +341,8 @@ bool Compressor::Private::outputHeader(const InputOptions::Private & inputOption
 		header.setTextureCube();
 	}		
 	/*else if (inputOptions.textureType == TextureType_3D) {
-		header.setTexture3D();
-		header.setDepth(inputOptions.targetDepth);
+	header.setTexture3D();
+	header.setDepth(inputOptions.targetDepth);
 	}*/

 	if (compressionOptions.format == Format_RGBA)
@ -715,29 +715,29 @@ bool Compressor::Private::compressMipmap(const Mipmap & mipmap, const InputOptio
 #endif

 #if defined(HAVE_ATITC)
-		if (compressionOptions.externalCompressor == "ati")
-		{
-			atiCompressDXT1(image, outputOptions);
-		}
-		else
-#endif
-		if (compressionOptions.quality == Quality_Fastest)
-		{
-			fast.compressDXT1(outputOptions);
-		}
-		else
-		{
-			if (useCuda)
+			if (compressionOptions.externalCompressor == "ati")
 			{
-				nvDebugCheck(cudaSupported);
-				cuda->setImage(image, inputOptions.alphaMode);
-				cuda->compressDXT1(compressionOptions, outputOptions);
+				atiCompressDXT1(image, outputOptions);
 			}
 			else
-			{
-				slow.compressDXT1(compressionOptions, outputOptions);
-			}
-		}
+#endif
+				if (compressionOptions.quality == Quality_Fastest)
+				{
+					fast.compressDXT1(outputOptions);
+				}
+				else
+				{
+					if (useCuda)
+					{
+						nvDebugCheck(cudaSupported);
+						cuda->setImage(image, inputOptions.alphaMode);
+						cuda->compressDXT1(compressionOptions, outputOptions);
+					}
+					else
+					{
+						slow.compressDXT1(compressionOptions, outputOptions);
+					}
+				}
 	}
 	else if (compressionOptions.format == Format_DXT1a)
 	{
--- a/src/nvtt/Compressor.h
+++ b/src/nvtt/Compressor.h
@ -63,10 +63,12 @@ namespace nvtt
 		bool compressMipmap(const Mipmap & mipmap, const InputOptions::Private & inputOptions, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) const;


+
 	public:

 		bool cudaSupported;
 		bool cudaEnabled;
+		int cudaDevice;

 		nv::AutoPtr<nv::CudaCompressor> cuda;

--- a/src/nvtt/cuda/CudaUtils.cpp
+++ b/src/nvtt/cuda/CudaUtils.cpp
@ -41,11 +41,11 @@ using namespace cuda;

 static bool isWindowsVista()
 {
-	OSVERSIONINFO osvi;
-	osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+OSVERSIONINFO osvi;
+osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);

-	::GetVersionEx(&osvi);
-	return osvi.dwMajorVersion >= 6;
+::GetVersionEx(&osvi);
+return osvi.dwMajorVersion >= 6;
 }


@ -53,20 +53,20 @@ typedef BOOL (WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);

 static bool isWow32()
 {
-	LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandle("kernel32"), "IsWow64Process");
+LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandle("kernel32"), "IsWow64Process");

-	BOOL bIsWow64 = FALSE;
+BOOL bIsWow64 = FALSE;

-	if (NULL != fnIsWow64Process)
-	{
-		if (!fnIsWow64Process(GetCurrentProcess(), &bIsWow64))
-		{
-			// Assume 32 bits.
-			return true;
-		}
-	}
+if (NULL != fnIsWow64Process)
+{
+if (!fnIsWow64Process(GetCurrentProcess(), &bIsWow64))
+{
+// Assume 32 bits.
+return true;
+}
+}

-	return !bIsWow64;
+return !bIsWow64;
 }

 #endif
@ -158,6 +158,8 @@ bool nv::cuda::isHardwarePresent()

 	// @@ Make sure that warp size == 32

+	// @@ Make sure available GPU is faster than the CPU.
+
 	return count > 0;
 #else
 	return false;
@ -180,30 +182,59 @@ int nv::cuda::deviceCount()
 	return 0;
 }

+
+// Make sure device meets requirements:
+// - Not an emulation device.
+// - Not an integrated device?
+// - Faster than CPU.
+bool nv::cuda::isValidDevice(int i)
+{
+#if defined HAVE_CUDA
+
+	cudaDeviceProp device_properties;
+	cudaGetDeviceProperties(&device_properties, i);
+	int gflops = device_properties.multiProcessorCount * device_properties.clockRate;
+
+	if (device_properties.major == -1 || device_properties.minor == -1) {
+		// Emulation device.
+		return false;
+	}
+
+#if CUDART_VERSION >= 2030 // 2.3
+	/*if (device_properties.integrated)
+	{
+		// Integrated devices.
+		return false;
+	}*/
+#endif
+
+	return true;
+#else
+	return false;
+#endif
+}
+
 int nv::cuda::getFastestDevice()
 {
-	int max_gflops_device = 0;
+	int max_gflops_device = -1;
 #if defined HAVE_CUDA
 	int max_gflops = 0;

 	const int device_count = deviceCount();
-	int current_device = 0;
-	while (current_device < device_count)
+	for (int i = 0; i < device_count; i++)
 	{
-		cudaDeviceProp device_properties;
-		cudaGetDeviceProperties(&device_properties, current_device);
-		int gflops = device_properties.multiProcessorCount * device_properties.clockRate;
-
-		if (device_properties.major != -1 && device_properties.minor != -1)
+		if (isValidDevice(i))
 		{
-			if( gflops > max_gflops )
+			cudaDeviceProp device_properties;
+			cudaGetDeviceProperties(&device_properties, i);
+			int gflops = device_properties.multiProcessorCount * device_properties.clockRate;
+
+			if (gflops > max_gflops)
 			{
 				max_gflops = gflops;
-				max_gflops_device = current_device;
+				max_gflops_device = i;
 			}
 		}
-		
-		current_device++;
 	}
 #endif
 	return max_gflops_device;
@ -211,23 +242,53 @@ int nv::cuda::getFastestDevice()


 /// Activate the given devices.
-bool nv::cuda::setDevice(int i)
+bool nv::cuda::initDevice(int * device_ptr)
 {
-	nvCheck(i < deviceCount());
+	nvDebugCheck(device_ptr != NULL);
 #if defined HAVE_CUDA
-	cudaError_t result = cudaSetDevice(i);

-	if (result != cudaSuccess) {
-		nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result));
+#if CUDART_VERSION >= 2030 // 2.3
+
+	// Set device flags to yield in order to play nice with other threads and to find out if CUDA was already active.
+	cudaError_t resul = cudaSetDeviceFlags(cudaDeviceScheduleYield);
+
+#endif
+
+	int device = getFastestDevice();
+
+	if (device == -1)
+	{
+		// No device is fast enough.
+		*device_ptr = -1;
+		return false;
 	}

-	return result == cudaSuccess;
+	// Select CUDA device.
+	cudaError_t result = cudaSetDevice(device);
+
+	if (result == cudaErrorSetOnActiveProcess)
+	{
+		int device;
+		result = cudaGetDevice(&device);
+
+		*device_ptr = -1;  // No device to cleanup.
+		return isValidDevice(device); // Return true if device is valid.
+	}
+	else if (result != cudaSuccess)
+	{
+		nvDebug("*** CUDA Error: %s\n", cudaGetErrorString(result));
+		*device_ptr = -1;
+		return false;
+	}
+
+	*device_ptr = device;
+	return true;
 #else
 	return false;
 #endif
 }

-void nv::cuda::exit()
+void nv::cuda::exitDevice()
 {
 #if defined HAVE_CUDA
 	cudaError_t result = cudaThreadExit();
--- a/src/nvtt/cuda/CudaUtils.h
+++ b/src/nvtt/cuda/CudaUtils.h
@ -32,8 +32,10 @@ namespace nv
 		bool isHardwarePresent();
 		int deviceCount();
 		int getFastestDevice();
-		bool setDevice(int i);
-		void exit();
+		bool isValidDevice(int i);
+
+		bool initDevice(int * device_ptr);
+		void exitDevice();
 	};
 	
 } // nv namespace
--- a/src/nvtt/nvtt.h
+++ b/src/nvtt/nvtt.h
@ -194,7 +194,7 @@ namespace nvtt
 		// Describe the format of the input.
 		NVTT_API void setFormat(InputFormat format);
 		
-		// Set the way the input alpha channel is interpreted. @@ Not implemented!
+		// Set the way the input alpha channel is interpreted.
 		NVTT_API void setAlphaMode(AlphaMode alphaMode);
 		
 		// Set gamma settings.
--- a/src/nvtt/tools/compress.cpp
+++ b/src/nvtt/tools/compress.cpp
@ -33,6 +33,9 @@

 #include <time.h> // clock

+#include <c:\CUDA\include\cuda_runtime.h>
+#pragma comment(lib, "c:\\CUDA\\lib\\cudart.lib")
+
 //#define WINDOWS_LEAN_AND_MEAN
 //#include <windows.h> // TIMER

@ -413,6 +416,8 @@ int main(int argc, char *argv[])
 		return EXIT_FAILURE;
 	}

+	cudaSetDevice(0);
+
 	nvtt::Compressor compressor;
 	compressor.enableCudaAcceleration(!nocuda);