diff --git a/src/nvcore/Utils.h b/src/nvcore/Utils.h
index 6c11185..68167be 100644
--- a/src/nvcore/Utils.h
+++ b/src/nvcore/Utils.h
@@ -25,6 +25,12 @@ namespace nv
     inline uint32 asUnsigned(int32 x) { return (uint32) x; }
     inline uint64 asUnsigned(int64 x) { return (uint64) x; }
 
+    template <typename T> inline uint32 toU32(T x) {   // Checked narrowing conversion of x to uint32.
+        nvDebugCheck(x <= UINT32_MAX);                  // Upper bound: x must fit in 32 bits.
+        nvDebugCheck(x >= 0);                           // Lower bound: x must not be negative.
+        return (uint32) x;                              // Plain cast; the range is validated only by the nvDebugCheck calls above.
+    }
+
     /*
     template <typename T> inline int8 toI8(T x) { 
         nvDebugCheck(x <= INT8_MAX);
diff --git a/src/nvthread/Atomic.h b/src/nvthread/Atomic.h
index fbc2694..8315d81 100644
--- a/src/nvthread/Atomic.h
+++ b/src/nvthread/Atomic.h
@@ -34,16 +34,16 @@ extern "C"
 namespace nv {
 
     // Load and stores.
-	inline uint32 loadRelaxed(const uint32 * ptr) { return *ptr; }
-	inline void storeRelaxed(uint32 * ptr, uint32 value) { *ptr = value; }
+    inline uint32 loadRelaxed(const uint32 * ptr) { return *ptr; }
+    inline void storeRelaxed(uint32 * ptr, uint32 value) { *ptr = value; }
 
-	inline uint32 loadAcquire(const volatile uint32 * ptr)
+    inline uint32 loadAcquire(const volatile uint32 * ptr)
     {
         nvDebugCheck((intptr_t(ptr) & 3) == 0);
 
 #if POSH_CPU_X86 || POSH_CPU_X86_64
         nvCompilerReadBarrier();
-		uint32 ret = *ptr;  // on x86, loads are Acquire
+        uint32 ret = *ptr;  // on x86, loads are Acquire
         nvCompilerReadBarrier();
         return ret;
 #else
@@ -51,17 +51,17 @@ namespace nv {
 #endif
     }
 
-	inline void storeRelease(volatile uint32 * ptr, uint32 value)
+    inline void storeRelease(volatile uint32 * ptr, uint32 value)
     {
         nvDebugCheck((intptr_t(ptr) & 3) == 0);
         nvDebugCheck((intptr_t(&value) & 3) == 0);
 
 #if POSH_CPU_X86 || POSH_CPU_X86_64
         nvCompilerWriteBarrier();
-		*ptr = value;   // on x86, stores are Release
-		nvCompilerWriteBarrier();
+        *ptr = value;   // on x86, stores are Release
+        nvCompilerWriteBarrier();
 #else
-#error "Not implemented"
+#error "Atomics not implemented."
 #endif
     }
 
@@ -84,6 +84,25 @@ namespace nv {
 
         return (uint32)_InterlockedDecrement((long *)value);
     }
+#elif NV_CC_GNUC
+    // Many alternative implementations at:
+    // http://www.memoryhole.net/kyle/2007/05/atomic_incrementing.html
+
+    inline uint32 atomicIncrement(uint32 * value)   // Atomically adds 1 to *value and returns the incremented value.
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);   // value must be 4-byte aligned.
+
+        return __sync_add_and_fetch(value, 1);      // add_and_fetch yields the NEW value; fetch_and_add would yield the old one.
+    }
+
+    inline uint32 atomicDecrement(uint32 * value)   // Atomically subtracts 1 from *value and returns the decremented value.
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);   // value must be 4-byte aligned.
+
+        return __sync_sub_and_fetch(value, 1);      // Same result as _InterlockedDecrement in the MSVC branch (the new value).
+    }
+#else
+#error "Atomics not implemented."
 #endif
 
 
@@ -107,19 +126,19 @@ namespace nv {
 
 
 
-	template <typename T>
-	class Atomic
-	{
-	public:
-		explicit Atomic()  : m_value() { }
-		explicit Atomic( T val ) : m_value(val) { }
-		~Atomic() { }
-		
-		T loadRelaxed()  const { return m_value; }
-		void storeRelaxed(T val) { m_value = val; }
+    template <typename T>
+    class Atomic
+    {
+    public:
+        explicit Atomic()  : m_value() { }
+        explicit Atomic( T val ) : m_value(val) { }
+        ~Atomic() { }
+
+        T loadRelaxed()  const { return m_value; }
+        void storeRelaxed(T val) { m_value = val; }
 
         //T loadAcquire() const volatile { return nv::loadAcquire(&m_value); }
-		//void storeRelease(T val) volatile { nv::storeRelease(&m_value, val); }
+        //void storeRelease(T val) volatile { nv::storeRelease(&m_value, val); }
 
         void increment() /*volatile*/ { nv::atomicIncrement(m_value); }
         void decrement() /*volatile*/ { nv::atomicDecrement(m_value); }
@@ -128,14 +147,14 @@ namespace nv {
         T compareAndExchange(T oldVal, T newVal) { nv::atomicCompareAndStore(&m_value, oldVal, newVal); }
         T exchange(T newVal) { nv::atomicExchange(&m_value, newVal); }
 
-	private:
-		// don't provide operator = or == ; make the client write Store( Load() )
-		NV_FORBID_COPY(Atomic);
+    private:
+        // don't provide operator = or == ; make the client write Store( Load() )
+        NV_FORBID_COPY(Atomic);
 		
-		NV_COMPILER_CHECK(sizeof(T) == sizeof(uint32) || sizeof(T) == sizeof(uint64));
-		
-		T m_value;
-	};
+	NV_COMPILER_CHECK(sizeof(T) == sizeof(uint32) || sizeof(T) == sizeof(uint64));
+
+        T m_value;
+    };
 #endif
 
 } // nv namespace 
diff --git a/src/nvthread/CMakeLists.txt b/src/nvthread/CMakeLists.txt
index 435141c..53f9a67 100644
--- a/src/nvthread/CMakeLists.txt
+++ b/src/nvthread/CMakeLists.txt
@@ -1,11 +1,13 @@
 PROJECT(nvthreads)
 
 SET(THREADS_SRCS
-	nvthreads.h
+	nvthread.h nvthread.cpp
+	Atomic.h
+	Event.h Event.cpp
 	Mutex.h Mutex.cpp
-	SpinWaiter.h SpinWaiter.cpp
+	ParallelFor.h ParallelFor.cpp
 	Thread.h Thread.cpp
-	ThreadLocalStorage.h ThreadLocalStorage.cpp)
+	ThreadPool.h ThreadPool.cpp)
 
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 
diff --git a/src/nvthread/Event.cpp b/src/nvthread/Event.cpp
index d39f54c..a86893d 100644
--- a/src/nvthread/Event.cpp
+++ b/src/nvthread/Event.cpp
@@ -13,7 +13,7 @@ using namespace nv;
 #if NV_OS_WIN32
 
 struct Event::Private {
-	HANDLE handle;
+    HANDLE handle;
 };
 
 Event::Event() : m(new Private) {
@@ -48,5 +48,6 @@ void Event::wait() {
 }
 
 #elif NV_OS_UNIX
-    // @@ 
+    // @@ TODO: Event is not implemented for Unix yet; this branch defines nothing.
+#pragma NV_MESSAGE("Implement event using pthreads!")
 #endif	
diff --git a/src/nvthread/Event.h b/src/nvthread/Event.h
index c8ff1d0..1e738a8 100644
--- a/src/nvthread/Event.h
+++ b/src/nvthread/Event.h
@@ -11,23 +11,23 @@
 namespace nv
 {
     // This is intended to be used by a single waiter thread.
-	class NVTHREAD_CLASS Event
-	{
-		NV_FORBID_COPY(Event);
-	public:
-		Event();
-		~Event();
+    class NVTHREAD_CLASS Event
+    {
+        NV_FORBID_COPY(Event);
+    public:
+        Event();
+        ~Event();
 
-		void post();
-		void wait();    // Wait resets the event.
+        void post();
+        void wait();    // Wait resets the event.
 
         static void post(Event * events, uint count);
         static void wait(Event * events, uint count);
 
-	private:
-		struct Private;
-		AutoPtr<Private> m;
-	};
+    private:
+        struct Private;         // Platform-specific state; defined in Event.cpp.
+        AutoPtr<Private> m;     // Owning pointer to that state.
+    };
 
 } // nv namespace
 
diff --git a/src/nvthread/Mutex.cpp b/src/nvthread/Mutex.cpp
index 698c879..cb6ebfc 100644
--- a/src/nvthread/Mutex.cpp
+++ b/src/nvthread/Mutex.cpp
@@ -19,71 +19,71 @@ using namespace nv;
 #if NV_OS_WIN32
 
 struct Mutex::Private {
-	CRITICAL_SECTION mutex;
+    CRITICAL_SECTION mutex;
 };
 
 
 Mutex::Mutex () : m(new Private)
 {
-	InitializeCriticalSection(&m->mutex);
+    InitializeCriticalSection(&m->mutex);  // The Win32 mutex is backed by a CRITICAL_SECTION.
 }
 
 Mutex::~Mutex ()
 {
-	DeleteCriticalSection(&m->mutex);
+    DeleteCriticalSection(&m->mutex);  // Releases the resources held by the critical section.
 }
 
 void Mutex::lock()
 {
-	EnterCriticalSection(&m->mutex);
+    EnterCriticalSection(&m->mutex);  // Blocks until this thread owns the critical section.
 }
 
 bool Mutex::tryLock()
 {
-	return TryEnterCriticalSection(&m->mutex) != 0;
+    return TryEnterCriticalSection(&m->mutex) != 0;  // Non-blocking; the BOOL result is nonzero when ownership was taken.
 }
 
 void Mutex::unlock()
 {
-	LeaveCriticalSection(&m->mutex);	
+    LeaveCriticalSection(&m->mutex);  // Gives up ownership of the critical section.
 }
 
 #elif NV_OS_UNIX
 
 struct Mutex::Private {
-	pthread_mutex_t mutex;
+    pthread_mutex_t mutex;
 };
 
 
 Mutex::Mutex () : m(new Private)
 {
-	int result = pthread_mutex_init(&m->mutex , NULL);
-	nvDebugCheck(result == 0);
+    int result = pthread_mutex_init(&m->mutex , NULL);  // NULL attributes: the mutex gets the default attributes.
+    nvDebugCheck(result == 0);                          // A failure is only reported through this debug check.
 }
 
 Mutex::~Mutex ()
 {
-	int result = pthread_mutex_destroy(&m->mutex);
-	nvDebugCheck(result == 0);
+    int result = pthread_mutex_destroy(&m->mutex);  // Destroys the pthread mutex created by the constructor.
+    nvDebugCheck(result == 0);                      // A nonzero result means the destroy failed.
 }
 
 void Mutex::lock()
 {
-	int result = pthread_mutex_lock(&m->mutex);
-	nvDebugCheck(result == 0);
+    int result = pthread_mutex_lock(&m->mutex);  // Blocks until the mutex is acquired.
+    nvDebugCheck(result == 0);                   // Any error code is surfaced only by this debug check.
 }
 
 bool Mutex::tryLock()
 {
-	int result = pthread_mutex_trylock(&m->mutex);
-	nvDebugCheck(result == 0 || result == EBUSY);
-	return result == 0;
+    int result = pthread_mutex_trylock(&m->mutex);  // Non-blocking acquire attempt.
+    nvDebugCheck(result == 0 || result == EBUSY);   // EBUSY (already locked) is an expected outcome, not an error.
+    return result == 0;                             // True only when the lock was actually taken.
 }
 
 void Mutex::unlock()
 {
-	int result = pthread_mutex_unlock(&m->mutex);
-	nvDebugCheck(result == 0);
+    int result = pthread_mutex_unlock(&m->mutex);  // Releases the mutex.
+    nvDebugCheck(result == 0);                     // A nonzero result means the unlock failed.
 }
 
-#endif // NV_OS
\ No newline at end of file
+#endif // NV_OS_UNIX
diff --git a/src/nvthread/Mutex.h b/src/nvthread/Mutex.h
index 841fc3d..13e34e0 100644
--- a/src/nvthread/Mutex.h
+++ b/src/nvthread/Mutex.h
@@ -11,36 +11,36 @@
 namespace nv
 {
 
-	class NVTHREAD_CLASS Mutex
-	{
-		NV_FORBID_COPY(Mutex);
-	public:
-		Mutex ();
-		~Mutex ();
+    class NVTHREAD_CLASS Mutex
+    {
+        NV_FORBID_COPY(Mutex);
+    public:
+        Mutex ();
+        ~Mutex ();
 
-		void lock();
-		bool tryLock();
-		void unlock();
+        void lock();        // Blocks until the mutex is acquired.
+        bool tryLock();     // Non-blocking; returns true if the mutex was acquired.
+        void unlock();      // Releases the mutex.
 
-	private:
-		struct Private;
-		AutoPtr<Private> m;
-	};
+    private:
+        struct Private;         // Platform-specific state; defined in Mutex.cpp.
+        AutoPtr<Private> m;     // Owning pointer to that state.
+    };
 
 
     // Templated lock that can be used with any mutex.
     template <class M>
-	class Lock
-	{
-		NV_FORBID_COPY(Lock);
-	public:
-
-		Lock (M & m) : m_mutex (m) { m_mutex.lock(); }
-		~Lock () { m_mutex.unlock(); }
-		
-	private:
-		M & m_mutex;
-	};
+    class Lock
+    {
+        NV_FORBID_COPY(Lock);
+    public:
+
+        Lock (M & m) : m_mutex (m) { m_mutex.lock(); }  // Locks m on construction.
+        ~Lock () { m_mutex.unlock(); }                  // Unlocks it again on destruction.
+
+    private:
+        M & m_mutex;    // Referenced, not owned; must outlive this Lock.
+    };
 
 } // nv namespace
 
diff --git a/src/nvthread/Thread.cpp b/src/nvthread/Thread.cpp
index c8c39d8..68dc0c5 100644
--- a/src/nvthread/Thread.cpp
+++ b/src/nvthread/Thread.cpp
@@ -3,10 +3,10 @@
 #include "Thread.h"
 
 #if NV_OS_WIN32
-	#include "Win32.h"
+    #include "Win32.h"
 #elif NV_OS_UNIX
-	#include <pthread.h>
-	#include <unistd.h> // usleep
+    #include <pthread.h>
+    #include <unistd.h> // usleep
 #endif
 
 using namespace nv;
@@ -14,15 +14,16 @@ using namespace nv;
 struct Thread::Private
 {
 #if NV_OS_WIN32
-	HANDLE thread;
+    HANDLE thread;
 #elif NV_OS_UNIX
-	pthread_t thread;
+    pthread_t thread;
 #endif
 
     ThreadFunc * func;
     void * arg;
 };
 
+
 #if NV_OS_WIN32
 
 unsigned long __stdcall threadFunc(void * arg) {
@@ -32,11 +33,13 @@ unsigned long __stdcall threadFunc(void * arg) {
 }
 
 #elif NV_OS_UNIX
+
 extern "C" void * threadFunc(void * arg) {
     Thread * thread = (Thread *)arg;
-	thread->func(thread->arg);
-	pthread_exit(0);
+    thread->func(thread->arg);  // Run the callback with the argument stored on the Thread object.
+    pthread_exit(0);            // Ends the thread with a null exit value; does not return.
 }
+
 #endif
 
 
@@ -47,7 +50,7 @@ Thread::Thread() : p(new Private)
 
 Thread::~Thread()
 {
-	nvDebugCheck(p->thread == 0);
+    nvDebugCheck(p->thread == 0);  // wait() zeroes the handle, so a nonzero handle here means the thread was never waited on.
 }
 
 void Thread::start(ThreadFunc * func, void * arg)
@@ -56,12 +59,12 @@ void Thread::start(ThreadFunc * func, void * arg)
     this->arg = arg;
 
 #if NV_OS_WIN32
-    p->thread = CreateThread(NULL, 0, threadFunc, this, 0, NULL);
-	//p->thread = (HANDLE)_beginthreadex (0, 0, threadFunc, this, 0, NULL);     // @@ So that we can call CRT functions...
-	nvDebugCheck(p->thread != NULL);
+    p->thread = CreateThread(NULL, 0, threadFunc, this, 0, NULL);
+    //p->thread = (HANDLE)_beginthreadex (0, 0, threadFunc, this, 0, NULL);     // @@ So that we can call CRT functions...
+    nvDebugCheck(p->thread != NULL);
 #elif NV_OS_UNIX
-	int result = pthread_create(&p->thread, NULL, threadFunc, this);
-	nvDebugCheck(result == 0);
+    int result = pthread_create(&p->thread, NULL, threadFunc, this);
+    nvDebugCheck(result == 0);
 #endif
 }
 
@@ -74,42 +77,42 @@ void Thread::wait()
     p->thread = NULL;
     nvCheck (ok);
 #elif NV_OS_UNIX
-	int result = pthread_join(p->thread, NULL); 
+    int result = pthread_join(p->thread, NULL);
     p->thread = 0;
-	nvDebugCheck(result == 0);
+    nvDebugCheck(result == 0);
 #endif
 }
 
 bool Thread::isRunning () const
 {
 #if NV_OS_WIN32
-	return p->thread != NULL;
+    return p->thread != NULL;  // True while a handle is held; wait() resets it to NULL.
 #elif NV_OS_UNIX
-	return p->thread != 0;
+    return p->thread != 0;     // True while a handle is held; wait() resets it to 0.
 #endif
 }
 
 /*static*/ void Thread::spinWait(uint count)
 {
-	for (uint i = 0; i < count; i++) {}
+    for (volatile uint i = 0; i < count; i = i + 1) {}  // Busy-loop for count iterations; the volatile counter keeps the optimizer from deleting the empty loop.
 }
 
 /*static*/ void Thread::yield()
 {
 #if NV_OS_WIN32
-	SwitchToThread();
+    SwitchToThread();            // Offer the rest of this time slice to another ready thread.
 #elif NV_OS_UNIX
-	int result = sched_yield();
-	nvDebugCheck(result == 0);
+    int result = sched_yield();  // Relinquish the CPU to other runnable threads.
+    nvDebugCheck(result == 0);   // sched_yield() signals failure with a nonzero result.
 #endif
 }
 
 /*static*/ void Thread::sleep(uint ms)
 {
 #if NV_OS_WIN32
-	Sleep(ms);
+    Sleep(ms);          // Win32 Sleep() takes milliseconds directly.
 #elif NV_OS_UNIX
-	usleep(1000 * ms);
+    usleep(1000 * ms);  // ms -> microseconds. NOTE(review): POSIX lets usleep() fail with EINVAL for intervals >= 1,000,000 us -- confirm callers stay below 1000 ms.
 #endif
 }
 
@@ -133,4 +136,5 @@ bool Thread::isRunning () const
         threads[i].wait();
     }
 //#endif
-}
\ No newline at end of file
+}
+
diff --git a/src/nvthread/Thread.h b/src/nvthread/Thread.h
index cdd5b70..0e46564 100644
--- a/src/nvthread/Thread.h
+++ b/src/nvthread/Thread.h
@@ -12,34 +12,34 @@ namespace nv
 {
     typedef void ThreadFunc(void * arg);
 
-	class NVTHREAD_CLASS Thread
-	{
-		NV_FORBID_COPY(Thread);
-	public:
-		Thread();
-		~Thread();
+    class NVTHREAD_CLASS Thread
+    {
+        NV_FORBID_COPY(Thread);
+    public:
+        Thread();
+        ~Thread();
 
-		void start(ThreadFunc * func, void * arg);
-		void wait();
+        void start(ThreadFunc * func, void * arg);  // Creates the OS thread, which calls func(arg).
+        void wait();                                // Waits for the thread and clears its handle.
 
-		bool isRunning() const;
+        bool isRunning() const;                     // True while the thread handle is set.
 
-		static void spinWait(uint count);
-		static void yield();
-		static void sleep(uint ms);
+        static void spinWait(uint count);           // Busy-loops for count iterations.
+        static void yield();                        // Yields the calling thread's time slice.
+        static void sleep(uint ms);                 // Suspends the calling thread for ms milliseconds.
 
-        static void wait(Thread * threads, uint count);
-	
-	private:
+        static void wait(Thread * threads, uint count);  // Waits on each of the count threads.
 
-		struct Private;
-		AutoPtr<Private> p;
-    
-    public:
+    private:
+
+        struct Private;         // Platform-specific state; defined in Thread.cpp.
+        AutoPtr<Private> p;     // Owning pointer to that state.
+
+    public: // @@ Why public? Thread::Private declares func/arg too; these copies are read by threadFunc() in Thread.cpp.
         ThreadFunc * func;
         void * arg;
 
-	};
+    };
 
 } // nv namespace
 
diff --git a/src/nvthread/ThreadPool.cpp b/src/nvthread/ThreadPool.cpp
index a343fab..af111f1 100644
--- a/src/nvthread/ThreadPool.cpp
+++ b/src/nvthread/ThreadPool.cpp
@@ -4,6 +4,8 @@
 #include "Mutex.h"
 #include "Thread.h"
 
+#include "nvcore/Utils.h"
+
 // Most of the time it's not necessary to protect the thread pool, but if it doesn't add a significant overhead, then it'd be safer to do it.
 #define PROTECT_THREAD_POOL 1
 
@@ -47,7 +49,7 @@ AutoPtr<ThreadPool> s_pool;
 
 
 /*static*/ void ThreadPool::workerFunc(void * arg) {
-    uint i = (uint)arg;
+    uint i = toU32((uintptr_t)arg); // This is OK, because workerCount should always be <<< 2^32
 
     while(true) 
     {
@@ -118,4 +120,4 @@ void ThreadPool::wait()
 
         allIdle = true;
     }
-}
\ No newline at end of file
+}
diff --git a/src/nvthread/nvthread.cpp b/src/nvthread/nvthread.cpp
index 0d40f86..db46927 100644
--- a/src/nvthread/nvthread.cpp
+++ b/src/nvthread/nvthread.cpp
@@ -1,11 +1,16 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
 
 #include "nvthread.h"
 
 #include "Thread.h"
 
-#define WIN32_LEAN_AND_MEAN
-#define VC_EXTRALEAN
-#include <windows.h>
+#if NV_OS_WIN32
+    #include "Win32.h"
+#elif NV_OS_UNIX
+    #include <sys/types.h>
+    #include <sys/sysctl.h>
+#endif
+
 
 using namespace nv;
 
diff --git a/src/nvtt/tests/testsuite.cpp b/src/nvtt/tests/testsuite.cpp
index 9645238..fb31c47 100644
--- a/src/nvtt/tests/testsuite.cpp
+++ b/src/nvtt/tests/testsuite.cpp
@@ -269,6 +269,10 @@ struct MyOutputHandler : public nvtt::OutputHandler
         m_ptr = m_data;
     }
 
+    virtual void endImage()   // Empty implementation: this handler takes no action here. Presumably overrides nvtt::OutputHandler::endImage -- confirm.
+    {
+    }
+
     virtual bool writeData(const void * data, int size)
     {
         memcpy(m_ptr, data, size);