Merge changes from The Witness.
@@ -119,18 +119,35 @@ namespace nv {
 #if NV_CC_MSVC
 
     NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long));
 
+    // Returns incremented value.
     inline uint32 atomicIncrement(uint32 * value)
     {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-        return (uint32)_InterlockedIncrement((long *)value);
+        return uint32(_InterlockedIncrement((long *)value));
     }
 
+    // Returns decremented value.
     inline uint32 atomicDecrement(uint32 * value)
     {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-        return (uint32)_InterlockedDecrement((long *)value);
+        return uint32(_InterlockedDecrement((long *)value));
     }
 
+    // Returns added value.
+    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return uint32(_InterlockedExchangeAdd((long*)value, (long)value_to_add)) + value_to_add;
+    }
+
+    // Returns original value before addition.
+    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return uint32(_InterlockedExchangeAdd((long*)value, (long)value_to_add));
+    }
+
     // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
     // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
     // @@ Is this strong or weak? Does InterlockedCompareExchange have spurious failures?
@@ -148,10 +165,7 @@ namespace nv {
         return (uint32)_InterlockedExchange((long *)value, (long)desired);
     }
 
-    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return (uint32)_InterlockedExchangeAdd((long*)value, (long)value_to_add);
-    }
-
-
 #elif NV_CC_CLANG && (NV_OS_IOS || NV_OS_DARWIN)
 
     NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long));
@@ -180,20 +194,31 @@ namespace nv {
     }
     */
 
-    inline uint32 atomicIncrement(uint32 * value)
-    {
+    // Returns incremented value.
+    inline uint32 atomicIncrement(uint32 * value) {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-
         return __sync_add_and_fetch(value, 1);
     }
 
-    inline uint32 atomicDecrement(uint32 * value)
-    {
+    // Returns decremented value.
+    inline uint32 atomicDecrement(uint32 * value) {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-
         return __sync_sub_and_fetch(value, 1);
     }
 
+    // Returns added value.
+    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return __sync_add_and_fetch(value, value_to_add);
+    }
+
+    // Returns original value before addition.
+    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return __sync_fetch_and_add(value, value_to_add);
+    }
+
     // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
     // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
     // @@ Is this strong or weak?
@@ -210,10 +235,6 @@ namespace nv {
         return __sync_lock_test_and_set(value, desired);
     }
 
-    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_add_and_fetch(value, value_to_add);
-    }
-
 
@@ -271,20 +292,30 @@ namespace nv {
     // Many alternative implementations at:
     // http://www.memoryhole.net/kyle/2007/05/atomic_incrementing.html
 
-    inline uint32 atomicIncrement(uint32 * value)
-    {
+    // Returns incremented value.
+    inline uint32 atomicIncrement(uint32 * value) {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-
         return __sync_add_and_fetch(value, 1);
     }
 
-    inline uint32 atomicDecrement(uint32 * value)
-    {
+    // Returns decremented value.
+    inline uint32 atomicDecrement(uint32 * value) {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-
         return __sync_sub_and_fetch(value, 1);
     }
 
+    // Returns added value.
+    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return __sync_add_and_fetch(value, value_to_add);
+    }
+
+    // Returns original value before addition.
+    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return __sync_fetch_and_add(value, value_to_add);
+    }
+
     // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
     // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
     // @@ Is this strong or weak?
@@ -300,12 +331,6 @@ namespace nv {
         // this is confusingly named, it doesn't actually do a test but always sets
         return __sync_lock_test_and_set(value, desired);
     }
 
-    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_add_and_fetch(value, value_to_add);
-    }
-
-
 #else
 #error "Atomics not implemented."
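
The hunks above (the atomics header, Atomic.h, judging from the include in the thread-pool source further down) replace the old atomicAdd with a consistent pair on every platform. Note that the old code disagreed with itself: on MSVC atomicAdd returned the pre-addition value (_InterlockedExchangeAdd), while on GCC/Clang it returned the post-addition value (__sync_add_and_fetch). The new pair makes the contract explicit: atomicAdd returns the new value, atomicFetchAndAdd returns the value before the addition. A minimal standalone sketch of those semantics using std::atomic (an illustration for this note, not part of the commit):

#include <atomic>
#include <cassert>
#include <cstdint>

int main() {
    std::atomic<uint32_t> value(0);

    // atomicAdd semantics: return the added (new) value, like
    // __sync_add_and_fetch, or _InterlockedExchangeAdd plus the addend on MSVC.
    uint32_t added = value.fetch_add(5) + 5;
    assert(added == 5);

    // atomicFetchAndAdd semantics: return the original value,
    // like __sync_fetch_and_add.
    uint32_t original = value.fetch_add(5);
    assert(original == 5 && value.load() == 10);
    return 0;
}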
@@ -11,18 +11,20 @@ using namespace nv;
 
 #define ENABLE_PARALLEL_FOR 1
 
-static void worker(void * arg) {
+static void worker(void * arg, int tid) {
    ParallelFor * owner = (ParallelFor *)arg;
 
    while(true) {
-        // Consume one element at a time. @@ Might be more efficient to have custom grain.
-        uint i = atomicIncrement(&owner->idx);
-        if (i > owner->count) {
+        uint new_idx = atomicFetchAndAdd(&owner->idx, owner->step);
+        if (new_idx >= owner->count) {
            break;
        }
 
-        owner->task(owner->context, i - 1);
+        const uint count = min(owner->count, new_idx + owner->step);
+        for (uint i = new_idx; i < count; i++) {
+            owner->task(owner->context, /*tid, */i);
+        }
    }
 }
@@ -38,22 +40,16 @@ ParallelFor::~ParallelFor() {
 #endif
 }
 
-void ParallelFor::run(uint count, bool calling_thread_process_work /*=false*/) {
+void ParallelFor::run(uint count, uint step/*= 1*/) {
 #if ENABLE_PARALLEL_FOR
    storeRelease(&this->count, count);
+    storeRelease(&this->step, step);
 
    // Init atomic counter to zero.
    storeRelease(&idx, 0);
 
-    // Start threads.
-    pool->start(worker, this);
-
-    if (calling_thread_process_work) {
-        worker(this);
-    }
-
-    // Wait for all threads to complete.
-    pool->wait();
+    pool->run(worker, this);
 
    nvDebugCheck(idx >= count);
 #else
@@ -63,4 +59,3 @@ void ParallelFor::run(uint count, bool calling_thread_process_work /*=false*/) {
 #endif
 }
 
-
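
The worker change above is the heart of the commit: instead of claiming one element per atomic operation (the old atomicIncrement path, with its awkward i - 1 indexing), each thread now claims a batch of 'step' indices with a single atomicFetchAndAdd and processes the half-open range [new_idx, min(count, new_idx + step)). One atomic per batch instead of one per item reduces contention on the shared counter. A single-threaded sketch of the claiming logic (the driver loop is illustrative, not from the commit):

#include <algorithm>
#include <cstdio>

int main() {
    unsigned count = 10, step = 4;
    unsigned idx = 0; // The shared atomic counter in the real code.

    while (true) {
        unsigned new_idx = idx; idx += step;    // Stands in for atomicFetchAndAdd(&idx, step).
        if (new_idx >= count) break;

        unsigned end = std::min(count, new_idx + step);
        for (unsigned i = new_idx; i < end; i++) {
            printf("process item %u\n", i);     // Stands in for owner->task(owner->context, i).
        }
    }
    // Prints items 0..9, claimed in batches {0-3}, {4-7}, {8-9}.
    return 0;
}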
@@ -12,26 +12,169 @@ namespace nv
     class Thread;
     class ThreadPool;
 
-    typedef void ForTask(void * context, int id);
+    typedef void ForTask(void * context, /*int tid,*/ int idx); // @@ It would be nice to have the thread index as an argument here.
 
     struct ParallelFor {
         ParallelFor(ForTask * task, void * context);
         ~ParallelFor();
 
-        void run(uint count, bool calling_thread_process_work = false);
+        void run(uint count, uint step = 1);
 
         // Invariant:
         ForTask * task;
         void * context;
         ThreadPool * pool;
-        //uint workerCount; // @@ Move to thread pool.
-        //Thread * workers;
 
         // State:
         uint count;
+        uint step;
         /*atomic<uint>*/ uint idx;
     };
 
+
+#if NV_CC_CPP11
+
+    template <typename F>
+    void sequential_for(uint count, F f) {
+        for (uint i = 0; i < count; i++) {
+            f(i);
+        }
+    }
+
+
+    template <typename F>
+    void parallel_for(uint count, uint step, F f) {
+        // Transform lambda into function pointer.
+        auto lambda = [](void* context, /*int tid, */int idx) {
+            F & f = *reinterpret_cast<F *>(context);
+            f(/*tid, */idx);
+        };
+
+        ParallelFor pf(lambda, &f);
+        pf.run(count, step);
+    }
+
+
+    template <typename F>
+    void parallel_for(uint count, F f) {
+        parallel_for(count, /*step=*/1, f);
+    }
+
+
+    template <typename F>
+    void parallel_for_if(uint count, uint step, bool condition, F f) {
+        if (condition) {
+            parallel_for(count, step, f);
+        }
+        else {
+            sequential_for(count, f);
+        }
+    }
+
+
+#if 0
+    template <typename F, typename T>
+    void parallel_for_each(Array<T> & array, uint step, F f) {
+        // Transform lambda into function pointer.
+        auto lambda = [](void* context, int idx) {
+            F & f = *reinterpret_cast<F *>(context);
+            f(array[idx]);
+        };
+
+        ParallelFor pf(lambda, &f);
+        pf.run(count, step);
+    }
+#endif
+
+
+#endif // NV_CC_CPP11
+
+
+    /*
+
+    #include "nvthread/Mutex.h"
+    #include "nvcore/Array.inl"
+
+    template <typename T>
+    struct ParallelOutputStream {
+#if 0
+        // In its most basic implementation the parallel stream is simply a single array protected by a mutex.
+        Parallel_Output_Stream(uint producer_count) {}
+
+        void reset() { final_array.clear(); }
+        void append(uint producer_id, const T & t) { Lock(mutex); final_array.append(t); }
+        nv::Array<T> & finalize() { return final_array; }
+
+        nv::Mutex mutex;
+        nv::Array<T> final_array;
+
+#elif 0
+        // Another simple implementation is to have N arrays that are merged at the end.
+        ParallelOutputStream(uint producer_count) : producer_count(producer_count) {
+            partial_array = new Array<T>[producer_count];
+        }
+
+        void reset() {
+            for (int i = 0; i < producer_count; i++) {
+                partial_array[i].clear();
+            }
+        }
+
+        void append(uint producer_id, const T & t) {
+            nvCheck(producer_id < producer_count);
+            partial_array[producer_id].append(t);
+        }
+
+        nv::Array<T> & finalize() {
+            for (int i = 1; i < producer_count; i++) {
+                partial_array->append(partial_array[i]);
+                partial_array[i].clear();
+            }
+            return *partial_array;
+        }
+
+        uint producer_count;
+        nv::Array<T> * partial_array;
+#else
+        ParallelOutputStream(uint producer_count) : producer_count(producer_count) {
+            partial_array = new PartialArray[producer_count];
+        }
+
+        // But a more sophisticated implementation keeps N short arrays that are merged as they get full. This preserves partial order.
+        struct PartialArray { // Make sure this is aligned to cache lines. We want producers to access their respective arrays without conflicts.
+            uint count;
+            T data[32]; // Pick size to minimize wasted space considering cache line alignment?
+        };
+
+        const uint producer_count;
+        PartialArray * partial_array;
+
+        // @@ Make sure mutex and partial_array are not in the same cache line!
+
+        nv::Mutex mutex;
+        nv::Array<T> final_array;
+
+        void append(uint producer_id, const T & t) {
+            if (partial_array[producer_id].count == 32) {
+                partial_array[producer_id].count = 0;
+                Lock(mutex);
+                final_array.append(partial_array[producer_id].data, 32);
+            }
+
+            partial_array[producer_id].data[partial_array[producer_id].count++] = t;
+        }
+
+        nv::Array<T> & finalize() {
+            for (int i = 0; i < producer_count; i++) {
+                final_array.append(partial_array[producer_id].data, partial_array[producer_id].count);
+            }
+            return final_array;
+        }
+#endif
+    };
+
+    */
+
 
 } // nv namespace
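
For reference, a hypothetical use of the new parallel_for wrapper declared above (the float array and its transform are illustrative; nv::uint is the library's unsigned integer type, and the include path is an assumption):

#include <cmath>
#include "nvthread/ParallelFor.h" // Assumed include path.

void scaleAll(float * data, unsigned count) {
    // The capturing lambda travels through ParallelFor's void* context; the
    // capture-less wrapper inside parallel_for is what decays to ForTask*.
    nv::parallel_for(count, /*step=*/64, [=](int idx) {
        data[idx] = std::sqrt(data[idx]);
    });
}

Two observations on the drafts above: the #if 0'd parallel_for_each would not compile as written, since its capture-less lambda refers to 'array' and 'count', which are neither captured nor reachable through the context pointer, presumably why it is fenced off. Likewise, in the commented-out ParallelOutputStream sketch, 'Lock(mutex)' constructs an unnamed temporary that unlocks immediately, and finalize() indexes partial_array with producer_id where the loop variable i appears intended; both are consistent with the block being an unfinished note kept inside a comment.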
@@ -89,7 +89,7 @@ Thread::Thread() : p(new Private)
     p->name = NULL;
 }
 
-Thread::Thread(const char * const name) : p(new Private)
+Thread::Thread(const char * name) : p(new Private)
 {
     p->thread = 0;
     p->name = name;
@@ -100,6 +100,12 @@ Thread::~Thread()
     nvDebugCheck(p->thread == 0);
 }
 
+void Thread::setName(const char * name)
+{
+    nvCheck(p->name == NULL);
+    p->name = name;
+}
+
 void Thread::start(ThreadFunc * func, void * arg)
 {
     p->func = func;
|
||||
p->thread = CreateThread(NULL, 0, threadFunc, p.ptr(), 0, &threadId);
|
||||
//p->thread = (HANDLE)_beginthreadex (0, 0, threadFunc, p.ptr(), 0, NULL); // @@ So that we can call CRT functions...
|
||||
nvDebugCheck(p->thread != NULL);
|
||||
setThreadName(threadId, p->name);
|
||||
#if NV_USE_TELEMETRY
|
||||
tmThreadName(tmContext, threadId, p->name);
|
||||
#endif
|
||||
if (p->name != NULL) {
|
||||
setThreadName(threadId, p->name);
|
||||
#if NV_USE_TELEMETRY
|
||||
tmThreadName(tmContext, threadId, p->name);
|
||||
#endif
|
||||
}
|
||||
#elif NV_OS_ORBIS
|
||||
int ret = scePthreadCreate(&p->thread, NULL, threadFunc, p.ptr(), p->name ? p->name : "nv::Thread");
|
||||
nvDebugCheck(ret == 0);
|
||||
|
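
A hypothetical use of the new Thread::setName (the names and thread body are illustrative, and the include path is an assumption): the name must be set before start(), because start() is what registers it with the debugger and Telemetry, and with the guard added above an unnamed thread now simply skips registration instead of passing NULL along.

#include "nvthread/Thread.h" // Assumed include path.

static void decodeBlocks(void * arg) { /* ... thread body ... */ }

void exampleThread(void * job) {
    nv::Thread thread;
    thread.setName("decoder");        // nvCheck fires if a name was already set.
    thread.start(decodeBlocks, job);
    thread.wait();
}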
@@ -17,9 +17,11 @@ namespace nv
         NV_FORBID_COPY(Thread);
     public:
         Thread();
-        Thread(const char * const name);
+        Thread(const char * name);
         ~Thread();
 
+        void setName(const char * name);
+
         void start(ThreadFunc * func, void * arg);
         void wait();
@@ -6,6 +6,13 @@
 #include "Atomic.h"
 
+#include "nvcore/Utils.h"
+#include "nvcore/StrLib.h"
+
+#if NV_USE_TELEMETRY
+#include <telemetry.h>
+extern HTELEMETRY tmContext;
+#endif
+
 
 // Most of the time it's not necessary to protect the thread pool, but if it doesn't add a significant overhead, then it'd be safer to do it.
 #define PROTECT_THREAD_POOL 1
|
||||
AutoPtr<ThreadPool> s_pool;
|
||||
|
||||
|
||||
/*static*/ void ThreadPool::setup(uint workerCount, bool useThreadAffinity, bool useCallingThread) {
|
||||
#if PROTECT_THREAD_POOL
|
||||
Lock<Mutex> lock(s_pool_mutex);
|
||||
#endif
|
||||
|
||||
s_pool = new ThreadPool(workerCount, useThreadAffinity, useCallingThread);
|
||||
}
|
||||
|
||||
/*static*/ ThreadPool * ThreadPool::acquire()
|
||||
{
|
||||
#if PROTECT_THREAD_POOL
|
||||
@ -52,36 +67,59 @@ AutoPtr<ThreadPool> s_pool;
|
||||
/*static*/ void ThreadPool::workerFunc(void * arg) {
|
||||
uint i = U32((uintptr_t)arg); // This is OK, because workerCount should always be much smaller than 2^32
|
||||
|
||||
//ThreadPool::threadId = i;
|
||||
|
||||
if (s_pool->useThreadAffinity) {
|
||||
lockThreadToProcessor(s_pool->useCallingThread + i);
|
||||
}
|
||||
|
||||
while(true)
|
||||
{
|
||||
s_pool->startEvents[i].wait();
|
||||
|
||||
nv::ThreadFunc * func = loadAcquirePointer(&s_pool->func);
|
||||
ThreadTask * func = loadAcquirePointer(&s_pool->func);
|
||||
|
||||
if (func == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
func(s_pool->arg);
|
||||
{
|
||||
#if NV_USE_TELEMETRY
|
||||
tmZoneFiltered(tmContext, 20, TMZF_NONE, "worker");
|
||||
#endif
|
||||
func(s_pool->arg, s_pool->useCallingThread + i);
|
||||
}
|
||||
|
||||
s_pool->finishEvents[i].post();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
ThreadPool::ThreadPool()
|
||||
ThreadPool::ThreadPool(uint workerCount/*=processorCount()*/, bool useThreadAffinity/*=true*/, bool useCallingThread/*=false*/)
|
||||
{
|
||||
s_pool = this; // Worker threads need this to be initialized before they start.
|
||||
|
||||
workerCount = nv::hardwareThreadCount();
|
||||
workers = new Thread[workerCount];
|
||||
this->useThreadAffinity = useThreadAffinity;
|
||||
this->workerCount = workerCount;
|
||||
this->useCallingThread = useCallingThread;
|
||||
|
||||
startEvents = new Event[workerCount];
|
||||
finishEvents = new Event[workerCount];
|
||||
uint threadCount = workerCount - useCallingThread;
|
||||
|
||||
workers = new Thread[threadCount];
|
||||
|
||||
startEvents = new Event[threadCount];
|
||||
finishEvents = new Event[threadCount];
|
||||
|
||||
nvCompilerWriteBarrier(); // @@ Use a memory fence?
|
||||
|
||||
for (uint i = 0; i < workerCount; i++) {
|
||||
if (useCallingThread && useThreadAffinity) {
|
||||
lockThreadToProcessor(0); // Calling thread always locked to processor 0.
|
||||
}
|
||||
|
||||
for (uint i = 0; i < threadCount; i++) {
|
||||
StringBuilder name;
|
||||
name.format("worker %d", i);
|
||||
workers[i].setName(name.release()); // @Leak
|
||||
workers[i].start(workerFunc, (void *)i);
|
||||
}
|
||||
|
||||
@@ -94,14 +132,28 @@ ThreadPool::~ThreadPool()
     start(NULL, NULL);
 
     // Wait until threads actually exit.
-    Thread::wait(workers, workerCount);
+    Thread::wait(workers, workerCount - useCallingThread);
 
     delete [] workers;
     delete [] startEvents;
     delete [] finishEvents;
 }
 
-void ThreadPool::start(ThreadFunc * func, void * arg)
+void ThreadPool::run(ThreadTask * func, void * arg)
+{
+    // Wait until threads are idle.
+    wait();
+
+    start(func, arg);
+
+    if (useCallingThread) {
+        func(arg, 0);
+    }
+
+    wait();
+}
+
+void ThreadPool::start(ThreadTask * func, void * arg)
 {
     // Wait until threads are idle.
     wait();
|
||||
allIdle = false;
|
||||
|
||||
// Resume threads.
|
||||
Event::post(startEvents, workerCount);
|
||||
Event::post(startEvents, workerCount - useCallingThread);
|
||||
}
|
||||
|
||||
void ThreadPool::wait()
|
||||
@@ -121,7 +173,7 @@ void ThreadPool::wait()
     if (!allIdle)
     {
         // Wait for threads to complete.
-        Event::wait(finishEvents, workerCount);
+        Event::wait(finishEvents, workerCount - useCallingThread);
 
         allIdle = true;
     }
@@ -14,30 +14,42 @@
 // The thread pool runs the same function in all worker threads, the idea is to use this as the foundation of a custom task scheduler.
 // When the thread pool starts, the main thread continues running, but the common use case is to inmmediately wait for the termination events of the worker threads.
-// @@ The start and wait methods could probably be merged.
+// It may be running the thread function on the invoking thread to avoid thread switches.
 
 namespace nv {
 
     class Thread;
     class Event;
 
+    typedef void ThreadTask(void * context, int id);
+
     class ThreadPool {
         NV_FORBID_COPY(ThreadPool);
     public:
 
+        static void setup(uint workerCount, bool useThreadAffinity, bool useCallingThread);
+
         static ThreadPool * acquire();
         static void release(ThreadPool *);
 
-        ThreadPool();
+        ThreadPool(uint workerCount = processorCount(), bool useThreadAffinity = true, bool useCallingThread = false);
         ~ThreadPool();
 
-        void start(ThreadFunc * func, void * arg);
+        void run(ThreadTask * func, void * arg);
+
+        void start(ThreadTask * func, void * arg);
         void wait();
 
+        //NV_THREAD_LOCAL static uint threadId;
+
     private:
 
         static void workerFunc(void * arg);
 
+        bool useThreadAffinity;
+        bool useCallingThread;
         uint workerCount;
 
         Thread * workers;
         Event * startEvents;
         Event * finishEvents;
@@ -45,10 +57,29 @@ namespace nv {
         uint allIdle;
 
         // Current function:
-        ThreadFunc * func;
+        ThreadTask * func;
         void * arg;
     };
 
 
+#if NV_CC_CPP11
+
+    template <typename F>
+    void thread_pool_run(F f) {
+        // Transform lambda into function pointer.
+        auto lambda = [](void* context, int id) {
+            F & f = *reinterpret_cast<F *>(context);
+            f(id);
+        };
+
+        ThreadPool * pool = ThreadPool::acquire();
+        pool->run(lambda, &f);
+        ThreadPool::release(pool);
+    }
+
+#endif // NV_CC_CPP11
+
 
 } // namespace nv
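
A hypothetical use of the thread_pool_run helper declared above (the counter is illustrative, and the include path is an assumption):

#include <atomic>
#include "nvthread/ThreadPool.h" // Assumed include path.

void exampleThreadPoolRun() {
    std::atomic<int> ran(0);

    nv::thread_pool_run([&ran](int id) {
        // Runs once per pool thread (including the calling thread when the
        // pool was set up with useCallingThread); 'id' identifies the thread.
        ran.fetch_add(1);
    });

    // 'ran' now equals the pool's workerCount.
}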
@@ -27,15 +27,65 @@
 
 using namespace nv;
 
+#if NV_OS_WIN32
+
+// Find the number of cores in the system.
+typedef BOOL(WINAPI *LPFN_GSI)(LPSYSTEM_INFO);
+typedef BOOL(WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);
+
+static bool isWow64() {
+    LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandle("kernel32"), "IsWow64Process");
+
+    BOOL wow64 = FALSE;
+
+    if (NULL != fnIsWow64Process) {
+        if (!fnIsWow64Process(GetCurrentProcess(), &wow64)) {
+            // If error, assume false.
+        }
+    }
+
+    return wow64 != 0;
+}
+
+static void getSystemInfo(SYSTEM_INFO * sysinfo) {
+    BOOL success = FALSE;
+
+    if (isWow64()) {
+        LPFN_GSI fnGetNativeSystemInfo = (LPFN_GSI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetNativeSystemInfo");
+
+        if (fnGetNativeSystemInfo != NULL) {
+            success = fnGetNativeSystemInfo(sysinfo);
+        }
+    }
+
+    if (!success) {
+        GetSystemInfo(sysinfo);
+    }
+}
+
+#endif // NV_OS_WIN32
+
 // Find the number of logical processors in the system.
 // Based on: http://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
 // @@ Distinguish between logical and physical cores?
-uint nv::hardwareThreadCount() {
+uint nv::processorCount() {
 #if NV_OS_WIN32
     SYSTEM_INFO sysinfo;
-    GetSystemInfo( &sysinfo );
-    return sysinfo.dwNumberOfProcessors;
+    getSystemInfo(&sysinfo);
+    //return sysinfo.dwNumberOfProcessors;
+
+    // Respect process affinity mask?
+    DWORD_PTR pam, sam;
+    GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam);
+
+    // Count number of bits set in the processor affinity mask.
+    uint count = 0;
+    for (int i = 0; i < sizeof(DWORD_PTR) * 8; i++) {
+        if (pam & (DWORD_PTR(1) << i)) count++;
+    }
+    nvDebugCheck(count <= sysinfo.dwNumberOfProcessors);
+
+    return count;
 #elif NV_OS_ORBIS
     return 6;
 #elif NV_OS_XBOX
     return 3; // or 6?
 #elif NV_OS_LINUX // Linux, Solaris, & AIX
@@ -72,10 +122,211 @@ uint nv::hardwareThreadCount() {
 #endif
 }
 
 
+uint nv::threadId() {
+#if NV_OS_WIN32
+    return GetCurrentThreadId();
+#else
+    return 0; // @@
+#endif
+}
+
+
+// @@ If we are using less worker threads than processors and hyperthreading is available, we probably want to enumerate the logical processors
+// so that the first cores of each processor goes first. This way, if say, we leave 2 hardware threads free, then we still have one worker
+// thread on each physical processor.
+
+// I believe that currently logical processors are enumerated in physical order, that is:
+// 0 = thread a in physical core 0
+// 1 = thread b in physical core 0
+// 2 = thread a in physical core 1
+// ... and so on ...
+// I'm not sure we can actually rely on that. And in any case we should start detecting the number of physical processors, which appears to be a pain
+// to do in a way that's compatible with newer i7 processors.
+
+void nv::lockThreadToProcessor(int idx) {
+#if NV_OS_WIN32
+    //nvDebugCheck(idx < hardwareThreadCount());
+#if 0
+    DWORD_PTR tam = 1 << idx;
+#else
+    DWORD_PTR pam, sam;
+    BOOL rc = GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam);
+
+    // Find the idx's bit set.
+    uint pidx = 0;
+    DWORD_PTR tam = 0;
+    for (int i = 0; i < sizeof(DWORD_PTR) * 8; i++) {
+        DWORD_PTR mask = DWORD_PTR(1) << i;
+        if (pam & mask) {
+            if (pidx == idx) {
+                tam = mask;
+                break;
+            }
+            pidx++;
+        }
+    }
+
+    nvDebugCheck(tam != 0);
+#endif
+
+    SetThreadAffinityMask(GetCurrentThread(), tam);
+#else
+    // @@ NOP
+#endif
+}
+
+
+void nv::unlockThreadToProcessor() {
+#if NV_OS_WIN32
+    DWORD_PTR pam, sam;
+    BOOL rc = GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam);
+    SetThreadAffinityMask(GetCurrentThread(), pam);
+#else
+    // @@ NOP
+#endif
+}
+
+uint nv::logicalProcessorCount() {
+    return processorCount();
+}
+
+
+#if NV_OS_WIN32
+
+struct LOGICALPROCESSORDATA
+{
+    unsigned int nLargestStandardFunctionNumber;
+    unsigned int nLargestExtendedFunctionNumber;
+    int nLogicalProcessorCount;
+    int nLocalApicId;
+    int nCPUcore;
+    int nProcessorId;
+    int nApicIdCoreIdSize;
+    int nNC;
+    int nMNC;
+    int nCPUCoresperProcessor;
+    int nThreadsperCPUCore;
+    int nProcId;
+    int nCoreId;
+    bool CmpLegacy;
+    bool HTT;
+};
+
+#define MAX_NUMBER_OF_LOGICAL_PROCESSORS 96
+#define MAX_NUMBER_OF_PHYSICAL_PROCESSORS 8
+#define MAX_NUMBER_OF_IOAPICS 16
+static LOGICALPROCESSORDATA LogicalProcessorMap[MAX_NUMBER_OF_LOGICAL_PROCESSORS];
+static int PhysProcIds[MAX_NUMBER_OF_PHYSICAL_PROCESSORS + MAX_NUMBER_OF_IOAPICS];
+
+static void gatherProcessorData(LOGICALPROCESSORDATA * p) {
+
+    int CPUInfo[4] = { 0, 0, 0, 0 };
+    __cpuid(CPUInfo, 0);
+
+    p->nLargestStandardFunctionNumber = CPUInfo[0];
+
+    // Get the information associated with each valid Id
+    for (uint i = 0; i <= p->nLargestStandardFunctionNumber; ++i) {
+        __cpuid(CPUInfo, i);
+
+        // Interpret CPU feature information.
+        if (i == 1) {
+            // Some of the bits of LocalApicId represent the CPU core
+            // within a processor and other bits represent the processor ID.
+            p->nLocalApicId = (CPUInfo[1] >> 24) & 0xff;
+            p->HTT = (CPUInfo[3] >> 28) & 0x1;
+            // recalculate later after 0x80000008
+            p->nLogicalProcessorCount = (CPUInfo[1] >> 16) & 0x0FF;
+        }
+    }
+
+    // Calling __cpuid with 0x80000000 as the InfoType argument
+    // gets the number of valid extended IDs.
+    __cpuid(CPUInfo, 0x80000000);
+    p->nLargestExtendedFunctionNumber = CPUInfo[0];
+
+    // Get the information associated with each extended ID.
+    for (uint i = 0x80000000; i <= p->nLargestExtendedFunctionNumber; ++i) {
+        __cpuid(CPUInfo, i);
+        if (i == 0x80000008) {
+            p->nApicIdCoreIdSize = (CPUInfo[2] >> 12) & 0xF;
+            p->nNC = (CPUInfo[2]) & 0x0FF;
+        }
+    }
+
+    // MNC
+    // A value of zero for ApicIdCoreIdSize indicates that MNC is derived by this
+    // legacy formula: MNC = NC + 1
+    // A non-zero value of ApicIdCoreIdSize means that MNC is 2^ApicIdCoreIdSize
+    if (p->nApicIdCoreIdSize) {
+        p->nMNC = 2;
+        for (uint j = p->nApicIdCoreIdSize - 1; j > 0; j--) {
+            p->nMNC = p->nMNC * 2;
+        }
+    }
+    else {
+        p->nMNC = p->nNC + 1;
+    }
+
+    // If HTT==0, then LogicalProcessorCount is reserved, and the CPU contains
+    // one CPU core and the CPU core is single-threaded.
+    // If HTT==1 and CmpLegacy==1, LogicalProcessorCount represents the number of
+    // CPU cores per processor, where each CPU core is single-threaded. If HTT==1
+    // and CmpLegacy==0, then LogicalProcessorCount is the number of threads per
+    // processor, which is the number of cores times the number of threads per core.
+    // The number of cores is NC+1.
+    p->nCPUCoresperProcessor = p->nNC + 1;
+    p->nThreadsperCPUCore = (p->HTT == 0 ? 1 : (p->CmpLegacy == 1 ? 1 : p->nLogicalProcessorCount / p->nCPUCoresperProcessor));
+
+    // Calculate a mask for the core IDs
+    uint mask = 1;
+    uint numbits = 1;
+    if (p->nApicIdCoreIdSize) {
+        numbits = p->nApicIdCoreIdSize;
+        for (uint j = p->nApicIdCoreIdSize; j > 1; j--) {
+            mask = (mask << 1) + 1;
+        }
+    }
+    p->nProcId = (p->nLocalApicId & ~mask) >> numbits;
+    p->nCoreId = p->nLocalApicId & mask;
+}
+
+
+uint nv::physicalProcessorCount() {
+
+    uint lpc = logicalProcessorCount();
+
+    // Get info about each logical processor.
+    for (uint i = 0; i < lpc; i++) {
+        // Make sure thread doesn't change processor while we gather it's data.
+        lockThreadToProcessor(i);
+
+        gatherProcessorData(&LogicalProcessorMap[i]);
+    }
+
+    unlockThreadToProcessor();
+
+    memset(PhysProcIds, 0, sizeof(PhysProcIds));
+    for (uint i = 0; i < lpc; i++) {
+        PhysProcIds[LogicalProcessorMap[i].nProcId]++;
+    }
+
+    uint pc = 0;
+    for (uint i = 0; i < (MAX_NUMBER_OF_PHYSICAL_PROCESSORS + MAX_NUMBER_OF_IOAPICS); i++) {
+        if (PhysProcIds[i] != 0) {
+            pc++;
+        }
+    }
+
+    return pc;
+}
+
+#else
+
+uint nv::physicalProcessorCount() {
+    // @@ Assume the same.
+    return processorCount();
+}
+
+#endif
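
The bit-walking in processorCount() and lockThreadToProcessor() above both iterate over the process affinity mask. A standalone check of that logic with a plain integer (the mask value is illustrative, not from the commit):

#include <cassert>
#include <cstdint>

int main() {
    uint64_t pam = 0xF0; // Hypothetical affinity mask: only processors 4..7 usable.

    // processorCount(): count the set bits.
    unsigned count = 0;
    for (int i = 0; i < 64; i++) {
        if (pam & (uint64_t(1) << i)) count++;
    }
    assert(count == 4);

    // lockThreadToProcessor(idx): pick the idx'th set bit, so logical index 0
    // maps to the first *usable* processor (bit 4), not to processor 0.
    unsigned idx = 0, pidx = 0;
    uint64_t tam = 0;
    for (int i = 0; i < 64; i++) {
        uint64_t mask = uint64_t(1) << i;
        if (pam & mask) {
            if (pidx == idx) { tam = mask; break; }
            pidx++;
        }
    }
    assert(tam == (uint64_t(1) << 4));
    return 0;
}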
@@ -82,13 +82,16 @@ BOOL WINAPI SwitchToThread(void);
 
 namespace nv
 {
-    // Reentrant.
-    uint hardwareThreadCount();
+    //void initThreadingSystemInfo();
 
-    // Not thread-safe. Use from main thread only.
-    void initWorkers();
-    void shutWorkers();
-    void setWorkerFunction(void * func);
+    // Reentrant.
+    uint processorCount();
+    uint logicalProcessorCount();
+    uint physicalProcessorCount();
+
+    // Locks the current thread to the given logical processor index.
+    void lockThreadToProcessor(int idx);
+    void unlockThreadToProcessor();
+
+    uint threadId();