Merge changes from The Witness.
@@ -119,18 +119,35 @@ namespace nv {
 #if NV_CC_MSVC
 
     NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long));
 
+    // Returns incremented value.
     inline uint32 atomicIncrement(uint32 * value)
     {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-        return (uint32)_InterlockedIncrement((long *)value);
+        return uint32(_InterlockedIncrement((long *)value));
     }
 
+    // Returns decremented value.
     inline uint32 atomicDecrement(uint32 * value)
     {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-        return (uint32)_InterlockedDecrement((long *)value);
+        return uint32(_InterlockedDecrement((long *)value));
     }
 
+    // Returns added value.
+    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return uint32(_InterlockedExchangeAdd((long*)value, (long)value_to_add)) + value_to_add;
+    }
+
+    // Returns original value before addition.
+    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return uint32(_InterlockedExchangeAdd((long*)value, (long)value_to_add));
+    }
+
     // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
     // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
     // @@ Is this strong or weak? Does InterlockedCompareExchange have spurious failures?
@@ -148,10 +165,7 @@ namespace nv {
         return (uint32)_InterlockedExchange((long *)value, (long)desired);
     }
 
-    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return (uint32)_InterlockedExchangeAdd((long*)value, (long)value_to_add);
-    }
-
-
 #elif NV_CC_CLANG && (NV_OS_IOS || NV_OS_DARWIN)
 
     NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long));
@@ -180,20 +194,31 @@ namespace nv {
     }
     */
 
-    inline uint32 atomicIncrement(uint32 * value)
-    {
+    // Returns incremented value.
+    inline uint32 atomicIncrement(uint32 * value) {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-
         return __sync_add_and_fetch(value, 1);
     }
 
-    inline uint32 atomicDecrement(uint32 * value)
-    {
+    // Returns decremented value.
+    inline uint32 atomicDecrement(uint32 * value) {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-
         return __sync_sub_and_fetch(value, 1);
     }
 
+    // Returns added value.
+    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return __sync_add_and_fetch(value, value_to_add);
+    }
+
+    // Returns original value before addition.
+    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return __sync_fetch_and_add(value, value_to_add);
+    }
+
     // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
     // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
     // @@ Is this strong or weak?
@@ -210,10 +235,6 @@ namespace nv {
         return __sync_lock_test_and_set(value, desired);
     }
 
-    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_add_and_fetch(value, value_to_add);
-    }
-
 
@@ -271,20 +292,30 @@ namespace nv {
     // Many alternative implementations at:
     // http://www.memoryhole.net/kyle/2007/05/atomic_incrementing.html
 
-    inline uint32 atomicIncrement(uint32 * value)
-    {
+    // Returns incremented value.
+    inline uint32 atomicIncrement(uint32 * value) {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-
         return __sync_add_and_fetch(value, 1);
     }
 
-    inline uint32 atomicDecrement(uint32 * value)
-    {
+    // Returns decremented value.
+    inline uint32 atomicDecrement(uint32 * value) {
         nvDebugCheck((intptr_t(value) & 3) == 0);
-
         return __sync_sub_and_fetch(value, 1);
     }
 
+    // Returns added value.
+    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return __sync_add_and_fetch(value, value_to_add);
+    }
+
+    // Returns original value before addition.
+    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return __sync_fetch_and_add(value, value_to_add);
+    }
+
     // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
     // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
     // @@ Is this strong or weak?
@@ -300,12 +331,6 @@ namespace nv {
         // this is confusingly named, it doesn't actually do a test but always sets
         return __sync_lock_test_and_set(value, desired);
     }
 
-    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_add_and_fetch(value, value_to_add);
-    }
-
-
 #else
 #error "Atomics not implemented."
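
The hunks above (the atomics header, Atomic.h, judging from the include in the thread-pool source further down) replace the old atomicAdd with a consistent pair on every platform. Note that the old code disagreed with itself: on MSVC atomicAdd returned the pre-addition value (_InterlockedExchangeAdd), while on GCC/Clang it returned the post-addition value (__sync_add_and_fetch). The new pair makes the contract explicit: atomicAdd returns the new value, atomicFetchAndAdd returns the value before the addition. A minimal standalone sketch of those semantics using std::atomic (an illustration for this note, not part of the commit):

#include <atomic>
#include <cassert>
#include <cstdint>

int main() {
    std::atomic<uint32_t> value(0);

    // atomicAdd semantics: return the added (new) value, like
    // __sync_add_and_fetch, or _InterlockedExchangeAdd plus the addend on MSVC.
    uint32_t added = value.fetch_add(5) + 5;
    assert(added == 5);

    // atomicFetchAndAdd semantics: return the original value,
    // like __sync_fetch_and_add.
    uint32_t original = value.fetch_add(5);
    assert(original == 5 && value.load() == 10);
    return 0;
}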
@@ -11,18 +11,20 @@ using namespace nv;
 
 #define ENABLE_PARALLEL_FOR 1
 
-static void worker(void * arg) {
+static void worker(void * arg, int tid) {
    ParallelFor * owner = (ParallelFor *)arg;
 
    while(true) {
-        // Consume one element at a time. @@ Might be more efficient to have custom grain.
-        uint i = atomicIncrement(&owner->idx);
-        if (i > owner->count) {
+        uint new_idx = atomicFetchAndAdd(&owner->idx, owner->step);
+        if (new_idx >= owner->count) {
            break;
        }
 
-        owner->task(owner->context, i - 1);
+        const uint count = min(owner->count, new_idx + owner->step);
+        for (uint i = new_idx; i < count; i++) {
+            owner->task(owner->context, /*tid, */i);
+        }
    }
 }
@@ -38,22 +40,16 @@ ParallelFor::~ParallelFor() {
 #endif
 }
 
-void ParallelFor::run(uint count, bool calling_thread_process_work /*=false*/) {
+void ParallelFor::run(uint count, uint step/*= 1*/) {
 #if ENABLE_PARALLEL_FOR
    storeRelease(&this->count, count);
+    storeRelease(&this->step, step);
 
    // Init atomic counter to zero.
    storeRelease(&idx, 0);
 
-    // Start threads.
-    pool->start(worker, this);
-
-    if (calling_thread_process_work) {
-        worker(this);
-    }
-
-    // Wait for all threads to complete.
-    pool->wait();
+    pool->run(worker, this);
 
    nvDebugCheck(idx >= count);
 #else
@@ -63,4 +59,3 @@ void ParallelFor::run(uint count, bool calling_thread_process_work /*=false*/) {
 #endif
 }
 
-
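
The worker change above is the heart of the commit: instead of claiming one element per atomic operation (the old atomicIncrement path, with its awkward i - 1 indexing), each thread now claims a batch of 'step' indices with a single atomicFetchAndAdd and processes the half-open range [new_idx, min(count, new_idx + step)). One atomic per batch instead of one per item reduces contention on the shared counter. A single-threaded sketch of the claiming logic (the driver loop is illustrative, not from the commit):

#include <algorithm>
#include <cstdio>

int main() {
    unsigned count = 10, step = 4;
    unsigned idx = 0; // The shared atomic counter in the real code.

    while (true) {
        unsigned new_idx = idx; idx += step;    // Stands in for atomicFetchAndAdd(&idx, step).
        if (new_idx >= count) break;

        unsigned end = std::min(count, new_idx + step);
        for (unsigned i = new_idx; i < end; i++) {
            printf("process item %u\n", i);     // Stands in for owner->task(owner->context, i).
        }
    }
    // Prints items 0..9, claimed in batches {0-3}, {4-7}, {8-9}.
    return 0;
}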
@@ -12,26 +12,169 @@ namespace nv
     class Thread;
     class ThreadPool;
 
-    typedef void ForTask(void * context, int id);
+    typedef void ForTask(void * context, /*int tid,*/ int idx); // @@ It would be nice to have the thread index as an argument here.
 
     struct ParallelFor {
         ParallelFor(ForTask * task, void * context);
         ~ParallelFor();
 
-        void run(uint count, bool calling_thread_process_work = false);
+        void run(uint count, uint step = 1);
 
         // Invariant:
         ForTask * task;
         void * context;
         ThreadPool * pool;
-        //uint workerCount; // @@ Move to thread pool.
-        //Thread * workers;
 
         // State:
         uint count;
+        uint step;
         /*atomic<uint>*/ uint idx;
     };
 
+
+#if NV_CC_CPP11
+
+    template <typename F>
+    void sequential_for(uint count, F f) {
+        for (uint i = 0; i < count; i++) {
+            f(i);
+        }
+    }
+
+
+    template <typename F>
+    void parallel_for(uint count, uint step, F f) {
+        // Transform lambda into function pointer.
+        auto lambda = [](void* context, /*int tid, */int idx) {
+            F & f = *reinterpret_cast<F *>(context);
+            f(/*tid, */idx);
+        };
+
+        ParallelFor pf(lambda, &f);
+        pf.run(count, step);
+    }
+
+
+    template <typename F>
+    void parallel_for(uint count, F f) {
+        parallel_for(count, /*step=*/1, f);
+    }
+
+
+    template <typename F>
+    void parallel_for_if(uint count, uint step, bool condition, F f) {
+        if (condition) {
+            parallel_for(count, step, f);
+        }
+        else {
+            sequential_for(count, f);
+        }
+    }
+
+
+#if 0
+    template <typename F, typename T>
+    void parallel_for_each(Array<T> & array, uint step, F f) {
+        // Transform lambda into function pointer.
+        auto lambda = [](void* context, int idx) {
+            F & f = *reinterpret_cast<F *>(context);
+            f(array[idx]);
+        };
+
+        ParallelFor pf(lambda, &f);
+        pf.run(count, step);
+    }
+#endif
+
+
+#endif // NV_CC_CPP11
+
+
+    /*
+
+    #include "nvthread/Mutex.h"
+    #include "nvcore/Array.inl"
+
+    template <typename T>
+    struct ParallelOutputStream {
+#if 0
+        // In its most basic implementation the parallel stream is simply a single array protected by a mutex.
+        Parallel_Output_Stream(uint producer_count) {}
+
+        void reset() { final_array.clear(); }
+        void append(uint producer_id, const T & t) { Lock(mutex); final_array.append(t); }
+        nv::Array<T> & finalize() { return final_array; }
+
+        nv::Mutex mutex;
+        nv::Array<T> final_array;
+
+#elif 0
+        // Another simple implementation is to have N arrays that are merged at the end.
+        ParallelOutputStream(uint producer_count) : producer_count(producer_count) {
+            partial_array = new Array<T>[producer_count];
+        }
+
+        void reset() {
+            for (int i = 0; i < producer_count; i++) {
+                partial_array[i].clear();
+            }
+        }
+
+        void append(uint producer_id, const T & t) {
+            nvCheck(producer_id < producer_count);
+            partial_array[producer_id].append(t);
+        }
+
+        nv::Array<T> & finalize() {
+            for (int i = 1; i < producer_count; i++) {
+                partial_array->append(partial_array[i]);
+                partial_array[i].clear();
+            }
+            return *partial_array;
+        }
+
+        uint producer_count;
+        nv::Array<T> * partial_array;
+#else
+        ParallelOutputStream(uint producer_count) : producer_count(producer_count) {
+            partial_array = new PartialArray[producer_count];
+        }
+
+        // But a more sophisticated implementation keeps N short arrays that are merged as they get full. This preserves partial order.
+        struct PartialArray { // Make sure this is aligned to cache lines. We want producers to access their respective arrays without conflicts.
+            uint count;
+            T data[32]; // Pick size to minimize wasted space considering cache line alignment?
+        };
+
+        const uint producer_count;
+        PartialArray * partial_array;
+
+        // @@ Make sure mutex and partial_array are not in the same cache line!
+
+        nv::Mutex mutex;
+        nv::Array<T> final_array;
+
+        void append(uint producer_id, const T & t) {
+            if (partial_array[producer_id].count == 32) {
+                partial_array[producer_id].count = 0;
+                Lock(mutex);
+                final_array.append(partial_array[producer_id].data, 32);
+            }
+
+            partial_array[producer_id].data[partial_array[producer_id].count++] = t;
+        }
+
+        nv::Array<T> & finalize() {
+            for (int i = 0; i < producer_count; i++) {
+                final_array.append(partial_array[producer_id].data, partial_array[producer_id].count);
+            }
+            return final_array;
+        }
+#endif
+    };
+
+    */
+
 
 } // nv namespace
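
For reference, a hypothetical use of the new parallel_for wrapper declared above (the float array and its transform are illustrative; nv::uint is the library's unsigned integer type, and the include path is an assumption):

#include <cmath>
#include "nvthread/ParallelFor.h" // Assumed include path.

void scaleAll(float * data, unsigned count) {
    // The capturing lambda travels through ParallelFor's void* context; the
    // capture-less wrapper inside parallel_for is what decays to ForTask*.
    nv::parallel_for(count, /*step=*/64, [=](int idx) {
        data[idx] = std::sqrt(data[idx]);
    });
}

Two observations on the drafts above: the #if 0'd parallel_for_each would not compile as written, since its capture-less lambda refers to 'array' and 'count', which are neither captured nor reachable through the context pointer, presumably why it is fenced off. Likewise, in the commented-out ParallelOutputStream sketch, 'Lock(mutex)' constructs an unnamed temporary that unlocks immediately, and finalize() indexes partial_array with producer_id where the loop variable i appears intended; both are consistent with the block being an unfinished note kept inside a comment.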
@@ -89,7 +89,7 @@ Thread::Thread() : p(new Private)
     p->name = NULL;
 }
 
-Thread::Thread(const char * const name) : p(new Private)
+Thread::Thread(const char * name) : p(new Private)
 {
     p->thread = 0;
     p->name = name;
@@ -100,6 +100,12 @@ Thread::~Thread()
     nvDebugCheck(p->thread == 0);
 }
 
+void Thread::setName(const char * name)
+{
+    nvCheck(p->name == NULL);
+    p->name = name;
+}
+
 void Thread::start(ThreadFunc * func, void * arg)
 {
     p->func = func;
|
||||
p->thread = CreateThread(NULL, 0, threadFunc, p.ptr(), 0, &threadId);
|
||||
//p->thread = (HANDLE)_beginthreadex (0, 0, threadFunc, p.ptr(), 0, NULL); // @@ So that we can call CRT functions...
|
||||
nvDebugCheck(p->thread != NULL);
|
||||
setThreadName(threadId, p->name);
|
||||
#if NV_USE_TELEMETRY
|
||||
tmThreadName(tmContext, threadId, p->name);
|
||||
#endif
|
||||
if (p->name != NULL) {
|
||||
setThreadName(threadId, p->name);
|
||||
#if NV_USE_TELEMETRY
|
||||
tmThreadName(tmContext, threadId, p->name);
|
||||
#endif
|
||||
}
|
||||
#elif NV_OS_ORBIS
|
||||
int ret = scePthreadCreate(&p->thread, NULL, threadFunc, p.ptr(), p->name ? p->name : "nv::Thread");
|
||||
nvDebugCheck(ret == 0);
|
||||
|
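
A hypothetical use of the new Thread::setName (the names and thread body are illustrative, and the include path is an assumption): the name must be set before start(), because start() is what registers it with the debugger and Telemetry, and with the guard added above an unnamed thread now simply skips registration instead of passing NULL along.

#include "nvthread/Thread.h" // Assumed include path.

static void decodeBlocks(void * arg) { /* ... thread body ... */ }

void exampleThread(void * job) {
    nv::Thread thread;
    thread.setName("decoder");        // nvCheck fires if a name was already set.
    thread.start(decodeBlocks, job);
    thread.wait();
}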
@@ -17,9 +17,11 @@ namespace nv
         NV_FORBID_COPY(Thread);
     public:
         Thread();
-        Thread(const char * const name);
+        Thread(const char * name);
         ~Thread();
 
+        void setName(const char * name);
+
         void start(ThreadFunc * func, void * arg);
         void wait();
@@ -6,6 +6,13 @@
 #include "Atomic.h"
 
+#include "nvcore/Utils.h"
+#include "nvcore/StrLib.h"
+
+#if NV_USE_TELEMETRY
+#include <telemetry.h>
+extern HTELEMETRY tmContext;
+#endif
+
 
 // Most of the time it's not necessary to protect the thread pool, but if it doesn't add a significant overhead, then it'd be safer to do it.
 #define PROTECT_THREAD_POOL 1
|
||||
AutoPtr<ThreadPool> s_pool;
|
||||
|
||||
|
||||
/*static*/ void ThreadPool::setup(uint workerCount, bool useThreadAffinity, bool useCallingThread) {
|
||||
#if PROTECT_THREAD_POOL
|
||||
Lock<Mutex> lock(s_pool_mutex);
|
||||
#endif
|
||||
|
||||
s_pool = new ThreadPool(workerCount, useThreadAffinity, useCallingThread);
|
||||
}
|
||||
|
||||
/*static*/ ThreadPool * ThreadPool::acquire()
|
||||
{
|
||||
#if PROTECT_THREAD_POOL
|
||||
@ -52,36 +67,59 @@ AutoPtr<ThreadPool> s_pool;
|
||||
/*static*/ void ThreadPool::workerFunc(void * arg) {
|
||||
uint i = U32((uintptr_t)arg); // This is OK, because workerCount should always be much smaller than 2^32
|
||||
|
||||
//ThreadPool::threadId = i;
|
||||
|
||||
if (s_pool->useThreadAffinity) {
|
||||
lockThreadToProcessor(s_pool->useCallingThread + i);
|
||||
}
|
||||
|
||||
while(true)
|
||||
{
|
||||
s_pool->startEvents[i].wait();
|
||||
|
||||
nv::ThreadFunc * func = loadAcquirePointer(&s_pool->func);
|
||||
ThreadTask * func = loadAcquirePointer(&s_pool->func);
|
||||
|
||||
if (func == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
func(s_pool->arg);
|
||||
{
|
||||
#if NV_USE_TELEMETRY
|
||||
tmZoneFiltered(tmContext, 20, TMZF_NONE, "worker");
|
||||
#endif
|
||||
func(s_pool->arg, s_pool->useCallingThread + i);
|
||||
}
|
||||
|
||||
s_pool->finishEvents[i].post();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
ThreadPool::ThreadPool()
|
||||
ThreadPool::ThreadPool(uint workerCount/*=processorCount()*/, bool useThreadAffinity/*=true*/, bool useCallingThread/*=false*/)
|
||||
{
|
||||
s_pool = this; // Worker threads need this to be initialized before they start.
|
||||
|
||||
workerCount = nv::hardwareThreadCount();
|
||||
workers = new Thread[workerCount];
|
||||
this->useThreadAffinity = useThreadAffinity;
|
||||
this->workerCount = workerCount;
|
||||
this->useCallingThread = useCallingThread;
|
||||
|
||||
startEvents = new Event[workerCount];
|
||||
finishEvents = new Event[workerCount];
|
||||
uint threadCount = workerCount - useCallingThread;
|
||||
|
||||
workers = new Thread[threadCount];
|
||||
|
||||
startEvents = new Event[threadCount];
|
||||
finishEvents = new Event[threadCount];
|
||||
|
||||
nvCompilerWriteBarrier(); // @@ Use a memory fence?
|
||||
|
||||
for (uint i = 0; i < workerCount; i++) {
|
||||
if (useCallingThread && useThreadAffinity) {
|
||||
lockThreadToProcessor(0); // Calling thread always locked to processor 0.
|
||||
}
|
||||
|
||||
for (uint i = 0; i < threadCount; i++) {
|
||||
StringBuilder name;
|
||||
name.format("worker %d", i);
|
||||
workers[i].setName(name.release()); // @Leak
|
||||
workers[i].start(workerFunc, (void *)i);
|
||||
}
|
||||
|
||||
@@ -94,14 +132,28 @@ ThreadPool::~ThreadPool()
     start(NULL, NULL);
 
     // Wait until threads actually exit.
-    Thread::wait(workers, workerCount);
+    Thread::wait(workers, workerCount - useCallingThread);
 
     delete [] workers;
     delete [] startEvents;
     delete [] finishEvents;
 }
 
-void ThreadPool::start(ThreadFunc * func, void * arg)
+void ThreadPool::run(ThreadTask * func, void * arg)
+{
+    // Wait until threads are idle.
+    wait();
+
+    start(func, arg);
+
+    if (useCallingThread) {
+        func(arg, 0);
+    }
+
+    wait();
+}
+
+void ThreadPool::start(ThreadTask * func, void * arg)
 {
     // Wait until threads are idle.
     wait();
|
||||
allIdle = false;
|
||||
|
||||
// Resume threads.
|
||||
Event::post(startEvents, workerCount);
|
||||
Event::post(startEvents, workerCount - useCallingThread);
|
||||
}
|
||||
|
||||
void ThreadPool::wait()
|
||||
@@ -121,7 +173,7 @@ void ThreadPool::wait()
     if (!allIdle)
     {
         // Wait for threads to complete.
-        Event::wait(finishEvents, workerCount);
+        Event::wait(finishEvents, workerCount - useCallingThread);
 
         allIdle = true;
     }
@@ -14,30 +14,42 @@
 // The thread pool runs the same function in all worker threads, the idea is to use this as the foundation of a custom task scheduler.
 // When the thread pool starts, the main thread continues running, but the common use case is to inmmediately wait for the termination events of the worker threads.
-// @@ The start and wait methods could probably be merged.
+// It may be running the thread function on the invoking thread to avoid thread switches.
 
 namespace nv {
 
     class Thread;
     class Event;
 
+    typedef void ThreadTask(void * context, int id);
+
     class ThreadPool {
         NV_FORBID_COPY(ThreadPool);
     public:
 
+        static void setup(uint workerCount, bool useThreadAffinity, bool useCallingThread);
+
         static ThreadPool * acquire();
         static void release(ThreadPool *);
 
-        ThreadPool();
+        ThreadPool(uint workerCount = processorCount(), bool useThreadAffinity = true, bool useCallingThread = false);
         ~ThreadPool();
 
-        void start(ThreadFunc * func, void * arg);
+        void run(ThreadTask * func, void * arg);
+
+        void start(ThreadTask * func, void * arg);
         void wait();
 
+        //NV_THREAD_LOCAL static uint threadId;
+
     private:
 
         static void workerFunc(void * arg);
 
+        bool useThreadAffinity;
+        bool useCallingThread;
         uint workerCount;
 
         Thread * workers;
         Event * startEvents;
         Event * finishEvents;
@@ -45,10 +57,29 @@ namespace nv {
         uint allIdle;
 
         // Current function:
-        ThreadFunc * func;
+        ThreadTask * func;
         void * arg;
     };
 
 
+#if NV_CC_CPP11
+
+    template <typename F>
+    void thread_pool_run(F f) {
+        // Transform lambda into function pointer.
+        auto lambda = [](void* context, int id) {
+            F & f = *reinterpret_cast<F *>(context);
+            f(id);
+        };
+
+        ThreadPool * pool = ThreadPool::acquire();
+        pool->run(lambda, &f);
+        ThreadPool::release(pool);
+    }
+
+#endif // NV_CC_CPP11
+
 
 } // namespace nv
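
A hypothetical use of the thread_pool_run helper declared above (the counter is illustrative, and the include path is an assumption):

#include <atomic>
#include "nvthread/ThreadPool.h" // Assumed include path.

void exampleThreadPoolRun() {
    std::atomic<int> ran(0);

    nv::thread_pool_run([&ran](int id) {
        // Runs once per pool thread (including the calling thread when the
        // pool was set up with useCallingThread); 'id' identifies the thread.
        ran.fetch_add(1);
    });

    // 'ran' now equals the pool's workerCount.
}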
@@ -27,15 +27,65 @@
 
 using namespace nv;
 
+#if NV_OS_WIN32
+
+// Find the number of cores in the system.
+typedef BOOL(WINAPI *LPFN_GSI)(LPSYSTEM_INFO);
+typedef BOOL(WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);
+
+static bool isWow64() {
+    LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandle("kernel32"), "IsWow64Process");
+
+    BOOL wow64 = FALSE;
+
+    if (NULL != fnIsWow64Process) {
+        if (!fnIsWow64Process(GetCurrentProcess(), &wow64)) {
+            // If error, assume false.
+        }
+    }
+
+    return wow64 != 0;
+}
+
+static void getSystemInfo(SYSTEM_INFO * sysinfo) {
+    BOOL success = FALSE;
+
+    if (isWow64()) {
+        LPFN_GSI fnGetNativeSystemInfo = (LPFN_GSI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetNativeSystemInfo");
+
+        if (fnGetNativeSystemInfo != NULL) {
+            success = fnGetNativeSystemInfo(sysinfo);
+        }
+    }
+
+    if (!success) {
+        GetSystemInfo(sysinfo);
+    }
+}
+
+#endif // NV_OS_WIN32
+
 // Find the number of logical processors in the system.
 // Based on: http://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
 // @@ Distinguish between logical and physical cores?
-uint nv::hardwareThreadCount() {
+uint nv::processorCount() {
 #if NV_OS_WIN32
     SYSTEM_INFO sysinfo;
-    GetSystemInfo( &sysinfo );
-    return sysinfo.dwNumberOfProcessors;
+    getSystemInfo(&sysinfo);
+    //return sysinfo.dwNumberOfProcessors;
+
+    // Respect process affinity mask?
+    DWORD_PTR pam, sam;
+    GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam);
+
+    // Count number of bits set in the processor affinity mask.
+    uint count = 0;
+    for (int i = 0; i < sizeof(DWORD_PTR) * 8; i++) {
+        if (pam & (DWORD_PTR(1) << i)) count++;
+    }
+    nvDebugCheck(count <= sysinfo.dwNumberOfProcessors);
+
+    return count;
 #elif NV_OS_ORBIS
     return 6;
 #elif NV_OS_XBOX
     return 3; // or 6?
 #elif NV_OS_LINUX // Linux, Solaris, & AIX
@@ -72,10 +122,211 @@ uint nv::hardwareThreadCount() {
 #endif
 }
 
 
+uint nv::threadId() {
+#if NV_OS_WIN32
+    return GetCurrentThreadId();
+#else
+    return 0; // @@
+#endif
+}
+
+
+// @@ If we are using less worker threads than processors and hyperthreading is available, we probably want to enumerate the logical processors
+// so that the first cores of each processor goes first. This way, if say, we leave 2 hardware threads free, then we still have one worker
+// thread on each physical processor.
+
+// I believe that currently logical processors are enumerated in physical order, that is:
+// 0 = thread a in physical core 0
+// 1 = thread b in physical core 0
+// 2 = thread a in physical core 1
+// ... and so on ...
+// I'm not sure we can actually rely on that. And in any case we should start detecting the number of physical processors, which appears to be a pain
+// to do in a way that's compatible with newer i7 processors.
+
+void nv::lockThreadToProcessor(int idx) {
+#if NV_OS_WIN32
+    //nvDebugCheck(idx < hardwareThreadCount());
+#if 0
+    DWORD_PTR tam = 1 << idx;
+#else
+    DWORD_PTR pam, sam;
+    BOOL rc = GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam);
+
+    // Find the idx's bit set.
+    uint pidx = 0;
+    DWORD_PTR tam = 0;
+    for (int i = 0; i < sizeof(DWORD_PTR) * 8; i++) {
+        DWORD_PTR mask = DWORD_PTR(1) << i;
+        if (pam & mask) {
+            if (pidx == idx) {
+                tam = mask;
+                break;
+            }
+            pidx++;
+        }
+    }
+
+    nvDebugCheck(tam != 0);
+#endif
+
+    SetThreadAffinityMask(GetCurrentThread(), tam);
+#else
+    // @@ NOP
+#endif
+}
+
+
+void nv::unlockThreadToProcessor() {
+#if NV_OS_WIN32
+    DWORD_PTR pam, sam;
+    BOOL rc = GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam);
+    SetThreadAffinityMask(GetCurrentThread(), pam);
+#else
+    // @@ NOP
+#endif
+}
+
+uint nv::logicalProcessorCount() {
+    return processorCount();
+}
+
+
+#if NV_OS_WIN32
+
+struct LOGICALPROCESSORDATA
+{
+    unsigned int nLargestStandardFunctionNumber;
+    unsigned int nLargestExtendedFunctionNumber;
+    int nLogicalProcessorCount;
+    int nLocalApicId;
+    int nCPUcore;
+    int nProcessorId;
+    int nApicIdCoreIdSize;
+    int nNC;
+    int nMNC;
+    int nCPUCoresperProcessor;
+    int nThreadsperCPUCore;
+    int nProcId;
+    int nCoreId;
+    bool CmpLegacy;
+    bool HTT;
+};
+
+#define MAX_NUMBER_OF_LOGICAL_PROCESSORS 96
+#define MAX_NUMBER_OF_PHYSICAL_PROCESSORS 8
+#define MAX_NUMBER_OF_IOAPICS 16
+static LOGICALPROCESSORDATA LogicalProcessorMap[MAX_NUMBER_OF_LOGICAL_PROCESSORS];
+static int PhysProcIds[MAX_NUMBER_OF_PHYSICAL_PROCESSORS + MAX_NUMBER_OF_IOAPICS];
+
+static void gatherProcessorData(LOGICALPROCESSORDATA * p) {
+
+    int CPUInfo[4] = { 0, 0, 0, 0 };
+    __cpuid(CPUInfo, 0);
+
+    p->nLargestStandardFunctionNumber = CPUInfo[0];
+
+    // Get the information associated with each valid Id
+    for (uint i = 0; i <= p->nLargestStandardFunctionNumber; ++i) {
+        __cpuid(CPUInfo, i);
+
+        // Interpret CPU feature information.
+        if (i == 1) {
+            // Some of the bits of LocalApicId represent the CPU core
+            // within a processor and other bits represent the processor ID.
+            p->nLocalApicId = (CPUInfo[1] >> 24) & 0xff;
+            p->HTT = (CPUInfo[3] >> 28) & 0x1;
+            // recalculate later after 0x80000008
+            p->nLogicalProcessorCount = (CPUInfo[1] >> 16) & 0x0FF;
+        }
+    }
+
+    // Calling __cpuid with 0x80000000 as the InfoType argument
+    // gets the number of valid extended IDs.
+    __cpuid(CPUInfo, 0x80000000);
+    p->nLargestExtendedFunctionNumber = CPUInfo[0];
+
+    // Get the information associated with each extended ID.
+    for (uint i = 0x80000000; i <= p->nLargestExtendedFunctionNumber; ++i) {
+        __cpuid(CPUInfo, i);
+        if (i == 0x80000008) {
+            p->nApicIdCoreIdSize = (CPUInfo[2] >> 12) & 0xF;
+            p->nNC = (CPUInfo[2]) & 0x0FF;
+        }
+    }
+
+    // MNC
+    // A value of zero for ApicIdCoreIdSize indicates that MNC is derived by this
+    // legacy formula: MNC = NC + 1
+    // A non-zero value of ApicIdCoreIdSize means that MNC is 2^ApicIdCoreIdSize
+    if (p->nApicIdCoreIdSize) {
+        p->nMNC = 2;
+        for (uint j = p->nApicIdCoreIdSize - 1; j > 0; j--) {
+            p->nMNC = p->nMNC * 2;
+        }
+    }
+    else {
+        p->nMNC = p->nNC + 1;
+    }
+
+    // If HTT==0, then LogicalProcessorCount is reserved, and the CPU contains
+    // one CPU core and the CPU core is single-threaded.
+    // If HTT==1 and CmpLegacy==1, LogicalProcessorCount represents the number of
+    // CPU cores per processor, where each CPU core is single-threaded. If HTT==1
+    // and CmpLegacy==0, then LogicalProcessorCount is the number of threads per
+    // processor, which is the number of cores times the number of threads per core.
+    // The number of cores is NC+1.
+    p->nCPUCoresperProcessor = p->nNC + 1;
+    p->nThreadsperCPUCore = (p->HTT == 0 ? 1 : (p->CmpLegacy == 1 ? 1 : p->nLogicalProcessorCount / p->nCPUCoresperProcessor));
+
+    // Calculate a mask for the core IDs
+    uint mask = 1;
+    uint numbits = 1;
+    if (p->nApicIdCoreIdSize) {
+        numbits = p->nApicIdCoreIdSize;
+        for (uint j = p->nApicIdCoreIdSize; j > 1; j--) {
+            mask = (mask << 1) + 1;
+        }
+    }
+    p->nProcId = (p->nLocalApicId & ~mask) >> numbits;
+    p->nCoreId = p->nLocalApicId & mask;
+}
+
+
+uint nv::physicalProcessorCount() {
+
+    uint lpc = logicalProcessorCount();
+
+    // Get info about each logical processor.
+    for (uint i = 0; i < lpc; i++) {
+        // Make sure thread doesn't change processor while we gather it's data.
+        lockThreadToProcessor(i);
+
+        gatherProcessorData(&LogicalProcessorMap[i]);
+    }
+
+    unlockThreadToProcessor();
+
+    memset(PhysProcIds, 0, sizeof(PhysProcIds));
+    for (uint i = 0; i < lpc; i++) {
+        PhysProcIds[LogicalProcessorMap[i].nProcId]++;
+    }
+
+    uint pc = 0;
+    for (uint i = 0; i < (MAX_NUMBER_OF_PHYSICAL_PROCESSORS + MAX_NUMBER_OF_IOAPICS); i++) {
+        if (PhysProcIds[i] != 0) {
+            pc++;
+        }
+    }
+
+    return pc;
+}
+
+#else
+
+uint nv::physicalProcessorCount() {
+    // @@ Assume the same.
+    return processorCount();
+}
+
+#endif
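
The bit-walking in processorCount() and lockThreadToProcessor() above both iterate over the process affinity mask. A standalone check of that logic with a plain integer (the mask value is illustrative, not from the commit):

#include <cassert>
#include <cstdint>

int main() {
    uint64_t pam = 0xF0; // Hypothetical affinity mask: only processors 4..7 usable.

    // processorCount(): count the set bits.
    unsigned count = 0;
    for (int i = 0; i < 64; i++) {
        if (pam & (uint64_t(1) << i)) count++;
    }
    assert(count == 4);

    // lockThreadToProcessor(idx): pick the idx'th set bit, so logical index 0
    // maps to the first *usable* processor (bit 4), not to processor 0.
    unsigned idx = 0, pidx = 0;
    uint64_t tam = 0;
    for (int i = 0; i < 64; i++) {
        uint64_t mask = uint64_t(1) << i;
        if (pam & mask) {
            if (pidx == idx) { tam = mask; break; }
            pidx++;
        }
    }
    assert(tam == (uint64_t(1) << 4));
    return 0;
}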
@@ -82,13 +82,16 @@ BOOL WINAPI SwitchToThread(void);
 
 namespace nv
 {
-    // Reentrant.
-    uint hardwareThreadCount();
+    //void initThreadingSystemInfo();
 
-    // Not thread-safe. Use from main thread only.
-    void initWorkers();
-    void shutWorkers();
-    void setWorkerFunction(void * func);
+    // Reentrant.
+    uint processorCount();
+    uint logicalProcessorCount();
+    uint physicalProcessorCount();
+
+    // Locks the current thread to the given logical processor index.
+    void lockThreadToProcessor(int idx);
+    void unlockThreadToProcessor();
+
+    uint threadId();