Merge changes from The Witness.

Ignacio
2015-10-28 23:53:08 -07:00
parent a382ea5b21
commit c0ad0f4d31
43 changed files with 890 additions and 136 deletions

View File

@@ -119,18 +119,35 @@ namespace nv {
#if NV_CC_MSVC
NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long));
// Returns incremented value.
inline uint32 atomicIncrement(uint32 * value)
{
nvDebugCheck((intptr_t(value) & 3) == 0);
return (uint32)_InterlockedIncrement((long *)value);
return uint32(_InterlockedIncrement((long *)value));
}
// Returns decremented value.
inline uint32 atomicDecrement(uint32 * value)
{
nvDebugCheck((intptr_t(value) & 3) == 0);
return (uint32)_InterlockedDecrement((long *)value);
return uint32(_InterlockedDecrement((long *)value));
}
// Returns added value.
inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
nvDebugCheck((intptr_t(value) & 3) == 0);
return uint32(_InterlockedExchangeAdd((long*)value, (long)value_to_add)) + value_to_add;
}
// Returns original value before addition.
inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) {
nvDebugCheck((intptr_t(value) & 3) == 0);
return uint32(_InterlockedExchangeAdd((long*)value, (long)value_to_add));
}
// Compares '*value' against 'expected'; if equal, stores 'desired' in '*value'.
// @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
// @@ Is this strong or weak? Does InterlockedCompareExchange have spurious failures?
@@ -148,10 +165,7 @@ namespace nv {
return (uint32)_InterlockedExchange((long *)value, (long)desired);
}
inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
nvDebugCheck((intptr_t(value) & 3) == 0);
return (uint32)_InterlockedExchangeAdd((long*)value, (long)value_to_add);
}
#elif NV_CC_CLANG && (NV_OS_IOS || NV_OS_DARWIN)
NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long));
@@ -180,20 +194,31 @@ namespace nv {
}
*/
inline uint32 atomicIncrement(uint32 * value)
{
// Returns incremented value.
inline uint32 atomicIncrement(uint32 * value) {
nvDebugCheck((intptr_t(value) & 3) == 0);
return __sync_add_and_fetch(value, 1);
}
inline uint32 atomicDecrement(uint32 * value)
{
// Returns decremented value.
inline uint32 atomicDecrement(uint32 * value) {
nvDebugCheck((intptr_t(value) & 3) == 0);
return __sync_sub_and_fetch(value, 1);
}
// Returns added value.
inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
nvDebugCheck((intptr_t(value) & 3) == 0);
return __sync_add_and_fetch(value, value_to_add);
}
// Returns original value before addition.
inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) {
nvDebugCheck((intptr_t(value) & 3) == 0);
return __sync_fetch_and_add(value, value_to_add);
}
// Compares '*value' against 'expected'; if equal, stores 'desired' in '*value'.
// @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
// @@ Is this strong or weak?
@@ -210,10 +235,6 @@ namespace nv {
return __sync_lock_test_and_set(value, desired);
}
inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
nvDebugCheck((intptr_t(value) & 3) == 0);
return __sync_add_and_fetch(value, value_to_add);
}
@@ -271,20 +292,30 @@ namespace nv {
// Many alternative implementations at:
// http://www.memoryhole.net/kyle/2007/05/atomic_incrementing.html
inline uint32 atomicIncrement(uint32 * value)
{
// Returns incremented value.
inline uint32 atomicIncrement(uint32 * value) {
nvDebugCheck((intptr_t(value) & 3) == 0);
return __sync_add_and_fetch(value, 1);
}
inline uint32 atomicDecrement(uint32 * value)
{
// Returns decremented value.
inline uint32 atomicDecrement(uint32 * value) {
nvDebugCheck((intptr_t(value) & 3) == 0);
return __sync_sub_and_fetch(value, 1);
}
// Returns added value.
inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
nvDebugCheck((intptr_t(value) & 3) == 0);
return __sync_add_and_fetch(value, value_to_add);
}
// Returns original value before addition.
inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) {
nvDebugCheck((intptr_t(value) & 3) == 0);
return __sync_fetch_and_add(value, value_to_add);
}
// Compares '*value' against 'expected'; if equal, stores 'desired' in '*value'.
// @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
// @@ Is this strong or weak?
@@ -300,12 +331,6 @@ namespace nv {
// this is confusingly named; it doesn't actually do a test but always sets
return __sync_lock_test_and_set(value, desired);
}
inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
nvDebugCheck((intptr_t(value) & 3) == 0);
return __sync_add_and_fetch(value, value_to_add);
}
#else
#error "Atomics not implemented."
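The practical difference between the two addition primitives is easy to miss: atomicAdd returns the value after the addition, while atomicFetchAndAdd returns the value before it. A minimal sketch, not part of the commit ('counter' is just an illustrative variable):

    uint32 counter = 0;
    uint32 after  = atomicAdd(&counter, 4);          // after == 4,  counter == 4
    uint32 before = atomicFetchAndAdd(&counter, 4);  // before == 4, counter == 8

Claiming a range of work items wants the fetch-and-add form, which is what the ParallelFor change below builds on.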

View File

@@ -11,18 +11,20 @@ using namespace nv;
#define ENABLE_PARALLEL_FOR 1
static void worker(void * arg) {
static void worker(void * arg, int tid) {
ParallelFor * owner = (ParallelFor *)arg;
while(true) {
// Consume one element at a time. @@ Might be more efficient to have custom grain.
uint i = atomicIncrement(&owner->idx);
if (i > owner->count) {
uint new_idx = atomicFetchAndAdd(&owner->idx, owner->step);
if (new_idx >= owner->count) {
break;
}
owner->task(owner->context, i - 1);
}
const uint count = min(owner->count, new_idx + owner->step);
for (uint i = new_idx; i < count; i++) {
owner->task(owner->context, /*tid, */i);
}
}
}
@@ -38,22 +40,16 @@ ParallelFor::~ParallelFor() {
#endif
}
void ParallelFor::run(uint count, bool calling_thread_process_work /*=false*/) {
void ParallelFor::run(uint count, uint step/*= 1*/) {
#if ENABLE_PARALLEL_FOR
storeRelease(&this->count, count);
storeRelease(&this->step, step);
// Init atomic counter to zero.
storeRelease(&idx, 0);
// Start threads.
pool->start(worker, this);
if (calling_thread_process_work) {
worker(this);
}
// Wait for all threads to complete.
pool->wait();
pool->run(worker, this);
nvDebugCheck(idx >= count);
#else
@@ -63,4 +59,3 @@ void ParallelFor::run(uint count, bool calling_thread_process_work /*=false*/) {
#endif
}
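Each worker now claims step indices per atomic operation instead of one, so a run over 1024 items with step 16 costs 64 atomic adds rather than 1024 atomic increments. A hypothetical call site (blurRow and image are illustrative names, not part of the commit):

    static void blurRow(void * context, int idx) {
        Image * img = (Image *)context;
        // ... blur row 'idx' of *img ...
    }

    ParallelFor pf(blurRow, &image);
    pf.run(image.height, /*step=*/16);  // workers grab 16 rows per atomic add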

View File

@@ -12,26 +12,169 @@ namespace nv
class Thread;
class ThreadPool;
typedef void ForTask(void * context, int id);
typedef void ForTask(void * context, /*int tid,*/ int idx); // @@ It would be nice to have the thread index as an argument here.
struct ParallelFor {
ParallelFor(ForTask * task, void * context);
~ParallelFor();
void run(uint count, bool calling_thread_process_work = false);
void run(uint count, uint step = 1);
// Invariant:
ForTask * task;
void * context;
ThreadPool * pool;
//uint workerCount; // @@ Move to thread pool.
//Thread * workers;
// State:
uint count;
uint step;
/*atomic<uint>*/ uint idx;
};
#if NV_CC_CPP11
template <typename F>
void sequential_for(uint count, F f) {
for (uint i = 0; i < count; i++) {
f(i);
}
}
template <typename F>
void parallel_for(uint count, uint step, F f) {
// Transform lambda into function pointer.
auto lambda = [](void* context, /*int tid, */int idx) {
F & f = *reinterpret_cast<F *>(context);
f(/*tid, */idx);
};
ParallelFor pf(lambda, &f);
pf.run(count, step);
}
template <typename F>
void parallel_for(uint count, F f) {
parallel_for(count, /*step=*/1, f);
}
template <typename F>
void parallel_for_if(uint count, uint step, bool condition, F f) {
if (condition) {
parallel_for(count, step, f);
}
else {
sequential_for(count, f);
}
}
#if 0
template <typename F, typename T>
void parallel_for_each(Array<T> & array, uint step, F f) {
// The lambda must be captureless to decay to a function pointer, so pass
// both the array and the functor through the context argument.
struct Ctx { Array<T> * array; F * f; };
Ctx ctx = { &array, &f };
auto lambda = [](void* context, int idx) {
Ctx * c = reinterpret_cast<Ctx *>(context);
(*c->f)((*c->array)[idx]);
};
ParallelFor pf(lambda, &ctx);
pf.run(array.count(), step);
}
#endif
#endif // NV_CC_CPP11
/*
#include "nvthread/Mutex.h"
#include "nvcore/Array.inl"
template <typename T>
struct ParallelOutputStream {
#if 0
// In its most basic implementation the parallel stream is simply a single array protected by a mutex.
ParallelOutputStream(uint producer_count) {}
void reset() { final_array.clear(); }
void append(uint producer_id, const T & t) { Lock<Mutex> lock(mutex); final_array.append(t); }
nv::Array<T> & finalize() { return final_array; }
nv::Mutex mutex;
nv::Array<T> final_array;
#elif 0
// Another simple implementation is to have N arrays that are merged at the end.
ParallelOutputStream(uint producer_count) : producer_count(producer_count) {
partial_array = new Array<T>[producer_count];
}
void reset() {
for (int i = 0; i < producer_count; i++) {
partial_array[i].clear();
}
}
void append(uint producer_id, const T & t) {
nvCheck(producer_id < producer_count);
partial_array[producer_id].append(t);
}
nv::Array<T> & finalize() {
for (int i = 1; i < producer_count; i++) {
partial_array->append(partial_array[i]);
partial_array[i].clear();
}
return *partial_array;
}
uint producer_count;
nv::Array<T> * partial_array;
#else
ParallelOutputStream(uint producer_count) : producer_count(producer_count) {
partial_array = new PartialArray[producer_count];
}
// But a more sophisticated implementation keeps N short arrays that are merged as they get full. This preserves partial order.
struct PartialArray { // Make sure this is aligned to cache lines. We want producers to access their respective arrays without conflicts.
uint count;
T data[32]; // Pick size to minimize wasted space considering cache line alignment?
};
const uint producer_count;
PartialArray * partial_array;
// @@ Make sure mutex and partial_array are not in the same cache line!
nv::Mutex mutex;
nv::Array<T> final_array;
void append(uint producer_id, const T & t) {
if (partial_array[producer_id].count == 32) {
partial_array[producer_id].count = 0;
Lock<Mutex> lock(mutex);
final_array.append(partial_array[producer_id].data, 32);
}
partial_array[producer_id].data[partial_array[producer_id].count++] = t;
}
nv::Array<T> & finalize() {
for (int i = 0; i < producer_count; i++) {
final_array.append(partial_array[i].data, partial_array[i].count);
}
return final_array;
}
#endif
};
*/
} // nv namespace
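Assuming a C++11 compiler, the wrappers above let a call site hand a capturing lambda straight to ParallelFor; the captureless inner lambda does the function-pointer conversion and the functor travels through the context pointer. A sketch with illustrative data:

    float values[1024];
    // ... fill values ...
    parallel_for(1024, /*step=*/32, [&](int idx) {
        values[idx] *= values[idx];
    });

parallel_for_if covers the common case where the grain is only sometimes worth the thread wake-ups, e.g. parallel_for_if(n, 32, n > 4096, f).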

View File

@@ -89,7 +89,7 @@ Thread::Thread() : p(new Private)
p->name = NULL;
}
Thread::Thread(const char * const name) : p(new Private)
Thread::Thread(const char * name) : p(new Private)
{
p->thread = 0;
p->name = name;
@@ -100,6 +100,12 @@ Thread::~Thread()
nvDebugCheck(p->thread == 0);
}
void Thread::setName(const char * name)
{
nvCheck(p->name == NULL);
p->name = name;
}
void Thread::start(ThreadFunc * func, void * arg)
{
p->func = func;
@@ -110,10 +116,12 @@ void Thread::start(ThreadFunc * func, void * arg)
p->thread = CreateThread(NULL, 0, threadFunc, p.ptr(), 0, &threadId);
//p->thread = (HANDLE)_beginthreadex (0, 0, threadFunc, p.ptr(), 0, NULL); // @@ So that we can call CRT functions...
nvDebugCheck(p->thread != NULL);
setThreadName(threadId, p->name);
#if NV_USE_TELEMETRY
tmThreadName(tmContext, threadId, p->name);
#endif
if (p->name != NULL) {
setThreadName(threadId, p->name);
#if NV_USE_TELEMETRY
tmThreadName(tmContext, threadId, p->name);
#endif
}
#elif NV_OS_ORBIS
int ret = scePthreadCreate(&p->thread, NULL, threadFunc, p.ptr(), p->name ? p->name : "nv::Thread");
nvDebugCheck(ret == 0);

View File

@@ -17,9 +17,11 @@ namespace nv
NV_FORBID_COPY(Thread);
public:
Thread();
Thread(const char * const name);
Thread(const char * name);
~Thread();
void setName(const char * name);
void start(ThreadFunc * func, void * arg);
void wait();
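setName complements the new nameless constructor; the nvCheck in Thread.cpp enforces that it is called at most once, and the name is only applied if set before start. A brief sketch with hypothetical names:

    Thread loader;                      // constructed without a name
    loader.setName("texture loader");   // must precede start()
    loader.start(loadFunc, &queue);     // loadFunc and queue are illustrative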

View File

@@ -6,6 +6,13 @@
#include "Atomic.h"
#include "nvcore/Utils.h"
#include "nvcore/StrLib.h"
#if NV_USE_TELEMETRY
#include <telemetry.h>
extern HTELEMETRY tmContext;
#endif
// Most of the time it's not necessary to protect the thread pool, but as long as it doesn't add significant overhead it's safer to do so.
#define PROTECT_THREAD_POOL 1
@@ -20,6 +27,14 @@ Mutex s_pool_mutex("thread pool");
AutoPtr<ThreadPool> s_pool;
/*static*/ void ThreadPool::setup(uint workerCount, bool useThreadAffinity, bool useCallingThread) {
#if PROTECT_THREAD_POOL
Lock<Mutex> lock(s_pool_mutex);
#endif
s_pool = new ThreadPool(workerCount, useThreadAffinity, useCallingThread);
}
/*static*/ ThreadPool * ThreadPool::acquire()
{
#if PROTECT_THREAD_POOL
@@ -52,36 +67,59 @@ AutoPtr<ThreadPool> s_pool;
/*static*/ void ThreadPool::workerFunc(void * arg) {
uint i = U32((uintptr_t)arg); // This is OK, because workerCount should always be much smaller than 2^32
//ThreadPool::threadId = i;
if (s_pool->useThreadAffinity) {
lockThreadToProcessor(s_pool->useCallingThread + i);
}
while(true)
{
s_pool->startEvents[i].wait();
nv::ThreadFunc * func = loadAcquirePointer(&s_pool->func);
ThreadTask * func = loadAcquirePointer(&s_pool->func);
if (func == NULL) {
return;
}
func(s_pool->arg);
{
#if NV_USE_TELEMETRY
tmZoneFiltered(tmContext, 20, TMZF_NONE, "worker");
#endif
func(s_pool->arg, s_pool->useCallingThread + i);
}
s_pool->finishEvents[i].post();
}
}
ThreadPool::ThreadPool()
ThreadPool::ThreadPool(uint workerCount/*=processorCount()*/, bool useThreadAffinity/*=true*/, bool useCallingThread/*=false*/)
{
s_pool = this; // Worker threads need this to be initialized before they start.
workerCount = nv::hardwareThreadCount();
workers = new Thread[workerCount];
this->useThreadAffinity = useThreadAffinity;
this->workerCount = workerCount;
this->useCallingThread = useCallingThread;
startEvents = new Event[workerCount];
finishEvents = new Event[workerCount];
uint threadCount = workerCount - useCallingThread;
workers = new Thread[threadCount];
startEvents = new Event[threadCount];
finishEvents = new Event[threadCount];
nvCompilerWriteBarrier(); // @@ Use a memory fence?
for (uint i = 0; i < workerCount; i++) {
if (useCallingThread && useThreadAffinity) {
lockThreadToProcessor(0); // Calling thread always locked to processor 0.
}
for (uint i = 0; i < threadCount; i++) {
StringBuilder name;
name.format("worker %d", i);
workers[i].setName(name.release()); // @Leak
workers[i].start(workerFunc, (void *)i);
}
@@ -94,14 +132,28 @@ ThreadPool::~ThreadPool()
start(NULL, NULL);
// Wait until threads actually exit.
Thread::wait(workers, workerCount);
Thread::wait(workers, workerCount - useCallingThread);
delete [] workers;
delete [] startEvents;
delete [] finishEvents;
}
void ThreadPool::start(ThreadFunc * func, void * arg)
void ThreadPool::run(ThreadTask * func, void * arg)
{
// Wait until threads are idle.
wait();
start(func, arg);
if (useCallingThread) {
func(arg, 0);
}
wait();
}
void ThreadPool::start(ThreadTask * func, void * arg)
{
// Wait until threads are idle.
wait();
@@ -113,7 +165,7 @@ void ThreadPool::start(ThreadFunc * func, void * arg)
allIdle = false;
// Resume threads.
Event::post(startEvents, workerCount);
Event::post(startEvents, workerCount - useCallingThread);
}
void ThreadPool::wait()
@@ -121,7 +173,7 @@ void ThreadPool::wait()
if (!allIdle)
{
// Wait for threads to complete.
Event::wait(finishEvents, workerCount);
Event::wait(finishEvents, workerCount - useCallingThread);
allIdle = true;
}
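The useCallingThread accounting deserves a worked example (values are illustrative, not from the commit): with workerCount = 8 and useCallingThread = true, only 7 OS threads are created; run() executes the task once on the caller with id 0, and the pooled workers receive ids 1 through 7 via useCallingThread + i.

    // myTask and myContext are hypothetical.
    ThreadPool::setup(/*workerCount=*/8, /*useThreadAffinity=*/true, /*useCallingThread=*/true);
    ThreadPool * pool = ThreadPool::acquire();
    pool->run(myTask, &myContext);   // myTask(void *, int) runs 8 times in total
    ThreadPool::release(pool);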

View File

@@ -14,30 +14,42 @@
// The thread pool runs the same function in all worker threads; the idea is to use this as the foundation of a custom task scheduler.
// When the thread pool starts, the main thread continues running, but the common use case is to immediately wait for the termination events of the worker threads.
// @@ The start and wait methods could probably be merged.
// It may also run the thread function on the invoking thread to avoid thread switches.
namespace nv {
class Thread;
class Event;
typedef void ThreadTask(void * context, int id);
class ThreadPool {
NV_FORBID_COPY(ThreadPool);
public:
static void setup(uint workerCount, bool useThreadAffinity, bool useCallingThread);
static ThreadPool * acquire();
static void release(ThreadPool *);
ThreadPool();
ThreadPool(uint workerCount = processorCount(), bool useThreadAffinity = true, bool useCallingThread = false);
~ThreadPool();
void start(ThreadFunc * func, void * arg);
void run(ThreadTask * func, void * arg);
void start(ThreadTask * func, void * arg);
void wait();
//NV_THREAD_LOCAL static uint threadId;
private:
static void workerFunc(void * arg);
bool useThreadAffinity;
bool useCallingThread;
uint workerCount;
Thread * workers;
Event * startEvents;
Event * finishEvents;
@@ -45,10 +57,29 @@ namespace nv {
uint allIdle;
// Current function:
ThreadFunc * func;
ThreadTask * func;
void * arg;
};
#if NV_CC_CPP11
template <typename F>
void thread_pool_run(F f) {
// Transform lambda into function pointer.
auto lambda = [](void* context, int id) {
F & f = *reinterpret_cast<F *>(context);
f(id);
};
ThreadPool * pool = ThreadPool::acquire();
pool->run(lambda, &f);
ThreadPool::release(pool);
}
#endif // NV_CC_CPP11
} // namespace nv
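thread_pool_run uses the same captureless-lambda trick as parallel_for, but hands each worker its id rather than a loop index, which suits per-worker setup or reductions. A hypothetical sketch:

    int partial[16] = {};               // illustrative per-worker accumulators
    nv::thread_pool_run([&](int id) {
        partial[id] = sumChunk(id);     // sumChunk is a hypothetical helper
    });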

View File

@@ -27,15 +27,65 @@
using namespace nv;
#if NV_OS_WIN32
// Find the number of cores in the system.
typedef BOOL(WINAPI *LPFN_GSI)(LPSYSTEM_INFO);
typedef BOOL(WINAPI *LPFN_ISWOW64PROCESS) (HANDLE, PBOOL);
static bool isWow64() {
LPFN_ISWOW64PROCESS fnIsWow64Process = (LPFN_ISWOW64PROCESS)GetProcAddress(GetModuleHandle("kernel32"), "IsWow64Process");
BOOL wow64 = FALSE;
if (NULL != fnIsWow64Process) {
if (!fnIsWow64Process(GetCurrentProcess(), &wow64)) {
// If error, assume false.
}
}
return wow64 != 0;
}
static void getSystemInfo(SYSTEM_INFO * sysinfo) {
BOOL success = FALSE;
if (isWow64()) {
LPFN_GSI fnGetNativeSystemInfo = (LPFN_GSI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetNativeSystemInfo");
if (fnGetNativeSystemInfo != NULL) {
success = fnGetNativeSystemInfo(sysinfo);
}
}
if (!success) {
GetSystemInfo(sysinfo);
}
}
#endif // NV_OS_WIN32
// Find the number of logical processors in the system.
// Based on: http://stackoverflow.com/questions/150355/programmatically-find-the-number-of-cores-on-a-machine
// @@ Distinguish between logical and physical cores?
uint nv::hardwareThreadCount() {
uint nv::processorCount() {
#if NV_OS_WIN32
SYSTEM_INFO sysinfo;
GetSystemInfo( &sysinfo );
return sysinfo.dwNumberOfProcessors;
getSystemInfo(&sysinfo);
//return sysinfo.dwNumberOfProcessors;
// Respect process affinity mask?
DWORD_PTR pam, sam;
GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam);
// Count number of bits set in the processor affinity mask.
uint count = 0;
for (int i = 0; i < sizeof(DWORD_PTR) * 8; i++) {
if (pam & (DWORD_PTR(1) << i)) count++;
}
nvDebugCheck(count <= sysinfo.dwNumberOfProcessors);
return count;
#elif NV_OS_ORBIS
return 6;
#elif NV_OS_XBOX
return 3; // or 6?
#elif NV_OS_LINUX // Linux, Solaris, & AIX
@@ -72,10 +122,211 @@ uint nv::hardwareThreadCount() {
#endif
}
uint nv::threadId() {
#if NV_OS_WIN32
return GetCurrentThreadId();
#else
return 0; // @@
#endif
}
}
// @@ If we are using fewer worker threads than processors and hyperthreading is available, we probably want to enumerate the logical processors
// so that the first core of each physical processor comes first. This way, if, say, we leave 2 hardware threads free, we still have one worker
// thread on each physical processor.
// I believe that currently logical processors are enumerated in physical order, that is:
// 0 = thread a in physical core 0
// 1 = thread b in physical core 0
// 2 = thread a in physical core 1
// ... and so on ...
// I'm not sure we can actually rely on that. And in any case we should start detecting the number of physical processors, which appears to be a pain
// to do in a way that's compatible with newer i7 processors.
void nv::lockThreadToProcessor(int idx) {
#if NV_OS_WIN32
//nvDebugCheck(idx < hardwareThreadCount());
#if 0
DWORD_PTR tam = 1 << idx;
#else
DWORD_PTR pam, sam;
BOOL rc = GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam);
// Find the idx's bit set.
uint pidx = 0;
DWORD_PTR tam = 0;
for (int i = 0; i < sizeof(DWORD_PTR) * 8; i++) {
DWORD_PTR mask = DWORD_PTR(1) << i;
if (pam & mask) {
if (pidx == idx) {
tam = mask;
break;
}
pidx++;
}
}
nvDebugCheck(tam != 0);
#endif
SetThreadAffinityMask(GetCurrentThread(), tam);
#else
// @@ NOP
#endif
}
void nv::unlockThreadToProcessor() {
#if NV_OS_WIN32
DWORD_PTR pam, sam;
BOOL rc = GetProcessAffinityMask(GetCurrentProcess(), &pam, &sam);
SetThreadAffinityMask(GetCurrentThread(), pam);
#else
// @@ NOP
#endif
}
uint nv::logicalProcessorCount() {
return processorCount();
}
#if NV_OS_WIN32
struct LOGICALPROCESSORDATA
{
unsigned int nLargestStandardFunctionNumber;
unsigned int nLargestExtendedFunctionNumber;
int nLogicalProcessorCount;
int nLocalApicId;
int nCPUcore;
int nProcessorId;
int nApicIdCoreIdSize;
int nNC;
int nMNC;
int nCPUCoresperProcessor;
int nThreadsperCPUCore;
int nProcId;
int nCoreId;
bool CmpLegacy;
bool HTT;
};
#define MAX_NUMBER_OF_LOGICAL_PROCESSORS 96
#define MAX_NUMBER_OF_PHYSICAL_PROCESSORS 8
#define MAX_NUMBER_OF_IOAPICS 16
static LOGICALPROCESSORDATA LogicalProcessorMap[MAX_NUMBER_OF_LOGICAL_PROCESSORS];
static int PhysProcIds[MAX_NUMBER_OF_PHYSICAL_PROCESSORS + MAX_NUMBER_OF_IOAPICS];
static void gatherProcessorData(LOGICALPROCESSORDATA * p) {
int CPUInfo[4] = { 0, 0, 0, 0 };
__cpuid(CPUInfo, 0);
p->nLargestStandardFunctionNumber = CPUInfo[0];
// Get the information associated with each valid Id
for (uint i = 0; i <= p->nLargestStandardFunctionNumber; ++i) {
__cpuid(CPUInfo, i);
// Interpret CPU feature information.
if (i == 1) {
// Some of the bits of LocalApicId represent the CPU core
// within a processor and other bits represent the processor ID.
p->nLocalApicId = (CPUInfo[1] >> 24) & 0xff;
p->HTT = (CPUInfo[3] >> 28) & 0x1;
// recalculate later after 0x80000008
p->nLogicalProcessorCount = (CPUInfo[1] >> 16) & 0x0FF;
}
}
// Calling __cpuid with 0x80000000 as the InfoType argument
// gets the number of valid extended IDs.
__cpuid(CPUInfo, 0x80000000);
p->nLargestExtendedFunctionNumber = CPUInfo[0];
// Get the information associated with each extended ID.
for (uint i = 0x80000000; i <= p->nLargestExtendedFunctionNumber; ++i) {
__cpuid(CPUInfo, i);
if (i == 0x80000008) {
p->nApicIdCoreIdSize = (CPUInfo[2] >> 12) & 0xF;
p->nNC = (CPUInfo[2]) & 0x0FF;
}
}
// MNC
// A value of zero for ApicIdCoreIdSize indicates that MNC is derived by this
// legacy formula: MNC = NC + 1
// A non-zero value of ApicIdCoreIdSize means that MNC is 2^ApicIdCoreIdSize
if (p->nApicIdCoreIdSize) {
p->nMNC = 2;
for (uint j = p->nApicIdCoreIdSize - 1; j > 0; j--) {
p->nMNC = p->nMNC * 2;
}
}
else {
p->nMNC = p->nNC + 1;
}
// If HTT==0, then LogicalProcessorCount is reserved, and the CPU contains
// one CPU core and the CPU core is single-threaded.
// If HTT==1 and CmpLegacy==1, LogicalProcessorCount represents the number of
// CPU cores per processor, where each CPU core is single-threaded. If HTT==1
// and CmpLegacy==0, then LogicalProcessorCount is the number of threads per
// processor, which is the number of cores times the number of threads per core.
// The number of cores is NC+1.
p->nCPUCoresperProcessor = p->nNC + 1;
p->nThreadsperCPUCore = (p->HTT == 0 ? 1 : (p->CmpLegacy == 1 ? 1 : p->nLogicalProcessorCount / p->nCPUCoresperProcessor ));
// Calculate a mask for the core IDs
uint mask = 1;
uint numbits = 1;
if (p->nApicIdCoreIdSize) {
numbits = p->nApicIdCoreIdSize;
for (uint j = p->nApicIdCoreIdSize; j > 1; j--) {
mask = (mask << 1) + 1;
}
}
p->nProcId = (p->nLocalApicId & ~mask) >> numbits;
p->nCoreId = p->nLocalApicId & mask;
}
uint nv::physicalProcessorCount() {
uint lpc = logicalProcessorCount();
// Get info about each logical processor.
for (uint i = 0; i < lpc; i++) {
// Make sure the thread doesn't change processor while we gather its data.
lockThreadToProcessor(i);
gatherProcessorData(&LogicalProcessorMap[i]);
}
unlockThreadToProcessor();
memset(PhysProcIds, 0, sizeof(PhysProcIds));
for (uint i = 0; i < lpc; i++) {
PhysProcIds[LogicalProcessorMap[i].nProcId]++;
}
uint pc = 0;
for (uint i = 0; i < (MAX_NUMBER_OF_PHYSICAL_PROCESSORS + MAX_NUMBER_OF_IOAPICS); i++) {
if (PhysProcIds[i] != 0) {
pc++;
}
}
return pc;
}
#else
uint nv::physicalProcessorCount() {
// @@ Assume the same.
return processorCount();
}
#endif
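The switch from dwNumberOfProcessors to the process affinity mask changes the result whenever the process is confined to a subset of the machine. A sketch of the consequence (the affinity value is illustrative):

    // Launched with, say, 'start /affinity 0x5 tool.exe', the mask 0b0101
    // has two bits set, so processorCount() returns 2 even on an 8-thread CPU.
    uint n = nv::processorCount();
    nv::ThreadPool::setup(n, /*useThreadAffinity=*/true, /*useCallingThread=*/false);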

View File

@@ -82,13 +82,16 @@ BOOL WINAPI SwitchToThread(void);
namespace nv
{
// Reentrant.
uint hardwareThreadCount();
//void initThreadingSystemInfo();
// Not thread-safe. Use from main thread only.
void initWorkers();
void shutWorkers();
void setWorkerFunction(void * func);
// Reentrant.
uint processorCount();
uint logicalProcessorCount();
uint physicalProcessorCount();
// Locks the current thread to the given logical processor index.
void lockThreadToProcessor(int idx);
void unlockThreadToProcessor();
uint threadId();