// This code is in the public domain -- castanyo@yahoo.es

#ifndef NV_THREAD_ATOMIC_H
#define NV_THREAD_ATOMIC_H

#include "nvthread.h"

#include "nvcore/Debug.h"


#if NV_CC_MSVC

#include <intrin.h> // Already included by nvthread.h

#pragma intrinsic(_InterlockedIncrement, _InterlockedDecrement)
#pragma intrinsic(_InterlockedCompareExchange, _InterlockedExchange)
//#pragma intrinsic(_InterlockedExchangeAdd64)

/*
extern "C"
{
    #pragma intrinsic(_InterlockedIncrement, _InterlockedDecrement)
    LONG __cdecl _InterlockedIncrement(long volatile * Addend);
    LONG __cdecl _InterlockedDecrement(long volatile * Addend);

    #pragma intrinsic(_InterlockedCompareExchange, _InterlockedExchange)
    LONG __cdecl _InterlockedCompareExchange(long volatile * Destination, long Exchange, long Compared);
    LONG __cdecl _InterlockedExchange(long volatile * Target, LONG Value);
}
*/

#endif // NV_CC_MSVC

#if NV_CC_CLANG && POSH_CPU_STRONGARM
// LLVM/Clang do not yet have functioning atomics as of 2.1
// #include
#endif

//ACS: need this if we want to use Apple's atomics.
/*
#if NV_OS_IOS || NV_OS_DARWIN
// for iOS & OSX we use Apple's atomics
#include "libkern/OSAtomic.h"
#endif
*/

namespace nv {

    // Loads and stores.
    inline uint32 loadRelaxed(const uint32 * ptr) { return *ptr; }
    inline void storeRelaxed(uint32 * ptr, uint32 value) { *ptr = value; }

    inline uint32 loadAcquire(const volatile uint32 * ptr)
    {
        nvDebugCheck((intptr_t(ptr) & 3) == 0);

#if POSH_CPU_X86 || POSH_CPU_X86_64
        uint32 ret = *ptr; // on x86, loads are Acquire
        nvCompilerReadBarrier();
        return ret;
#elif POSH_CPU_STRONGARM || POSH_CPU_AARCH64
        // need more specific cpu type for armv7?
        // also utilizes a full barrier
        // currently treating load like x86 - this could be wrong

        // this is the easiest but slowest way to do this
        nvCompilerReadWriteBarrier();
        uint32 ret = *ptr; // replace with ldrex?
        nvCompilerReadWriteBarrier();
        return ret;
#elif POSH_CPU_PPC64
        // need more specific cpu type for ppc64?
        // also utilizes a full barrier
        // currently treating load like x86 - this could be wrong

        // this is the easiest but slowest way to do this
        nvCompilerReadWriteBarrier();
        uint32 ret = *ptr; // replace with lwarx?
        nvCompilerReadWriteBarrier();
        return ret;
#else
#error "Not implemented"
#endif
    }

    inline void storeRelease(volatile uint32 * ptr, uint32 value)
    {
        nvDebugCheck((intptr_t(ptr) & 3) == 0);
        nvDebugCheck((intptr_t(&value) & 3) == 0);

#if POSH_CPU_X86 || POSH_CPU_X86_64
        nvCompilerWriteBarrier();
        *ptr = value; // on x86, stores are Release
        //nvCompilerWriteBarrier(); // @@ IC: Where does this barrier go? In nvtt it was after, in Witness before. Not sure which one is right.
#elif POSH_CPU_STRONGARM || POSH_CPU_AARCH64
        // this is the easiest but slowest way to do this
        nvCompilerReadWriteBarrier();
        *ptr = value; // strex?
        nvCompilerReadWriteBarrier();
#elif POSH_CPU_PPC64
        // this is the easiest but slowest way to do this
        nvCompilerReadWriteBarrier();
        *ptr = value; // stwcx?
        nvCompilerReadWriteBarrier();
#else
#error "Atomics not implemented."
#endif
    }
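    // Usage sketch (illustration only, not part of this header's API). Shows the intended
    // pairing of storeRelease and loadAcquire: the producer writes the payload before
    // releasing the flag, and the consumer reads it only after acquiring the flag.
    // 'Example', 'examplePublish' and 'exampleTryConsume' are hypothetical names.
#if 0
    struct Example {
        uint32 payload;         // plain data, written before the flag is released.
        volatile uint32 ready;  // 0 = not published, 1 = published.
    };

    inline void examplePublish(Example * e, uint32 data) {
        e->payload = data;              // ordinary store...
        storeRelease(&e->ready, 1);     // ...ordered before the flag by the release store.
    }

    inline bool exampleTryConsume(const Example * e, uint32 * out) {
        if (loadAcquire(&e->ready) == 0) return false;  // acquire pairs with the release above.
        *out = e->payload;              // safe: ordered after the acquire load.
        return true;
    }
#endif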
    template <typename T>
    inline void storeReleasePointer(volatile T * pTo, T from)
    {
        NV_COMPILER_CHECK(sizeof(T) == sizeof(intptr_t));
        nvDebugCheck((((intptr_t)pTo) % sizeof(intptr_t)) == 0);
        nvDebugCheck((((intptr_t)&from) % sizeof(intptr_t)) == 0);
        nvCompilerWriteBarrier();
        *pTo = from; // on x86, stores are Release
    }

    template <typename T>
    inline T loadAcquirePointer(volatile T * ptr)
    {
        NV_COMPILER_CHECK(sizeof(T) == sizeof(intptr_t));
        nvDebugCheck((((intptr_t)ptr) % sizeof(intptr_t)) == 0);
        T ret = *ptr; // on x86, loads are Acquire
        nvCompilerReadBarrier();
        return ret;
    }
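    // Usage sketch (illustration only, not part of this header's API). Shows the intended use of
    // storeReleasePointer / loadAcquirePointer to publish a fully constructed object through a
    // shared pointer slot. 'Widget' and the helper names below are hypothetical.
#if 0
    struct Widget { uint32 a, b; };

    inline void examplePublishWidget(Widget * volatile * slot, Widget * w) {
        // 'w' must be fully initialized before this call; the release store orders those
        // writes before the pointer becomes visible to other threads.
        storeReleasePointer(slot, w);
    }

    inline Widget * exampleAcquireWidget(Widget * volatile * slot) {
        // Returns NULL until a producer has published; once non-NULL, the pointee is safe to read.
        return loadAcquirePointer(slot);
    }
#endif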
    // Atomics. @@ Assuming sequential memory order?
#if NV_CC_MSVC

    NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long));

    // Returns the incremented value.
    inline uint32 atomicIncrement(uint32 * value)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return uint32(_InterlockedIncrement((long *)value));
    }

    // Returns the decremented value.
    inline uint32 atomicDecrement(uint32 * value)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return uint32(_InterlockedDecrement((long *)value));
    }

    // Returns the added value.
    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return uint32(_InterlockedExchangeAdd((long *)value, (long)value_to_add)) + value_to_add;
    }

    // Returns the original value before the addition.
    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return uint32(_InterlockedExchangeAdd((long *)value, (long)value_to_add));
    }

    // Compares '*value' against 'expected'; if they are equal, stores 'desired' in '*value'.
    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and is not mutated.
    // @@ Is this strong or weak? Does InterlockedCompareExchange have spurious failures?
    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        long result = _InterlockedCompareExchange((long *)value, (long)desired, (long)expected);
        return result == (long)expected;
    }

    inline uint32 atomicSwap(uint32 * value, uint32 desired)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return (uint32)_InterlockedExchange((long *)value, (long)desired);
    }

#elif NV_CC_CLANG && (NV_OS_IOS || NV_OS_DARWIN)
    //ACS: Use Apple's atomics instead? I don't know if these are better in any way; there are non-barrier versions too. There's no OSAtomicSwap32, though.
    /*
    inline uint32 atomicIncrement(uint32 * value)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return (uint32)OSAtomicIncrement32Barrier((int32_t *)value);
    }

    inline uint32 atomicDecrement(uint32 * value)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return (uint32)OSAtomicDecrement32Barrier((int32_t *)value);
    }

    // Compares '*value' against 'expected'; if they are equal, stores 'desired' in '*value'.
    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and is not mutated.
    // @@ Is this strong or weak?
    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return OSAtomicCompareAndSwap32Barrier((int32_t)expected, (int32_t)desired, (int32_t *)value);
    }
    */

    // Returns the incremented value.
    inline uint32 atomicIncrement(uint32 * value)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return __sync_add_and_fetch(value, 1);
    }

    // Returns the decremented value.
    inline uint32 atomicDecrement(uint32 * value)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return __sync_sub_and_fetch(value, 1);
    }

    // Returns the added value.
    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return __sync_add_and_fetch(value, value_to_add);
    }

    // Returns the original value before the addition.
    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return __sync_fetch_and_add(value, value_to_add);
    }

    // Compares '*value' against 'expected'; if they are equal, stores 'desired' in '*value'.
    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and is not mutated.
    // @@ Is this strong or weak?
    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return __sync_bool_compare_and_swap(value, expected, desired);
    }

    inline uint32 atomicSwap(uint32 * value, uint32 desired)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        // This is confusingly named: it does not actually test anything, it always sets.
        return __sync_lock_test_and_set(value, desired);
    }

#elif NV_CC_CLANG && POSH_CPU_STRONGARM

    inline uint32 atomicIncrement(uint32 * value)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);

        // this should work in LLVM eventually, but not as of 2.1
        // return (uint32)AtomicIncrement((long *)value);

        // In the meantime: ldrex/strex retry loop on the full 32-bit word.
        register uint32 result;
        asm volatile (
                      "1:  ldrex   %0, [%1]   \n\t"
                      "    add     %0, %0, #1 \n\t"
                      "    strex   r1, %0, [%1] \n\t"
                      "    cmp     r1, #0     \n\t"
                      "    bne     1b"
                      : "=&r" (result)
                      : "r" (value)
                      : "r1", "cc", "memory"  // r1 scratch; flags and memory are clobbered.
                      );
        return result;
    }

    inline uint32 atomicDecrement(uint32 * value)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);

        // this should work in LLVM eventually, but not as of 2.1:
        // return (uint32)sys::AtomicDecrement((long *)value);

        // In the meantime: ldrex/strex retry loop on the full 32-bit word.
        register uint32 result;
        asm volatile (
                      "1:  ldrex   %0, [%1]   \n\t"
                      "    sub     %0, %0, #1 \n\t"
                      "    strex   r1, %0, [%1] \n\t"
                      "    cmp     r1, #0     \n\t"
                      "    bne     1b"
                      : "=&r" (result)
                      : "r" (value)
                      : "r1", "cc", "memory"  // r1 scratch; flags and memory are clobbered.
                      );
        return result;
    }

#elif NV_CC_GNUC

    // Many alternative implementations at:
    // http://www.memoryhole.net/kyle/2007/05/atomic_incrementing.html

    // Returns the incremented value.
    inline uint32 atomicIncrement(uint32 * value)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return __sync_add_and_fetch(value, 1);
    }

    // Returns the decremented value.
    inline uint32 atomicDecrement(uint32 * value)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return __sync_sub_and_fetch(value, 1);
    }

    // Returns the added value.
    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return __sync_add_and_fetch(value, value_to_add);
    }

    // Returns the original value before the addition.
    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return __sync_fetch_and_add(value, value_to_add);
    }

    // Compares '*value' against 'expected'; if they are equal, stores 'desired' in '*value'.
    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and is not mutated.
    // @@ Is this strong or weak?
    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        return __sync_bool_compare_and_swap(value, expected, desired);
    }

    inline uint32 atomicSwap(uint32 * value, uint32 desired)
    {
        nvDebugCheck((intptr_t(value) & 3) == 0);
        // This is confusingly named: it does not actually test anything, it always sets.
        return __sync_lock_test_and_set(value, desired);
    }

#else
#error "Atomics not implemented."
#endif

    // It would be nice to have C++0x-style atomic types, but I'm not in the mood right now. Only uint32 is supported so far.
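    // Usage sketch (illustration only, not part of this header's API). A typical compare-and-swap
    // retry loop built on atomicCompareAndSwap: raise a shared maximum without locks.
    // 'exampleAtomicStoreMax' is a hypothetical helper name.
#if 0
    inline void exampleAtomicStoreMax(uint32 * value, uint32 candidate) {
        uint32 observed = loadRelaxed(value);
        while (candidate > observed) {
            // If *value still equals 'observed', replace it with 'candidate'; otherwise another
            // thread changed it first, so re-read and decide again.
            if (atomicCompareAndSwap(value, observed, candidate)) return;
            observed = loadRelaxed(value);
        }
    }
#endif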
#if 0
    template <typename T> void increment(T * value);
    template <typename T> void decrement(T * value);

    template <> void increment(uint32 * value) { }
    template <> void increment(uint64 * value) { }

    template <typename T>
    class Atomic
    {
    public:
        explicit Atomic() : m_value() { }
        explicit Atomic(T val) : m_value(val) { }
        ~Atomic() { }

        T loadRelaxed() const { return m_value; }
        void storeRelaxed(T val) { m_value = val; }

        //T loadAcquire() const volatile { return nv::loadAcquire(&m_value); }
        //void storeRelease(T val) volatile { nv::storeRelease(&m_value, val); }

        void increment() /*volatile*/ { nv::atomicIncrement(&m_value); }
        void decrement() /*volatile*/ { nv::atomicDecrement(&m_value); }

        void compareAndStore(T oldVal, T newVal) { nv::atomicCompareAndSwap(&m_value, oldVal, newVal); }
        T compareAndExchange(T oldVal, T newVal) { return nv::atomicCompareAndSwap(&m_value, oldVal, newVal); }
        T exchange(T newVal) { return nv::atomicSwap(&m_value, newVal); }

    private:
        // don't provide operator = or ==; make the client write Store( Load() )
        NV_FORBID_COPY(Atomic);

        NV_COMPILER_CHECK(sizeof(T) == sizeof(uint32) || sizeof(T) == sizeof(uint64));

        T m_value;
    };
#endif

} // nv namespace

#endif // NV_THREAD_ATOMIC_H