E2K: added initial support for MCST Elbrus 2000

5 years ago · 809c6df193
parent 261948f076
commit 809c6df193
3 changed files with 1716 additions and 1691 deletions
--- a/src/nvcore/Debug.cpp
+++ b/src/nvcore/Debug.cpp
--- a/src/nvcore/nvcore.h
+++ b/src/nvcore/nvcore.h
@ -56,7 +56,7 @@
 #   define NV_OS_MINGW 1
 #   define NV_OS_WIN32 1
 #elif defined POSH_OS_OSX
-#   define NV_OS_OSX 1      // IC: Adding this, because iOS defines NV_OS_DARWIN too.
+#   define NV_OS_OSX 1 // IC: Adding this, because iOS defines NV_OS_DARWIN too.
 #   define NV_OS_DARWIN 1
 #   define NV_OS_UNIX 1
 #elif defined POSH_OS_IOS
@ -98,6 +98,7 @@
 // NV_CPU_PPC
 // NV_CPU_ARM
 // NV_CPU_ARM_64
+// NV_CPU_E2K /* MCST Elbrus 2000 */

 #define NV_CPU_STRING   POSH_CPU_STRING

@ -112,6 +113,8 @@
 #   define NV_CPU_ARM 1
 #elif defined POSH_CPU_AARCH64
 #   define NV_CPU_ARM_64 1
+#elif defined POSH_CPU_E2K
+#   define NV_CPU_E2K 1
 #else
 #   error "Unsupported CPU"
 #endif
@ -124,7 +127,7 @@

 #if defined POSH_COMPILER_CLANG
 #   define NV_CC_CLANG  1
-#   define NV_CC_GNUC   1    // Clang is compatible with GCC.
+#   define NV_CC_GNUC   1 // Clang is compatible with GCC.
 #   define NV_CC_STRING "clang"
 #elif defined POSH_COMPILER_GCC
 #   define NV_CC_GNUC   1
@ -232,7 +235,7 @@ typedef uint32      uint;
    NV_ENDIAN_STRING"-endian - " __DATE__ "-" __TIME__


-// Disable copy constructor and assignment operator. 
+// Disable copy constructor and assignment operator.
 #if NV_CC_CPP11
 #define NV_FORBID_COPY(C) \
    C( const C & ) = delete; \
@ -244,7 +247,7 @@ typedef uint32      uint;
    C &operator=( const C & )
 #endif

-// Disable dynamic allocation on the heap. 
+// Disable dynamic allocation on the heap.
 // See Prohibiting Heap-Based Objects in More Effective C++.
 #define NV_FORBID_HEAPALLOC() \
    private: \
@ -262,12 +265,12 @@ typedef uint32      uint;
 #define NV_STRING(x) NV_STRING2(x)

 #if NV_CC_MSVC
-#define NV_MULTI_LINE_MACRO_BEGIN do {  
+#define NV_MULTI_LINE_MACRO_BEGIN do {
 #define NV_MULTI_LINE_MACRO_END \
    __pragma(warning(push)) \
    __pragma(warning(disable:4127)) \
    } while(false) \
-    __pragma(warning(pop))  
+    __pragma(warning(pop))
 #else
 #define NV_MULTI_LINE_MACRO_BEGIN do {
 #define NV_MULTI_LINE_MACRO_END } while(false)
@ -278,7 +281,7 @@ typedef uint32      uint;
 #else
 #define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x)]
 #endif
-#define NV_COMPILER_CHECK(x) nvStaticCheck(x)   // I like this name best.
+#define NV_COMPILER_CHECK(x) nvStaticCheck(x) // I like this name best.

 // Make sure type definitions are fine.
 NV_COMPILER_CHECK(sizeof(int8) == 1);
@ -302,7 +305,7 @@ template <typename T, size_t N> char (&ArraySizeHelper(T (&array)[N]))[N];
 #define NV_MESSAGE(x) message(x)
 #endif
 #else
-#define NV_MESSAGE(x) 
+#define NV_MESSAGE(x)
 #endif


--- a/src/nvthread/Atomic.h
+++ b/src/nvthread/Atomic.h
@ -1,413 +1,428 @@
-// This code is in the public domain -- castanyo@yahoo.es
-
-#ifndef NV_THREAD_ATOMIC_H
-#define NV_THREAD_ATOMIC_H
-
-#include "nvthread.h"
-
-#include "nvcore/Debug.h"
-
-
-#if NV_CC_MSVC
-
-#include <intrin.h> // Already included by nvthread.h
-
-#pragma intrinsic(_InterlockedIncrement, _InterlockedDecrement)
-#pragma intrinsic(_InterlockedCompareExchange, _InterlockedExchange)
-//#pragma intrinsic(_InterlockedExchangeAdd64)
-
-/*
-extern "C"
-{
-    #pragma intrinsic(_InterlockedIncrement, _InterlockedDecrement)
-    LONG  __cdecl _InterlockedIncrement(long volatile *Addend);
-    LONG  __cdecl _InterlockedDecrement(long volatile *Addend);
-
-    #pragma intrinsic(_InterlockedCompareExchange, _InterlockedExchange)
-    LONG  __cdecl _InterlockedCompareExchange(long volatile * Destination, long Exchange, long Compared);
-    LONG  __cdecl _InterlockedExchange(long volatile * Target, LONG Value);
-}
-*/
-
-#endif // NV_CC_MSVC
-
-#if NV_CC_CLANG && POSH_CPU_STRONGARM
-// LLVM/Clang do not yet have functioning atomics as of 2.1
-// #include <atomic>
-#endif
-
-//ACS: need this if we want to use Apple's atomics.
-/*
-#if NV_OS_IOS || NV_OS_DARWIN
-// for iOS & OSX we use apple's atomics
-#include "libkern/OSAtomic.h"
-#endif
-*/
-
-namespace nv {
-
-    // Load and stores.
-    inline uint32 loadRelaxed(const uint32 * ptr) { return *ptr; }
-    inline void storeRelaxed(uint32 * ptr, uint32 value) { *ptr = value; }
-
-    inline uint32 loadAcquire(const volatile uint32 * ptr)
-    {
-        nvDebugCheck((intptr_t(ptr) & 3) == 0);
-
-#if POSH_CPU_X86 || POSH_CPU_X86_64
-        uint32 ret = *ptr;  // on x86, loads are Acquire
-        nvCompilerReadBarrier();
-        return ret;
-#elif POSH_CPU_STRONGARM || POSH_CPU_AARCH64
-        // need more specific cpu type for armv7?
-        // also utilizes a full barrier
-        // currently treating laod like x86 - this could be wrong
-        
-        // this is the easiest but slowest way to do this
-        nvCompilerReadWriteBarrier();
-		uint32 ret = *ptr; // replace with ldrex?
-        nvCompilerReadWriteBarrier();
-        return ret;
-#elif POSH_CPU_PPC64
-        // need more specific cpu type for ppc64?
-        // also utilizes a full barrier
-        // currently treating load like x86 - this could be wrong
-
-        // this is the easiest but slowest way to do this
-        nvCompilerReadWriteBarrier();
-		uint32 ret = *ptr; // replace with ldrex?
-        nvCompilerReadWriteBarrier();
-        return ret;
-#else
-#error "Not implemented"
-#endif
-    }
-
-    inline void storeRelease(volatile uint32 * ptr, uint32 value)
-    {
-        nvDebugCheck((intptr_t(ptr) & 3) == 0);
-        nvDebugCheck((intptr_t(&value) & 3) == 0);
-
-#if POSH_CPU_X86 || POSH_CPU_X86_64
-        nvCompilerWriteBarrier();
-        *ptr = value;   // on x86, stores are Release
-        //nvCompilerWriteBarrier(); // @@ IC: Where does this barrier go? In nvtt it was after, in Witness before. Not sure which one is right.
-#elif POSH_CPU_STRONGARM || POSH_CPU_AARCH64
-        // this is the easiest but slowest way to do this
-        nvCompilerReadWriteBarrier();
-		*ptr = value; //strex?
-		nvCompilerReadWriteBarrier();
-#elif POSH_CPU_PPC64
-        // this is the easiest but slowest way to do this
-        nvCompilerReadWriteBarrier();
-		*ptr = value; //strex?
-		nvCompilerReadWriteBarrier();
-#else
-#error "Atomics not implemented."
-#endif
-    }
-    
-    inline void storeRelease(volatile float * ptr, float value)
-    {
-        storeRelease((uint32 *)ptr, *(uint32 *)&value);
-    }
-
-
-    template <typename T>
-    inline void storeReleasePointer(volatile T * pTo, T from)
-    {
-        NV_COMPILER_CHECK(sizeof(T) == sizeof(intptr_t));
-        nvDebugCheck((((intptr_t)pTo) % sizeof(intptr_t)) == 0);
-        nvDebugCheck((((intptr_t)&from) % sizeof(intptr_t)) == 0);
-        nvCompilerWriteBarrier();
-        *pTo = from;    // on x86, stores are Release
-    }
-
-    template <typename T>
-    inline T loadAcquirePointer(volatile T * ptr)
-    {
-        NV_COMPILER_CHECK(sizeof(T) == sizeof(intptr_t));
-        nvDebugCheck((((intptr_t)ptr) % sizeof(intptr_t)) == 0);
-        T ret = *ptr;   // on x86, loads are Acquire
-        nvCompilerReadBarrier();
-        return ret;
-    } 
-
-
-    // Atomics. @@ Assuming sequential memory order?
-
-#if NV_CC_MSVC
-    NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long));
-
-    // Returns incremented value.
-    inline uint32 atomicIncrement(uint32 * value)
-    {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return uint32(_InterlockedIncrement((long *)value));
-    }
-
-    // Returns decremented value.
-    inline uint32 atomicDecrement(uint32 * value)
-    {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return uint32(_InterlockedDecrement((long *)value));
-    }
-
-    // Returns added value.
-    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return uint32(_InterlockedExchangeAdd((long*)value, (long)value_to_add)) + value_to_add;
-    }
-
-    // Returns original value before addition.
-    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return uint32(_InterlockedExchangeAdd((long*)value, (long)value_to_add));
-    }
-
-
-
-
-    // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
-    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
-    // @@ Is this strong or weak? Does InterlockedCompareExchange have spurious failures?
-    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
-    {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        long result = _InterlockedCompareExchange((long *)value, (long)desired, (long)expected);
-        return result == (long)expected;
-    }
-
-
-    inline uint32 atomicSwap(uint32 * value, uint32 desired)
-    {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return (uint32)_InterlockedExchange((long *)value, (long)desired);
-    }
-
-
-
-#elif NV_CC_CLANG && (NV_OS_IOS || NV_OS_DARWIN)
-
-    //ACS: Use Apple's atomics instead? I don't know if these are better in any way; there are non-barrier versions too. There's no OSAtomicSwap32 tho'
-    /*
-    inline uint32 atomicIncrement(uint32 * value)
-    {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return (uint32)OSAtomicIncrement32Barrier((int32_t *)value);
-    }
-    
-    inline uint32 atomicDecrement(uint32 * value)
-    {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return (uint32)OSAtomicDecrement32Barrier((int32_t *)value);
-    }
-
-    // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
-    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
-    // @@ Is this strong or weak?
-    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
-    {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return OSAtomicCompareAndSwap32Barrier((int32_t)expected, (int32_t)desired, (int32_t *)value);
-    }
-    */
-
-    // Returns incremented value.
-    inline uint32 atomicIncrement(uint32 * value) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_add_and_fetch(value, 1);
-    }
-    
-    // Returns decremented value.
-    inline uint32 atomicDecrement(uint32 * value) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_sub_and_fetch(value, 1);
-    }
-
-    // Returns added value.
-    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_add_and_fetch(value, value_to_add);
-    }
-
-    // Returns original value before addition.
-    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_fetch_and_add(value, value_to_add);
-    }
-
-
-    // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
-    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
-    // @@ Is this strong or weak?
-    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
-    {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_bool_compare_and_swap(value, expected, desired);
-    }
-    
-    inline uint32 atomicSwap(uint32 * value, uint32 desired)
-    {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        // this is confusingly named, it doesn't actually do a test but always sets
-        return __sync_lock_test_and_set(value, desired);
-    }
-
-
-
-
-#elif NV_CC_CLANG && POSH_CPU_STRONGARM
-    
-    inline uint32 atomicIncrement(uint32 * value)
-    {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        
-        // this should work in LLVM eventually, but not as of 2.1
-        // return (uint32)AtomicIncrement((long *)value);
-        
-        // in the mean time,
-        register uint32 result;
-        asm volatile (
-                      "1:   ldrexb  %0,  [%1]	\n\t"
-                      "add     %0,   %0, #1     \n\t"
-                      "strexb  r1,   %0, [%1]	\n\t"
-                      "cmp     r1,   #0			\n\t"
-                      "bne     1b"
-                      : "=&r" (result)
-                      : "r"(value)
-                      : "r1"
-                      );
-        return result;
-
-    }
-    
-    inline uint32 atomicDecrement(uint32 * value)
-    {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        
-        // this should work in LLVM eventually, but not as of 2.1:
-        // return (uint32)sys::AtomicDecrement((long *)value);
-
-        // in the mean time,
-        
-        register uint32 result;
-        asm volatile (
-                      "1:   ldrexb  %0,  [%1]	\n\t"
-                      "sub     %0,   %0, #1     \n\t"
-                      "strexb  r1,   %0, [%1]	\n\t"
-                      "cmp     r1,   #0			\n\t"
-                      "bne     1b"
-                      : "=&r" (result)
-                      : "r"(value)
-                      : "r1"
-                      );
-        return result;
-         
-    }
-
-#elif NV_CC_GNUC
-    // Many alternative implementations at:
-    // http://www.memoryhole.net/kyle/2007/05/atomic_incrementing.html
-
-    // Returns incremented value.
-    inline uint32 atomicIncrement(uint32 * value) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_add_and_fetch(value, 1);
-    }
-
-    // Returns decremented value.
-    inline uint32 atomicDecrement(uint32 * value) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_sub_and_fetch(value, 1);
-    }
-
-    // Returns added value.
-    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_add_and_fetch(value, value_to_add);
-    }
-
-    // Returns original value before addition.
-    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add) {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_fetch_and_add(value, value_to_add);
-    }
-
-    // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
-    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
-    // @@ Is this strong or weak?
-    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
-    {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        return __sync_bool_compare_and_swap(value, expected, desired);
-    }
-    
-    inline uint32 atomicSwap(uint32 * value, uint32 desired)
-    {
-        nvDebugCheck((intptr_t(value) & 3) == 0);
-        // this is confusingly named, it doesn't actually do a test but always sets
-        return __sync_lock_test_and_set(value, desired);
-    }
-    
-#else
-#error "Atomics not implemented."
-
-#endif
-
-
-
-
-    // It would be nice to have C++0x-style atomic types, but I'm not in the mood right now. Only uint32 supported so far.
-#if 0
-    template <typename T>
-    void increment(T * value);
-
-    template <typename T>
-    void decrement(T * value);
-
-    template <>
-    void increment(uint32 * value) {
-    }
-
-    template <>
-    void increment(uint64 * value) {
-    }
-
-
-
-    template <typename T>
-    class Atomic
-    {
-    public:
-        explicit Atomic()  : m_value() { }
-        explicit Atomic( T val ) : m_value(val) { }
-        ~Atomic() { }
-
-        T loadRelaxed()  const { return m_value; }
-        void storeRelaxed(T val) { m_value = val; }
-
-        //T loadAcquire() const volatile { return nv::loadAcquire(&m_value); }
-        //void storeRelease(T val) volatile { nv::storeRelease(&m_value, val); }
-
-        void increment() /*volatile*/ { nv::atomicIncrement(m_value); }
-        void decrement() /*volatile*/ { nv::atomicDecrement(m_value); }
-
-        void compareAndStore(T oldVal, T newVal) { nv::atomicCompareAndStore(&m_value, oldVal, newVal); }
-        T compareAndExchange(T oldVal, T newVal) { nv::atomicCompareAndStore(&m_value, oldVal, newVal); }
-        T exchange(T newVal) { nv::atomicExchange(&m_value, newVal); }
-
-    private:
-        // don't provide operator = or == ; make the client write Store( Load() )
-        NV_FORBID_COPY(Atomic);
-
-        NV_COMPILER_CHECK(sizeof(T) == sizeof(uint32) || sizeof(T) == sizeof(uint64));
-
-        T m_value;
-    };
-#endif
-
-} // nv namespace 
-
-
-#endif // NV_THREADS_ATOMICS_H
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_THREAD_ATOMIC_H
+#define NV_THREAD_ATOMIC_H
+
+#include "nvthread.h"
+
+#include "nvcore/Debug.h"
+
+#include <string.h> // memcpy
+
+
+#if NV_CC_MSVC
+
+#include <intrin.h> // Already included by nvthread.h
+
+#pragma intrinsic(_InterlockedIncrement, _InterlockedDecrement)
+#pragma intrinsic(_InterlockedCompareExchange, _InterlockedExchange)
+//#pragma intrinsic(_InterlockedExchangeAdd64)
+
+/*
+extern "C"
+{
+    #pragma intrinsic(_InterlockedIncrement, _InterlockedDecrement)
+    LONG  __cdecl _InterlockedIncrement(long volatile *Addend);
+    LONG  __cdecl _InterlockedDecrement(long volatile *Addend);
+
+    #pragma intrinsic(_InterlockedCompareExchange, _InterlockedExchange)
+    LONG  __cdecl _InterlockedCompareExchange(long volatile * Destination, long Exchange, long Compared);
+    LONG  __cdecl _InterlockedExchange(long volatile * Target, LONG Value);
+}
+*/
+
+#endif // NV_CC_MSVC
+
+#if NV_CC_CLANG && POSH_CPU_STRONGARM
+// LLVM/Clang do not yet have functioning atomics as of 2.1
+// #include <atomic>
+#endif
+
+//ACS: need this if we want to use Apple's atomics.
+/*
+#if NV_OS_IOS || NV_OS_DARWIN
+// for iOS & OSX we use apple's atomics
+#include "libkern/OSAtomic.h"
+#endif
+*/
+
+namespace nv {
+
+    // Load and stores.
+    inline uint32 loadRelaxed(const uint32 * ptr)
+    {
+        // Plain read; no barrier is placed around it.
+        return *ptr;
+    }
+    inline void storeRelaxed(uint32 * ptr, uint32 value)
+    {
+        // Plain write; no barrier is placed around it.
+        *ptr = value;
+    }
+
+    // Reads a 32-bit value and issues a barrier after the read.
+    // 'ptr' has to be 4-byte aligned (verified with nvDebugCheck).
+    inline uint32 loadAcquire(const volatile uint32 * ptr)
+    {
+        nvDebugCheck((intptr_t(ptr) & 3) == 0);
+
+#if POSH_CPU_X86 || POSH_CPU_X86_64
+        // on x86, loads are Acquire, so a compiler read barrier after the load is all that is used.
+        const uint32 loaded = *ptr;
+        nvCompilerReadBarrier();
+        return loaded;
+#elif POSH_CPU_STRONGARM || POSH_CPU_AARCH64 || POSH_CPU_PPC64 || POSH_CPU_E2K
+        // ARM, PPC64 and E2K share the same conservative path: the load is fenced on
+        // both sides with a read/write barrier. This is the easiest but slowest way to do this.
+        // A more specific cpu type (armv7, ppc64 variants, e2k revisions) may allow something cheaper.
+        // NOTE(review): nvCompilerReadWriteBarrier() is defined outside this header; if it only
+        // constrains the compiler, this path may not order the load on weakly-ordered hardware - confirm.
+        nvCompilerReadWriteBarrier();
+        const uint32 loaded = *ptr;
+        nvCompilerReadWriteBarrier();
+        return loaded;
+#else
+#error "Not implemented"
+#endif
+    }
+
+    // Writes a 32-bit value and issues a barrier before the write.
+    // 'ptr' has to be 4-byte aligned (verified with nvDebugCheck).
+    inline void storeRelease(volatile uint32 * ptr, uint32 value)
+    {
+        nvDebugCheck((intptr_t(ptr) & 3) == 0);
+        nvDebugCheck((intptr_t(&value) & 3) == 0);
+
+#if POSH_CPU_X86 || POSH_CPU_X86_64
+        // on x86, stores are Release, so a compiler write barrier ahead of the store is all that is used.
+        // @@ IC: Where does this barrier go? In nvtt it was after the store, in Witness before. Not sure which one is right.
+        nvCompilerWriteBarrier();
+        *ptr = value;
+#elif POSH_CPU_STRONGARM || POSH_CPU_AARCH64 || POSH_CPU_PPC64 || POSH_CPU_E2K
+        // ARM, PPC64 and E2K share the same conservative path: the store is fenced on
+        // both sides with a read/write barrier. This is the easiest but slowest way to do this.
+        // NOTE(review): nvCompilerReadWriteBarrier() is defined outside this header; if it only
+        // constrains the compiler, this path may not order the store on weakly-ordered hardware - confirm.
+        nvCompilerReadWriteBarrier();
+        *ptr = value;
+        nvCompilerReadWriteBarrier();
+#else
+#error "Atomics not implemented."
+#endif
+    }
+    
+    // Release-stores a float by handing its 32-bit pattern to the uint32 overload.
+    // 'ptr' is reinterpreted as a 32-bit slot, so it must satisfy that overload's alignment check.
+    inline void storeRelease(volatile float * ptr, float value)
+    {
+        NV_COMPILER_CHECK(sizeof(float) == sizeof(uint32));
+
+        // Copy the bits out with memcpy: reading the float through a uint32 lvalue
+        // (*(uint32 *)&value) breaks the strict-aliasing rules.
+        uint32 bits;
+        memcpy(&bits, &value, sizeof(bits));
+
+        // Keep the volatile qualifier on the destination instead of casting it away.
+        storeRelease((volatile uint32 *)ptr, bits);
+    }
+
+
+    // Pointer-sized store preceded by a compiler write barrier.
+    // T must be exactly as large as intptr_t, and both the destination and the
+    // local copy of 'from' are checked for intptr_t alignment.
+    // NOTE(review): there is no per-CPU path here; the same sequence is used on every
+    // architecture, although the inline remark only speaks about x86 - confirm for other CPUs.
+    template <typename T>
+    inline void storeReleasePointer(volatile T * pTo, T from)
+    {
+        NV_COMPILER_CHECK(sizeof(T) == sizeof(intptr_t));
+
+        nvDebugCheck((((intptr_t)pTo) % sizeof(intptr_t)) == 0);
+        nvDebugCheck((((intptr_t)&from) % sizeof(intptr_t)) == 0);
+
+        // on x86, stores are Release; only the compiler has to be held back.
+        nvCompilerWriteBarrier();
+        *pTo = from;
+    }
+
+    // Pointer-sized load followed by a compiler read barrier.
+    // T must be exactly as large as intptr_t, and 'ptr' is checked for intptr_t alignment.
+    // NOTE(review): there is no per-CPU path here; the same sequence is used on every
+    // architecture, although the inline remark only speaks about x86 - confirm for other CPUs.
+    template <typename T>
+    inline T loadAcquirePointer(volatile T * ptr)
+    {
+        NV_COMPILER_CHECK(sizeof(T) == sizeof(intptr_t));
+
+        nvDebugCheck((((intptr_t)ptr) % sizeof(intptr_t)) == 0);
+
+        // on x86, loads are Acquire; only the compiler has to be held back.
+        T loaded = *ptr;
+        nvCompilerReadBarrier();
+        return loaded;
+    }
+
+
+    // Atomics. @@ Assuming sequential memory order?
+
+#if NV_CC_MSVC
+    NV_COMPILER_CHECK(sizeof(uint32) == sizeof(long));
+
+    // Atomically adds one to '*value'. Returns incremented value.
+    inline uint32 atomicIncrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        long * const counter = (long *)value;
+        const long incremented = _InterlockedIncrement(counter);
+        return uint32(incremented);
+    }
+
+    // Atomically subtracts one from '*value'. Returns decremented value.
+    inline uint32 atomicDecrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        long * const counter = (long *)value;
+        const long decremented = _InterlockedDecrement(counter);
+        return uint32(decremented);
+    }
+
+    // Atomically adds 'value_to_add' to '*value'. Returns added value (the new sum).
+    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        // The intrinsic reports the value from before the addition, so the addend is applied once more.
+        const uint32 before = uint32(_InterlockedExchangeAdd((long*)value, (long)value_to_add));
+        return before + value_to_add;
+    }
+
+    // Atomically adds 'value_to_add' to '*value'. Returns original value before addition.
+    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        const long before = _InterlockedExchangeAdd((long*)value, (long)value_to_add);
+        return uint32(before);
+    }
+
+
+
+
+    // Stores 'desired' in '*value' only if '*value' equals 'expected'.
+    // Returns true when the value reported by the intrinsic matches 'expected'.
+    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
+    // @@ Is this strong or weak? Does InterlockedCompareExchange have spurious failures?
+    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        const long wanted = (long)expected;
+        const long observed = _InterlockedCompareExchange((long *)value, (long)desired, wanted);
+        return observed == wanted;
+    }
+
+
+    // Atomically replaces '*value' with 'desired' and returns what the intrinsic reports back.
+    inline uint32 atomicSwap(uint32 * value, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        const long previous = _InterlockedExchange((long *)value, (long)desired);
+        return (uint32)previous;
+    }
+
+
+
+#elif NV_CC_CLANG && (NV_OS_IOS || NV_OS_DARWIN)
+
+    //ACS: Use Apple's atomics instead? I don't know if these are better in any way; there are non-barrier versions too. There's no OSAtomicSwap32 tho'
+    /*
+    inline uint32 atomicIncrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return (uint32)OSAtomicIncrement32Barrier((int32_t *)value);
+    }
+
+    inline uint32 atomicDecrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return (uint32)OSAtomicDecrement32Barrier((int32_t *)value);
+    }
+
+    // Compare '*value' against 'expected', if equal, then stores 'desired' in '*value'.
+    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
+    // @@ Is this strong or weak?
+    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        return OSAtomicCompareAndSwap32Barrier((int32_t)expected, (int32_t)desired, (int32_t *)value);
+    }
+    */
+
+    // Atomically adds one to '*value'. Returns incremented value.
+    inline uint32 atomicIncrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        const uint32 incremented = __sync_add_and_fetch(value, 1);
+        return incremented;
+    }
+
+    // Atomically subtracts one from '*value'. Returns decremented value.
+    inline uint32 atomicDecrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        const uint32 decremented = __sync_sub_and_fetch(value, 1);
+        return decremented;
+    }
+
+    // Atomically adds 'value_to_add' to '*value'. Returns added value (the new sum).
+    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        const uint32 sum = __sync_add_and_fetch(value, value_to_add);
+        return sum;
+    }
+
+    // Atomically adds 'value_to_add' to '*value'. Returns original value before addition.
+    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        const uint32 before = __sync_fetch_and_add(value, value_to_add);
+        return before;
+    }
+
+
+    // Stores 'desired' in '*value' only if '*value' equals 'expected'.
+    // Returns the boolean result reported by the builtin.
+    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
+    // @@ Is this strong or weak?
+    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        const bool swapped = __sync_bool_compare_and_swap(value, expected, desired);
+        return swapped;
+    }
+
+    inline uint32 atomicSwap(uint32 * value, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        // this is confusingly named, it doesn't actually do a test but always sets
+        return __sync_lock_test_and_set(value, desired);
+    }
+
+
+
+
+#elif NV_CC_CLANG && POSH_CPU_STRONGARM
+
+    // Atomically adds one to '*value' with an exclusive load/store retry loop.
+    // Returns incremented value.
+    // NOTE(review): no memory barrier instruction is issued around the loop, and this
+    // preprocessor branch only provides atomicIncrement and atomicDecrement.
+    inline uint32 atomicIncrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        // this should work in LLVM eventually, but not as of 2.1
+        // return (uint32)AtomicIncrement((long *)value);
+
+        // in the mean time,
+        // Word-sized exclusives (ldrex/strex) are required: the byte forms (ldrexb/strexb)
+        // would only read and write the lowest byte of the 32-bit counter.
+        uint32 result;
+        asm volatile (
+                      "1:   ldrex   %0,  [%1]	\n\t"
+                      "add     %0,   %0, #1     \n\t"
+                      "strex   r1,   %0, [%1]	\n\t"
+                      "cmp     r1,   #0			\n\t"
+                      "bne     1b"
+                      : "=&r" (result)
+                      : "r"(value)
+                      : "r1", "cc", "memory" // r1 is scratch, cmp changes the flags, *value is written
+                      );
+        return result;
+
+    }
+
+    // Atomically subtracts one from '*value' with an exclusive load/store retry loop.
+    // Returns decremented value.
+    // NOTE(review): no memory barrier instruction is issued around the loop.
+    inline uint32 atomicDecrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        // this should work in LLVM eventually, but not as of 2.1:
+        // return (uint32)sys::AtomicDecrement((long *)value);
+
+        // in the mean time,
+        // Word-sized exclusives (ldrex/strex) are required: the byte forms (ldrexb/strexb)
+        // would only read and write the lowest byte of the 32-bit counter.
+        uint32 result;
+        asm volatile (
+                      "1:   ldrex   %0,  [%1]	\n\t"
+                      "sub     %0,   %0, #1     \n\t"
+                      "strex   r1,   %0, [%1]	\n\t"
+                      "cmp     r1,   #0			\n\t"
+                      "bne     1b"
+                      : "=&r" (result)
+                      : "r"(value)
+                      : "r1", "cc", "memory" // r1 is scratch, cmp changes the flags, *value is written
+                      );
+        return result;
+
+    }
+
+#elif NV_CC_GNUC
+    // Many alternative implementations at:
+    // http://www.memoryhole.net/kyle/2007/05/atomic_incrementing.html
+
+    // Atomically adds one to '*value'. Returns incremented value.
+    inline uint32 atomicIncrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        const uint32 incremented = __sync_add_and_fetch(value, 1);
+        return incremented;
+    }
+
+    // Atomically subtracts one from '*value'. Returns decremented value.
+    inline uint32 atomicDecrement(uint32 * value)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        const uint32 decremented = __sync_sub_and_fetch(value, 1);
+        return decremented;
+    }
+
+    // Atomically adds 'value_to_add' to '*value'. Returns added value (the new sum).
+    inline uint32 atomicAdd(uint32 * value, uint32 value_to_add)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        const uint32 sum = __sync_add_and_fetch(value, value_to_add);
+        return sum;
+    }
+
+    // Atomically adds 'value_to_add' to '*value'. Returns original value before addition.
+    inline uint32 atomicFetchAndAdd(uint32 * value, uint32 value_to_add)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        const uint32 before = __sync_fetch_and_add(value, value_to_add);
+        return before;
+    }
+
+    // Stores 'desired' in '*value' only if '*value' equals 'expected'.
+    // Returns the boolean result reported by the builtin.
+    // @@ C++0x style CAS? Unlike the C++0x version, 'expected' is not passed by reference and not mutated.
+    // @@ Is this strong or weak?
+    inline bool atomicCompareAndSwap(uint32 * value, uint32 expected, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+
+        const bool swapped = __sync_bool_compare_and_swap(value, expected, desired);
+        return swapped;
+    }
+
+    inline uint32 atomicSwap(uint32 * value, uint32 desired)
+    {
+        nvDebugCheck((intptr_t(value) & 3) == 0);
+        // this is confusingly named, it doesn't actually do a test but always sets
+        return __sync_lock_test_and_set(value, desired);
+    }
+
+#else
+#error "Atomics not implemented."
+
+#endif
+
+
+
+
+    // Disabled sketch ('#if 0') of C++0x-style atomic types; it would be nice to have them, but only the uint32 free functions are supported so far. NOTE(review): the sketch calls nv::atomicCompareAndStore and nv::atomicExchange, which this header does not define, and hands m_value by value to atomics that take a pointer - fix before enabling.
+#if 0
+    template <typename T>
+    void increment(T * value);
+
+    template <typename T>
+    void decrement(T * value);
+
+    template <>
+    void increment(uint32 * value) {
+    }
+
+    template <>
+    void increment(uint64 * value) {
+    }
+
+
+
+    template <typename T>
+    class Atomic
+    {
+    public:
+        explicit Atomic()  : m_value() { }
+        explicit Atomic( T val ) : m_value(val) { }
+        ~Atomic() { }
+
+        T loadRelaxed()  const { return m_value; }
+        void storeRelaxed(T val) { m_value = val; }
+
+        //T loadAcquire() const volatile { return nv::loadAcquire(&m_value); }
+        //void storeRelease(T val) volatile { nv::storeRelease(&m_value, val); }
+
+        void increment() /*volatile*/ { nv::atomicIncrement(m_value); }
+        void decrement() /*volatile*/ { nv::atomicDecrement(m_value); }
+
+        void compareAndStore(T oldVal, T newVal) { nv::atomicCompareAndStore(&m_value, oldVal, newVal); }
+        T compareAndExchange(T oldVal, T newVal) { nv::atomicCompareAndStore(&m_value, oldVal, newVal); }
+        T exchange(T newVal) { nv::atomicExchange(&m_value, newVal); }
+
+    private:
+        // don't provide operator = or == ; make the client write Store( Load() )
+        NV_FORBID_COPY(Atomic);
+
+        NV_COMPILER_CHECK(sizeof(T) == sizeof(uint32) || sizeof(T) == sizeof(uint64));
+
+        T m_value;
+    };
+#endif
+
+} // nv namespace
+
+
+#endif // NV_THREADS_ATOMICS_H