Add external libs for comparisons and benchmarks.

2020-03-23 10:07:38 -07:00
parent 4a33d1ac75
commit 9a16bebf8f
67 changed files with 24230 additions and 1 deletions
--- a/extern/CMP_Core/source/CMP_Core.h
+++ b/extern/CMP_Core/source/CMP_Core.h
@ -0,0 +1,153 @@
+//=====================================================================
+// Copyright (c) 2019   Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+/// \file CMP_Core.h
+//
+//=====================================================================
+
+#ifndef CMP_CORE_H
+#define CMP_CORE_H
+
+#include <stddef.h>   // NULL, used by CMP_DEFAULTNULL
+#include <stdint.h>
+#ifndef __cplusplus
+#include <stdbool.h>  // bool, used by SetDecodeChannelMapping and SetAlphaOptionsBC7
+#endif
+// Calling-convention decoration placed on every CMP_Core entry point:
+// __cdecl when _WIN32 is defined, empty otherwise.
+#if defined(_WIN32)
+#  define CMP_CDECL __cdecl
+#else
+#  define CMP_CDECL
+#endif
+
+//====================================================================================
+// API Definitions for Core API
+//------------------------------------------------------------------------------------
+// All API return 0 on success else error codes > 0
+// See Common_Def.h CGU_CORE_ values for the error codes
+//=====================================================================================
+
+//======================================================================================================
+// Block level setting option: Create and Destroy Reference Pointers
+//======================================================================================================
+// Context create and destroy to use for BCn codec settings, where n is the set [1,2,3,4,5,6,7]
+// All codecs will use default max quality settings, users can create multiple contexts to 
+// set quality levels, masks , channel mapping, etc...
+
+// Create an options context for one BCn codec. Each call takes the address of the
+// caller's options pointer. Returns 0 on success, an error code > 0 otherwise.
+int CMP_CDECL CreateOptionsBC1(void **optionsBC1);
+int CMP_CDECL CreateOptionsBC2(void **optionsBC2);
+int CMP_CDECL CreateOptionsBC3(void **optionsBC3);
+int CMP_CDECL CreateOptionsBC4(void **optionsBC4);
+int CMP_CDECL CreateOptionsBC5(void **optionsBC5);
+int CMP_CDECL CreateOptionsBC6(void **optionsBC6);
+int CMP_CDECL CreateOptionsBC7(void **optionsBC7);
+
+// Destroy an options context of the matching BCn codec; takes the options pointer itself.
+// Returns 0 on success, an error code > 0 otherwise.
+int CMP_CDECL DestroyOptionsBC1(void *optionsBC1);
+int CMP_CDECL DestroyOptionsBC2(void *optionsBC2);
+int CMP_CDECL DestroyOptionsBC3(void *optionsBC3);
+int CMP_CDECL DestroyOptionsBC4(void *optionsBC4);
+int CMP_CDECL DestroyOptionsBC5(void *optionsBC5);
+int CMP_CDECL DestroyOptionsBC6(void *optionsBC6);
+int CMP_CDECL DestroyOptionsBC7(void *optionsBC7);
+
+
+//======================================================================================================
+// Block level settings using the options Reference Pointers
+//======================================================================================================
+
+// Channel weights: apply to BC1, BC2 and BC3. The valid range of each weight is [0..1.0f];
+// the default is {1.0f, 1.0f, 1.0f}.
+// With swizzled formats the weighting applies to the data within the specified channel,
+// not to the channel itself.
+int CMP_CDECL SetChannelWeightsBC1(void *options, float WeightRed, float WeightGreen, float WeightBlue);
+int CMP_CDECL SetChannelWeightsBC2(void *options, float WeightRed, float WeightGreen, float WeightBlue);
+int CMP_CDECL SetChannelWeightsBC3(void *options, float WeightRed, float WeightGreen, float WeightBlue);
+
+
+// Decode channel mapping for CMP_Core BC1, BC2 and BC3:
+//   mapRGBA == true  : Red, Green, Blue and Alpha are decoded as RGBA into channels [0,1,2,3]
+//   mapRGBA == false : BGRA maps to channels [0,1,2,3]
+// The default is true.
+int CMP_CDECL SetDecodeChannelMapping(void *options, bool mapRGBA);
+
+// Per-codec quality setting.
+// NOTE(review): the accepted range of fquality is not stated in this header - confirm
+// against the implementation.
+int CMP_CDECL SetQualityBC1(void *options, float fquality);
+int CMP_CDECL SetQualityBC2(void *options, float fquality);
+int CMP_CDECL SetQualityBC3(void *options, float fquality);
+int CMP_CDECL SetQualityBC4(void *options, float fquality);
+int CMP_CDECL SetQualityBC5(void *options, float fquality);
+int CMP_CDECL SetQualityBC6(void *options, float fquality);
+int CMP_CDECL SetQualityBC7(void *options, float fquality);
+
+
+// Alpha threshold stored in a BC1 options context.
+int CMP_CDECL SetAlphaThresholdBC1(void *options, unsigned char alphaThreshold);
+
+// Masks for the BC6 and BC7 options contexts.
+// NOTE(review): the meaning of the mask bits is not documented here - presumably a
+// mode mask; confirm against the implementation.
+int CMP_CDECL SetMaskBC6(void *options, unsigned int  mask);
+int CMP_CDECL SetMaskBC7(void *options, unsigned char mask);
+
+// BC7-only alpha flags and error thresholds.
+int CMP_CDECL SetAlphaOptionsBC7(void *options, bool imageNeedsAlpha, bool colourRestrict, bool alphaRestrict);
+int CMP_CDECL SetErrorThresholdBC7(void *options, float minThreshold, float maxThreshold);
+
+//======================================================================================================
+// (4x4) Block level 4 channel source CompressBlock and DecompressBlock API for BCn Codecs
+//======================================================================================================
+// The options parameter for these APIs can be set to null in the calls if the default settings are sufficient
+// Example: CompressBlockBC1(srcBlock,16,cmpBlock,NULL);   For "C" call
+//          CompressBlockBC1(srcBlock,16,cmpBlock);        For "C++" calls
+//
+// To use this parameter first create the options context using the CreateOptions call
+// then use the Set Options to set various codec settings and pass them to the appropriate 
+// Compress or Decompress API.
+// The source (srcBlock) channel format is expected to be RGBA:8888 by default for LDR Codecs
+// for BC6H the format is RGBA Half float (16 bits per channel)
+//------------------------------------------------------------------------------------------------------
+// Default for the trailing options parameter: C++ callers get "=NULL" as a default
+// argument; in C the macro expands to nothing and the argument must be passed explicitly.
+#if defined(__cplusplus)
+#  define CMP_DEFAULTNULL  =NULL
+#else
+#  define CMP_DEFAULTNULL
+#endif
+
+//=========================================================================================================
+// 4 channel Sources, default format RGBA:8888 is processed as a 4x4 block starting at srcBlock location
+// where each row of the block is calculated from srcStride
+//=========================================================================================================
+// Compress one 4x4 block that starts at srcBlock, with rows srcStrideInBytes apart, into cmpBlock:
+// 8 bytes for BC1, 16 bytes for BC2, BC3 and BC7. options may be null to use the defaults.
+// Returns 0 on success, an error code > 0 otherwise.
+int CMP_CDECL CompressBlockBC1(const unsigned char *srcBlock, unsigned int  srcStrideInBytes, unsigned char cmpBlock[8 ], const void *options CMP_DEFAULTNULL);
+int CMP_CDECL CompressBlockBC2(const unsigned char *srcBlock, unsigned int  srcStrideInBytes, unsigned char cmpBlock[16], const void *options CMP_DEFAULTNULL);
+int CMP_CDECL CompressBlockBC3(const unsigned char *srcBlock, unsigned int  srcStrideInBytes, unsigned char cmpBlock[16], const void *options CMP_DEFAULTNULL);
+int CMP_CDECL CompressBlockBC7(const unsigned char *srcBlock, unsigned int  srcStrideInBytes, unsigned char cmpBlock[16], const void *options CMP_DEFAULTNULL);
+
+// Decompress one compressed block from cmpBlock into the 64-byte srcBlock buffer
+// (4x4 texels, 4 channels of 8 bits). Returns 0 on success, an error code > 0 otherwise.
+int CMP_CDECL DecompressBlockBC1(const unsigned char cmpBlock[8 ], unsigned char srcBlock[64], const void *options CMP_DEFAULTNULL);
+int CMP_CDECL DecompressBlockBC2(const unsigned char cmpBlock[16], unsigned char srcBlock[64], const void *options CMP_DEFAULTNULL);
+int CMP_CDECL DecompressBlockBC3(const unsigned char cmpBlock[16], unsigned char srcBlock[64], const void *options CMP_DEFAULTNULL);
+int CMP_CDECL DecompressBlockBC7(const unsigned char cmpBlock[16], unsigned char srcBlock[64], const void *options CMP_DEFAULTNULL);
+
+//================================================
+// 1 channel Source 4x4, 8 bits per texel
+//================================================
+// BC4: compress a single-channel 4x4 block (rows srcStrideInBytes apart) into 8 bytes, or
+// decompress 8 bytes back into a 16-byte block. Returns 0 on success, an error code > 0 otherwise.
+int CMP_CDECL CompressBlockBC4(const unsigned char *srcBlock, unsigned int  srcStrideInBytes, unsigned char cmpBlock[8], const void *options  CMP_DEFAULTNULL);
+int CMP_CDECL DecompressBlockBC4(const unsigned char cmpBlock[8], unsigned char srcBlock[16], const void *options  CMP_DEFAULTNULL);
+
+//================================================
+// 2 channel Source 2x(4x4 8 bits)
+//================================================
+// BC5: the two source channels are supplied as separate 4x4 blocks, each with its own stride,
+// and are compressed together into one 16-byte block. Decompression writes each channel to its
+// own 16-byte output block. Returns 0 on success, an error code > 0 otherwise.
+int CMP_CDECL CompressBlockBC5(const unsigned char *srcBlock1, unsigned int srcStrideInBytes1,
+                               const unsigned char *srcBlock2, unsigned int srcStrideInBytes2,
+                               unsigned char cmpBlock[16], const void *options  CMP_DEFAULTNULL);
+int CMP_CDECL DecompressBlockBC5(const unsigned char cmpBlock[16], unsigned char srcBlock1[16], unsigned char srcBlock2[16], const void *options  CMP_DEFAULTNULL);
+
+//========================================================================================
+// For 3 channel Source  RGB_16, Note srcStride is in unsigned short steps (2 bytes each)
+//========================================================================================
+// BC6: the source is read as unsigned short values and srcStrideInShorts counts unsigned short
+// elements, not bytes. Decompression writes 48 unsigned shorts (4x4 texels, 3 channels).
+// Returns 0 on success, an error code > 0 otherwise.
+int CMP_CDECL CompressBlockBC6(const unsigned short *srcBlock, unsigned int srcStrideInShorts, unsigned char cmpBlock[16], const void *options CMP_DEFAULTNULL);
+int CMP_CDECL DecompressBlockBC6(const unsigned char cmpBlock[16], unsigned short srcBlock[48], const void *options CMP_DEFAULTNULL);
+
+#endif  // CMP_CORE_H
--- a/extern/CMP_Core/source/cmp_math_vec4.h
+++ b/extern/CMP_Core/source/cmp_math_vec4.h
@ -0,0 +1,417 @@
+//=====================================================================
+// Copyright 2019 (c), Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+// 
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+//
+//=====================================================================
+#ifndef CMP_MATH_VEC4_H
+#define CMP_MATH_VEC4_H
+
+//====================================================
+// Vector Class definitions for CPU & Intrinsics
+//====================================================
+
+#if defined (_LINUX) || defined (_WIN32)
+
+//============================================= VEC2 ==================================================
+template<class T>
+class Vec2
+{
+public:
+
+    T x;   // first component
+    T y;   // second component
+
+    // ---------------- construction ----------------
+
+    /// Zero-initialises both components.
+    Vec2()
+        : x(T(0)), y(T(0))
+    {
+    }
+
+    /// Builds the vector from explicit component values.
+    Vec2(const T& vx, const T& vy)
+        : x(vx), y(vy)
+    {
+    }
+
+    /// Copies both components from another vector.
+    Vec2(const Vec2& val)
+        : x(val.x), y(val.y)
+    {
+    }
+
+    /// Broadcasts a single value to both components.
+    Vec2(const T& v)
+        : x(v), y(v)
+    {
+    }
+
+    // ---------------- conversion / indexing / assignment ----------------
+
+    /// Read-only view of the object as a T array.
+    operator const T* () const
+    {
+        return reinterpret_cast<const T*>(this);
+    }
+
+    /// Mutable view of the object as a T array.
+    operator T* ()
+    {
+        return reinterpret_cast<T*>(this);
+    }
+
+    /// Component access by position; the index is not range-checked.
+    const T& operator[](int i) const
+    {
+        const T* components = reinterpret_cast<const T*>(this);
+        return components[i];
+    }
+
+    T& operator[](int i)
+    {
+        T* components = reinterpret_cast<T*>(this);
+        return components[i];
+    }
+
+    /// Component-wise assignment; note that the returned reference is const.
+    const Vec2& operator=(const Vec2& other)
+    {
+        x = other.x;
+        y = other.y;
+        return *this;
+    }
+
+    // ---------------- comparison ----------------
+
+    /// True when every component compares equal.
+    bool operator==(const Vec2& other) const
+    {
+        return (x == other.x && y == other.y);
+    }
+
+    /// True when at least one component differs.
+    bool operator!=(const Vec2& other) const
+    {
+        return (x != other.x || y != other.y);
+    }
+
+    // ---------------- arithmetic ----------------
+
+    /// Component-wise sum.
+    const Vec2 operator+(const Vec2& other) const
+    {
+        return Vec2(x + other.x, y + other.y);
+    }
+
+    /// Component-wise difference.
+    const Vec2 operator-(const Vec2& other) const
+    {
+        return Vec2(x - other.x, y - other.y);
+    }
+
+    /// Every component multiplied by a scalar.
+    const Vec2 operator*(const T& scalar) const
+    {
+        return Vec2(x * scalar, y * scalar);
+    }
+
+    /// Every component divided by a scalar.
+    const Vec2 operator/(const T& scalar) const
+    {
+        return Vec2(x / scalar, y / scalar);
+    }
+
+    /// In-place component-wise sum.
+    Vec2& operator+= (const Vec2& other)
+    {
+        x += other.x;
+        y += other.y;
+        return *this;
+    }
+
+    /// In-place component-wise difference.
+    Vec2& operator-= (const Vec2& other)
+    {
+        x -= other.x;
+        y -= other.y;
+        return *this;
+    }
+
+    /// In-place multiplication by a scalar.
+    Vec2& operator*= (const T& scalar)
+    {
+        x *= scalar;
+        y *= scalar;
+        return *this;
+    }
+
+    /// In-place division by a scalar.
+    Vec2& operator/= (const T& scalar)
+    {
+        x /= scalar;
+        y /= scalar;
+        return *this;
+    }
+};
+
+// Aliases for the Vec2 instantiations. The CMP_, CGU_ and CGV_ float aliases all
+// name the same type, Vec2<float>.
+typedef Vec2<float>  CMP_Vec2f;
+typedef Vec2<float>  CGU_Vec2f;
+typedef Vec2<float>  CGV_Vec2f;
+typedef Vec2<double> CMP_Vec2d;
+typedef Vec2<int>    CMP_Vec2i;
+
+//}
+
+
+
+
+//============================================= VEC3 ==================================================
+template<class T>
+class Vec3
+{
+public:
+
+    T x;   // first component
+    T y;   // second component
+    T z;   // third component
+
+    // ---------------- construction ----------------
+
+    /// Zero-initialises all three components.
+    Vec3()
+        : x(T(0)), y(T(0)), z(T(0))
+    {
+    }
+
+    /// Builds the vector from explicit component values.
+    Vec3(const T& vx, const T& vy, const T& vz)
+        : x(vx), y(vy), z(vz)
+    {
+    }
+
+    /// Copies all components from another vector.
+    Vec3(const Vec3& val)
+        : x(val.x), y(val.y), z(val.z)
+    {
+    }
+
+    /// Broadcasts a single value to every component.
+    Vec3(const T& v)
+        : x(v), y(v), z(v)
+    {
+    }
+
+    /// Reads the components from the first three elements of an array.
+    Vec3(const T* v)
+        : x(v[0]), y(v[1]), z(v[2])
+    {
+    }
+
+    // ---------------- conversion / assignment ----------------
+
+    /// Read-only view of the object as a T array.
+    operator const T* () const
+    {
+        return reinterpret_cast<const T*>(this);
+    }
+
+    /// Mutable view of the object as a T array.
+    operator T* ()
+    {
+        return reinterpret_cast<T*>(this);
+    }
+
+    /// Component-wise assignment; note that the returned reference is const.
+    const Vec3& operator=(const Vec3& other)
+    {
+        x = other.x;
+        y = other.y;
+        z = other.z;
+        return *this;
+    }
+
+    // ---------------- comparison ----------------
+
+    /// True when every component compares equal.
+    bool operator==(const Vec3& other) const
+    {
+        return (x == other.x && y == other.y && z == other.z);
+    }
+
+    /// True when at least one component differs.
+    bool operator!=(const Vec3& other) const
+    {
+        return (x != other.x || y != other.y || z != other.z);
+    }
+
+    // ---------------- arithmetic ----------------
+
+    /// Component-wise sum.
+    const Vec3 operator+(const Vec3& other) const
+    {
+        return Vec3(x + other.x, y + other.y, z + other.z);
+    }
+
+    /// Component-wise difference.
+    const Vec3 operator-(const Vec3& other) const
+    {
+        return Vec3(x - other.x, y - other.y, z - other.z);
+    }
+
+    /// Every component multiplied by a scalar.
+    const Vec3 operator*(const T& scalar) const
+    {
+        return Vec3(x * scalar, y * scalar, z * scalar);
+    }
+
+    /// Every component divided by a scalar.
+    const Vec3 operator/(const T& scalar) const
+    {
+        return Vec3(x / scalar, y / scalar, z / scalar);
+    }
+
+    /// Component-wise division by another vector.
+    const Vec3 operator/(const Vec3& other) const
+    {
+        return Vec3(x / other.x, y / other.y, z / other.z);
+    }
+
+    /// In-place component-wise sum.
+    Vec3& operator+= (const Vec3& other)
+    {
+        x += other.x;
+        y += other.y;
+        z += other.z;
+        return *this;
+    }
+
+    /// In-place component-wise difference.
+    Vec3& operator-= (const Vec3& other)
+    {
+        x -= other.x;
+        y -= other.y;
+        z -= other.z;
+        return *this;
+    }
+
+    /// In-place multiplication by a scalar.
+    Vec3& operator*= (const T& scalar)
+    {
+        x *= scalar;
+        y *= scalar;
+        z *= scalar;
+        return *this;
+    }
+
+    /// In-place division by a scalar.
+    Vec3& operator/= (const T& scalar)
+    {
+        x /= scalar;
+        y /= scalar;
+        z /= scalar;
+        return *this;
+    }
+};
+
+// Aliases for the Vec3 instantiations. The CGU_, CGV_ and CMP_ prefixes name the same
+// underlying types: Vec3<float> for the *f aliases and Vec3<unsigned char> for the *uc aliases.
+typedef Vec3<float>             CGU_Vec3f;
+typedef Vec3<float>             CGV_Vec3f;
+typedef Vec3<unsigned char>     CGU_Vec3uc;
+typedef Vec3<unsigned char>     CGV_Vec3uc;
+
+typedef Vec3<float>             CMP_Vec3f;
+typedef Vec3<double>            CMP_Vec3d;
+typedef Vec3<int>               CMP_Vec3i;
+typedef Vec3<unsigned char>     CMP_Vec3uc;
+
+//============================================= VEC4 ==================================================
+template<class T>
+class Vec4
+{
+public:
+
+    T x;   // first component
+    T y;   // second component
+    T z;   // third component
+    T w;   // fourth component
+
+    // ---------------- construction ----------------
+
+    /// Zero-initialises all four components.
+    Vec4()
+        : x(T(0)), y(T(0)), z(T(0)), w(T(0))
+    {
+    }
+
+    /// Builds the vector from explicit component values.
+    Vec4(const T& vx, const T& vy, const T& vz, const T& vw)
+        : x(vx), y(vy), z(vz), w(vw)
+    {
+    }
+
+    /// Copies all components from another vector.
+    Vec4(const Vec4& val)
+        : x(val.x), y(val.y), z(val.z), w(val.w)
+    {
+    }
+
+    /// Broadcasts a single value to every component.
+    Vec4(const T& v)
+        : x(v), y(v), z(v), w(v)
+    {
+    }
+
+    /// Reads the components from the first four elements of an array.
+    Vec4(const T* v)
+        : x(v[0]), y(v[1]), z(v[2]), w(v[3])
+    {
+    }
+
+    // ---------------- conversion / assignment ----------------
+
+    /// Read-only view of the object as a T array.
+    operator const T* () const
+    {
+        return reinterpret_cast<const T*>(this);
+    }
+
+    /// Mutable view of the object as a T array.
+    operator T* ()
+    {
+        return reinterpret_cast<T*>(this);
+    }
+
+    /// Component-wise assignment; note that the returned reference is const.
+    const Vec4& operator=(const Vec4& other)
+    {
+        x = other.x;
+        y = other.y;
+        z = other.z;
+        w = other.w;
+        return *this;
+    }
+
+    // ---------------- comparison ----------------
+
+    /// True when every component compares equal.
+    bool operator==(const Vec4& other) const
+    {
+        return (x == other.x && y == other.y && z == other.z && w == other.w);
+    }
+
+    /// True when at least one component differs.
+    bool operator!=(const Vec4& other) const
+    {
+        return (x != other.x || y != other.y || z != other.z || w != other.w);
+    }
+
+    // ---------------- arithmetic ----------------
+
+    /// Component-wise sum.
+    const Vec4 operator+(const Vec4& other) const
+    {
+        return Vec4(x + other.x, y + other.y, z + other.z, w + other.w);
+    }
+
+    /// Component-wise difference.
+    const Vec4 operator-(const Vec4& other) const
+    {
+        return Vec4(x - other.x, y - other.y, z - other.z, w - other.w);
+    }
+
+    /// Every component multiplied by a scalar.
+    const Vec4 operator*(const T& scalar) const
+    {
+        return Vec4(x * scalar, y * scalar, z * scalar, w * scalar);
+    }
+
+    /// Every component divided by a scalar.
+    const Vec4 operator/(const T& scalar) const
+    {
+        return Vec4(x / scalar, y / scalar, z / scalar, w / scalar);
+    }
+
+    /// Component-wise division by another vector.
+    const Vec4 operator/(const Vec4& other) const
+    {
+        return Vec4(x / other.x, y / other.y, z / other.z, w / other.w);
+    }
+
+    /// In-place component-wise sum.
+    Vec4& operator+= (const Vec4& other)
+    {
+        x += other.x;
+        y += other.y;
+        z += other.z;
+        w += other.w;
+        return *this;
+    }
+
+    /// In-place component-wise difference.
+    Vec4& operator-= (const Vec4& other)
+    {
+        x -= other.x;
+        y -= other.y;
+        z -= other.z;
+        w -= other.w;
+        return *this;
+    }
+
+    /// In-place multiplication by a scalar.
+    Vec4& operator*= (const T& scalar)
+    {
+        x *= scalar;
+        y *= scalar;
+        z *= scalar;
+        w *= scalar;
+        return *this;
+    }
+
+    /// In-place division by a scalar.
+    Vec4& operator/= (const T& scalar)
+    {
+        x /= scalar;
+        y /= scalar;
+        z /= scalar;
+        w /= scalar;
+        return *this;
+    }
+};
+
+#include <stdio.h>
+#include "xmmintrin.h"
+#include <math.h>
+#include <float.h> 
+
+// SSE Vec4: four packed floats held in a single __m128 value.
+// The _LINUX build declares the class plainly and adds a float[4] view of the register;
+// the other build pulls in intrin.h and requests 16-byte alignment through __declspec.
+#ifdef _LINUX
+class CMP_SSEVec4f
+#else
+#include "intrin.h"
+class   __declspec(align(16)) CMP_SSEVec4f
+#endif
+{
+public:
+
+    union
+    {
+        __m128 vec128;          // float Vector 128 bits in total (16 Bytes) = array of 4 floats
+#ifdef _LINUX
+        float f32[4];           // per-lane view used by operator[] in the _LINUX build
+#endif
+    };
+
+    // constructors
+
+    /// Default constructor: vec128 is left uninitialised.
+    inline CMP_SSEVec4f() {};
+    /// Lane constructor: _mm_setr_ps keeps the argument order, so x is lane 0 and w is lane 3.
+    inline CMP_SSEVec4f(float x, float y, float z, float w) : vec128(_mm_setr_ps(x, y, z, w)) {};
+    /// Wraps an existing __m128 value.
+    inline CMP_SSEVec4f(__m128 vec) : vec128(vec) {}
+    /// Loads four consecutive floats starting at data.
+    /// The unaligned load is used on purpose: callers can pass storage that is only
+    /// float-aligned (for example the float* view of a Vec4<float>), whereas _mm_load_ps
+    /// requires a 16-byte aligned address.
+    inline CMP_SSEVec4f(const float* data) : vec128(_mm_loadu_ps(data)) {};
+    /// Broadcasts one scalar to all four lanes.
+    inline CMP_SSEVec4f(float scalar) : vec128(_mm_load1_ps(&scalar)) {};
+
+    // copy and assignment (the assignment operator returns a const reference)
+    inline CMP_SSEVec4f(const CMP_SSEVec4f& init) : vec128(init.vec128) {};
+    inline const CMP_SSEVec4f& operator=(const CMP_SSEVec4f& lhs) { vec128 = lhs.vec128; return *this; };
+
+    // conversion to m128 type for direct use in _mm intrinsics
+    inline operator __m128() { return vec128; };
+    inline operator const __m128() const { return vec128; };
+
+    // indexing: lane access by position, the index is not range-checked
+#ifdef _LINUX
+    inline const float& operator[](int i) const { return f32[i]; };
+    inline float& operator[](int i) { return f32[i]; };
+#else
+    inline const float& operator[](int i) const { return vec128.m128_f32[i]; };
+    inline float& operator[](int i) { return vec128.m128_f32[i]; };
+#endif
+
+    // addition
+    inline CMP_SSEVec4f operator+(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_add_ps(vec128, rhs.vec128)); };
+    inline CMP_SSEVec4f& operator+=(const CMP_SSEVec4f& rhs) { vec128 = _mm_add_ps(vec128, rhs.vec128); return *this; };
+
+    // multiplication (lane by lane)
+    inline CMP_SSEVec4f operator*(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_mul_ps(vec128, rhs.vec128)); };
+    inline CMP_SSEVec4f& operator*=(const CMP_SSEVec4f& rhs) { vec128 = _mm_mul_ps(vec128, rhs.vec128); return *this; };
+
+    // scalar multiplication
+    // The dedicated scalar overloads below are disabled; a float operand still works with the
+    // vector overloads above because it converts implicitly through the scalar constructor.
+    //inline CMP_SSEVec4f operator*( float rhs ) const { return CMP_SSEVec4f( _mm_mul_ps(vec128, _mm_load1_ps(&rhs)) ); };
+    //inline CMP_SSEVec4f& operator*=( float rhs )  { vec128 = _mm_mul_ps(vec128, _mm_load1_ps(&rhs)); return *this; };
+
+
+    // subtraction
+    inline CMP_SSEVec4f operator-(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_sub_ps(vec128, rhs.vec128)); };
+    inline CMP_SSEVec4f& operator-= (const CMP_SSEVec4f& rhs) { vec128 = _mm_sub_ps(vec128, rhs.vec128); return *this; };
+
+    // division (lane by lane)
+    inline CMP_SSEVec4f operator/(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_div_ps(vec128, rhs.vec128)); };
+    inline CMP_SSEVec4f& operator/= (const CMP_SSEVec4f& rhs) { vec128 = _mm_div_ps(vec128, rhs.vec128); return *this; };
+
+    // scalar division: the scalar is broadcast to all lanes first
+    inline CMP_SSEVec4f operator/(float rhs)   const { return CMP_SSEVec4f(_mm_div_ps(vec128, _mm_load1_ps(&rhs))); };
+    inline CMP_SSEVec4f& operator/=(float rhs) { vec128 = _mm_div_ps(vec128, _mm_load1_ps(&rhs)); return *this; };
+
+    // comparison
+    // these return a per-lane mask (0 or 0xffffffff in each component), not a bool
+    inline CMP_SSEVec4f operator< (const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_cmplt_ps(vec128, rhs.vec128)); };
+    inline CMP_SSEVec4f operator> (const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_cmpgt_ps(vec128, rhs.vec128)); };
+    inline CMP_SSEVec4f operator<=(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_cmple_ps(vec128, rhs.vec128)); };
+    inline CMP_SSEVec4f operator>=(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_cmpge_ps(vec128, rhs.vec128)); };
+    inline CMP_SSEVec4f operator==(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_cmpeq_ps(vec128, rhs.vec128)); };
+
+    // bitwise operators
+    inline CMP_SSEVec4f operator|(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_or_ps(vec128, rhs.vec128)); };
+    inline CMP_SSEVec4f operator&(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_and_ps(vec128, rhs.vec128)); };
+    inline CMP_SSEVec4f operator^(const CMP_SSEVec4f& rhs) const { return CMP_SSEVec4f(_mm_xor_ps(vec128, rhs.vec128)); };
+    inline const CMP_SSEVec4f& operator|=(const CMP_SSEVec4f& rhs) { vec128 = _mm_or_ps(vec128, rhs.vec128); return *this; };
+    inline const CMP_SSEVec4f& operator&=(const CMP_SSEVec4f& rhs) { vec128 = _mm_and_ps(vec128, rhs.vec128); return *this; };
+
+    // There is no bitwise NOT instruction for SSE, so it is emulated by XOR-ing with
+    // an all-ones mask (0xffffffff in every lane).
+    // The all-ones mask is obtained by comparing zero with itself (0 == 0).
+    inline CMP_SSEVec4f operator~() const
+    {
+        __m128 zero = _mm_setzero_ps();
+        __m128 is_true = _mm_cmpeq_ps(zero, zero);
+        return _mm_xor_ps(is_true, vec128);
+    };
+
+};
+
+// Aliases for the Vec4 instantiations. The CMP_, CGU_ and CGV_ *uc aliases all name
+// the same type, Vec4<unsigned char>.
+typedef Vec4<float>             CMP_Vec4f;
+typedef Vec4<double>            CMP_Vec4d;
+typedef Vec4<int>               CMP_Vec4i;
+typedef Vec4<unsigned int>      CMP_Vec4ui;         // unsigned int x,y,z,w
+typedef Vec4<unsigned char>     CMP_Vec4uc;         // unsigned 8  bit x,y,z,w
+
+typedef Vec4<unsigned char>     CGU_Vec4uc;         // unsigned 8  bit x,y,z,w
+typedef Vec4<unsigned char>     CGV_Vec4uc;         // unsigned 8  bit x,y,z,w
+
+#endif // defined (_LINUX) || defined (_WIN32)
+
+#endif // Header Guard
+