Merge 714faa03e6 into 5c70ffef0b

6 years ago · 52bba99122
parent 5c70ffef0b 714faa03e6
commit 52bba99122
19 changed files with 148 additions and 132 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -35,11 +35,13 @@ IF(CMAKE_BUILD_TYPE MATCHES "debug")
    ADD_DEFINITIONS(-D_DEBUG=1)
 ENDIF()

+OPTION(NVTT_SHARED "build shared library" OFF)

 IF(NVTT_SHARED)
 	SET(NVCORE_SHARED TRUE)
 	SET(NVMATH_SHARED TRUE)
 	SET(NVIMAGE_SHARED TRUE)
+	SET(NVTHREAD_SHARED TRUE)
 ENDIF(NVTT_SHARED)

 ADD_SUBDIRECTORY(extern)
--- a/extern/CMakeLists.txt
+++ b/extern/CMakeLists.txt
@ -9,5 +9,5 @@ ADD_SUBDIRECTORY(EtcLib)
 ADD_SUBDIRECTORY(rg_etc1_v104)
 #ADD_SUBDIRECTORY(etcpack)

-ADD_SUBDIRECTORY(butteraugli)
+# ADD_SUBDIRECTORY(butteraugli)

--- a/src/nvcore/StrLib.h
+++ b/src/nvcore/StrLib.h
@ -202,11 +202,11 @@ namespace nv
        void stripExtension();

        // statics
-        NVCORE_API static char separator();
-        NVCORE_API static const char * fileName(const char *);
-        NVCORE_API static const char * extension(const char *);
+        static char separator();
+        static const char * fileName(const char *);
+        static const char * extension(const char *);

-        NVCORE_API static void translatePath(char * path, char pathSeparator = NV_PATH_SEPARATOR);
+        static void translatePath(char * path, char pathSeparator = NV_PATH_SEPARATOR);
    };


--- a/src/nvcore/Timer.h
+++ b/src/nvcore/Timer.h
@ -30,8 +30,8 @@ namespace nv {
    NV_FORCEINLINE uint64 fastCpuClock() { return 0; }    
 #endif
    
-    uint64 systemClockFrequency();
-    uint64 systemClock();
+    NVCORE_API uint64 systemClockFrequency();
+    NVCORE_API uint64 systemClock();

    class NVCORE_CLASS Timer
    {
--- a/src/nvimage/BlockDXT.h
+++ b/src/nvimage/BlockDXT.h
@ -40,7 +40,7 @@ namespace nv


    /// DXT1 block.
-    struct BlockDXT1
+    struct NVIMAGE_CLASS BlockDXT1
    {
        Color16 col0;
        Color16 col1;
@ -74,7 +74,7 @@ namespace nv


    /// DXT3 alpha block with explicit alpha.
-    struct AlphaBlockDXT3
+    struct NVIMAGE_CLASS AlphaBlockDXT3
    {
        union {
            struct {
@ -106,7 +106,7 @@ namespace nv


    /// DXT3 block.
-    struct BlockDXT3
+    struct NVIMAGE_CLASS BlockDXT3
    {
        AlphaBlockDXT3 alpha;
        BlockDXT1 color;
@ -120,7 +120,7 @@ namespace nv


    /// DXT5 alpha block.
-    struct AlphaBlockDXT5
+    struct NVIMAGE_CLASS AlphaBlockDXT5
    {
        union {
            struct {
@ -163,7 +163,7 @@ namespace nv


    /// DXT5 block.
-    struct BlockDXT5
+    struct NVIMAGE_CLASS BlockDXT5
    {
        AlphaBlockDXT5 alpha;
        BlockDXT1 color;
@ -176,7 +176,7 @@ namespace nv
    };

    /// ATI1 block.
-    struct BlockATI1
+    struct NVIMAGE_CLASS BlockATI1
    {
        AlphaBlockDXT5 alpha;

@ -187,7 +187,7 @@ namespace nv
    };

    /// ATI2 block.
-    struct BlockATI2
+    struct NVIMAGE_CLASS BlockATI2
    {
        AlphaBlockDXT5 x;
        AlphaBlockDXT5 y;
@ -199,7 +199,7 @@ namespace nv
    };

    /// CTX1 block.
-    struct BlockCTX1
+    struct NVIMAGE_CLASS BlockCTX1
    {
        uint8 col0[2];
        uint8 col1[2];
@ -218,14 +218,14 @@ namespace nv
    };

 	/// BC6 block.
-	struct BlockBC6
+	struct NVIMAGE_CLASS BlockBC6
 	{
 		uint8 data[16];		// Not even going to try to write a union for this thing.
        void decodeBlock(Vector4 colors[16]) const;
 	};

 	/// BC7 block.
-	struct BlockBC7
+	struct NVIMAGE_CLASS BlockBC7
 	{
 		uint8 data[16];		// Not even going to try to write a union for this thing.
 		void decodeBlock(ColorBlock * block) const;
--- a/src/nvimage/ColorBlock.h
+++ b/src/nvimage/ColorBlock.h
@ -4,6 +4,7 @@
 #ifndef NV_IMAGE_COLORBLOCK_H
 #define NV_IMAGE_COLORBLOCK_H

+#include "nvimage/nvimage.h"
 #include "nvmath/Color.h"
 #include "nvmath/Vector.h"

@ -14,7 +15,7 @@ namespace nv


    /// Uncompressed 4x4 color block.
-    struct ColorBlock
+    struct NVIMAGE_CLASS ColorBlock
    {
        ColorBlock();
        ColorBlock(const uint * linearImage);
@ -128,7 +129,7 @@ namespace nv


    /// Uncompressed 4x4 alpha block.
-    struct AlphaBlock4x4
+    struct NVIMAGE_CLASS AlphaBlock4x4
    {
        void init(uint8 value);
        void init(const ColorBlock & src, uint channel);
--- a/src/nvimage/DirectDrawSurface.h
+++ b/src/nvimage/DirectDrawSurface.h
@ -268,9 +268,9 @@ namespace nv
        DXGI_FORMAT_BC7_UNORM_SRGB = 99,
    };

-    extern uint findD3D9Format(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask);
+    NVIMAGE_API uint findD3D9Format(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask);

-    extern uint findDXGIFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask);
+    NVIMAGE_API uint findDXGIFormat(uint bitcount, uint rmask, uint gmask, uint bmask, uint amask);

    struct RGBAPixelFormat
    {
--- a/src/nvimage/ErrorMetric.h
+++ b/src/nvimage/ErrorMetric.h
@ -7,19 +7,19 @@ namespace nv
 {
    class FloatImage;

-    float rmsColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight);
-    float rmsAlphaError(const FloatImage * ref, const FloatImage * img);
+    NVIMAGE_API float rmsColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight);
+    NVIMAGE_API float rmsAlphaError(const FloatImage * ref, const FloatImage * img);

-    float averageColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight);
-    float averageAlphaError(const FloatImage * ref, const FloatImage * img);
+    NVIMAGE_API float averageColorError(const FloatImage * ref, const FloatImage * img, bool alphaWeight);
+    NVIMAGE_API float averageAlphaError(const FloatImage * ref, const FloatImage * img);

-    float rmsBilinearColorError(const FloatImage * ref, const FloatImage * img, FloatImage::WrapMode wm, bool alphaWeight);
+    NVIMAGE_API float rmsBilinearColorError(const FloatImage * ref, const FloatImage * img, FloatImage::WrapMode wm, bool alphaWeight);

-    float cieLabError(const FloatImage * ref, const FloatImage * img);
-    float cieLab94Error(const FloatImage * ref, const FloatImage * img);
-    float spatialCieLabError(const FloatImage * ref, const FloatImage * img);
+    NVIMAGE_API float cieLabError(const FloatImage * ref, const FloatImage * img);
+    NVIMAGE_API float cieLab94Error(const FloatImage * ref, const FloatImage * img);
+    NVIMAGE_API float spatialCieLabError(const FloatImage * ref, const FloatImage * img);

-    float averageAngularError(const FloatImage * img0, const FloatImage * img1);
-    float rmsAngularError(const FloatImage * img0, const FloatImage * img1);
+    NVIMAGE_API float averageAngularError(const FloatImage * img0, const FloatImage * img1);
+    NVIMAGE_API float rmsAngularError(const FloatImage * img0, const FloatImage * img1);

 } // nv namespace
--- a/src/nvimage/FloatImage.h
+++ b/src/nvimage/FloatImage.h
@ -24,7 +24,7 @@ namespace nv
    class PolyphaseKernel;

    /// Multicomponent floating point image class.
-    class FloatImage
+    class NVIMAGE_CLASS FloatImage
    {
    public:

@ -34,78 +34,78 @@ namespace nv
            WrapMode_Mirror
        };

-        NVIMAGE_API FloatImage();
-        NVIMAGE_API FloatImage(const FloatImage & img);
-        NVIMAGE_API FloatImage(const Image * img);
-        NVIMAGE_API virtual ~FloatImage();
+        FloatImage();
+        FloatImage(const FloatImage & img);
+        FloatImage(const Image * img);
+        virtual ~FloatImage();

        /** @name Conversion. */
        //@{
-        NVIMAGE_API void initFrom(const Image * img);
-        NVIMAGE_API Image * createImage(uint base_component = 0, uint num = 4) const;
-        NVIMAGE_API Image * createImageGammaCorrect(float gamma = 2.2f) const;
+        void initFrom(const Image * img);
+        Image * createImage(uint base_component = 0, uint num = 4) const;
+        Image * createImageGammaCorrect(float gamma = 2.2f) const;
        //@}

        /** @name Allocation. */
        //@{
-        NVIMAGE_API void allocate(uint c, uint w, uint h, uint d = 1);
-        NVIMAGE_API void free(); // Does not clear members.
-        NVIMAGE_API void resizeChannelCount(uint c);
+        void allocate(uint c, uint w, uint h, uint d = 1);
+        void free(); // Does not clear members.
+        void resizeChannelCount(uint c);
        //@}

        /** @name Manipulation. */
        //@{
-        NVIMAGE_API void clear(float f = 0.0f);
-        NVIMAGE_API void clear(uint component, float f = 0.0f);
-        NVIMAGE_API void copyChannel(uint src, uint dst);
+        void clear(float f = 0.0f);
+        void clear(uint component, float f = 0.0f);
+        void copyChannel(uint src, uint dst);

-        NVIMAGE_API void normalize(uint base_component);
+        void normalize(uint base_component);

-        NVIMAGE_API void packNormals(uint base_component);
-        NVIMAGE_API void expandNormals(uint base_component);
-        NVIMAGE_API void scaleBias(uint base_component, uint num, float scale, float add);
+        void packNormals(uint base_component);
+        void expandNormals(uint base_component);
+        void scaleBias(uint base_component, uint num, float scale, float add);

-        NVIMAGE_API void clamp(uint base_component, uint num, float low, float high);
+        void clamp(uint base_component, uint num, float low, float high);

-        NVIMAGE_API void toLinear(uint base_component, uint num, float gamma = 2.2f);
-        NVIMAGE_API void toGamma(uint base_component, uint num, float gamma = 2.2f);
-        NVIMAGE_API void exponentiate(uint base_component, uint num, float power);
+        void toLinear(uint base_component, uint num, float gamma = 2.2f);
+        void toGamma(uint base_component, uint num, float gamma = 2.2f);
+        void exponentiate(uint base_component, uint num, float power);

-        NVIMAGE_API void transform(uint base_component, const Matrix & m, const Vector4 & offset);
-        NVIMAGE_API void swizzle(uint base_component, uint r, uint g, uint b, uint a);
+        void transform(uint base_component, const Matrix & m, const Vector4 & offset);
+        void swizzle(uint base_component, uint r, uint g, uint b, uint a);

-        NVIMAGE_API FloatImage * fastDownSample() const;
-        NVIMAGE_API FloatImage * downSample(const Filter & filter, WrapMode wm) const;
-        NVIMAGE_API FloatImage * downSample(const Filter & filter, WrapMode wm, uint alpha) const;
-        NVIMAGE_API FloatImage * resize(const Filter & filter, uint w, uint h, WrapMode wm) const;
-        NVIMAGE_API FloatImage * resize(const Filter & filter, uint w, uint h, uint d, WrapMode wm) const;
-        NVIMAGE_API FloatImage * resize(const Filter & filter, uint w, uint h, WrapMode wm, uint alpha) const;
-        NVIMAGE_API FloatImage * resize(const Filter & filter, uint w, uint h, uint d, WrapMode wm, uint alpha) const;
+        FloatImage * fastDownSample() const;
+        FloatImage * downSample(const Filter & filter, WrapMode wm) const;
+        FloatImage * downSample(const Filter & filter, WrapMode wm, uint alpha) const;
+        FloatImage * resize(const Filter & filter, uint w, uint h, WrapMode wm) const;
+        FloatImage * resize(const Filter & filter, uint w, uint h, uint d, WrapMode wm) const;
+        FloatImage * resize(const Filter & filter, uint w, uint h, WrapMode wm, uint alpha) const;
+        FloatImage * resize(const Filter & filter, uint w, uint h, uint d, WrapMode wm, uint alpha) const;

-        NVIMAGE_API void convolve(const Kernel2 & k, uint c, WrapMode wm);
+        void convolve(const Kernel2 & k, uint c, WrapMode wm);

        //NVIMAGE_API FloatImage * downSample(const Kernel1 & filter, WrapMode wm) const;
        //NVIMAGE_API FloatImage * downSample(const Kernel1 & filter, uint w, uint h, WrapMode wm) const;
        //@}

-        NVIMAGE_API float applyKernelXY(const Kernel2 * k, int x, int y, int z, uint c, WrapMode wm) const;
-        NVIMAGE_API float applyKernelX(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const;
-        NVIMAGE_API float applyKernelY(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const;
-        NVIMAGE_API float applyKernelZ(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const;
-        NVIMAGE_API void applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, WrapMode wm, float * output) const;
-        NVIMAGE_API void applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, WrapMode wm, float * output, int output_stride) const;
-        NVIMAGE_API void applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, WrapMode wm, float * output) const;
-        NVIMAGE_API void applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, uint a, WrapMode wm, float * output) const;
-        NVIMAGE_API void applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, uint a, WrapMode wm, float * output, int output_stride) const;
-        NVIMAGE_API void applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, uint a, WrapMode wm, float * output) const;
+        float applyKernelXY(const Kernel2 * k, int x, int y, int z, uint c, WrapMode wm) const;
+        float applyKernelX(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const;
+        float applyKernelY(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const;
+        float applyKernelZ(const Kernel1 * k, int x, int y, int z, uint c, WrapMode wm) const;
+        void applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, WrapMode wm, float * output) const;
+        void applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, WrapMode wm, float * output, int output_stride) const;
+        void applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, WrapMode wm, float * output) const;
+        void applyKernelX(const PolyphaseKernel & k, int y, int z, uint c, uint a, WrapMode wm, float * output) const;
+        void applyKernelY(const PolyphaseKernel & k, int x, int z, uint c, uint a, WrapMode wm, float * output, int output_stride) const;
+        void applyKernelZ(const PolyphaseKernel & k, int x, int y, uint c, uint a, WrapMode wm, float * output) const;


-        NVIMAGE_API void flipX();
-        NVIMAGE_API void flipY();
-        NVIMAGE_API void flipZ();
+        void flipX();
+        void flipY();
+        void flipZ();

-        NVIMAGE_API float alphaTestCoverage(float alphaRef, int alphaChannel, float alphaScale = 1.0f) const;
-        NVIMAGE_API void scaleAlphaToCoverage(float coverage, float alphaRef, int alphaChannel);
+        float alphaTestCoverage(float alphaRef, int alphaChannel, float alphaScale = 1.0f) const;
+        void scaleAlphaToCoverage(float coverage, float alphaRef, int alphaChannel);


        uint width() const { return m_width; }
--- a/src/nvimage/KtxFile.h
+++ b/src/nvimage/KtxFile.h
@ -110,7 +110,7 @@ namespace nv
    const uint KTX_BASE_INTERNAL_STENCIL_INDEX = 0x1901;


-    struct KtxHeader {
+    struct NVIMAGE_CLASS KtxHeader {
        uint8 identifier[12];
        uint32 endianness;
        uint32 glType;
--- a/src/nvimage/NormalMap.h
+++ b/src/nvimage/NormalMap.h
@ -44,12 +44,12 @@ namespace nv
 	};

 	// @@ These two functions should be deprecated:
-	FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter = NormalMapFilter_Sobel3x3);
-	FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights);
+    NVIMAGE_API FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, NormalMapFilter filter = NormalMapFilter_Sobel3x3);
+    NVIMAGE_API FloatImage * createNormalMap(const Image * img, FloatImage::WrapMode wm, Vector4::Arg heightWeights, Vector4::Arg filterWeights);

-	FloatImage * createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, Vector4::Arg filterWeights);
+    NVIMAGE_API FloatImage * createNormalMap(const FloatImage * img, FloatImage::WrapMode wm, Vector4::Arg filterWeights);

-	void normalizeNormalMap(FloatImage * img);
+    NVIMAGE_API void normalizeNormalMap(FloatImage * img);

 	// @@ Add generation of DU/DV maps.

--- a/src/nvmath/Fitting.h
+++ b/src/nvmath/Fitting.h
@ -11,38 +11,38 @@ namespace nv
 {
    namespace Fit
    {
-        Vector3 computeCentroid(int n, const Vector3 * points);
-        Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+        NVMATH_API Vector3 computeCentroid(int n, const Vector3 * points);
+        NVMATH_API Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, const Vector3 & metric);

-        Vector4 computeCentroid(int n, const Vector4 * points);
-        Vector4 computeCentroid(int n, const Vector4 * points, const float * weights, const Vector4 & metric);
+        NVMATH_API Vector4 computeCentroid(int n, const Vector4 * points);
+        NVMATH_API Vector4 computeCentroid(int n, const Vector4 * points, const float * weights, const Vector4 & metric);

-        Vector3 computeCovariance(int n, const Vector3 * points, float * covariance);
-        Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, const Vector3 & metric, float * covariance);
+        NVMATH_API Vector3 computeCovariance(int n, const Vector3 * points, float * covariance);
+        NVMATH_API Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, const Vector3 & metric, float * covariance);

-        Vector4 computeCovariance(int n, const Vector4 * points, float * covariance);
-        Vector4 computeCovariance(int n, const Vector4 * points, const float * weights, const Vector4 & metric, float * covariance);
+        NVMATH_API Vector4 computeCovariance(int n, const Vector4 * points, float * covariance);
+        NVMATH_API Vector4 computeCovariance(int n, const Vector4 * points, const float * weights, const Vector4 & metric, float * covariance);

-        Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points);
-        Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+        NVMATH_API Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points);
+        NVMATH_API Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points, const float * weights, const Vector3 & metric);

-        Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points);
-        Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+        NVMATH_API Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points);
+        NVMATH_API Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points, const float * weights, const Vector3 & metric);

-		Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points);
-        Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points, const float * weights, const Vector4 & metric);
+        NVMATH_API Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points);
+        NVMATH_API Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points, const float * weights, const Vector4 & metric);

-        Vector3 computePrincipalComponent_SVD(int n, const Vector3 * points);
-        Vector4 computePrincipalComponent_SVD(int n, const Vector4 * points);
+        NVMATH_API Vector3 computePrincipalComponent_SVD(int n, const Vector3 * points);
+        NVMATH_API Vector4 computePrincipalComponent_SVD(int n, const Vector4 * points);

-        Plane bestPlane(int n, const Vector3 * points);
-        bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON);
+        NVMATH_API Plane bestPlane(int n, const Vector3 * points);
+        NVMATH_API bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON);

-        bool eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]);
-        bool eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4]);
+        NVMATH_API bool eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]);
+        NVMATH_API bool eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4]);

        // Returns number of clusters [1-4].
-        int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster);
+        NVMATH_API int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster);
    }

 } // nv namespace
--- a/src/nvmath/Gamma.h
+++ b/src/nvmath/Gamma.h
@ -30,8 +30,8 @@
 namespace nv {

    // gamma conversion of float array (in-place is allowed)
-    void powf_5_11(const float* src, float* dst, int count);
-    void powf_11_5(const float* src, float* dst, int count);
+    NVMATH_API void powf_5_11(const float* src, float* dst, int count);
+    NVMATH_API void powf_11_5(const float* src, float* dst, int count);

 } // nv namespace

--- a/src/nvmath/Half.cpp
+++ b/src/nvmath/Half.cpp
@ -577,6 +577,15 @@ namespace nv {
    uint32 offset_table[64];
 }

+uint32 nv::fast_half_to_float(uint16 h)
+{
+    // Table-lookup half-to-float conversion (result returned as uint32). NOTE(review): tables are (re)built only when mantissa_table[0] is non-zero -- confirm this guard against what half_init_tables() stores in that slot.
+    if (mantissa_table[0] != 0)
+        half_init_tables();
+    uint exp = h >> 10;
+    return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
+}
+
 void nv::half_init_tables()
 {
    // Init mantissa table.
@ -742,7 +751,7 @@ static inline uint16_t float_to_half_branch(uint32_t x)
 #define S256(x)  S64((x)),  S64((x)+64),   S64((x)+128),  S64((x)+192)
 #define S1024(x) S256((x)), S256((x)+256), S256((x)+512), S256((x)+768)

-/* Lookup table-based algorithm from “Fast Half Float Conversions”
+/* Lookup table-based algorithm from “Fast Half Float Conversions”
 * by Jeroen van der Zijp, November 2008. No rounding is performed,
 * and some NaN values may be incorrectly converted to Inf. */
 static inline uint16_t float_to_half_nobranch(uint32_t x)
--- a/src/nvmath/Half.h
+++ b/src/nvmath/Half.h
@ -6,29 +6,31 @@

 namespace nv {

-    uint32 half_to_float( uint16 h );
-    uint16 half_from_float( uint32 f );
+    NVMATH_API uint32 half_to_float( uint16 h );
+    NVMATH_API uint16 half_from_float( uint32 f );

    // vin,vout must be 16 byte aligned. count must be a multiple of 8.
    // implement a non-SSE version if we need it. For now, this naming makes it clear this is only available when SSE2 is
-    void half_to_float_array_SSE2(const uint16 * vin, float * vout, int count);
+    NVMATH_API void half_to_float_array_SSE2(const uint16 * vin, float * vout, int count);

-    void half_init_tables();
+    NVMATH_API void half_init_tables();

-    extern uint32 mantissa_table[2048];
-    extern uint32 exponent_table[64];
-    extern uint32 offset_table[64];
+    //extern uint32 mantissa_table[2048];
+    //extern uint32 exponent_table[64];
+    //extern uint32 offset_table[64];

    // Fast half to float conversion based on:
    // http://www.fox-toolkit.org/ftp/fasthalffloatconversion.pdf
-    inline uint32 fast_half_to_float(uint16 h)
-    {
-		// Initialize table if necessary.
-		if (mantissa_table[0] != 0)
-			half_init_tables();
-	    uint exp = h >> 10;
-	    return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
-    }
+  //  inline uint32 fast_half_to_float(uint16 h)
+  //  {
+		//// Initialize table if necessary.
+		//if (mantissa_table[0] != 0)
+		//	half_init_tables();
+	 //   uint exp = h >> 10;
+	 //   return mantissa_table[offset_table[exp] + (h & 0x3ff)] + exponent_table[exp];
+  //  }
+
+    NVMATH_API uint32 fast_half_to_float(uint16 h);


    inline uint16 to_half(float c) {
--- a/src/nvmath/Matrix.h
+++ b/src/nvmath/Matrix.h
@ -48,10 +48,10 @@ namespace nv
    };
    
    // Solve equation system using LU decomposition and back-substitution.
-    extern bool solveLU(const Matrix2 & m, const Vector2 & b, Vector2 * x);
+    NVMATH_API bool solveLU(const Matrix2 & m, const Vector2 & b, Vector2 * x);
    
    // Solve equation system using Cramer's inverse.
-    extern bool solveCramer(const Matrix2 & A, const Vector2 & b, Vector2 * x);
+    NVMATH_API bool solveCramer(const Matrix2 & A, const Vector2 & b, Vector2 * x);
    
    
    // 3x3 matrix.
@ -87,12 +87,12 @@ namespace nv
    };

    // Solve equation system using LU decomposition and back-substitution.
-    extern bool solveLU(const Matrix3 & m, const Vector3 & b, Vector3 * x);
+    NVMATH_API bool solveLU(const Matrix3 & m, const Vector3 & b, Vector3 * x);

    // Solve equation system using Cramer's inverse.
-    extern bool solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x);
+    NVMATH_API bool solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x);

-    extern Matrix3 inverse(const Matrix3 & m);
+    NVMATH_API Matrix3 inverse(const Matrix3 & m);
    

    // 4x4 matrix.
@ -138,16 +138,16 @@ namespace nv
    };

    // Solve equation system using LU decomposition and back-substitution.
-    extern bool solveLU(const Matrix & A, const Vector4 & b, Vector4 * x);
+    NVMATH_API bool solveLU(const Matrix & A, const Vector4 & b, Vector4 * x);

    // Solve equation system using Cramer's inverse.
-    extern bool solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x);
+    NVMATH_API bool solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x);

    // Compute inverse using LU decomposition.
-    extern Matrix inverseLU(const Matrix & m);
+    NVMATH_API Matrix inverseLU(const Matrix & m);

    // Compute inverse using Gaussian elimination and partial pivoting.
-    extern Matrix inverse(const Matrix & m);
+    NVMATH_API Matrix inverse(const Matrix & m);

 } // nv namespace

--- a/src/nvthread/CMakeLists.txt
+++ b/src/nvthread/CMakeLists.txt
@ -15,6 +15,7 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
 ADD_DEFINITIONS(-DNVTHREAD_EXPORTS)

 IF(NVTHREAD_SHARED)
+	ADD_DEFINITIONS(-DNVTHREAD_SHARED=1)
 	ADD_LIBRARY(nvthread SHARED ${THREAD_SRCS})
 ELSE(NVTHREAD_SHARED)
 	ADD_LIBRARY(nvthread ${THREAD_SRCS})
--- a/src/nvthread/ParallelFor.h
+++ b/src/nvthread/ParallelFor.h
@ -14,7 +14,7 @@ namespace nv

    typedef void ForTask(void * context, /*int tid,*/ int idx); // @@ It would be nice to have the thread index as an argument here.

-    struct ParallelFor {
+    struct NVTHREAD_CLASS ParallelFor {
        ParallelFor(ForTask * task, void * context);
        ~ParallelFor();

--- a/src/nvtt/CMakeLists.txt
+++ b/src/nvtt/CMakeLists.txt
@ -44,6 +44,7 @@ INCLUDE_DIRECTORIES(${NV_SOURCE_DIR}/extern/rg_etc1_v104)
 ADD_DEFINITIONS(-DNVTT_EXPORTS)

 IF(NVTT_SHARED)	
+	ADD_DEFINITIONS(-DNVTT_SHARED)
    ADD_LIBRARY(nvtt SHARED ${NVTT_SRCS})
 ELSE(NVTT_SHARED)
    ADD_LIBRARY(nvtt ${NVTT_SRCS})