diff --git a/trunk/src/nvtt/ClusterFit.cpp b/trunk/src/nvtt/ClusterFit.cpp index 7a072c5..a8e08a4 100644 --- a/trunk/src/nvtt/ClusterFit.cpp +++ b/trunk/src/nvtt/ClusterFit.cpp @@ -92,8 +92,8 @@ void ClusterFit::setColourSet(const ColorSet * set) { int p = order[i]; #if NVTT_USE_SIMD - Vector4 tmp(values[p] * set->weights[p], set->weights[p]); - m_weighted[i] = SimdVector(tmp); + NV_ALIGN_16 Vector4 tmp(values[p] * set->weights[p], set->weights[p]); + m_weighted[i] = SimdVector(tmp.component); m_xxsum += m_weighted[i] * m_weighted[i]; m_xsum += m_weighted[i]; #else @@ -110,8 +110,8 @@ void ClusterFit::setColourSet(const ColorSet * set) void ClusterFit::setMetric(Vector4::Arg w) { #if NVTT_USE_SIMD - Vector4 tmp(w.xyz(), 1); - m_metric = SimdVector(tmp); + NV_ALIGN_16 Vector4 tmp(w.xyz(), 1); + m_metric = SimdVector(tmp.component); #else m_metric = w.xyz(); #endif @@ -134,13 +134,13 @@ float ClusterFit::bestError() const bool ClusterFit::compress3( Vector3 * start, Vector3 * end ) { - int const count = m_count; - SimdVector const one = SimdVector(1.0f); - SimdVector const zero = SimdVector(0.0f); - SimdVector const half(0.5f, 0.5f, 0.5f, 0.25f); - SimdVector const two = SimdVector(2.0); - SimdVector const grid( 31.0f, 63.0f, 31.0f, 0.0f ); - SimdVector const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); + const int count = m_count; + const SimdVector one = SimdVector(1.0f); + const SimdVector zero = SimdVector(0.0f); + const SimdVector half(0.5f, 0.5f, 0.5f, 0.25f); + const SimdVector two = SimdVector(2.0); + const SimdVector grid( 31.0f, 63.0f, 31.0f, 0.0f ); + const SimdVector gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); // declare variables SimdVector beststart = SimdVector( 0.0f ); @@ -158,23 +158,23 @@ bool ClusterFit::compress3( Vector3 * start, Vector3 * end ) for( int c1 = 0; c1 <= count-c0; c1++) { - SimdVector const x2 = m_xsum - x1 - x0; + const SimdVector x2 = m_xsum - x1 - x0; - //Vector3 const alphax_sum = x0 + x1 * 0.5f; - //float const 
alpha2_sum = w0 + w1 * 0.25f; - SimdVector const alphax_sum = multiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum - SimdVector const alpha2_sum = alphax_sum.splatW(); + //Vector3 alphax_sum = x0 + x1 * 0.5f; + //float alpha2_sum = w0 + w1 * 0.25f; + const SimdVector alphax_sum = multiplyAdd(x1, half, x0); // alphax_sum, alpha2_sum + const SimdVector alpha2_sum = alphax_sum.splatW(); - //Vector3 const betax_sum = x2 + x1 * 0.5f; - //float const beta2_sum = w2 + w1 * 0.25f; - SimdVector const betax_sum = multiplyAdd(x1, half, x2); // betax_sum, beta2_sum - SimdVector const beta2_sum = betax_sum.splatW(); + //const Vector3 betax_sum = x2 + x1 * 0.5f; + //const float beta2_sum = w2 + w1 * 0.25f; + const SimdVector betax_sum = multiplyAdd(x1, half, x2); // betax_sum, beta2_sum + const SimdVector beta2_sum = betax_sum.splatW(); - //float const alphabeta_sum = w1 * 0.25f; - SimdVector const alphabeta_sum = (x1 * half).splatW(); // alphabeta_sum + //const float alphabeta_sum = w1 * 0.25f; + const SimdVector alphabeta_sum = (x1 * half).splatW(); // alphabeta_sum - // float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - SimdVector const factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) ); + // const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + const SimdVector factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) ); SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; @@ -229,16 +229,16 @@ bool ClusterFit::compress3( Vector3 * start, Vector3 * end ) bool ClusterFit::compress4( Vector3 * start, Vector3 * end ) { - int const count = m_count; - SimdVector const one = SimdVector(1.0f); - SimdVector const zero = SimdVector(0.0f); - SimdVector const half = SimdVector(0.5f); - 
SimdVector const two = SimdVector(2.0); - SimdVector const onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f ); - SimdVector const twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f ); - SimdVector const twonineths = SimdVector( 2.0f/9.0f ); - SimdVector const grid( 31.0f, 63.0f, 31.0f, 0.0f ); - SimdVector const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); + const int count = m_count; + const SimdVector one = SimdVector(1.0f); + const SimdVector zero = SimdVector(0.0f); + const SimdVector half = SimdVector(0.5f); + const SimdVector two = SimdVector(2.0); + const SimdVector onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f ); + const SimdVector twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f ); + const SimdVector twonineths = SimdVector( 2.0f/9.0f ); + const SimdVector grid( 31.0f, 63.0f, 31.0f, 0.0f ); + const SimdVector gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); // declare variables SimdVector beststart = SimdVector( 0.0f ); @@ -259,23 +259,23 @@ bool ClusterFit::compress4( Vector3 * start, Vector3 * end ) for( int c2 = 0; c2 <= count-c0-c1; c2++) { - SimdVector const x3 = m_xsum - x2 - x1 - x0; + const SimdVector x3 = m_xsum - x2 - x1 - x0; - //Vector3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); - //float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f); - SimdVector const alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum - SimdVector const alpha2_sum = alphax_sum.splatW(); + //const Vector3 alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); + //const float alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f); + const SimdVector alphax_sum = multiplyAdd(x2, onethird, multiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum + const SimdVector alpha2_sum = alphax_sum.splatW(); - //Vector3 const betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f); - //float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f); - SimdVector const 
betax_sum = multiplyAdd(x2, twothirds, multiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum - SimdVector const beta2_sum = betax_sum.splatW(); + //const Vector3 betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f); + //const float beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f); + const SimdVector betax_sum = multiplyAdd(x2, twothirds, multiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum + const SimdVector beta2_sum = betax_sum.splatW(); - //float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f); - SimdVector const alphabeta_sum = twonineths*( x1 + x2 ).splatW(); // alphabeta_sum + //const float alphabeta_sum = (w1 + w2) * (2.0f/9.0f); + const SimdVector alphabeta_sum = twonineths*( x1 + x2 ).splatW(); // alphabeta_sum - // float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); - SimdVector const factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) ); + //const float factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); + const SimdVector factor = reciprocal( negativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) ); SimdVector a = negativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; SimdVector b = negativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; diff --git a/trunk/src/nvtt/CompressorDXT.cpp b/trunk/src/nvtt/CompressorDXT.cpp index c614bd7..ea8c629 100644 --- a/trunk/src/nvtt/CompressorDXT.cpp +++ b/trunk/src/nvtt/CompressorDXT.cpp @@ -113,7 +113,7 @@ void FixedBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, c */ -struct CompressorContext +struct FixedBlockCompressorContext { nvtt::AlphaMode alphaMode; uint w, h; @@ -125,10 +125,10 @@ struct CompressorContext FixedBlockCompressor * compressor; }; -// Each task compresses one row. -void CompressorTask(void * data, int i) +// Each task compresses one block. 
+void FixedBlockCompressorTask(void * data, int i) { - CompressorContext * d = (CompressorContext *) data; + FixedBlockCompressorContext * d = (FixedBlockCompressorContext *) data; uint x = i % d->bw; uint y = i / d->bw; @@ -147,7 +147,7 @@ void FixedBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, u { nvDebugCheck(d == 1); - CompressorContext context; + FixedBlockCompressorContext context; context.alphaMode = alphaMode; context.w = w; context.h = h; @@ -169,7 +169,7 @@ void FixedBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, u const uint size = context.bs * count; context.mem = new uint8[size]; - dispatcher->dispatch(CompressorTask, &context, count); + dispatcher->dispatch(FixedBlockCompressorTask, &context, count); outputOptions.writeData(context.mem, size); @@ -177,35 +177,67 @@ void FixedBlockCompressor::compress(nvtt::AlphaMode alphaMode, uint w, uint h, u } +struct ColorSetCompressorContext +{ + nvtt::AlphaMode alphaMode; + uint w, h; + const float * data; + const nvtt::CompressionOptions::Private * compressionOptions; + + uint bw, bh, bs; + uint8 * mem; + ColorSetCompressor * compressor; +}; + + +// Each task compresses one block. 
+void ColorSetCompressorTask(void * data, int i) /* data: ColorSetCompressorContext *; i: linear block index, split into column x and row y below */ +{ + ColorSetCompressorContext * d = (ColorSetCompressorContext *) data; + + uint x = i % d->bw; + uint y = i / d->bw; + + //for (uint x = 0; x < d->bw; x++) + { + ColorSet set; + set.setColors(d->data, d->w, d->h, 4 * x, 4 * y); /* x, y are block indices; the loop removed by this hunk passed pixel coordinates (stepping by 4), so scale by the 4-pixel block edge */ + + uint8 * ptr = d->mem + (y * d->bw + x) * d->bs; + d->compressor->compressBlock(set, d->alphaMode, *d->compressionOptions, ptr); + } +} void ColorSetCompressor::compress(AlphaMode alphaMode, uint w, uint h, uint d, const float * data, nvtt::TaskDispatcher * dispatcher, const CompressionOptions::Private & compressionOptions, const OutputOptions::Private & outputOptions) { nvDebugCheck(d == 1); - const uint bs = blockSize(); - const uint bw = (w + 3) / 4; - const uint bh = (h + 3) / 4; + ColorSetCompressorContext context; + context.alphaMode = alphaMode; + context.w = w; + context.h = h; + context.data = data; + context.compressionOptions = &compressionOptions; - //bool singleThreaded = true; - //if (singleThreaded) - { - uint8 * mem = malloc(bs * bw); + context.bs = blockSize(); + context.bw = (w + 3) / 4; + context.bh = (h + 3) / 4; - ColorSet set; + context.compressor = this; - for (uint y = 0; y < h; y += 4) { - uint8 * ptr = mem; - for (uint x = 0; x < w; x += 4, ptr += bs) { - set.setColors(data, w, h, x, y); - compressBlock(set, alphaMode, compressionOptions, ptr); - } + SequentialTaskDispatcher sequential; - if (outputOptions.outputHandler != NULL) { - outputOptions.outputHandler->writeData(mem, bs * bw); - } - } + // Use a single thread to compress small textures. 
+ if (context.bh < 4) dispatcher = &sequential; - free(mem); - } + const uint count = context.bw * context.bh; + const uint size = context.bs * count; + context.mem = new uint8[size]; + + dispatcher->dispatch(ColorSetCompressorTask, &context, count); + + outputOptions.writeData(context.mem, size); + + delete [] context.mem; } diff --git a/trunk/src/nvtt/QuickCompressDXT.cpp b/trunk/src/nvtt/QuickCompressDXT.cpp index 2833636..084ad52 100644 --- a/trunk/src/nvtt/QuickCompressDXT.cpp +++ b/trunk/src/nvtt/QuickCompressDXT.cpp @@ -721,8 +721,8 @@ void QuickCompress::outputBlock4(const ColorSet & set, const Vector3 & start, co if (color0 < color1) { - swap(maxColor, minColor); - swap(color0, color1); + swap(maxColor, minColor); + swap(color0, color1); } block->col0 = Color16(color0); @@ -741,8 +741,8 @@ void QuickCompress::outputBlock3(const ColorSet & set, const Vector3 & start, co if (color0 > color1) { - swap(maxColor, minColor); - swap(color0, color1); + swap(maxColor, minColor); + swap(color0, color1); } block->col0 = Color16(color0); diff --git a/trunk/src/nvtt/Surface.cpp b/trunk/src/nvtt/Surface.cpp index 6e9cf73..86e67d4 100644 --- a/trunk/src/nvtt/Surface.cpp +++ b/trunk/src/nvtt/Surface.cpp @@ -432,6 +432,30 @@ bool Surface::save(const char * fileName) const return false; } +#if 0 //NV_OS_WIN32 + +#include <windows.h> +#undef min +#undef max + +static int filter(unsigned int code, struct _EXCEPTION_POINTERS *ep) { + if (code == EXCEPTION_ACCESS_VIOLATION) { + return EXCEPTION_EXECUTE_HANDLER; + } + else { + return EXCEPTION_CONTINUE_SEARCH; + }; +} + +#define TRY __try + +#define CATCH __except (filter(GetExceptionCode(), GetExceptionInformation())) +#else +#define TRY if (true) /* must not be empty: "TRY {...} CATCH {...}" would otherwise become two consecutive blocks */ +#define CATCH else /* keeps the handler block unreachable, so setImage no longer returns false unconditionally */ +#endif + + bool Surface::setImage(nvtt::InputFormat format, int w, int h, int d, const void * data) { detach(); @@ -453,7 +477,7 @@ bool Surface::setImage(nvtt::InputFormat format, int w, int h, int d, const void { const Color32 * src = (const Color32 *)data; - try { + TRY { for (int i = 0; 
i < count; i++) { rdst[i] = float(src[i].r) / 255.0f; @@ -462,7 +486,7 @@ bool Surface::setImage(nvtt::InputFormat format, int w, int h, int d, const void adst[i] = float(src[i].a) / 255.0f; } } - catch(...) { + CATCH { return false; } } @@ -470,7 +494,7 @@ bool Surface::setImage(nvtt::InputFormat format, int w, int h, int d, const void { const uint16 * src = (const uint16 *)data; - try { + TRY { for (int i = 0; i < count; i++) { ((uint32 *)rdst)[i] = half_to_float(src[4*i+0]); @@ -479,7 +503,7 @@ bool Surface::setImage(nvtt::InputFormat format, int w, int h, int d, const void ((uint32 *)adst)[i] = half_to_float(src[4*i+3]); } } - catch(...) { + CATCH { return false; } } @@ -487,7 +511,7 @@ bool Surface::setImage(nvtt::InputFormat format, int w, int h, int d, const void { const float * src = (const float *)data; - try { + TRY { for (int i = 0; i < count; i++) { rdst[i] = src[4 * i + 0]; @@ -496,7 +520,7 @@ bool Surface::setImage(nvtt::InputFormat format, int w, int h, int d, const void adst[i] = src[4 * i + 3]; } } - catch(...) { + CATCH { return false; } } diff --git a/trunk/src/nvtt/tools/compress.cpp b/trunk/src/nvtt/tools/compress.cpp index fba7a26..8ac5a16 100644 --- a/trunk/src/nvtt/tools/compress.cpp +++ b/trunk/src/nvtt/tools/compress.cpp @@ -270,6 +270,11 @@ int main(int argc, char *argv[]) i++; } } + else if (strcmp("-pause", argv[i]) == 0) + { + printf("Press ENTER\n"); fflush(stdout); + getchar(); + } // Output options else if (strcmp("-silent", argv[i]) == 0) @@ -529,6 +534,11 @@ int main(int argc, char *argv[]) compressionOptions.setColorWeights(1, 1, 0); } + + //compressionOptions.setColorWeights(0.2126, 0.7152, 0.0722); + //compressionOptions.setColorWeights(0.299, 0.587, 0.114); + //compressionOptions.setColorWeights(3, 4, 2); + if (externalCompressor != NULL) { compressionOptions.setExternalCompressor(externalCompressor);