diff --git a/src/nvtt/squish/weightedclusterfit.cpp b/src/nvtt/squish/weightedclusterfit.cpp index 36186b8..018a3ef 100644 --- a/src/nvtt/squish/weightedclusterfit.cpp +++ b/src/nvtt/squish/weightedclusterfit.cpp @@ -135,6 +135,8 @@ void WeightedClusterFit::Compress3( void* block ) Vec4 const zero = VEC4_CONST(0.0f); Vec4 const half(0.5f, 0.5f, 0.5f, 0.25f); Vec4 const two = VEC4_CONST(2.0); + Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); + Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); // declare variables Vec4 beststart = VEC4_CONST( 0.0f ); @@ -173,24 +175,21 @@ void WeightedClusterFit::Compress3( void* block ) Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; - // clamp the output to [0, 1] + // clamp to the grid a = Min( one, Max( zero, a ) ); b = Min( one, Max( zero, b ) ); - - // clamp to the grid - Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); - Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f ); a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp; b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp; - // compute the error - Vec4 e1 = MultiplyAdd( a, alphax_sum, b*betax_sum ); - Vec4 e2 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); - Vec4 e3 = MultiplyAdd( a*b*alphabeta_sum - e1, two, e2 ); - + // compute the error (we skip the constant xxsum) + Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); + Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum ); + Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 ); + Vec4 e4 = MultiplyAdd( two, e3, e1 ); + // apply the metric to the error term - Vec4 e4 = e3 * m_metricSqr; - Vec4 error = e4.SplatX() + e4.SplatY() + e4.SplatZ(); + Vec4 e5 = e4 * m_metricSqr; + Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ(); // keep the solution if it wins if( CompareAnyLessThan( error, besterror ) ) @@ -250,6 +249,9 @@ void WeightedClusterFit::Compress4( void* block ) Vec4 const two = VEC4_CONST(2.0); Vec4 const onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f ); Vec4 const twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f ); + Vec4 const twonineths = VEC4_CONST( 2.0f/9.0f ); + Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); + Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f ); // declare variables Vec4 beststart = VEC4_CONST( 0.0f ); @@ -274,16 +276,16 @@ void WeightedClusterFit::Compress4( void* block ) //Vec3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f); //float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f); - Vec4 const alphax_sum = x0 + MultiplyAdd(x1, twothirds, x2 * onethird); // alphax_sum, alpha2_sum + Vec4 const alphax_sum = MultiplyAdd(x2, onethird, MultiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum Vec4 const alpha2_sum = alphax_sum.SplatW(); //Vec3 const betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f); //float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f); - Vec4 const betax_sum = x3 + MultiplyAdd(x2, twothirds, x1 * onethird); // betax_sum, beta2_sum + Vec4 const betax_sum = MultiplyAdd(x2, twothirds, MultiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum Vec4 const beta2_sum = betax_sum.SplatW(); - //float const alphabeta_sum = w1 * (2.0f/9.0f) + w2 * (2.0f/9.0f); - Vec4 const alphabeta_sum = two * (x1 * onethird + x2 * onethird).SplatW(); // alphabeta_sum + //float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f); + Vec4 const alphabeta_sum = twonineths*( x1 + x2 ).SplatW(); // alphabeta_sum // float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum); Vec4 const factor = Reciprocal( NegativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) ); @@ -291,24 +293,21 @@ void WeightedClusterFit::Compress4( void* block ) Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor; Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor; - // clamp the output to [0, 1] + // clamp to the grid a = Min( one, Max( zero, a ) ); b = Min( one, Max( zero, b ) ); - - // clamp to the grid - Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f ); - Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f ); a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp; b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp; - // compute the error - Vec4 e1 = MultiplyAdd( a, alphax_sum, b*betax_sum ); - Vec4 e2 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); - Vec4 e3 = MultiplyAdd( a*b*alphabeta_sum - e1, two, e2 ); - + // compute the error (we skip the constant xxsum) + Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum ); + Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum ); + Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 ); + Vec4 e4 = MultiplyAdd( two, e3, e1 ); + // apply the metric to the error term - Vec4 e4 = e3 * m_metricSqr; - Vec4 error = e4.SplatX() + e4.SplatY() + e4.SplatZ(); + Vec4 e5 = e4 * m_metricSqr; + Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ(); // keep the solution if it wins if( CompareAnyLessThan( error, besterror ) ) @@ -368,6 +367,12 @@ void WeightedClusterFit::Compress4( void* block ) void WeightedClusterFit::Compress3( void* block ) { + Vec3 const one( 1.0f ); + Vec3 const zero( 0.0f ); + Vec3 const half( 0.5f ); + Vec3 const grid( 31.0f, 63.0f, 31.0f ); + Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); + // declare variables Vec3 beststart( 0.0f ); Vec3 bestend( 0.0f ); @@ -400,16 +405,9 @@ void WeightedClusterFit::Compress3( void* block ) Vec3 a = (alphax_sum*beta2_sum - betax_sum*alphabeta_sum) * factor; Vec3 b = (betax_sum*alpha2_sum - alphax_sum*alphabeta_sum) * factor; - // clamp the output to [0, 1] - Vec3 const one( 1.0f ); - Vec3 const zero( 0.0f ); + // clamp to the grid a = Min( one, Max( zero, a ) ); b = Min( one, Max( zero, b ) ); - - // clamp to the grid - Vec3 const grid( 31.0f, 63.0f, 31.0f ); - Vec3 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f ); - Vec3 const half( 0.5f ); a = Floor( grid*a + half )*gridrcp; b = Floor( grid*b + half )*gridrcp; @@ -470,6 +468,12 @@ void WeightedClusterFit::Compress3( void* block ) void WeightedClusterFit::Compress4( void* block ) { + Vec3 const one( 1.0f ); + Vec3 const zero( 0.0f ); + Vec3 const half( 0.5f ); + Vec3 const grid( 31.0f, 63.0f, 31.0f ); + Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f ); + // declare variables Vec3 beststart( 0.0f ); Vec3 bestend( 0.0f ); @@ -505,16 +509,9 @@ void WeightedClusterFit::Compress4( void* block ) Vec3 a = ( alphax_sum*beta2_sum - betax_sum*alphabeta_sum )*factor; Vec3 b = ( betax_sum*alpha2_sum - alphax_sum*alphabeta_sum )*factor; - // clamp the output to [0, 1] - Vec3 const one( 1.0f ); - Vec3 const zero( 0.0f ); + // clamp to the grid a = Min( one, Max( zero, a ) ); b = Min( one, Max( zero, b ) ); - - // clamp to the grid - Vec3 const grid( 31.0f, 63.0f, 31.0f ); - Vec3 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f ); - Vec3 const half( 0.5f ); a = Floor( grid*a + half )*gridrcp; b = Floor( grid*b + half )*gridrcp;