//=====================================================================
// Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//=====================================================================
// Ref: GPUOpen-Tools/Compressonator

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2016, Intel Corporation
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
// documentation files (the "Software"), to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
// permit persons to whom the Software is furnished to do so, subject to the following conditions:
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of
// the Software.
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

//--------------------------------------
// Common BC7 Header
//--------------------------------------
#include "BC7_Encode_Kernel.h"

#ifndef ASPM
//---------------------------------------------
// Predefinitions for GPU and CPU compiled code
//---------------------------------------------
#define ENABLE_CODE
#ifndef ASPM_GPU
// using code for CPU or hybrid (CPU/GPU)
//#include "BC7.h"
#endif

INLINE CGU_INT a_compare(const void* arg1, const void* arg2)
{
    if (((CMP_di*)arg1)->image - ((CMP_di*)arg2)->image > 0) return 1;
    if (((CMP_di*)arg1)->image - ((CMP_di*)arg2)->image < 0) return -1;
    return 0;
};
#endif

#ifndef ASPM_GPU
CMP_GLOBAL BC7_EncodeRamps BC7EncodeRamps
#ifndef ASPM
    = {0}
#endif
    ;

//---------------------------------------------
// CPU: Computes max of two float values
//---------------------------------------------
float bc7_maxf(float l1, float r1)
{
    return (l1 > r1 ? l1 : r1);
}

//---------------------------------------------
// CPU: Computes min of two float values
//---------------------------------------------
float bc7_minf(float l1, float r1)
{
    return (l1 < r1 ? l1 : r1);
}
#endif

INLINE CGV_EPOCODE shift_right_epocode(CGV_EPOCODE v, CGU_INT bits)
{
    return v >> bits;  // (perf warning expected)
}

INLINE CGV_EPOCODE expand_epocode(CGV_EPOCODE v, CGU_INT bits)
{
    CGV_EPOCODE vv = v << (8 - bits);
    return vv + shift_right_epocode(vv, bits);
}

// valid bit range is 0..8
CGU_INT expandbits(CGU_INT bits, CGU_INT v)
{
    return (v << (8 - bits) | v >> (2 * bits - 8));
}

CMP_EXPORT CGU_INT bc7_isa()
{
#ifndef ASPM_GPU
#if defined(ISPC_TARGET_SSE2)
    ASPM_PRINT(("SSE2"));
    return 0;
#elif defined(ISPC_TARGET_SSE4)
    ASPM_PRINT(("SSE4"));
    return 1;
#elif defined(ISPC_TARGET_AVX)
    ASPM_PRINT(("AVX"));
    return 2;
#elif defined(ISPC_TARGET_AVX2)
    ASPM_PRINT(("AVX2"));
    return 3;
#else
    ASPM_PRINT(("CPU"));
#endif
#endif
    return -1;
}

CMP_EXPORT void init_BC7ramps()
{
#ifdef ASPM_GPU
#else
    CMP_STATIC CGU_BOOL g_rampsInitialized = FALSE;
    if (g_rampsInitialized == TRUE)
        return;
    g_rampsInitialized       = TRUE;
    BC7EncodeRamps.ramp_init = TRUE;

    //bc7_isa();
    ASPM_PRINT((" INIT Ramps\n"));

    CGU_INT bits;
    CGU_INT p1;
    CGU_INT p2;
    CGU_INT clogBC7;
    CGU_INT index;
    CGU_INT j;
    CGU_INT o1;
    CGU_INT o2;

    CGU_INT maxi = 0;

    for (bits = BIT_BASE; bits < BIT_RANGE; bits++)
    {
        for (p1 = 0; p1 < (1 << bits); p1++)
        {
            BC7EncodeRamps.ep_d[BTT(bits)][p1] = (CGV_IMAGE)expand_epocode(p1, bits);
        } //p1<(1 << bits)
    } //bits<BIT_RANGE

    for (clogBC7 = LOG_CL_BASE; clogBC7 < LOG_CL_RANGE; clogBC7++)
    {
        for (bits = BIT_BASE; bits < BIT_RANGE; bits++)
        {
#ifdef USE_BC7_RAMP
            for (p1 = 0; p1 < (1 << bits); p1++)
            {
                for (p2 = 0; p2 < (1 << bits); p2++)
                {
                    for (index = 0; index < (1 << clogBC7); index++)
                    {
                        if (index > maxi)
                            maxi = index;
                        BC7EncodeRamps.ramp[(CLT(clogBC7)*4*256*256*16)+(BTT(bits)*256*256*16)+(p1*256*16)+(p2*16)+index] =
                            //floor((CGV_IMAGE)BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] * (CGV_IMAGE)((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1]))+ 0.5F);
                            floor(BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] * ((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1])) + 0.5F);
                    } //index<(1 << clogBC7)
                } //p2<(1 << bits)
            } //p1<(1 << bits)
#endif

#ifdef USE_BC7_SP_ERR_IDX
            for (j = 0; j < 256; j++)
            {
                for (o1 = 0; o1 < 2; o1++)
                {
                    for (o2 = 0; o2 < 2; o2++)
                    {
                        for (index = 0; index < 16; index++)
                        {
                            BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] = 0;
                            BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1] = 255;
                            BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(j*2*2*16)+(o1*2*16)+(o2*16)+index]                 = 255;
                        } // i<16
                    } //o2<2
                } //o1<2
            } //j<256

            for (p1 = 0; p1 < (1 << bits); p1++)
            {
                for (p2 = 0; p2 < (1 << bits); p2++)
                {
                    for (index = 0; index < (1 << clogBC7); index++)
                    {
#ifdef USE_BC7_RAMP
                        CGV_EPOCODE floatf = (CGV_EPOCODE)BC7EncodeRamps.ramp[(CLT(clogBC7)*4*256*256*16)+(BTT(bits)*256*256*16)+(p1*256*16)+(p2*16)+index];
#else
                        CGV_EPOCODE floatf = floor((CGV_IMAGE)BC7EncodeRamps.ep_d[BTT(bits)][p1] + rampWeights[clogBC7][index] * (CGV_IMAGE)((BC7EncodeRamps.ep_d[BTT(bits)][p2] - BC7EncodeRamps.ep_d[BTT(bits)][p1])) + 0.5F);
#endif
                        BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(floatf*2*2*16*2)+((p1 & 0x1)*2*16*2)+((p2 & 0x1)*16*2)+(index*2)+0] = p1;
                        BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(floatf*2*2*16*2)+((p1 & 0x1)*2*16*2)+((p2 & 0x1)*16*2)+(index*2)+1] = p2;
                        BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(floatf*2*2*16)+((p1 & 0x1)*2*16)+((p2 & 0x1)*16)+index]                 = 0;
                    } //i<(1 << clogBC7)
                } //p2
            } //p1<(1 << bits)

            for (j = 0; j < 256; j++)
            {
                for (o1 = 0; o1 < 2; o1++)
                {
                    for (o2 = 0; o2 < 2; o2++)
                    {
                        for (index = 0; index < (1 << clogBC7); index++)
                        {
                            if ( // check for uninitialized sp_idx
                                (BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] == 0) &&
                                (BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1] == 255))
                            {
                                CGU_INT k;
                                CGU_INT tf;
                                CGU_INT tc;
                                for (k = 1; k < 256; k++)
                                {
                                    tf = j - k;
                                    tc = j + k;
                                    if ((tf >= 0 && BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(tf*2*2*16)+(o1*2*16)+(o2*16)+index] == 0))
                                    {
                                        BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] =
                                            BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(tf*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0];
                                        BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1] =
                                            BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(tf*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+1];
                                        break;
                                    }
                                    else if ((tc < 256 && BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(tc*2*2*16)+(o1*2*16)+(o2*16)+index] == 0))
                                    {
                                        BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(j*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0] =
                                            BC7EncodeRamps.sp_idx[(CLT(clogBC7)*4*256*2*2*16*2)+(BTT(bits)*256*2*2*16*2)+(tc*2*2*16*2)+(o1*2*16*2)+(o2*16*2)+(index*2)+0];
                                        break;
                                    }
                                }

                                //BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(j*2*2*16)+(o1*2*16)+(o2*16)+index] = (CGV_ERROR)k;
                                BC7EncodeRamps.sp_err[(CLT(clogBC7)*4*256*2*2*16)+(BTT(bits)*256*2*2*16)+(j*2*2*16)+(o1*2*16)+(o2*16)+index] = (CGU_UINT8)k;
                            } //sp_idx < 0
                        } //i<(1 << clogBC7)
                    } //o2
                } //o1
            } //j
#endif
        } //bits<BIT_RANGE
    } //clogBC7<LOG_CL_RANGE
#endif
}

INLINE CGV_EPOCODE clampEPO(CGV_EPOCODE v, CGV_EPOCODE a, CGV_EPOCODE b)
{
    if (v < a) return a;
    else if (v > b) return b;
    return v;
}

INLINE CGV_INDEX clampIndex(CGV_INDEX v, CGV_INDEX a, CGV_INDEX b)
{
    if (v < a) return a;
    else if (v > b) return b;
    return v;
}

INLINE CGV_SHIFT32 shift_right_uint32(CGV_SHIFT32 v, CGU_INT bits)
{
    return v >> bits;  // (perf warning expected)
}

INLINE CGV_BYTE shift_right_uint8(CGV_BYTE v, CGU_UINT8 bits)
{
    return v >> bits;  // (perf warning expected)
}

INLINE CGV_BYTE shift_right_uint8V(CGV_BYTE v, CGV_UINT8 bits)
{
    return v >> bits;  // (perf warning expected)
}

// valid bit range is 0..8
INLINE CGV_EPOCODE expandEPObits(CGV_EPOCODE v, uniform CGV_EPOCODE bits)
{
    CGV_EPOCODE vv = v << (8 - bits);
    return vv + shift_right_uint32(vv, bits);
}

CGV_ERROR err_absf(CGV_ERROR a) { return a > 0.0F ? a : -a; }
CGV_IMAGE img_absf(CGV_IMAGE a) { return a > 0.0F ? a : -a; }
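//---------------------------------------------------------------------
// Illustrative sketch only -- not part of the encoder. It restates what
// the ramp[] table built in init_BC7ramps() caches: a bits-wide endpoint
// code is expanded to 8 bits by replicating its high bits (expandbits /
// expand_epocode above), and a ramp entry is the rounded interpolation
// of the two expanded endpoints at a ramp index. The weight set and the
// ((64-w)*a + w*b + 32)/64 form both appear later in this file
// (icmp_GetRamp / icmp_GetQuantizeIndex); the float rampWeights[][]
// tables used above hold the same weights scaled to 0..1. The sketch_*
// names are local to this example.
//---------------------------------------------------------------------
static CGU_INT sketch_expand_to_8bits(CGU_INT v, CGU_INT bits)  // bits in 4..8
{
    // e.g. bits=5: 0x1F -> 0xFF, 0x10 -> 0x84
    return (v << (8 - bits)) | (v >> (2 * bits - 8));
}

static CGU_INT sketch_ramp_entry(CGU_INT p1, CGU_INT p2, CGU_INT index, CGU_INT bits)
{
    static const CGU_INT w2[4] = {0, 21, 43, 64};  // 2-bit index weights
    CGU_INT a = sketch_expand_to_8bits(p1, bits);
    CGU_INT b = sketch_expand_to_8bits(p2, bits);
    // BC7 integer interpolation: ((64 - w) * a + w * b + 32) >> 6
    return ((64 - w2[index]) * a + w2[index] * b + 32) >> 6;
}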
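//---------------------------------------------------------------------
// Companion sketch for the USE_BC7_SP_ERR_IDX tables above, under the
// same assumptions as sketch_ramp_entry(). sp_idx[]/sp_err[] cache, for
// a wanted 8-bit value at a fixed ramp index (plus the endpoint parity
// bits o1/o2, omitted here), the endpoint pair that reproduces it with
// the least error. A direct search looks like this; init_BC7ramps()
// precomputes it so the encoder's hot path is a table read instead of
// an O(2^bits * 2^bits) scan per lookup.
//---------------------------------------------------------------------
static void sketch_best_endpoint_pair(CGU_INT want, CGU_INT index, CGU_INT bits,
                                      CGU_INT* best_p1, CGU_INT* best_p2)
{
    CGU_INT best_err = 256;  // larger than any possible 8-bit difference
    for (CGU_INT p1 = 0; p1 < (1 << bits); p1++)
        for (CGU_INT p2 = 0; p2 < (1 << bits); p2++)
        {
            CGU_INT err = sketch_ramp_entry(p1, p2, index, bits) - want;
            if (err < 0)
                err = -err;
            if (err < best_err)
            {
                best_err = err;
                *best_p1 = p1;
                *best_p2 = p2;
            }
        }
}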
CGU_UINT8 min8(CGU_UINT8 a, CGU_UINT8 b) { return ab?a:b;} void pack_index(CGV_INDEXPACKED packed_index[2], CGV_INDEX src_index[MAX_SUBSET_SIZE]) { // Converts from unpacked index to packed index packed_index[0] = 0x0000; packed_index[1] = 0x0000; CGV_BYTE shift = 0; // was CGV_UINT8 for (CGU_INT k=0; k<16; k++) { packed_index[k/8] |= (CGV_UINT32)(src_index[k]&0x0F) << shift; shift +=4; } } void unpack_index(CGV_INDEX unpacked_index[MAX_SUBSET_SIZE],CGV_INDEXPACKED src_packed[2]) { // Converts from packed index to unpacked index CGV_BYTE shift = 0; // was CGV_UINT8 for (CGV_BYTE k=0; k<16; k++) { unpacked_index[k] = (CGV_BYTE)(src_packed[k/8] >> shift)&0xF; if (k == 7) shift = 0; else shift +=4; } } //====================================== CMP MATH UTILS ============================================ CGV_ERROR err_Total( CGV_IMAGE image_src1[SOURCE_BLOCK_SIZE*MAX_CHANNELS], CGV_IMAGE image_src2[SOURCE_BLOCK_SIZE*MAX_CHANNELS], CGV_ENTRIES numEntries, // < 16 CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS) { CGV_ERROR err_t=0.0F; for (CGU_CHANNEL ch=0;ch 0) { for (CGV_ENTRIES k=0;k> 16; CGV_UINT32 mask = 0x01 << index; return ((mask1 & mask)?2:0 + (mask0 & mask)?1:0); // This can be moved to caller, just return mask!! } void GetPartitionSubSet_mode01237( CGV_IMAGE subsets_out[MAX_SUBSETS][SOURCE_BLOCK_SIZE][MAX_CHANNELS], // OUT: Subset pattern mapped with image src colors CGV_ENTRIES entryCount_out[MAX_SUBSETS], // OUT: Number of entries per subset CGV_UINT8 partition, // Partition Shape 0..63 CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS], // Image colors CGU_INT blockMode, // [0,1,2,3 or 7] CGU_CHANNEL channels3or4) // 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS) { CGU_UINT8 maxSubsets = 2; if (blockMode == 0 || blockMode == 2) maxSubsets = 3; entryCount_out[0] = 0; entryCount_out[1] = 0; entryCount_out[2] = 0; for (CGV_INT i = 0; i < MAX_SUBSET_SIZE; i++) { CGV_UINT8 subset = get_partition_subset(partition,maxSubsets,i); for (CGU_INT ch = 0; ch<3; ch++) subsets_out[subset][entryCount_out[subset]][ch] = image_src[i+(ch*SOURCE_BLOCK_SIZE)]; //subsets_out[subset*64+(entryCount_out[subset]*MAX_CHANNELS+ch)] = image_src[i+(ch*SOURCE_BLOCK_SIZE)]; // if we have only 3 channels then set the alpha subset to 0 if (channels3or4 == 3) subsets_out[subset][entryCount_out[subset]][3] = 0.0F; else subsets_out[subset][entryCount_out[subset]][3] = image_src[i+(COMP_ALPHA*SOURCE_BLOCK_SIZE)]; entryCount_out[subset]++; } } INLINE void GetClusterMean( CGV_IMAGE cluster_mean_out[SOURCE_BLOCK_SIZE][MAX_CHANNELS], CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS], CGV_INDEX index_in[MAX_SUBSET_SIZE], CGV_ENTRIES numEntries, // < 16 CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS) { // unused index values are underfined CGV_INDEX i_cnt[MAX_SUBSET_SIZE]; CGV_INDEX i_comp[MAX_SUBSET_SIZE]; for (CGV_ENTRIES i=0;i< numEntries;i++) for (CGU_CHANNEL ch=0; ch< channels3or4; ch++) { CGV_INDEX idx = index_in[i]&0x0F; cluster_mean_out[idx][ch] = 0; i_cnt[idx]=0; } CGV_INDEX ic = 0; // was CGV_INT for (CGV_ENTRIES i=0;i< numEntries;i++) { CGV_INDEX idx = index_in[i]&0x0F; if (i_cnt[idx]==0) i_comp[ic++]=idx; i_cnt[idx]++; for (CGU_CHANNEL ch=0; ch< channels3or4; ch++) { cluster_mean_out[idx][ch] += image_src[i+(ch*SOURCE_BLOCK_SIZE)]; } } for (CGU_CHANNEL ch=0; ch< channels3or4; ch++) for (CGU_INT i=0;i < ic;i++) { if (i_cnt[i_comp[i]] != 0) { CGV_INDEX icmp = i_comp[i]; cluster_mean_out[icmp][ch] = (CGV_IMAGE) floor( (cluster_mean_out[icmp][ch] / (CGV_IMAGE) i_cnt[icmp]) 
+0.5F); } } } INLINE void GetImageMean( CGV_IMAGE image_mean_out[SOURCE_BLOCK_SIZE*MAX_CHANNELS], CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS], CGV_ENTRIES numEntries, CGU_CHANNEL channels) { for (CGU_CHANNEL ch=0; ch< channels; ch++) image_mean_out[ch] =0; for (CGV_ENTRIES i=0;i< numEntries;i++) for (CGU_CHANNEL ch=0; ch< channels; ch++) image_mean_out[ch] += image_src[i+ch*SOURCE_BLOCK_SIZE]; for (CGU_CHANNEL ch=0; ch< channels; ch++) image_mean_out[ch] /=(CGV_IMAGE) numEntries; // Performance Warning: Conversion from unsigned int to float is slow. Use "int" if possible } // calculate an eigen vector corresponding to a biggest eigen value // will work for non-zero non-negative matricies only void GetEigenVector( CGV_IMAGE EigenVector_out[MAX_CHANNELS], // Normalized Eigen Vector output CGV_IMAGE CovarianceVector[MAX_CHANNELS*MAX_CHANNELS], // Covariance Vector CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA { CGV_IMAGE vector_covIn[MAX_CHANNELS*MAX_CHANNELS]; CGV_IMAGE vector_covOut[MAX_CHANNELS*MAX_CHANNELS]; CGV_IMAGE vector_maxCovariance; for (CGU_CHANNEL ch1=0; ch1 vector_maxCovariance) vector_maxCovariance = vector_covIn[ch+ch*4]; } // Normalize Input Covariance Vector for (CGU_CHANNEL ch1=0; ch1 0) vector_covIn[ch1+ch2*4] = vector_covIn[ch1+ch2*4] / vector_maxCovariance; } for (CGU_CHANNEL ch1=0; ch1 vector_maxCovariance) { maxCovariance_channel = ch; vector_maxCovariance = vector_covOut[ch+ch*4]; } } CGV_IMAGE vector_t = 0; for (CGU_CHANNEL ch=0; ch 0) EigenVector_out[ch] = EigenVector_out[ch] / vector_t; } } CGV_INDEX index_collapse( CGV_INDEX index[MAX_SUBSET_SIZE], CGV_ENTRIES numEntries) { CGV_INDEX minIndex=index[0]; CGV_INDEX MaxIndex=index[0]; for (CGV_ENTRIES k=1;k MaxIndex) MaxIndex = index[k]; } CGV_INDEX D=1; for (CGV_INDEX d=2; d<= MaxIndex-minIndex; d++) { for (CGV_ENTRIES ent=0;ent=numEntries) D =d; break; } } } for (CGV_ENTRIES k=0;k MaxIndex) MaxIndex = index[k]; } return (MaxIndex); } void sortProjected_indexs( CGV_INDEX index_ordered[MAX_SUBSET_SIZE], CGV_IMAGE projection[SOURCE_BLOCK_SIZE], CGV_ENTRIES numEntries // max 16 ) { CMP_di what[SOURCE_BLOCK_SIZE]; for (CGV_INDEX i=0; i < numEntries;i++) { what[i].index = i; what[i].image = projection[i]; } CGV_INDEX tmp_index; CGV_IMAGE tmp_image; for (CGV_ENTRIES i = 1; i < numEntries; i++) { for (CGV_ENTRIES j=i; j>0; j--) { if (what[j - 1].image > what[j].image) { tmp_index = what[j].index; tmp_image = what[j].image; what[j].index = what[j - 1].index; what[j].image = what[j - 1].image; what[j - 1].index = tmp_index; what[j - 1].image = tmp_image; } } } for (CGV_ENTRIES i=0; i < numEntries;i++) index_ordered[i]=what[i].index; }; void sortPartitionProjection( CGV_IMAGE projection[MAX_PARTITION_ENTRIES], CGV_UINT8 order[MAX_PARTITION_ENTRIES], CGU_UINT8 numPartitions // max 64 ) { CMP_du what[MAX_PARTITION_ENTRIES]; for (CGU_UINT8 Parti=0; Parti < numPartitions;Parti++) { what[Parti].index = Parti; what[Parti].image = projection[Parti]; } CGV_UINT8 index; CGV_IMAGE data; for (CGU_UINT8 Parti = 1; Parti < numPartitions; Parti++) { for (CGU_UINT8 Partj=Parti; Partj>0; Partj--) { if (what[Partj - 1].image > what[Partj].image) { index = what[Partj].index; data = what[Partj].image; what[Partj].index = what[Partj - 1].index; what[Partj].image = what[Partj - 1].image; what[Partj - 1].index = index; what[Partj - 1].image = data; } } } for (CGU_UINT8 Parti=0; Parti < numPartitions;Parti++) order[Parti]=what[Parti].index; }; void cmp_Write8Bit( CGV_CMPOUT base[], CGU_INT* uniform offset, CGU_INT bits, CGV_BYTE 
bitVal) { base[*offset/8] |= bitVal << (*offset%8); if (*offset%8+bits>8) { base[*offset/8+1] |= shift_right_uint8(bitVal, 8-*offset%8); } *offset += bits; } void cmp_Write8BitV( CGV_CMPOUT base[], CGV_INT offset, CGU_INT bits, CGV_BYTE bitVal) { base[offset/8] |= bitVal << (offset%8); if (offset%8+bits>8) { base[offset/8+1] |= shift_right_uint8V(bitVal, 8-offset%8); } } INLINE CGV_EPOCODE ep_find_floor( CGV_IMAGE v, CGU_UINT8 bits, CGV_BYTE use_par, CGV_BYTE odd) { CGV_EPOCODE i1=0; CGV_EPOCODE i2=1<<(bits-use_par); odd = use_par ? odd : 0; while (i2-i1>1) { CGV_EPOCODE j = (i1+i2)/2; // Warning in ASMP code CGV_EPOCODE ep_d = expandEPObits((j<= ep_d ) i1=j; else i2=j; } return (i1<>4; fixup[2] = skip_packed&15; } //===================================== COMPRESS CODE ============================================= INLINE void SetDefaultIndex(CGV_INDEX index_io[MAX_SUBSET_SIZE]) { // Use this a final call for (CGU_INT i=0; i image_projected[i]) image_max = image_projected[i]; } CGV_IMAGE img_diff = image_max-image_min; if (img_diff == 0.0f) return; if (isnan(img_diff)) return; image_s = (clusters-1)/img_diff; for (CGV_INDEX i=0; i < numEntries;i++) { image_v[i] = image_projected[i]*image_s; image_z[i] = floor(image_v[i] + 0.5F - image_min *image_s); projected_index_out[i] = (CGV_INDEX)image_z[i]; what[i].image = image_v[i]-image_z[i]- image_min *image_s; what[i].index = i; image_dm+= what[i].image; image_r += what[i].image*what[i].image; } if (numEntries*image_r- image_dm*image_dm >= (CGV_IMAGE)(numEntries-1)/8) { image_dm /= numEntries; for (CGV_INT i=0; i < numEntries;i++) what[i].image -= image_dm; CGV_INDEX tmp_index; CGV_IMAGE tmp_image; for (CGV_ENTRIES i = 1; i < numEntries; i++) { for (CGV_ENTRIES j=i; j>0; j--) { if (what[j - 1].image > what[j].image) { tmp_index = what[j].index; tmp_image = what[j].image; what[j].index = what[j - 1].index; what[j].image = what[j - 1].image; what[j - 1].index = tmp_index; what[j - 1].image = tmp_image; } } } // got into fundamental simplex // move coordinate system origin to its center // i=0 < numEntries avoids varying int division by 0 for (CGV_ENTRIES i=0; i < numEntries;i++) { what[i].image = what[i].image - (CGV_IMAGE) (((2.0f*i+1)-numEntries)/(2.0f*numEntries)); } image_mm=0.0F; image_l=0.0F; CGV_INT j = -1; for (CGV_ENTRIES i=0; i < numEntries;i++) { image_l += what[i].image; if (image_l < image_mm) { image_mm = image_l; j=i; } } j = j + 1; // avoid j = j%numEntries us this while (j > numEntries) j = j - numEntries; for (CGV_ENTRIES i=j; i < numEntries;i++) { CGV_INDEX idx = what[i].index; CGV_INDEX pidx = projected_index_out[idx] + 1; //gather_index(projected_index_out,idx)+1; projected_index_out[idx] = pidx; // scatter_index(projected_index_out,idx,pidx); } } // get minimum index CGV_INDEX index_min=projected_index_out[0]; for (CGV_ENTRIES i=1; i < numEntries;i++) { if (projected_index_out[i] < index_min) index_min = projected_index_out[i]; } // reposition all index by min index (using min index as 0) for (CGV_ENTRIES i=0; i < numEntries;i++) { projected_index_out[i] = clampIndex(projected_index_out[i] - index_min,0,15); } } CGV_ERROR GetQuantizeIndex( CGV_INDEXPACKED index_packed_out[2], CGV_INDEX index_out[MAX_SUBSET_SIZE], // OUT: CGV_IMAGE image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS], CGV_ENTRIES numEntries, //IN: range 0..15 (MAX_SUBSET_SIZE) CGU_INT numClusters, CGU_CHANNEL channels3or4) // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS) { CGV_IMAGE image_centered[SOURCE_BLOCK_SIZE*MAX_CHANNELS]; CGV_IMAGE image_mean[MAX_CHANNELS]; CGV_IMAGE 
eigen_vector[MAX_CHANNELS]; CGV_IMAGE covariance_vector[MAX_CHANNELS*MAX_CHANNELS]; GetImageCentered(image_centered,image_mean, image_src, numEntries, channels3or4); GetCovarianceVector(covariance_vector, image_centered, numEntries, channels3or4); //----------------------------------------------------- // check if all covariances are the same // if so then set all index to same value 0 and return // use EPSILON to set the limit for all same limit //----------------------------------------------------- CGV_IMAGE image_covt=0.0F; for (CGU_CHANNEL ch=0; ch>= 1) clogBC7++; // init epo_0 CGV_EPOCODE epo_0[2*MAX_CHANNELS]; SetDefaultEPOCode(epo_0,0xFF,0,0,0); CGV_INDEX image_log = 0; CGV_INDEX image_idx = 0; CGU_BOOL use_par = FALSE; if (type != 0) use_par = TRUE; CGV_ERROR error_1 = CMP_FLOAT_MAX; for (CGU_INT pn = 0; pn err_tc) image_tcr[ch1] = image_tc; else if (err_tf < err_tc) image_tcr[ch1] = image_tf; else image_tcr[ch1] = (CGV_EPOCODE)floor(image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)] + 0.5F); //image_tcr[ch1] = image_tf + (image_tc - image_tf)/2; //=============================== // Refine this for better quality! //=============================== error_tr = get_sperr(clogBC7,bits[ch1],image_tcr[ch1],t1,t2,iclogBC7); error_tr = (error_tr*error_tr) + 2 * error_tr * img_absf(image_tcr[ch1]- image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)]) + (image_tcr[ch1] - image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)]) * (image_tcr[ch1] - image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)]); if (error_tr < error_ta) { error_ta = error_tr; t1o[ch1] = t1; t2o[ch1] = t2; epo_dr_0[ch1] = clampEPO(image_tcr[ch1],0,255); } #else image_tcr[ch1] = floor(image_src[COMP_RED+(ch1*SOURCE_BLOCK_SIZE)] + 0.5F); error_ta = 0; t1o[ch1] = t1; t2o[ch1] = t2; epo_dr_0[ch1] = clampi(image_tcr[ch1],0,255); #endif } // B } //C error_t += error_ta; } // D if (error_t < error_0) { // We have a solid color: Use image src if on GPU image_log = iclogBC7; image_idx = image_log; #ifdef ASPM_GPU // This needs improving CGV_IMAGE MinC[4] = {255,255,255,255}; CGV_IMAGE MaxC[4] = {0,0,0,0}; // get min max colors for (CGU_CHANNEL ch=0;ch MaxC[ch] ) MaxC[ch] = image_src[k+ch*SOURCE_BLOCK_SIZE]; } for (CGU_CHANNEL ch = 0; chsp_idx,index+0)&0xFF; epo_0[4+ch] = BC7EncodeRamps.sp_idx[index+1]&0xFF;// gather_epocode(u_BC7Encode->sp_idx,index+1)&0xFF; } else { epo_0[ch] = 0; epo_0[4 + ch] = 0; } #else epo_0[ ch] = 0; epo_0[4+ch] = 0; #endif } #endif error_0 = error_t; } //if (error_0 == 0) // break; } // E if (error_0 < error_1) { image_idx = image_log; for (CGU_CHANNEL chE = 0; chE 0) { image_ramp = GetRamp(clogBC7,max_bits[ch],epo_p1,epo_p2,index_cidx[_mc-1]); image_square_diff += sq_image(image_ramp-image_src[(_mc-1)+(ch*SOURCE_BLOCK_SIZE)]); _mc--; } if (image_square_diff < err_ed[(ppA*8)+(ppB*4)+ch]) { err_ed[(ppA*8)+(ppB*4)+ch] = image_square_diff; epo_code_par[ppA][ppB][0][ch] = epo_p1; epo_code_par[ppA][ppB][1][ch] = epo_p2; } } } // pp1 } // pp0 } // j //--------------------------------------------------------- for (CGU_INT pn=0; pn < npv_nd[channels3or4-3][type]; pn++) { CGV_ERROR err_2=0.0F; CGU_INT d1; CGU_INT d2; for (CGU_CHANNEL ch=0; ch>=1) clogBC7++; CGU_INT clt_clogBC7 = CLT(clogBC7); if (clt_clogBC7 > 3) { ASPM_PRINT(("Err: optimize_IndexAndEndPoints, clt_clogBC7\n")); return CMP_FLOAT_MAX; } Mi_ = Mi_ - 1; CGV_INDEX MaxIndex; CGV_INDEX index_tmp[MAX_SUBSET_SIZE]; CGU_INT maxTry = MAX_TRY_SHAKER; CGV_INDEX index_best[MAX_SUBSET_SIZE]; for (CGV_ENTRIES k=0;kerrorThreshold) { break; } CGV_TYPEINT done; done = !(change && better); if ((maxTry 
> 0)&&(!done)) { maxTry--; MaxIndex = index_collapse(index_tmp, numEntries); } else { maxTry = 0; } } while (maxTry); if (err_best == CMP_FLOAT_MAX) { ASPM_PRINT(("Err: requantized_image_err\n")); } return err_best; } CGU_UINT8 get_partitionsToTry(uniform CMP_GLOBAL BC7_Encode u_BC7Encode[],CGU_UINT8 maxPartitions) { CGU_FLOAT u_minPartitionSearchSize = 0.30f; if(u_BC7Encode->quality <= BC7_qFAST_THRESHOLD) // Using this to match performance and quality of CPU code { u_minPartitionSearchSize = u_minPartitionSearchSize + ( u_BC7Encode->quality*BC7_qFAST_THRESHOLD); } else { u_minPartitionSearchSize = u_BC7Encode->quality; } return (CGU_UINT8)(maxPartitions * u_minPartitionSearchSize); } INLINE void cmp_encode_swap(CGV_EPOCODE endpoint[], CGU_INT channels, CGV_INDEX block_index[MAX_SUBSET_SIZE], CGU_INT bits) { CGU_INT levels = 1 << bits; if (block_index[0]>=levels/2) { cmp_swap_epo(&endpoint[0], &endpoint[channels], channels); for (CGU_INT k=0; k0) q = (levels-1)-q; if (k1==0 && k2==0) cmp_Write8Bit(data, pPos, bits-1, CMP_STATIC_CAST(CGV_BYTE,q)); else cmp_Write8Bit(data, pPos, bits , CMP_STATIC_CAST(CGV_BYTE,q)); qbits_shifted >>= 4; flips_shifted >>= 1; } } } INLINE CGV_SHIFT32 pow32(CGV_SHIFT32 x) { return 1<>= 1; packedColours[1] >>= 1; } else if(blockMode == 1) // ONE_PBIT { parityBits[subset][0] = packedColours[1] & 1; parityBits[subset][1] = packedColours[1] & 1; packedColours[0] >>= 1; packedColours[1] >>= 1; } else if(blockMode == 2) { parityBits[subset][0] = 0; parityBits[subset][1] = 0; } for (CGU_INT ch=0; ch>= componentBits; packedColours[1] >>= componentBits; } } // Loop over component for (CGU_INT ch=0; ch < channels; ch++) { // loop over subsets for (CGU_INT subset=0; subsetrotated_channel)); // idxMode 1 bit cmp_Write8Bit(cmp_out, &bitPosition, 1,CMP_STATIC_CAST(CGV_BYTE,params->idxMode)); CGU_INT idxBits[2] = {2,3}; if(params->idxMode) { idxBits[0] = 3; idxBits[1] = 2; // Indicate if we need to fixup the index cmp_swap_index(params->color_index,params->alpha_index,16); cmp_encode_swap(params->alpha_qendpoint, 4, params->color_index,2); cmp_encode_swap(params->color_qendpoint, 4, params->alpha_index,3); } else { cmp_encode_swap(params->color_qendpoint, 4, params->color_index,2); cmp_encode_swap(params->alpha_qendpoint, 4, params->alpha_index,3); } // color endpoints 5 bits each // R0 : R1 // G0 : G1 // B0 : B1 for (CGU_INT component=0; component < 3; component++) { cmp_Write8Bit(cmp_out,&bitPosition,5,CMP_STATIC_CAST(CGV_BYTE,params->color_qendpoint[component])); cmp_Write8Bit(cmp_out,&bitPosition,5,CMP_STATIC_CAST(CGV_BYTE,params->color_qendpoint[4 + component])); } // alpha endpoints (6 bits each) // A0 : A1 cmp_Write8Bit(cmp_out,&bitPosition,6,CMP_STATIC_CAST(CGV_BYTE,params->alpha_qendpoint[0])); cmp_Write8Bit(cmp_out,&bitPosition,6,CMP_STATIC_CAST(CGV_BYTE,params->alpha_qendpoint[4])); // index 2 bits each (31 bits total) cmp_encode_index(cmp_out, &bitPosition, params->color_index, 2); // index 3 bits each (47 bits total) cmp_encode_index(cmp_out, &bitPosition, params->alpha_index, 3); } void Encode_mode5( CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE], varying cmp_mode_parameters* uniform params) { for (CGU_INT k=0; krotated_channel)); cmp_encode_swap(params->color_qendpoint, 4, params->color_index,2); cmp_encode_swap(params->alpha_qendpoint, 4, params->alpha_index,2); // color endpoints (7 bits each) // R0 : R1 // G0 : G1 // B0 : B1 for (CGU_INT component=0; component < 3; component++) { 
cmp_Write8Bit(cmp_out,&bitPosition,7,CMP_STATIC_CAST(CGV_BYTE,params->color_qendpoint[component])); cmp_Write8Bit(cmp_out,&bitPosition,7,CMP_STATIC_CAST(CGV_BYTE,params->color_qendpoint[4 + component])); } // alpha endpoints (8 bits each) // A0 : A1 cmp_Write8Bit(cmp_out,&bitPosition,8,CMP_STATIC_CAST(CGV_BYTE,params->alpha_qendpoint[0])); cmp_Write8Bit(cmp_out,&bitPosition,8,CMP_STATIC_CAST(CGV_BYTE,params->alpha_qendpoint[4])); // color index 2 bits each (31 bits total) // alpha index 2 bits each (31 bits total) cmp_encode_index(cmp_out, &bitPosition, params->color_index, 2); cmp_encode_index(cmp_out, &bitPosition, params->alpha_index, 2); } void Encode_mode6( CGV_INDEX index[MAX_SUBSET_SIZE], CGV_EPOCODE epo_code[8], CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE]) { for (CGU_INT k=0; k> 1)); cmp_Write8Bit(cmp_out, &bitPosition, 7, CMP_STATIC_CAST(CGV_BYTE,epo_code[4 + p] >> 1)); } // p bits cmp_Write8Bit(cmp_out, &bitPosition, 1, epo_code[0]&1); cmp_Write8Bit(cmp_out, &bitPosition, 1, epo_code[4]&1); // quantized values cmp_encode_index(cmp_out, &bitPosition, index, 4); } void Compress_mode01237( CGU_INT blockMode, BC7_EncodeState EncodeState[], uniform CMP_GLOBAL BC7_Encode u_BC7Encode[]) { CGV_INDEX storedBestindex[MAX_PARTITIONS][MAX_SUBSETS][MAX_SUBSET_SIZE]; CGV_ERROR storedError[MAX_PARTITIONS]; CGV_UINT8 sortedPartition[MAX_PARTITIONS]; EncodeState->numPartitionModes = 64; EncodeState->maxSubSets = 2; if (blockMode == 0) { EncodeState->numPartitionModes = 16; EncodeState->channels3or4 = 3; EncodeState->bits = 26; EncodeState->clusters = 8; EncodeState->componentBits = 4; EncodeState->maxSubSets = 3; } else if (blockMode == 2) { EncodeState->channels3or4 = 3; EncodeState->bits = 30; EncodeState->clusters = 4; EncodeState->componentBits = 5; EncodeState->maxSubSets = 3; } else if (blockMode == 1) { EncodeState->channels3or4 = 3; EncodeState->bits = 37; EncodeState->clusters = 8; EncodeState->componentBits = 6; } else if (blockMode == 3) { EncodeState->channels3or4 = 3; EncodeState->bits = 44; EncodeState->clusters = 4; EncodeState->componentBits = 7; } else if (blockMode == 7) { EncodeState->channels3or4 = 4; EncodeState->bits = 42; // (2* (R 5 + G 5 + B 5 + A 5)) + 2 parity bits EncodeState->clusters = 4; EncodeState->componentBits = 5; // 5 bit components } CGV_IMAGE image_subsets[MAX_SUBSETS][MAX_SUBSET_SIZE][MAX_CHANNELS]; CGV_ENTRIES subset_entryCount[MAX_SUBSETS] = {0,0,0}; // Loop over the available partitions for the block mode and quantize them // to figure out the best candidates for further refinement CGU_UINT8 mode_partitionsToTry; mode_partitionsToTry = get_partitionsToTry(u_BC7Encode,EncodeState->numPartitionModes); CGV_UINT8 bestPartition = 0; for (CGU_INT mode_blockPartition = 0; mode_blockPartition < mode_partitionsToTry; mode_blockPartition++) { GetPartitionSubSet_mode01237( image_subsets, subset_entryCount, CMP_STATIC_CAST(CGV_UINT8,mode_blockPartition), EncodeState->image_src, blockMode, EncodeState->channels3or4); CGV_IMAGE subset_image_src[SOURCE_BLOCK_SIZE*MAX_CHANNELS]; CGV_INDEX index_out1[SOURCE_BLOCK_SIZE]; CGV_ERROR err_quant = 0.0F; // Store the quntize error for this partition to be sorted and processed later for (CGU_INT subset=0; subset < EncodeState->maxSubSets; subset++) { CGV_ENTRIES numEntries = subset_entryCount[subset]; for (CGU_INT ii=0; iiclusters, EncodeState->channels3or4); for (CGV_INT idx=0; idx < numEntries; idx++) { storedBestindex[mode_blockPartition][subset][idx] = index_out1[idx]; } } storedError[mode_blockPartition] = err_quant; } // 
Sort the results sortPartitionProjection( storedError, sortedPartition, mode_partitionsToTry); CGV_EPOCODE epo_code[MAX_SUBSETS*2*MAX_CHANNELS]; CGV_EPOCODE bestEndpoints[MAX_SUBSETS*2*MAX_CHANNELS]; CGV_BYTE bestindex[MAX_SUBSETS*MAX_SUBSET_SIZE]; CGV_ENTRIES bestEntryCount[MAX_SUBSETS]; CGV_BYTE bestindex16[MAX_SUBSET_SIZE]; // Extensive shaking is most important when the ramp is short, and // when we have less index. On a long ramp the quality of the // initial quantizing is relatively more important // We modulate the shake size according to the number of ramp index // - the more index we have the less shaking should be required to find a near // optimal match CGU_UINT8 numShakeAttempts = max8(1, min8((CGU_UINT8)floor(8 * u_BC7Encode->quality + 0.5), mode_partitionsToTry)); CGV_ERROR err_best = CMP_FLOAT_MAX; // Now do the endpoint shaking for (CGU_INT nSA =0; nSA < numShakeAttempts; nSA++) { CGV_ERROR err_optimized = 0.0F; CGV_UINT8 sortedBlockPartition; sortedBlockPartition = sortedPartition[nSA]; //******************************************** // Get the partition shape for the given mode //******************************************** GetPartitionSubSet_mode01237( image_subsets, subset_entryCount, sortedBlockPartition, EncodeState->image_src, blockMode, EncodeState->channels3or4); //***************************** // Process the partition shape //***************************** for (CGU_INT subset=0; subset < EncodeState->maxSubSets; subset++) { CGV_ENTRIES numEntries = subset_entryCount[subset]; CGV_IMAGE src_image_block[SOURCE_BLOCK_SIZE*MAX_CHANNELS]; CGV_INDEX index_io[MAX_SUBSET_SIZE]; CGV_EPOCODE tmp_epo_code[8]; for (CGU_INT k=0; kclusters), // Mi_ EncodeState->bits, EncodeState->channels3or4, u_BC7Encode); for (CGU_INT k=0; k < MAX_SUBSET_SIZE; k++) { storedBestindex[sortedBlockPartition][subset][k] = index_io[k]; } for (CGU_INT ch=0; chmaxSubSets; subset++) { CGV_ENTRIES numEntries = subset_entryCount[subset]; bestEntryCount[subset] = numEntries; if(numEntries) { for (CGU_INT ch=0; ch < EncodeState->channels3or4; ch++) { bestEndpoints[(subset*2+0)*4+ch] = epo_code[(subset*2+0)*4+ch]; bestEndpoints[(subset*2+1)*4+ch] = epo_code[(subset*2+1)*4+ch]; } for (CGV_ENTRIES k=0; k< numEntries; k++) { bestindex[subset*MAX_SUBSET_SIZE+k] = storedBestindex[sortedBlockPartition][subset][k]; bestindex16[bestIndexCount++] = storedBestindex[sortedBlockPartition][subset][k]; } } } err_best = err_optimized; // Early out if we found we can compress with error below the quality threshold if(err_best <= u_BC7Encode->errorThreshold) { break; } } } if (blockMode != 7) err_best += EncodeState->opaque_err; if(err_best > EncodeState->best_err) return; //************************** // Save the encoded block //************************** EncodeState->best_err = err_best; // Now we have all the data needed to encode the block // We need to pack the endpoints prior to encoding CGV_TYPEUINT32 packedEndpoints[MAX_SUBSETS*2] = {0,0,0,0,0,0}; for (CGU_INT subset=0; subsetmaxSubSets; subset++) { packedEndpoints[(subset*2)+0] = 0; packedEndpoints[(subset*2)+1] = 0; if(bestEntryCount[subset]) { CGU_UINT32 rightAlignment = 0; // Sort out parity bits if(blockMode != 2) { // Sort out BCC parity bits packedEndpoints[(subset*2)+0] = bestEndpoints[(subset*2+0)*4+0] & 1; packedEndpoints[(subset*2)+1] = bestEndpoints[(subset*2+1)*4+0] & 1; for (CGU_INT ch=0; chchannels3or4; ch++) { bestEndpoints[(subset*2+0)*4+ch] >>= 1; bestEndpoints[(subset*2+1)*4+ch] >>= 1; } rightAlignment++; } // Fixup endpoints for (CGU_INT ch=0; 
chchannels3or4; ch++) { packedEndpoints[(subset*2)+0] |= bestEndpoints[((subset*2)+0)*4+ch] << rightAlignment; packedEndpoints[(subset*2)+1] |= bestEndpoints[((subset*2)+1)*4+ch] << rightAlignment; rightAlignment += EncodeState->componentBits; } } } CGV_UINT8 idxCount[3] = {0, 0, 0}; for (CGV_INT k=0; kmaxSubSets,k); CGV_UINT8 idxC = idxCount[partsub]; bestindex16[k] = bestindex[partsub*MAX_SUBSET_SIZE+idxC]; idxCount[partsub] = idxC + 1; } Encode_mode02137( blockMode, bestPartition, packedEndpoints, bestindex16, EncodeState->cmp_out); } void Compress_mode45( CGU_INT blockMode, BC7_EncodeState EncodeState[], uniform CMP_GLOBAL BC7_Encode u_BC7Encode[]) { cmp_mode_parameters best_candidate; EncodeState->channels3or4 = 4; cmp_memsetBC7((CGV_BYTE *)&best_candidate, 0, sizeof(cmp_mode_parameters)); if (blockMode == 4) { EncodeState->max_idxMode = 2; EncodeState->modeBits[0] = 30; // bits = 2 * (Red 5+ Grn 5+ blu 5) EncodeState->modeBits[1] = 36; // bits = 2 * (Alpha 6+6+6) EncodeState->numClusters0[0] = 4; EncodeState->numClusters0[1] = 8; EncodeState->numClusters1[0] = 8; EncodeState->numClusters1[1] = 4; } else { EncodeState->max_idxMode = 1; EncodeState->modeBits[0] = 42; // bits = 2 * (Red 7+ Grn 7+ blu 7) EncodeState->modeBits[1] = 48; // bits = 2 * (Alpha 8+8+8) = 48 EncodeState->numClusters0[0] = 4; EncodeState->numClusters0[1] = 4; EncodeState->numClusters1[0] = 4; EncodeState->numClusters1[1] = 4; } CGV_IMAGE src_color_Block[SOURCE_BLOCK_SIZE*MAX_CHANNELS]; CGV_IMAGE src_alpha_Block[SOURCE_BLOCK_SIZE*MAX_CHANNELS]; // Go through each possible rotation and selection of index rotationBits) for (CGU_CHANNEL rotated_channel = 0; rotated_channel < EncodeState->channels3or4; rotated_channel++) { // A for (CGU_INT k=0; kimage_src[k+componentRotations[rotated_channel][p+1]*SOURCE_BLOCK_SIZE]; src_alpha_Block[k+p*SOURCE_BLOCK_SIZE] = EncodeState->image_src[k+componentRotations[rotated_channel][0]*SOURCE_BLOCK_SIZE]; } } CGV_ERROR err_quantizer; CGV_ERROR err_bestQuantizer = CMP_FLOAT_MAX; for (CGU_INT idxMode = 0; idxMode < EncodeState->max_idxMode; idxMode++) { // B CGV_INDEXPACKED color_index2[2]; // reserved .. Not used! 
err_quantizer = GetQuantizeIndex( color_index2, best_candidate.color_index, src_color_Block, SOURCE_BLOCK_SIZE, EncodeState->numClusters0[idxMode], 3); err_quantizer += GetQuantizeIndex( color_index2, best_candidate.alpha_index, src_alpha_Block, SOURCE_BLOCK_SIZE, EncodeState->numClusters1[idxMode], 3) / 3.0F; // If quality is high then run the full shaking for this config and // store the result if it beats the best overall error // Otherwise only run the shaking if the error is better than the best // quantizer error if(err_quantizer <= err_bestQuantizer) { err_bestQuantizer = err_quantizer; // Shake size gives the size of the shake cube CGV_ERROR err_overallError; err_overallError = optimize_IndexAndEndPoints( best_candidate.color_index, best_candidate.color_qendpoint, src_color_Block, SOURCE_BLOCK_SIZE, EncodeState->numClusters0[idxMode], CMP_STATIC_CAST(CGU_UINT8,EncodeState->modeBits[0]), 3, u_BC7Encode); // Alpha scalar block err_overallError += optimize_IndexAndEndPoints( best_candidate.alpha_index, best_candidate.alpha_qendpoint, src_alpha_Block, SOURCE_BLOCK_SIZE, EncodeState->numClusters1[idxMode], CMP_STATIC_CAST(CGU_UINT8,EncodeState->modeBits[1]), 3, u_BC7Encode) / 3.0f; // If we beat the previous best then encode the block if(err_overallError < EncodeState->best_err) { best_candidate.idxMode = idxMode; best_candidate.rotated_channel = rotated_channel; if (blockMode == 4) Encode_mode4( EncodeState->cmp_out, &best_candidate); else Encode_mode5( EncodeState->cmp_out, &best_candidate); EncodeState->best_err = err_overallError; } } } // B } // A } void Compress_mode6( BC7_EncodeState EncodeState[], uniform CMP_GLOBAL BC7_Encode u_BC7Encode[]) { CGV_ERROR err; CGV_EPOCODE epo_code_out[8] = {0}; CGV_INDEX best_index_out[MAX_SUBSET_SIZE]; CGV_INDEXPACKED best_packedindex_out[2]; // CGV_IMAGE block_endpoints[8]; // icmp_get_block_endpoints(block_endpoints, EncodeState->image_src, -1, 4); // icmp_GetQuantizedEpoCode(epo_code_out, block_endpoints, 6,4); // err = icmp_GetQuantizeIndex(best_packedindex_out, best_index_out, EncodeState->image_src, 4, block_endpoints, 0,4); err = GetQuantizeIndex( best_packedindex_out, best_index_out, EncodeState->image_src, 16, // numEntries 16, // clusters 4); // channels3or4 //***************************** // Process the partition shape //***************************** err = optimize_IndexAndEndPoints( best_index_out, epo_code_out, EncodeState->image_src, 16, //numEntries 16, // Mi_ = clusters 58, // bits 4, // channels3or4 u_BC7Encode); //************************** // Save the encoded block //************************** if (err < EncodeState->best_err) { EncodeState->best_err = err; Encode_mode6( best_index_out, epo_code_out, EncodeState->cmp_out); } } void copy_BC7_Encode_settings(BC7_EncodeState EncodeState[], uniform CMP_GLOBAL BC7_Encode settings []) { EncodeState->best_err = CMP_FLOAT_MAX; EncodeState->validModeMask = settings->validModeMask; #ifdef USE_ICMP EncodeState->part_count = settings->part_count; EncodeState->channels = settings->channels; #endif } //===================================== ICMP CODE ========================================================= #ifdef USE_ICMP //======================================== // Modified Intel Texture Compression Code //======================================== void icmp_Write32Bit(CGV_CMPOUTPACKED base[], CGU_INT* uniform offset, CGU_INT bits, CGV_CMPOUTPACKED bitVal) { base[*offset / 32] |= ((CGV_CMPOUTPACKED)bitVal) << (*offset % 32); if (*offset % 32 + bits > 32) { base[*offset / 32 + 1] |= 
shift_right_uint32(bitVal, 32 - *offset % 32); } *offset += bits; } //================ 32 bit cmp_out mode encoders =============== INLINE void icmp_swap_epocode(CGV_EPOCODE u[], CGV_EPOCODE v[], CGU_INT n) { for (CGU_INT i = 0; i < n; i++) { CGV_EPOCODE t = u[i]; u[i] = v[i]; v[i] = t; } } void icmp_encode_apply_swap(CGV_EPOCODE endpoint[], CGU_INT channel, CGV_INDEXPACKED block_index[2], CGU_INT bits) { CGU_INT levels = 1 << bits; if ((block_index[0] & 15) >= levels / 2) { icmp_swap_epocode(&endpoint[0], &endpoint[channel], channel); for (CGU_INT k = 0; k < 2; k++) block_index[k] = (CGV_INDEXPACKED)(0x11111111 * (levels - 1)) - block_index[k]; } } void icmp_encode_index(CGV_CMPOUTPACKED data[5], CGU_INT* uniform pPos, CGV_INDEXPACKED block_index[2], CGU_INT bits, CGV_MASK flips) { CGU_INT levels = 1 << bits; CGV_MASK flips_shifted = flips; for (CGU_INT k1 = 0; k1 < 2; k1++) { CGV_CMPOUTPACKED qbits_shifted = block_index[k1]; for (CGU_INT k2 = 0; k2 < 8; k2++) { CGV_CMPOUTPACKED q = qbits_shifted & 15; if ((flips_shifted & 1) > 0) q = (levels - 1) - q; if (k1 == 0 && k2 == 0) icmp_Write32Bit(data, pPos, bits - 1, q); else icmp_Write32Bit(data, pPos, bits, q); qbits_shifted >>= 4; flips_shifted >>= 1; } } } void icmp_bc7_encode_endpoint2(CGV_CMPOUTPACKED data[5], CGU_INT* uniform pPos, CGV_INDEXPACKED color_index[2], CGU_INT bits, CGV_MASK flips) { CGU_INT levels = 1 << bits; CGV_MASK flips_shifted = flips; for (CGU_INT k1 = 0; k1 < 2; k1++) { CGV_INDEXPACKED qbits_shifted = color_index[k1]; for (CGU_INT k2 = 0; k2 < 8; k2++) { CGV_INDEXPACKED q = qbits_shifted & 15; if ((flips_shifted & 1) > 0) q = (levels - 1) - q; if (k1 == 0 && k2 == 0) icmp_Write32Bit(data, pPos, bits - 1, q); else icmp_Write32Bit(data, pPos, bits, q); qbits_shifted >>= 4; flips_shifted >>= 1; } } } INLINE CGV_CMPOUTPACKED icmp_pow2Packed(CGV_FIXUPINDEX x) { return 1 << x; } INLINE void icmp_encode_data_shl_1bit_from(CGV_CMPOUTPACKED data[5], CGV_FIXUPINDEX from) { if (from < 96) { //assert(from > 64+10); CGV_CMPOUTPACKED shifted = (data[2] >> 1) | (data[3] << 31); CGV_CMPOUTPACKED mask = (icmp_pow2Packed(from - 64) - 1) >> 1; data[2] = (mask&data[2]) | (~mask&shifted); data[3] = (data[3] >> 1) | (data[4] << 31); data[4] = data[4] >> 1; } else if (from < 128) { CGV_CMPOUTPACKED shifted = (data[3] >> 1) | (data[4] << 31); CGV_CMPOUTPACKED mask = (icmp_pow2Packed(from - 96) - 1) >> 1; data[3] = (mask&data[3]) | (~mask&shifted); data[4] = data[4] >> 1; } } INLINE void icmp_get_fixuptable(CGV_FIXUPINDEX fixup[3], CGV_PARTID part_id) { // same as CMP SDK v3.1 BC7_FIXUPINDEX1 & BC7_FIXUPINDEX2 for each partition range 0..63 // The data is saved as a packed INT = (BC7_FIXUPINDEX1 << 4 + BC7_FIXUPINDEX2) CMP_STATIC uniform __constant CGV_FIXUPINDEX FIXUPINDEX[] = { // 2 subset partitions 0..63 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x80u, 0x80u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0x60u, 0x80u, 0x20u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x80u, 0x20u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0x60u, 0x60u, 0x20u, 0x60u, 0x80u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0xf0u, 0x20u, 0x20u, 0xf0u, // 3 subset partitions 64..128 0x3fu, 0x38u, 0xf8u, 0xf3u, 0x8fu, 0x3fu, 0xf3u, 0xf8u, 0x8fu, 0x8fu, 0x6fu, 0x6fu, 0x6fu, 0x5fu, 0x3fu, 0x38u, 0x3fu, 0x38u, 0x8fu, 0xf3u, 0x3fu, 0x38u, 0x6fu, 0xa8u, 0x53u, 0x8fu, 0x86u, 0x6au, 0x8fu, 0x5fu, 0xfau, 0xf8u, 0x8fu, 0xf3u, 0x3fu, 0x5au, 0x6au, 0xa8u, 
0x89u, 0xfau, 0xf6u, 0x3fu, 0xf8u, 0x5fu, 0xf3u, 0xf6u, 0xf6u, 0xf8u, 0x3fu, 0xf3u, 0x5fu, 0x5fu, 0x5fu, 0x8fu, 0x5fu, 0xafu, 0x5fu, 0xafu, 0x8fu, 0xdfu, 0xf3u, 0xcfu, 0x3fu, 0x38u }; CGV_FIXUPINDEX skip_packed = FIXUPINDEX[part_id];// gather_int2(FIXUPINDEX, part_id); fixup[0] = 0; fixup[1] = skip_packed >> 4; fixup[2] = skip_packed & 15; } void icmp_bc7_encode_adjust_skip_mode01237_2(CGV_CMPOUTPACKED data[5], CGU_INT mode, CGV_PARTID part_id) { CGU_INT bits = 2; if (mode == 0 || mode == 1) bits = 3; CGU_INT maxSubSets = 2; if (mode == 0 || mode == 2) maxSubSets = 3; CGV_FIXUPINDEX fixup[3]; icmp_get_fixuptable(fixup, part_id); if (maxSubSets > 2 && fixup[1] < fixup[2]) { CGV_FIXUPINDEX t = fixup[1]; fixup[1] = fixup[2]; fixup[2] = t; } for (CGU_INT j = 1; j < maxSubSets; j++) { CGV_FIXUPINDEX k = fixup[j]; icmp_encode_data_shl_1bit_from(data, 128 + (maxSubSets - 1) - (15 - k)*bits); } } INLINE CGV_UINT32 gather_uint32(__constant CGU_UINT32 * const uniform ptr, CGV_INT idx) { return ptr[idx]; // (perf warning expected) } INLINE CGV_MASK icmp_get_partition_mask(CGV_PARTID part_id, CGU_INT subset) { CMP_STATIC uniform __constant CGV_SHIFT32 pattern_mask_table[] = { // 2 subset partitions 0xCCCC3333u, 0x88887777u, 0xEEEE1111u, 0xECC81337u, 0xC880377Fu, 0xFEEC0113u, 0xFEC80137u, 0xEC80137Fu, 0xC80037FFu, 0xFFEC0013u, 0xFE80017Fu, 0xE80017FFu, 0xFFE80017u, 0xFF0000FFu, 0xFFF0000Fu, 0xF0000FFFu, 0xF71008EFu, 0x008EFF71u, 0x71008EFFu, 0x08CEF731u, 0x008CFF73u, 0x73108CEFu, 0x3100CEFFu, 0x8CCE7331u, 0x088CF773u, 0x3110CEEFu, 0x66669999u, 0x366CC993u, 0x17E8E817u, 0x0FF0F00Fu, 0x718E8E71u, 0x399CC663u, 0xAAAA5555u, 0xF0F00F0Fu, 0x5A5AA5A5u, 0x33CCCC33u, 0x3C3CC3C3u, 0x55AAAA55u, 0x96966969u, 0xA55A5AA5u, 0x73CE8C31u, 0x13C8EC37u, 0x324CCDB3u, 0x3BDCC423u, 0x69969669u, 0xC33C3CC3u, 0x99666699u, 0x0660F99Fu, 0x0272FD8Du, 0x04E4FB1Bu, 0x4E40B1BFu, 0x2720D8DFu, 0xC93636C9u, 0x936C6C93u, 0x39C6C639u, 0x639C9C63u, 0x93366CC9u, 0x9CC66339u, 0x817E7E81u, 0xE71818E7u, 0xCCF0330Fu, 0x0FCCF033u, 0x774488BBu, 0xEE2211DDu, // 3 subset partitions 0x08CC0133u, 0x8CC80037u, 0xCC80006Fu, 0xEC001331u, 0x330000FFu, 0x00CC3333u, 0xFF000033u, 0xCCCC0033u, 0x0F0000FFu, 0x0FF0000Fu, 0x00F0000Fu, 0x44443333u, 0x66661111u, 0x22221111u, 0x136C0013u, 0x008C8C63u, 0x36C80137u, 0x08CEC631u, 0x3330000Fu, 0xF0000333u, 0x00EE1111u, 0x88880077u, 0x22C0113Fu, 0x443088CFu, 0x0C22F311u, 0x03440033u, 0x69969009u, 0x9960009Fu, 0x03303443u, 0x00660699u, 0xC22C3113u, 0x8C0000EFu, 0x1300007Fu, 0xC4003331u, 0x004C1333u, 0x22229999u, 0x00F0F00Fu, 0x24929249u, 0x29429429u, 0xC30C30C3u, 0xC03C3C03u, 0x00AA0055u, 0xAA0000FFu, 0x30300303u, 0xC0C03333u, 0x90900909u, 0xA00A5005u, 0xAAA0000Fu, 0x0AAA0555u, 0xE0E01111u, 0x70700707u, 0x6660000Fu, 0x0EE01111u, 0x07707007u, 0x06660999u, 0x660000FFu, 0x00660099u, 0x0CC03333u, 0x03303003u, 0x60000FFFu, 0x80807777u, 0x10100101u, 0x000A0005u, 0x08CE8421u }; CGV_MASK mask_packed = gather_uint32(pattern_mask_table, part_id); CGV_MASK mask0 = mask_packed & 0xFFFF; CGV_MASK mask1 = mask_packed >> 16; CGV_MASK mask = (subset == 2) ? (~mask0)&(~mask1) : ((subset == 0) ? 
mask0 : mask1); return mask; } #ifdef USE_VARYING #ifdef ASPM_GPU INLINE CGV_INDEXPACKED gather_packedindex(CGV_INDEXPACKED* ptr, CGV_FIXUPINDEX idx) { return ptr[idx]; } #else INLINE CGV_INDEXPACKED gather_packedindex(CMP_CONSTANT varying CGV_INDEXPACKED* CMP_CONSTANT uniform ptr, CGV_FIXUPINDEX idx) { return ptr[idx]; // (perf warning expected) } #endif #endif CGV_MASK icmp_encode_apply_swap_mode01237(CGV_EPOCODE qep[], CGV_INDEXPACKED color_index[2], CGU_INT blockMode, CGV_PARTID part_id) { CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3; CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3; CGV_MASK flips = 0; CGU_INT levels = 1 << bits; CGV_FIXUPINDEX fixup[3]; icmp_get_fixuptable(fixup, part_id); for (CGU_INT j = 0; j < maxSubSets; j++) { CGV_FIXUPINDEX k0 = fixup[j]; #ifdef USE_VARYING CGV_INDEXPACKED q = ((gather_packedindex(color_index, k0 >> 3) << (28 - (k0 & 7) * 4)) >> 28); #else CGV_INDEXPACKED q = ((color_index[k0 >> 3] << (28 - (k0 & 7) * 4)) >> 28); #endif if (q >= levels / 2) { icmp_swap_epocode(&qep[8 * j], &qep[8 * j + 4], 4); CGV_MASK partition_mask = icmp_get_partition_mask(part_id, j); flips |= partition_mask; } } return flips; } void icmp_encode_mode01237(CGV_CMPOUTPACKED cmp_out[5], CGV_EPOCODE color_qendpoint[], CGV_INDEXPACKED color_index[2], CGV_PARTID part_id, CGU_INT blockMode) { CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3; CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3; CGU_INT channels = 3; if (blockMode == 7) channels = 4; CGV_MASK flips = icmp_encode_apply_swap_mode01237(color_qendpoint, color_index, blockMode, part_id); for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0; CGU_INT pos = 0; // mode 0-3, 7 icmp_Write32Bit(cmp_out, &pos, blockMode + 1, 1 << blockMode); // partition if (blockMode == 0) { icmp_Write32Bit(cmp_out, &pos, 4, part_id & 15); } else { icmp_Write32Bit(cmp_out, &pos, 6, part_id & 63); } // endpoints for (CGU_INT ch = 0; ch < channels; ch++) for (CGU_INT j = 0; j < maxSubSets * 2; j++) { if (blockMode == 0) { icmp_Write32Bit(cmp_out, &pos, 4, color_qendpoint[j * 4 + 0 + ch] >> 1); } else if (blockMode == 1) { icmp_Write32Bit(cmp_out, &pos, 6, color_qendpoint[j * 4 + 0 + ch] >> 1); } else if (blockMode == 2) { icmp_Write32Bit(cmp_out, &pos, 5, color_qendpoint[j * 4 + 0 + ch]); } else if (blockMode == 3) { icmp_Write32Bit(cmp_out, &pos, 7, color_qendpoint[j * 4 + 0 + ch] >> 1); } else if (blockMode == 7) { icmp_Write32Bit(cmp_out, &pos, 5, color_qendpoint[j * 4 + 0 + ch] >> 1); } //else //{ // assert(false); //} } // p bits if (blockMode == 1) for (CGU_INT j = 0; j < 2; j++) { icmp_Write32Bit(cmp_out, &pos, 1, color_qendpoint[j * 8] & 1); } if (blockMode == 0 || blockMode == 3 || blockMode == 7) for (CGU_INT j = 0; j < maxSubSets * 2; j++) { icmp_Write32Bit(cmp_out, &pos, 1, color_qendpoint[j * 4] & 1); } // quantized values icmp_bc7_encode_endpoint2(cmp_out, &pos, color_index, bits, flips); icmp_bc7_encode_adjust_skip_mode01237_2(cmp_out, blockMode, part_id); } INLINE void icmp_swap_indexpacked(CGV_INDEXPACKED u[], CGV_INDEXPACKED v[], CGU_INT n) { for (CGU_INT i = 0; i < n; i++) { CGV_INDEXPACKED t = u[i]; u[i] = v[i]; v[i] = t; } } void icmp_encode_mode4(CGV_CMPOUTPACKED cmp_out[5], varying cmp_mode_parameters* uniform params) { CGV_EPOCODE color_qendpoint[8]; CGV_INDEXPACKED color_index[2]; CGV_EPOCODE alpha_qendpoint[2]; CGV_INDEXPACKED alpha_index[2]; CGV_CMPOUTPACKED rotated_channel = params->rotated_channel; CGV_SHIFT32 idxMode = params->idxMode; 
icmp_swap_epocode(params->color_qendpoint, color_qendpoint, 8); icmp_swap_indexpacked(params->best_color_index, color_index, 2); icmp_swap_epocode(params->alpha_qendpoint, alpha_qendpoint, 2); icmp_swap_indexpacked(params->best_alpha_index, alpha_index, 2); for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0; CGU_INT pos = 0; // mode 4 (5 bits) 00001 icmp_Write32Bit(cmp_out, &pos, 5, 16); // rotation channel 2 bits icmp_Write32Bit(cmp_out, &pos, 2, (rotated_channel + 1) & 3); // idxMode 1 bit icmp_Write32Bit(cmp_out, &pos, 1, idxMode); if (!idxMode) { icmp_encode_apply_swap(color_qendpoint, 4, color_index, 2); icmp_encode_apply_swap(alpha_qendpoint, 1, alpha_index, 3); } else { icmp_swap_indexpacked(color_index, alpha_index, 2); icmp_encode_apply_swap(alpha_qendpoint, 1, color_index, 2); icmp_encode_apply_swap(color_qendpoint, 4, alpha_index, 3); } // color endpoints 5 bits each // R0 : R1 // G0 : G1 // B0 : B1 for (CGU_INT p = 0; p < 3; p++) { CGV_EPOCODE c0 = color_qendpoint[0 + p]; CGV_EPOCODE c1 = color_qendpoint[4 + p]; icmp_Write32Bit(cmp_out, &pos, 5, c0); // 0 icmp_Write32Bit(cmp_out, &pos, 5, c1); // 1 } // alpha endpoints (6 bits each) // A0 : A1 icmp_Write32Bit(cmp_out, &pos, 6, alpha_qendpoint[0]); icmp_Write32Bit(cmp_out, &pos, 6, alpha_qendpoint[1]); // index data (color index 2 bits each) 31 bits total icmp_encode_index(cmp_out, &pos, color_index, 2, 0); // index data (alpha index 3 bits each) 47 bits total icmp_encode_index(cmp_out, &pos, alpha_index, 3, 0); } void icmp_Encode_mode5(CGV_CMPOUTPACKED cmp_out[5], varying cmp_mode_parameters* uniform params) { CGV_EPOCODE qep[8]; CGV_INDEXPACKED color_index[2]; CGV_EPOCODE alpha_qendpoint[2]; CGV_INDEXPACKED alpha_index[2]; icmp_swap_epocode(params->color_qendpoint, qep, 8); icmp_swap_indexpacked(params->best_color_index, color_index, 2); icmp_swap_epocode(params->alpha_qendpoint, alpha_qendpoint, 2); icmp_swap_indexpacked(params->best_alpha_index, alpha_index, 2); CGV_CMPOUTPACKED rotated_channel = params->rotated_channel; icmp_encode_apply_swap(qep, 4, color_index, 2); icmp_encode_apply_swap(alpha_qendpoint, 1, alpha_index, 2); for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0; CGU_INT pos = 0; // mode 5 icmp_Write32Bit(cmp_out, &pos, 6, 1 << 5); // rotated channel icmp_Write32Bit(cmp_out, &pos, 2, (rotated_channel + 1) & 3); // endpoints for (CGU_INT p = 0; p < 3; p++) { icmp_Write32Bit(cmp_out, &pos, 7, qep[0 + p]); icmp_Write32Bit(cmp_out, &pos, 7, qep[4 + p]); } // alpha endpoints icmp_Write32Bit(cmp_out, &pos, 8, alpha_qendpoint[0]); icmp_Write32Bit(cmp_out, &pos, 8, alpha_qendpoint[1]); // quantized values icmp_encode_index(cmp_out, &pos, color_index, 2, 0); icmp_encode_index(cmp_out, &pos, alpha_index, 2, 0); } void icmp_encode_mode6(CGV_CMPOUTPACKED cmp_out[5], CGV_EPOCODE qep[8], CGV_INDEXPACKED color_index[2]) { icmp_encode_apply_swap(qep, 4, color_index, 4); for (CGU_INT k = 0; k < 5; k++) cmp_out[k] = 0; CGU_INT pos = 0; // mode 6 icmp_Write32Bit(cmp_out, &pos, 7, 64); // endpoints for (CGU_INT p = 0; p < 4; p++) { icmp_Write32Bit(cmp_out, &pos, 7, qep[0 + p] >> 1); icmp_Write32Bit(cmp_out, &pos, 7, qep[4 + p] >> 1); } // p bits icmp_Write32Bit(cmp_out, &pos, 1, qep[0] & 1); icmp_Write32Bit(cmp_out, &pos, 1, qep[4] & 1); // quantized values icmp_encode_index(cmp_out, &pos, color_index, 4, 0); } /////////////////////////// // PCA helpers INLINE void icmp_compute_stats_masked(CGV_IMAGE stats[15], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_CHANNEL channels) { for (CGU_INT i = 0; i < 15; i++) stats[i] = 0; CGV_MASK 
mask_shifted = mask << 1; for (CGU_INT k = 0; k < 16; k++) { mask_shifted >>= 1; //if ((mask_shifted&1) == 0) continue; CGV_MASK flag = (mask_shifted & 1); CGV_IMAGE rgba[4]; for (CGU_CHANNEL ch = 0; ch < channels; ch++) rgba[ch] = image_src[k + ch * 16]; for (CGU_CHANNEL ch = 0; ch < channels; ch++) rgba[ch] *= flag; stats[14] += flag; stats[10] += rgba[0]; stats[11] += rgba[1]; stats[12] += rgba[2]; stats[0] += rgba[0] * rgba[0]; stats[1] += rgba[0] * rgba[1]; stats[2] += rgba[0] * rgba[2]; stats[4] += rgba[1] * rgba[1]; stats[5] += rgba[1] * rgba[2]; stats[7] += rgba[2] * rgba[2]; if (channels == 4) { stats[13] += rgba[3]; stats[3] += rgba[0] * rgba[3]; stats[6] += rgba[1] * rgba[3]; stats[8] += rgba[2] * rgba[3]; stats[9] += rgba[3] * rgba[3]; } } } INLINE void icmp_covar_from_stats(CGV_IMAGE covar[10], CGV_IMAGE stats[15], CGU_CHANNEL channels3or4) { covar[0] = stats[0] - stats[10 + 0] * stats[10 + 0] / stats[14]; covar[1] = stats[1] - stats[10 + 0] * stats[10 + 1] / stats[14]; covar[2] = stats[2] - stats[10 + 0] * stats[10 + 2] / stats[14]; covar[4] = stats[4] - stats[10 + 1] * stats[10 + 1] / stats[14]; covar[5] = stats[5] - stats[10 + 1] * stats[10 + 2] / stats[14]; covar[7] = stats[7] - stats[10 + 2] * stats[10 + 2] / stats[14]; if (channels3or4 == 4) { covar[3] = stats[3] - stats[10 + 0] * stats[10 + 3] / stats[14]; covar[6] = stats[6] - stats[10 + 1] * stats[10 + 3] / stats[14]; covar[8] = stats[8] - stats[10 + 2] * stats[10 + 3] / stats[14]; covar[9] = stats[9] - stats[10 + 3] * stats[10 + 3] / stats[14]; } } INLINE void icmp_compute_covar_dc_masked(CGV_IMAGE covar[6], CGV_IMAGE dc[3], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_INT channels3or4) { CGV_IMAGE stats[15]; icmp_compute_stats_masked(stats, image_src, mask, channels3or4); icmp_covar_from_stats(covar, stats, channels3or4); for (CGU_INT ch = 0; ch < channels3or4; ch++) dc[ch] = stats[10 + ch] / stats[14]; } INLINE void icmp_ssymv3(CGV_IMAGE a[4], CGV_IMAGE covar[10], CGV_IMAGE b[4]) { a[0] = covar[0] * b[0] + covar[1] * b[1] + covar[2] * b[2]; a[1] = covar[1] * b[0] + covar[4] * b[1] + covar[5] * b[2]; a[2] = covar[2] * b[0] + covar[5] * b[1] + covar[7] * b[2]; } INLINE void icmp_ssymv4_2(CGV_IMAGE a[4], CGV_IMAGE covar[10], CGV_IMAGE b[4]) { a[0] = covar[0] * b[0] + covar[1] * b[1] + covar[2] * b[2] + covar[3] * b[3]; a[1] = covar[1] * b[0] + covar[4] * b[1] + covar[5] * b[2] + covar[6] * b[3]; a[2] = covar[2] * b[0] + covar[5] * b[1] + covar[7] * b[2] + covar[8] * b[3]; a[3] = covar[3] * b[0] + covar[6] * b[1] + covar[8] * b[2] + covar[9] * b[3]; } #ifndef ASPM // Computes inverse square root over an implementation-defined range. The maximum error is implementation-defined. 
CGV_IMAGE Image_rsqrt(CGV_IMAGE f) { CGV_IMAGE sf = sqrt(f); if (sf != 0) return 1 / sqrt(f); else return 0.0f; } #endif INLINE void icmp_compute_axis(CGV_IMAGE axis[4], CGV_IMAGE covar[10], #ifdef ASPM_GPU CGV_ITTERATIONS powerIterations, #else uniform __constant CGV_ITTERATIONS powerIterations, #endif CGU_CHANNEL channels) { CGV_IMAGE vec[4] = { 1,1,1,1 }; for (CGU_INT i = 0; i < powerIterations; i++) { if (channels == 3) icmp_ssymv3(axis, covar, vec); if (channels == 4) icmp_ssymv4_2(axis, covar, vec); for (CGU_CHANNEL ch = 0; ch < channels; ch++) vec[ch] = axis[ch]; if (i % 2 == 1) // renormalize every other iteration { CGV_IMAGE norm_sq = 0; for (CGU_CHANNEL ch = 0; ch < channels; ch++) norm_sq += axis[ch] * axis[ch]; #ifndef ASPM CGV_IMAGE rnorm = Image_rsqrt(norm_sq); #else CGV_IMAGE rnorm = rsqrt(norm_sq); #endif for (CGU_CHANNEL ch = 0; ch < channels; ch++) vec[ch] *= rnorm; } } for (CGU_CHANNEL ch = 0; ch < channels; ch++) axis[ch] = vec[ch]; } void icmp_block_pca_axis(CGV_IMAGE axis[4], CGV_IMAGE dc[4], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_INT channels3or4) { uniform __constant CGV_ITTERATIONS powerIterations = 8; // 4 not enough for HQ CGV_IMAGE covar[10]; icmp_compute_covar_dc_masked(covar, dc, image_src, mask, channels3or4); CGV_IMAGE inv_var = 1.0 / (256 * 256); for (CGU_INT k = 0; k < 10; k++) { covar[k] *= inv_var; } CGV_IMAGE eps = sq_image(0.001F); covar[0] += eps; covar[4] += eps; covar[7] += eps; covar[9] += eps; icmp_compute_axis(axis, covar, powerIterations, channels3or4); } CGV_IMAGE minImage(CGV_IMAGE a, CGV_IMAGE b) { return a < b ? a : b; } CGV_IMAGE maxImage(CGV_IMAGE a, CGV_IMAGE b) { return a > b ? a : b; } void icmp_block_segment_core(CGV_IMAGE epo_code[], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_INT channels3or4) { CGV_IMAGE axis[4]; CGV_IMAGE dc[4]; icmp_block_pca_axis(axis, dc, image_src, mask, channels3or4); CGV_IMAGE ext[2]; ext[0] = +1e32; ext[1] = -1e32; // find min/max CGV_MASK mask_shifted = mask << 1; for (CGU_INT k = 0; k < 16; k++) { mask_shifted >>= 1; if ((mask_shifted & 1) == 0) continue; CGV_IMAGE dot = 0; for (CGU_INT ch = 0; ch < channels3or4; ch++) dot += axis[ch] * (image_src[16 * ch + k] - dc[ch]); ext[0] = minImage(ext[0], dot); ext[1] = maxImage(ext[1], dot); } // create some distance if the endpoints collapse if (ext[1] - ext[0] < 1.0f) { ext[0] -= 0.5f; ext[1] += 0.5f; } for (CGU_INT i = 0; i < 2; i++) for (CGU_INT ch = 0; ch < channels3or4; ch++) { epo_code[4 * i + ch] = ext[i] * axis[ch] + dc[ch]; } } INLINE CGV_IMAGE clampf(CGV_IMAGE v, CGV_IMAGE a, CGV_IMAGE b) { if (v < a) return a; else if (v > b) return b; return v; } void icmp_get_block_endpoints(CGV_IMAGE block_endpoints[], CGV_IMAGE image_src[64], CGV_MASK mask, CGU_CHANNEL channels3or4) { icmp_block_segment_core(block_endpoints, image_src, mask, channels3or4); for (CGU_INT i = 0; i < 2; i++) for (CGU_INT ch = 0; ch < channels3or4; ch++) { block_endpoints[4 * i + ch] = clampf(block_endpoints[4 * i + ch], 0.0f, 255.0f); } } void icmp_ep_quant0367_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT blockMode, CGU_INT channels) { CGU_INT bits = 7; if (blockMode == 0) bits = 4; if (blockMode == 7) bits = 5; CGU_INT levels = 1 << bits; CGU_INT levels2 = levels * 2 - 1; for (CGU_INT i = 0; i < 2; i++) { CGV_EPOCODE qep_b[8]; for (CGU_INT b = 0; b < 2; b++) for (CGU_INT p = 0; p < 4; p++) { CGV_EPOCODE v = (CGV_TYPEINT)((ep[i * 4 + p] / 255.0f*levels2 - b) / 2.0f + 0.5f) * 2 + b; qep_b[b * 4 + p] = clampEPO(v, b, levels2 - 1 + b); } CGV_IMAGE ep_b[8]; for (CGU_INT j = 0; j < 8; 
j++) ep_b[j] = qep_b[j]; if (blockMode == 0) for (CGU_INT j = 0; j < 8; j++) ep_b[j] = expandEPObits(qep_b[j], 5); CGV_ERROR err0 = 0.0f; CGV_ERROR err1 = 0.0f; for (CGU_INT ch = 0; ch < channels; ch++) { err0 += sq_image(ep[i * 4 + ch] - ep_b[0 + ch]); err1 += sq_image(ep[i * 4 + ch] - ep_b[4 + ch]); } for (CGU_INT p = 0; p < 4; p++) qep[i * 4 + p] = (err0 < err1) ? qep_b[0 + p] : qep_b[4 + p]; } } void icmp_ep_quant245_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT mode) { CGU_INT bits = 5; if (mode == 5) bits = 7; CGU_INT levels = 1 << bits; for (CGU_INT i = 0; i < 8; i++) { CGV_EPOCODE v = ((CGV_TYPEINT)(ep[i] / 255.0f*(levels - 1) + 0.5)); qep[i] = clampEPO(v, 0, levels - 1); } } void icmp_ep_quant1_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT mode) { CGV_EPOCODE qep_b[16]; for (CGU_INT b = 0; b < 2; b++) for (CGU_INT i = 0; i < 8; i++) { CGV_EPOCODE v = ((CGV_TYPEINT)((ep[i] / 255.0f*127.0f - b) / 2 + 0.5)) * 2 + b; qep_b[b * 8 + i] = clampEPO(v, b, 126 + b); } // dequant CGV_IMAGE ep_b[16]; for (CGU_INT k = 0; k < 16; k++) ep_b[k] = expandEPObits(qep_b[k], 7); CGV_ERROR err0 = 0.0f; CGV_ERROR err1 = 0.0f; for (CGU_INT j = 0; j < 2; j++) for (CGU_INT p = 0; p < 3; p++) { err0 += sq_image(ep[j * 4 + p] - ep_b[0 + j * 4 + p]); err1 += sq_image(ep[j * 4 + p] - ep_b[8 + j * 4 + p]); } for (CGU_INT i = 0; i < 8; i++) qep[i] = (err0 < err1) ? qep_b[0 + i] : qep_b[8 + i]; } void icmp_ep_quant2_2(CGV_EPOCODE qep[], CGV_IMAGE ep[], CGU_INT blockMode, CGU_INT channels3or4) { //assert(mode <= 7); CMP_STATIC uniform __constant CGV_SUBSETS SubSetTable[] = { 3,2,3,2,1,1,1,2 }; #ifndef ASPM_GPU uniform CMP_CONSTANT #endif CGV_SUBSETS maxSubSets = SubSetTable[blockMode]; if (blockMode == 0 || blockMode == 3 || blockMode == 6 || blockMode == 7) { for (CGU_INT i = 0; i < maxSubSets; i++) icmp_ep_quant0367_2(&qep[i * 8], &ep[i * 8], blockMode, channels3or4); } else if (blockMode == 1) { for (CGU_INT i = 0; i < maxSubSets; i++) icmp_ep_quant1_2(&qep[i * 8], &ep[i * 8], blockMode); } else if (blockMode == 2 || blockMode == 4 || blockMode == 5) { for (CGU_INT i = 0; i < maxSubSets; i++) icmp_ep_quant245_2(&qep[i * 8], &ep[i * 8], blockMode); } // else // assert(false); } void icmp_ep_dequant2(CGV_IMAGE ep[], CGV_EPOCODE qep[], CGU_INT blockMode) { //assert(mode <= 7); CMP_STATIC uniform __constant CGV_SUBSETS subSetTable[] = { 3,2,3,2,1,1,1,2 }; #ifndef ASPM_GPU uniform CMP_CONSTANT #endif CGV_SUBSETS maxSubSets = subSetTable[blockMode]; // mode 3, 6 are 8-bit if (blockMode == 3 || blockMode == 6) { for (CGU_INT i = 0; i < 8 * maxSubSets; i++) ep[i] = qep[i]; } else if (blockMode == 1 || blockMode == 5) { for (CGU_INT i = 0; i < 8 * maxSubSets; i++) ep[i] = expandEPObits(qep[i], 7); } else if (blockMode == 0 || blockMode == 2 || blockMode == 4) { for (CGU_INT i = 0; i < 8 * maxSubSets; i++) ep[i] = expandEPObits(qep[i], 5); } else if (blockMode == 7) { for (CGU_INT i = 0; i < 8 * maxSubSets; i++) ep[i] = expandEPObits(qep[i], 6); } //else // assert(false); } void icmp_GetQuantizedEpoCode(CGV_EPOCODE epo_code_out[], CGV_IMAGE block_endpoints[], CGU_INT blockMode, CGU_CHANNEL channels3or4) { icmp_ep_quant2_2(epo_code_out, block_endpoints, blockMode, channels3or4); icmp_ep_dequant2(block_endpoints, epo_code_out, blockMode); } void icmp_ep_quant_dequant_mode4(CGV_EPOCODE qep[], CGV_IMAGE ep[]) { icmp_ep_quant2_2(qep, ep, 4, 3); icmp_ep_dequant2(ep, qep, 4); } /////////////////////////// // pixel quantization //======================================== // Modified Intel Texture Compression Code 
//======================================== INLINE uniform __constant CGV_RAMP* uniform icmp_GetRamp(CGU_INT bits) { //assert(bits>=2 && bits<=4); // invalid bit size CMP_STATIC uniform __constant CGV_RAMP unquant_table_2bits[] = { 0, 21, 43, 64 }; CMP_STATIC uniform __constant CGV_RAMP unquant_table_3bits[] = { 0, 9, 18, 27, 37, 46, 55, 64 }; CMP_STATIC uniform __constant CGV_RAMP unquant_table_4bits[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; uniform __constant CGV_RAMP* uniform unquant_tables[] = { unquant_table_2bits, unquant_table_3bits, unquant_table_4bits }; return unquant_tables[bits - 2]; } #ifdef USE_VARYING INLINE CGV_IMAGE gather_image(varying CGV_IMAGE* uniform ptr, CGV_SHIFT32 idx) { return ptr[idx]; // (perf warning expected) } #endif INLINE CGV_RAMP gather_ramp( #ifdef ASPM_GPU CMP_CONSTANT CGV_RAMP* ptr, #else CMP_CONSTANT CGV_RAMP* CMP_CONSTANT uniform ptr, #endif CGV_INDEX idx) { return ptr[idx]; // (perf warning expected) } CGV_ERROR icmp_GetQuantizeIndex( CGV_INDEXPACKED index_packed_out[2], CGV_INDEX index_out[MAX_SUBSET_SIZE], CGV_IMAGE image_src[64], CGU_INT bits, CGV_IMAGE image_block[], CGV_SHIFT32 pattern, CGU_CHANNEL channels3or4) { CGV_ERROR total_err = 0; uniform __constant CGV_RAMP* uniform Ramp = icmp_GetRamp(bits); CGV_LEVELS levels = 1 << bits; // 64-bit color_qendpoint: 5% overhead in this function for (CGU_INT k = 0; k < 2; k++) index_packed_out[k] = 0; CGV_SHIFT32 pattern_shifted = pattern; for (CGU_INT k = 0; k < 16; k++) { CGV_SHIFT32 j = pattern_shifted & 3; pattern_shifted >>= 2; CGV_IMAGE proj = 0; CGV_IMAGE div = 0; for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) { #ifdef USE_VARYING CGV_IMAGE ep_a = gather_image(image_block, 8 * j + 0 + ch); CGV_IMAGE ep_b = gather_image(image_block, 8 * j + 4 + ch); #else CGV_IMAGE ep_a = image_block[8 * j + 0 + ch]; CGV_IMAGE ep_b = image_block[8 * j + 4 + ch]; #endif proj += (image_src[k + ch * 16] - ep_a)*(ep_b - ep_a); div += sq_image(ep_b - ep_a); } proj /= div; CGV_INDEX index_q1 = (CGV_INDEX)(proj*levels + 0.5); index_q1 = clampIndex(index_q1, 1, levels - 1); CGV_ERROR err0 = 0; CGV_ERROR err1 = 0; CGV_RAMP ramp0 = gather_ramp(Ramp, index_q1 - 1); CGV_RAMP ramp1 = gather_ramp(Ramp, index_q1); for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) { #ifdef USE_VARYING CGV_IMAGE ep_a = gather_image(image_block, 8 * j + 0 + ch); CGV_IMAGE ep_b = gather_image(image_block, 8 * j + 4 + ch); #else CGV_IMAGE ep_a = image_block[8 * j + 0 + ch]; CGV_IMAGE ep_b = image_block[8 * j + 4 + ch]; #endif CGV_IMAGE dec_v0 = (CGV_TYPEINT)(((64 - ramp0)*ep_a + ramp0 * ep_b + 32) / 64); CGV_IMAGE dec_v1 = (CGV_TYPEINT)(((64 - ramp1)*ep_a + ramp1 * ep_b + 32) / 64); err0 += sq_image(dec_v0 - image_src[k + ch * 16]); err1 += sq_image(dec_v1 - image_src[k + ch * 16]); } CGV_ERROR best_err = err1; CGV_INDEX best_index = index_q1; if (err0 < err1) { best_err = err0; best_index = index_q1 - 1; } index_out[k] = best_index; index_packed_out[k / 8] += ((CGV_INDEXPACKED)best_index) << 4 * (k % 8); total_err += best_err; } return total_err; } /////////////////////////// // LS endpoint refinement void icmp_opt_endpoints(CGV_IMAGE ep[], CGV_IMAGE image_src[64], CGU_INT bits, CGV_INDEXPACKED color_qendpoint[2], CGV_MASK mask, CGU_CHANNEL channels3or4) { CGU_INT levels = 1 << bits; CGV_IMAGE Atb1[4] = { 0,0,0,0 }; CGV_IMAGE sum_q = 0; CGV_IMAGE sum_qq = 0; CGV_IMAGE sum[5] = { 0,0,0,0,0 }; CGV_MASK mask_shifted = mask << 1; for (CGU_INT k1 = 0; k1 < 2; k1++) { CGV_INDEXPACKED qbits_shifted = color_qendpoint[k1]; for 
(CGU_INT k2 = 0; k2 < 8; k2++) { CGU_INT k = k1 * 8 + k2; CGV_IMAGE q = (CGV_TYPEINT)(qbits_shifted & 15); qbits_shifted >>= 4; mask_shifted >>= 1; if ((mask_shifted & 1) == 0) continue; CGV_LEVELS x = (levels - 1) - q; CGV_LEVELS y = q; sum_q += q; sum_qq += q * q; sum[4] += 1; for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) sum[ch] += image_src[k + ch * 16]; for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) Atb1[ch] += x * image_src[k + ch * 16]; } } CGV_IMAGE Atb2[4]; for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) { //sum[ch] = dc[ch]*16; Atb2[ch] = (levels - 1)*sum[ch] - Atb1[ch]; } CGV_IMAGE Cxx = sum[4] * sq_image(levels - 1) - 2 * (levels - 1)*sum_q + sum_qq; CGV_IMAGE Cyy = sum_qq; CGV_IMAGE Cxy = (levels - 1)*sum_q - sum_qq; CGV_IMAGE scale = (levels - 1) / (Cxx*Cyy - Cxy * Cxy); for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) { ep[0 + ch] = (Atb1[ch] * Cyy - Atb2[ch] * Cxy)*scale; ep[4 + ch] = (Atb2[ch] * Cxx - Atb1[ch] * Cxy)*scale; //ep[0+ch] = clamp(ep[0+ch], 0, 255); //ep[4+ch] = clamp(ep[4+ch], 0, 255); } if (img_absf(Cxx*Cyy - Cxy * Cxy) < 0.001f) { // flatten for (CGU_CHANNEL ch = 0; ch < channels3or4; ch++) { ep[0 + ch] = sum[ch] / sum[4]; ep[4 + ch] = ep[0 + ch]; } } } ////////////////////////// // parameter estimation void icmp_channel_quant_dequant2(CGV_EPOCODE qep[2], CGV_IMAGE ep[2], CGU_INT epbits) { CGV_LEVELS elevels = (1 << epbits); for (CGU_INT i = 0; i < 2; i++) { CGV_EPOCODE v = ((CGV_EPOCODE)(ep[i] / 255.0f*(elevels - 1) + 0.5f)); qep[i] = clampEPO(v, 0, elevels - 1); ep[i] = expandEPObits(qep[i], epbits); } } void icmp_refineEndpoints(CGV_IMAGE ep[2], CGV_IMAGE block[16], CGU_INT bits, CGV_INDEXPACKED color_index[2]) { CGU_INT levels = 1 << bits; CGV_IMAGE Atb1 = 0; CGV_IMAGE sum_q = 0; CGV_IMAGE sum_qq = 0; CGV_IMAGE sum = 0; for (CGU_INT k1 = 0; k1 < 2; k1++) { CGV_INDEXPACKED qbits_shifted = color_index[k1]; for (CGU_INT k2 = 0; k2 < 8; k2++) { CGU_INT k = k1 * 8 + k2; CGV_IMAGE q = (CGV_TYPEINT)(qbits_shifted & 15); qbits_shifted >>= 4; CGV_TYPEINT x = (levels - 1) - q; CGV_TYPEINT y = q; sum_q += q; sum_qq += q * q; sum += block[k]; Atb1 += x * block[k]; } } CGV_IMAGE Atb2 = (levels - 1)*sum - Atb1; CGV_IMAGE Cxx = 16 * sq_image(levels - 1) - 2 * (levels - 1)*sum_q + sum_qq; CGV_IMAGE Cyy = sum_qq; CGV_IMAGE Cxy = (levels - 1)*sum_q - sum_qq; CGV_IMAGE scale = (levels - 1) / (Cxx*Cyy - Cxy * Cxy); ep[0] = (Atb1*Cyy - Atb2 * Cxy)*scale; ep[1] = (Atb2*Cxx - Atb1 * Cxy)*scale; ep[0] = clampf(ep[0], 0.0f, 255.0f); ep[1] = clampf(ep[1], 0.0f, 255.0f); if (img_absf(Cxx*Cyy - Cxy * Cxy) < 0.001) { ep[0] = sum / 16; ep[1] = ep[0]; } } CGV_ERROR icmp_channelQuantizeIndex(CGV_INDEXPACKED color_index[2], CGV_INDEX index[MAX_SUBSET_SIZE], CGV_IMAGE block[16], CGU_INT bits, CGV_IMAGE ep[]) { uniform __constant CGV_RAMP* uniform Ramp = icmp_GetRamp(bits); CGV_LEVELS levels = (1 << bits); color_index[0] = 0; color_index[1] = 0; CGV_ERROR total_err = 0; for (CGU_INT k = 0; k < 16; k++) { CGV_IMAGE proj = (block[k] - ep[0]) / (ep[1] - ep[0] + 0.001f); CGV_INDEX q1 = (CGV_TYPEINT)(proj*levels + 0.5); q1 = clampEPO(q1, 1, levels - 1); CGV_ERROR err0 = 0; CGV_ERROR err1 = 0; CGV_RAMP ramp0 = gather_ramp(Ramp, q1 - 1); CGV_RAMP ramp1 = gather_ramp(Ramp, q1); CGV_IMAGE dec_v0 = (CGV_TYPEINT)(((64 - ramp0)*ep[0] + ramp0 * ep[1] + 32) / 64); CGV_IMAGE dec_v1 = (CGV_TYPEINT)(((64 - ramp1)*ep[0] + ramp1 * ep[1] + 32) / 64); err0 += sq_image(dec_v0 - block[k]); err1 += sq_image(dec_v1 - block[k]); CGV_TYPEINT best_err = err1; CGV_INDEX best_q = q1; if (err0 < err1) { 
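        // err0 belongs to ramp index q1-1 and err1 to q1; keep the lower
        // neighbour when it reconstructs this texel with less squared error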
best_err = err0; best_q = q1 - 1; } index[k] = best_q; color_index[k / 8] += ((CGV_INDEXPACKED)best_q) << 4 * (k % 8); total_err += best_err; } return total_err; } CGV_ERROR icmp_optQuantizeIndex(BC7_EncodeState EncodeState[], CGV_INDEXPACKED color_index[2], CGV_INDEX index[MAX_SUBSET_SIZE], CGV_EPOCODE qep[2], CGV_IMAGE block[16], CGU_INT bits, CGU_INT epbits) { CGV_IMAGE ep[2] = { 255,0 }; for (CGU_INT k = 0; k < 16; k++) { ep[0] = minImage(ep[0], block[k]); ep[1] = maxImage(ep[1], block[k]); } icmp_channel_quant_dequant2(qep, ep, epbits); CGV_ERROR err = icmp_channelQuantizeIndex(color_index, index, block, bits, ep); // refine #ifndef ASPM_GPU uniform CMP_CONSTANT #endif CGV_ITTERATIONS refineIterations = EncodeState->refineIterations; for (CGU_INT i = 0; i < refineIterations; i++) { icmp_refineEndpoints(ep, block, bits, color_index); icmp_channel_quant_dequant2(qep, ep, epbits); err = icmp_channelQuantizeIndex(color_index, index, block, bits, ep); } return err; } INLINE CGV_SHIFT32 icmp_get_pattern2(CGV_PARTID part_id) { CMP_STATIC uniform __constant CGV_SHIFT32 pattern_table[] = { 0x50505050u, 0x40404040u, 0x54545454u, 0x54505040u, 0x50404000u, 0x55545450u, 0x55545040u, 0x54504000u, 0x50400000u, 0x55555450u, 0x55544000u, 0x54400000u, 0x55555440u, 0x55550000u, 0x55555500u, 0x55000000u, 0x55150100u, 0x00004054u, 0x15010000u, 0x00405054u, 0x00004050u, 0x15050100u, 0x05010000u, 0x40505054u, 0x00404050u, 0x05010100u, 0x14141414u, 0x05141450u, 0x01155440u, 0x00555500u, 0x15014054u, 0x05414150u, 0x44444444u, 0x55005500u, 0x11441144u, 0x05055050u, 0x05500550u, 0x11114444u, 0x41144114u, 0x44111144u, 0x15055054u, 0x01055040u, 0x05041050u, 0x05455150u, 0x14414114u, 0x50050550u, 0x41411414u, 0x00141400u, 0x00041504u, 0x00105410u, 0x10541000u, 0x04150400u, 0x50410514u, 0x41051450u, 0x05415014u, 0x14054150u, 0x41050514u, 0x41505014u, 0x40011554u, 0x54150140u, 0x50505500u, 0x00555050u, 0x15151010u, 0x54540404u, 0xAA685050u, 0x6A5A5040u, 0x5A5A4200u, 0x5450A0A8u, 0xA5A50000u, 0xA0A05050u, 0x5555A0A0u, 0x5A5A5050u, 0xAA550000u, 0xAA555500u, 0xAAAA5500u, 0x90909090u, 0x94949494u, 0xA4A4A4A4u, 0xA9A59450u, 0x2A0A4250u, 0xA5945040u, 0x0A425054u, 0xA5A5A500u, 0x55A0A0A0u, 0xA8A85454u, 0x6A6A4040u, 0xA4A45000u, 0x1A1A0500u, 0x0050A4A4u, 0xAAA59090u, 0x14696914u, 0x69691400u, 0xA08585A0u, 0xAA821414u, 0x50A4A450u, 0x6A5A0200u, 0xA9A58000u, 0x5090A0A8u, 0xA8A09050u, 0x24242424u, 0x00AA5500u, 0x24924924u, 0x24499224u, 0x50A50A50u, 0x500AA550u, 0xAAAA4444u, 0x66660000u, 0xA5A0A5A0u, 0x50A050A0u, 0x69286928u, 0x44AAAA44u, 0x66666600u, 0xAA444444u, 0x54A854A8u, 0x95809580u, 0x96969600u, 0xA85454A8u, 0x80959580u, 0xAA141414u, 0x96960000u, 0xAAAA1414u, 0xA05050A0u, 0xA0A5A5A0u, 0x96000000u, 0x40804080u, 0xA9A8A9A8u, 0xAAAAAA44u, 0x2A4A5254u }; return gather_uint32(pattern_table, part_id); } CGV_IMAGE icmp_get_pca_bound(CGV_IMAGE covar[10], CGU_CHANNEL channels) { uniform __constant CGV_TYPEINT powerIterations = 4; // quite approximative, but enough for bounding CGV_IMAGE inv_var = 1.0 / (256 * 256); for (CGU_INT k = 0; k < 10; k++) { covar[k] *= inv_var; } CGV_IMAGE eps = sq_image(0.001); covar[0] += eps; covar[4] += eps; covar[7] += eps; CGV_IMAGE axis[4]; icmp_compute_axis(axis, covar, powerIterations, channels); CGV_IMAGE vec[4]; if (channels == 3) icmp_ssymv3(vec, covar, axis); if (channels == 4) icmp_ssymv4_2(vec, covar, axis); CGV_IMAGE sq_sum = 0.0f; for (CGU_INT p = 0; p < channels; p++) sq_sum += sq_image(vec[p]); CGV_IMAGE lambda = sqrt(sq_sum); CGV_IMAGE bound = covar[0] + covar[4] + covar[7]; if 
(channels == 4) bound += covar[9]; bound -= lambda; bound = maxImage(bound, 0.0f); return bound; } CGV_IMAGE icmp_block_pca_bound_split2(CGV_IMAGE image_src[64], CGV_MASK mask, CGV_IMAGE full_stats[15], CGU_CHANNEL channels) { CGV_IMAGE stats[15]; icmp_compute_stats_masked(stats, image_src, mask, channels); CGV_IMAGE covar1[10]; icmp_covar_from_stats(covar1, stats, channels); for (CGU_INT i = 0; i < 15; i++) stats[i] = full_stats[i] - stats[i]; CGV_IMAGE covar2[10]; icmp_covar_from_stats(covar2, stats, channels); CGV_IMAGE bound = 0.0f; bound += icmp_get_pca_bound(covar1, channels); bound += icmp_get_pca_bound(covar2, channels); return sqrt(bound) * 256; } #ifdef USE_VARYING INLINE void scatter_partid(varying CGV_PARTID* uniform ptr, CGV_TYPEINT idx, CGV_PARTID value) { ptr[idx] = value; // (perf warning expected) } #endif void icmp_sort_partlist(CGV_PARTID list[], CGU_INT length, CGU_INT partial_count) { for (CGU_INT k = 0; k < partial_count; k++) { CGV_TYPEINT best_idx = k; CGV_PARTID best_value = list[k]; for (CGU_INT i = k + 1; i < length; i++) { if (best_value > list[i]) { best_value = list[i]; best_idx = i; } } // swap #ifdef USE_VARYING scatter_partid(list, best_idx, list[k]); #else list[best_idx] = list[k]; #endif list[k] = best_value; } } INLINE void copy_epocode(CGV_EPOCODE u[], CGV_EPOCODE v[], CGU_INT n) { for (CGU_INT i = 0; i < n; i++) { u[i] = v[i]; } } INLINE void copy_indexpacked(CGV_INDEXPACKED u[], CGV_INDEXPACKED v[], CGU_INT n) { for (CGU_INT i = 0; i < n; i++) { u[i] = v[i]; } } void icmp_enc_mode4_candidate( BC7_EncodeState EncodeState[], cmp_mode_parameters best_candidate[], CGV_ERROR best_err[], CGU_INT rotated_channel, CGU_INT idxMode) { CGU_INT bits = 2; CGU_INT abits = 3; CGU_INT aepbits = 6; if (idxMode == 1) { bits = 3; abits = 2; } CGV_IMAGE src_block[48]; for (CGU_INT k = 0; k < 16; k++) { for (CGU_INT p = 0; p < 3; p++) src_block[k + p * 16] = EncodeState->image_src[k + p * 16]; if (rotated_channel < 3) { // apply channel rotation if (EncodeState->channels == 4) src_block[k + rotated_channel * 16] = EncodeState->image_src[k + 3 * 16]; if (EncodeState->channels == 3) src_block[k + rotated_channel * 16] = 255; } } CGV_IMAGE block_endpoints[8]; CGV_INDEXPACKED color_index[2]; CGV_INDEX c_index[MAX_SUBSET_SIZE]; CGV_EPOCODE color_qendpoint[8]; icmp_get_block_endpoints(block_endpoints, src_block, -1, 3); icmp_ep_quant_dequant_mode4(color_qendpoint, block_endpoints); CGV_ERROR err = icmp_GetQuantizeIndex(color_index, c_index, src_block, bits, block_endpoints, 0, 3); // refine CGU_INT refineIterations = EncodeState->refineIterations; for (CGU_INT i = 0; i < refineIterations; i++) { icmp_opt_endpoints(block_endpoints, src_block, bits, color_index, -1, 3); icmp_ep_quant_dequant_mode4(color_qendpoint, block_endpoints); err = icmp_GetQuantizeIndex(color_index, c_index, src_block, bits, block_endpoints, 0, 3); } // encoding selected channel CGV_EPOCODE alpha_qendpoint[2]; CGV_INDEXPACKED alpha_index[2]; CGV_INDEX a_index[MAX_SUBSET_SIZE]; err += icmp_optQuantizeIndex(EncodeState, alpha_index, a_index, alpha_qendpoint, &EncodeState->image_src[rotated_channel * 16], abits, aepbits); if (err < *best_err) { copy_epocode(best_candidate->color_qendpoint, color_qendpoint, 8); copy_epocode(best_candidate->alpha_qendpoint, alpha_qendpoint, 2); copy_indexpacked(best_candidate->best_color_index, color_index, 2); copy_indexpacked(best_candidate->best_alpha_index, alpha_index, 2); best_candidate->rotated_channel = rotated_channel; best_candidate->idxMode = idxMode; *best_err = err; 
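        // this rotation/index-mode pair beat the previous best mode-4 candidate;
        // its endpoints and packed indices were captured above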
} } void icmp_mode5_candidate( BC7_EncodeState EncodeState[], cmp_mode_parameters best_candidate[], CGV_ERROR best_err[], CGU_INT rotated_channel) { CGU_INT bits = 2; CGU_INT abits = 2; CGU_INT aepbits = 8; CGV_IMAGE block[48]; for (CGU_INT k = 0; k < 16; k++) { for (CGU_INT p = 0; p < 3; p++) block[k + p * 16] = EncodeState->image_src[k + p * 16]; if (rotated_channel < 3) { // apply channel rotation if (EncodeState->channels == 4) block[k + rotated_channel * 16] = EncodeState->image_src[k + 3 * 16]; if (EncodeState->channels == 3) block[k + rotated_channel * 16] = 255; } } CGV_IMAGE block_endpoints[8]; CGV_EPOCODE color_qendpoint[8]; CGV_INDEXPACKED color_index[2]; CGV_INDEX c_index[MAX_SUBSET_SIZE]; icmp_get_block_endpoints(block_endpoints, block, -1, 3); icmp_GetQuantizedEpoCode(color_qendpoint, block_endpoints, 5, 3); CGV_ERROR err = icmp_GetQuantizeIndex(color_index, c_index, block, bits, block_endpoints, 0, 3); // refine CGU_INT refineIterations = EncodeState->refineIterations; for (CGU_INT i = 0; i < refineIterations; i++) { icmp_opt_endpoints(block_endpoints, block, bits, color_index, -1, 3); icmp_GetQuantizedEpoCode(color_qendpoint, block_endpoints, 5, 3); err = icmp_GetQuantizeIndex(color_index, c_index, block, bits, block_endpoints, 0, 3); } // encoding selected channel CGV_EPOCODE alpha_qendpoint[2]; CGV_INDEXPACKED alpha_index[2]; CGV_INDEX a_index[MAX_SUBSET_SIZE]; err += icmp_optQuantizeIndex(EncodeState, alpha_index, a_index, alpha_qendpoint, &EncodeState->image_src[rotated_channel * 16], abits, aepbits); if (err < *best_err) { icmp_swap_epocode(best_candidate->color_qendpoint, color_qendpoint, 8); icmp_swap_indexpacked(best_candidate->best_color_index, color_index, 2); icmp_swap_epocode(best_candidate->alpha_qendpoint, alpha_qendpoint, 2); icmp_swap_indexpacked(best_candidate->best_alpha_index, alpha_index, 2); best_candidate->rotated_channel = rotated_channel; *best_err = err; } } // =============== Mode Compression CGV_ERROR icmp_enc_mode01237_part_fast( CGV_EPOCODE qep[24], CGV_INDEXPACKED color_index[2], CGV_INDEX index[MAX_SUBSET_SIZE], CGV_IMAGE image_src[64], CGV_PARTID part_id, CGU_INT blockMode) { CGV_SHIFT32 pattern = icmp_get_pattern2(part_id); CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3; CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3; CGU_CHANNEL channels = 3; if (blockMode == 7) channels = 4; CGV_IMAGE block_endpoints[24]; for (CGU_INT subset = 0; subset < maxSubSets; subset++) { CGV_MASK partition_mask = icmp_get_partition_mask(part_id, subset); icmp_get_block_endpoints(&block_endpoints[subset * 8], image_src, partition_mask, channels); } icmp_GetQuantizedEpoCode(qep, block_endpoints, blockMode, channels); CGV_ERROR total_err = icmp_GetQuantizeIndex(color_index, index, image_src, bits, block_endpoints, pattern, channels); return total_err; } void icmp_enc_mode01237(BC7_EncodeState EncodeState[], CGU_INT blockMode, CGV_PARTID part_list[], CGU_INT part_count) { if (part_count == 0) return; CGU_INT bits = 2; if (blockMode == 0 || blockMode == 1) bits = 3; CGU_INT maxSubSets = 2; if (blockMode == 0 || blockMode == 2) maxSubSets = 3; CGU_CHANNEL channels = 3; if (blockMode == 7) channels = 4; CGV_EPOCODE best_qep[24]; CGV_INDEXPACKED best_endpoint[2]; CGV_PARTID best_part_id = -1; CGV_ERROR best_err = 1e99; for (CGU_INT part = 0; part < part_count; part++) { CGV_PARTID part_id = part_list[part] & 63; if (maxSubSets == 3) part_id += 64; CGV_EPOCODE qep[24]; CGV_INDEXPACKED color_index[2]; CGV_INDEX 
index[MAX_SUBSET_SIZE]; CGV_ERROR err = icmp_enc_mode01237_part_fast(qep, color_index, index, EncodeState->image_src, part_id, blockMode); if (err < best_err) { for (CGU_INT subset = 0; subset < 8 * maxSubSets; subset++) best_qep[subset] = qep[subset]; for (CGU_INT k = 0; k < 2; k++) best_endpoint[k] = color_index[k]; best_part_id = part_id; best_err = err; } } // refine CGU_INT refineIterations = EncodeState->refineIterations; for (CGU_INT _i = 0; _i < refineIterations; _i++) { CGV_IMAGE ep[24]; for (CGU_INT subset = 0; subset < maxSubSets; subset++) { CGV_SHIFT32 partition_mask = icmp_get_partition_mask(best_part_id, subset); icmp_opt_endpoints(&ep[subset * 8], EncodeState->image_src, bits, best_endpoint, partition_mask, channels); } CGV_EPOCODE qep[24]; CGV_INDEXPACKED color_index[2]; CGV_INDEX index[MAX_SUBSET_SIZE]; icmp_GetQuantizedEpoCode(qep, ep, blockMode, channels); CGV_SHIFT32 pattern = icmp_get_pattern2(best_part_id); CGV_ERROR err = icmp_GetQuantizeIndex(color_index, index, EncodeState->image_src, bits, ep, pattern, channels); if (err < best_err) { for (CGU_INT subset = 0; subset < 8 * maxSubSets; subset++) best_qep[subset] = qep[subset]; for (CGU_INT k = 0; k < 2; k++) best_endpoint[k] = color_index[k]; best_err = err; } } if (blockMode != 7) best_err += EncodeState->opaque_err; // take into account alpha channel if (best_err < EncodeState->best_err) { EncodeState->best_err = best_err; icmp_encode_mode01237(EncodeState->best_cmp_out, best_qep, best_endpoint, best_part_id, blockMode); } } void icmp_mode5(BC7_EncodeState EncodeState[]) { cmp_mode_parameters best_candidate; CGV_ERROR best_err = EncodeState->best_err; #ifdef ASPM_GPU cmp_memsetBC7((CGV_BYTE *)&best_candidate, 0, sizeof(cmp_mode_parameters)); #else memset(&best_candidate, 0, sizeof(cmp_mode_parameters)); #endif for (CGU_CHANNEL ch = 0; ch < EncodeState->channels; ch++) { icmp_mode5_candidate(EncodeState, &best_candidate, &best_err, ch); } if (best_err < EncodeState->best_err) { EncodeState->best_err = best_err; EncodeState->cmp_isout16Bytes = FALSE; icmp_Encode_mode5(EncodeState->best_cmp_out, &best_candidate); } } void icmp_mode6(BC7_EncodeState EncodeState[]) { CGV_IMAGE block_endpoints[8]; icmp_get_block_endpoints(block_endpoints, EncodeState->image_src, -1, 4); CGV_EPOCODE epo_code[8]; icmp_GetQuantizedEpoCode(epo_code, block_endpoints, 6, 4); CGV_INDEXPACKED color_index[2]; CGV_INDEX index[MAX_SUBSET_SIZE]; CGV_ERROR err = icmp_GetQuantizeIndex(color_index, index, EncodeState->image_src, 4, block_endpoints, 0, 4); // refine CGU_INT refineIterations = EncodeState->refineIterations; for (CGU_INT i = 0; i < refineIterations; i++) { icmp_opt_endpoints(block_endpoints, EncodeState->image_src, 4, color_index, -1, 4); icmp_GetQuantizedEpoCode(epo_code, block_endpoints, 6, EncodeState->channels); err = icmp_GetQuantizeIndex(color_index, index, EncodeState->image_src, 4, block_endpoints, 0, 4); } if (err < EncodeState->best_err) { EncodeState->best_err = err; EncodeState->cmp_isout16Bytes = FALSE; icmp_encode_mode6(EncodeState->best_cmp_out, epo_code, color_index); } } void icmp_mode02(BC7_EncodeState EncodeState[]) { CGV_PARTID part_list[64]; for (CGU_INT part = 0; part < 64; part++) part_list[part] = part; if (EncodeState->validModeMask & 0x01) icmp_enc_mode01237(EncodeState, 0, part_list, 16); if (EncodeState->validModeMask & 0x04) icmp_enc_mode01237(EncodeState, 2, part_list, 64); // usually not worth the time } void icmp_mode7(BC7_EncodeState EncodeState[]) { CGV_IMAGE full_stats[15]; 
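    // Rank all 64 two-subset partitions by a cheap PCA-based error bound,
    // then run the full mode-7 encode only on the best part_count of them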
icmp_compute_stats_masked(full_stats, EncodeState->image_src, -1, EncodeState->channels); CGV_PARTID part_list[64]; for (CGU_INT part = 0; part < 64; part++) { CGV_MASK partition_mask = icmp_get_partition_mask(part + 0, 0); CGV_IMAGE bound12 = icmp_block_pca_bound_split2(EncodeState->image_src, partition_mask, full_stats, EncodeState->channels); CGV_PARTID bound = (CGV_TYPEINT)(bound12); part_list[part] = part + bound * 64; } icmp_sort_partlist(part_list, 64, EncodeState->part_count); icmp_enc_mode01237(EncodeState, 7, part_list, EncodeState->part_count); } void icmp_mode13(BC7_EncodeState EncodeState[]) { CGV_IMAGE full_stats[15]; icmp_compute_stats_masked(full_stats, EncodeState->image_src, -1, 3); CGV_PARTID part_list[64]; for (CGU_INT part = 0; part < 64; part++) { CGV_MASK partition_mask = icmp_get_partition_mask(part + 0, 0); CGV_IMAGE bound12 = icmp_block_pca_bound_split2(EncodeState->image_src, partition_mask, full_stats, 3); CGV_PARTID bound = (CGV_TYPEINT)(bound12); part_list[part] = part + bound * 64; } icmp_sort_partlist(part_list, 64, EncodeState->part_count); if (EncodeState->validModeMask & 0x02) icmp_enc_mode01237(EncodeState, 1, part_list, EncodeState->part_count); if (EncodeState->validModeMask & 0x08) icmp_enc_mode01237(EncodeState, 3, part_list, EncodeState->part_count); } void icmp_mode4(BC7_EncodeState EncodeState[]) { cmp_mode_parameters best_candidate; CGV_ERROR best_err = EncodeState->best_err; #ifdef ASPM_GPU cmp_memsetBC7((CGV_BYTE *)&best_candidate, 0, sizeof(cmp_mode_parameters)); #else memset(&best_candidate, 0, sizeof(cmp_mode_parameters)); #endif for (CGU_CHANNEL rotated_channel = 0; rotated_channel < EncodeState->channels; rotated_channel++) { icmp_enc_mode4_candidate(EncodeState, &best_candidate, &best_err, rotated_channel, 0); icmp_enc_mode4_candidate(EncodeState, &best_candidate, &best_err, rotated_channel, 1); } // mode 4 if (best_err < EncodeState->best_err) { EncodeState->best_err = best_err; icmp_encode_mode4(EncodeState->best_cmp_out, &best_candidate); } } #endif //===================================== COMPRESS CODE ============================================= bool notValidBlockForMode( CGU_UINT32 blockMode, CGU_BOOL blockNeedsAlpha, CGU_BOOL blockAlphaZeroOne, uniform CMP_GLOBAL BC7_Encode u_BC7Encode[]) { // Do we need to skip alpha processing blocks if((blockNeedsAlpha == FALSE) && (blockMode > 3)) { return TRUE; } // Optional restriction for colour-only blocks so that they // don't use modes that have combined colour+alpha - this // avoids the possibility that the encoder might choose an // alpha other than 1.0 (due to parity) and cause something to // become accidentally slightly transparent (it's possible that // when encoding 3-component texture applications will assume that // the 4th component can safely be assumed to be 1.0 all the time) if ((blockNeedsAlpha == FALSE) && (u_BC7Encode->colourRestrict == TRUE) && ((blockMode == 6)||(blockMode == 7))) // COMBINED_ALPHA { return TRUE; } // Optional restriction for blocks with alpha to avoid issues with // punch-through or thresholded alpha encoding if((blockNeedsAlpha == TRUE) && (u_BC7Encode->alphaRestrict == TRUE) && (blockAlphaZeroOne == TRUE) && ((blockMode == 6)||(blockMode == 7))) // COMBINED_ALPHA { return TRUE; } return FALSE; } void BC7_CompressBlock( BC7_EncodeState EncodeState[], uniform CMP_GLOBAL BC7_Encode u_BC7Encode[]) { CGU_BOOL blockNeedsAlpha = FALSE; CGU_BOOL blockAlphaZeroOne = FALSE; CGV_ERROR alpha_err = 0.0f; CGV_IMAGE alpha_min = 255.0F; for (CGU_INT k=0; 
k<SOURCE_BLOCK_SIZE; k++)
    {
        if (EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] < alpha_min)
            alpha_min = EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE];
        alpha_err += sq_image(EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE]-255.0F);
        if (blockAlphaZeroOne == FALSE)
        {
            if ((EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] == 255.0F) ||
                (EncodeState->image_src[k+COMP_ALPHA*SOURCE_BLOCK_SIZE] == 0.0F))
            {
                blockAlphaZeroOne = TRUE;
            }
        }
    }

    if (alpha_min != 255.0F)
    {
        blockNeedsAlpha = TRUE;
    }

    EncodeState->best_err   = CMP_FLOAT_MAX;
    EncodeState->opaque_err = alpha_err;

#ifdef USE_ICMP
    EncodeState->refineIterations = 4;
    EncodeState->fastSkipTreshold = 4;
    EncodeState->channels         = 4;
    EncodeState->part_count       = 64;
    EncodeState->cmp_isout16Bytes = FALSE;
#else
    EncodeState->cmp_isout16Bytes = TRUE;
#endif

    // We change the order in which we visit the block modes to try to maximize the chance
    // that we manage to early out as quickly as possible. This is a significant performance
    // optimization for the lower quality modes, where the exit threshold is higher, and it
    // also tends to improve quality: the generally higher-quality modes are enumerated
    // earlier, so the first encoding that passes the threshold tends to pass by a greater
    // margin than it would with a naive ordering, which improves the overall error.
    CGU_INT blockModeOrder[NUM_BLOCK_TYPES] = {4, 6, 1, 3, 0, 2, 7, 5};

    for (CGU_INT block=0; block < NUM_BLOCK_TYPES; block++)
    {
        CGU_INT blockMode = blockModeOrder[block];

        if (u_BC7Encode->quality < BC7_qFAST_THRESHOLD)
        {
            if (notValidBlockForMode(blockMode, blockNeedsAlpha, blockAlphaZeroOne, u_BC7Encode))
                continue;
        }

        CGU_INT Mode = 0x0001 << blockMode;
        if (!(u_BC7Encode->validModeMask & Mode))
            continue;

        switch (blockMode)
        {
        // image processing with no alpha
        case 0:
#ifdef USE_ICMP
            icmp_mode02(EncodeState);
#else
            Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
#endif
            break;
        case 1:
#ifdef USE_ICMP
            icmp_mode13(EncodeState);
#else
            Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
#endif
            break;
        case 2:
#ifdef USE_ICMP
            icmp_mode13(EncodeState);
#else
            Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
#endif
            break;
        case 3:
#ifdef USE_ICMP
            icmp_mode13(EncodeState);
#else
            Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
#endif
            break;
        // image processing with alpha
        case 4:
#ifdef USE_ICMP
            icmp_mode4(EncodeState);
#else
            Compress_mode45(blockMode, EncodeState, u_BC7Encode);
#endif
            break;
        case 5:
#ifdef USE_ICMP
            icmp_mode5(EncodeState);
#else
            Compress_mode45(blockMode, EncodeState, u_BC7Encode);
#endif
            break;
        case 6:
#ifdef USE_ICMP
            icmp_mode6(EncodeState);
#else
            Compress_mode6(EncodeState, u_BC7Encode);
#endif
            break;
        case 7:
#ifdef USE_ICMP
            icmp_mode7(EncodeState);
#else
            Compress_mode01237(blockMode, EncodeState, u_BC7Encode);
#endif
            break;
        }

        // Early out if we found we can compress with error below the quality threshold
        if (EncodeState->best_err <= u_BC7Encode->errorThreshold)
        {
            break;
        }
    }
}

//====================================== BC7_ENCODECLASS END =============================================

#ifndef ASPM_GPU
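// A minimal standalone sketch (not part of the original kernel; the helper
// name is hypothetical) of the mode gating used in the loop above: bit N of
// validModeMask enables BC7 mode N, so e.g. a mask of 0xCF enables modes
// 0-3, 6 and 7 while skipping the rotation modes 4 and 5.
static inline bool sketch_isModeEnabled(CGU_UINT32 validModeMask, CGU_INT blockMode)
{
    return (validModeMask & (0x0001u << blockMode)) != 0;
}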
INLINE void load_block_interleaved_rgba2(CGV_IMAGE image_src[64], uniform texture_surface* uniform src, CGUV_BLOCKWIDTH block_xx, CGU_INT block_yy)
{
    for (CGU_INT y=0; y<4; y++)
    for (CGU_INT x=0; x<4; x++)
    {
        CGU_UINT32 * uniform src_ptr = (CGV_SHIFT32*)&src->ptr[(block_yy*4+y)*src->stride];
#ifdef USE_VARYING
        CGV_SHIFT32 rgba = gather_partid(src_ptr, block_xx*4+x);
        image_src[16*0+y*4+x] = (CGV_FLOAT)((rgba>> 0)&255);
        image_src[16*1+y*4+x] = (CGV_FLOAT)((rgba>> 8)&255);
        image_src[16*2+y*4+x] = (CGV_FLOAT)((rgba>>16)&255);
        image_src[16*3+y*4+x] = (CGV_FLOAT)((rgba>>24)&255);
#else
        CGV_SHIFT32 rgba = src_ptr[block_xx*4+x];
        image_src[16*0+y*4+x] = (CGU_FLOAT)((rgba>> 0)&255);
        image_src[16*1+y*4+x] = (CGU_FLOAT)((rgba>> 8)&255);
        image_src[16*2+y*4+x] = (CGU_FLOAT)((rgba>>16)&255);
        image_src[16*3+y*4+x] = (CGU_FLOAT)((rgba>>24)&255);
#endif
    }
}

#if defined(CMP_USE_FOREACH_ASPM) || defined(USE_VARYING)
INLINE void scatter_uint2(CGU_UINT32 * ptr, CGUV_BLOCKWIDTH idx, CGV_SHIFT32 value)
{
    ptr[idx] = value; // (perf warning expected)
}
#endif

INLINE void store_data_uint32(CGU_UINT8 dst[], CGU_INT width, CGUV_BLOCKWIDTH v_xx, CGU_INT yy, CGV_SHIFT32 data[], CGU_INT data_size)
{
    // Each block emits data_size 32-bit words; a row of blocks spans width*4 bytes
    for (CGU_INT k=0; k<data_size; k++)
    {
#if defined(CMP_USE_FOREACH_ASPM) || defined(USE_VARYING)
        scatter_uint2((CGU_UINT32*)&dst[yy*width*4], v_xx*data_size+k, data[k]);
#else
        ((CGU_UINT32*)&dst[yy*width*4])[v_xx*data_size+k] = data[k];
#endif
    }
}

INLINE void store_data_uint8(CGU_UINT8 dst[], CGU_INT width, CGUV_BLOCKWIDTH v_xx, CGU_INT yy, CGU_UINT8 data[], CGU_INT data_size)
{
    for (CGU_INT k=0; k<data_size; k++)
    {
        dst[yy*width*4 + v_xx*data_size + k] = data[k];
    }
}

void CompressBlockBC7_XY(uniform texture_surface u_srcptr[], CGUV_BLOCKWIDTH block_x, CGU_INT block_y, CGU_UINT8 u_dst[], uniform BC7_Encode u_settings[])
{
    BC7_EncodeState _state = {0};
    BC7_EncodeState *state = &_state;
    copy_BC7_Encode_settings(state, u_settings);

    load_block_interleaved_rgba2(state->image_src, u_srcptr, block_x, block_y);
    BC7_CompressBlock(state, u_settings);

    if (state->cmp_isout16Bytes)
        store_data_uint8(u_dst, u_srcptr->width, block_x, block_y, state->cmp_out, 16);
    else
        store_data_uint32(u_dst, u_srcptr->width, block_x, block_y, state->best_cmp_out, 4);
}

CMP_EXPORT void CompressBlockBC7_encode(uniform texture_surface src[], CGU_UINT8 dst[], uniform BC7_Encode settings[])
{
    // bc7_isa();
    ASPM_PRINT(("ASPM encode %d [%d,%d]\n", bc7_isa(), src->width, src->height));

    for (CGU_INT u_yy = 0; u_yy < src->height/4; u_yy++)
#ifdef CMP_USE_FOREACH_ASPM
    foreach (v_xx = 0 ... src->width/4)
    {
#else
    for (CGUV_BLOCKWIDTH v_xx = 0; v_xx < src->width/4; v_xx++)
    {
#endif
        CompressBlockBC7_XY(src, v_xx, u_yy, dst, settings);
    }
}
#endif

#ifndef ASPM_GPU
#ifndef ASPM
//======================= DECOMPRESS =========================================
#ifndef USE_HIGH_PRECISION_INTERPOLATION_BC7
CGU_UINT16 aWeight2[] = { 0, 21, 43, 64 };
CGU_UINT16 aWeight3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
CGU_UINT16 aWeight4[] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };

CGU_UINT8 interpolate(CGU_UINT8 e0, CGU_UINT8 e1, CGU_UINT8 index, CGU_UINT8 indexprecision)
{
    if (indexprecision == 2)
        return (CGU_UINT8)(((64 - aWeight2[index])*CGU_UINT16(e0) + aWeight2[index]*CGU_UINT16(e1) + 32) >> 6);
    else if (indexprecision == 3)
        return (CGU_UINT8)(((64 - aWeight3[index])*CGU_UINT16(e0) + aWeight3[index]*CGU_UINT16(e1) + 32) >> 6);
    else // indexprecision == 4
        return (CGU_UINT8)(((64 - aWeight4[index])*CGU_UINT16(e0) + aWeight4[index]*CGU_UINT16(e1) + 32) >> 6);
}
#endif
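// Worked example (illustration only) of the weighted interpolation above:
// with 2-bit indices the weights are aWeight2[] = {0,21,43,64}, so endpoints
// e0=100, e1=200 decode to the ramp {100,133,167,200}; e.g. for index 1:
// ((64-21)*100 + 21*200 + 32) >> 6 == 133.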
void GetBC7Ramp(CGU_UINT32 endpoint[][MAX_DIMENSION_BIG],
                CGU_FLOAT  ramp[MAX_DIMENSION_BIG][(1<<MAX_INDEX_BITS)],
                CGU_UINT32 clusters[2],
                CGU_UINT32 componentBits[MAX_DIMENSION_BIG])
{
    CGU_UINT32 ep[2][MAX_DIMENSION_BIG];
    CGU_UINT32 i;

    // Expand each endpoint component to 8 bits: shift the value to the top
    // of the byte, then replicate its MSBs into the vacated low-order bits
    for (i = 0; i < MAX_DIMENSION_BIG; i++)
    {
        ep[0][i] = endpoint[0][i];
        ep[1][i] = endpoint[1][i];
        if (componentBits[i])
        {
            ep[0][i] <<= (8 - componentBits[i]);
            ep[1][i] <<= (8 - componentBits[i]);
            ep[0][i] += (CGU_UINT32)(ep[0][i] >> componentBits[i]);
            ep[1][i] += (CGU_UINT32)(ep[1][i] >> componentBits[i]);
            ep[0][i] = min8(255, max8(0, CMP_STATIC_CAST(CGU_UINT8, ep[0][i])));
            ep[1][i] = min8(255, max8(0, CMP_STATIC_CAST(CGU_UINT8, ep[1][i])));
        }
    }

    // If this block type has no explicit alpha channel
    // then make sure alpha is 1.0 for all points on the ramp
    if (!componentBits[COMP_ALPHA])
    {
        ep[0][COMP_ALPHA] = ep[1][COMP_ALPHA] = 255;
    }

    CGU_UINT32 rampIndex = clusters[0];
    rampIndex = (CGU_UINT32)(log((double)rampIndex) / log(2.0));

    // Generate colours for the RGB ramp
    for (i = 0; i < clusters[0]; i++)
    {
#ifdef USE_HIGH_PRECISION_INTERPOLATION_BC7
        ramp[COMP_RED][i]   = (CGU_FLOAT)floor((ep[0][COMP_RED]   * (1.0 - rampLerpWeightsBC7[rampIndex][i])) + (ep[1][COMP_RED]   * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
        ramp[COMP_RED][i]   = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_RED][i]));
        ramp[COMP_GREEN][i] = (CGU_FLOAT)floor((ep[0][COMP_GREEN] * (1.0 - rampLerpWeightsBC7[rampIndex][i])) + (ep[1][COMP_GREEN] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
        ramp[COMP_GREEN][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_GREEN][i]));
        ramp[COMP_BLUE][i]  = (CGU_FLOAT)floor((ep[0][COMP_BLUE]  * (1.0 - rampLerpWeightsBC7[rampIndex][i])) + (ep[1][COMP_BLUE]  * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
        ramp[COMP_BLUE][i]  = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_BLUE][i]));
#else
        ramp[COMP_RED][i]   = interpolate(ep[0][COMP_RED],   ep[1][COMP_RED],   i, rampIndex);
        ramp[COMP_GREEN][i] = interpolate(ep[0][COMP_GREEN], ep[1][COMP_GREEN], i, rampIndex);
        ramp[COMP_BLUE][i]  = interpolate(ep[0][COMP_BLUE],  ep[1][COMP_BLUE],  i, rampIndex);
#endif
    }

    rampIndex = clusters[1];
    rampIndex = (CGU_UINT32)(log((CGU_FLOAT)rampIndex) / log(2.0));

    if (!componentBits[COMP_ALPHA])
    {
        for (i = 0; i < clusters[1]; i++)
        {
            ramp[COMP_ALPHA][i] = 255.;
        }
    }
    else
    {
        // Generate alphas
        for (i = 0; i < clusters[1]; i++)
        {
#ifdef USE_HIGH_PRECISION_INTERPOLATION_BC7
            ramp[COMP_ALPHA][i] = (CGU_FLOAT)floor((ep[0][COMP_ALPHA] * (1.0 - rampLerpWeightsBC7[rampIndex][i])) + (ep[1][COMP_ALPHA] * rampLerpWeightsBC7[rampIndex][i]) + 0.5);
            ramp[COMP_ALPHA][i] = bc7_minf(255.0, bc7_maxf(0., ramp[COMP_ALPHA][i]));
#else
            ramp[COMP_ALPHA][i] = interpolate(ep[0][COMP_ALPHA], ep[1][COMP_ALPHA], i, rampIndex);
#endif
        }
    }
}

//
// Bit reader - reads one bit from a buffer at the current bit offset
// and increments the offset
//
CGU_UINT32 ReadBit(const CGU_UINT8 base[], CGU_UINT32 &m_bitPosition)
{
    int byteLocation;
    int remainder;
    CGU_UINT32 bit = 0;
    byteLocation = m_bitPosition / 8;
    remainder    = m_bitPosition % 8;
    bit  = base[byteLocation];
    bit >>= remainder;
    bit &= 0x1;
    // Increment bit position
    m_bitPosition++;
    return (bit);
}

void DecompressDualIndexBlock(
    CGU_UINT8  out[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG],
    const CGU_UINT8 in[COMPRESSED_BLOCK_SIZE],
    CGU_UINT32 endpoint[2][MAX_DIMENSION_BIG],
    CGU_UINT32 &m_bitPosition,
    CGU_UINT32 m_rotation,
    CGU_UINT32 m_blockMode,
    CGU_UINT32 m_indexSwap,
    CGU_UINT32 m_componentBits[MAX_DIMENSION_BIG])
{
    CGU_UINT32 i, j, k;
    CGU_FLOAT  ramp[MAX_DIMENSION_BIG][1<<MAX_INDEX_BITS];
    CGU_UINT32 blockIndices[2][MAX_SUBSET_SIZE];
    CGU_UINT32 clusters[2];

    clusters[0] = 1 << bti[m_blockMode].indexBits[0];
    clusters[1] = 1 << bti[m_blockMode].indexBits[1];
    if (m_indexSwap)
    {
        CGU_UINT32 temp = clusters[0];
        clusters[0] = clusters[1];
        clusters[1] = temp;
    }

    GetBC7Ramp(endpoint, ramp, clusters, m_componentBits);

    // Extract the two index sets
    for (i = 0; i < 2; i++)
    {
        for (j = 0; j < MAX_SUBSET_SIZE; j++)
        {
            blockIndices[i][j] = 0;
            // If this is a fixup index then clear the implicit bit
            if (j == 0)
            {
                blockIndices[i][j] &= ~(1 << (bti[m_blockMode].indexBits[i] - 1));
                for (k = 0; k < (bti[m_blockMode].indexBits[i] - 1); k++)
                {
                    blockIndices[i][j] |= (CGU_UINT32)ReadBit(in, m_bitPosition) << k;
                }
            }
            else
            {
                for (k = 0; k < bti[m_blockMode].indexBits[i]; k++)
                {
                    blockIndices[i][j] |= (CGU_UINT32)ReadBit(in, m_bitPosition) << k;
                }
            }
        }
    }

    // Generate the block colours: RGB comes from the first index set and
    // alpha from the second (optionally swapped)
    for (i = 0; i < MAX_SUBSET_SIZE; i++)
    {
        for (j = 0; j < COMP_ALPHA; j++)
        {
            out[i][j] = (CGU_UINT8)ramp[j][blockIndices[m_indexSwap][i]];
        }
        out[i][COMP_ALPHA] = (CGU_UINT8)ramp[COMP_ALPHA][blockIndices[1 - m_indexSwap][i]];
    }

    // Resolve the component rotation
    CGU_UINT8 swap;
    for (i = 0; i < MAX_SUBSET_SIZE; i++)
    {
        switch (m_rotation)
        {
        case 0:
            // Do nothing
            break;
        case 1:
            // Swap A and R
            swap = out[i][COMP_ALPHA]; out[i][COMP_ALPHA] = out[i][COMP_RED];   out[i][COMP_RED]   = swap;
            break;
        case 2:
            // Swap A and G
            swap = out[i][COMP_ALPHA]; out[i][COMP_ALPHA] = out[i][COMP_GREEN]; out[i][COMP_GREEN] = swap;
            break;
        case 3:
            // Swap A and B
            swap = out[i][COMP_ALPHA]; out[i][COMP_ALPHA] = out[i][COMP_BLUE];  out[i][COMP_BLUE]  = swap;
            break;
        }
    }
}
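// Usage sketch for ReadBit (hypothetical helper, illustration only): the BC7
// mode number is the count of leading zero bits before the first set bit of
// the 128-bit block, which is exactly how the decoder below identifies it.
static CGU_UINT32 sketch_ReadBlockMode(const CGU_UINT8 block[COMPRESSED_BLOCK_SIZE])
{
    CGU_UINT32 bitPosition = 0;
    CGU_UINT32 blockMode   = 0;
    while ((blockMode < 8) && !ReadBit(block, bitPosition))
        blockMode++;
    return blockMode; // 8 indicates an invalid (reserved) encoding
}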
void DecompressBC7_internal(CGU_UINT8 out[MAX_SUBSET_SIZE][MAX_DIMENSION_BIG], const CGU_UINT8 in[COMPRESSED_BLOCK_SIZE], uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
{
    CGU_UINT32 i, j;
    CGU_UINT32 m_blockMode   = 0;
    CGU_UINT32 m_partition   = 0;
    CGU_UINT32 m_rotation    = 0;
    CGU_UINT32 m_indexSwap   = 0;
    CGU_UINT32 m_bitPosition = 0;
    CGU_UINT32 m_componentBits[MAX_DIMENSION_BIG];
    CGU_UINT32 endpoint[MAX_SUBSETS][2][MAX_DIMENSION_BIG];
    CGU_UINT32 blockIndices[MAX_SUBSET_SIZE];

    // The block mode is signalled by the position of the first set bit
    while (!ReadBit(in, m_bitPosition) && (m_blockMode < 8))
    {
        m_blockMode++;
    }

    if (m_blockMode > 7)
    {
        // Something really bad happened...
        return;
    }

    for (i = 0; i < bti[m_blockMode].rotationBits; i++)
    {
        m_rotation |= ReadBit(in, m_bitPosition) << i;
    }
    for (i = 0; i < bti[m_blockMode].indexModeBits; i++)
    {
        m_indexSwap |= ReadBit(in, m_bitPosition) << i;
    }
    for (i = 0; i < bti[m_blockMode].partitionBits; i++)
    {
        m_partition |= ReadBit(in, m_bitPosition) << i;
    }

    if (bti[m_blockMode].encodingType == NO_ALPHA)
    {
        m_componentBits[COMP_ALPHA] = 0;
        m_componentBits[COMP_RED] = m_componentBits[COMP_GREEN] = m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 3;
    }
    else if (bti[m_blockMode].encodingType == COMBINED_ALPHA)
    {
        m_componentBits[COMP_ALPHA] = m_componentBits[COMP_RED] = m_componentBits[COMP_GREEN] = m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 4;
    }
    else if (bti[m_blockMode].encodingType == SEPARATE_ALPHA)
    {
        m_componentBits[COMP_ALPHA] = bti[m_blockMode].scalarBits;
        m_componentBits[COMP_RED] = m_componentBits[COMP_GREEN] = m_componentBits[COMP_BLUE] = bti[m_blockMode].vectorBits / 3;
    }

    CGU_UINT32 subset, ep, component;
    // Endpoints are stored in the following order RRRR GGGG BBBB (AAAA) (PPPP)
    // i.e. components are packed together
    // Loop over components
    for (component = 0; component < MAX_DIMENSION_BIG; component++)
    {
        // loop over subsets
        for (subset = 0; subset < (int)bti[m_blockMode].subsetCount; subset++)
        {
            // Loop over endpoints
            for (ep = 0; ep < 2; ep++)
            {
                endpoint[subset][ep][component] = 0;
                for (j = 0; j < m_componentBits[component]; j++)
                {
                    endpoint[subset][ep][component] |= ReadBit(in, m_bitPosition) << j;
                }
            }
        }
    }

    // Now get any parity bits
    if (bti[m_blockMode].pBitType != NO_PBIT)
    {
        for (subset = 0; subset < (int)bti[m_blockMode].subsetCount; subset++)
        {
            CGU_UINT32 pBit[2];
            if (bti[m_blockMode].pBitType == ONE_PBIT)
            {
                pBit[0] = ReadBit(in, m_bitPosition);
                pBit[1] = pBit[0];
            }
            else if (bti[m_blockMode].pBitType == TWO_PBIT)
            {
                pBit[0] = ReadBit(in, m_bitPosition);
                pBit[1] = ReadBit(in, m_bitPosition);
            }

            for (component = 0; component < MAX_DIMENSION_BIG; component++)
            {
                if (m_componentBits[component])
                {
                    endpoint[subset][0][component] <<= 1;
                    endpoint[subset][1][component] <<= 1;
                    endpoint[subset][0][component] |= pBit[0];
                    endpoint[subset][1][component] |= pBit[1];
                }
            }
        }
    }

    if (bti[m_blockMode].pBitType != NO_PBIT)
    {
        // Now that we've unpacked the parity bits, update the component size information
        // for the ramp generator
        for (j = 0; j < MAX_DIMENSION_BIG; j++)
        {
            if (m_componentBits[j])
            {
                m_componentBits[j] += 1;
            }
        }
    }

    // If this block has two independent sets of indices then put it to that decoder
    if (bti[m_blockMode].encodingType == SEPARATE_ALPHA)
    {
        DecompressDualIndexBlock(out, in, endpoint[0], m_bitPosition, m_rotation, m_blockMode, m_indexSwap, m_componentBits);
        return;
    }

    CGU_UINT32 fixup[MAX_SUBSETS] = { 0, 0, 0 };
    switch (bti[m_blockMode].subsetCount)
    {
    case 3:
        fixup[1] = BC7_FIXUPINDICES_LOCAL[2][m_partition][1];
        fixup[2] = BC7_FIXUPINDICES_LOCAL[2][m_partition][2];
        break;
    case 2:
        fixup[1] = BC7_FIXUPINDICES_LOCAL[1][m_partition][1];
        break;
    default:
        break;
    }

    //--------------------------------------------------------------------
    // New Code : Possible replacement for BC7_PARTITIONS for CPU code
    //--------------------------------------------------------------------
    // Extract index bits
    // for (i = 0; i < MAX_SUBSET_SIZE; i++)
    // {
    //     CGV_UINT8 p = get_partition_subset(m_partition, bti[m_blockMode].subsetCount - 1, i);
    //     //CGU_UINT32 p = partitionTable[i];
    //     blockIndices[i] = 0;
    //     CGU_UINT32 bitsToRead = bti[m_blockMode].indexBits[0];
    //
    //     // If this is a fixup index then set the implicit bit
    //     if (i == fixup[p])
    //     {
    //         blockIndices[i] &= ~(1 << (bitsToRead - 1));
    //         bitsToRead--;
    //     }
    //
    //     for (j = 0; j < bitsToRead; j++)
    //     {
    //         blockIndices[i] |= ReadBit(in, m_bitPosition) << j;
    //     }
    // }

    CGU_BYTE *partitionTable = (CGU_BYTE*)BC7_PARTITIONS[bti[m_blockMode].subsetCount-1][m_partition];

    // Extract index bits
    for (i = 0; i < MAX_SUBSET_SIZE; i++)
    {
        CGU_BYTE p = partitionTable[i];
        blockIndices[i] = 0;
        CGU_BYTE bitsToRead = bti[m_blockMode].indexBits[0];

        // If this is a fixup index then set the implicit bit
        if (i == fixup[p])
        {
            blockIndices[i] &= ~(1 << (bitsToRead-1));
            bitsToRead--;
        }

        for (j = 0; j < bitsToRead; j++)
        {
            blockIndices[i] |= ReadBit(in, m_bitPosition) << j;
        }
    }

    // Get the ramps
    CGU_UINT32 clusters[2];
    clusters[0] = clusters[1] = 1 << bti[m_blockMode].indexBits[0];

    // Colour ramps, one per subset
    CGU_FLOAT c[MAX_SUBSETS][MAX_DIMENSION_BIG][1 << MAX_INDEX_BITS];
    for (i = 0; i < (CGU_UINT32)bti[m_blockMode].subsetCount; i++)
    {
        GetBC7Ramp(endpoint[i], c[i], clusters, m_componentBits);
    }

    // Generate the block colours
    for (i = 0; i < MAX_SUBSET_SIZE; i++)
    {
        for (j = 0; j < MAX_DIMENSION_BIG; j++)
        {
            out[i][j] = (CGU_UINT8)c[partitionTable[i]][j][blockIndices[i]];
        }
    }
}

void CompressBlockBC7_Internal(
    CGU_UINT8 image_src[SOURCE_BLOCK_SIZE][4],
    CMP_GLOBAL CGV_CMPOUT cmp_out[COMPRESSED_BLOCK_SIZE],
    uniform CMP_GLOBAL BC7_Encode u_BC7Encode[])
{
    BC7_EncodeState _state = {0};
    BC7_EncodeState *state = &_state;
    copy_BC7_Encode_settings(state, u_BC7Encode);

    CGU_UINT8 offsetR = 0;
    CGU_UINT8 offsetG = 16;
    CGU_UINT8 offsetB = 32;
    CGU_UINT8 offsetA = 48;
    for (CGU_INT i = 0; i < SOURCE_BLOCK_SIZE; i++)
    {
        state->image_src[offsetR++] = (CGV_IMAGE)image_src[i][0];
        state->image_src[offsetG++] = (CGV_IMAGE)image_src[i][1];
        state->image_src[offsetB++] = (CGV_IMAGE)image_src[i][2];
        state->image_src[offsetA++] = (CGV_IMAGE)image_src[i][3];
    }

    BC7_CompressBlock(state, u_BC7Encode);

    if (state->cmp_isout16Bytes)
    {
        for (CGU_UINT8 i = 0; i < COMPRESSED_BLOCK_SIZE; i++)
        {
            cmp_out[i] = state->cmp_out[i];
        }
    }
    else
    {
#ifdef ASPM_GPU
        cmp_memcpy(cmp_out, (CGU_UINT8 *)state->best_cmp_out, 16);
#else
        memcpy(cmp_out, state->best_cmp_out, 16);
#endif
    }
}
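// Layout note with a tiny sketch (hypothetical helper, illustration only):
// BC7_EncodeState::image_src stores the 4x4 block as four channel planes of
// SOURCE_BLOCK_SIZE floats each - R at [0..15], G at [16..31], B at [32..47],
// A at [48..63] - which is why the loop above copies through offsetR/G/B/A.
static inline CGU_INT sketch_planeOffset(CGU_INT pixel, CGU_INT channel)
{
    return channel * SOURCE_BLOCK_SIZE + pixel; // e.g. green of pixel 5 -> 21
}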
//======================= CPU USER INTERFACES ====================================
int CMP_CDECL CreateOptionsBC7(void **options)
{
    (*options) = new BC7_Encode;
    if (!(*options)) return CGU_CORE_ERR_NEWMEM;
    init_BC7ramps();
    SetDefaultBC7Options((BC7_Encode *)(*options));
    return CGU_CORE_OK;
}

int CMP_CDECL DestroyOptionsBC7(void *options)
{
    if (!options) return CGU_CORE_ERR_INVALIDPTR;
    BC7_Encode *BCOptions = reinterpret_cast<BC7_Encode *>(options);
    delete BCOptions;
    return CGU_CORE_OK;
}

int CMP_CDECL SetErrorThresholdBC7(void *options, CGU_FLOAT minThreshold, CGU_FLOAT maxThreshold)
{
    if (!options) return CGU_CORE_ERR_INVALIDPTR;
    BC7_Encode *BC7optionsDefault = (BC7_Encode *)options;
    if (minThreshold < 0.0f) minThreshold = 0.0f;
    if (maxThreshold < 0.0f) maxThreshold = 0.0f;
    BC7optionsDefault->minThreshold = minThreshold;
    BC7optionsDefault->maxThreshold = maxThreshold;
    return CGU_CORE_OK;
}

int CMP_CDECL SetQualityBC7(void *options, CGU_FLOAT fquality)
{
    if (!options) return CGU_CORE_ERR_INVALIDPTR;
    BC7_Encode *BC7optionsDefault = (BC7_Encode *)options;
    if (fquality < 0.0f) fquality = 0.0f;
    else if (fquality > 1.0f) fquality = 1.0f;
    BC7optionsDefault->quality = fquality;

    // Set the error threshold: it shrinks towards zero as quality approaches 1.0
    BC7optionsDefault->errorThreshold = BC7optionsDefault->maxThreshold * (1.0f - fquality);
    if (fquality > BC7_qFAST_THRESHOLD)
        BC7optionsDefault->errorThreshold += BC7optionsDefault->minThreshold;
    return CGU_CORE_OK;
}

int CMP_CDECL SetMaskBC7(void *options, CGU_UINT8 mask)
{
    if (!options) return CGU_CORE_ERR_INVALIDPTR;
    BC7_Encode *BC7options = (BC7_Encode *)options;
    BC7options->validModeMask = mask;
    return CGU_CORE_OK;
}

int CMP_CDECL SetAlphaOptionsBC7(void *options, CGU_BOOL imageNeedsAlpha, CGU_BOOL colourRestrict, CGU_BOOL alphaRestrict)
{
    if (!options) return CGU_CORE_ERR_INVALIDPTR;
    BC7_Encode *u_BC7Encode = (BC7_Encode *)options;
    u_BC7Encode->imageNeedsAlpha = imageNeedsAlpha;
    u_BC7Encode->colourRestrict  = colourRestrict;
    u_BC7Encode->alphaRestrict   = alphaRestrict;
    return CGU_CORE_OK;
}

int CMP_CDECL CompressBlockBC7(const unsigned char *srcBlock,
                               unsigned int srcStrideInBytes,
                               CMP_GLOBAL unsigned char cmpBlock[16],
                               const void *options = NULL)
{
    CMP_Vec4uc inBlock[SOURCE_BLOCK_SIZE];

    //----------------------------------
    // Fill the inBlock with source data
    //----------------------------------
    CGU_INT srcpos = 0;
    CGU_INT dstptr = 0;
    for (CGU_UINT8 row = 0; row < 4; row++)
    {
        srcpos = row * srcStrideInBytes;
        for (CGU_UINT8 col = 0; col < 4; col++)
        {
            inBlock[dstptr].x = CGU_UINT8(srcBlock[srcpos++]);
            inBlock[dstptr].y = CGU_UINT8(srcBlock[srcpos++]);
            inBlock[dstptr].z = CGU_UINT8(srcBlock[srcpos++]);
            inBlock[dstptr].w = CGU_UINT8(srcBlock[srcpos++]);
            dstptr++;
        }
    }

    BC7_Encode *u_BC7Encode = (BC7_Encode *)options;
    BC7_Encode BC7EncodeDefault = {0};
    if (u_BC7Encode == NULL)
    {
        u_BC7Encode = &BC7EncodeDefault;
        SetDefaultBC7Options(u_BC7Encode);
        init_BC7ramps();
    }

    BC7_EncodeState EncodeState
#ifndef ASPM
        = {0}
#endif
    ;
    EncodeState.best_err      = CMP_FLOAT_MAX;
    EncodeState.validModeMask = u_BC7Encode->validModeMask;
    EncodeState.part_count    = u_BC7Encode->part_count;
    EncodeState.channels      = CMP_STATIC_CAST(CGU_CHANNEL, u_BC7Encode->channels);

    CGU_UINT8  offsetR = 0;
    CGU_UINT8  offsetG = 16;
    CGU_UINT8  offsetB = 32;
    CGU_UINT8  offsetA = 48;
    CGU_UINT32 offsetSRC = 0;
    for (CGU_UINT8 i = 0; i < SOURCE_BLOCK_SIZE; i++)
    {
        EncodeState.image_src[offsetR++] = (CGV_IMAGE)inBlock[offsetSRC].x;
        EncodeState.image_src[offsetG++] = (CGV_IMAGE)inBlock[offsetSRC].y;
        EncodeState.image_src[offsetB++] = (CGV_IMAGE)inBlock[offsetSRC].z;
        EncodeState.image_src[offsetA++] = (CGV_IMAGE)inBlock[offsetSRC].w;
        offsetSRC++;
    }

    BC7_CompressBlock(&EncodeState, u_BC7Encode);

    if (EncodeState.cmp_isout16Bytes)
    {
        for (CGU_UINT8 i = 0; i < COMPRESSED_BLOCK_SIZE; i++)
        {
            cmpBlock[i] = EncodeState.cmp_out[i];
        }
    }
    else
    {
        memcpy(cmpBlock, EncodeState.best_cmp_out, 16);
    }
    return CGU_CORE_OK;
}
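// Hypothetical usage sketch for the CPU interfaces above (error checks
// omitted): compress one 4x4 RGBA8 tile, 4 bytes per texel, at quality 0.6.
//
//   void *bc7options = NULL;
//   unsigned char cmp[16];
//   CreateOptionsBC7(&bc7options);
//   SetQualityBC7(bc7options, 0.6f);
//   CompressBlockBC7(srcRGBA, 16, cmp, bc7options); // stride: 4 texels * 4 bytes
//   DestroyOptionsBC7(bc7options);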
int CMP_CDECL DecompressBlockBC7(const unsigned char cmpBlock[16],
                                 unsigned char srcBlock[64],
                                 const void *options = NULL)
{
    BC7_Encode *u_BC7Encode = (BC7_Encode *)options;
    BC7_Encode BC7EncodeDefault = {0};
    if (u_BC7Encode == NULL)
    {
        // no user options supplied: fall back to the defaults (q = 0.05)
        u_BC7Encode = &BC7EncodeDefault;
        SetDefaultBC7Options(u_BC7Encode);
        init_BC7ramps();
    }
    DecompressBC7_internal((CGU_UINT8(*)[4])srcBlock, (CGU_UINT8 *)cmpBlock, u_BC7Encode);
    return CGU_CORE_OK;
}
#endif
#endif

//============================================== OpenCL USER INTERFACE ====================================================
#ifdef ASPM_OPENCL
CMP_STATIC CMP_KERNEL void CMP_GPUEncoder(
    uniform CMP_GLOBAL const CGU_Vec4uc ImageSource[],
    CMP_GLOBAL CGV_CMPOUT ImageDestination[],
    uniform CMP_GLOBAL Source_Info SourceInfo[],
    uniform CMP_GLOBAL BC7_Encode BC7Encode[])
{
    CGU_INT xID = 0;
    CGU_INT yID = 0;

    xID = get_global_id(0); // ToDo: Define a size_t 32 bit and 64 bit based on clGetDeviceInfo
    yID = get_global_id(1);

    CGU_INT srcWidth  = SourceInfo->m_src_width;
    CGU_INT srcHeight = SourceInfo->m_src_height;
    if (xID >= (srcWidth / BlockX)) return;
    if (yID >= (srcHeight / BlockY)) return;
    //ASPM_PRINT(("[ASPM_OCL] %d %d size %d\n",xID,yID,sizeof(BC7_Encode)));

    CGU_INT destI    = (xID*COMPRESSED_BLOCK_SIZE) + (yID*(srcWidth / BlockX)*COMPRESSED_BLOCK_SIZE);
    CGU_INT srcindex = 4 * (yID * srcWidth + xID);
    CGU_INT blkindex = 0;
    BC7_EncodeState EncodeState;
    cmp_memsetBC7(&EncodeState, 0, sizeof(EncodeState));
    copy_BC7_Encode_settings(&EncodeState, BC7Encode);

    // Check if it is a complete 4X4 block
    if (((xID + 1)*BlockX <= srcWidth) && ((yID + 1)*BlockY <= srcHeight))
    {
        srcWidth = srcWidth - 4;
        for (CGU_INT j = 0; j < 4; j++)
        {
            for (CGU_INT i = 0; i < 4; i++)
            {
                EncodeState.image_src[blkindex+0*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].x;
                EncodeState.image_src[blkindex+1*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].y;
                EncodeState.image_src[blkindex+2*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].z;
                EncodeState.image_src[blkindex+3*SOURCE_BLOCK_SIZE] = ImageSource[srcindex].w;
                blkindex++;
                srcindex++;
            }
            srcindex += srcWidth;
        }

        BC7_CompressBlock(&EncodeState, BC7Encode);

        // printf("CMP %x %x %x %x %x %x %x",
        //        state->cmp_out[0],
        //        state->cmp_out[1],
        //        state->cmp_out[2],
        //        state->cmp_out[3],
        //        state->cmp_out[4],
        //        state->cmp_out[5],
        //        state->cmp_out[6]
        //        );

        for (CGU_INT i = 0; i < COMPRESSED_BLOCK_SIZE; i++)
        {
            ImageDestination[destI + i] = EncodeState.cmp_out[i];
        }
    }
}
#endif
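// Dispatch note for the kernel above: each work-item compresses one 4x4
// block, so block (xID,yID) writes its 16 output bytes starting at
// destI = (xID + yID * (srcWidth/BlockX)) * COMPRESSED_BLOCK_SIZE.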