/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2016, Intel Corporation // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated // documentation files (the "Software"), to deal in the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to // permit persons to whom the Software is furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in all copies or substantial portions of // the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, // TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #include "ispc_texcomp.h" #include "kernel_astc_ispc.h" #include #include #include #include #include void GetProfile_astc_fast(astc_enc_settings* settings, int block_width, int block_height) { settings->block_width = block_width; settings->block_height = block_height; settings->channels = 3; settings->fastSkipTreshold = 5; settings->refineIterations = 2; } void GetProfile_astc_alpha_fast(astc_enc_settings* settings, int block_width, int block_height) { settings->block_width = block_width; settings->block_height = block_height; settings->channels = 4; settings->fastSkipTreshold = 5; settings->refineIterations = 2; } void GetProfile_astc_alpha_slow(astc_enc_settings* settings, int block_width, int block_height) { settings->block_width = block_width; settings->block_height = block_height; settings->channels = 4; settings->fastSkipTreshold = 64; settings->refineIterations = 2; } struct astc_block { int width; int height; uint8_t dual_plane; int weight_range; uint8_t weights[64]; int color_component_selector; int partitions; int partition_id; int color_endpoint_pairs; int channels; int color_endpoint_modes[4]; int endpoint_range; uint8_t endpoints[18]; }; bool can_store(int value, int bits) { if (value < 0) return false; if (value >= 1 << bits) return false; return true; } int pack_block_mode(astc_block* block) { int block_mode = 0; int D = block->dual_plane; int H = block->weight_range >= 6; int DH = D * 2 + H; int R = block->weight_range + 2 - ((H > 0) ? 6 : 0); R = R / 2 + R % 2 * 4; if (can_store(block->width - 4, 2) && can_store(block->height - 2, 2)) { int B = block->width - 4; int A = block->height - 2; block_mode = (DH << 9) | (B << 7) | (A << 5) | ((R & 4) << 2) | (R & 3); } if (can_store(block->width - 8, 2) && can_store(block->height - 2, 2)) { int B = block->width - 8; int A = block->height - 2; block_mode = (DH << 9) | (B << 7) | (A << 5) | ((R & 4) << 2) | 4 | (R & 3); } if (can_store(block->width - 2, 2) && can_store(block->height - 8, 2)) { int A = block->width - 2; int B = block->height - 8; block_mode = (DH << 9) | (B << 7) | (A << 5) | ((R & 4) << 2) | 8 | (R & 3); } if (can_store(block->width - 2, 2) && can_store(block->height - 6, 1)) { int A = block->width - 2; int B = block->height - 6; block_mode = (DH << 9) | (B << 7) | (A << 5) | ((R & 4) << 2) | 12 | (R & 3); } if (can_store(block->width - 2, 1) && can_store(block->height - 2, 2)) { int B = block->width; int A = block->height - 2; block_mode = (DH << 9) | (B << 7) | (A << 5) | ((R & 4) << 2) | 12 | (R & 3); } if (DH == 0 && can_store(block->width - 6, 2) && can_store(block->height - 6, 2)) { int A = block->width - 6; int B = block->height - 6; block_mode = (B << 9) | 256 | (A << 5) | (R << 2); } return block_mode; } int range_table[][3] = { //2^ 3^ 5^ { 1, 0, 0 }, // 0..1 { 0, 1, 0 }, // 0..2 { 2, 0, 0 }, // 0..3 { 0, 0, 1 }, // 0..4 { 1, 1, 0 }, // 0..5 { 3, 0, 0 }, // 0..7 { 1, 0, 1 }, // 0..9 { 2, 1, 0 }, // 0..11 { 4, 0, 0 }, // 0..15 { 2, 0, 1 }, // 0..19 { 3, 1, 0 }, // 0..23 { 5, 0, 0 }, // 0..31 { 3, 0, 1 }, // 0..39 { 4, 1, 0 }, // 0..47 { 6, 0, 0 }, // 0..63 { 4, 0, 1 }, // 0..79 { 5, 1, 0 }, // 0..95 { 7, 0, 0 }, // 0..127 { 5, 0, 1 }, // 0..159 { 6, 1, 0 }, // 0..191 { 8, 0, 0 }, // 0..255 }; int get_levels(int range) { return (1 + 2 * range_table[range][1] + 4 * range_table[range][2]) << range_table[range][0]; } int sequence_bits(int count, int range) { int bits = count * range_table[range][0]; bits += (count * range_table[range][1] * 8 + 4) / 5; bits += (count * range_table[range][2] * 7 + 2) / 3; return bits; } void set_bits(uint32_t data[4], int* pos, int bits, uint32_t value) { assert(bits <= 25); uint32_t word = *(uint32_t*)(((uint8_t*)data) + *pos / 8); uint32_t mask = (1 << bits) - 1; word |= value << (*pos % 8); *(uint32_t*)(((uint8_t*)data) + *pos / 8) = word; *pos += bits; } uint32_t get_field(uint32_t input, int a, int b) { assert(a >= b); return (input >> b) & ((1 << (a - b + 1)) - 1); } uint32_t get_bit(uint32_t input, int a) { return get_field(input, a, a); } void pack_five_trits(uint32_t data[4], int sequence[5], int* pos, int n) { int t[5]; int m[5]; for (int i = 0; i < 5; i++) { t[i] = sequence[i] >> n; m[i] = sequence[i] - (t[i] << n); } int C; if (t[1] == 2 && t[2] == 2) { C = 3 * 4 + t[0]; } else if (t[2] == 2) { C = t[1] * 16 + t[0] * 4 + 3; } else { C = t[2] * 16 + t[1] * 4 + t[0]; } int T; if (t[3] == 2 && t[4] == 2) { T = get_field(C, 4, 2) * 32 + 7 * 4 + get_field(C, 1, 0); } else { T = get_field(C, 4, 0); if (t[4] == 2) { T += t[3] * 128 + 3 * 32; } else { T += t[4] * 128 + t[3] * 32; } } uint32_t pack1 = 0; pack1 |= m[0]; pack1 |= get_field(T, 1, 0) << n; pack1 |= m[1] << (2 + n); uint32_t pack2 = 0; pack2 |= get_field(T, 3, 2); pack2 |= m[2] << 2; pack2 |= get_field(T, 4, 4) << (2 + n); pack2 |= m[3] << (3 + n); pack2 |= get_field(T, 6, 5) << (3 + n * 2); pack2 |= m[4] << (5 + n * 2); pack2 |= get_field(T, 7, 7) << (5 + n * 3); set_bits(data, pos, 2 + n * 2, pack1); set_bits(data, pos, 6 + n * 3, pack2); } void pack_three_quint(uint32_t data[4], int sequence[3], int* pos, int n) { int q[3]; int m[3]; for (int i = 0; i < 3; i++) { q[i] = sequence[i] >> n; m[i] = sequence[i] - (q[i] << n); } int Q; if (q[0] == 4 && q[1] == 4) { Q = get_field(q[2], 1, 0) * 8 + 3 * 2 + get_bit(q[2], 2); } else { int C; if (q[1] == 4) { C = (q[0] << 3) + 5; } else { C = (q[1] << 3) + q[0]; } if (q[2] == 4) { Q = get_field(~C, 2, 1) * 32 + get_field(C, 4, 3) * 8 + 3 * 2 + get_bit(C, 0); } else { Q = q[2] * 32 + get_field(C, 4, 0); } } uint32_t pack = 0; pack |= m[0]; pack |= get_field(Q, 2, 0) << n; pack |= m[1] << (3 + n); pack |= get_field(Q, 4, 3) << (3 + n * 2); pack |= m[2] << (5 + n * 2); pack |= get_field(Q, 6, 5) << (5 + n * 3); set_bits(data, pos, 7 + n * 3, pack); } void pack_integer_sequence(uint32_t output_data[4], uint8_t sequence[], int pos, int count, int range) { int n = range_table[range][0]; int bits = sequence_bits(count, range); int pos0 = pos; uint32_t data[5] = { 0 }; if (range_table[range][1] == 1) { for (int j = 0; j < (count + 4) / 5; j++) { int temp[5] = { 0 }; for (int i = 0; i < std::min(count - j * 5, 5); i++) temp[i] = sequence[j * 5 + i]; pack_five_trits(data, temp, &pos, n); } } else if (range_table[range][2] == 1) { for (int j = 0; j < (count + 2) / 3; j++) { int temp[3] = { 0 }; for (int i = 0; i < std::min(count - j * 3, 3); i++) temp[i] = sequence[j * 3 + i]; pack_three_quint(data, temp, &pos, n); } } else { for (int i = 0; i < count; i++) { set_bits(data, &pos, n, sequence[i]); } } if (pos0 + bits < 96) data[3] = 0; if (pos0 + bits < 64) data[2] = 0; if (pos0 + bits < 32) data[1] = 0; data[(pos0 + bits) / 32] &= (1 << ((pos0 + bits) % 32)) - 1; for (int k = 0; k < 4; k++) output_data[k] |= data[k]; } uint32_t reverse_bits_32(uint32_t input) { uint32_t t = input; t = (t << 16) | (t >> 16); t = ((t & 0x00FF00FF) << 8) | ((t & 0xFF00FF00) >> 8); t = ((t & 0x0F0F0F0F) << 4) | ((t & 0xF0F0F0F0) >> 4); t = ((t & 0x33333333) << 2) | ((t & 0xCCCCCCCC) >> 2); t = ((t & 0x55555555) << 1) | ((t & 0xAAAAAAAA) >> 1); return t; } void pack_block(uint32_t data[4], astc_block* block) { memset(data, 0, 16); int pos = 0; set_bits(data, &pos, 11, pack_block_mode(block)); int num_weights = block->width * block->height * (block->dual_plane ? 2 : 1); int weight_bits = sequence_bits(num_weights, block->weight_range); int extra_bits = 0; assert(num_weights <= 64); assert(24 <= weight_bits && weight_bits <= 96); set_bits(data, &pos, 2, block->partitions - 1); if (block->partitions > 1) { set_bits(data, &pos, 10, block->partition_id); int min_cem = 16; int max_cem = 0; for (int j = 0; j < block->partitions; j++) { min_cem = std::min(min_cem, block->color_endpoint_modes[j]); max_cem = std::max(max_cem, block->color_endpoint_modes[j]); } assert(max_cem / 4 <= min_cem / 4 + 1); int CEM = block->color_endpoint_modes[0] << 2; if (max_cem != min_cem) { CEM = std::min(3, min_cem / 4 + 1); for (int j = 0; j < block->partitions; j++) { int c = block->color_endpoint_modes[j] / 4 - ((CEM & 3) - 1); int m = block->color_endpoint_modes[j] % 4; assert(c == 0 || c == 1); CEM |= c << (2 + j); CEM |= m << (2 + block->partitions + 2 * j); } extra_bits = 3 * block->partitions - 4; int pos2 = 128 - weight_bits - extra_bits; set_bits(data, &pos2, extra_bits, CEM >> 6); } set_bits(data, &pos, 6, CEM & 63); } else { set_bits(data, &pos, 4, block->color_endpoint_modes[0]); } if (block->dual_plane) { assert(block->partitions < 4); extra_bits += 2; int pos2 = 128 - weight_bits - extra_bits; set_bits(data, &pos2, 2, block->color_component_selector); } int config_bits = pos + extra_bits; int remaining_bits = 128 - config_bits - weight_bits; int num_cem_pairs = 0; for (int j = 0; j < block->partitions; j++) num_cem_pairs += 1 + block->color_endpoint_modes[j] / 4; assert(num_cem_pairs <= 9); int endpoint_range = -1; for (int range = 20; range>0; range--) { int bits = sequence_bits(2 * num_cem_pairs, range); if (bits <= remaining_bits) { endpoint_range = range; break; } } assert(endpoint_range >= 4); assert(block->endpoint_range == endpoint_range); pack_integer_sequence(data, block->endpoints, pos, 2 * num_cem_pairs, endpoint_range); uint32_t rdata[4] = { 0, 0, 0, 0 }; pack_integer_sequence(rdata, block->weights, 0, num_weights, block->weight_range); for (int i = 0; i < 4; i++) data[i] |= reverse_bits_32(rdata[3 - i]); } void atsc_rank(const rgba_surface* src, int xx, int yy, uint32_t* mode_buffer, astc_enc_settings* settings) { ispc::astc_rank_ispc((ispc::rgba_surface*)src, xx, yy, mode_buffer, (ispc::astc_enc_settings*)settings); } extern "C" void pack_block_c(uint32_t data[4], ispc::astc_block* block) { assert(sizeof(ispc::astc_block) == sizeof(astc_block)); pack_block(data, (astc_block*)block); } void setup_list_context(ispc::astc_enc_context* ctx, uint32_t packed_mode) { ctx->width = 2 + get_field(packed_mode, 15, 13); // 2..8 <= 2^3 ctx->height = 2 + get_field(packed_mode, 18, 16); // 2..8 <= 2^3 ctx->dual_plane = get_field(packed_mode, 19, 19); // 0 or 1 ctx->partitions = 1; int color_endpoint_modes0 = get_field(packed_mode, 7, 6) * 2 + 6; // 6, 8, 10 or 12 ctx->color_endpoint_pairs = 1 + (color_endpoint_modes0 / 4); ctx->channels = (color_endpoint_modes0 > 8) ? 4 : 3; } void astc_encode(const rgba_surface* src, float* block_scores, uint8_t* dst, uint64_t* list, astc_enc_settings* settings) { ispc::astc_enc_context list_context; setup_list_context(&list_context, uint32_t(list[1] & 0xFFFFFFFF)); assert(sizeof(ispc::rgba_surface) == sizeof(rgba_surface)); assert(sizeof(ispc::astc_enc_settings) == sizeof(astc_enc_settings)); ispc::astc_encode_ispc((ispc::rgba_surface*)src, block_scores, dst, list, &list_context, (ispc::astc_enc_settings*)settings); } void CompressBlocksASTC(const rgba_surface* src, uint8_t* dst, astc_enc_settings* settings) { assert(src->height % settings->block_height == 0); assert(src->width % settings->block_width == 0); assert(settings->block_height <= 8); assert(settings->block_width <= 8); int tex_width = src->width / settings->block_width; int programCount = ispc::get_programCount(); std::vector block_scores(tex_width * src->height / settings->block_height); for (int yy = 0; yy < src->height / settings->block_height; yy++) for (int xx = 0; xx < tex_width; xx++) { block_scores[yy * tex_width + xx] = std::numeric_limits::infinity(); } int mode_list_size = 3334; int list_size = programCount; std::vector mode_lists(list_size * mode_list_size); std::vector mode_buffer(programCount * settings->fastSkipTreshold); for (int yy = 0; yy < src->height / settings->block_height; yy++) for (int _x = 0; _x < (tex_width + programCount - 1) / programCount; _x++) { int xx = _x * programCount; atsc_rank(src, xx, yy, mode_buffer.data(), settings); for (int i = 0; i < settings->fastSkipTreshold; i++) for (int k = 0; k < programCount; k++) { if (xx + k >= tex_width) continue; uint32_t offset = (yy << 16) + (xx + k); uint32_t mode = mode_buffer[programCount * i + k]; int mode_bin = mode >> 20; uint64_t* mode_list = &mode_lists[list_size * mode_bin]; if (*mode_list < programCount - 1) { int index = int(mode_list[0] + 1); mode_list[0] = index; mode_list[index] = (uint64_t(offset) << 32) + mode; } else { mode_list[0] = (uint64_t(offset) << 32) + mode; astc_encode(src, block_scores.data(), dst, mode_list, settings); memset(mode_list, 0, list_size * sizeof(uint64_t)); } } } for (int mode_bin = 0; mode_bin < mode_list_size; mode_bin++) { uint64_t* mode_list = &mode_lists[list_size * mode_bin]; if (mode_list[0] == 0) continue; mode_list[0] = 0; astc_encode(src, block_scores.data(), dst, mode_list, settings); memset(mode_list, 0, list_size * sizeof(uint64_t)); } }