diff --git a/.github/workflows/ccp-workflow.yml b/.github/workflows/ccp-workflow.yml
index da94fd2..94ca3eb 100644
--- a/.github/workflows/ccp-workflow.yml
+++ b/.github/workflows/ccp-workflow.yml
@@ -10,6 +10,7 @@ on:
 jobs:
   build:
     strategy:
+      fail-fast: false
       matrix:
         include: [
           { system: MacOS, runner: macos-latest },
@@ -29,6 +30,7 @@ jobs:
   build_windows:
     strategy:
+      fail-fast: false
       matrix:
         include: [
           { system: Windows, runner: windows-latest },
@@ -46,9 +48,11 @@ jobs:
   test:
     strategy:
+      fail-fast: false
       matrix:
         include: [
-          { system: MacOS, runner: macos-latest },
+          { system: MacOS-13, runner: macos-13 },
+          { system: MacOS-latest, runner: macos-latest },
           { system: Ubuntu-latest, runner: ubuntu-latest },
         ]
     name: ${{ matrix.system }} Test
@@ -67,6 +71,7 @@ jobs:
   test_windows:
     strategy:
+      fail-fast: false
       matrix:
         include: [
           { system: Windows, runner: windows-latest },
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index c2d527a..7d031b5 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -46,11 +46,11 @@ jobs:
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4

     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
-      uses: github/codeql-action/init@v2
+      uses: github/codeql-action/init@v3
       with:
         languages: ${{ matrix.language }}
         # If you wish to specify custom queries, you can do so here or in a config file.
@@ -64,7 +64,7 @@ jobs:
     # Autobuild attempts to build any compiled languages (C/C++, C#, Go, Java, or Swift).
     # If this step fails, then you should remove it and run the build manually (see below)
     - name: Autobuild
-      uses: github/codeql-action/autobuild@v2
+      uses: github/codeql-action/autobuild@v3

     # ℹī¸ Command-line programs to run using the OS shell.
     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
@@ -77,6 +77,6 @@ jobs:
     #   ./location_of_script_within_repo/buildscript.sh

     - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v2
+      uses: github/codeql-action/analyze@v3
       with:
         category: "/language:${{matrix.language}}"
diff --git a/src/apps/common/ojph_img_io.h b/src/apps/common/ojph_img_io.h
index 401ad65..a9ee243 100644
--- a/src/apps/common/ojph_img_io.h
+++ b/src/apps/common/ojph_img_io.h
@@ -54,7 +54,7 @@ namespace ojph {
   ////////////////////////////////////////////////////////////////////////////
   // defined elsewhere
   class mem_fixed_allocator;
-  struct line_buf;
+  class line_buf;

   ////////////////////////////////////////////////////////////////////////////
   //
@@ -760,7 +760,7 @@ namespace ojph {
     const char* fname;
     bool is_signed;
     ui32 bit_depth, bytes_per_sample;
-    si32 lower_val, upper_val;
+    si64 lower_val, upper_val;
     ui32 width;
     ui8* buffer;
     ui32 buffer_size;
diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp
index e7c047a..144c837 100644
--- a/src/apps/ojph_compress/ojph_compress.cpp
+++ b/src/apps/ojph_compress/ojph_compress.cpp
@@ -592,20 +592,25 @@ int main(int argc, char * argv[]) {
     ".pfm files receive special treatment. Currently, lossy compression\n"
     "with these files is not supported, only lossless. When these files are\n"
     "used, the NLT segment marker is automatically inserted into the\n"
-    "codestream. For these files the following arguments can be useful\n"
-    " -signed   a comma - separated list of true or false parameters, one\n"
+    "codestream when needed, as explained shortly. The following arguments\n"
+    "can be useful for this file type.\n"
+    " -signed   a comma-separated list of true or false parameters, one\n"
     "           for each component; for example: true,false,false.\n"
-    "           The sign only affects how values are treated; for negative\n"
-    "           values the standard requires a special non-linear\n"
-    "           transformation. When signed is false, no transformation\n"
-    "           is employed, as we assume all values are 0 or positive.\n"
-    "           When signed is true, the aforementioned transformation is\n"
-    "           employed on negative values only.\n"
+    "           If you are sure that all sample values are positive or 0,\n"
+    "           set the corresponding entry to false; otherwise set it to\n"
+    "           true.\n"
+    "           When a component entry is set to true, an NLT segment\n"
+    "           marker segment is inserted into the codestream.\n"
+    "           The NLT segment specifies a non-linear transform that\n"
+    "           changes only negative values, producing better coding\n"
+    "           efficiency.\n"
+    "           The NLT segment marker might be less supported in other\n"
+    "           encoders.\n"
     " -bit_depth a comma-separated list of bit depth values, one per \n"
     "           component; for example: 12,10,10.\n"
     "           Floating value numbers are treated as integers, and they\n"
     "           are shifted to the right, keeping only the specified\n"
-    "           number of bits. Note that a bit depth of 28 upwards is not\n"
+    "           number of bits. Up to 32 bits (which is the default) are\n"
     "           supported.\n"
     "\n";
@@ -768,10 +773,6 @@ int main(int argc, char * argv[]) {
       assert(num_comps == 1 || num_comps == 3);
       siz.set_num_components(num_comps);

-      if (bit_depth[0] == 0)
-        OJPH_ERROR(0x01000091,
-          "-bit_depth must be specified (this is temporary only).\n");
-
       if (bit_depth[0] != 0)            // one was set
         if (num_bit_depths < num_comps) // but if not enough, repeat
           for (ojph::ui32 c = num_bit_depths; c < num_comps; ++c)
@@ -840,11 +841,8 @@ int main(int argc, char * argv[]) {
           nlt.set_type3_transformation(c, true);
       }
       else
-        OJPH_ERROR(0x01000093, "The support for pfm image is not "
-          "complete; I need to figure how to modify the interface "
-          "to better support the exchange of floating point data. "
" - "Feeding float point data is not supported yet, unless it " - "is for lossless compression."); + OJPH_ERROR(0x01000093, "We currently support lossless only for " + "pfm images; this may change in the future."); codestream.set_planar(false); if (profile_string[0] != '\0') diff --git a/src/apps/others/ojph_img_io.cpp b/src/apps/others/ojph_img_io.cpp index d812025..05ce4df 100644 --- a/src/apps/others/ojph_img_io.cpp +++ b/src/apps/others/ojph_img_io.cpp @@ -329,9 +329,9 @@ namespace ojph { return; if (bytes_per_sample == 1) - temp_buf = alloc_p->post_alloc_data(num_comps * width, 0); + temp_buf = alloc_p->post_alloc_data(num_comps * (size_t)width, 0); else - temp_buf = alloc_p->post_alloc_data(num_comps * width, 0); + temp_buf = alloc_p->post_alloc_data(num_comps * (size_t)width, 0); } ///////////////////////////////////////////////////////////////////////////// @@ -408,7 +408,7 @@ namespace ojph { "unable to open file %s for writing", filename); fprintf(fh, "P5\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1); - buffer_size = width * bytes_per_sample; + buffer_size = (size_t)width * bytes_per_sample; buffer = (ui8*)malloc(buffer_size); } else @@ -435,7 +435,7 @@ namespace ojph { fprintf(fh, "P6\n%d %d\n%d\n", width, height, (1 << bit_depth) - 1); if (result == 0) OJPH_ERROR(0x03000027, "error writing to file %s", filename); - buffer_size = width * num_components * bytes_per_sample; + buffer_size = (size_t)width * num_components * (size_t)bytes_per_sample; buffer = (ui8*)malloc(buffer_size); } fname = filename; @@ -935,12 +935,12 @@ namespace ojph { // the first time trying to access this line if (PLANARCONFIG_SEPARATE == planar_configuration && 0 == comp_num ) { - for (unsigned short color = 0; color < num_comps; color++) + for (ui32 color = 0; color < num_comps; color++) { if (bytes_per_sample == 1) { TIFFReadScanline(tiff_handle, line_buffer_for_planar_support_uint8, - cur_line, color); + cur_line, (ui16)color); ui32 x = color; uint8_t* line_buffer_of_interleaved_components = (uint8_t*)line_buffer; @@ -953,7 +953,7 @@ namespace ojph { else if (bytes_per_sample == 2) { TIFFReadScanline(tiff_handle, line_buffer_for_planar_support_uint16, - cur_line, color); + cur_line, (ui16)color); ui32 x = color; ui16* line_buffer_of_interleaved_components = (ui16*)line_buffer; for (ui32 i = 0; i < width; i++, x += num_comps) @@ -1070,7 +1070,7 @@ namespace ojph { OJPH_ERROR(0x030000B3, "unable to open file %s for writing", filename); } - buffer_size = width * num_components * bytes_per_sample; + buffer_size = (size_t)width * num_components * (size_t)bytes_per_sample; buffer = (ui8*)malloc(buffer_size); fname = filename; cur_line = 0; @@ -1146,7 +1146,7 @@ namespace ojph { bytes_per_sample = 2; } samples_per_line = num_components * width; - bytes_per_line = bytes_per_sample * samples_per_line; + bytes_per_line = bytes_per_sample * (size_t)samples_per_line; } @@ -1482,7 +1482,7 @@ namespace ojph { cur_line = 0; bytes_per_sample = (bit_depth + 7) >> 3; - buffer_size = width * bytes_per_sample; + buffer_size = (size_t)width * bytes_per_sample; buffer = (ui8*)malloc(buffer_size); fname = filename; } @@ -1618,15 +1618,15 @@ namespace ojph { this->width = width; if (is_signed) { - upper_val = (1 << (bit_depth - 1)); - lower_val = -(1 << (bit_depth - 1)); + upper_val = (1LL << (bit_depth - 1)); + lower_val = -(1LL << (bit_depth - 1)); } else { - upper_val = 1 << bit_depth; - lower_val = 0; + upper_val = 1LL << bit_depth; + lower_val = 0LL; } bytes_per_sample = (bit_depth + 7) >> 3; - buffer_size = width 
+     buffer_size = (size_t)width * bytes_per_sample;
      buffer = (ui8*)malloc(buffer_size);
    }
@@ -1637,63 +1637,127 @@ namespace ojph {
      assert(fh);
      assert(comp_num == 0);

-     if (bytes_per_sample > 3)
+     if (is_signed)
      {
-       const si32* sp = line->i32;
-       ui32* dp = (ui32*)buffer;
-       for (ui32 i = width; i > 0; --i)
+       if (bytes_per_sample > 3)
        {
-         int val = *sp++;
-         val = val < upper_val ? val : upper_val;
-         val = val >= lower_val ? val : lower_val;
-         *dp++ = (ui32)val;
+         const si32* sp = line->i32;
+         si32* dp = (si32*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp++ = (si32)val;
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000151, "unable to write to file %s", fname);
        }
-       if (fwrite(buffer, bytes_per_sample, width, fh) != width)
-         OJPH_ERROR(0x03000151, "unable to write to file %s", fname);
-     }
-     else if (bytes_per_sample > 2)
-     {
-       const si32* sp = line->i32;
-       ui32* dp = (ui32*)buffer;
-       for (ui32 i = width; i > 0; --i)
+       else if (bytes_per_sample > 2)
        {
-         int val = *sp++;
-         val = val < upper_val ? val : upper_val;
-         val = val >= lower_val ? val : lower_val;
-         *dp = (ui32)val;
-         // this only works for little endian architecture
-         dp = (ui32*)((ui8*)dp + 3);
+         const si32* sp = line->i32;
+         si32* dp = (si32*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp = (si32)val;
+           // this only works for little endian architecture
+           dp = (si32*)((ui8*)dp + 3);
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000152, "unable to write to file %s", fname);
        }
-       if (fwrite(buffer, bytes_per_sample, width, fh) != width)
-         OJPH_ERROR(0x03000152, "unable to write to file %s", fname);
-     }
-     else if (bytes_per_sample > 1)
-     {
-       const si32* sp = line->i32;
-       ui16* dp = (ui16*)buffer;
-       for (ui32 i = width; i > 0; --i)
+       else if (bytes_per_sample > 1)
        {
-         int val = *sp++;
-         val = val < upper_val ? val : upper_val;
-         val = val >= lower_val ? val : lower_val;
-         *dp++ = (ui16)val;
+         const si32* sp = line->i32;
+         si16* dp = (si16*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp++ = (si16)val;
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000153, "unable to write to file %s", fname);
+       }
+       else
+       {
+         const si32* sp = line->i32;
+         si8* dp = (si8*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp++ = (si8)val;
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000154, "unable to write to file %s", fname);
+       }
      }
-      if (fwrite(buffer, bytes_per_sample, width, fh) != width)
-        OJPH_ERROR(0x03000153, "unable to write to file %s", fname);
-     }
-     else
+     else
      {
-       const si32* sp = line->i32;
-       ui8* dp = (ui8*)buffer;
-       for (ui32 i = width; i > 0; --i)
+       if (bytes_per_sample > 3)
        {
-         int val = *sp++;
-         val = val < upper_val ? val : upper_val;
-         val = val >= lower_val ? val : lower_val;
-         *dp++ = (ui8)val;
+         const ui32* sp = (ui32*)line->i32;
+         ui32* dp = (ui32*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp++ = (ui32)val;
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000155, "unable to write to file %s", fname);
+       }
+       else if (bytes_per_sample > 2)
+       {
+         const ui32* sp = (ui32*)line->i32;
+         ui32* dp = (ui32*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp = (ui32)val;
+           // this only works for little endian architecture
+           dp = (ui32*)((ui8*)dp + 3);
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000156, "unable to write to file %s", fname);
+       }
+       else if (bytes_per_sample > 1)
+       {
+         const ui32* sp = (ui32*)line->i32;
+         ui16* dp = (ui16*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp++ = (ui16)val;
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000157, "unable to write to file %s", fname);
+       }
+       else
+       {
+         const ui32* sp = (ui32*)line->i32;
+         ui8* dp = (ui8*)buffer;
+         for (ui32 i = width; i > 0; --i)
+         {
+           si64 val = *sp++;
+           val = val < upper_val ? val : upper_val;
+           val = val >= lower_val ? val : lower_val;
+           *dp++ = (ui8)val;
+         }
+         if (fwrite(buffer, bytes_per_sample, width, fh) != width)
+           OJPH_ERROR(0x03000158, "unable to write to file %s", fname);
+       }
-       if (fwrite(buffer, bytes_per_sample, width, fh) != width)
-         OJPH_ERROR(0x03000154, "unable to write to file %s", fname);
      }

      return width;
@@ -1940,11 +2004,11 @@ namespace ojph {

      // allocate line_buffer_16bit_samples to hold a line of image data in memory
      line_buffer_16bit_samples =
-       (ui16*) malloc(width * num_comps * sizeof(ui16));
+       (ui16*) malloc((size_t)width * num_comps * sizeof(ui16));
      if (NULL == line_buffer_16bit_samples)
        OJPH_ERROR(0x03000179, "Unable to allocate %d bytes for "
          "line_buffer_16bit_samples[] for file %s",
-         width * num_comps * sizeof(ui16), filename);
+         (size_t)width * num_comps * sizeof(ui16), filename);

      cur_line = 0;
diff --git a/src/core/codestream/ojph_bitbuffer_write.h b/src/core/codestream/ojph_bitbuffer_write.h
index d5b6bca..ecb9dd2 100644
--- a/src/core/codestream/ojph_bitbuffer_write.h
+++ b/src/core/codestream/ojph_bitbuffer_write.h
@@ -109,33 +109,25 @@ namespace ojph {
       }
     }

+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void bb_put_zeros(bit_write_buf *bbp, int num_zeros,
+                      mem_elastic_allocator *elastic,
+                      coded_lists*& cur_coded_list, ui32& ph_bytes)
+    {
+      for (int i = num_zeros; i > 0; --i)
+        bb_put_bit(bbp, 0, elastic, cur_coded_list, ph_bytes);
+    }
+
     //////////////////////////////////////////////////////////////////////////
     static inline
     void bb_put_bits(bit_write_buf *bbp, ui32 data, int num_bits,
                      mem_elastic_allocator *elastic,
                      coded_lists*& cur_coded_list, ui32& ph_bytes)
     {
-//      assert(num_bits <= 32);
-      for (int i = num_bits - 1; i >= 0; --i)
+      assert(num_bits <= 32);
+      for (int i = num_bits - 1; i >= 0; --i)
        bb_put_bit(bbp, data >> i, elastic, cur_coded_list, ph_bytes);
-//      while (num_bits) {
-//        int tx_bits = num_bits < bbp->avail_bits ? num_bits : bbp->avail_bits;
-//        bbp->tmp |= (data >> (num_bits - tx_bits)) & ((1 << tx_bits) - 1);
-//        bbp->avail_bits -= tx_bits;
-//        if (bbp->avail_bits <= 0)
-//        {
-//          bbp->avail_bits = 8 - (bbp->tmp != 0xFF ? 0 : 1);
-//          bbp->buf[bbp->buf_size - bbp->avail_size] = (ui8)(bbp->tmp & 0xFF);
-//          bbp->tmp = 0;
-//          --bbp->avail_size;
-//          if (bbp->avail_size == 0)
-//          {
-//            bb_expand_buf(bbp, elastic, cur_coded_list->next_list);
-//            cur_coded_list = cur_coded_list->next_list;
-//            ph_bytes += bit_buffer::needed;
-//          }
-//        }
-//      }
     }

     //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/codestream/ojph_codeblock.cpp b/src/core/codestream/ojph_codeblock.cpp
index 0915951..351284b 100644
--- a/src/core/codestream/ojph_codeblock.cpp
+++ b/src/core/codestream/ojph_codeblock.cpp
@@ -45,6 +45,7 @@
 #include "ojph_codestream_local.h"
 #include "ojph_codeblock.h"
 #include "ojph_subband.h"
+#include "ojph_resolution.h"

 namespace ojph {
@@ -52,7 +53,7 @@ namespace ojph {
   {

     //////////////////////////////////////////////////////////////////////////
-    void codeblock::pre_alloc(codestream *codestream,
+    void codeblock::pre_alloc(codestream *codestream, ui32 comp_num,
                               const size& nominal)
     {
       mem_fixed_allocator* allocator = codestream->get_allocator();
@@ -60,7 +61,14 @@ namespace ojph {
       assert(byte_alignment / sizeof(ui32) > 1);
       const ui32 f = byte_alignment / sizeof(ui32) - 1;
       ui32 stride = (nominal.w + f) & ~f; // a multiple of 8
-      allocator->pre_alloc_data<ui32>(nominal.h * stride, 0);
+
+      const param_siz* sz = codestream->get_siz();
+      const param_cod* cd = codestream->get_cod(comp_num);
+      ui32 precision = cd->propose_implementation_precision(sz);
+      if (precision <= 32)
+        allocator->pre_alloc_data<ui32>(nominal.h * (size_t)stride, 0);
+      else
+        allocator->pre_alloc_data<ui64>(nominal.h * (size_t)stride, 0);
     }

     //////////////////////////////////////////////////////////////////////////
@@ -75,7 +83,19 @@ namespace ojph {
       const ui32 f = byte_alignment / sizeof(ui32) - 1;
       this->stride = (nominal.w + f) & ~f; // a multiple of 8
       this->buf_size = this->stride * nominal.h;
-      this->buf = allocator->post_alloc_data<ui32>(this->buf_size, 0);
+
+      ui32 comp_num = parent->get_parent()->get_comp_num();
+      const param_siz* sz = codestream->get_siz();
+      const param_cod* cd = codestream->get_cod(comp_num);
+      ui32 bit_depth = cd->propose_implementation_precision(sz);
+      if (bit_depth <= 32) {
+        precision = BUF32;
+        this->buf32 = allocator->post_alloc_data<ui32>(this->buf_size, 0);
+      }
+      else {
+        precision = BUF64;
+        this->buf64 = allocator->post_alloc_data<ui64>(this->buf_size, 0);
+      }

       this->nominal_size = nominal;
       this->cb_size = cb_size;
@@ -85,8 +105,8 @@ namespace ojph {
       this->delta = parent->get_delta();
       this->delta_inv = 1.0f / this->delta;
       this->K_max = K_max;
-      for (int i = 0; i < 8; ++i)
-        this->max_val[i] = 0;
+      for (int i = 0; i < 4; ++i)
+        this->max_val64[i] = 0;
       ojph::param_cod cod = codestream->access_cod();
       this->reversible = cod.is_reversible();
       this->resilient = codestream->is_resilient();
@@ -100,28 +120,61 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void codeblock::push(line_buf *line)
     {
-      // convert to sign and magnitude and keep max_val
-      const si32 *sp = line->i32 + line_offset;
-      ui32 *dp = buf + cur_line * stride;
-      this->codeblock_functions.tx_to_cb(sp, dp, K_max, delta_inv, cb_size.w,
-        max_val);
-      ++cur_line;
+      // convert to sign and magnitude and keep max_val
+      if (precision == BUF32)
+      {
+        assert(line->flags & line_buf::LFT_32BIT);
+        const si32 *sp = line->i32 + line_offset;
+        ui32 *dp = buf32 + cur_line * stride;
+        this->codeblock_functions.tx_to_cb32(sp, dp, K_max, delta_inv,
+          cb_size.w, max_val32);
+        ++cur_line;
+      }
+      else
+      {
+        assert(precision == BUF64);
+        assert(line->flags & line_buf::LFT_64BIT);
+        const si64 *sp = line->i64 + line_offset;
+        ui64 *dp = buf64 + cur_line * stride;
+        this->codeblock_functions.tx_to_cb64(sp, dp, K_max, delta_inv,
+          cb_size.w, max_val64);
+        ++cur_line;
+      }
     }

     //////////////////////////////////////////////////////////////////////////
     void codeblock::encode(mem_elastic_allocator *elastic)
     {
-      ui32 mv = this->codeblock_functions.find_max_val(max_val);
-      if (mv >= 1u<<(31 - K_max))
+      if (precision == BUF32)
       {
-        coded_cb->missing_msbs = K_max - 1;
-        assert(coded_cb->missing_msbs > 0);
-        assert(coded_cb->missing_msbs < K_max);
-        coded_cb->num_passes = 1;
-
-        this->codeblock_functions.encode_cb(buf, K_max-1, 1,
-          cb_size.w, cb_size.h, stride, coded_cb->pass_length,
-          elastic, coded_cb->next_coded);
+        ui32 mv = this->codeblock_functions.find_max_val32(max_val32);
+        if (mv >= 1u << (31 - K_max))
+        {
+          coded_cb->missing_msbs = K_max - 1;
+          assert(coded_cb->missing_msbs > 0);
+          assert(coded_cb->missing_msbs < K_max);
+          coded_cb->num_passes = 1;
+
+          this->codeblock_functions.encode_cb32(buf32, K_max-1, 1,
+            cb_size.w, cb_size.h, stride, coded_cb->pass_length,
+            elastic, coded_cb->next_coded);
+        }
+      }
+      else
+      {
+        assert(precision == BUF64);
+        ui64 mv = this->codeblock_functions.find_max_val64(max_val64);
+        if (mv >= 1ULL << (63 - K_max))
+        {
+          coded_cb->missing_msbs = K_max - 1;
+          assert(coded_cb->missing_msbs > 0);
+          assert(coded_cb->missing_msbs < K_max);
+          coded_cb->num_passes = 1;
+
+          this->codeblock_functions.encode_cb64(buf64, K_max-1, 1,
+            cb_size.w, cb_size.h, stride, coded_cb->pass_length,
+            elastic, coded_cb->next_coded);
+        }
       }
     }

@@ -132,8 +185,8 @@ namespace ojph {
       this->cb_size = cb_size;
       this->coded_cb = coded_cb;
       this->cur_line = 0;
-      for (int i = 0; i < 8; ++i)
-        this->max_val[i] = 0;
+      for (int i = 0; i < 4; ++i)
+        this->max_val64[i] = 0;
       this->zero_block = false;
     }

@@ -143,11 +196,24 @@ namespace ojph {
       if (coded_cb->pass_length[0] > 0 && coded_cb->num_passes > 0 &&
          coded_cb->next_coded != NULL)
       {
-        bool result = this->codeblock_functions.decode_cb(
+        bool result;
+        if (precision == BUF32)
+        {
+          result = this->codeblock_functions.decode_cb32(
+            coded_cb->next_coded->buf + coded_cb_header::prefix_buf_size,
+            buf32, coded_cb->missing_msbs, coded_cb->num_passes,
+            coded_cb->pass_length[0], coded_cb->pass_length[1],
+            cb_size.w, cb_size.h, stride, stripe_causal);
+        }
+        else
+        {
+          assert(precision == BUF64);
+          result = this->codeblock_functions.decode_cb64(
             coded_cb->next_coded->buf + coded_cb_header::prefix_buf_size,
-            buf, coded_cb->missing_msbs, coded_cb->num_passes,
+            buf64, coded_cb->missing_msbs, coded_cb->num_passes,
             coded_cb->pass_length[0], coded_cb->pass_length[1],
             cb_size.w, cb_size.h, stride, stripe_causal);
+        }

         if (result == false)
         {
@@ -167,15 +233,35 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void codeblock::pull_line(line_buf *line)
     {
-      si32 *dp = line->i32 + line_offset;
-      if (!zero_block)
+      //convert to sign and magnitude
+      if (precision == BUF32)
       {
-        //convert to sign and magnitude
-        const ui32 *sp = buf + cur_line * stride;
-        this->codeblock_functions.tx_from_cb(sp, dp, K_max, delta, cb_size.w);
+        assert(line->flags & line_buf::LFT_32BIT);
+        si32 *dp = line->i32 + line_offset;
+        if (!zero_block)
+        {
+          const ui32 *sp = buf32 + cur_line * stride;
+          this->codeblock_functions.tx_from_cb32(sp, dp, K_max, delta,
+            cb_size.w);
+        }
+        else
+          this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(ui32));
       }
       else
-        this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp));
+      {
+        assert(precision == BUF64);
+        assert(line->flags & line_buf::LFT_64BIT);
+        si64 *dp = line->i64 + line_offset;
+        if (!zero_block)
+        {
+          const ui64 *sp = buf64 + cur_line * stride;
+          this->codeblock_functions.tx_from_cb64(sp, dp, K_max, delta,
+            cb_size.w);
+        }
+        else
+          this->codeblock_functions.mem_clear(dp, cb_size.w * sizeof(*dp));
+      }
+
       ++cur_line;
       assert(cur_line <= cb_size.h);
     }
diff --git a/src/core/codestream/ojph_codeblock.h b/src/core/codestream/ojph_codeblock.h
index 2f7d8e7..fde8e6a 100644
--- a/src/core/codestream/ojph_codeblock.h
+++ b/src/core/codestream/ojph_codeblock.h
@@ -48,7 +48,7 @@ namespace ojph {

   ////////////////////////////////////////////////////////////////////////////
   //defined elsewhere
-  struct line_buf;
+  class line_buf;
   class mem_elastic_allocator;
   class codestream;
   struct coded_lists;
@@ -65,8 +65,14 @@ namespace ojph {
     class codeblock
     {
       friend struct precinct;
+      enum : ui32 {
+        BUF32 = 4,
+        BUF64 = 8,
+      };
+
     public:
-      static void pre_alloc(codestream *codestream, const size& nominal);
+      static void pre_alloc(codestream *codestream, ui32 comp_num,
+                            const size& nominal);
       void finalize_alloc(codestream *codestream, subband* parent,
                           const size& nominal, const size& cb_size,
                           coded_cb_header* coded_cb,
@@ -79,7 +85,11 @@ namespace ojph {
       void pull_line(line_buf *line);

     private:
-      ui32* buf;
+      ui32 precision;
+      union {
+        ui32* buf32;
+        ui64* buf64;
+      };
       size nominal_size;
       size cb_size;
       ui32 stride;
@@ -93,7 +103,10 @@ namespace ojph {
       bool resilient;
       bool stripe_causal;
       bool zero_block; // true when the decoded block is all zero
-      ui32 max_val[8]; // supports up to 256 bits
+      union {
+        ui32 max_val32[8]; // supports up to 256 bits
+        ui64 max_val64[4]; // supports up to 256 bits
+      };
       coded_cb_header* coded_cb;
       codeblock_fun codeblock_functions;
     };
diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp
index 51253c1..08d8d73 100644
--- a/src/core/codestream/ojph_codeblock_fun.cpp
+++ b/src/core/codestream/ojph_codeblock_fun.cpp
@@ -63,72 +63,107 @@ namespace ojph {
     void wasm_mem_clear(void* addr, size_t count);

     //////////////////////////////////////////////////////////////////////////
-    ui32 gen_find_max_val(ui32* address);
-    ui32 sse2_find_max_val(ui32* address);
-    ui32 avx2_find_max_val(ui32* address);
-    ui32 wasm_find_max_val(ui32* address);
+    ui32 gen_find_max_val32(ui32* address);
+    ui32 sse2_find_max_val32(ui32* address);
+    ui32 avx2_find_max_val32(ui32* address);
+    ui32 wasm_find_max_val32(ui32* address);
+    ui64 gen_find_max_val64(ui64* address);
+    ui64 sse2_find_max_val64(ui64* address);
+    ui64 avx2_find_max_val64(ui64* address);
+    ui64 wasm_find_max_val64(ui64* address);
+
     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                          float delta_inv, ui32 count, ui32* max_val);
-    void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void avx2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void gen_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                          float delta_inv, ui32 count, ui32* max_val);
-    void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void avx2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void wasm_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
-    void wasm_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val);
+    void gen_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                            float delta_inv, ui32 count, ui32* max_val);
+    void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void gen_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                            float delta_inv, ui32 count, ui32* max_val);
+    void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+    void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val);
+
+    void gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                            float delta_inv, ui32 count, ui64* max_val);
+    void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val);
+    void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val);
+    void wasm_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val);

     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                            float delta, ui32 count);
-    void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void avx2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void gen_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                            float delta, ui32 count);
-    void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void avx2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void wasm_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
-    void wasm_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count);
+    void gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count);
+    void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void gen_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count);
+    void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count);
+    void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);
+    void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count);

     void codeblock_fun::init(bool reversible) {
 #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
       // Default path, no acceleration. We may change this later
-      decode_cb = ojph_decode_codeblock;
-      find_max_val = gen_find_max_val;
+      decode_cb32 = ojph_decode_codeblock32;
+      find_max_val32 = gen_find_max_val32;
       mem_clear = gen_mem_clear;
       if (reversible) {
-        tx_to_cb = gen_rev_tx_to_cb;
-        tx_from_cb = gen_rev_tx_from_cb;
+        tx_to_cb32 = gen_rev_tx_to_cb32;
+        tx_from_cb32 = gen_rev_tx_from_cb32;
       }
       else
       {
-        tx_to_cb = gen_irv_tx_to_cb;
-        tx_from_cb = gen_irv_tx_from_cb;
+        tx_to_cb32 = gen_irv_tx_to_cb32;
+        tx_from_cb32 = gen_irv_tx_from_cb32;
       }
-      encode_cb = ojph_encode_codeblock;
+      encode_cb32 = ojph_encode_codeblock32;
+
+      decode_cb64 = ojph_decode_codeblock64;
+      find_max_val64 = gen_find_max_val64;
+      if (reversible) {
+        tx_to_cb64 = gen_rev_tx_to_cb64;
+        tx_from_cb64 = gen_rev_tx_from_cb64;
+      }
+      else
+      {
+        tx_to_cb64 = NULL;
+        tx_from_cb64 = NULL;
+      }
+      encode_cb64 = ojph_encode_codeblock64;

 #ifndef OJPH_DISABLE_SIMD

 #if (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))

-      // Accelerated functions for INTEL/AMD CPUs
+      // Accelerated functions for INTEL/AMD CPUs
 #ifndef OJPH_DISABLE_SSE
       if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE)
         mem_clear = sse_mem_clear;
@@ -136,21 +171,31 @@ namespace ojph {
 #ifndef OJPH_DISABLE_SSE2
       if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2)
       {
-        find_max_val = sse2_find_max_val;
+        find_max_val32 = sse2_find_max_val32;
         if (reversible) {
-          tx_to_cb = sse2_rev_tx_to_cb;
-          tx_from_cb = sse2_rev_tx_from_cb;
+          tx_to_cb32 = sse2_rev_tx_to_cb32;
+          tx_from_cb32 = sse2_rev_tx_from_cb32;
         }
         else
         {
-          tx_to_cb = sse2_irv_tx_to_cb;
-          tx_from_cb = sse2_irv_tx_from_cb;
+          tx_to_cb32 = sse2_irv_tx_to_cb32;
+          tx_from_cb32 = sse2_irv_tx_from_cb32;
+        }
+        find_max_val64 = sse2_find_max_val64;
+        if (reversible) {
+          tx_to_cb64 = sse2_rev_tx_to_cb64;
+          tx_from_cb64 = sse2_rev_tx_from_cb64;
+        }
+        else
+        {
+          tx_to_cb64 = NULL;
+          tx_from_cb64 = NULL;
         }
       }
 #endif // !OJPH_DISABLE_SSE2

 #ifndef OJPH_DISABLE_SSSE3
       if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSSE3)
-        decode_cb = ojph_decode_codeblock_ssse3;
+        decode_cb32 = ojph_decode_codeblock_ssse3;
 #endif // !OJPH_DISABLE_SSSE3

 #ifndef OJPH_DISABLE_AVX
@@ -160,23 +205,39 @@ namespace ojph {
 #ifndef OJPH_DISABLE_AVX2
       if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2)
       {
-        find_max_val = avx2_find_max_val;
+        decode_cb32 = ojph_decode_codeblock_avx2;
+        find_max_val32 = avx2_find_max_val32;
         if (reversible) {
-          tx_to_cb = avx2_rev_tx_to_cb;
-          tx_from_cb = avx2_rev_tx_from_cb;
+          tx_to_cb32 = avx2_rev_tx_to_cb32;
+          tx_from_cb32 = avx2_rev_tx_from_cb32;
         }
         else
         {
-          tx_to_cb = avx2_irv_tx_to_cb;
-          tx_from_cb = avx2_irv_tx_from_cb;
+          tx_to_cb32 = avx2_irv_tx_to_cb32;
+          tx_from_cb32 = avx2_irv_tx_from_cb32;
+        }
+        encode_cb32 = ojph_encode_codeblock_avx2;
+        bool result = initialize_block_encoder_tables_avx2();
+        assert(result); ojph_unused(result);
+
+        find_max_val64 = avx2_find_max_val64;
+        if (reversible) {
+          tx_to_cb64 = avx2_rev_tx_to_cb64;
+          tx_from_cb64 = avx2_rev_tx_from_cb64;
+        }
+        else
+        {
+          tx_to_cb64 = NULL;
+          tx_from_cb64 = NULL;
         }
-        encode_cb = ojph_encode_codeblock_avx2;
-        decode_cb = ojph_decode_codeblock_avx2;
       }
 #endif // !OJPH_DISABLE_AVX2

 #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512))
-      if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512)
-        encode_cb = ojph_encode_codeblock_avx512;
+      if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) {
+        encode_cb32 = ojph_encode_codeblock_avx512;
+        bool result = initialize_block_encoder_tables_avx512();
+        assert(result); ojph_unused(result);
+      }
 #endif // !OJPH_DISABLE_AVX512

 #elif defined(OJPH_ARCH_ARM)
@@ -188,18 +249,31 @@ namespace ojph {
 #else // OJPH_ENABLE_WASM_SIMD

       // Accelerated functions for WASM SIMD.
-      decode_cb = ojph_decode_codeblock_wasm;
-      find_max_val = wasm_find_max_val;
+      decode_cb32 = ojph_decode_codeblock_wasm;
+      find_max_val32 = wasm_find_max_val32;
       mem_clear = wasm_mem_clear;
       if (reversible) {
-        tx_to_cb = wasm_rev_tx_to_cb;
-        tx_from_cb = wasm_rev_tx_from_cb;
+        tx_to_cb32 = wasm_rev_tx_to_cb32;
+        tx_from_cb32 = wasm_rev_tx_from_cb32;
       }
       else
       {
-        tx_to_cb = wasm_irv_tx_to_cb;
-        tx_from_cb = wasm_irv_tx_from_cb;
+        tx_to_cb32 = wasm_irv_tx_to_cb32;
+        tx_from_cb32 = wasm_irv_tx_from_cb32;
+      }
+      encode_cb32 = ojph_encode_codeblock32;
+
+      decode_cb64 = ojph_decode_codeblock64;
+      find_max_val64 = wasm_find_max_val64;
+      if (reversible) {
+        tx_to_cb64 = wasm_rev_tx_to_cb64;
+        tx_from_cb64 = wasm_rev_tx_from_cb64;
+      }
+      else
+      {
+        tx_to_cb64 = NULL;
+        tx_from_cb64 = NULL;
       }
-      encode_cb = ojph_encode_codeblock;
+      encode_cb64 = ojph_encode_codeblock64;

 #endif // !OJPH_ENABLE_WASM_SIMD
diff --git a/src/core/codestream/ojph_codeblock_fun.h b/src/core/codestream/ojph_codeblock_fun.h
index 679b2d3..67fbc2b 100644
--- a/src/core/codestream/ojph_codeblock_fun.h
+++ b/src/core/codestream/ojph_codeblock_fun.h
@@ -51,23 +51,40 @@ namespace ojph {
     typedef void (*mem_clear_fun)(void* addr, size_t count);

     // define function signature for max value finding
-    typedef ui32 (*find_max_val_fun)(ui32* addr);
+    typedef ui32 (*find_max_val_fun32)(ui32* addr);
+
+    typedef ui64 (*find_max_val_fun64)(ui64* addr);

     // define line transfer function signature from subbands to codeblocks
-    typedef void (*tx_to_cb_fun)(const void *sp, ui32 *dp, ui32 K_max,
+    typedef void (*tx_to_cb_fun32)(const void *sp, ui32 *dp, ui32 K_max,
                                  float delta_inv, ui32 count, ui32* max_val);

+    typedef void (*tx_to_cb_fun64)(const void *sp, ui64 *dp, ui32 K_max,
+                                   float delta_inv, ui32 count, ui64* max_val);
+
     // define line transfer function signature from codeblock to subband
-    typedef void (*tx_from_cb_fun)(const ui32 *sp, void *dp, ui32 K_max,
+    typedef void (*tx_from_cb_fun32)(const ui32 *sp, void *dp, ui32 K_max,
+                                     float delta, ui32 count);
+
+    typedef void (*tx_from_cb_fun64)(const ui64 *sp, void *dp, ui32 K_max,
                                    float delta, ui32 count);

     // define the block decoder function signature
-    typedef bool (*cb_decoder_fun)(ui8* coded_data, ui32* decoded_data,
+    typedef bool (*cb_decoder_fun32)(ui8* coded_data, ui32* decoded_data,
+      ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2,
+      ui32 width, ui32 height, ui32 stride, bool stripe_causal);
+
+    typedef bool (*cb_decoder_fun64)(ui8* coded_data, ui64* decoded_data,
       ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2,
       ui32 width, ui32 height, ui32 stride, bool stripe_causal);

     // define the block encoder function signature
-    typedef void (*cb_encoder_fun)(ui32* buf, ui32 missing_msbs,
+    typedef void (*cb_encoder_fun32)(ui32* buf, ui32 missing_msbs,
+      ui32 num_passes, ui32 width, ui32 height, ui32 stride,
+      ui32* lengths, ojph::mem_elastic_allocator* elastic,
+      ojph::coded_lists*& coded);
+
+    typedef void (*cb_encoder_fun64)(ui64* buf, ui32 missing_msbs,
       ui32 num_passes, ui32 width, ui32 height, ui32 stride,
       ui32* lengths, ojph::mem_elastic_allocator* elastic,
       ojph::coded_lists*& coded);
@@ -81,19 +98,24 @@ namespace ojph {
       mem_clear_fun mem_clear;

       // a pointer to the max value finding function
-      find_max_val_fun find_max_val;
+      find_max_val_fun32 find_max_val32;
+      find_max_val_fun64 find_max_val64;
       // a pointer to function transferring samples from subbands to codeblocks
-      tx_to_cb_fun tx_to_cb;
+      tx_to_cb_fun32 tx_to_cb32;
+      tx_to_cb_fun64 tx_to_cb64;

       // a pointer to function transferring samples from codeblocks to subbands
-      tx_from_cb_fun tx_from_cb;
+      tx_from_cb_fun32 tx_from_cb32;
+      tx_from_cb_fun64 tx_from_cb64;

       // a pointer to the decoder function
-      cb_decoder_fun decode_cb;
+      cb_decoder_fun32 decode_cb32;
+      cb_decoder_fun64 decode_cb64;

       // a pointer to the encoder function
-      cb_encoder_fun encode_cb;
+      cb_encoder_fun32 encode_cb32;
+      cb_encoder_fun64 encode_cb64;
     };
   }
diff --git a/src/core/codestream/ojph_codestream_avx2.cpp b/src/core/codestream/ojph_codestream_avx2.cpp
index 04a81ed..a8e5138 100644
--- a/src/core/codestream/ojph_codestream_avx2.cpp
+++ b/src/core/codestream/ojph_codestream_avx2.cpp
@@ -35,6 +35,7 @@
 // Date: 15 May 2022
 //***************************************************************************/

+#include <climits>
 #include <immintrin.h>

 #include "ojph_defs.h"
@@ -42,7 +43,7 @@ namespace ojph {
   namespace local {

     //////////////////////////////////////////////////////////////////////////
-    ui32 avx2_find_max_val(ui32* address)
+    ui32 avx2_find_max_val32(ui32* address)
     {
       __m128i x0 = _mm_loadu_si128((__m128i*)address);
       __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
@@ -56,14 +57,26 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    ui64 avx2_find_max_val64(ui64* address)
+    {
+      __m128i x0 = _mm_loadu_si128((__m128i*)address);
+      __m128i x1 = _mm_loadu_si128((__m128i*)address + 1);
+      x0 = _mm_or_si128(x0, x1);
+      x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
+      x0 = _mm_or_si128(x0, x1);
+      ui64 t = (ui64)_mm_extract_epi64(x0, 0);
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(delta_inv);

       // convert to sign and magnitude and keep max_val
       ui32 shift = 31 - K_max;
-      __m256i m0 = _mm256_set1_epi32((int)0x80000000);
+      __m256i m0 = _mm256_set1_epi32(INT_MIN);
       __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
       __m256i *p = (__m256i*)sp;
       for (ui32 i = 0; i < count; i += 8, p += 1, dp += 8)
@@ -78,16 +91,16 @@ namespace ojph {
       }
       _mm256_storeu_si256((__m256i*)max_val, tmax);
     }
-
+
     //////////////////////////////////////////////////////////////////////////
-    void avx2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    void avx2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(K_max);

       //quantize and convert to sign and magnitude and keep max_val
       __m256 d = _mm256_set1_ps(delta_inv);
-      __m256i m0 = _mm256_set1_epi32((int)0x80000000);
+      __m256i m0 = _mm256_set1_epi32(INT_MIN);
       __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);

       float *p = (float*)sp;
@@ -106,29 +119,29 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count)
+    void avx2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
     {
       ojph_unused(delta);
       ui32 shift = 31 - K_max;
-      __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF);
+      __m256i m1 = _mm256_set1_epi32(INT_MAX);
       si32 *p = (si32*)dp;
       for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
       {
-        __m256i v = _mm256_load_si256((__m256i*)sp);
-        __m256i val = _mm256_and_si256(v, m1);
-        val = _mm256_srli_epi32(val, (int)shift);
-        val = _mm256_sign_epi32(val, v);
-        _mm256_storeu_si256((__m256i*)p, val);
+        __m256i v = _mm256_load_si256((__m256i*)sp);
+        __m256i val = _mm256_and_si256(v, m1);
+        val = _mm256_srli_epi32(val, (int)shift);
+        val = _mm256_sign_epi32(val, v);
+        _mm256_storeu_si256((__m256i*)p, val);
       }
     }

     //////////////////////////////////////////////////////////////////////////
-    void avx2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count)
+    void avx2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
     {
       ojph_unused(K_max);
-      __m256i m1 = _mm256_set1_epi32(0x7FFFFFFF);
+      __m256i m1 = _mm256_set1_epi32(INT_MAX);
       __m256 d = _mm256_set1_ps(delta);
       float *p = (float*)dp;
       for (ui32 i = 0; i < count; i += 8, sp += 8, p += 8)
@@ -142,5 +155,58 @@ namespace ojph {
         _mm256_storeu_ps(p, valf);
       }
     }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val)
+    {
+      ojph_unused(delta_inv);
+
+      // convert to sign and magnitude and keep max_val
+      ui32 shift = 63 - K_max;
+      __m256i m0 = _mm256_set1_epi64x(LLONG_MIN);
+      __m256i zero = _mm256_setzero_si256();
+      __m256i one = _mm256_set1_epi64x(1);
+      __m256i tmax = _mm256_loadu_si256((__m256i*)max_val);
+      __m256i *p = (__m256i*)sp;
+      for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
+      {
+        __m256i v = _mm256_loadu_si256(p);
+        __m256i sign = _mm256_cmpgt_epi64(zero, v);
+        __m256i val = _mm256_xor_si256(v, sign); // negate 1's complement
+        __m256i ones = _mm256_and_si256(sign, one);
+        val = _mm256_add_epi64(val, ones);       // 2's complement
+        sign = _mm256_and_si256(sign, m0);
+        val = _mm256_slli_epi64(val, (int)shift);
+        tmax = _mm256_or_si256(tmax, val);
+        val = _mm256_or_si256(val, sign);
+        _mm256_storeu_si256((__m256i*)dp, val);
+      }
+      _mm256_storeu_si256((__m256i*)max_val, tmax);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
+    {
+      ojph_unused(delta);
+
+      ui32 shift = 63 - K_max;
+      __m256i m1 = _mm256_set1_epi64x(LLONG_MAX);
+      __m256i zero = _mm256_setzero_si256();
+      __m256i one = _mm256_set1_epi64x(1);
+      si64 *p = (si64*)dp;
+      for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
+      {
+        __m256i v = _mm256_load_si256((__m256i*)sp);
+        __m256i val = _mm256_and_si256(v, m1);
+        val = _mm256_srli_epi64(val, (int)shift);
+        __m256i sign = _mm256_cmpgt_epi64(zero, v);
+        val = _mm256_xor_si256(val, sign); // negate 1's complement
+        __m256i ones = _mm256_and_si256(sign, one);
+        val = _mm256_add_epi64(val, ones); // 2's complement
+        _mm256_storeu_si256((__m256i*)p, val);
+      }
+    }
   }
-}
\ No newline at end of file
+}
diff --git a/src/core/codestream/ojph_codestream_gen.cpp b/src/core/codestream/ojph_codestream_gen.cpp
index 466f483..cdc72c6 100644
--- a/src/core/codestream/ojph_codestream_gen.cpp
+++ b/src/core/codestream/ojph_codestream_gen.cpp
@@ -44,18 +44,21 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void gen_mem_clear(void* addr, size_t count)
     {
-      ui32* p = (ui32*)addr;
-      for (size_t i = 0; i < count; i += 4, p += 1)
-        *p = 0;
+      si64* p = (si64*)addr;
+      for (size_t i = 0; i < count; i += 8)
+        *p++ = 0;
     }

     //////////////////////////////////////////////////////////////////////////
-    ui32 gen_find_max_val(ui32* addr) { return addr[0]; }
+    ui32 gen_find_max_val32(ui32* addr) { return addr[0]; }

     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                          float delta_inv, ui32 count,
-                          ui32* max_val)
+    ui64 gen_find_max_val64(ui64* addr) { return addr[0]; }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                            float delta_inv, ui32 count,
+                            ui32* max_val)
     {
       ojph_unused(delta_inv);
       ui32 shift = 31 - K_max;
@@ -65,7 +68,7 @@ namespace ojph {
       for (ui32 i = count; i > 0; --i)
       {
         si32 v = *p++;
-        ui32 sign = v >= 0 ? 0 : 0x80000000;
+        ui32 sign = v >= 0 ? 0U : 0x80000000U;
         ui32 val = (ui32)(v >= 0 ? v : -v);
         val <<= shift;
         *dp++ = sign | val;
@@ -75,9 +78,31 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                          float delta_inv, ui32 count,
-                          ui32* max_val)
+    void gen_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                            float delta_inv, ui32 count,
+                            ui64* max_val)
+    {
+      ojph_unused(delta_inv);
+      ui32 shift = 63 - K_max;
+      // convert to sign and magnitude and keep max_val
+      ui64 tmax = *max_val;
+      si64 *p = (si64*)sp;
+      for (ui32 i = count; i > 0; --i)
+      {
+        si64 v = *p++;
+        ui64 sign = v >= 0 ? 0ULL : 0x8000000000000000ULL;
+        ui64 val = (ui64)(v >= 0 ? v : -v);
+        val <<= shift;
+        *dp++ = sign | val;
+        tmax |= val; // it is more efficient to use or than max
+      }
+      *max_val = tmax;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                            float delta_inv, ui32 count,
+                            ui32* max_val)
     {
       ojph_unused(K_max);
       //quantize and convert to sign and magnitude and keep max_val
@@ -87,7 +112,7 @@ namespace ojph {
       {
         float v = *p++;
         si32 t = ojph_trunc(v * delta_inv);
-        ui32 sign = t >= 0 ? 0 : 0x80000000;
+        ui32 sign = t >= 0 ? 0U : 0x80000000U;
         ui32 val = (ui32)(t >= 0 ? t : -t);
         *dp++ = sign | val;
         tmax |= val; // it is more efficient to use or than max
@@ -96,8 +121,8 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                            float delta, ui32 count)
+    void gen_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count)
     {
       ojph_unused(delta);
       ui32 shift = 31 - K_max;
@@ -106,14 +131,30 @@ namespace ojph {
       for (ui32 i = count; i > 0; --i)
       {
         ui32 v = *sp++;
-        si32 val = (v & 0x7FFFFFFF) >> shift;
-        *p++ = (v & 0x80000000) ? -val : val;
+        si32 val = (v & 0x7FFFFFFFU) >> shift;
+        *p++ = (v & 0x80000000U) ? -val : val;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count)
+    {
+      ojph_unused(delta);
+      ui32 shift = 63 - K_max;
+      //convert to sign and magnitude
+      si64 *p = (si64*)dp;
+      for (ui32 i = count; i > 0; --i)
+      {
+        ui64 v = *sp++;
+        si64 val = (v & 0x7FFFFFFFFFFFFFFFULL) >> shift;
+        *p++ = (v & 0x8000000000000000ULL) ? -val : val;
       }
     }

     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                            float delta, ui32 count)
+    void gen_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                              float delta, ui32 count)
     {
       ojph_unused(K_max);
       //convert to sign and magnitude
@@ -121,8 +162,8 @@ namespace ojph {
       for (ui32 i = count; i > 0; --i)
       {
         ui32 v = *sp++;
-        float val = (float)(v & 0x7FFFFFFF) * delta;
-        *p++ = (v & 0x80000000) ? -val : val;
+        float val = (float)(v & 0x7FFFFFFFU) * delta;
+        *p++ = (v & 0x80000000U) ? -val : val;
       }
     }
diff --git a/src/core/codestream/ojph_codestream_local.h b/src/core/codestream/ojph_codestream_local.h
index e6930d5..3d03658 100644
--- a/src/core/codestream/ojph_codestream_local.h
+++ b/src/core/codestream/ojph_codestream_local.h
@@ -46,7 +46,7 @@ namespace ojph {

   ////////////////////////////////////////////////////////////////////////////
   //defined elsewhere
-  struct line_buf;
+  class line_buf;
   class mem_fixed_allocator;
   class mem_elastic_allocator;
   class codestream;
diff --git a/src/core/codestream/ojph_codestream_sse.cpp b/src/core/codestream/ojph_codestream_sse.cpp
index 7c64ad9..6a31cbd 100644
--- a/src/core/codestream/ojph_codestream_sse.cpp
+++ b/src/core/codestream/ojph_codestream_sse.cpp
@@ -49,6 +49,5 @@ namespace ojph {
       for (size_t i = 0; i < count; i += 16, p += 4)
         _mm_storeu_ps(p, zero);
     }
-
   }
 }
\ No newline at end of file
diff --git a/src/core/codestream/ojph_codestream_sse2.cpp b/src/core/codestream/ojph_codestream_sse2.cpp
index 9bb0643..3352bcd 100644
--- a/src/core/codestream/ojph_codestream_sse2.cpp
+++ b/src/core/codestream/ojph_codestream_sse2.cpp
@@ -35,6 +35,7 @@
 // Date: 15 May 2022
 //***************************************************************************/

+#include <climits>
 #include <emmintrin.h>

 #include "ojph_defs.h"
@@ -42,7 +43,7 @@ namespace ojph {
   namespace local {

     //////////////////////////////////////////////////////////////////////////
-    ui32 sse2_find_max_val(ui32* address)
+    ui32 sse2_find_max_val32(ui32* address)
     {
       __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
       x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
@@ -59,14 +60,29 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    ui64 sse2_find_max_val64(ui64* address)
+    {
+      __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
+      x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
+      x0 = _mm_or_si128(x0, x1);
+      _mm_storeu_si128((__m128i*)address, x0);
+      return *address;
+      // A single movd t, xmm0 can do the trick, but it is not available
+      // in SSE2 intrinsics. extract_epi32 is available in sse4.1
+      // ui32 t = (ui32)_mm_extract_epi16(x0, 0);
+      // t |= (ui32)_mm_extract_epi16(x0, 1) << 16;
+      // return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(delta_inv);

       // convert to sign and magnitude and keep max_val
       ui32 shift = 31 - K_max;
-      __m128i m0 = _mm_set1_epi32((int)0x80000000);
+      __m128i m0 = _mm_set1_epi32(INT_MIN);
       __m128i zero = _mm_setzero_si128();
       __m128i one = _mm_set1_epi32(1);
       __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
@@ -88,8 +104,8 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    void sse2_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(K_max);
@@ -118,34 +134,34 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count)
+    void sse2_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
     {
       ojph_unused(delta);
       ui32 shift = 31 - K_max;
-      __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
+      __m128i m1 = _mm_set1_epi32(INT_MAX);
       __m128i zero = _mm_setzero_si128();
       __m128i one = _mm_set1_epi32(1);
       si32 *p = (si32*)dp;
       for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
       {
-        __m128i v = _mm_load_si128((__m128i*)sp);
-        __m128i val = _mm_and_si128(v, m1);
-        val = _mm_srli_epi32(val, (int)shift);
-        __m128i sign = _mm_cmplt_epi32(v, zero);
-        val = _mm_xor_si128(val, sign); // negate 1's complement
-        __m128i ones = _mm_and_si128(sign, one);
-        val = _mm_add_epi32(val, ones); // 2's complement
-        _mm_storeu_si128((__m128i*)p, val);
+        __m128i v = _mm_load_si128((__m128i*)sp);
+        __m128i val = _mm_and_si128(v, m1);
+        val = _mm_srli_epi32(val, (int)shift);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        val = _mm_xor_si128(val, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi32(val, ones); // 2's complement
+        _mm_storeu_si128((__m128i*)p, val);
       }
     }

     //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
-                             float delta, ui32 count)
+    void sse2_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
     {
       ojph_unused(K_max);
-      __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
+      __m128i m1 = _mm_set1_epi32(INT_MAX);
       __m128 d = _mm_set1_ps(delta);
       float *p = (float*)dp;
       for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
@@ -159,5 +175,59 @@ namespace ojph {
         _mm_storeu_ps(p, valf);
       }
     }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui64* max_val)
+    {
+      ojph_unused(delta_inv);
+
+      // convert to sign and magnitude and keep max_val
+      ui32 shift = 63 - K_max;
+      __m128i m0 = _mm_set1_epi64x(LLONG_MIN);
+      __m128i zero = _mm_setzero_si128();
+      __m128i one = _mm_set1_epi64x(1);
+      __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
+      __m128i *p = (__m128i*)sp;
+      for (ui32 i = 0; i < count; i += 2, p += 1, dp += 2)
+      {
+        __m128i v = _mm_loadu_si128(p);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
+        __m128i val = _mm_xor_si128(v, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi64(val, ones);       // 2's complement
+        sign = _mm_and_si128(sign, m0);
+        val = _mm_slli_epi64(val, (int)shift);
+        tmax = _mm_or_si128(tmax, val);
+        val = _mm_or_si128(val, sign);
+        _mm_storeu_si128((__m128i*)dp, val);
+      }
+      _mm_storeu_si128((__m128i*)max_val, tmax);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max,
+                               float delta, ui32 count)
+    {
+      ojph_unused(delta);
+      ui32 shift = 63 - K_max;
+      __m128i m1 = _mm_set1_epi64x(LLONG_MAX);
+      __m128i zero = _mm_setzero_si128();
+      __m128i one = _mm_set1_epi64x(1);
+      si64 *p = (si64*)dp;
+      for (ui32 i = 0; i < count; i += 2, sp += 2, p += 2)
+      {
+        __m128i v = _mm_load_si128((__m128i*)sp);
+        __m128i val = _mm_and_si128(v, m1);
+        val = _mm_srli_epi64(val, (int)shift);
+        __m128i sign = _mm_cmplt_epi32(v, zero);
+        sign = _mm_shuffle_epi32(sign, 0xF5); // sign = sign[1,1,3,3];
+        val = _mm_xor_si128(val, sign); // negate 1's complement
+        __m128i ones = _mm_and_si128(sign, one);
+        val = _mm_add_epi64(val, ones); // 2's complement
+        _mm_storeu_si128((__m128i*)p, val);
+      }
+    }
   }
 }
\ No newline at end of file
diff --git a/src/core/codestream/ojph_codestream_wasm.cpp b/src/core/codestream/ojph_codestream_wasm.cpp
index 19e47aa..e2cd444 100644
--- a/src/core/codestream/ojph_codestream_wasm.cpp
+++ b/src/core/codestream/ojph_codestream_wasm.cpp
@@ -35,6 +35,7 @@
 // Date: 15 May 2022
 //***************************************************************************/

+#include <climits>
 #include <cstdlib>
 #include <wasm_simd128.h>
@@ -43,20 +44,17 @@ namespace ojph {
   namespace local {

-    //////////////////////////////////////////////////////////////////////////
-    #define REPEAT(a) a,a,a,a
-
     //////////////////////////////////////////////////////////////////////////
     void wasm_mem_clear(void* addr, size_t count)
     {
       float* p = (float*)addr;
-      v128_t zero = wasm_i32x4_const(REPEAT(0));
+      v128_t zero = wasm_i32x4_splat(0);
       for (size_t i = 0; i < count; i += 16, p += 4)
         wasm_v128_store(p, zero);
     }

     //////////////////////////////////////////////////////////////////////////
-    ui32 wasm_find_max_val(ui32* address)
+    ui32 wasm_find_max_val32(ui32* address)
     {
       v128_t x1, x0 = wasm_v128_load(address);
       x1 = wasm_i32x4_shuffle(x0, x0, 2, 3, 2, 3); // x1 = x0[2,3,2,3]
@@ -68,19 +66,29 @@ namespace ojph {
     }

     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
-                           float delta_inv, ui32 count, ui32* max_val)
+    ui64 wasm_find_max_val64(ui64* address)
+    {
+      v128_t x1, x0 = wasm_v128_load(address);
+      x1 = wasm_i64x2_shuffle(x0, x0, 1, 1); // x1 = x0[1,1]
+      x0 = wasm_v128_or(x0, x1);
+      ui64 t = (ui64)wasm_i64x2_extract_lane(x0, 0);
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_rev_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max,
+                             float delta_inv, ui32 count, ui32* max_val)
     {
       ojph_unused(delta_inv);

       // convert to sign and magnitude and keep max_val
       ui32 shift = 31 - K_max;
-      v128_t m0 = wasm_i32x4_const(REPEAT((int)0x80000000));
-      v128_t zero = wasm_i32x4_const(REPEAT(0));
-      v128_t one = wasm_i32x4_const(REPEAT(1));
+      v128_t m0 = wasm_i32x4_splat(INT_MIN);
+      v128_t zero = wasm_i32x4_splat(0);
+      v128_t one = wasm_i32x4_splat(1);
       v128_t tmax = wasm_v128_load(max_val);
-      v128_t *p = (v128_t*)sp;
-      for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
+      si32 *p = (si32*)sp;
+      for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
(ui32 i = 0; i < count; i += 4, p += 4, dp += 4) { v128_t v = wasm_v128_load(p); v128_t sign = wasm_i32x4_lt(v, zero); @@ -97,16 +105,16 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, - float delta_inv, ui32 count, ui32* max_val) + void wasm_irv_tx_to_cb32(const void *sp, ui32 *dp, ui32 K_max, + float delta_inv, ui32 count, ui32* max_val) { ojph_unused(K_max); //quantize and convert to sign and magnitude and keep max_val v128_t d = wasm_f32x4_splat(delta_inv); - v128_t zero = wasm_i32x4_const(REPEAT(0)); - v128_t one = wasm_i32x4_const(REPEAT(1)); + v128_t zero = wasm_i32x4_splat(0); + v128_t one = wasm_i32x4_splat(1); v128_t tmax = wasm_v128_load(max_val); float *p = (float*)sp; for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4) @@ -127,14 +135,14 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void wasm_rev_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(delta); ui32 shift = 31 - K_max; - v128_t m1 = wasm_i32x4_const(REPEAT(0x7FFFFFFF)); - v128_t zero = wasm_i32x4_const(REPEAT(0)); - v128_t one = wasm_i32x4_const(REPEAT(1)); + v128_t m1 = wasm_i32x4_splat(INT_MAX); + v128_t zero = wasm_i32x4_splat(0); + v128_t one = wasm_i32x4_splat(1); si32 *p = (si32*)dp; for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) { @@ -150,11 +158,11 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, - float delta, ui32 count) + void wasm_irv_tx_from_cb32(const ui32 *sp, void *dp, ui32 K_max, + float delta, ui32 count) { ojph_unused(K_max); - v128_t m1 = wasm_i32x4_const(REPEAT(0x7FFFFFFF)); + v128_t m1 = wasm_i32x4_splat(INT_MAX); v128_t d = wasm_f32x4_splat(delta); float *p = (float*)dp; for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4) @@ -167,6 +175,58 @@ namespace ojph { valf = wasm_v128_or(valf, sign); wasm_v128_store(p, valf); } - } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_tx_to_cb64(const void *sp, ui64 *dp, ui32 K_max, + float delta_inv, ui32 count, ui64* max_val) + { + ojph_unused(delta_inv); + + // convert to sign and magnitude and keep max_val + ui32 shift = 63 - K_max; + v128_t m0 = wasm_i64x2_splat(LLONG_MIN); + v128_t zero = wasm_i64x2_splat(0); + v128_t one = wasm_i64x2_splat(1); + v128_t tmax = wasm_v128_load(max_val); + si64 *p = (si64*)sp; + for (ui32 i = 0; i < count; i += 2, p += 2, dp += 2) + { + v128_t v = wasm_v128_load(p); + v128_t sign = wasm_i64x2_lt(v, zero); + v128_t val = wasm_v128_xor(v, sign); // negate 1's complement + v128_t ones = wasm_v128_and(sign, one); + val = wasm_i64x2_add(val, ones); // 2's complement + sign = wasm_v128_and(sign, m0); + val = wasm_i64x2_shl(val, shift); + tmax = wasm_v128_or(tmax, val); + val = wasm_v128_or(val, sign); + wasm_v128_store(dp, val); + } + wasm_v128_store(max_val, tmax); + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_tx_from_cb64(const ui64 *sp, void *dp, ui32 K_max, + float delta, ui32 count) + { + ojph_unused(delta); + ui32 shift = 63 - K_max; + v128_t m1 = wasm_i64x2_splat(LLONG_MAX); + v128_t zero = wasm_i64x2_splat(0); + v128_t one = wasm_i64x2_splat(1); + si64 *p = (si64*)dp; + for (ui32 i = 0; i < count; i += 2, sp 
+= 2, p += 2) + { + v128_t v = wasm_v128_load((v128_t*)sp); + v128_t val = wasm_v128_and(v, m1); + val = wasm_i64x2_shr(val, shift); + v128_t sign = wasm_i64x2_lt(v, zero); + val = wasm_v128_xor(val, sign); // negate 1's complement + v128_t ones = wasm_v128_and(sign, one); + val = wasm_i64x2_add(val, ones); // 2's complement + wasm_v128_store(p, val); + } + } } } \ No newline at end of file diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp index 2bd3987..8a234e5 100644 --- a/src/core/codestream/ojph_params.cpp +++ b/src/core/codestream/ojph_params.cpp @@ -776,6 +776,25 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// + ui32 + param_cod::propose_implementation_precision(const param_siz* siz) const + { + bool employing_color_transform = is_employing_color_transform() ? 1 : 0; + bool reversible = atk->is_reversible(); + + ui32 bit_depth = 32; + if (reversible) { + bit_depth = siz->get_bit_depth(comp_num); + bit_depth += comp_num < 3 ? employing_color_transform : 0; + // 3 or 4 is how many extra bits are needed for the HH band at the + // bottom most level of decomposition. + bit_depth += get_num_decompositions() > 5 ? 4 : 3; + } + + return bit_depth; + } + ////////////////////////////////////////////////////////////////////////// bool param_cod::write(outfile_base *file) { @@ -929,23 +948,46 @@ namespace ojph { void param_qcd::set_rev_quant(ui32 num_decomps, ui32 bit_depth, bool is_employing_color_transform) { - int guard_bits = 1; - Sqcd = (ui8)(guard_bits << 5); //one guard bit, and no quantization ui32 B = bit_depth; B += is_employing_color_transform ? 1 : 0; //1 bit for RCT int s = 0; double bibo_l = bibo_gains::get_bibo_gain_l(num_decomps, true); ui32 X = (ui32) ceil(log(bibo_l * bibo_l) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); + u8_SPqcd[s++] = (ui8)(B + X); + ui32 max_B_plus_X = (ui32)(B + X); for (ui32 d = num_decomps; d > 0; --d) { double bibo_l = bibo_gains::get_bibo_gain_l(d, true); double bibo_h = bibo_gains::get_bibo_gain_h(d - 1, true); X = (ui32) ceil(log(bibo_h * bibo_l) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); - u8_SPqcd[s++] = (ui8)((B + X) << 3); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); X = (ui32) ceil(log(bibo_h * bibo_h) / M_LN2); - u8_SPqcd[s++] = (ui8)((B + X) << 3); + u8_SPqcd[s++] = (ui8)(B + X); + max_B_plus_X = ojph_max(max_B_plus_X, B + X); + } + + if (max_B_plus_X > 38) + OJPH_ERROR(0x00050151, "The specified combination of bit_depth, " + "colour transform, and type of wavelet transform requires more than " + "38 bits; it requires %d bits. 
This is beyond what is allowed in " + "the JPEG2000 image coding format.", max_B_plus_X); + + int guard_bits = ojph_max(1, (si32)max_B_plus_X - 31); + Sqcd = (ui8)(guard_bits << 5); + s = 0; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + for (ui32 d = num_decomps; d > 0; --d) + { + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; + u8_SPqcd[s] = encode_SPqcd((ui8)(u8_SPqcd[s] - guard_bits)); + s++; } } @@ -1001,8 +1043,11 @@ namespace ojph { ui32 B = 0; int irrev = Sqcd & 0x1F; if (irrev == 0) //reversible - for (ui32 i = 0; i < num_subbands; ++i) - B = ojph_max(B, (u8_SPqcd[i] >> 3) + get_num_guard_bits() - 1u); + for (ui32 i = 0; i < num_subbands; ++i) { + ui32 t = decode_SPqcd(u8_SPqcd[i]); + t += get_num_guard_bits() - 1u; + B = ojph_max(B, t); + } else if (irrev == 2) //scalar expounded for (ui32 i = 0; i < num_subbands; ++i) { @@ -1072,9 +1117,9 @@ namespace ojph { } int irrev = Sqcd & 0x1F; - if (irrev == 0) //reversible; this is (10.22) from the J2K book + if (irrev == 0) // reversible; this is (10.22) from the J2K book { - num_bits += u8_SPqcd[idx] >> 3; + num_bits += decode_SPqcd(u8_SPqcd[idx]); num_bits = num_bits == 0 ? 0 : num_bits - 1; } else if (irrev == 1) diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h index ac8bb77..cce5cd8 100644 --- a/src/core/codestream/ojph_params_local.h +++ b/src/core/codestream/ojph_params_local.h @@ -176,10 +176,16 @@ namespace ojph { public: param_siz() { - memset(this, 0, sizeof(param_siz)); + Lsiz = Csiz = 0; + Xsiz = Ysiz = XOsiz = YOsiz = XTsiz = YTsiz = XTOsiz = YTOsiz = 0; + skipped_resolutions = 0; + memset(store, 0, sizeof(store)); + ws_kern_support_needed = dfs_support_needed = false; + cod = NULL; + dfs = NULL; + Rsiz = RSIZ_HT_FLAG; cptr = store; old_Csiz = 4; - Rsiz = RSIZ_HT_FLAG; } ~param_siz() @@ -263,6 +269,7 @@ namespace ojph { ui32 t = ojph_div_ceil(Xsiz, ds) - ojph_div_ceil(XOsiz, ds); return t; } + ui32 get_height(ui32 comp_num) const { assert(comp_num < get_num_components()); @@ -516,6 +523,9 @@ namespace ojph { return (Scod & 4) == 4; } + //////////////////////////////////////// + ui32 propose_implementation_precision(const param_siz* siz) const; + //////////////////////////////////////// bool write(outfile_base *file); @@ -639,7 +649,11 @@ namespace ojph { bool is_employing_color_transform); void set_irrev_quant(ui32 num_decomps); - protected: + ui8 decode_SPqcd(ui8 v) const + { return (ui8)(v >> 3); } + ui8 encode_SPqcd(ui8 v) const + { return (ui8)(v << 3); } + protected: ui16 Lqcd; ui8 Sqcd; union @@ -863,9 +877,10 @@ namespace ojph { }; public: // member functions - param_dfs() { memset(this, 0, sizeof(param_dfs)); } + param_dfs() { init(); } ~param_dfs() { if (next) delete next; } - void init() { memset(this, 0, sizeof(param_dfs)); } + void init() + { Ldfs = Sdfs = Ids = 0; memset(Ddfs, 0, sizeof(Ddfs)); next = NULL; } bool read(infile_base *file); bool exists() const { return Ldfs != 0; } @@ -940,8 +955,17 @@ namespace ojph { bool read_coefficient(infile_base *file, float &K); bool read_coefficient(infile_base *file, si16 &K); void init(bool clear_all = true) { - if (clear_all) - memset(this, 0, sizeof(param_atk)); + if (clear_all) + { + Latk = Satk = 0; + Katk = 0.0f; + Natk = 0; + d = NULL; + max_steps = 0; + memset(d_store, 0, sizeof(d_store)); + next = NULL; + alloced_next = false; + } d = d_store; max_steps = sizeof(d_store) / sizeof(lifting_step); } void 
init_irv97(); diff --git a/src/core/codestream/ojph_precinct.cpp b/src/core/codestream/ojph_precinct.cpp index 813e33b..803790d 100644 --- a/src/core/codestream/ojph_precinct.cpp +++ b/src/core/codestream/ojph_precinct.cpp @@ -221,7 +221,9 @@ namespace ojph { { int num_zeros = *mmsb_tag.get(x>>levm1, y>>levm1, levm1); num_zeros -= *mmsb_tag.get(x>>cur_lev, y>>cur_lev, cur_lev); - bb_put_bits(&bb, 1, num_zeros + 1, + bb_put_zeros(&bb, num_zeros, + elastic, cur_coded_list, ph_bytes); + bb_put_bits(&bb, 1, 1, elastic, cur_coded_list, ph_bytes); *mmsb_tag_flags.get(x>>levm1, y>>levm1, levm1) = 1; } diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp index 87466e0..0246400 100644 --- a/src/core/codestream/ojph_resolution.cpp +++ b/src/core/codestream/ojph_resolution.cpp @@ -199,6 +199,9 @@ namespace ojph { allocator->pre_alloc_obj((size_t)num_precincts.area()); } + const param_siz* szp = codestream->get_siz(); + ui32 precision = cdp->propose_implementation_precision(szp); + //allocate lines if (skipped_res_for_recon == false) { @@ -207,10 +210,19 @@ namespace ojph { allocator->pre_alloc_obj(num_steps + 2); ui32 width = res_rect.siz.w + 1; - for (ui32 i = 0; i < num_steps; ++i) + if (precision <= 32) { + for (ui32 i = 0; i < num_steps; ++i) + allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); allocator->pre_alloc_data(width, 1); - allocator->pre_alloc_data(width, 1); - allocator->pre_alloc_data(width, 1); + } + else + { + for (ui32 i = 0; i < num_steps; ++i) + allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); + } } } @@ -436,6 +448,9 @@ namespace ojph { level_index[i] = level_index[i - 1] + val; cur_precinct_loc = point(0, 0); + const param_siz* szp = codestream->get_siz(); + ui32 precision = cdp->propose_implementation_precision(szp); + //allocate lines if (skipped_res_for_recon == false) { @@ -460,11 +475,22 @@ namespace ojph { // initiate storage of line_buf ui32 width = res_rect.siz.w + 1; - for (ui32 i = 0; i < num_steps; ++i) - ssp[i].line->wrap( - allocator->post_alloc_data(width, 1), width, 1); - sig->line->wrap(allocator->post_alloc_data(width, 1), width, 1); - aug->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + if (precision <= 32) + { + for (ui32 i = 0; i < num_steps; ++i) + ssp[i].line->wrap( + allocator->post_alloc_data(width, 1), width, 1); + sig->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + aug->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + } + else + { + for (ui32 i = 0; i < num_steps; ++i) + ssp[i].line->wrap( + allocator->post_alloc_data(width, 1), width, 1); + sig->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + aug->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + } cur_line = 0; rows_to_produce = res_rect.siz.h; @@ -682,8 +708,9 @@ namespace ojph { rev_horz_syn(atk, aug->line, child_res->pull_line(), bands[1].pull_line(), width, horz_even); else - memcpy(aug->line->i32, child_res->pull_line()->i32, - width * sizeof(si32)); + memcpy(aug->line->p, child_res->pull_line()->p, + (size_t)width + * (aug->line->flags & line_buf::LFT_SIZE_MASK)); aug->active = true; vert_even = !vert_even; ++cur_line; @@ -694,8 +721,9 @@ namespace ojph { rev_horz_syn(atk, sig->line, bands[2].pull_line(), bands[3].pull_line(), width, horz_even); else - memcpy(sig->line->i32, bands[2].pull_line()->i32, - width * sizeof(si32)); + memcpy(sig->line->p, bands[2].pull_line()->p, + (size_t)width + * 
(sig->line->flags & line_buf::LFT_SIZE_MASK)); sig->active = true; vert_even = !vert_even; ++cur_line; @@ -733,8 +761,9 @@ namespace ojph { rev_horz_syn(atk, aug->line, child_res->pull_line(), bands[1].pull_line(), width, horz_even); else - memcpy(aug->line->i32, child_res->pull_line()->i32, - width * sizeof(si32)); + memcpy(aug->line->p, child_res->pull_line()->p, + (size_t)width + * (aug->line->flags & line_buf::LFT_SIZE_MASK)); } else { @@ -742,11 +771,22 @@ namespace ojph { rev_horz_syn(atk, aug->line, bands[2].pull_line(), bands[3].pull_line(), width, horz_even); else - memcpy(aug->line->i32, bands[2].pull_line()->i32, - width * sizeof(si32)); - si32* sp = aug->line->i32; - for (ui32 i = width; i > 0; --i) - *sp++ >>= 1; + memcpy(aug->line->p, bands[2].pull_line()->p, + (size_t)width + * (aug->line->flags & line_buf::LFT_SIZE_MASK)); + if (aug->line->flags & line_buf::LFT_32BIT) + { + si32* sp = aug->line->i32; + for (ui32 i = width; i > 0; --i) + *sp++ >>= 1; + } + else + { + assert(aug->line->flags & line_buf::LFT_64BIT); + si64* sp = aug->line->i64; + for (ui32 i = width; i > 0; --i) + *sp++ >>= 1; + } } return aug->line; } @@ -854,8 +894,8 @@ namespace ojph { rev_horz_syn(atk, aug->line, child_res->pull_line(), bands[1].pull_line(), width, horz_even); else - memcpy(aug->line->i32, child_res->pull_line()->i32, - width * sizeof(si32)); + memcpy(aug->line->p, child_res->pull_line()->p, + (size_t)width * (aug->line->flags & line_buf::LFT_SIZE_MASK)); return aug->line; } else diff --git a/src/core/codestream/ojph_resolution.h b/src/core/codestream/ojph_resolution.h index 635a4ce..6156455 100644 --- a/src/core/codestream/ojph_resolution.h +++ b/src/core/codestream/ojph_resolution.h @@ -45,7 +45,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class mem_elastic_allocator; class codestream; diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp index cf007fc..8efc8de 100644 --- a/src/core/codestream/ojph_subband.cpp +++ b/src/core/codestream/ojph_subband.cpp @@ -91,13 +91,18 @@ namespace ojph { allocator->pre_alloc_obj((size_t)num_blocks.area()); for (ui32 i = 0; i < num_blocks.w; ++i) - codeblock::pre_alloc(codestream, nominal); + codeblock::pre_alloc(codestream, comp_num, nominal); //allocate lines allocator->pre_alloc_obj(1); //allocate line_buf ui32 width = band_rect.siz.w + 1; - allocator->pre_alloc_data(width, 1); + const param_siz* szp = codestream->get_siz(); + ui32 precision = cdp->propose_implementation_precision(szp); + if (precision <= 32) + allocator->pre_alloc_data(width, 1); + else + allocator->pre_alloc_data(width, 1); } ////////////////////////////////////////////////////////////////////////// @@ -192,7 +197,12 @@ namespace ojph { lines = allocator->post_alloc_obj(1); //allocate line_buf ui32 width = band_rect.siz.w + 1; - lines->wrap(allocator->post_alloc_data(width,1),width,1); + const param_siz* szp = codestream->get_siz(); + ui32 precision = cdp->propose_implementation_precision(szp); + if (precision <= 32) + lines->wrap(allocator->post_alloc_data(width, 1), width, 1); + else + lines->wrap(allocator->post_alloc_data(width, 1), width, 1); } ////////////////////////////////////////////////////////////////////////// @@ -256,10 +266,11 @@ namespace ojph { if (empty) return; - assert(l->pre_size == lines[0].pre_size && l->size == lines[0].size); - si32* t = lines[0].i32; - lines[0].i32 = l->i32; - l->i32 = t; + assert(l->pre_size == 
lines[0].pre_size && l->size == lines[0].size && + l->flags == lines[0].flags); + void* p = lines[0].p; + lines[0].p = l->p; + l->p = p; } ////////////////////////////////////////////////////////////////////////// diff --git a/src/core/codestream/ojph_subband.h b/src/core/codestream/ojph_subband.h index 8cadae0..e1c291a 100644 --- a/src/core/codestream/ojph_subband.h +++ b/src/core/codestream/ojph_subband.h @@ -45,7 +45,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class mem_elastic_allocator; class codestream; @@ -94,6 +94,8 @@ namespace ojph { bool exists() { return !empty; } line_buf* pull_line(); + resolution* get_parent() { return parent; } + const resolution* get_parent() const { return parent; } private: bool empty; // true if the subband has no pixels or diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp index 281e156..4755bb4 100644 --- a/src/core/codestream/ojph_tile.cpp +++ b/src/core/codestream/ojph_tile.cpp @@ -231,8 +231,7 @@ namespace ojph { num_lines = 3; lines = allocator->post_alloc_obj(num_lines); for (int i = 0; i < 3; ++i) - lines[i].wrap( - allocator->post_alloc_data(width,0),width,0); + lines[i].wrap(allocator->post_alloc_data(width, 0), width, 0); } else { @@ -259,17 +258,15 @@ namespace ojph { line_buf *tc = comps[comp_num].get_line(); if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); - const si32 *sp = line->i32 + line_offsets[comp_num]; - si32* dp = tc->i32; - if (is_signed[comp_num]) { - if (nlt_type3[comp_num]) - cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); - else - memcpy(dp, sp, comp_width * sizeof(si32)); + si64 shift = (si64)1 << (num_bits[comp_num] - 1); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(line, line_offsets[comp_num], + tc, 0, shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 0 : -shift; + rev_convert(line, line_offsets[comp_num], tc, 0, + shift, comp_width); } - else - cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width); } else { @@ -285,26 +282,25 @@ namespace ojph { } else { - int shift = 1 << (num_bits[comp_num] - 1); + si64 shift = (si64)1 << (num_bits[comp_num] - 1); ui32 comp_width = comp_rects[comp_num].siz.w; if (reversible) { - const si32 *sp = line->i32 + line_offsets[comp_num]; - si32 *dp = lines[comp_num].i32; - if (is_signed[comp_num]) { - if (nlt_type3[comp_num]) - cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); - else - memcpy(dp, sp, comp_width * sizeof(si32)); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(line, line_offsets[comp_num], + lines + comp_num, 0, shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 
0 : -shift; + rev_convert(line, line_offsets[comp_num], lines + comp_num, 0, + shift, comp_width); } - else - cnvrt_si32_to_si32_shftd(sp, dp, -shift, comp_width); + if (comp_num == 2) { // reversible color transform - rct_forward(lines[0].i32, lines[1].i32, lines[2].i32, - comps[0].get_line()->i32, - comps[1].get_line()->i32, - comps[2].get_line()->i32, comp_width); + rct_forward(lines + 0, lines + 1, lines + 2, + comps[0].get_line(), + comps[1].get_line(), + comps[2].get_line(), comp_width); comps[0].push_line(); comps[1].push_line(); comps[2].push_line(); @@ -350,17 +346,15 @@ namespace ojph { ui32 comp_width = recon_comp_rects[comp_num].siz.w; if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); - const si32 *sp = src_line->i32; - si32* dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) { - if (nlt_type3[comp_num]) - cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); - else - memcpy(dp, sp, comp_width * sizeof(si32)); + si64 shift = (si64)1 << (num_bits[comp_num] - 1); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(src_line, 0, tgt_line, + line_offsets[comp_num], shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 0 : shift; + rev_convert(src_line, 0, tgt_line, + line_offsets[comp_num], shift, comp_width); } - else - cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width); } else { @@ -380,9 +374,9 @@ namespace ojph { if (comp_num == 0) { if (reversible) - rct_backward(comps[0].pull_line()->i32, comps[1].pull_line()->i32, - comps[2].pull_line()->i32, lines[0].i32, lines[1].i32, - lines[2].i32, comp_width); + rct_backward(comps[0].pull_line(), comps[1].pull_line(), + comps[2].pull_line(), lines + 0, lines + 1, + lines + 2, comp_width); else ict_backward(comps[0].pull_line()->f32, comps[1].pull_line()->f32, comps[2].pull_line()->f32, lines[0].f32, lines[1].f32, @@ -390,21 +384,20 @@ namespace ojph { } if (reversible) { - int shift = 1 << (num_bits[comp_num] - 1); - const si32 *sp; + si64 shift = (si64)1 << (num_bits[comp_num] - 1); + line_buf* src_line; if (comp_num < 3) - sp = lines[comp_num].i32; + src_line = lines + comp_num; else - sp = comps[comp_num].pull_line()->i32; - si32* dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) { - if (nlt_type3[comp_num]) - cnvrt_si32_to_si32_nlt_type3(sp, dp, shift + 1, comp_width); - else - memcpy(dp, sp, comp_width * sizeof(si32)); + src_line = comps[comp_num].pull_line(); + if (is_signed[comp_num] && nlt_type3[comp_num]) + rev_convert_nlt_type3(src_line, 0, tgt_line, + line_offsets[comp_num], shift + 1, comp_width); + else { + shift = is_signed[comp_num] ? 
0 : shift; + rev_convert(src_line, 0, tgt_line, + line_offsets[comp_num], shift, comp_width); } - else - cnvrt_si32_to_si32_shftd(sp, dp, +shift, comp_width); } else { diff --git a/src/core/codestream/ojph_tile.h b/src/core/codestream/ojph_tile.h index 4b54242..6b65a13 100644 --- a/src/core/codestream/ojph_tile.h +++ b/src/core/codestream/ojph_tile.h @@ -47,7 +47,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class codestream; namespace local { diff --git a/src/core/codestream/ojph_tile_comp.h b/src/core/codestream/ojph_tile_comp.h index def39e5..62b8fba 100644 --- a/src/core/codestream/ojph_tile_comp.h +++ b/src/core/codestream/ojph_tile_comp.h @@ -48,7 +48,7 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// //defined elsewhere - struct line_buf; + class line_buf; class codestream; namespace local { diff --git a/src/core/coding/ojph_block_common.cpp b/src/core/coding/ojph_block_common.cpp index e6b4de6..2ba138a 100644 --- a/src/core/coding/ojph_block_common.cpp +++ b/src/core/coding/ojph_block_common.cpp @@ -84,11 +84,20 @@ namespace ojph { * + 4 * mel event for initial row of quads when needed \n * \n * Each entry contains, starting from the LSB \n - * \li \c total prefix length for quads 0 and 1 (3 bits) \n - * \li \c total suffix length for quads 0 and 1 (4 bits) \n + * \li \c total total prefix length for quads 0 and 1 (3 bits) \n + * \li \c total total suffix length for quads 0 and 1 (4 bits) \n * \li \c suffix length for quad 0 (3 bits) \n * \li \c prefix for quad 0 (3 bits) \n * \li \c prefix for quad 1 (3 bits) \n + * \n + * Another table is uvlc_bias, which is needed to correctly decode the + * extension u_ext for initial row of quads. Under certain condition, + * we deduct 1 or 2 from u_q0 and u_q1 before encoding them; so for us + * to know that decoding u_ext is needed, we recreate the u_q0 and u_q1 + * that we actually encoded. \n + * For simplicity, we use the same index as before \n + * \li \c u_q0 bias is 2 bits \n + * \li \c u_q1 bias is 2 bits \n */ /// @brief uvlc_tbl0 contains decoding information for initial row of quads @@ -96,6 +105,8 @@ namespace ojph { /// @brief uvlc_tbl1 contains decoding information for non-initial row of /// quads ui16 uvlc_tbl1[256] = { 0 }; + /// @brief uvlc_bias contains decoding info. 
for initial row of quads + ui8 uvlc_bias[256+64] = { 0 }; /// @} //************************************************************************/ @@ -199,8 +210,10 @@ namespace ojph { ui32 mode = i >> 6; ui32 vlc = i & 0x3F; - if (mode == 0) // both u_off are 0 + if (mode == 0) { // both u_off are 0 uvlc_tbl0[i] = 0; + uvlc_bias[i] = 0; + } else if (mode <= 2) // u_off are either 01 or 10 { ui32 d = dec[vlc & 0x7]; //look at the least significant 3 bits @@ -232,6 +245,7 @@ namespace ojph { total_suffix = u0_suffix_len; u0 = d0 >> 5; u1 = (vlc & 1) + 1; + uvlc_bias[i] = 4; // 0b00 for u0 and 0b01 for u1 } else { @@ -240,6 +254,7 @@ namespace ojph { total_suffix = u0_suffix_len + ((d1 >> 2) & 0x7); u0 = d0 >> 5; u1 = d1 >> 5; + uvlc_bias[i] = 0; } uvlc_tbl0[i] = (ui16)(total_prefix | @@ -265,6 +280,7 @@ namespace ojph { (u0_suffix_len << 7) | (u0 << 10) | (u1 << 13)); + uvlc_bias[i] = 10; // 0b10 for u0 and 0b10 for u1 } } diff --git a/src/core/coding/ojph_block_common.h b/src/core/coding/ojph_block_common.h index 29a84ba..f8d6503 100644 --- a/src/core/coding/ojph_block_common.h +++ b/src/core/coding/ojph_block_common.h @@ -44,6 +44,6 @@ namespace ojph{ extern ui16 vlc_tbl1[1024]; extern ui16 uvlc_tbl0[256+64]; extern ui16 uvlc_tbl1[256]; - + extern ui8 uvlc_bias[256+64]; } // !namespace local } // !namespace ojph diff --git a/src/core/coding/ojph_block_decoder.h b/src/core/coding/ojph_block_decoder.h index ab01961..a197017 100644 --- a/src/core/coding/ojph_block_decoder.h +++ b/src/core/coding/ojph_block_decoder.h @@ -50,7 +50,12 @@ namespace ojph { // generic decoder bool - ojph_decode_codeblock(ui8* coded_data, ui32* decoded_data, + ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, bool stripe_causal); + + bool + ojph_decode_codeblock64(ui8* coded_data, ui64* decoded_data, ui32 missing_msbs, ui32 num_passes, ui32 lengths1, ui32 lengths2, ui32 width, ui32 height, ui32 stride, bool stripe_causal); diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder32.cpp similarity index 99% rename from src/core/coding/ojph_block_decoder.cpp rename to src/core/coding/ojph_block_decoder32.cpp index 259371b..f54c77e 100644 --- a/src/core/coding/ojph_block_decoder.cpp +++ b/src/core/coding/ojph_block_decoder32.cpp @@ -739,11 +739,11 @@ namespace ojph { * @param [in] stride is the decoded codeblock buffer stride * @param [in] stripe_causal is true for stripe causal mode */ - bool ojph_decode_codeblock(ui8* coded_data, ui32* decoded_data, - ui32 missing_msbs, ui32 num_passes, - ui32 lengths1, ui32 lengths2, - ui32 width, ui32 height, ui32 stride, - bool stripe_causal) + bool ojph_decode_codeblock32(ui8* coded_data, ui32* decoded_data, + ui32 missing_msbs, ui32 num_passes, + ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, + bool stripe_causal) { static bool insufficient_precision = false; static bool modify_code = false; @@ -1217,7 +1217,7 @@ namespace ojph { ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1? ui32 emax = vp[0] | vp[1]; - emax = 31 - count_leading_zeros(emax | 2); // emax - 1 + emax = 31 - count_leading_zeros(emax | 2); // emax - 1 ui32 kappa = gamma ? 
emax : 1; ui32 U_q = u_q + kappa; @@ -1613,4 +1613,4 @@ namespace ojph { return true; } } -} +} \ No newline at end of file diff --git a/src/core/coding/ojph_block_decoder64.cpp b/src/core/coding/ojph_block_decoder64.cpp new file mode 100644 index 0000000..8801735 --- /dev/null +++ b/src/core/coding/ojph_block_decoder64.cpp @@ -0,0 +1,1663 @@ +//***************************************************************************/ +// This software is released under the 2-Clause BSD license, included +// below. +// +// Copyright (c) 2019, Aous Naman +// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia +// Copyright (c) 2019, The University of New South Wales, Australia +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************/ +// This file is part of the OpenJPH software implementation. +// File: ojph_block_decoder.cpp +// Author: Aous Naman +// Date: 13 May 2022 +//***************************************************************************/ + +//***************************************************************************/ +/** @file ojph_block_decoder.cpp + * @brief implements a HTJ2K block decoder + */ + +#include +#include + +#include +#include +#include "ojph_block_common.h" +#include "ojph_block_decoder.h" +#include "ojph_arch.h" +#include "ojph_message.h" + +namespace ojph { + namespace local { + + //************************************************************************/ + /** @brief MEL state structure for reading and decoding the MEL bitstream + * + * A number of events is decoded from the MEL bitstream ahead of time + * and stored in run/num_runs. + * Each run represents the number of zero events before a one event. 
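+ * As an illustrative summary of the control flow below, runs are
+ * consumed one at a time by mel_get_run(), which refills run/num_runs
+ * by calling mel_decode() whenever num_runs reaches 0.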
+ */
+ struct dec_mel_st {
+ dec_mel_st() : data(NULL), tmp(0), bits(0), size(0), unstuff(false),
+ k(0), num_runs(0), runs(0)
+ {}
+ // data decoding machinery
+ ui8* data; //!< the address of data (or bitstream)
+ ui64 tmp; //!< temporary buffer for read data
+ int bits; //!< number of bits stored in tmp
+ int size; //!< number of bytes in MEL code
+ bool unstuff; //!< true if the next bit needs to be unstuffed
+ int k; //!< state of MEL decoder
+
+ // queue of decoded runs
+ int num_runs; //!< number of decoded runs left in runs (maximum 8)
+ ui64 runs; //!< stores the decoded runs, 7 bits per run
+ };
+
+ //************************************************************************/
+ /** @brief Reads and unstuffs the MEL bitstream
+ *
+ * @param [in] melp is a pointer to dec_mel_st structure
+ */
+ static inline
+ void mel_read(dec_mel_st *melp)
+ {
+ if (melp->bits > 32) //there are enough bits in the tmp variable
+ return; // return without reading new data
+
+ ui32 val = 0xFFFFFFFF; // feed in 0xFF if buffer is exhausted
+ if (melp->size > 4) { // if there is data in the MEL segment
+ val = *(ui32*)melp->data; // read 32 bits from MEL data
+ melp->data += 4; // advance pointer
+ melp->size -= 4; // reduce counter
+ }
+ else if (melp->size > 0)
+ { // 4 or less
+ int i = 0;
+ while (melp->size > 1) {
+ ui32 v = *melp->data++; // read one byte at a time
+ ui32 m = ~(0xFFu << i); // mask of location
+ val = (val & m) | (v << i);// put one byte in its correct location
+ --melp->size;
+ i += 8;
+ }
+ // size equal to 1
+ ui32 v = *melp->data++; // the one before the last is different
+ v |= 0xF; // MEL and VLC segments can overlap
+ ui32 m = ~(0xFFu << i);
+ val = (val & m) | (v << i);
+ --melp->size;
+ }
+
+ // next we unstuff them before adding them to the buffer
+ int bits = 32 - melp->unstuff; // number of bits in val, subtract 1 if
+ // the previously read byte requires
+ // unstuffing
+
+ // data is unstuffed and accumulated in t
+ // bits has the number of bits in t
+ ui32 t = val & 0xFF;
+ bool unstuff = ((val & 0xFF) == 0xFF); // true if we need unstuffing
+ bits -= unstuff; // there is one less bit in t if unstuffing is needed
+ t = t << (8 - unstuff); // move up to make room for the next byte
+
+ //this is a repeat of the above
+ t |= (val>>8) & 0xFF;
+ unstuff = (((val >> 8) & 0xFF) == 0xFF);
+ bits -= unstuff;
+ t = t << (8 - unstuff);
+
+ t |= (val>>16) & 0xFF;
+ unstuff = (((val >> 16) & 0xFF) == 0xFF);
+ bits -= unstuff;
+ t = t << (8 - unstuff);
+
+ t |= (val>>24) & 0xFF;
+ melp->unstuff = (((val >> 24) & 0xFF) == 0xFF);
+
+ // move t to tmp, and push the result all the way up, so we read from
+ // the MSB
+ melp->tmp |= ((ui64)t) << (64 - bits - melp->bits);
+ melp->bits += bits; //increment the number of bits in tmp
+ }
+
+ //************************************************************************/
+ /** @brief Decodes unstuffed MEL segment bits stored in tmp to runs
+ *
+ * Runs are stored in "runs" and the number of runs in "num_runs".
+ * Each run represents a number of zero events that may or may not
+ * terminate in a 1 event.
+ * Each run is stored in 7 bits. The LSB is 1 if the run terminates in
+ * a 1 event, 0 otherwise. The next 6 bits, for the case terminating
+ * with 1, contain the number of consecutive zero events * 2; for the
+ * case terminating with 0, they store (number of consecutive zero
+ * events - 1) * 2.
+ * A total of 6 bits (made up of 1 + 5) should have been enough.
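+ * For example, under the rule stated above, the stored value 5
+ * (0b0000101) represents two 0 events followed by a 1 event, while
+ * the stored value 4 (0b0000100) represents a stretch of three 0
+ * events with no terminating 1 event.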
+ * + * @param [in] melp is a pointer to dec_mel_st structure + */ + static inline + void mel_decode(dec_mel_st *melp) + { + static const int mel_exp[13] = { //MEL exponents + 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5 + }; + + if (melp->bits < 6) // if there are less than 6 bits in tmp + mel_read(melp); // then read from the MEL bitstream + // 6 bits is the largest decodable MEL cwd + + //repeat so long that there is enough decodable bits in tmp, + // and the runs store is not full (num_runs < 8) + while (melp->bits >= 6 && melp->num_runs < 8) + { + int eval = mel_exp[melp->k]; // number of bits associated with state + int run = 0; + if (melp->tmp & (1ull<<63)) //The next bit to decode (stored in MSB) + { //one is found + run = 1 << eval; + run--; // consecutive runs of 0 events - 1 + melp->k = melp->k + 1 < 12 ? melp->k + 1 : 12;//increment, max is 12 + melp->tmp <<= 1; // consume one bit from tmp + melp->bits -= 1; + run = run << 1; // a stretch of zeros not terminating in one + } + else + { //0 is found + run = (int)(melp->tmp >> (63 - eval)) & ((1 << eval) - 1); + melp->k = melp->k - 1 > 0 ? melp->k - 1 : 0; //decrement, min is 0 + melp->tmp <<= eval + 1; //consume eval + 1 bits (max is 6) + melp->bits -= eval + 1; + run = (run << 1) + 1; // a stretch of zeros terminating with one + } + eval = melp->num_runs * 7; // 7 bits per run + melp->runs &= ~((ui64)0x3F << eval); // 6 bits are sufficient + melp->runs |= ((ui64)run) << eval; // store the value in runs + melp->num_runs++; // increment count + } + } + + //************************************************************************/ + /** @brief Initiates a dec_mel_st structure for MEL decoding and reads + * some bytes in order to get the read address to a multiple + * of 4 + * + * @param [in] melp is a pointer to dec_mel_st structure + * @param [in] bbuf is a pointer to byte buffer + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] scup is the length of MEL+VLC segments + */ + static inline + void mel_init(dec_mel_st *melp, ui8* bbuf, int lcup, int scup) + { + melp->data = bbuf + lcup - scup; // move the pointer to the start of MEL + melp->bits = 0; // 0 bits in tmp + melp->tmp = 0; // + melp->unstuff = false; // no unstuffing + melp->size = scup - 1; // size is the length of MEL+VLC-1 + melp->k = 0; // 0 for state + melp->num_runs = 0; // num_runs is 0 + melp->runs = 0; // + + //This code is borrowed; original is for a different architecture + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MEL segment + int num = 4 - (int)(intptr_t(melp->data) & 0x3); + for (int i = 0; i < num; ++i) { // this code is similar to mel_read + assert(melp->unstuff == false || melp->data[0] <= 0x8F); + ui64 d = (melp->size > 0) ? 
*melp->data : 0xFF;//if buffer is consumed
+ //set data to 0xFF
+ if (melp->size == 1) d |= 0xF; //if this is MEL+VLC-1, set LSBs to 0xF
+ // see the standard
+ melp->data += melp->size-- > 0; //increment if the end is not reached
+ int d_bits = 8 - melp->unstuff; //if unstuffing is needed, reduce by 1
+ melp->tmp = (melp->tmp << d_bits) | d; //store bits in tmp
+ melp->bits += d_bits; //increment the number of bits in tmp
+ melp->unstuff = ((d & 0xFF) == 0xFF); //true if the next byte needs
+ //unstuffing
+ }
+ melp->tmp <<= (64 - melp->bits); //push all the way up so the first bit
+ // is the MSB
+ }
+
+ //************************************************************************/
+ /** @brief Retrieves one run from dec_mel_st; if there are no runs stored
+ * MEL segment is decoded
+ *
+ * @param [in] melp is a pointer to dec_mel_st structure
+ */
+ static inline
+ int mel_get_run(dec_mel_st *melp)
+ {
+ if (melp->num_runs == 0) //if no runs, decode more runs from MEL segment
+ mel_decode(melp);
+
+ int t = melp->runs & 0x7F; //retrieve one run
+ melp->runs >>= 7; // remove the retrieved run
+ melp->num_runs--;
+ return t; // return run
+ }
+
+ //************************************************************************/
+ /** @brief A structure for reading and unstuffing a segment that grows
+ * backward, such as VLC and MRP
+ */
+ struct rev_struct {
+ rev_struct() : data(NULL), tmp(0), bits(0), size(0), unstuff(false)
+ {}
+ //storage
+ ui8* data; //!< pointer to where to read data
+ ui64 tmp; //!< temporary buffer of read data
+ int bits; //!< number of bits stored in tmp
+ int size; //!< number of bytes left
+ bool unstuff; //!< true if the last byte needs unstuffing
+ };
+
+ //************************************************************************/
+ /** @brief Reads and unstuffs 8 bits from a backward-growing (VLC)
+ * segment
+ *
+ * Unstuffing is needed when the previously read byte is larger than
+ * 0x8F and the 7 LSBs of the current byte are all ones.
+ *
+ * @param [in] vlcp is a pointer to rev_struct structure
+ */
+ static inline
+ void rev_read8(rev_struct *vlcp)
+ {
+ ui8 val = 0;
+ if (vlcp->size > 0) // if there are bytes left in the VLC segment
+ {
+ val = *vlcp->data; // then read 8 bits
+ --vlcp->data; // decrement data pointer (the segment grows backward)
+ --vlcp->size; // decrement number of bytes in the buffer
+ }
+
+ // accumulate in tmp, and increment bits, check if unstuffing is needed
+ ui8 t = (vlcp->unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0;
+ val = (ui8)(val & (0xFFU >> t)); // protect against erroneous 1 in MSB
+ vlcp->tmp |= (ui64)val << vlcp->bits;
+ vlcp->bits += 8 - t;
+ vlcp->unstuff = val > 0x8F;
+ }
+
+ //************************************************************************/
+ /** @brief Initiates the rev_struct structure and reads the first byte
+ *
+ * This subroutine initializes the VLC decoder. It discards the first
+ * 12 bits (they have the sum of the lengths of VLC and MEL segments),
+ * and depending on unstuffing, stores 3 or 4 bits in the unstuffed
+ * decoded buffer.
+ *
+ * @param [in] vlcp is a pointer to rev_struct structure
+ * @param [in] data is a pointer to byte at the start of the cleanup pass
+ * @param [in] lcup is the length of MagSgn+MEL+VLC segments
+ * @param [in] scup is the length of MEL+VLC segments
+ */
+ static inline
+ void rev_init8(rev_struct *vlcp, ui8* data, int lcup, int scup)
+ {
+ //first byte has only the upper 4 bits
+ vlcp->data = data + lcup - 2;
+
+ //size can not be larger than this, in fact it should be smaller
+ vlcp->size = scup - 2;
+
+ ui8 val = *vlcp->data--; // read one byte (this is a half byte)
+
+ // the first byte is treated differently from other bytes, because
+ // only the MSB nibble is part of the VLC code.
+ val = (ui8)(val >> 4);
+ ui8 t = ((val & 0x7) == 0x7) ? 
1 : 0; // unstuffing is needed + val = (ui8)(val & (0xFU >> t)); // protect against erroneous 1 in MSB + vlcp->tmp = val; + vlcp->bits = 4 - t; + vlcp->unstuff = val > 0x8; //this is useful for the next byte + } + + //************************************************************************/ + /** @brief Fills the temporary variable (vlcp->tmp) by up to 64 bits + * + * By the end of this call, vlcp->tmp must have no less than 56 bits + * + * @param [in] vlcp is a pointer to rev_struct structure + */ + static inline + ui64 rev_fetch64(rev_struct *vlcp) + { + while (vlcp->bits <= 56) + rev_read8(vlcp); // read 8 bits, but unstuffing might reduce this + return vlcp->tmp; // return unstuff decoded bits + } + + //************************************************************************/ + /** @brief Consumes num_bits from a rev_struct structure + * + * @param [in] vlcp is a pointer to rev_struct structure + * @param [in] num_bits is the number of bits to be removed + */ + static inline + ui64 rev_advance64(rev_struct *vlcp, ui32 num_bits) + { + assert(num_bits <= vlcp->bits); // vlcp->tmp must have more than num_bits + vlcp->tmp >>= num_bits; // remove bits + vlcp->bits -= num_bits; // decrement the number of bits + return vlcp->tmp; + } + + //************************************************************************/ + /** @brief Reads and unstuffs from rev_struct + * + * This is different than rev_read in that this fills in zeros when the + * the available data is consumed. The other does not care about the + * values when all data is consumed. + * + * See rev_read for more information about unstuffing + * + * @param [in] mrp is a pointer to rev_struct structure + */ + static inline + void rev_read_mrp(rev_struct *mrp) + { + //process 4 bytes at a time + if (mrp->bits > 32) + return; + ui32 val = 0; + if (mrp->size > 3) // If there are 3 byte or more + { // (mrp->data - 3) move pointer back to read 32 bits at once + val = *(ui32*)(mrp->data - 3); // read 32 bits + mrp->data -= 4; // move back pointer + mrp->size -= 4; // reduce count + } + else if (mrp->size > 0) + { + int i = 24; + while (mrp->size > 0) { + ui32 v = *mrp->data--; // read one byte at a time + val |= (v << i); // put byte in its correct location + --mrp->size; + i -= 8; + } + } + + //accumulate in tmp, and keep count in bits + ui32 bits, tmp = val >> 24; + + //test if the last byte > 0x8F (unstuff must be true) and this is 0x7F + bits = 8 - ((mrp->unstuff && (((val >> 24) & 0x7F) == 0x7F)) ? 1 : 0); + bool unstuff = (val >> 24) > 0x8F; + + //process the next byte + tmp |= ((val >> 16) & 0xFF) << bits; + bits += 8 - ((unstuff && (((val >> 16) & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = ((val >> 16) & 0xFF) > 0x8F; + + tmp |= ((val >> 8) & 0xFF) << bits; + bits += 8 - ((unstuff && (((val >> 8) & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = ((val >> 8) & 0xFF) > 0x8F; + + tmp |= (val & 0xFF) << bits; + bits += 8 - ((unstuff && ((val & 0x7F) == 0x7F)) ? 1 : 0); + unstuff = (val & 0xFF) > 0x8F; + + mrp->tmp |= (ui64)tmp << mrp->bits; // move data to mrp pointer + mrp->bits += bits; + mrp->unstuff = unstuff; // next byte + } + + //************************************************************************/ + /** @brief Initialized rev_struct structure for MRP segment, and reads + * a number of bytes such that the next 32 bits read are from + * an address that is a multiple of 4. 
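+ * (The MRP segment, like the VLC segment, grows backward, which is
+ * why initialization starts at the end of the buffer and the read
+ * pointer is decremented.)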
Note this is designed for + * an architecture that read size must be compatible with the + * alignment of the read address + * + * There is another similar subroutine rev_init. This subroutine does + * NOT skip the first 12 bits, and starts with unstuff set to true. + * + * @param [in] mrp is a pointer to rev_struct structure + * @param [in] data is a pointer to byte at the start of the cleanup pass + * @param [in] lcup is the length of MagSgn+MEL+VLC segments + * @param [in] len2 is the length of SPP+MRP segments + */ + static inline + void rev_init_mrp(rev_struct *mrp, ui8* data, int lcup, int len2) + { + mrp->data = data + lcup + len2 - 1; + mrp->size = len2; + mrp->unstuff = true; + mrp->bits = 0; + mrp->tmp = 0; + + //This code is designed for an architecture that read address should + // align to the read size (address multiple of 4 if read size is 4) + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the MRP stream + int num = 1 + (int)(intptr_t(mrp->data) & 0x3); + for (int i = 0; i < num; ++i) { + ui64 d; + //read a byte, 0 if no more data + d = (mrp->size-- > 0) ? *mrp->data-- : 0; + //check if unstuffing is needed + ui32 d_bits = 8 - ((mrp->unstuff && ((d & 0x7F) == 0x7F)) ? 1 : 0); + mrp->tmp |= d << mrp->bits; // move data to vlcp->tmp + mrp->bits += d_bits; + mrp->unstuff = d > 0x8F; // for next byte + } + rev_read_mrp(mrp); + } + + //************************************************************************/ + /** @brief Retrieves 32 bits from the head of a rev_struct structure + * + * By the end of this call, mrp->tmp must have no less than 33 bits + * + * @param [in] mrp is a pointer to rev_struct structure + */ + static inline + ui32 rev_fetch_mrp(rev_struct *mrp) + { + if (mrp->bits < 32) // if there are less than 32 bits in mrp->tmp + { + rev_read_mrp(mrp); // read 30-32 bits from mrp + if (mrp->bits < 32) // if there is a space of 32 bits + rev_read_mrp(mrp); // read more + } + return (ui32)mrp->tmp; // return the head of mrp->tmp + } + + //************************************************************************/ + /** @brief Consumes num_bits from a rev_struct structure + * + * @param [in] mrp is a pointer to rev_struct structure + * @param [in] num_bits is the number of bits to be removed + */ + static inline + ui32 rev_advance_mrp(rev_struct *mrp, ui32 num_bits) + { + assert(num_bits <= mrp->bits); // we must not consume more than mrp->bits + mrp->tmp >>= num_bits; // discard the lowest num_bits bits + mrp->bits -= num_bits; + return (ui32)mrp->tmp; // return data after consumption + } + + //************************************************************************/ + /** @brief State structure for reading and unstuffing of forward-growing + * bitstreams; these are: MagSgn and SPP bitstreams + */ + struct frwd_struct { + const ui8* data; //! + static inline + void frwd_read(frwd_struct *msp) + { + assert(msp->bits <= 32); // assert that there is a space for 32 bits + + ui32 val = 0; + if (msp->size > 3) { + val = *(ui32*)msp->data; // read 32 bits + msp->data += 4; // increment pointer + msp->size -= 4; // reduce size + } + else if (msp->size > 0) + { + int i = 0; + val = X != 0 ? 0xFFFFFFFFu : 0; + while (msp->size > 0) { + ui32 v = *msp->data++; // read one byte at a time + ui32 m = ~(0xFFu << i); // mask of location + val = (val & m) | (v << i);// put one byte in its correct location + --msp->size; + i += 8; + } + } + else + val = X != 0 ? 
0xFFFFFFFFu : 0; + + // we accumulate in t and keep a count of the number of bits in bits + ui32 bits = 8 - msp->unstuff; + ui32 t = val & 0xFF; + bool unstuff = ((val & 0xFF) == 0xFF); // Do we need unstuffing next? + + t |= ((val >> 8) & 0xFF) << bits; + bits += 8 - unstuff; + unstuff = (((val >> 8) & 0xFF) == 0xFF); + + t |= ((val >> 16) & 0xFF) << bits; + bits += 8 - unstuff; + unstuff = (((val >> 16) & 0xFF) == 0xFF); + + t |= ((val >> 24) & 0xFF) << bits; + bits += 8 - unstuff; + msp->unstuff = (((val >> 24) & 0xFF) == 0xFF); // for next byte + + msp->tmp |= ((ui64)t) << msp->bits; // move data to msp->tmp + msp->bits += bits; + } + + //************************************************************************/ + /** @brief Read and unstuffs 8 bits from forward-growing bitstream + * + * A template is used to accommodate a different requirement for + * MagSgn and SPP bitstreams; in particular, when MagSgn bitstream is + * consumed, 0xFF's are fed, while when SPP is exhausted 0's are fed in. + * X controls this value. + * + * Unstuffing prevent sequences that are more than 0xFF7F from appearing + * in the conpressed sequence. So whenever a value of 0xFF is coded, the + * MSB of the next byte is set 0 and must be ignored during decoding. + * + * @tparam X is the value fed in when the bitstream is exhausted + * @param [in] msp is a pointer to frwd_struct structure + * + */ + template + static inline + void frwd_read8(frwd_struct *msp) + { + ui8 val = X; + if (msp->size > 0) { + val = *msp->data; // read 8 bits + ++msp->data; // increment pointer + --msp->size; // reduce size + } + + // unstuff and accumulate + ui8 t = msp->unstuff ? 1 : 0; + val = (ui8)(val & (0xFFU >> t)); + msp->unstuff = (val == 0xFF); + msp->tmp |= ((ui64)val) << msp->bits; // move data to msp->tmp + msp->bits += 8 - t; + } + + //************************************************************************/ + /** @brief Initialize frwd_struct struct and reads some bytes + * + * @tparam X is the value fed in when the bitstream is exhausted. + * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + * @param [in] data is a pointer to the start of data + * @param [in] size is the number of byte in the bitstream + */ + template + static inline + void frwd_init(frwd_struct *msp, const ui8* data, int size) + { + msp->data = data; + msp->tmp = 0; + msp->bits = 0; + msp->unstuff = 0; + msp->size = size; + + //This code is designed for an architecture that read address should + // align to the read size (address multiple of 4 if read size is 4) + //These few lines take care of the case where data is not at a multiple + // of 4 boundary. It reads 1,2,3 up to 4 bytes from the bitstream + int num = 4 - (int)(intptr_t(msp->data) & 0x3); + for (int i = 0; i < num; ++i) + { + ui64 d; + //read a byte if the buffer is not exhausted, otherwise set it to X + d = msp->size-- > 0 ? *msp->data++ : X; + msp->tmp |= (d << msp->bits); // store data in msp->tmp + msp->bits += 8 - msp->unstuff; // number of bits added to msp->tmp + msp->unstuff = ((d & 0xFF) == 0xFF); // unstuffing for next byte + } + frwd_read(msp); // read 32 bits more + } + + //************************************************************************/ + /** @brief Initialize frwd_struct struct and reads some bytes + * + * @tparam X is the value fed in when the bitstream is exhausted. 
+ * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + * @param [in] data is a pointer to the start of data + * @param [in] size is the number of byte in the bitstream + */ + template + static inline + void frwd_init8(frwd_struct *msp, const ui8* data, int size) + { + msp->data = data; + msp->tmp = 0; + msp->bits = 0; + msp->unstuff = 0; + msp->size = size; + frwd_read8(msp); // read 8 bits + } + + //************************************************************************/ + /** @brief Consume num_bits bits from the bitstream of frwd_struct + * + * @param [in] msp is a pointer to frwd_struct + * @param [in] num_bits is the number of bit to consume + */ + static inline + void frwd_advance(frwd_struct *msp, ui32 num_bits) + { + assert(num_bits <= msp->bits); + msp->tmp >>= num_bits; // consume num_bits + msp->bits -= num_bits; + } + + //************************************************************************/ + /** @brief Fetches 32 bits from the frwd_struct bitstream + * + * @tparam X is the value fed in when the bitstream is exhausted. + * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + */ + template + static inline + ui32 frwd_fetch(frwd_struct *msp) + { + if (msp->bits < 32) + { + frwd_read(msp); + if (msp->bits < 32) //need to test + frwd_read(msp); + } + return (ui32)msp->tmp; + } + + //************************************************************************/ + /** @brief Fetches up to 64 bits from the frwd_struct bitstream + * + * @tparam X is the value fed in when the bitstream is exhausted. + * See frwd_read regarding the template + * @param [in] msp is a pointer to frwd_struct + */ + template + static inline + ui64 frwd_fetch64(frwd_struct *msp) + { + while (msp->bits <= 56) + frwd_read8(msp); + return msp->tmp; + } + + //************************************************************************/ + /** @brief Decodes one codeblock, processing the cleanup, siginificance + * propagation, and magnitude refinement pass + * + * @param [in] coded_data is a pointer to bitstream + * @param [in] decoded_data is a pointer to decoded codeblock data buf. 
+ * @param [in] missing_msbs is the number of missing MSBs + * @param [in] num_passes is the number of passes: 1 if CUP only, + * 2 for CUP+SPP, and 3 for CUP+SPP+MRP + * @param [in] lengths1 is the length of cleanup pass + * @param [in] lengths2 is the length of refinement passes (either SPP + * only or SPP+MRP) + * @param [in] width is the decoded codeblock width + * @param [in] height is the decoded codeblock height + * @param [in] stride is the decoded codeblock buffer stride + * @param [in] stripe_causal is true for stripe causal mode + */ + bool ojph_decode_codeblock64(ui8* coded_data, ui64* decoded_data, + ui32 missing_msbs, ui32 num_passes, + ui32 lengths1, ui32 lengths2, + ui32 width, ui32 height, ui32 stride, + bool stripe_causal) + { + // static bool insufficient_precision = false; + // static bool modify_code = false; + // static bool truncate_spp_mrp = false; + + if (num_passes > 1 && lengths2 == 0) + { + OJPH_WARN(0x00010001, "A malformed codeblock that has more than " + "one coding pass, but zero length for " + "2nd and potential 3rd pass."); + num_passes = 1; + } + + if (num_passes > 3) + { + OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; " + "This codeblocks has %d passes.", + num_passes); + return false; + } + + // if (missing_msbs > 30) // p < 0 + // { + // if (insufficient_precision == false) + // { + // insufficient_precision = true; + // OJPH_WARN(0x00010003, "32 bits are not enough to decode this " + // "codeblock. This message will not be " + // "displayed again."); + // } + // return false; + // } + // else if (missing_msbs == 30) // p == 0 + // { // not enough precision to decode and set the bin center to 1 + // if (modify_code == false) { + // modify_code = true; + // OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup " + // "pass. The code can be modified to support " + // "this case. This message will not be " + // "displayed again."); + // } + // return false; // 32 bits are not enough to decode this + // } + // else if (missing_msbs == 29) // if p is 1, then num_passes must be 1 + // { + // if (num_passes > 1) { + // num_passes = 1; + // if (truncate_spp_mrp == false) { + // truncate_spp_mrp = true; + // OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp " + // "nor MagRef passes; both will be skipped. " + // "This message will not be displayed " + // "again."); + // } + // } + // } + ui32 p = 62 - missing_msbs; // The least significant bitplane for CUP + // There is a way to handle the case of p == 0, but a different path + // is required + + if (lengths1 < 2) + { + OJPH_WARN(0x00010006, "Wrong codeblock length."); + return false; + } + + // read scup and fix the bytes there + int lcup, scup; + lcup = (int)lengths1; // length of CUP + //scup is the length of MEL + VLC + scup = (((int)coded_data[lcup-1]) << 4) + (coded_data[lcup-2] & 0xF); + if (scup < 2 || scup > lcup || scup > 4079) //something is wrong + return false; + + // The temporary storage scratch holds two types of data in an + // interleaved fashion. The interleaving allows us to use one + // memory pointer. + // We have one entry for a decoded VLC code, and one entry for UVLC. 
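+ // As an illustrative sketch (indices inferred from the code below),
+ // quad q of quad-row r is addressed as
+ // ui16 vlc_entry = scratch[r * sstr + 2 * q]; // e_k, e_1, rho
+ // ui16 uvlc_entry = scratch[r * sstr + 2 * q + 1]; // u_q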
+ // Entries are 16 bits each, corresponding to one quad, + // but since we want to use XMM registers of the SSE family + // of SIMD; we allocated 16 bytes or more per quad row; that is, + // the width is no smaller than 16 bytes (or 8 entries), and the + // height is 512 quads + // Each VLC entry contains, in the following order, starting + // from MSB + // e_k (4bits), e_1 (4bits), rho (4bits), useless for step 2 (4bits) + // Each entry in UVLC contains u_q + // One extra row to handle the case of SPP propagating downwards + // when codeblock width is 4 + ui16 scratch[8 * 513] = {0}; // 8 kB + + // We need an extra two entries (one inf and one u_q) beyond + // the last column. + // If the block width is 4 (2 quads), then we use sstr of 8 + // (enough for 4 quads). If width is 8 (4 quads) we use + // sstr is 16 (enough for 8 quads). For a width of 16 (8 + // quads), we use 24 (enough for 12 quads). + ui32 sstr = ((width + 2u) + 7u) & ~7u; // multiples of 8 + + ui32 mmsbp2 = missing_msbs + 2; + + // The cleanup pass is decoded in two steps; in step one, + // the VLC and MEL segments are decoded, generating a record that + // has 2 bytes per quad. The 2 bytes contain, u, rho, e^1 & e^k. + // This information should be sufficient for the next step. + // In step 2, we decode the MagSgn segment. + + // step 1 decoding VLC and MEL segments + { + // init structures + dec_mel_st mel; + mel_init(&mel, coded_data, lcup, scup); + rev_struct vlc; + rev_init8(&vlc, coded_data, lcup, scup); + + int run = mel_get_run(&mel); // decode runs of events from MEL bitstrm + // data represented as runs of 0 events + // See mel_decode description + + ui64 vlc_val; + ui32 c_q = 0; + ui16 *sp = scratch; + //initial quad row + for (ui32 x = 0; x < width; sp += 4) + { + // decode VLC + ///////////// + + // first quad + vlc_val = rev_fetch64(&vlc); + + //decode VLC using the context c_q and the head of VLC bitstream + ui16 t0 = vlc_tbl0[ c_q + (vlc_val & 0x7F) ]; + + // if context is zero, use one MEL event + if (c_q == 0) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // Is the run terminated in 1? if so, use decoded VLC code, + // otherwise, discard decoded data, since we will decoded again + // using a different context + t0 = (run == -1) ? t0 : 0; + + // is run -1 or -2? this means a run has been consumed + if (run < 0) + run = mel_get_run(&mel); // get another run + } + //run -= (c_q == 0) ? 2 : 0; + //t0 = (c_q != 0 || run == -1) ? t0 : 0; + //if (run < 0) + // run = mel_get_run(&mel); // get another run + sp[0] = t0; + x += 2; + + // prepare context for the next quad; eqn. 1 in ITU T.814 + c_q = ((t0 & 0x10U) << 3) | ((t0 & 0xE0U) << 2); + + //remove data from vlc stream (0 bits are removed if vlc is not used) + vlc_val = rev_advance64(&vlc, t0 & 0x7); + + //second quad + ui16 t1 = 0; + + //decode VLC using the context c_q and the head of VLC bitstream + t1 = vlc_tbl0[c_q + (vlc_val & 0x7F)]; + + // if context is zero, use one MEL event + if (c_q == 0 && x < width) //zero context + { + run -= 2; //subtract 2, since events number if multiplied by 2 + + // if event is 0, discard decoded t1 + t1 = (run == -1) ? t1 : 0; + + if (run < 0) // have we consumed all events in a run + run = mel_get_run(&mel); // if yes, then get another run + } + t1 = x < width ? t1 : 0; + //run -= (c_q == 0 && x < width) ? 2 : 0; + //t1 = (c_q != 0 || run == -1) ? 
+          // prepare context for the next quad; eqn. 1 in ITU T.814
+          c_q = ((t1 & 0x10U) << 3) | ((t1 & 0xE0U) << 2);
+
+          // remove data from the vlc stream; if qinf is not used, cwdlen
+          // is 0
+          vlc_val = rev_advance64(&vlc, t1 & 0x7);
+
+          // decode u
+          /////////////
+          // uvlc_mode is made up of u_offset bits from the quad pair
+          ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+          if (uvlc_mode == 0xc0) // if both u_offsets are set, get an event
+          {                      // from the MEL run of events
+            run -= 2; // subtract 2, since the number of events is
+                      // multiplied by 2
+
+            uvlc_mode += (run == -1) ? 0x40 : 0; // increment uvlc_mode
+                                                 // by 0x40
+
+            if (run < 0) // if the run is consumed (run is -1 or -2),
+              run = mel_get_run(&mel); // get another run
+          }
+          //run -= (uvlc_mode == 0xc0) ? 2 : 0;
+          //uvlc_mode += (uvlc_mode == 0xc0 && run == -1) ? 0x40 : 0;
+          //if (run < 0)
+          //  run = mel_get_run(&mel); // get another run
+
+          // decode uvlc_mode to get u for both quads
+          ui32 idx = uvlc_mode + (ui32)(vlc_val & 0x3F);
+          ui32 uvlc_entry = uvlc_tbl0[idx];
+          ui16 u_bias = uvlc_bias[idx];
+          // remove total prefix length
+          vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7);
+          uvlc_entry >>= 3;
+          // extract suffixes for quads 0 and 1
+          ui32 len = uvlc_entry & 0xF;  // suffix length for 2 quads
+          ui32 tmp = (ui32)(vlc_val & ((1U << len) - 1)); // suffix value
+                                                          // for 2 quads
+          vlc_val = rev_advance64(&vlc, len);
+          uvlc_entry >>= 4;
+          // quad 0 length
+          len = uvlc_entry & 0x7; // quad 0 suffix length
+          uvlc_entry >>= 3;
+          ui16 u_q0 = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
+          ui16 u_q1 = (ui16)((uvlc_entry >> 3) + (tmp >> len));
+
+          // decode u_q extensions, which are needed only when u_q > 32
+          ui16 u_ext; bool cond0, cond1;
+          cond0 = u_q0 - (u_bias & 0x3) > 32;
+          u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0);
+          vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0);
+          u_q0 = (ui16)(u_q0 + (u_ext << 2));
+          sp[1] = (ui16)(u_q0 + 1); // kappa = 1
+          cond1 = u_q1 - (u_bias >> 2) > 32;
+          u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0);
+          vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0);
+          u_q1 = (ui16)(u_q1 + (u_ext << 2));
+          sp[3] = (ui16)(u_q1 + 1); // kappa = 1
+        }
+        sp[0] = sp[1] = 0;
+
+        // non-initial quad rows
+        for (ui32 y = 2; y < height; y += 2)
+        {
+          c_q = 0; // context
+          ui16 *sp = scratch + (y >> 1) * sstr; // this row of quads
+
+          for (ui32 x = 0; x < width; sp += 4)
+          {
+            // decode VLC
+            /////////////
+
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[0 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[2 - (si32)sstr] & 0x20U) << 4);
+
+            // first quad
+            vlc_val = rev_fetch64(&vlc);
+
+            // decode VLC using the context c_q and the head of the VLC
+            // bitstream
+            ui16 t0 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0) // zero context
+            {
+              run -= 2; // subtract 2, since the number of events is
+                        // multiplied by 2
+
+              // Is the run terminated by a 1? If so, use the decoded VLC
+              // code; otherwise, discard the decoded data, since we will
+              // decode again using a different context
+              t0 = (run == -1) ? t0 : 0;
+
+              // is run -1 or -2? this means a run has been consumed
+              if (run < 0)
+                run = mel_get_run(&mel); // get another run
+            }
+            //run -= (c_q == 0) ? 2 : 0;
+            //t0 = (c_q != 0 || run == -1) ? t0 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel); // get another run
+            sp[0] = t0;
+            x += 2;
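            // An annotation, not part of the patch: for non-initial rows,
            // the VLC context combines significance from this row's
            // previous quad (w, sw, nw) with the row above (n, ne, nf),
            // read from the scratch row at a -sstr offset -- exactly the
            // computation performed by the lines that follow:
            //
            //   static inline ui32 row_context(ui16 t, const ui16* sp,
            //                                  ui32 sstr)
            //   {
            //     ui32 c_q;
            //     c_q  = ((t & 0x40U) << 2) | ((t & 0x80U) << 1); // w, sw
            //     c_q |= sp[0 - (si32)sstr] & 0x80;               // nw
            //     c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);     // n, ne
            //     c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);     // nf
            //     return c_q;
            //   }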
+            // prepare context for the next quad; eqn. 2 in ITU T.814
+            // sigma_q (w, sw)
+            c_q = ((t0 & 0x40U) << 2) | ((t0 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[0 - (si32)sstr] & 0x80;
+            // sigma_q (n, ne, nf)
+            c_q |= ((sp[2 - (si32)sstr] & 0xA0U) << 2);
+            c_q |= ((sp[4 - (si32)sstr] & 0x20U) << 4);
+
+            // remove data from the vlc stream (0 bits are removed if vlc
+            // is unused)
+            vlc_val = rev_advance64(&vlc, t0 & 0x7);
+
+            // second quad
+            ui16 t1 = 0;
+
+            // decode VLC using the context c_q and the head of the VLC
+            // bitstream
+            t1 = vlc_tbl1[ c_q + (vlc_val & 0x7F) ];
+
+            // if context is zero, use one MEL event
+            if (c_q == 0 && x < width) // zero context
+            {
+              run -= 2; // subtract 2, since the number of events is
+                        // multiplied by 2
+
+              // if the event is 0, discard the decoded t1
+              t1 = (run == -1) ? t1 : 0;
+
+              if (run < 0) // have we consumed all events in a run?
+                run = mel_get_run(&mel); // if yes, then get another run
+            }
+            t1 = x < width ? t1 : 0;
+            //run -= (c_q == 0 && x < width) ? 2 : 0;
+            //t1 = (c_q != 0 || run == -1) ? t1 : 0;
+            //if (run < 0)
+            //  run = mel_get_run(&mel); // get another run
+            sp[2] = t1;
+            x += 2;
+
+            // partial c_q, will be completed when we process the next quad
+            // sigma_q (w, sw)
+            c_q = ((t1 & 0x40U) << 2) | ((t1 & 0x80U) << 1);
+            // sigma_q (nw)
+            c_q |= sp[2 - (si32)sstr] & 0x80;
+
+            // remove data from the vlc stream; if qinf is not used,
+            // cwdlen is 0
+            vlc_val = rev_advance64(&vlc, t1 & 0x7);
+
+            // decode u
+            /////////////
+            // uvlc_mode is made up of u_offset bits from the quad pair
+            ui32 uvlc_mode = ((t0 & 0x8U) << 3) | ((t1 & 0x8U) << 4);
+            ui32 uvlc_entry = uvlc_tbl1[uvlc_mode + (vlc_val & 0x3F)];
+            // remove total prefix length
+            vlc_val = rev_advance64(&vlc, uvlc_entry & 0x7);
+            uvlc_entry >>= 3;
+            // extract suffixes for quads 0 and 1
+            ui32 len = uvlc_entry & 0xF;  // suffix length for 2 quads
+            ui32 tmp = (ui32)(vlc_val & ((1U << len) - 1)); // suffix value
+                                                            // for 2 quads
+            vlc_val = rev_advance64(&vlc, len);
+            uvlc_entry >>= 4;
+            // quad 0 length
+            len = uvlc_entry & 0x7; // quad 0 suffix length
+            uvlc_entry >>= 3;
+            ui16 u_q0 = (ui16)((uvlc_entry & 7) + (tmp & ~(0xFFU << len)));
+            ui16 u_q1 = (ui16)((uvlc_entry >> 3) + (tmp >> len)); // u_q
+
+            // decode u_q extensions, which are needed only when u_q > 32
+            ui16 u_ext; bool cond0, cond1;
+            cond0 = u_q0 > 32;
+            u_ext = (ui16)(cond0 ? (vlc_val & 0xF) : 0);
+            vlc_val = rev_advance64(&vlc, cond0 ? 4 : 0);
+            u_q0 = (ui16)(u_q0 + (u_ext << 2));
+            sp[1] = u_q0;
+            cond1 = u_q1 > 32;
+            u_ext = (ui16)(cond1 ? (vlc_val & 0xF) : 0);
+            vlc_val = rev_advance64(&vlc, cond1 ? 4 : 0);
+            u_q1 = (ui16)(u_q1 + (u_ext << 2));
+            sp[3] = u_q1;
+          }
+          sp[0] = sp[1] = 0;
+        }
+      }
+
+      // step 2: we decode MagSgn
+      {
+        // We allocate a scratch row for storing v_n values.
+        // We have 512 quads horizontally.
+        // We need an extra entry to handle the case of vp[1]
+        // when vp is at the last column.
+        // Here, we allocate 4 instead of 1 to make the buffer size
+        // a multiple of 16 bytes.
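        // An illustrative sketch, not part of the patch: the v_n values
        // saved in this scratch row feed the exponent bound of the next
        // quad row. For non-initial rows, step 2 below reconstructs
        // U_q = u_q + kappa, where kappa depends on the saved v_n of the
        // two quads above (the count_leading_zeros overload for ui64 is
        // the one this patch adds to ojph_arch.h):
        //
        //   static inline ui32 compute_U_q(ui32 inf, ui32 u_q,
        //                                  ui64 v_n_left, ui64 v_n_right)
        //   {
        //     ui32 gamma = inf & 0xF0;
        //     gamma &= gamma - 0x10;  // nonzero iff 2+ significant samples
        //     ui32 emax =
        //       63 - count_leading_zeros(2 | v_n_left | v_n_right);
        //     ui32 kappa = gamma ? emax : 1;  // emax here is E_max - 1
        //     return u_q + kappa;
        //   }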
+ const int v_n_size = 512 + 4; + ui64 v_n_scratch[v_n_size] = {0}; // 4+ kB + + frwd_struct magsgn; + frwd_init8<0xFF>(&magsgn, coded_data, lcup - scup); + + const ui16 *sp = scratch; + ui64 *vp = v_n_scratch; + ui64 *dp = decoded_data; + + ui64 prev_v_n = 0; + for (ui32 x = 0; x < width; sp += 2, ++vp) + { + ui32 inf = sp[0]; + ui32 U_q = sp[1]; + if (U_q > mmsbp2) + return false; + + ui64 v_n; + ui64 val = 0; + ui32 bit = 0; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 1; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + vp[0] = prev_v_n | v_n; + prev_v_n = 0; + ++dp; + if (++x >= width) + { ++vp; break; } + + val = 0; + bit = 2; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 3; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8 + bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + prev_v_n = v_n; + ++dp; + ++x; + } + vp[0] = prev_v_n; + + for (ui32 y = 2; y < height; y += 2) + { + const ui16 *sp = scratch + (y >> 1) * sstr; + ui64 *vp = v_n_scratch; + ui64 *dp = decoded_data + y * stride; + + prev_v_n = 0; + for (ui32 x = 0; x < width; sp += 2, ++vp) + { + ui32 inf = sp[0]; + ui32 u_q = sp[1]; + + ui32 gamma = inf & 0xF0; gamma &= gamma - 0x10; //is gamma_q 1? + ui32 emax = 63 - count_leading_zeros(2 | vp[0] | vp[1]); // emax-1 + ui32 kappa = gamma ? 
emax : 1; + + ui32 U_q = u_q + kappa; + if (U_q > mmsbp2) + return false; + + ui64 v_n; + ui64 val = 0; + ui32 bit = 0; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 1; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + vp[0] = prev_v_n | v_n; + prev_v_n = 0; + ++dp; + if (++x >= width) + { ++vp; break; } + + val = 0; + bit = 2; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[0] = val; + + v_n = 0; + val = 0; + bit = 3; + if (inf & (1 << (4 + bit))) + { + //get 32 bits of magsgn data + ui64 ms_val = frwd_fetch64<0xFF>(&magsgn); + ui32 m_n = U_q - ((inf >> (12 + bit)) & 1); // remove e_k + frwd_advance(&magsgn, m_n); //consume m_n + + val = ms_val << 63; // get sign bit + v_n = ms_val & ((1ULL << m_n) - 1); // keep only m_n bits + v_n |= (ui64)((inf >> (8+bit)) & 1) << m_n; // add EMB e_1 as MSB + v_n |= 1; // add center of bin + //v_n now has 2 * (\mu - 1) + 0.5 with correct sign bit + //add 2 to make it 2*\mu+0.5, shift it up to missing MSBs + val |= (v_n + 2) << (p - 1); + } + dp[stride] = val; + prev_v_n = v_n; + ++dp; + ++x; + } + vp[0] = prev_v_n; + } + } + + if (num_passes > 1) + { + // We use scratch again, we can divide it into multiple regions + // sigma holds all the significant samples, and it cannot + // be modified after it is set. 
it will be used during the
+      // Magnitude Refinement Pass.
+      ui16* const sigma = scratch;
+
+      ui32 mstr = (width + 3u) >> 2;   // divide by 4, since each
+                                       // ui16 contains 4 columns
+      mstr = ((mstr + 2u) + 7u) & ~7u; // multiples of 8
+
+      // We re-arrange quad significance, where each 4 consecutive
+      // bits represent one quad, into column significance, where
+      // each 4 consecutive bits represent one column of 4 rows
+      {
+        ui32 y;
+        for (y = 0; y < height; y += 4)
+        {
+          ui16* sp = scratch + (y >> 1) * sstr;
+          ui16* dp = sigma + (y >> 2) * mstr;
+          for (ui32 x = 0; x < width; x += 4, sp += 4, ++dp) {
+            ui32 t0 = 0, t1 = 0;
+            t0  = ((sp[0     ] & 0x30u) >> 4) | ((sp[0     ] & 0xC0u) >> 2);
+            t0 |= ((sp[2     ] & 0x30u) << 4) | ((sp[2     ] & 0xC0u) << 6);
+            t1  = ((sp[0+sstr] & 0x30u) >> 2) | ((sp[0+sstr] & 0xC0u)     );
+            t1 |= ((sp[2+sstr] & 0x30u) << 6) | ((sp[2+sstr] & 0xC0u) << 8);
+            dp[0] = (ui16)(t0 | t1);
+          }
+          dp[0] = 0; // set an extra entry on the right with 0
+        }
+        {
+          // reset one row after the codeblock
+          ui16* dp = sigma + (y >> 2) * mstr;
+          for (ui32 x = 0; x < width; x += 4, ++dp)
+            dp[0] = 0;
+          dp[0] = 0; // set an extra entry on the right with 0
+        }
+      }
+
+      // We perform the Significance Propagation Pass here
+      {
+        // This stores significance information of the previous
+        // 4 rows. Significance information in this array includes
+        // all significant samples in bitplane p - 1; that is,
+        // significant samples for bitplane p (discovered during the
+        // cleanup pass and stored in sigma) and samples that have recently
+        // become significant (during the SPP) in bitplane p-1.
+        // We store enough for the widest row, containing 1024 columns,
+        // which is equivalent to 256 ui16 entries, since each stores 4
+        // columns. We add an extra 8 entries, just in case we need more.
+        ui16 prev_row_sig[256 + 8] = {0}; // 528 Bytes
+
+        frwd_struct sigprop;
+        frwd_init<0>(&sigprop, coded_data + lengths1, (int)lengths2);
+
+        for (ui32 y = 0; y < height; y += 4)
+        {
+          ui32 pattern = 0xFFFFu; // a pattern of needed samples
+          if (height - y < 4) {
+            pattern = 0x7777u;
+            if (height - y < 3) {
+              pattern = 0x3333u;
+              if (height - y < 2)
+                pattern = 0x1111u;
+            }
+          }
+
+          // prev holds sig. info. for the previous quad, together
+          // with the rows on top of it and below it.
+          ui32 prev = 0;
+          ui16 *prev_sig = prev_row_sig;
+          ui16 *cur_sig = sigma + (y >> 2) * mstr;
+          ui64 *dpp = decoded_data + y * stride;
+          for (ui32 x = 0; x < width; x += 4, ++cur_sig, ++prev_sig)
+          {
+            // only rows and columns inside the stripe are included
+            si32 s = (si32)x + 4 - (si32)width;
+            s = ojph_max(s, 0);
+            pattern = pattern >> (s * 4);
+
+            // We first find locations that need to be tested (potential
+            // SPP members); these locations will end up in mbr.
+            // In each iteration, we produce 16 bits because cwd can have
+            // up to 16 bits of significance information, followed by the
+            // corresponding 16 bits of sign information; therefore, it is
+            // sufficient to fetch 32 bits of data per loop.
+
+            // Although we are interested in 16 bits only, we load 32 bits.
+            // For the 16 bits we are producing, we need the next 4 bits --
+            // we need data for at least 5 columns out of 8.
+            // Therefore loading 32 bits is easier than loading 16 bits
+            // twice.
+            ui32 ps = *(ui32*)prev_sig;
+            ui32 ns = *(ui32*)(cur_sig + mstr);
+            ui32 u = (ps & 0x88888888) >> 3;   // the row on top
+            if (!stripe_causal)
+              u |= (ns & 0x11111111) << 3;     // the row below
+
+            ui32 cs = *(ui32*)cur_sig;
+            // vertical integration
+            ui32 mbr = cs;                     // this sig. info.
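            // An annotation, not part of the patch: the dilation below
            // works on nibbles, where each nibble is one column of 4 rows.
            // Shifting by 1 within a nibble moves significance one row,
            // shifting by 4 moves it one column, and u contributes the
            // rows from the quad rows above and below:
            //
            //   ui32 dilate(ui32 cs, ui32 u, ui32 prev)
            //   {
            //     ui32 m = cs | ((cs & 0x77777777) << 1)  // above
            //                 | ((cs & 0xEEEEEEEE) >> 1)  // below
            //                 | u;                        // top/bottom rows
            //     return m | (m << 4) | (m >> 4)          // left/right
            //              | (prev >> 12);                // previous group
            //   }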
+ mbr |= (cs & 0x77777777) << 1; //above neighbors + mbr |= (cs & 0xEEEEEEEE) >> 1; //below neighbors + mbr |= u; + // horizontal integration + ui32 t = mbr; + mbr |= t << 4; // neighbors on the left + mbr |= t >> 4; // neighbors on the right + mbr |= prev >> 12; // significance of previous group + + // remove outside samples, and already significant samples + mbr &= pattern; + mbr &= ~cs; + + // find samples that become significant during the SPP + ui32 new_sig = mbr; + if (new_sig) + { + ui64 cwd = frwd_fetch<0>(&sigprop); + + ui32 cnt = 0; + ui32 col_mask = 0xFu; + ui32 inv_sig = ~cs & pattern; + for (int i = 0; i < 16; i += 4, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan one column + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x33u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0x76u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xECu << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + + sample_mask <<= 1; + if (new_sig & sample_mask) + { + new_sig &= ~sample_mask; + if (cwd & 1) + { + ui32 t = 0xC8u << i; + new_sig |= t & inv_sig; + } + cwd >>= 1; ++cnt; + } + } + + if (new_sig) + { + // new_sig has newly-discovered sig. samples during SPP + // find the signs and update decoded_data + ui64 *dp = dpp + x; + ui64 val = 3u << (p - 2); + col_mask = 0xFu; + for (int i = 0; i < 4; ++i, ++dp, col_mask <<= 4) + { + if ((col_mask & new_sig) == 0) + continue; + + //scan 4 signs + ui32 sample_mask = 0x1111u & col_mask; + if (new_sig & sample_mask) + { + assert(dp[0] == 0); + dp[0] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[stride] == 0); + dp[stride] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[2 * stride] == 0); + dp[2 * stride] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + + sample_mask += sample_mask; + if (new_sig & sample_mask) + { + assert(dp[3 * stride] == 0); + dp[3 * stride] = (cwd << 63) | val; + cwd >>= 1; ++cnt; + } + } + } + frwd_advance(&sigprop, cnt); + } + + new_sig |= cs; + *prev_sig = (ui16)(new_sig); + + // vertical integration for the new sig. info. + t = new_sig; + new_sig |= (t & 0x7777) << 1; //above neighbors + new_sig |= (t & 0xEEEE) >> 1; //below neighbors + // add sig. info. 
from the row on top and below + prev = new_sig | u; + // we need only the bits in 0xF000 + prev &= 0xF000; + } + } + } + + // We perform Magnitude Refinement Pass here + if (num_passes > 2) + { + rev_struct magref; + rev_init_mrp(&magref, coded_data, (int)lengths1, (int)lengths2); + + for (ui32 y = 0; y < height; y += 4) + { + ui32 *cur_sig = (ui32*)(sigma + (y >> 2) * mstr); + ui64 *dpp = decoded_data + y * stride; + ui64 half = 1ULL << (p - 2); + for (ui32 i = 0; i < width; i += 8) + { + //Process one entry from sigma array at a time + // Each nibble (4 bits) in the sigma array represents 4 rows, + // and the 32 bits contain 8 columns + ui32 cwd = rev_fetch_mrp(&magref); // get 32 bit data + ui32 sig = *cur_sig++; // 32 bit that will be processed now + ui32 col_mask = 0xFu; // a mask for a column in sig + if (sig) // if any of the 32 bits are set + { + for (int j = 0; j < 8; ++j) //one column at a time + { + if (sig & col_mask) // lowest nibble + { + ui64 *dp = dpp + i + j; // next column in decoded samples + ui32 sample_mask = 0x11111111u & col_mask; //LSB + + for (int k = 0; k < 4; ++k) { + if (sig & sample_mask) //if LSB is set + { + assert(dp[0] != 0); // decoded value cannot be zero + assert((dp[0] & half) == 0); // no half + ui64 sym = cwd & 1; // get it value + sym = (1 - sym) << (p - 1); // previous center of bin + sym |= half; // put half the center of bin + dp[0] ^= sym; // remove old bin center and put new + cwd >>= 1; // consume word + } + sample_mask += sample_mask; //next row + dp += stride; // next samples row + } + } + col_mask <<= 4; //next column + } + } + // consume data according to the number of bits set + rev_advance_mrp(&magref, population_count(sig)); + } + } + } + } + return true; + } + } +} \ No newline at end of file diff --git a/src/core/coding/ojph_block_encoder.cpp b/src/core/coding/ojph_block_encoder.cpp index 2023ef1..ffc9e8d 100644 --- a/src/core/coding/ojph_block_encoder.cpp +++ b/src/core/coding/ojph_block_encoder.cpp @@ -65,11 +65,12 @@ namespace ojph { static ui16 vlc_tbl1[2048] = { 0 }; //UVLC encoding - static int ulvc_cwd_pre[33]; - static int ulvc_cwd_pre_len[33]; - static int ulvc_cwd_suf[33]; - static int ulvc_cwd_suf_len[33]; - + const int num_uvlc_entries = 75; + struct uvlc_tbl_struct { + ui8 pre, pre_len, suf, suf_len, ext, ext_len; + }; + static uvlc_tbl_struct uvlc_tbl[num_uvlc_entries]; + ///////////////////////////////////////////////////////////////////////// static bool vlc_init_tables() { @@ -194,23 +195,61 @@ namespace ojph { static bool uvlc_init_tables() { //code goes from 0 to 31, extension and 32 are not supported here - ulvc_cwd_pre[0] = 0; ulvc_cwd_pre[1] = 1; ulvc_cwd_pre[2] = 2; - ulvc_cwd_pre[3] = 4; ulvc_cwd_pre[4] = 4; - ulvc_cwd_pre_len[0] = 0; ulvc_cwd_pre_len[1] = 1; - ulvc_cwd_pre_len[2] = 2; - ulvc_cwd_pre_len[3] = 3; ulvc_cwd_pre_len[4] = 3; - ulvc_cwd_suf[0] = 0; ulvc_cwd_suf[1] = 0; ulvc_cwd_suf[2] = 0; - ulvc_cwd_suf[3] = 0; ulvc_cwd_suf[4] = 1; - ulvc_cwd_suf_len[0] = 0; ulvc_cwd_suf_len[1] = 0; - ulvc_cwd_suf_len[2] = 0; - ulvc_cwd_suf_len[3] = 1; ulvc_cwd_suf_len[4] = 1; + uvlc_tbl[0].pre = 0; + uvlc_tbl[0].pre_len = 0; + uvlc_tbl[0].suf = 0; + uvlc_tbl[0].suf_len = 0; + uvlc_tbl[0].ext = 0; + uvlc_tbl[0].ext_len = 0; + + uvlc_tbl[1].pre = 1; + uvlc_tbl[1].pre_len = 1; + uvlc_tbl[1].suf = 0; + uvlc_tbl[1].suf_len = 0; + uvlc_tbl[1].ext = 0; + uvlc_tbl[1].ext_len = 0; + + uvlc_tbl[2].pre = 2; + uvlc_tbl[2].pre_len = 2; + uvlc_tbl[2].suf = 0; + uvlc_tbl[2].suf_len = 0; + uvlc_tbl[2].ext = 0; + uvlc_tbl[2].ext_len 
= 0; + + uvlc_tbl[3].pre = 4; + uvlc_tbl[3].pre_len = 3; + uvlc_tbl[3].suf = 0; + uvlc_tbl[3].suf_len = 1; + uvlc_tbl[3].ext = 0; + uvlc_tbl[3].ext_len = 0; + + uvlc_tbl[4].pre = 4; + uvlc_tbl[4].pre_len = 3; + uvlc_tbl[4].suf = 1; + uvlc_tbl[4].suf_len = 1; + uvlc_tbl[4].ext = 0; + uvlc_tbl[4].ext_len = 0; + for (int i = 5; i < 33; ++i) { - ulvc_cwd_pre[i] = 0; - ulvc_cwd_pre_len[i] = 3; - ulvc_cwd_suf[i] = i-5; - ulvc_cwd_suf_len[i] = 5; + uvlc_tbl[i].pre = 0; + uvlc_tbl[i].pre_len = 3; + uvlc_tbl[i].suf = (ui8)(i - 5); + uvlc_tbl[i].suf_len = 5; + uvlc_tbl[i].ext = 0; + uvlc_tbl[i].ext_len = 0; } + + for (int i = 33; i < num_uvlc_entries; ++i) + { + uvlc_tbl[i].pre = 0; + uvlc_tbl[i].pre_len = 3; + uvlc_tbl[i].suf = (ui8)(28 + (i - 33) % 4); + uvlc_tbl[i].suf_len = 5; + uvlc_tbl[i].ext = (ui8)((i - 33) / 4); + uvlc_tbl[i].ext_len = 4; + } + return true; } @@ -440,6 +479,29 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline void + ms_encode64(ms_struct* msp, ui64 cwd, int cwd_len) + { + while (cwd_len > 0) + { + if (msp->pos >= msp->buf_size) + OJPH_ERROR(0x00020005, "magnitude sign encoder's buffer is full"); + int t = ojph_min(msp->max_bits - msp->used_bits, cwd_len); + msp->tmp |= (ui32)((cwd & ((1ULL << t) - 1)) << msp->used_bits); + msp->used_bits += t; + cwd >>= t; + cwd_len -= t; + if (msp->used_bits >= msp->max_bits) + { + msp->buf[msp->pos++] = (ui8)msp->tmp; + msp->max_bits = (msp->tmp == 0xFF) ? 7 : 8; + msp->tmp = 0; + msp->used_bits = 0; + } + } + } + ////////////////////////////////////////////////////////////////////////// static inline void ms_terminate(ms_struct* msp) @@ -467,11 +529,11 @@ namespace ojph { // // ////////////////////////////////////////////////////////////////////////// - void ojph_encode_codeblock(ui32* buf, ui32 missing_msbs, ui32 num_passes, - ui32 width, ui32 height, ui32 stride, - ui32* lengths, - ojph::mem_elastic_allocator *elastic, - ojph::coded_lists *& coded) + void ojph_encode_codeblock32(ui32* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded) { assert(num_passes == 1); (void)num_passes; //currently not used @@ -693,23 +755,23 @@ namespace ojph { if (u_q0 > 2 && u_q1 > 2) { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0-2], ulvc_cwd_pre_len[u_q0-2]); - vlc_encode(&vlc, ulvc_cwd_pre[u_q1-2], ulvc_cwd_pre_len[u_q1-2]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0-2], ulvc_cwd_suf_len[u_q0-2]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q1-2], ulvc_cwd_suf_len[u_q1-2]); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].pre, uvlc_tbl[u_q0-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].pre, uvlc_tbl[u_q1-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].suf, uvlc_tbl[u_q0-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].suf, uvlc_tbl[u_q1-2].suf_len); } else if (u_q0 > 2 && u_q1 > 0) { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0], ulvc_cwd_pre_len[u_q0]); + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); vlc_encode(&vlc, u_q1 - 1, 1); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0], ulvc_cwd_suf_len[u_q0]); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); } else { - vlc_encode(&vlc, ulvc_cwd_pre[u_q0], ulvc_cwd_pre_len[u_q0]); - vlc_encode(&vlc, ulvc_cwd_pre[u_q1], ulvc_cwd_pre_len[u_q1]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q0], ulvc_cwd_suf_len[u_q0]); - vlc_encode(&vlc, ulvc_cwd_suf[u_q1], ulvc_cwd_suf_len[u_q1]); + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, 
uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len);
+          vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len);
         }
 
         //prepare for next iteration
@@ -910,10 +972,514 @@
           ms_encode(&ms, s[7] & ((1U<<m)-1), m);
         }
 
         //prepare for next iteration
         c_q0 |= ((rho[1] & 4) >> 1) | ((rho[1] & 8) >> 2);
+        s[0] = s[1] = s[2] = s[3] = s[4] = s[5] = s[6] = s[7] = 0;
+        e_q[0]=e_q[1]=e_q[2]=e_q[3]=e_q[4]=e_q[5]=e_q[6]=e_q[7]=0;
+        rho[0] = rho[1] = 0; e_qmax[0] = e_qmax[1] = 0;
+      }
+    }
+
+
+    terminate_mel_vlc(&mel, &vlc);
+    ms_terminate(&ms);
+
+    //copy to elastic
+    lengths[0] = mel.pos + vlc.pos + ms.pos;
+    elastic->get_buffer(mel.pos + vlc.pos + ms.pos, coded);
+    memcpy(coded->buf, ms.buf, ms.pos);
+    memcpy(coded->buf + ms.pos, mel.buf, mel.pos);
+    memcpy(coded->buf + ms.pos + mel.pos, vlc.buf - vlc.pos + 1, vlc.pos);
+
+    // put in the interface locator word
+    ui32 num_bytes = mel.pos + vlc.pos;
+    coded->buf[lengths[0]-1] = (ui8)(num_bytes >> 4);
+    coded->buf[lengths[0]-2] = coded->buf[lengths[0]-2] & 0xF0;
+    coded->buf[lengths[0]-2] =
+      (ui8)(coded->buf[lengths[0]-2] | (num_bytes & 0xF));
+
+    coded->avail_size -= lengths[0];
+  }
+
+  //////////////////////////////////////////////////////////////////////////
+  //
+  //
+  //
+  //
+  //////////////////////////////////////////////////////////////////////////
+  void ojph_encode_codeblock64(ui64* buf, ui32 missing_msbs, ui32 num_passes,
+                               ui32 width, ui32 height, ui32 stride,
+                               ui32* lengths,
+                               ojph::mem_elastic_allocator *elastic,
+                               ojph::coded_lists *& coded)
+  {
+    assert(num_passes == 1);
+    (void)num_passes; //currently not used
+    // 38 bits/sample + 1 color + 4 wavelet = 43 bits per sample.
+    // * 4096 samples / 8 bits per byte = 22016; then rounded up to the
+    // nearest 1 kB, giving 22528. This is expanded further to take into
+    // consideration stuffing at a max rate of 16 bits per 15 bits
+    // (1 bit for every 15 bits of data); in reality, it is much smaller
+    // than this.
+    const int ms_size = (22528 * 16 + 14) / 15; //more than enough
+    ui8 ms_buf[ms_size];
+    // For each quad, we need at most 7 bits for VLC and 12 bits for UVLC.
+    // So we have 1024 quads * 19 / 8, which is 2432. This must be
+    // multiplied by 16 / 15 to accommodate stuffing.
+    // The mel is at most around 1 bit/quad, giving around 128 bytes -- in
+    // practice there was one case where it got to 132 bytes. Even
+    // accounting for stuffing, it is smaller than 192. Therefore,
+    // 3072 is more than enough.
+    const int mel_vlc_size = 3072; //more than enough
+    ui8 mel_vlc_buf[mel_vlc_size];
+    const int mel_size = 192;
+    ui8 *mel_buf = mel_vlc_buf;
+    const int vlc_size = mel_vlc_size - mel_size;
+    ui8 *vlc_buf = mel_vlc_buf + mel_size;
+
+    mel_struct mel;
+    mel_init(&mel, mel_size, mel_buf);
+    vlc_struct vlc;
+    vlc_init(&vlc, vlc_size, vlc_buf);
+    ms_struct ms;
+    ms_init(&ms, ms_size, ms_buf);
+
+    ui32 p = 62 - missing_msbs;
+
+    //e_val: E values for a line (these are the highest set bit)
+    //cx_val: is the context values
+    //Each byte stores the info for two samples. For E, it is the maximum
+    // of the two samples, while for cx, it is the OR of these two samples.
+    //The maximum is between the pixel at the bottom left of one quad
+    // and the bottom right of the earlier quad. The same is true for cx.
+ //For a 1024 pixels, we need 512 bytes, the 2 extra, + // one for the non-existing earlier quad, and one for beyond the + // the end + ui8 e_val[513]; + ui8 cx_val[513]; + ui8* lep = e_val; lep[0] = 0; + ui8* lcxp = cx_val; lcxp[0] = 0; + + //initial row of quads + int e_qmax[2] = {0,0}, e_q[8] = {0,0,0,0,0,0,0,0}; + int rho[2] = {0,0}; + int c_q0 = 0; + ui64 s[8] = {0,0,0,0,0,0,0,0}, val, t; + ui32 y = 0; + ui64 *sp = buf; + for (ui32 x = 0; x < width; x += 4) + { + //prepare two quads + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL; // 2 \mu_p + if (val) + { + rho[0] = 1; + e_q[0] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = e_q[0]; + s[0] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 2; + e_q[1] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[1]); + s[1] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 1 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 4; + e_q[2] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[2]); + s[2] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 8; + e_q[3] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[3]); + s[3] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int Uq0 = ojph_max(e_qmax[0], 1); //kappa_q = 1 + int u_q0 = Uq0 - 1, u_q1 = 0; //kappa_q = 1 + + int eps0 = 0; + if (u_q0 > 0) + { + eps0 |= (e_q[0] == e_qmax[0]); + eps0 |= (e_q[1] == e_qmax[0]) << 1; + eps0 |= (e_q[2] == e_qmax[0]) << 2; + eps0 |= (e_q[3] == e_qmax[0]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++; + lep[0] = (ui8)e_q[3]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++; + lcxp[0] = (ui8)((rho[0] & 8) >> 3); + + ui16 tuple0 = vlc_tbl0[(c_q0 << 8) + (rho[0] << 4) + eps0]; + vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7); + + if (c_q0 == 0) + mel_encode(&mel, rho[0] != 0); + + int m = (rho[0] & 1) ? Uq0 - (tuple0 & 1) : 0; + ms_encode64(&ms, s[0] & ((1ULL << m) - 1), m); + m = (rho[0] & 2) ? Uq0 - ((tuple0 & 2) >> 1) : 0; + ms_encode64(&ms, s[1] & ((1ULL << m) - 1), m); + m = (rho[0] & 4) ? Uq0 - ((tuple0 & 4) >> 2) : 0; + ms_encode64(&ms, s[2] & ((1ULL << m) - 1), m); + m = (rho[0] & 8) ? Uq0 - ((tuple0 & 8) >> 3) : 0; + ms_encode64(&ms, s[3] & ((1ULL << m) - 1), m); + + if (x + 2 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] = 1; + e_q[4] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = e_q[4]; + s[4] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? 
sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 2; + e_q[5] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[5]); + s[5] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 3 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 4; + e_q[6] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[6]); + s[6] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = height > 1 ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 8; + e_q[7] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[7]); + s[7] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int c_q1 = (rho[0] >> 1) | (rho[0] & 1); + int Uq1 = ojph_max(e_qmax[1], 1); //kappa_q = 1 + u_q1 = Uq1 - 1; //kappa_q = 1 + + int eps1 = 0; + if (u_q1 > 0) + { + eps1 |= (e_q[4] == e_qmax[1]); + eps1 |= (e_q[5] == e_qmax[1]) << 1; + eps1 |= (e_q[6] == e_qmax[1]) << 2; + eps1 |= (e_q[7] == e_qmax[1]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[5]); lep++; + lep[0] = (ui8)e_q[7]; + lcxp[0] |= (ui8)(lcxp[0] | (ui8)((rho[1] & 2) >> 1)); lcxp++; + lcxp[0] = (ui8)((rho[1] & 8) >> 3); + ui16 tuple1 = vlc_tbl0[(c_q1 << 8) + (rho[1] << 4) + eps1]; + vlc_encode(&vlc, tuple1 >> 8, (tuple1 >> 4) & 7); + + if (c_q1 == 0) + mel_encode(&mel, rho[1] != 0); + + int m = (rho[1] & 1) ? Uq1 - (tuple1 & 1) : 0; + ms_encode64(&ms, s[4] & ((1ULL << m) - 1), m); + m = (rho[1] & 2) ? Uq1 - ((tuple1 & 2) >> 1) : 0; + ms_encode64(&ms, s[5] & ((1ULL << m) - 1), m); + m = (rho[1] & 4) ? Uq1 - ((tuple1 & 4) >> 2) : 0; + ms_encode64(&ms, s[6] & ((1ULL << m) - 1), m); + m = (rho[1] & 8) ? 
Uq1 - ((tuple1 & 8) >> 3) : 0; + ms_encode64(&ms, s[7] & ((1ULL << m) - 1), m); + } + + if (u_q0 > 0 && u_q1 > 0) + mel_encode(&mel, ojph_min(u_q0, u_q1) > 2); + + if (u_q0 > 2 && u_q1 > 2) + { + vlc_encode(&vlc, uvlc_tbl[u_q0-2].pre, uvlc_tbl[u_q0-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].pre, uvlc_tbl[u_q1-2].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].suf, uvlc_tbl[u_q0-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].suf, uvlc_tbl[u_q1-2].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0-2].ext, uvlc_tbl[u_q0-2].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1-2].ext, uvlc_tbl[u_q1-2].ext_len); + } + else if (u_q0 > 2 && u_q1 > 0) + { + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, u_q1 - 1, 1); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + } + else + { + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].ext, uvlc_tbl[u_q1].ext_len); + } + + //prepare for next iteration + c_q0 = (rho[1] >> 1) | (rho[1] & 1); + s[0] = s[1] = s[2] = s[3] = s[4] = s[5] = s[6] = s[7] = 0; + e_q[0]=e_q[1]=e_q[2]=e_q[3]=e_q[4]=e_q[5]=e_q[6]=e_q[7]=0; + rho[0] = rho[1] = 0; e_qmax[0] = e_qmax[1] = 0; + } + + lep[1] = 0; + + for (y = 2; y < height; y += 2) + { + lep = e_val; + int max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = 0; + lcxp = cx_val; + c_q0 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = 0; + + sp = buf + y * stride; + for (ui32 x = 0; x < width; x += 4) + { + //prepare two quads + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] = 1; + e_q[0] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = e_q[0]; + s[0] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 2; + e_q[1] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[1]); + s[1] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 1 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 4; + e_q[2] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[2]); + s[2] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[0] += 8; + e_q[3] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[0] = ojph_max(e_qmax[0], e_q[3]); + s[3] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + int kappa = (rho[0] & (rho[0]-1)) ? 
ojph_max(1,max_e) : 1; + int Uq0 = ojph_max(e_qmax[0], kappa); + int u_q0 = Uq0 - kappa, u_q1 = 0; + + int eps0 = 0; + if (u_q0 > 0) + { + eps0 |= (e_q[0] == e_qmax[0]); + eps0 |= (e_q[1] == e_qmax[0]) << 1; + eps0 |= (e_q[2] == e_qmax[0]) << 2; + eps0 |= (e_q[3] == e_qmax[0]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[1]); lep++; + max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = (ui8)e_q[3]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[0] & 2) >> 1)); lcxp++; + int c_q1 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = (ui8)((rho[0] & 8) >> 3); + ui16 tuple0 = vlc_tbl1[(c_q0 << 8) + (rho[0] << 4) + eps0]; + vlc_encode(&vlc, tuple0 >> 8, (tuple0 >> 4) & 7); + + if (c_q0 == 0) + mel_encode(&mel, rho[0] != 0); + + int m = (rho[0] & 1) ? Uq0 - (tuple0 & 1) : 0; + ms_encode64(&ms, s[0] & ((1ULL << m) - 1), m); + m = (rho[0] & 2) ? Uq0 - ((tuple0 & 2) >> 1) : 0; + ms_encode64(&ms, s[1] & ((1ULL << m) - 1), m); + m = (rho[0] & 4) ? Uq0 - ((tuple0 & 4) >> 2) : 0; + ms_encode64(&ms, s[2] & ((1ULL << m) - 1), m); + m = (rho[0] & 8) ? Uq0 - ((tuple0 & 8) >> 3) : 0; + ms_encode64(&ms, s[3] & ((1ULL << m) - 1), m); + + if (x + 2 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] = 1; + e_q[4] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = e_q[4]; + s[4] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 2; + e_q[5] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[5]); + s[5] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + if (x + 3 < width) + { + t = sp[0]; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 4; + e_q[6] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[6]); + s[6] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + + t = y + 1 < height ? sp[stride] : 0; + ++sp; + val = t + t; //multiply by 2 and get rid of sign + val >>= p; // 2 \mu_p + x + val &= ~1ULL;// 2 \mu_p + if (val) + { + rho[1] += 8; + e_q[7] = 64 - (int)count_leading_zeros(--val); //2\mu_p - 1 + e_qmax[1] = ojph_max(e_qmax[1], e_q[7]); + s[7] = --val + (t >> 63); //v_n = 2(\mu_p-1) + s_n + } + } + + kappa = (rho[1] & (rho[1]-1)) ? ojph_max(1,max_e) : 1; + c_q1 |= ((rho[0] & 4) >> 1) | ((rho[0] & 8) >> 2); + int Uq1 = ojph_max(e_qmax[1], kappa); + u_q1 = Uq1 - kappa; + + int eps1 = 0; + if (u_q1 > 0) + { + eps1 |= (e_q[4] == e_qmax[1]); + eps1 |= (e_q[5] == e_qmax[1]) << 1; + eps1 |= (e_q[6] == e_qmax[1]) << 2; + eps1 |= (e_q[7] == e_qmax[1]) << 3; + } + lep[0] = ojph_max(lep[0], (ui8)e_q[5]); lep++; + max_e = ojph_max(lep[0], lep[1]) - 1; + lep[0] = (ui8)e_q[7]; + lcxp[0] = (ui8)(lcxp[0] | (ui8)((rho[1] & 2) >> 1)); lcxp++; + c_q0 = lcxp[0] + (lcxp[1] << 2); + lcxp[0] = (ui8)((rho[1] & 8) >> 3); + ui16 tuple1 = vlc_tbl1[(c_q1 << 8) + (rho[1] << 4) + eps1]; + vlc_encode(&vlc, tuple1 >> 8, (tuple1 >> 4) & 7); + + if (c_q1 == 0) + mel_encode(&mel, rho[1] != 0); + + int m = (rho[1] & 1) ? Uq1 - (tuple1 & 1) : 0; + ms_encode64(&ms, s[4] & ((1ULL << m) - 1), m); + m = (rho[1] & 2) ? Uq1 - ((tuple1 & 2) >> 1) : 0; + ms_encode64(&ms, s[5] & ((1ULL << m) - 1), m); + m = (rho[1] & 4) ? 
Uq1 - ((tuple1 & 4) >> 2) : 0; + ms_encode64(&ms, s[6] & ((1ULL << m) - 1), m); + m = (rho[1] & 8) ? Uq1 - ((tuple1 & 8) >> 3) : 0; + ms_encode64(&ms, s[7] & ((1ULL << m) - 1), m); + } + + vlc_encode(&vlc, uvlc_tbl[u_q0].pre, uvlc_tbl[u_q0].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].pre, uvlc_tbl[u_q1].pre_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].suf, uvlc_tbl[u_q0].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].suf, uvlc_tbl[u_q1].suf_len); + vlc_encode(&vlc, uvlc_tbl[u_q0].ext, uvlc_tbl[u_q0].ext_len); + vlc_encode(&vlc, uvlc_tbl[u_q1].ext, uvlc_tbl[u_q1].ext_len); //prepare for next iteration c_q0 |= ((rho[1] & 4) >> 1) | ((rho[1] & 8) >> 2); diff --git a/src/core/coding/ojph_block_encoder.h b/src/core/coding/ojph_block_encoder.h index 43d32d8..72b3c0d 100644 --- a/src/core/coding/ojph_block_encoder.h +++ b/src/core/coding/ojph_block_encoder.h @@ -52,11 +52,18 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void - ojph_encode_codeblock(ui32* buf, ui32 missing_msbs, ui32 num_passes, - ui32 width, ui32 height, ui32 stride, - ui32* lengths, - ojph::mem_elastic_allocator *elastic, - ojph::coded_lists *& coded); + ojph_encode_codeblock32(ui32* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded); + + void + ojph_encode_codeblock64(ui64* buf, ui32 missing_msbs, ui32 num_passes, + ui32 width, ui32 height, ui32 stride, + ui32* lengths, + ojph::mem_elastic_allocator *elastic, + ojph::coded_lists *& coded); void ojph_encode_codeblock_avx2(ui32* buf, ui32 missing_msbs, @@ -71,6 +78,9 @@ namespace ojph { ui32 stride, ui32* lengths, ojph::mem_elastic_allocator *elastic, ojph::coded_lists *& coded); + + bool initialize_block_encoder_tables_avx2(); + bool initialize_block_encoder_tables_avx512(); } } diff --git a/src/core/coding/ojph_block_encoder_avx2.cpp b/src/core/coding/ojph_block_encoder_avx2.cpp index d579f83..7624272 100644 --- a/src/core/coding/ojph_block_encoder_avx2.cpp +++ b/src/core/coding/ojph_block_encoder_avx2.cpp @@ -64,8 +64,8 @@ namespace ojph { // index is (c_q << 8) + (rho << 4) + eps // data is (cwd << 8) + (cwd_len << 4) + eps // table 0 is for the initial line of quads - static ui32 vlc_tbl0[2048] = { 0 }; - static ui32 vlc_tbl1[2048] = { 0 }; + static ui32 vlc_tbl0[2048]; + static ui32 vlc_tbl1[2048]; //UVLC encoding static ui32 ulvc_cwd_pre[33]; @@ -218,18 +218,18 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - bool initialize_tables_avx2() { - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { - bool result; - result = vlc_init_tables(); - result = result && uvlc_init_tables(); - return result; - } - return false; - } + static bool tables_initialized = false; ///////////////////////////////////////////////////////////////////////// - static bool tables_initialized = initialize_tables_avx2(); + bool initialize_block_encoder_tables_avx2() { + if (!tables_initialized) { + memset(vlc_tbl0, 0, 2048 * sizeof(ui32)); + memset(vlc_tbl1, 0, 2048 * sizeof(ui32)); + tables_initialized = vlc_init_tables(); + tables_initialized = tables_initialized && uvlc_init_tables(); + } + return tables_initialized; + } ///////////////////////////////////////////////////////////////////////// // diff --git a/src/core/coding/ojph_block_encoder_avx512.cpp b/src/core/coding/ojph_block_encoder_avx512.cpp index 9df0e8e..b35373a 100644 --- a/src/core/coding/ojph_block_encoder_avx512.cpp +++ 
b/src/core/coding/ojph_block_encoder_avx512.cpp @@ -64,8 +64,8 @@ namespace ojph { // index is (c_q << 8) + (rho << 4) + eps // data is (cwd << 8) + (cwd_len << 4) + eps // table 0 is for the initial line of quads - static ui32 vlc_tbl0[2048] = { 0 }; - static ui32 vlc_tbl1[2048] = { 0 }; + static ui32 vlc_tbl0[2048]; + static ui32 vlc_tbl1[2048]; //UVLC encoding static ui32 ulvc_cwd_pre[33]; @@ -218,18 +218,18 @@ namespace ojph { } ///////////////////////////////////////////////////////////////////////// - bool initialize_tables() { - if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512) { - bool result; - result = vlc_init_tables(); - result = result && uvlc_init_tables(); - return result; - } - return false; - } + static bool tables_initialized = false; ///////////////////////////////////////////////////////////////////////// - static bool tables_initialized = initialize_tables(); + bool initialize_block_encoder_tables_avx512() { + if (!tables_initialized) { + memset(vlc_tbl0, 0, 2048 * sizeof(ui32)); + memset(vlc_tbl1, 0, 2048 * sizeof(ui32)); + tables_initialized = vlc_init_tables(); + tables_initialized = tables_initialized && uvlc_init_tables(); + } + return tables_initialized; + } ///////////////////////////////////////////////////////////////////////// // diff --git a/src/core/common/ojph_arch.h b/src/core/common/ojph_arch.h index 947f25b..29ab7a5 100644 --- a/src/core/common/ojph_arch.h +++ b/src/core/common/ojph_arch.h @@ -166,6 +166,32 @@ namespace ojph { #endif } + ///////////////////////////////////////////////////////////////////////////// + static inline ui32 population_count64(ui64 val) + { + #if defined(OJPH_COMPILER_MSVC) \ + && (defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) + return (ui32)__popcnt64(val); + #elif (defined OJPH_COMPILER_GNUC) + return (ui32)__builtin_popcountll(val); + #else + const ui64 k1 = 0x5555555555555555ull; + const ui64 k2 = 0x3333333333333333ull; + const ui64 k4 = 0x0F0F0F0F0F0F0F0Full; + const ui64 kf = 0x0101010101010101ull; + + // put count of each 2 bits into those 2 bits + val = val - ((val >> 1) & k1); + // put count of each 4 bits into those 4 bits + val = (val & k2) + ((val >> 2) & k2); + // put count of each 8 bits into those 8 bits + val = (val + (val >> 4)) & k4 ; + // returns 8 most significant bits of x + (x<<8) + (x<<16) + (x<<24) + ... 
+ val = (val * kf) >> 56; + return (ui32) val; + #endif + } + ///////////////////////////////////////////////////////////////////////////// #ifdef OJPH_COMPILER_MSVC #pragma intrinsic(_BitScanReverse) @@ -188,6 +214,29 @@ namespace ojph { #endif } + ///////////////////////////////////////////////////////////////////////////// +#ifdef OJPH_COMPILER_MSVC + #pragma intrinsic(_BitScanReverse64) +#endif + static inline ui32 count_leading_zeros(ui64 val) + { + #ifdef OJPH_COMPILER_MSVC + unsigned long result = 0; + _BitScanReverse64(&result, val); + return 63 ^ (ui32)result; + #elif (defined OJPH_COMPILER_GNUC) + return (ui32)__builtin_clzll(val); + #else + val |= (val >> 1); + val |= (val >> 2); + val |= (val >> 4); + val |= (val >> 8); + val |= (val >> 16); + val |= (val >> 32); + return 64 - population_count64(val); + #endif + } + ///////////////////////////////////////////////////////////////////////////// #ifdef OJPH_COMPILER_MSVC #pragma intrinsic(_BitScanForward) @@ -237,9 +286,15 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// // constants //////////////////////////////////////////////////////////////////////////// - const ui32 byte_alignment = 64; // 64 bytes == 512 bits - const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); - const ui32 object_alignment = 8; + #ifndef OJPH_EMSCRIPTEN + const ui32 byte_alignment = 64; // 64 bytes == 512 bits + const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); + const ui32 object_alignment = 8; + #else + const ui32 byte_alignment = 16; // 16 bytes == 128 bits + const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment); + const ui32 object_alignment = 8; + #endif //////////////////////////////////////////////////////////////////////////// // templates for alignment @@ -247,17 +302,17 @@ namespace ojph { //////////////////////////////////////////////////////////////////////////// // finds the size such that it is a multiple of byte_alignment - template + template size_t calc_aligned_size(size_t size) { size = size * sizeof(T) + N - 1; size &= ~((1ULL << (31 - count_leading_zeros(N))) - 1); - size >>= (31 - count_leading_zeros(sizeof(T))); + size >>= (63 - count_leading_zeros((ui64)sizeof(T))); return size; } //////////////////////////////////////////////////////////////////////////// // moves the pointer to first address that is a multiple of byte_alignment - template + template inline T *align_ptr(T *ptr) { intptr_t p = reinterpret_cast(ptr); p += N - 1; diff --git a/src/core/common/ojph_codestream.h b/src/core/common/ojph_codestream.h index c28096e..f7a8065 100644 --- a/src/core/common/ojph_codestream.h +++ b/src/core/common/ojph_codestream.h @@ -61,7 +61,7 @@ namespace ojph { class comment_exchange; class mem_fixed_allocator; struct point; - struct line_buf; + class line_buf; class outfile_base; class infile_base; diff --git a/src/core/common/ojph_mem.h b/src/core/common/ojph_mem.h index d7497cd..99897f3 100644 --- a/src/core/common/ojph_mem.h +++ b/src/core/common/ojph_mem.h @@ -132,9 +132,23 @@ namespace ojph { }; ///////////////////////////////////////////////////////////////////////////// - struct line_buf + class line_buf { - line_buf() : size(0), pre_size(0), i32(0) {} + public: + enum : ui32 { + LFT_UNDEFINED = 0x00, // Type is undefined/uninitialized + // These flags reflects data size in bytes + LFT_BYTE = 0x01, // Set when data is 1 byte + LFT_16BIT = 0x02, // Set when data is 2 bytes + LFT_32BIT = 0x04, // Set when data is 4 bytes + 
LFT_64BIT = 0x08, // Set when data is 8 bytes + LFT_REVERSIBLE = 0x10, // Set when data is used for reversible coding + // Not all combinations are useful + LFT_SIZE_MASK = 0x0F, // To extract data size + }; + + public: + line_buf() : size(0), pre_size(0), flags(LFT_UNDEFINED), i32(0) {} template void pre_alloc(mem_fixed_allocator *p, size_t num_ele, ui32 pre_size) @@ -153,9 +167,12 @@ namespace ojph { size_t size; ui32 pre_size; + ui32 flags; union { - si32* i32; - float* f32; + si32* i32; // 32bit integer type, used for lossless compression + si64* i64; // 64bit integer type, used for lossless compression + float* f32; // float type, used for lossy compression + void* p; // no type is associated with the pointer }; }; diff --git a/src/core/common/ojph_version.h b/src/core/common/ojph_version.h index 2f3adcc..00faf75 100644 --- a/src/core/common/ojph_version.h +++ b/src/core/common/ojph_version.h @@ -34,5 +34,5 @@ //***************************************************************************/ #define OPENJPH_VERSION_MAJOR 0 -#define OPENJPH_VERSION_MINOR 17 +#define OPENJPH_VERSION_MINOR 18 #define OPENJPH_VERSION_PATCH 0 diff --git a/src/core/others/ojph_mem.cpp b/src/core/others/ojph_mem.cpp index b70d51e..0bb0b5f 100644 --- a/src/core/others/ojph_mem.cpp +++ b/src/core/others/ojph_mem.cpp @@ -65,22 +65,42 @@ namespace ojph { f32 = p->post_alloc_data(size, pre_size); } + //////////////////////////////////////////////////////////////////////////// + template<> + void line_buf::finalize_alloc(mem_fixed_allocator *p) + { + assert(p != 0 && size != 0); + i64 = p->post_alloc_data(size, pre_size); + } + //////////////////////////////////////////////////////////////////////////// template<> void line_buf::wrap(si32 *buffer, size_t num_ele, ui32 pre_size) { - i32 = buffer; + this->i32 = buffer; this->size = num_ele; this->pre_size = pre_size; + this->flags = LFT_32BIT | LFT_REVERSIBLE; } //////////////////////////////////////////////////////////////////////////// template<> void line_buf::wrap(float *buffer, size_t num_ele, ui32 pre_size) { - f32 = buffer; + this->f32 = buffer; + this->size = num_ele; + this->pre_size = pre_size; + this->flags = LFT_32BIT; + } + + //////////////////////////////////////////////////////////////////////////// + template<> + void line_buf::wrap(si64 *buffer, size_t num_ele, ui32 pre_size) + { + this->i64 = buffer; this->size = num_ele; this->pre_size = pre_size; + this->flags = LFT_64BIT | LFT_REVERSIBLE; } //////////////////////////////////////////////////////////////////////////// diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index ca96d2d..a98b477 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -39,19 +39,28 @@ #include "ojph_defs.h" #include "ojph_arch.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include "ojph_colour_local.h" namespace ojph { + + // defined elsewhere + class line_buf; + namespace local { ////////////////////////////////////////////////////////////////////////// - void (*cnvrt_si32_to_si32_shftd) - (const si32 *sp, si32 *dp, int shift, ui32 width) = NULL; + void (*rev_convert) + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// - void (*cnvrt_si32_to_si32_nlt_type3) - (const si32* sp, si32* dp, int shift, ui32 width) = NULL; + void (*rev_convert_nlt_type3) + (const line_buf *src_line, const ui32 
src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_float_shftd) @@ -71,13 +80,13 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void (*rct_forward) - (const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) = NULL; + (const line_buf* r, const line_buf* g, const line_buf* b, + line_buf* y, line_buf* cb, line_buf* cr, ui32 repeat) = NULL; ////////////////////////////////////////////////////////////////////////// void (*rct_backward) - (const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) = NULL; + (const line_buf* r, const line_buf* g, const line_buf* b, + line_buf* y, line_buf* cb, line_buf* cr, ui32 repeat) = NULL; ////////////////////////////////////////////////////////////////////////// void (*ict_forward) @@ -100,8 +109,8 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) - cnvrt_si32_to_si32_shftd = gen_cnvrt_si32_to_si32_shftd; - cnvrt_si32_to_si32_nlt_type3 = gen_cnvrt_si32_to_si32_nlt_type3; + rev_convert = gen_rev_convert; + rev_convert_nlt_type3 = gen_rev_convert_nlt_type3; cnvrt_si32_to_float_shftd = gen_cnvrt_si32_to_float_shftd; cnvrt_si32_to_float = gen_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = gen_cnvrt_float_to_si32_shftd; @@ -130,10 +139,10 @@ namespace ojph { #ifndef OJPH_DISABLE_SSE2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE2) { + rev_convert = sse2_rev_convert; + rev_convert_nlt_type3 = sse2_rev_convert_nlt_type3; cnvrt_float_to_si32_shftd = sse2_cnvrt_float_to_si32_shftd; cnvrt_float_to_si32 = sse2_cnvrt_float_to_si32; - cnvrt_si32_to_si32_shftd = sse2_cnvrt_si32_to_si32_shftd; - cnvrt_si32_to_si32_nlt_type3 = sse2_cnvrt_si32_to_si32_nlt_type3; rct_forward = sse2_rct_forward; rct_backward = sse2_rct_backward; } @@ -154,8 +163,8 @@ namespace ojph { #ifndef OJPH_DISABLE_AVX2 if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX2) { - cnvrt_si32_to_si32_shftd = avx2_cnvrt_si32_to_si32_shftd; - cnvrt_si32_to_si32_nlt_type3 = avx2_cnvrt_si32_to_si32_nlt_type3; + rev_convert = avx2_rev_convert; + rev_convert_nlt_type3 = avx2_rev_convert_nlt_type3; rct_forward = avx2_rct_forward; rct_backward = avx2_rct_backward; } @@ -168,8 +177,9 @@ namespace ojph { #endif // !OJPH_DISABLE_SIMD #else // OJPH_ENABLE_WASM_SIMD - cnvrt_si32_to_si32_shftd = wasm_cnvrt_si32_to_si32_shftd; - cnvrt_si32_to_si32_nlt_type3 = wasm_cnvrt_si32_to_si32_nlt_type3; + + rev_convert = wasm_rev_convert; + rev_convert_nlt_type3 = wasm_rev_convert_nlt_type3; cnvrt_si32_to_float_shftd = wasm_cnvrt_si32_to_float_shftd; cnvrt_si32_to_float = wasm_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = wasm_cnvrt_float_to_si32_shftd; @@ -178,6 +188,7 @@ namespace ojph { rct_backward = wasm_rct_backward; ict_forward = wasm_ict_forward; ict_backward = wasm_ict_backward; + #endif // !OJPH_ENABLE_WASM_SIMD colour_transform_functions_initialized = true; @@ -201,20 +212,78 @@ namespace ojph { #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN) ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void gen_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width) { - for (ui32 i = width; i > 0; --i) - *dp++ = *sp++ + 
shift;
+      if (src_line->flags & line_buf::LFT_32BIT)
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          si32 s = (si32)shift;
+          for (ui32 i = width; i > 0; --i)
+            *dp++ = *sp++ + s;
+        }
+        else
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          for (ui32 i = width; i > 0; --i)
+            *dp++ = *sp++ + shift;
+        }
+      }
+      else
+      {
+        assert(src_line->flags & line_buf::LFT_64BIT);
+        assert(dst_line->flags & line_buf::LFT_32BIT);
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        for (ui32 i = width; i > 0; --i)
+          *dp++ = (si32)(*sp++ + shift);
+      }
     }

     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp,
-                                          int shift, ui32 width)
+    void gen_rev_convert_nlt_type3(
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
+      si64 shift, ui32 width)
     {
-      for (ui32 i = width; i > 0; --i) {
-        const si32 v = *sp++;
-        *dp++ = v >= 0 ? v : (- v - shift);
+      if (src_line->flags & line_buf::LFT_32BIT)
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          si32 s = (si32)shift;
+          for (ui32 i = width; i > 0; --i) {
+            const si32 v = *sp++;
+            *dp++ = v >= 0 ? v : (- v - s);
+          }
+        }
+        else
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          for (ui32 i = width; i > 0; --i) {
+            const si64 v = *sp++;
+            *dp++ = v >= 0 ? v : (- v - shift);
+          }
+        }
+      }
+      else
+      {
+        assert(src_line->flags & line_buf::LFT_64BIT);
+        assert(dst_line->flags & line_buf::LFT_32BIT);
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        for (ui32 i = width; i > 0; --i) {
+          const si64 v = *sp++;
+          *dp++ = (si32)(v >= 0 ?
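/*
  rev_convert above generalizes the old cnvrt_si32_to_si32_shftd: it moves
  samples between 32-bit and 64-bit lines while applying a DC offset, and
  the shift parameter is now si64 because with bit depths up to 32 the
  offset no longer fits in si32. A scalar sketch of the overflow this
  avoids (illustrative only):

  #include <cstdint>

  int64_t dc_offset(uint32_t bit_depth) {    // e.g. bit_depth == 32
    return int64_t(1) << (bit_depth - 1);    // 1 << 31 would overflow si32
  }
  // widening a 32-bit sample into a 64-bit line:
  //   dst[i] = int64_t(src[i]) + dc_offset(32);   // exact, no wraparound
*/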
v : (- v - shift)); + } } } @@ -251,26 +320,104 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void gen_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void gen_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { - for (ui32 i = repeat; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - *y++ = (*r + (*g << 1) + *b) >> 2; - *cb++ = (*b++ - *g); - *cr++ = (*r++ - *g++); + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (ui32 i = repeat; i > 0; --i) + { + si32 rr = *rp++, gg = *gp++, bb = *bp++; + *yp++ = (rr + (gg << 1) + bb) >> 2; + *cbp++ = (bb - gg); + *crp++ = (rr - gg); + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (ui32 i = repeat; i > 0; --i) + { + si64 rr = *rp++, gg = *gp++, bb = *bp++; + *yp++ = (rr + (gg << 1) + bb) >> 2; + *cbp++ = (bb - gg); + *crp++ = (rr - gg); + } } } ////////////////////////////////////////////////////////////////////////// - void gen_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) + void gen_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { - for (ui32 i = repeat; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) + { + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (ui32 i = repeat; i > 0; --i) + { + si32 yy = *yp++, cbb = *cbp++, crr = *crp++; + si32 gg = yy - ((cbb + crr) >> 2); + *rp++ = crr + gg; + *gp++ = gg; + *bp++ = cbb + gg; + } + } + else { - *g = *y++ - ((*cb + *cr)>>2); - *b++ = *cb++ + *g; - *r++ = *cr++ + *g++; + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (ui32 i = repeat; i > 0; --i) + { + 
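/*
  Both gen_rct_forward paths above implement the Part-1 reversible colour
  transform (RCT); the 64-bit branch only widens the arithmetic. The
  property that matters is the exact round trip with gen_rct_backward,
  which a small scalar check makes explicit (illustrative):

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int64_t r = -4; r <= 4; ++r)
      for (int64_t g = -4; g <= 4; ++g)
        for (int64_t b = -4; b <= 4; ++b) {
          int64_t y  = (r + 2 * g + b) >> 2;  // forward RCT
          int64_t cb = b - g;
          int64_t cr = r - g;
          int64_t g2 = y - ((cb + cr) >> 2);  // backward RCT
          assert(g2 == g && cb + g2 == b && cr + g2 == r);
        }
    return 0;
  }
*/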
si64 yy = *yp++, cbb = *cbp++, crr = *crp++;
+          si64 gg = yy - ((cbb + crr) >> 2);
+          *rp++ = (si32)(crr + gg);
+          *gp++ = (si32)gg;
+          *bp++ = (si32)(cbb + gg);
+        }
+      }
     }

diff --git a/src/core/transform/ojph_colour.h b/src/core/transform/ojph_colour.h
index 52df312..cc42aaa 100644
--- a/src/core/transform/ojph_colour.h
+++ b/src/core/transform/ojph_colour.h
@@ -40,18 +40,26 @@
 #define OJPH_COLOR_H

 namespace ojph {
+
+  // defined elsewhere
+  class line_buf;
+
   namespace local {

   ////////////////////////////////////////////////////////////////////////////
   void init_colour_transform_functions();

   ////////////////////////////////////////////////////////////////////////////
-  extern void (*cnvrt_si32_to_si32_shftd)
-    (const si32 *sp, si32 *dp, int shift, ui32 width);
+  extern void (*rev_convert)
+    (const line_buf *src_line, const ui32 src_line_offset,
+     line_buf *dst_line, const ui32 dst_line_offset,
+     si64 shift, ui32 width);

   ////////////////////////////////////////////////////////////////////////////
-  extern void (*cnvrt_si32_to_si32_nlt_type3)
-    (const si32 *sp, si32 *dp, int shift, ui32 width);
+  extern void (*rev_convert_nlt_type3)
+    (const line_buf *src_line, const ui32 src_line_offset,
+     line_buf *dst_line, const ui32 dst_line_offset,
+     si64 shift, ui32 width);

   ////////////////////////////////////////////////////////////////////////////
   extern void (*cnvrt_si32_to_float_shftd)
@@ -71,13 +79,13 @@ namespace ojph {

   ////////////////////////////////////////////////////////////////////////////
   extern void (*rct_forward)
-    (const si32 *r, const si32 *g, const si32 *b,
-     si32 *y, si32 *cb, si32 *cr, ui32 repeat);
+    (const line_buf *r, const line_buf *g, const line_buf *b,
+     line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat);

   ////////////////////////////////////////////////////////////////////////////
   extern void (*rct_backward)
-    (const si32 *y, const si32 *cb, const si32 *cr,
-     si32 *r, si32 *g, si32 *b, ui32 repeat);
+    (const line_buf *y, const line_buf *cb, const line_buf *cr,
+     line_buf *r, line_buf *g, line_buf *b, ui32 repeat);

   ////////////////////////////////////////////////////////////////////////////
   extern void (*ict_forward)

diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp
index 14e5a35..05bff31 100644
--- a/src/core/transform/ojph_colour_avx2.cpp
+++ b/src/core/transform/ojph_colour_avx2.cpp
@@ -35,10 +35,12 @@
 // Date: 11 October 2019
 //***************************************************************************/

+#include <climits>
 #include <cmath>

 #include "ojph_defs.h"
 #include "ojph_arch.h"
+#include "ojph_mem.h"
 #include "ojph_colour.h"

 #include <immintrin.h>

@@ -46,82 +48,392 @@ namespace ojph {
   namespace local {

+    /////////////////////////////////////////////////////////////////////////
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline
+    __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
+    {
+      // note that m must be obtained using
+      // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
+      __m256i x = _mm256_srli_epi64(a, amt);
+      x = _mm256_xor_si256(x, m);
+      __m256i result = _mm256_sub_epi64(x, m);
+      return result;
+    }
+
     //////////////////////////////////////////////////////////////////////////
-    void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift,
-                                       ui32 width)
+    void avx2_rev_convert(const line_buf *src_line,
+                          const ui32 src_line_offset,
+                          line_buf *dst_line,
+                          const ui32 dst_line_offset,
+                          si64 shift, ui32 width)
     {
-      __m256i sh = _mm256_set1_epi32(shift);
-      for (int i = (width + 7) >> 3; i > 0; --i,
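/*
  AVX2 has no 64-bit arithmetic right shift, so avx2_mm256_srai_epi64 above
  synthesizes one from the logical shift: shift zeros in, then XOR and
  subtract a mask holding the shifted sign-bit position. The same trick on
  one scalar lane, for reference (illustrative):

  #include <cstdint>

  int64_t srai64(int64_t a, int amt) {
    uint64_t m = 1ull << (63 - amt);   // where the sign bit lands
    uint64_t x = uint64_t(a) >> amt;   // logical shift, zeros shifted in
    return int64_t((x ^ m) - m);       // sign-extends from bit (63 - amt)
  }
  // srai64(-8, 2) == -2, srai64(8, 2) == 2
*/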
sp+=8, dp+=8) + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi32((si32)shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s = _mm256_loadu_si256((__m256i*)sp); + s = _mm256_add_epi32(s, sh); + _mm256_storeu_si256((__m256i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s, t; + s = _mm256_loadu_si256((__m256i*)sp); + + t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 0)); + t = _mm256_add_epi64(t, sh); + _mm256_storeu_si256((__m256i*)dp, t); + + t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 1)); + t = _mm256_add_epi64(t, sh); + _mm256_storeu_si256((__m256i*)dp + 1, t); + } + } + } + else { - __m256i s = _mm256_loadu_si256((__m256i*)sp); - s = _mm256_add_epi32(s, sh); - _mm256_storeu_si256((__m256i*)dp, s); + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + __m256i sh = _mm256_set1_epi64x(shift); + for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) + { + __m256i s, t; + s = _mm256_loadu_si256((__m256i*)sp); + s = _mm256_add_epi64(s, sh); + + t = _mm256_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm256_and_si256(low_bits, t); + + s = _mm256_loadu_si256((__m256i*)sp + 1); + s = _mm256_add_epi64(s, sh); + + s = _mm256_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0)); + s = _mm256_andnot_si256(low_bits, s); + + t = _mm256_or_si256(s, t); + t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i*)dp, t); + } } } ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_nlt_type3(const si32* sp, si32* dp, - int shift, ui32 width) + void avx2_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) { - __m256i sh = _mm256_set1_epi32(-shift); - __m256i zero = _mm256_setzero_si256(); - for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi32((si32)(-shift)); + __m256i zero = _mm256_setzero_si256(); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + __m256i s = _mm256_loadu_si256((__m256i*)sp); + __m256i c = _mm256_cmpgt_epi32(zero, s); // 0xFFFFFFFF for -ve val + __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only -shift-val + s = _mm256_andnot_si256(c, s); // keep only +ve or 0 + s = _mm256_or_si256(s, v_m_sh); // combine + _mm256_storeu_si256((__m256i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(-shift); + __m256i zero = _mm256_setzero_si256(); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + __m256i s, t, u0, u1, c, v_m_sh; + s = 
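/*
  The 64-to-32 path above packs two vectors of four si64 into eight si32:
  a shuffle moves the low dword of every 64-bit lane into place, masks keep
  one half from each source, and a cross-lane permute restores sample
  order. The pattern in isolation, assuming truncation is value-preserving
  as the surrounding asserts require (illustrative):

  #include <immintrin.h>

  static inline __m256i pack_epi64_to_epi32(__m256i lo, __m256i hi) {
    const __m256i keep = _mm256_set_epi64x(0, -1, 0, -1);
    __m256i a = _mm256_and_si256(keep,
                  _mm256_shuffle_epi32(lo, _MM_SHUFFLE(0, 0, 2, 0)));
    __m256i b = _mm256_andnot_si256(keep,
                  _mm256_shuffle_epi32(hi, _MM_SHUFFLE(2, 0, 0, 0)));
    __m256i t = _mm256_or_si256(a, b);
    return _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0));
  }
*/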
_mm256_loadu_si256((__m256i*)sp); + + t = _mm256_cmpgt_epi32(zero, s); // find -ve 32bit -1 + u0 = _mm256_unpacklo_epi32(s, t); // correct 64bit data + c = _mm256_unpacklo_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm256_sub_epi64(sh, u0); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value + u0 = _mm256_andnot_si256(c, u0); // keep only +ve or 0 + u0 = _mm256_or_si256(u0, v_m_sh); // combine + + u1 = _mm256_unpackhi_epi32(s, t); // correct 64bit data + c = _mm256_unpackhi_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm256_sub_epi64(sh, u1); // - shift - value + v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value + u1 = _mm256_andnot_si256(c, u1); // keep only +ve or 0 + u1 = _mm256_or_si256(u1, v_m_sh); // combine + + t = _mm256_permute2x128_si256(u0, u1, (2 << 4) | 0); + _mm256_storeu_si256((__m256i*)dp, t); + + t = _mm256_permute2x128_si256(u0, u1, (3 << 4) | 1); + _mm256_storeu_si256((__m256i*)dp + 1, t); + } + } + } + else { - __m256i s = _mm256_loadu_si256((__m256i*)sp); - __m256i c = _mm256_cmpgt_epi32(s, zero); // 0xFFFFFFFF for +ve value - __m256i z = _mm256_cmpeq_epi32(s, zero); // 0xFFFFFFFF for 0 - c = _mm256_or_si256(c, z); // 0xFFFFFFFF for +ve and 0 - - __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value - v_m_sh = _mm256_andnot_si256(c, v_m_sh); // keep only - shift - value - s = _mm256_and_si256(c, s); // keep only +ve or 0 - s = _mm256_or_si256(s, v_m_sh); // combine - _mm256_storeu_si256((__m256i*)dp, s); + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m256i sh = _mm256_set1_epi64x(-shift); + __m256i zero = _mm256_setzero_si256(); + __m256i half_mask = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + for (int i = (width + 7) >> 3; i > 0; --i, sp += 8, dp += 8) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + __m256i s, t, p, n, m, tm; + s = _mm256_loadu_si256((__m256i*)sp); + + m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value + tm = _mm256_sub_epi64(sh, s); // - shift - value + n = _mm256_and_si256(m, tm); // -ve + p = _mm256_andnot_si256(m, s); // +ve + tm = _mm256_or_si256(n, p); + tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm256_and_si256(half_mask, tm); + + s = _mm256_loadu_si256((__m256i*)sp + 1); + m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value + tm = _mm256_sub_epi64(sh, s); // - shift - value + n = _mm256_and_si256(m, tm); // -ve + p = _mm256_andnot_si256(m, s); // +ve + tm = _mm256_or_si256(n, p); + tm = _mm256_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0)); + tm = _mm256_andnot_si256(half_mask, tm); + + t = _mm256_or_si256(t, tm); + t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_si256((__m256i*)dp, t); + } } } ////////////////////////////////////////////////////////////////////////// - void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void avx2_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) { - for (int i = (repeat + 7) >> 3; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + 
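/*
  Every rev_convert_nlt_type3 variant above vectorizes one scalar mapping:
  values >= 0 pass through, negative values become -v - shift (the NLT
  type-3 treatment of signed samples). The SIMD code computes both
  candidates and blends them with a comparison mask; a scalar model of
  that branchless select (illustrative):

  #include <cstdint>

  int64_t nlt3_branchless(int64_t v, int64_t shift) {
    int64_t c = -(v < 0);          // all-ones when v is negative, else 0
    int64_t neg = -shift - v;      // the "- v - shift" candidate
    return (c & neg) | (~c & v);   // and/andnot/or select, as in the SIMD
  }
  // equivalent to: v >= 0 ? v : -v - shift
*/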
(b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m256i mr = _mm256_load_si256((__m256i*)r); - __m256i mg = _mm256_load_si256((__m256i*)g); - __m256i mb = _mm256_load_si256((__m256i*)b); - __m256i t = _mm256_add_epi32(mr, mb); - t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1)); - _mm256_store_si256((__m256i*)y, _mm256_srai_epi32(t, 2)); - t = _mm256_sub_epi32(mb, mg); - _mm256_store_si256((__m256i*)cb, t); - t = _mm256_sub_epi32(mr, mg); - _mm256_store_si256((__m256i*)cr, t); - - r += 8; g += 8; b += 8; - y += 8; cb += 8; cr += 8; + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i mr = _mm256_load_si256((__m256i*)rp); + __m256i mg = _mm256_load_si256((__m256i*)gp); + __m256i mb = _mm256_load_si256((__m256i*)bp); + __m256i t = _mm256_add_epi32(mr, mb); + t = _mm256_add_epi32(t, _mm256_slli_epi32(mg, 1)); + _mm256_store_si256((__m256i*)yp, _mm256_srai_epi32(t, 2)); + t = _mm256_sub_epi32(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi32(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + rp += 8; gp += 8; bp += 8; + yp += 8; cbp += 8; crp += 8; + } } - } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i mr32 = _mm256_load_si256((__m256i*)rp); + __m256i mg32 = _mm256_load_si256((__m256i*)gp); + __m256i mb32 = _mm256_load_si256((__m256i*)bp); + __m256i mr, mg, mb, t; + mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 0)); + mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 0)); + mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 0)); + + t = _mm256_add_epi64(mr, mb); + t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); + _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2)); + t = _mm256_sub_epi64(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi64(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + yp += 4; cbp += 4; crp += 4; + + mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 1)); + mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 1)); + mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 1)); + + t = _mm256_add_epi64(mr, mb); + t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); + _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2)); + t = _mm256_sub_epi64(mb, mg); + _mm256_store_si256((__m256i*)cbp, t); + t = _mm256_sub_epi64(mr, mg); + _mm256_store_si256((__m256i*)crp, t); + + rp += 8; gp += 8; bp += 8; + yp += 4; cbp += 4; crp += 4; + } + } + } ////////////////////////////////////////////////////////////////////////// - void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) + void avx2_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - 
for (int i = (repeat + 7) >> 3; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m256i my = _mm256_load_si256((__m256i*)y); - __m256i mcb = _mm256_load_si256((__m256i*)cb); - __m256i mcr = _mm256_load_si256((__m256i*)cr); - - __m256i t = _mm256_add_epi32(mcb, mcr); - t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2)); - _mm256_store_si256((__m256i*)g, t); - __m256i u = _mm256_add_epi32(mcb, t); - _mm256_store_si256((__m256i*)b, u); - u = _mm256_add_epi32(mcr, t); - _mm256_store_si256((__m256i*)r, u); - - y += 8; cb += 8; cr += 8; - r += 8; g += 8; b += 8; + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i my = _mm256_load_si256((__m256i*)yp); + __m256i mcb = _mm256_load_si256((__m256i*)cbp); + __m256i mcr = _mm256_load_si256((__m256i*)crp); + + __m256i t = _mm256_add_epi32(mcb, mcr); + t = _mm256_sub_epi32(my, _mm256_srai_epi32(t, 2)); + _mm256_store_si256((__m256i*)gp, t); + __m256i u = _mm256_add_epi32(mcb, t); + _mm256_store_si256((__m256i*)bp, u); + u = _mm256_add_epi32(mcr, t); + _mm256_store_si256((__m256i*)rp, u); + + yp += 8; cbp += 8; crp += 8; + rp += 8; gp += 8; bp += 8; + } + } + else + { + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); + __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + 0, (si64)ULLONG_MAX); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 7) >> 3; i > 0; --i) + { + __m256i my, mcb, mcr, tr, tg, tb; + my = _mm256_load_si256((__m256i*)yp); + mcb = _mm256_load_si256((__m256i*)cbp); + mcr = _mm256_load_si256((__m256i*)crp); + + tg = _mm256_add_epi64(mcb, mcr); + tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2)); + tb = _mm256_add_epi64(mcb, tg); + tr = _mm256_add_epi64(mcr, tg); + + __m256i mr, mg, mb; + mr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0)); + mr = _mm256_and_si256(low_bits, mr); + mg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0)); + mg = _mm256_and_si256(low_bits, mg); + mb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0)); + mb = _mm256_and_si256(low_bits, mb); + + yp += 4; cbp += 4; crp += 4; + + my = _mm256_load_si256((__m256i*)yp); + mcb = _mm256_load_si256((__m256i*)cbp); + mcr = _mm256_load_si256((__m256i*)crp); + + tg = _mm256_add_epi64(mcb, mcr); + tg = _mm256_sub_epi64(my, avx2_mm256_srai_epi64(tg, 2, v2)); + tb = _mm256_add_epi64(mcb, tg); + tr = _mm256_add_epi64(mcr, tg); + + tr = _mm256_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0)); + tr = _mm256_andnot_si256(low_bits, tr); + mr = _mm256_or_si256(mr, tr); + mr = _mm256_permute4x64_epi64(mr, _MM_SHUFFLE(3, 1, 2, 0)); + + tg = _mm256_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0)); + tg = _mm256_andnot_si256(low_bits, tg); + mg = 
_mm256_or_si256(mg, tg); + mg = _mm256_permute4x64_epi64(mg, _MM_SHUFFLE(3, 1, 2, 0)); + + tb = _mm256_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0)); + tb = _mm256_andnot_si256(low_bits, tb); + mb = _mm256_or_si256(mb, tb); + mb = _mm256_permute4x64_epi64(mb, _MM_SHUFFLE(3, 1, 2, 0)); + + _mm256_store_si256((__m256i*)rp, mr); + _mm256_store_si256((__m256i*)gp, mg); + _mm256_store_si256((__m256i*)bp, mb); + + yp += 4; cbp += 4; crp += 4; + rp += 8; gp += 8; bp += 8; + } } } diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h index ae5eba1..5eb8b74 100644 --- a/src/core/transform/ojph_colour_local.h +++ b/src/core/transform/ojph_colour_local.h @@ -65,12 +65,16 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void gen_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, - int shift, ui32 width); + void gen_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// void gen_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, @@ -89,12 +93,14 @@ namespace ojph { ui32 width); ////////////////////////////////////////////////////////////////////////// - void gen_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void gen_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void gen_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void gen_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// void gen_ict_forward(const float *r, const float *g, const float *b, @@ -161,21 +167,26 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void sse2_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, - int shift, ui32 width); - + void sse2_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void sse2_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 
repeat); ////////////////////////////////////////////////////////////////////////// - void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void sse2_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// // @@ -218,20 +229,26 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void avx2_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void avx2_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, - int shift, ui32 width); + void avx2_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void avx2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void avx2_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void avx2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void avx2_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// // @@ -258,20 +275,26 @@ namespace ojph { ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width); + void wasm_rev_convert( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_nlt_type3(const si32 *sp, si32 *dp, - int shift, ui32 width); + void wasm_rev_convert_nlt_type3( + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, + si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat); + void wasm_rct_forward( + const line_buf *r, const line_buf *g, const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat); ////////////////////////////////////////////////////////////////////////// - void wasm_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat); + void wasm_rct_backward( + const line_buf *y, const line_buf *cb, const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat); ////////////////////////////////////////////////////////////////////////// void wasm_ict_forward(const float *r, const float *g, const float *b, diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index 
c50c091..a529c66 100644
--- a/src/core/transform/ojph_colour_sse2.cpp
+++ b/src/core/transform/ojph_colour_sse2.cpp
@@ -35,10 +35,12 @@
 // Date: 11 October 2019
 //***************************************************************************/

+#include <climits>
 #include <cmath>

 #include "ojph_defs.h"
 #include "ojph_arch.h"
+#include "ojph_mem.h"
 #include "ojph_colour.h"

 #include <emmintrin.h>

@@ -46,6 +48,207 @@ namespace ojph {
   namespace local {

+    /////////////////////////////////////////////////////////////////////////
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
+    {
+      // note that m must be obtained using
+      // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
+      __m128i x = _mm_srli_epi64(a, amt);
+      x = _mm_xor_si128(x, m);
+      __m128i result = _mm_sub_epi64(x, m);
+      return result;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline __m128i sse2_cvtlo_epi32_epi64(__m128i a, __m128i zero)
+    {
+      __m128i t;
+      t = _mm_cmplt_epi32(a, zero);  // get -ve
+      t = _mm_unpacklo_epi32(a, t);
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline __m128i sse2_cvthi_epi32_epi64(__m128i a, __m128i zero)
+    {
+      __m128i t;
+      t = _mm_cmplt_epi32(a, zero);  // get -ve
+      t = _mm_unpackhi_epi32(a, t);
+      return t;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_convert(const line_buf *src_line,
+                          const ui32 src_line_offset,
+                          line_buf *dst_line,
+                          const ui32 dst_line_offset,
+                          si64 shift, ui32 width)
+    {
+      if (src_line->flags & line_buf::LFT_32BIT)
+      {
+        if (dst_line->flags & line_buf::LFT_32BIT)
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si32 *dp = dst_line->i32 + dst_line_offset;
+          __m128i sh = _mm_set1_epi32((si32)shift);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+          {
+            __m128i s = _mm_loadu_si128((__m128i*)sp);
+            s = _mm_add_epi32(s, sh);
+            _mm_storeu_si128((__m128i*)dp, s);
+          }
+        }
+        else
+        {
+          const si32 *sp = src_line->i32 + src_line_offset;
+          si64 *dp = dst_line->i64 + dst_line_offset;
+          __m128i zero = _mm_setzero_si128();
+          __m128i sh = _mm_set1_epi64x(shift);
+          for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+          {
+            __m128i s, t;
+            s = _mm_loadu_si128((__m128i*)sp);
+
+            t = sse2_cvtlo_epi32_epi64(s, zero);
+            t = _mm_add_epi64(t, sh);
+            _mm_storeu_si128((__m128i*)dp, t);
+
+            t = sse2_cvthi_epi32_epi64(s, zero);
+            t = _mm_add_epi64(t, sh);
+            _mm_storeu_si128((__m128i*)dp + 1, t);
+          }
+        }
+      }
+      else
+      {
+        assert(src_line->flags & line_buf::LFT_64BIT);
+        assert(dst_line->flags & line_buf::LFT_32BIT);
+        const si64 *sp = src_line->i64 + src_line_offset;
+        si32 *dp = dst_line->i32 + dst_line_offset;
+        __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
+        __m128i sh = _mm_set1_epi64x(shift);
+        for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+        {
+          __m128i s, t;
+          s = _mm_loadu_si128((__m128i*)sp);
+          s = _mm_add_epi64(s, sh);
+
+          t = _mm_shuffle_epi32(s, _MM_SHUFFLE(0, 0, 2, 0));
+          t = _mm_and_si128(low_bits, t);
+
+          s = _mm_loadu_si128((__m128i*)sp + 1);
+          s = _mm_add_epi64(s, sh);
+
+          s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
+          s = _mm_andnot_si128(low_bits, s);
+
+          t = _mm_or_si128(s, t);
+          _mm_storeu_si128((__m128i*)dp, t);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_rev_convert_nlt_type3(const line_buf *src_line,
+                                    const ui32 src_line_offset,
+                                    line_buf *dst_line,
+                                    const ui32
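/*
  SSE2 predates pmovsxdq, so the helpers above sign-extend si32 lanes to
  si64 by pairing each lane with its own sign word from cmplt and
  interleaving. Distilled to one helper (illustrative, zero folded in):

  #include <emmintrin.h>

  static inline __m128i cvtlo_epi32_epi64(__m128i a) {
    __m128i sign = _mm_cmplt_epi32(a, _mm_setzero_si128()); // -1 if lane < 0
    return _mm_unpacklo_epi32(a, sign);  // [a0, s0, a1, s1] = two si64
  }
*/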
dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m128i sh = _mm_set1_epi32((si32)(-shift)); + __m128i zero = _mm_setzero_si128(); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + __m128i s = _mm_loadu_si128((__m128i*)sp); + __m128i c = _mm_cmplt_epi32(s, zero); // 0xFFFFFFFF for -ve value + __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + s = _mm_andnot_si128(c, s); // keep only +ve or 0 + s = _mm_or_si128(s, v_m_sh); // combine + _mm_storeu_si128((__m128i*)dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + __m128i sh = _mm_set1_epi64x(-shift); + __m128i zero = _mm_setzero_si128(); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + __m128i s, t, u, c, v_m_sh; + s = _mm_loadu_si128((__m128i*)sp); + + t = _mm_cmplt_epi32(s, zero); // find -ve 32bit -1 + u = _mm_unpacklo_epi32(s, t); // correct 64bit data + c = _mm_unpacklo_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm_sub_epi64(sh, u); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + u = _mm_andnot_si128(c, u); // keep only +ve or 0 + u = _mm_or_si128(u, v_m_sh); // combine + + _mm_storeu_si128((__m128i*)dp, u); + u = _mm_unpackhi_epi32(s, t); // correct 64bit data + c = _mm_unpackhi_epi32(t, t); // 64bit -1 for -ve value + + v_m_sh = _mm_sub_epi64(sh, u); // - shift - value + v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value + u = _mm_andnot_si128(c, u); // keep only +ve or 0 + u = _mm_or_si128(u, v_m_sh); // combine + + _mm_storeu_si128((__m128i*)dp + 1, u); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + __m128i sh = _mm_set1_epi64x(-shift); + __m128i zero = _mm_setzero_si128(); + __m128i half_mask = _mm_set_epi64x(0, (si64)ULLONG_MAX); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + __m128i s, t, p, n, m, tm; + s = _mm_loadu_si128((__m128i*)sp); + + tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value + m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b + tm = _mm_sub_epi64(sh, s); // - shift - value + n = _mm_and_si128(m, tm); // -ve + p = _mm_andnot_si128(m, s); // +ve + tm = _mm_or_si128(n, p); + tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(0, 0, 2, 0)); + t = _mm_and_si128(half_mask, tm); + + s = _mm_loadu_si128((__m128i*)sp + 1); + tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value + m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b + tm = _mm_sub_epi64(sh, s); // - shift - value + n = _mm_and_si128(m, tm); // -ve + p = _mm_andnot_si128(m, s); // +ve + tm = _mm_or_si128(n, p); + tm = _mm_shuffle_epi32(tm, _MM_SHUFFLE(2, 0, 0, 0)); + tm = _mm_andnot_si128(half_mask, tm); + + t = _mm_or_si128(t, tm); + _mm_storeu_si128((__m128i*)dp, t); + } + } + } + ////////////////////////////////////////////////////////////////////////// void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width) @@ -80,82 +283,200 @@ namespace ojph { 
_MM_SET_ROUNDING_MODE(rounding_mode); } - ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void sse2_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 repeat) { - __m128i sh = _mm_set1_epi32(shift); - for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m128i s = _mm_loadu_si128((__m128i*)sp); - s = _mm_add_epi32(s, sh); - _mm_storeu_si128((__m128i*)dp, s); - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i mr = _mm_load_si128((__m128i*)rp); + __m128i mg = _mm_load_si128((__m128i*)gp); + __m128i mb = _mm_load_si128((__m128i*)bp); + __m128i t = _mm_add_epi32(mr, mb); + t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1)); + _mm_store_si128((__m128i*)yp, _mm_srai_epi32(t, 2)); + t = _mm_sub_epi32(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi32(mr, mg); + _mm_store_si128((__m128i*)crp, t); - ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_si32_to_si32_nlt_type3(const si32* sp, si32* dp, - int shift, ui32 width) - { - __m128i sh = _mm_set1_epi32(-shift); - __m128i zero = _mm_setzero_si128(); - for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + rp += 4; gp += 4; bp += 4; + yp += 4; cbp += 4; crp += 4; + } + } + else { - __m128i s = _mm_loadu_si128((__m128i*)sp); - __m128i c = _mm_cmplt_epi32(s, zero); // 0xFFFFFFFF for -ve value - __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value - v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value - s = _mm_andnot_si128(c, s); // keep only +ve or 0 - s = _mm_or_si128(s, v_m_sh); // combine - _mm_storeu_si128((__m128i*)dp, s); + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m128i zero = _mm_setzero_si128(); + __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i mr32 = _mm_load_si128((__m128i*)rp); + __m128i mg32 = _mm_load_si128((__m128i*)gp); + __m128i mb32 = _mm_load_si128((__m128i*)bp); + __m128i mr, mg, mb, t; + mr = sse2_cvtlo_epi32_epi64(mr32, zero); + mg = sse2_cvtlo_epi32_epi64(mg32, zero); + mb = sse2_cvtlo_epi32_epi64(mb32, zero); + + t = _mm_add_epi64(mr, mb); + t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1)); + _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2)); + t = _mm_sub_epi64(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi64(mr, mg); + _mm_store_si128((__m128i*)crp, t); + + yp += 2; cbp += 2; crp += 2; + + mr = 
sse2_cvthi_epi32_epi64(mr32, zero); + mg = sse2_cvthi_epi32_epi64(mg32, zero); + mb = sse2_cvthi_epi32_epi64(mb32, zero); + + t = _mm_add_epi64(mr, mb); + t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1)); + _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2)); + t = _mm_sub_epi64(mb, mg); + _mm_store_si128((__m128i*)cbp, t); + t = _mm_sub_epi64(mr, mg); + _mm_store_si128((__m128i*)crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 2; cbp += 2; crp += 2; + } } } ////////////////////////////////////////////////////////////////////////// - void sse2_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void sse2_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - for (int i = (repeat + 3) >> 2; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - __m128i mr = _mm_load_si128((__m128i*)r); - __m128i mg = _mm_load_si128((__m128i*)g); - __m128i mb = _mm_load_si128((__m128i*)b); - __m128i t = _mm_add_epi32(mr, mb); - t = _mm_add_epi32(t, _mm_slli_epi32(mg, 1)); - _mm_store_si128((__m128i*)y, _mm_srai_epi32(t, 2)); - t = _mm_sub_epi32(mb, mg); - _mm_store_si128((__m128i*)cb, t); - t = _mm_sub_epi32(mr, mg); - _mm_store_si128((__m128i*)cr, t); - - r += 4; g += 4; b += 4; - y += 4; cb += 4; cr += 4; - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i my = _mm_load_si128((__m128i*)yp); + __m128i mcb = _mm_load_si128((__m128i*)cbp); + __m128i mcr = _mm_load_si128((__m128i*)crp); - ////////////////////////////////////////////////////////////////////////// - void sse2_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) - { - for (int i = (repeat + 3) >> 2; i > 0; --i) + __m128i t = _mm_add_epi32(mcb, mcr); + t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2)); + _mm_store_si128((__m128i*)gp, t); + __m128i u = _mm_add_epi32(mcb, t); + _mm_store_si128((__m128i*)bp, u); + u = _mm_add_epi32(mcr, t); + _mm_store_si128((__m128i*)rp, u); + + yp += 4; cbp += 4; crp += 4; + rp += 4; gp += 4; bp += 4; + } + } + else { - __m128i my = _mm_load_si128((__m128i*)y); - __m128i mcb = _mm_load_si128((__m128i*)cb); - __m128i mcr = _mm_load_si128((__m128i*)cr); - - __m128i t = _mm_add_epi32(mcb, mcr); - t = _mm_sub_epi32(my, _mm_srai_epi32(t, 2)); - _mm_store_si128((__m128i*)g, t); - __m128i u = _mm_add_epi32(mcb, t); - _mm_store_si128((__m128i*)b, u); - u = _mm_add_epi32(mcr, t); - _mm_store_si128((__m128i*)r, u); - - y += 4; cb += 4; cr += 4; - r += 4; g += 4; b += 4; + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); + __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX); + const si64 *yp = y->i64, 
*cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + __m128i my, mcb, mcr, tr, tg, tb; + my = _mm_load_si128((__m128i*)yp); + mcb = _mm_load_si128((__m128i*)cbp); + mcr = _mm_load_si128((__m128i*)crp); + + tg = _mm_add_epi64(mcb, mcr); + tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2)); + tb = _mm_add_epi64(mcb, tg); + tr = _mm_add_epi64(mcr, tg); + + __m128i mr, mg, mb; + mr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(0, 0, 2, 0)); + mr = _mm_and_si128(low_bits, mr); + mg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(0, 0, 2, 0)); + mg = _mm_and_si128(low_bits, mg); + mb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(0, 0, 2, 0)); + mb = _mm_and_si128(low_bits, mb); + + yp += 2; cbp += 2; crp += 2; + + my = _mm_load_si128((__m128i*)yp); + mcb = _mm_load_si128((__m128i*)cbp); + mcr = _mm_load_si128((__m128i*)crp); + + tg = _mm_add_epi64(mcb, mcr); + tg = _mm_sub_epi64(my, sse2_mm_srai_epi64(tg, 2, v2)); + tb = _mm_add_epi64(mcb, tg); + tr = _mm_add_epi64(mcr, tg); + + tr = _mm_shuffle_epi32(tr, _MM_SHUFFLE(2, 0, 0, 0)); + tr = _mm_andnot_si128(low_bits, tr); + mr = _mm_or_si128(mr, tr); + tg = _mm_shuffle_epi32(tg, _MM_SHUFFLE(2, 0, 0, 0)); + tg = _mm_andnot_si128(low_bits, tg); + mg = _mm_or_si128(mg, tg); + tb = _mm_shuffle_epi32(tb, _MM_SHUFFLE(2, 0, 0, 0)); + tb = _mm_andnot_si128(low_bits, tb); + mb = _mm_or_si128(mb, tb); + + _mm_store_si128((__m128i*)rp, mr); + _mm_store_si128((__m128i*)gp, mg); + _mm_store_si128((__m128i*)bp, mb); + + yp += 2; cbp += 2; crp += 2; + rp += 4; gp += 4; bp += 4; + } } } - } } diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index 57b84c7..5bf6ccd 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -39,12 +39,164 @@ #include #include "ojph_defs.h" +#include "ojph_mem.h" #include "ojph_colour.h" #include "ojph_colour_local.h" namespace ojph { namespace local { + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_convert(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i32x4_splat((si32)shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s = wasm_v128_load(sp); + s = wasm_i32x4_add(s, sh); + wasm_v128_store(dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s, t; + s = wasm_v128_load(sp); + + t = wasm_i64x2_extend_low_i32x4(s); + t = wasm_i64x2_add(t, sh); + wasm_v128_store(dp, t); + + t = wasm_i64x2_extend_high_i32x4(s); + t = wasm_i64x2_add(t, sh); + wasm_v128_store(dp + 2, t); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(shift); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + v128_t s0, s1; + s0 = wasm_v128_load(sp); + s0 = wasm_i64x2_add(s0, sh); + s1 = wasm_v128_load(sp + 2); + s1 = wasm_i64x2_add(s1, sh); + s0 = wasm_i32x4_shuffle(s0, 
s1, 0, 2, 4 + 0, 4 + 2); + wasm_v128_store(dp, s0); + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, + si64 shift, ui32 width) + { + if (src_line->flags & line_buf::LFT_32BIT) + { + if (dst_line->flags & line_buf::LFT_32BIT) + { + const si32 *sp = src_line->i32 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i32x4_splat((si32)(-shift)); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + v128_t s = wasm_v128_load(sp); + v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value + v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + s = wasm_v128_andnot(c, s); // keep only +ve or 0 + s = wasm_v128_or(s, v_m_sh); // combine + wasm_v128_store(dp, s); + } + } + else + { + const si32 *sp = src_line->i32 + src_line_offset; + si64 *dp = dst_line->i64 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(-shift); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + v128_t s, u, c, v_m_sh; + s = wasm_v128_load(sp); + + u = wasm_i64x2_extend_low_i32x4(s); + c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value + v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + u = wasm_v128_andnot(c, u); // keep only +ve or 0 + u = wasm_v128_or(u, v_m_sh); // combine + + wasm_v128_store(dp, u); + + u = wasm_i64x2_extend_high_i32x4(s); + c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value + v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value + v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value + u = wasm_v128_andnot(c, u); // keep only +ve or 0 + u = wasm_v128_or(u, v_m_sh); // combine + + wasm_v128_store(dp + 2, u); + } + } + } + else + { + assert(src_line->flags | line_buf::LFT_64BIT); + assert(dst_line->flags | line_buf::LFT_32BIT); + const si64 *sp = src_line->i64 + src_line_offset; + si32 *dp = dst_line->i32 + dst_line_offset; + v128_t sh = wasm_i64x2_splat(-shift); + v128_t zero = wasm_i32x4_splat(0); + for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + { + // s for source, t for target, p for positive, n for negative, + // m for mask, and tm for temp + v128_t s, t0, t1, p, n, m, tm; + s = wasm_v128_load(sp); + m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value + tm = wasm_i64x2_sub(sh, s); // - shift - value + n = wasm_v128_and(m, tm); // -ve + p = wasm_v128_andnot(m, s); // +ve + t0 = wasm_v128_or(n, p); + + s = wasm_v128_load(sp + 2); + m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value + tm = wasm_i64x2_sub(sh, s); // - shift - value + n = wasm_v128_and(m, tm); // -ve + p = wasm_v128_andnot(m, s); // +ve + t1 = wasm_v128_or(n, p); + + t0 = wasm_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2); + wasm_v128_store(dp, t0); + } + } + } + ////////////////////////////////////////////////////////////////////////// void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width) @@ -108,80 +260,182 @@ namespace ojph { } } - ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_shftd(const si32 *sp, si32 *dp, int shift, - ui32 width) + void wasm_rct_forward(const line_buf *r, + const line_buf *g, + const line_buf *b, + line_buf *y, line_buf *cb, line_buf *cr, + ui32 
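/*
  On WASM SIMD the 64-to-32 narrowing needs no mask-and-merge dance:
  wasm_i32x4_shuffle addresses lanes of both operands at once (indices 4-7
  pick from the second), so lanes 0 and 2 of each input, i.e. the low
  dwords of the si64 values, are gathered in one step (illustrative):

  #include <wasm_simd128.h>

  static inline v128_t pack_i64x2_to_i32x4(v128_t lo, v128_t hi) {
    return wasm_i32x4_shuffle(lo, hi, 0, 2, 4 + 0, 4 + 2);
  }
*/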
repeat) { - v128_t sh = wasm_i32x4_splat(shift); - for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - v128_t s = wasm_v128_load(sp); - s = wasm_i32x4_add(s, sh); - wasm_v128_store(dp, s); - } - } + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; + si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; - ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_si32_nlt_type3(const si32* sp, si32* dp, - int shift, ui32 width) - { - v128_t sh = wasm_i32x4_splat(-shift); - v128_t zero = wasm_i32x4_splat(0); - for (int i = (width + 3) >> 2; i > 0; --i, sp += 4, dp += 4) + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t mr = wasm_v128_load(rp); + v128_t mg = wasm_v128_load(gp); + v128_t mb = wasm_v128_load(bp); + v128_t t = wasm_i32x4_add(mr, mb); + t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1)); + wasm_v128_store(yp, wasm_i32x4_shr(t, 2)); + t = wasm_i32x4_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i32x4_sub(mr, mg); + wasm_v128_store(crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 4; cbp += 4; crp += 4; + } + } + else { - v128_t s = wasm_v128_load(sp); - v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value - v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value - v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value - s = wasm_v128_andnot(c, s); // keep only +ve or 0 - s = wasm_v128_or(s, v_m_sh); // combine - wasm_v128_store(dp, s); + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t mr32 = wasm_v128_load(rp); + v128_t mg32 = wasm_v128_load(gp); + v128_t mb32 = wasm_v128_load(bp); + v128_t mr, mg, mb, t; + mr = wasm_i64x2_extend_low_i32x4(mr32); + mg = wasm_i64x2_extend_low_i32x4(mg32); + mb = wasm_i64x2_extend_low_i32x4(mb32); + + t = wasm_i64x2_add(mr, mb); + t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1)); + wasm_v128_store(yp, wasm_i64x2_shr(t, 2)); + t = wasm_i64x2_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i64x2_sub(mr, mg); + wasm_v128_store(crp, t); + + yp += 2; cbp += 2; crp += 2; + + mr = wasm_i64x2_extend_high_i32x4(mr32); + mg = wasm_i64x2_extend_high_i32x4(mg32); + mb = wasm_i64x2_extend_high_i32x4(mb32); + + t = wasm_i64x2_add(mr, mb); + t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1)); + wasm_v128_store(yp, wasm_i64x2_shr(t, 2)); + t = wasm_i64x2_sub(mb, mg); + wasm_v128_store(cbp, t); + t = wasm_i64x2_sub(mr, mg); + wasm_v128_store(crp, t); + + rp += 4; gp += 4; bp += 4; + yp += 2; cbp += 2; crp += 2; + } } } ////////////////////////////////////////////////////////////////////////// - void wasm_rct_forward(const si32 *r, const si32 *g, const si32 *b, - si32 *y, si32 *cb, si32 *cr, ui32 repeat) + void 
wasm_rct_backward(const line_buf *y, + const line_buf *cb, + const line_buf *cr, + line_buf *r, line_buf *g, line_buf *b, + ui32 repeat) { - for (int i = (repeat + 3) >> 2; i > 0; --i) + assert((y->flags & line_buf::LFT_REVERSIBLE) && + (cb->flags & line_buf::LFT_REVERSIBLE) && + (cr->flags & line_buf::LFT_REVERSIBLE) && + (r->flags & line_buf::LFT_REVERSIBLE) && + (g->flags & line_buf::LFT_REVERSIBLE) && + (b->flags & line_buf::LFT_REVERSIBLE)); + + if (y->flags & line_buf::LFT_32BIT) { - v128_t mr = wasm_v128_load(r); - v128_t mg = wasm_v128_load(g); - v128_t mb = wasm_v128_load(b); - v128_t t = wasm_i32x4_add(mr, mb); - t = wasm_i32x4_add(t, wasm_i32x4_shl(mg, 1)); - wasm_v128_store(y, wasm_i32x4_shr(t, 2)); - t = wasm_i32x4_sub(mb, mg); - wasm_v128_store(cb, t); - t = wasm_i32x4_sub(mr, mg); - wasm_v128_store(cr, t); + assert((y->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && + (cr->flags & line_buf::LFT_32BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t my = wasm_v128_load(yp); + v128_t mcb = wasm_v128_load(cbp); + v128_t mcr = wasm_v128_load(crp); - r += 4; g += 4; b += 4; - y += 4; cb += 4; cr += 4; - } - } + v128_t t = wasm_i32x4_add(mcb, mcr); + t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2)); + wasm_v128_store(gp, t); + v128_t u = wasm_i32x4_add(mcb, t); + wasm_v128_store(bp, u); + u = wasm_i32x4_add(mcr, t); + wasm_v128_store(rp, u); - ////////////////////////////////////////////////////////////////////////// - void wasm_rct_backward(const si32 *y, const si32 *cb, const si32 *cr, - si32 *r, si32 *g, si32 *b, ui32 repeat) - { - for (int i = (repeat + 3) >> 2; i > 0; --i) + yp += 4; cbp += 4; crp += 4; + rp += 4; gp += 4; bp += 4; + } + } + else { - v128_t my = wasm_v128_load(y); - v128_t mcb = wasm_v128_load(cb); - v128_t mcr = wasm_v128_load(cr); + assert((y->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && + (cr->flags & line_buf::LFT_64BIT) && + (r->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); + const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; + si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; + for (int i = (repeat + 3) >> 2; i > 0; --i) + { + v128_t my, mcb, mcr, tr0, tg0, tb0, tr1, tg1, tb1; + my = wasm_v128_load(yp); + mcb = wasm_v128_load(cbp); + mcr = wasm_v128_load(crp); - v128_t t = wasm_i32x4_add(mcb, mcr); - t = wasm_i32x4_sub(my, wasm_i32x4_shr(t, 2)); - wasm_v128_store(g, t); - v128_t u = wasm_i32x4_add(mcb, t); - wasm_v128_store(b, u); - u = wasm_i32x4_add(mcr, t); - wasm_v128_store(r, u); + tg0 = wasm_i64x2_add(mcb, mcr); + tg0 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg0, 2)); + tb0 = wasm_i64x2_add(mcb, tg0); + tr0 = wasm_i64x2_add(mcr, tg0); - y += 4; cb += 4; cr += 4; - r += 4; g += 4; b += 4; + yp += 2; cbp += 2; crp += 2; + + my = wasm_v128_load(yp); + mcb = wasm_v128_load(cbp); + mcr = wasm_v128_load(crp); + + tg1 = wasm_i64x2_add(mcb, mcr); + tg1 = wasm_i64x2_sub(my, wasm_i64x2_shr(tg1, 2)); + tb1 = wasm_i64x2_add(mcb, tg1); + tr1 = wasm_i64x2_add(mcr, tg1); + + tr0 = wasm_i32x4_shuffle(tr0, tr1, 0, 2, 4 + 0, 4 + 2); + tg0 = wasm_i32x4_shuffle(tg0, tg1, 0, 2, 4 + 0, 4 + 2); + tb0 = wasm_i32x4_shuffle(tb0, tb1, 0, 2, 4 + 0, 4 + 2); + + wasm_v128_store(rp, tr0); + wasm_v128_store(gp, tg0); + wasm_v128_store(bp, tb0); + 
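/*
  Note that unlike the SSE2 and AVX2 paths, WASM SIMD has a native 64-bit
  arithmetic shift, so this backward RCT uses wasm_i64x2_shr directly and
  needs no emulation like sse2_mm_srai_epi64. For reference (illustrative;
  build with -msimd128):

  #include <cstdint>
  #include <wasm_simd128.h>

  static inline v128_t srai_i64x2(v128_t a, uint32_t amt) {
    return wasm_i64x2_shr(a, amt);   // i64x2.shr_s, sign-preserving
  }
*/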
+        yp += 2; cbp += 2; crp += 2;
+        rp += 4; gp += 4; bp += 4;
+      }
     }
   }
 
diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp
index ee4bb08..c4313ab 100644
--- a/src/core/transform/ojph_transform.cpp
+++ b/src/core/transform/ojph_transform.cpp
@@ -45,7 +45,9 @@
 #include "../codestream/ojph_params_local.h"
 
 namespace ojph {
-  struct line_buf;
+
+  // defined elsewhere
+  class line_buf;
 
   namespace local {
 
@@ -156,9 +158,9 @@
 #if (defined(OJPH_ARCH_X86_64) && !defined(OJPH_DISABLE_AVX512))
       if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX512)
       {
-        rev_vert_step = avx512_rev_vert_step;
-        rev_horz_ana = avx512_rev_horz_ana;
-        rev_horz_syn = avx512_rev_horz_syn;
+        // rev_vert_step = avx512_rev_vert_step;
+        // rev_horz_ana = avx512_rev_horz_ana;
+        // rev_horz_syn = avx512_rev_horz_syn;
 
         irv_vert_step = avx512_irv_vert_step;
         irv_vert_times_K = avx512_irv_vert_times_K;
@@ -192,13 +194,14 @@
 #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                           const line_buf* other, const line_buf* aug,
-                           ui32 repeat, bool synthesis)
+    static
+    void gen_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                             const line_buf* other, const line_buf* aug,
+                             ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const ui32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       si32* dst = aug->i32;
       const si32* src1 = sig->i32, * src2 = other->i32;
@@ -243,9 +246,85 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                          const line_buf* hdst, const line_buf* src,
-                          ui32 width, bool even)
+    static
+    void gen_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                             const line_buf* other, const line_buf* aug,
+                             ui32 repeat, bool synthesis)
+    {
+      const si64 a = s->rev.Aatk;
+      const si64 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1; although the two are mathematically
+      // equivalent here, we identify the simpler forms from Part 1 and
+      // employ them.
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + *src1++ + *src2++) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + *src1++ + *src2++) >> e;
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (*src1++ + *src2++) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (*src1++ + *src2++) >> e;
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b - (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b - (*src1++ + *src2++)) >> e;
+      }
+      else { // general case
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+      }
+    }
+
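+    // The dispatcher below selects between the two implementations from
+    // the line_buf flags: if any of the three buffers is marked
+    // LFT_32BIT the whole lifting step runs in si32, otherwise it runs
+    // in si64; the asserts check that a call never mixes sample widths.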
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                           const line_buf* other, const line_buf* aug,
+                           ui32 repeat, bool synthesis)
+    {
+      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
+          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
+          ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_32BIT) &&
+               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
+        gen_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
+      }
+      else
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_64BIT) &&
+               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
+        gen_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void gen_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
+                            const line_buf* hdst, const line_buf* src,
+                            ui32 width, bool even)
     {
       if (width > 1)
       {
@@ -277,7 +356,7 @@ namespace ojph {
           const lifting_step* s = atk->get_step(j - 1);
           const si32 a = s->rev.Aatk;
           const si32 b = s->rev.Batk;
-          const ui32 e = s->rev.Eatk;
+          const ui8 e = s->rev.Eatk;
 
           // extension
           lp[-1] = lp[0];
@@ -319,11 +398,111 @@ namespace ojph {
           hdst->i32[0] = src->i32[0] << 1;
       }
     }
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst,
-                          const line_buf* lsrc, const line_buf* hsrc,
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void gen_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
+                            const line_buf* hdst, const line_buf* src,
+                            ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        si64* dph = hdst->i64;
+        si64* dpl = ldst->i64;
+        si64* sp = src->i64;
+        ui32 w = width;
+        if (!even)
+        {
+          *dph++ = *sp++; --w;
+        }
+        for (; w > 1; w -= 2)
+        {
+          *dpl++ = *sp++; *dph++ = *sp++;
+        }
+        if (w)
+        {
+          *dpl++ = *sp++; --w;
+        }
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // first lifting step
+          const lifting_step* s = atk->get_step(j - 1);
+          const si64 a = s->rev.Aatk;
+          const si64 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si64* sp = lp + (even ?
1 : 0); + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + (sp[-1] + sp[0])) >> e; + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp -= (sp[-1] + sp[0]) >> e; + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b - (sp[-1] + sp[0])) >> e; + } + else { + // general case + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + gen_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + gen_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + static + void gen_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -337,7 +516,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; // extension oth[-1] = oth[0]; @@ -398,6 +577,105 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static + void gen_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si64 a = s->rev.Aatk; + const si64 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth + (ev ? 
0 : 1); + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + (sp[-1] + sp[0])) >> e; + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp += (sp[-1] + sp[0]) >> e; + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b - (sp[-1] + sp[0])) >> e; + } + else { + // general case + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + si64* sph = hsrc->i64; + si64* spl = lsrc->i64; + si64* dp = dst->i64; + ui32 w = width; + if (!even) + { + *dp++ = *sph++; --w; + } + for (; w > 1; w -= 2) + { + *dp++ = *spl++; *dp++ = *sph++; + } + if (w) + { + *dp++ = *spl++; --w; + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + gen_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + gen_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + ////////////////////////////////////////////////////////////////////////// void gen_irv_vert_step(const lifting_step* s, const line_buf* sig, const line_buf* other, const line_buf* aug, diff --git a/src/core/transform/ojph_transform.h b/src/core/transform/ojph_transform.h index 0e59632..f7576a1 100644 --- a/src/core/transform/ojph_transform.h +++ b/src/core/transform/ojph_transform.h @@ -42,7 +42,10 @@ #include "ojph_defs.h" namespace ojph { - struct line_buf; + + // defined elsewhere + class line_buf; + namespace local { union lifting_step; struct param_atk; diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp index 0856662..8838d18 100644 --- a/src/core/transform/ojph_transform_avx.cpp +++ b/src/core/transform/ojph_transform_avx.cpp @@ -61,6 +61,40 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline + void avx_deinterleave32(float* dpl, float* dph, float* sp, int width) + { + for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) + { + __m256 a = _mm256_load_ps(sp); + __m256 b = _mm256_load_ps(sp + 8); + __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); + __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); + __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); + _mm256_store_ps(dpl, e); + _mm256_store_ps(dph, f); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void avx_interleave32(float* dp, float* spl, float* sph, int width) + { + for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) + { + __m256 a = _mm256_load_ps(spl); + 
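+        // _mm256_unpacklo/hi_ps interleave within each 128-bit half, so
+        // the cross-lane _mm256_permute2f128_ps pass below restores the
+        // linear order: spl = L0..L7 and sph = H0..H7 come out as
+        // dp = L0 H0 L1 H1 ... L7 H7.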
+        __m256 b = _mm256_load_ps(sph);
+        __m256 c = _mm256_unpacklo_ps(a, b);
+        __m256 d = _mm256_unpackhi_ps(a, b);
+        __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
+        __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
+        _mm256_store_ps(dp, e);
+        _mm256_store_ps(dp + 8, f);
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void avx_irv_vert_step(const lifting_step* s, const line_buf* sig,
                            const line_buf* other, const line_buf* aug,
@@ -100,11 +134,11 @@
       {
         // split src into ldst and hdst
         {
-          float* dpl = ldst->f32;
-          float* dph = hdst->f32;
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
           float* sp = src->f32;
           int w = (int)width;
-          AVX_DEINTERLEAVE(dpl, dph, sp, w, even);
+          avx_deinterleave32(dpl, dph, sp, w);
         }
 
         // the actual horizontal transform
@@ -235,10 +269,10 @@
       // combine both lsrc and hsrc into dst
       {
         float* dp = dst->f32;
-        float* spl = lsrc->f32;
-        float* sph = hsrc->f32;
+        float* spl = even ? lsrc->f32 : hsrc->f32;
+        float* sph = even ? hsrc->f32 : lsrc->f32;
         int w = (int)width;
-        AVX_INTERLEAVE(dp, spl, sph, w, even);
+        avx_interleave32(dp, spl, sph, w);
       }
     }
     else {
diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp
index 847cd4c..1bc92e6 100644
--- a/src/core/transform/ojph_transform_avx2.cpp
+++ b/src/core/transform/ojph_transform_avx2.cpp
@@ -35,6 +35,7 @@
 // Date: 28 August 2019
 //***************************************************************************/
 
+#include <cassert>
 #include <immintrin.h>
 
 #include "ojph_defs.h"
@@ -52,13 +53,95 @@ namespace ojph {
   namespace local {
 
     /////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                            const line_buf* other, const line_buf* aug,
-                            ui32 repeat, bool synthesis)
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline
+    __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
+    {
+      // note that m must be obtained using
+      // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
+      __m256i x = _mm256_srli_epi64(a, amt);
+      x = _mm256_xor_si256(x, m);
+      __m256i result = _mm256_sub_epi64(x, m);
+      return result;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
+      {
+        __m256 a = _mm256_load_ps(sp);
+        __m256 b = _mm256_load_ps(sp + 8);
+        __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
+        __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
+        __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
+        __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm256_store_ps(dpl, e);
+        _mm256_store_ps(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
+      {
+        __m256 a = _mm256_load_ps(spl);
+        __m256 b = _mm256_load_ps(sph);
+        __m256 c = _mm256_unpacklo_ps(a, b);
+        __m256 d = _mm256_unpackhi_ps(a, b);
+        __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
+        __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
+        _mm256_store_ps(dp, e);
+        _mm256_store_ps(dp + 8, f);
+      }
+    }
+
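+    // The 64-bit helpers below mirror avx2_deinterleave32 and
+    // avx2_interleave32 for double-width samples.  A __m256d holds 4
+    // doubles, so each avx2_deinterleave64 iteration splits 8 doubles,
+    //   sp:  x0 x1 x2 x3 | x4 x5 x6 x7
+    //   dpl: x0 x2 x4 x6,   dph: x1 x3 x5 x7
+    // using a cross-lane _mm256_permute2f128_pd followed by an in-lane
+    // _mm256_shuffle_pd; avx2_interleave64 applies the inverse network.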
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_deinterleave64(double* dpl, double* dph, double* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m256d a = _mm256_load_pd(sp);
+        __m256d b = _mm256_load_pd(sp + 4);
+        __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
+        __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
+        __m256d e = _mm256_shuffle_pd(c, d, 0x0);
+        __m256d f = _mm256_shuffle_pd(c, d, 0xF);
+        _mm256_store_pd(dpl, e);
+        _mm256_store_pd(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void avx2_interleave64(double* dp, double* spl, double* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m256d a = _mm256_load_pd(spl);
+        __m256d b = _mm256_load_pd(sph);
+        __m256d c = _mm256_unpacklo_pd(a, b);
+        __m256d d = _mm256_unpackhi_pd(a, b);
+        __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
+        __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
+        _mm256_store_pd(dp, e);
+        _mm256_store_pd(dp + 4, f);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void avx2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const si32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       __m256i va = _mm256_set1_epi32(a);
       __m256i vb = _mm256_set1_epi32(b);
@@ -181,19 +264,154 @@
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src,
-                           ui32 width, bool even)
+    static
+    void avx2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m256i vb = _mm256_set1_epi64x(b);
+      __m256i ve = _mm256_set1_epi64x(1LL << (63 - e));
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1; although the two are mathematically
+      // equivalent here, we identify the simpler forms from Part 1 and
+      // employ them.
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i v = _mm256_add_epi64(vb, t);
+            __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+            d = _mm256_sub_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi64(s1, s2);
+            __m256i v = _mm256_add_epi64(vb, t);
+            __m256i w = avx2_mm256_srai_epi64(v, e, ve);
+            d = _mm256_add_epi64(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d =
_mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + else + for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)src1); + __m256i s2 = _mm256_load_si256((__m256i*)src2); + __m256i d = _mm256_load_si256((__m256i*)dst); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dst, d); + } + } + else { // general case + // 64bit multiplication is not supported in avx2; + // in particular, _mm256_mullo_epi64. + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + avx2_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + avx2_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { - // combine both lsrc and hsrc into dst + // split src into ldst and hdst { - float* dpl = ldst->f32; - float* dph = hdst->f32; - float* sp = src->f32; + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? 
hdst->f32 : ldst->f32; + float* sp = src->f32; int w = (int)width; - AVX_DEINTERLEAVE(dpl, dph, sp, w, even); + avx2_deinterleave32(dpl, dph, sp, w); } si32* hp = hdst->i32, * lp = ldst->i32; @@ -206,7 +424,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m256i va = _mm256_set1_epi32(a); __m256i vb = _mm256_set1_epi32(b); @@ -346,11 +564,181 @@ namespace ojph { hdst->i32[0] = src->i32[0] << 1; } } + + ///////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // split src into ldst and hdst + { + double* dpl = (double*)(even ? ldst->p : hdst->p); + double* dph = (double*)(even ? hdst->p : ldst->p); + double* sp = (double*)src->p; + int w = (int)width; + avx2_deinterleave64(dpl, dph, sp, w); + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m256i vb = _mm256_set1_epi64x(b); + __m256i ve = _mm256_set1_epi64x(1LL << (63 - e)); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i 
s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in avx2; + // in particular, _mm256_mullo_epi64. + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + avx2_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + avx2_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } ////////////////////////////////////////////////////////////////////////// - void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, - ui32 width, bool even) + static + void avx2_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -364,7 +752,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m256i va = _mm256_set1_epi32(a); __m256i vb = _mm256_set1_epi32(b); @@ -499,11 +887,11 @@ namespace ojph { // combine both lsrc and hsrc into dst { - float* dp = dst->f32; - float* spl = lsrc->f32; - float* sph = hsrc->f32; + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; int w = (int)width; - AVX_INTERLEAVE(dp, spl, sph, w, even); + avx2_interleave32(dp, spl, sph, w); } } else { @@ -514,5 +902,174 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static + void avx2_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m256i vb = _mm256_set1_epi64x(b); + __m256i ve = _mm256_set1_epi64x(1LL << (63 - e)); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else + { + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_add_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i w = avx2_mm256_srai_epi64(t, e, ve); + d = _mm256_add_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + else + for (; i > 0; i -= 4, sp += 4, dp += 4) + { + __m256i s1 = _mm256_load_si256((__m256i*)sp); + __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1)); + __m256i d = _mm256_load_si256((__m256i*)dp); + __m256i t = _mm256_add_epi64(s1, s2); + __m256i v = _mm256_sub_epi64(vb, t); + __m256i w = avx2_mm256_srai_epi64(v, e, ve); + d = _mm256_sub_epi64(d, w); + _mm256_store_si256((__m256i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in avx2; + // in particular, _mm_mullo_epi64. 
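+          // (the 256-bit form, _mm256_mullo_epi64, needs AVX512DQ +
+          // AVX512VL, so the general case runs one sample at a time;
+          // with a = -3, b = 2, e = 2, for example, each step is
+          //   *dp -= (2 - 3 * (sp[-1] + sp[0])) >> 2
+          // in plain 64-bit scalar arithmetic)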
+ if (ev) + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + else + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[0] + sp[1])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + double* dp = (double*)dst->p; + double* spl = (double*)(even ? lsrc->p : hsrc->p); + double* sph = (double*)(even ? hsrc->p : lsrc->p); + int w = (int)width; + avx2_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + avx2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + avx2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_avx512.cpp b/src/core/transform/ojph_transform_avx512.cpp index 504aa87..0e92230 100644 --- a/src/core/transform/ojph_transform_avx512.cpp +++ b/src/core/transform/ojph_transform_avx512.cpp @@ -54,8 +54,8 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// // We split multiples of 32 followed by multiples of 16, because // we assume byte_alignment == 64 - static void avx512_deinterleave(float* dpl, float* dph, float* sp, - int width, bool even) + static + void avx512_deinterleave32(float* dpl, float* dph, float* sp, int width) { __m512i idx1 = _mm512_set_epi32( 0x1E, 0x1C, 0x1A, 0x18, 0x16, 0x14, 0x12, 0x10, @@ -65,59 +65,33 @@ namespace ojph { 0x1F, 0x1D, 0x1B, 0x19, 0x17, 0x15, 0x13, 0x11, 0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01 ); - if (even) + for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) { - for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) - { - __m512 a = _mm512_load_ps(sp); - __m512 b = _mm512_load_ps(sp + 16); - __m512 c = _mm512_permutex2var_ps(a, idx1, b); - __m512 d = _mm512_permutex2var_ps(a, idx2, b); - _mm512_store_ps(dpl, c); - _mm512_store_ps(dph, d); - } - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) - { - __m256 a = _mm256_load_ps(sp); - __m256 b = _mm256_load_ps(sp + 8); - __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); - _mm256_store_ps(dpl, e); - _mm256_store_ps(dph, f); - } + __m512 a = _mm512_load_ps(sp); + __m512 b = _mm512_load_ps(sp + 16); + __m512 c = _mm512_permutex2var_ps(a, idx1, b); + __m512 d = _mm512_permutex2var_ps(a, idx2, b); + _mm512_store_ps(dpl, c); + _mm512_store_ps(dph, d); } - else + for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) { - for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16) - { - __m512 a = _mm512_load_ps(sp); - __m512 b = _mm512_load_ps(sp + 16); - __m512 c = _mm512_permutex2var_ps(a, 
idx2, b); - __m512 d = _mm512_permutex2var_ps(a, idx1, b); - _mm512_store_ps(dpl, c); - _mm512_store_ps(dph, d); - } - for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) - { - __m256 a = _mm256_load_ps(sp); - __m256 b = _mm256_load_ps(sp + 8); - __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); - __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); - __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); - __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); - _mm256_store_ps(dpl, f); - _mm256_store_ps(dph, e); - } + __m256 a = _mm256_load_ps(sp); + __m256 b = _mm256_load_ps(sp + 8); + __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); + __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); + __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); + _mm256_store_ps(dpl, e); + _mm256_store_ps(dph, f); } } ////////////////////////////////////////////////////////////////////////// // We split multiples of 32 followed by multiples of 16, because // we assume byte_alignment == 64 - static void avx512_interleave(float* dp, float* spl, float* sph, - int width, bool even) + static + void avx512_interleave32(float* dp, float* spl, float* sph, int width) { __m512i idx1 = _mm512_set_epi32( 0x17, 0x7, 0x16, 0x6, 0x15, 0x5, 0x14, 0x4, @@ -127,51 +101,93 @@ namespace ojph { 0x1F, 0xF, 0x1E, 0xE, 0x1D, 0xD, 0x1C, 0xC, 0x1B, 0xB, 0x1A, 0xA, 0x19, 0x9, 0x18, 0x8 ); - if (even) + for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) { - for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) - { - __m512 a = _mm512_load_ps(spl); - __m512 b = _mm512_load_ps(sph); - __m512 c = _mm512_permutex2var_ps(a, idx1, b); - __m512 d = _mm512_permutex2var_ps(a, idx2, b); - _mm512_store_ps(dp, c); - _mm512_store_ps(dp + 16, d); - } - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) - { - __m256 a = _mm256_load_ps(spl); - __m256 b = _mm256_load_ps(sph); - __m256 c = _mm256_unpacklo_ps(a, b); - __m256 d = _mm256_unpackhi_ps(a, b); - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); - _mm256_store_ps(dp, e); - _mm256_store_ps(dp + 8, f); - } + __m512 a = _mm512_load_ps(spl); + __m512 b = _mm512_load_ps(sph); + __m512 c = _mm512_permutex2var_ps(a, idx1, b); + __m512 d = _mm512_permutex2var_ps(a, idx2, b); + _mm512_store_ps(dp, c); + _mm512_store_ps(dp + 16, d); } - else + for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) { - for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16) - { - __m512 a = _mm512_load_ps(spl); - __m512 b = _mm512_load_ps(sph); - __m512 c = _mm512_permutex2var_ps(b, idx1, a); - __m512 d = _mm512_permutex2var_ps(b, idx2, a); - _mm512_store_ps(dp, c); - _mm512_store_ps(dp + 16, d); - } - for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) - { - __m256 a = _mm256_load_ps(spl); - __m256 b = _mm256_load_ps(sph); - __m256 c = _mm256_unpacklo_ps(b, a); - __m256 d = _mm256_unpackhi_ps(b, a); - __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); - __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); - _mm256_store_ps(dp, e); - _mm256_store_ps(dp + 8, f); - } + __m256 a = _mm256_load_ps(spl); + __m256 b = _mm256_load_ps(sph); + __m256 c = _mm256_unpacklo_ps(a, b); + __m256 d = _mm256_unpackhi_ps(a, b); + __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); + __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); + _mm256_store_ps(dp, e); + _mm256_store_ps(dp + 8, 
f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // We split multiples of 16 followed by multiples of 8, because
+    // we assume byte_alignment == 64
+    static void avx512_deinterleave64(double* dpl, double* dph, double* sp,
+                                      int width)
+    {
+      __m512i idx1 = _mm512_set_epi64(
+        0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00
+      );
+      __m512i idx2 = _mm512_set_epi64(
+        0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01
+      );
+      for (; width > 8; width -= 16, sp += 16, dpl += 8, dph += 8)
+      {
+        __m512d a = _mm512_load_pd(sp);
+        __m512d b = _mm512_load_pd(sp + 8);
+        __m512d c = _mm512_permutex2var_pd(a, idx1, b);
+        __m512d d = _mm512_permutex2var_pd(a, idx2, b);
+        _mm512_store_pd(dpl, c);
+        _mm512_store_pd(dph, d);
+      }
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m256d a = _mm256_load_pd(sp);
+        __m256d b = _mm256_load_pd(sp + 4);
+        __m256d c = _mm256_permute2f128_pd(a, b, (2 << 4) | (0));
+        __m256d d = _mm256_permute2f128_pd(a, b, (3 << 4) | (1));
+        __m256d e = _mm256_shuffle_pd(c, d, 0x0);
+        __m256d f = _mm256_shuffle_pd(c, d, 0xF);
+        _mm256_store_pd(dpl, e);
+        _mm256_store_pd(dph, f);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // We split multiples of 16 followed by multiples of 8, because
+    // we assume byte_alignment == 64
+    static void avx512_interleave64(double* dp, double* spl, double* sph,
+                                    int width)
+    {
+      __m512i idx1 = _mm512_set_epi64(
+        0xB, 0x3, 0xA, 0x2, 0x9, 0x1, 0x8, 0x0
+      );
+      __m512i idx2 = _mm512_set_epi64(
+        0xF, 0x7, 0xE, 0x6, 0xD, 0x5, 0xC, 0x4
+      );
+      for (; width > 8; width -= 16, dp += 16, spl += 8, sph += 8)
+      {
+        __m512d a = _mm512_load_pd(spl);
+        __m512d b = _mm512_load_pd(sph);
+        __m512d c = _mm512_permutex2var_pd(a, idx1, b);
+        __m512d d = _mm512_permutex2var_pd(a, idx2, b);
+        _mm512_store_pd(dp, c);
+        _mm512_store_pd(dp + 8, d);
+      }
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m256d a = _mm256_load_pd(spl);
+        __m256d b = _mm256_load_pd(sph);
+        __m256d c = _mm256_unpacklo_pd(a, b);
+        __m256d d = _mm256_unpackhi_pd(a, b);
+        __m256d e = _mm256_permute2f128_pd(c, d, (2 << 4) | (0));
+        __m256d f = _mm256_permute2f128_pd(c, d, (3 << 4) | (1));
+        _mm256_store_pd(dp, e);
+        _mm256_store_pd(dp + 4, f);
+      }
+    }
+
@@ -224,7 +240,13 @@
       if (width > 1)
       {
         // split src into ldst and hdst
-        avx512_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even);
+        {
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
+          float* sp = src->f32;
+          int w = (int)width;
+          avx512_deinterleave32(dpl, dph, sp, w);
+        }
 
         // the actual horizontal transform
         float* hp = hdst->f32, * lp = ldst->f32;
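The even flag no longer reaches the (de)interleave helpers; the callers swap the low- and high-pass pointers instead, as the hunks above and below show. A minimal round-trip sketch of the 32-bit pair (illustrative only; the buffer names and sizes are assumptions, and alignas(64) matches the 64-byte alignment the aligned loads and stores rely on):

    alignas(64) float src[32], low[16], high[16], dst[32];
    for (int i = 0; i < 32; ++i) src[i] = (float)i;  // 0, 1, 2, ..., 31
    avx512_deinterleave32(low, high, src, 32); // low = 0,2,...,30  high = 1,3,...,31
    avx512_interleave32(dst, low, high, 32);   // dst matches src again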
@@ -352,7 +374,13 @@
       }
 
       // combine both lsrc and hsrc into dst
-      avx512_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even);
+      {
+        float* dp = dst->f32;
+        float* spl = even ? lsrc->f32 : hsrc->f32;
+        float* sph = even ? hsrc->f32 : lsrc->f32;
+        int w = (int)width;
+        avx512_interleave32(dp, spl, sph, w);
+      }
     }
     else {
       if (even)
@@ -364,13 +392,13 @@
 
     /////////////////////////////////////////////////////////////////////////
-    void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                              const line_buf* other, const line_buf* aug,
-                              ui32 repeat, bool synthesis)
+    void avx512_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                                const line_buf* other, const line_buf* aug,
+                                ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const ui32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       __m512i va = _mm512_set1_epi32(a);
       __m512i vb = _mm512_set1_epi32(b);
@@ -493,14 +521,185 @@
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                             const line_buf* hdst, const line_buf* src,
-                             ui32 width, bool even)
+    void avx512_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                                const line_buf* other, const line_buf* aug,
+                                ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m512i vb = _mm512_set1_epi64(b);
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1; although the two are mathematically
+      // equivalent here, we identify the simpler forms from Part 1 and
+      // employ them.
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i v = _mm512_add_epi64(vb, t);
+            __m512i w = _mm512_srai_epi64(v, e);
+            d = _mm512_sub_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i v = _mm512_add_epi64(vb, t);
+            __m512i w = _mm512_srai_epi64(v, e);
+            d = _mm512_add_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i w = _mm512_srai_epi64(t, e);
+            d = _mm512_add_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi64(s1, s2);
+            __m512i w = _mm512_srai_epi64(t, e);
+            d = _mm512_sub_epi64(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d
= _mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dst, d); + } + else + for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)src1); + __m512i s2 = _mm512_load_si512((__m512i*)src2); + __m512i d = _mm512_load_si512((__m512i*)dst); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dst, d); + } + } + else { + // general case + // 64bit multiplication is not supported in AVX512F + AVX512CD; + // in particular, _mm256_mullo_epi64. + if (synthesis) + for (ui32 i = repeat; i > 0; --i) + *dst++ -= (b + a * (*src1++ + *src2++)) >> e; + else + for (ui32 i = repeat; i > 0; --i) + *dst++ += (b + a * (*src1++ + *src2++)) >> e; + } + + // This can only be used if you have AVX512DQ + // { // general case + // __m512i va = _mm512_set1_epi64(a); + // int i = (int)repeat; + // if (synthesis) + // for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)src1); + // __m512i s2 = _mm512_load_si512((__m512i*)src2); + // __m512i d = _mm512_load_si512((__m512i*)dst); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_sub_epi64(d, w); + // _mm512_store_si512((__m512i*)dst, d); + // } + // else + // for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)src1); + // __m512i s2 = _mm512_load_si512((__m512i*)src2); + // __m512i d = _mm512_load_si512((__m512i*)dst); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dst, d); + // } + // } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + avx512_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + avx512_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { - // combine both lsrc and hsrc into dst - avx512_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + // split src into ldst and hdst + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? 
hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + avx512_deinterleave32(dpl, dph, sp, w); + } si32* hp = hdst->i32, * lp = ldst->i32; ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass @@ -512,7 +711,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j - 1); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m512i va = _mm512_set1_epi32(a); __m512i vb = _mm512_set1_epi32(b); @@ -653,10 +852,211 @@ namespace ojph { } } - ////////////////////////////////////////////////////////////////////////// - void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana64(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (width > 1) + { + // split src into ldst and hdst + { + double* dpl = (double*)(even ? ldst->p : hdst->p); + double* dph = (double*)(even ? hdst->p : ldst->p); + double* sp = (double*)(src->p); + int w = (int)width; + avx512_deinterleave64(dpl, dph, sp, w); + } + + si64* hp = hdst->i64, * lp = ldst->i64; + ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m512i vb = _mm512_set1_epi64(b); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + 
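+          // With a == -1 the lifting update *dp += (b + a*(s0+s1)) >> e
+          // reduces to (b - (s0 + s1)) >> e, so the loops below use a
+          // subtraction in place of the unavailable 64-bit multiply.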
int i = (int)h_width; + if (even) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + // general case + // 64bit multiplication is not supported in AVX512F + AVX512CD; + // in particular, _mm256_mullo_epi64. + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // This can only be used if you have AVX512DQ + // { + // // general case + // __m512i va = _mm512_set1_epi64(a); + // int i = (int)h_width; + // if (even) + // for (; i > 0; i -= 8, sp += 8, dp += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)sp); + // __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + // __m512i d = _mm512_load_si512((__m512i*)dp); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dp, d); + // } + // else + // for (; i > 0; i -= 8, sp += 8, dp += 8) + // { + // __m512i s1 = _mm512_load_si512((__m512i*)sp); + // __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + // __m512i d = _mm512_load_si512((__m512i*)dp); + // __m512i t = _mm512_add_epi64(s1, s2); + // __m512i u = _mm512_mullo_epi64(va, t); + // __m512i v = _mm512_add_epi64(vb, u); + // __m512i w = _mm512_srai_epi64(v, e); + // d = _mm512_add_epi64(d, w); + // _mm512_store_si512((__m512i*)dp, d); + // } + // } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + avx512_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + avx512_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -670,7 +1070,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const 
si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m512i va = _mm512_set1_epi32(a); __m512i vb = _mm512_set1_epi32(b); @@ -804,7 +1204,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - avx512_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; + int w = (int)width; + avx512_interleave32(dp, spl, sph, w); + } } else { if (even) @@ -814,5 +1220,206 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m512i vb = _mm512_set1_epi64(b); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else + { + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_add_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + else + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i w = _mm512_srai_epi64(t, e); + d = _mm512_add_epi64(d, w); + _mm512_store_si512((__m512i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 8, sp += 8, dp += 8) + { + __m512i s1 = _mm512_load_si512((__m512i*)sp); + __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1)); + __m512i d = _mm512_load_si512((__m512i*)dp); + __m512i t = _mm512_add_epi64(s1, s2); + __m512i v = _mm512_sub_epi64(vb, t); + __m512i w = _mm512_srai_epi64(v, e); + d = _mm512_sub_epi64(d, w); + 
_mm512_store_si512((__m512i*)dp, d);
+            }
+          else
+            for (; i > 0; i -= 8, sp += 8, dp += 8)
+            {
+              __m512i s1 = _mm512_load_si512((__m512i*)sp);
+              __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+              __m512i d = _mm512_load_si512((__m512i*)dp);
+              __m512i t = _mm512_add_epi64(s1, s2);
+              __m512i v = _mm512_sub_epi64(vb, t);
+              __m512i w = _mm512_srai_epi64(v, e);
+              d = _mm512_sub_epi64(d, w);
+              _mm512_store_si512((__m512i*)dp, d);
+            }
+        }
+        else
+        {
+          // general case
+          // 64bit multiplication is not supported in AVX512F + AVX512CD;
+          // in particular, _mm512_mullo_epi64 requires AVX512DQ.
+          if (ev)
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+          else
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b + a * (sp[0] + sp[1])) >> e;
+        }
+
+        // This can only be used if you have AVX512DQ
+        // {
+        //   // general case
+        //   __m512i va = _mm512_set1_epi64(a);
+        //   int i = (int)aug_width;
+        //   if (ev)
+        //     for (; i > 0; i -= 8, sp += 8, dp += 8)
+        //     {
+        //       __m512i s1 = _mm512_load_si512((__m512i*)sp);
+        //       __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+        //       __m512i d = _mm512_load_si512((__m512i*)dp);
+        //       __m512i t = _mm512_add_epi64(s1, s2);
+        //       __m512i u = _mm512_mullo_epi64(va, t);
+        //       __m512i v = _mm512_add_epi64(vb, u);
+        //       __m512i w = _mm512_srai_epi64(v, e);
+        //       d = _mm512_sub_epi64(d, w);
+        //       _mm512_store_si512((__m512i*)dp, d);
+        //     }
+        //   else
+        //     for (; i > 0; i -= 8, sp += 8, dp += 8)
+        //     {
+        //       __m512i s1 = _mm512_load_si512((__m512i*)sp);
+        //       __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+        //       __m512i d = _mm512_load_si512((__m512i*)dp);
+        //       __m512i t = _mm512_add_epi64(s1, s2);
+        //       __m512i u = _mm512_mullo_epi64(va, t);
+        //       __m512i v = _mm512_add_epi64(vb, u);
+        //       __m512i w = _mm512_srai_epi64(v, e);
+        //       d = _mm512_sub_epi64(d, w);
+        //       _mm512_store_si512((__m512i*)dp, d);
+        //     }
+        // }
+
+        // swap buffers
+        si64* t = aug; aug = oth; oth = t;
+        ev = !ev;
+        ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+      }
+
+      // combine both lsrc and hsrc into dst
+      {
+        double* dp = (double*)(dst->p);
+        double* spl = (double*)(even ? lsrc->p : hsrc->p);
+        double* sph = (double*)(even ?
hsrc->p : lsrc->p); + int w = (int)width; + avx512_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + avx512_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + avx512_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h index ec2a2e1..acf9ee6 100644 --- a/src/core/transform/ojph_transform_local.h +++ b/src/core/transform/ojph_transform_local.h @@ -42,7 +42,10 @@ #include "ojph_defs.h" namespace ojph { - struct line_buf; + + // defined elsewhere + class line_buf; + namespace local { struct param_atk; union lifting_step; @@ -104,60 +107,6 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - // Supporting macros - ////////////////////////////////////////////////////////////////////////// - - ////////////////////////////////////////////////////////////////////////// - #define SSE_DEINTERLEAVE(dpl, dph, sp, width, even) \ - { \ - if (even) \ - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) \ - { \ - __m128 a = _mm_load_ps(sp); \ - __m128 b = _mm_load_ps(sp + 4); \ - __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ - __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ - _mm_store_ps(dpl, c); \ - _mm_store_ps(dph, d); \ - } \ - else \ - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) \ - { \ - __m128 a = _mm_load_ps(sp); \ - __m128 b = _mm_load_ps(sp + 4); \ - __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \ - __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \ - _mm_store_ps(dpl, d); \ - _mm_store_ps(dph, c); \ - } \ - } - - ////////////////////////////////////////////////////////////////////////// - #define SSE_INTERLEAVE(dp, spl, sph, width, even) \ - { \ - if (even) \ - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) \ - { \ - __m128 a = _mm_load_ps(spl); \ - __m128 b = _mm_load_ps(sph); \ - __m128 c = _mm_unpacklo_ps(a, b); \ - __m128 d = _mm_unpackhi_ps(a, b); \ - _mm_store_ps(dp, c); \ - _mm_store_ps(dp + 4, d); \ - } \ - else \ - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) \ - { \ - __m128 a = _mm_load_ps(spl); \ - __m128 b = _mm_load_ps(sph); \ - __m128 c = _mm_unpacklo_ps(b, a); \ - __m128 d = _mm_unpackhi_ps(b, a); \ - _mm_store_ps(dp, c); \ - _mm_store_ps(dp + 4, d); \ - } \ - } - ////////////////////////////////////////////////////////////////////////// // Irreversible functions ////////////////////////////////////////////////////////////////////////// @@ -216,76 +165,6 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - // Supporting macros - 
//////////////////////////////////////////////////////////////////////////
-
-    //////////////////////////////////////////////////////////////////////////
-    #define AVX_DEINTERLEAVE(dpl, dph, sp, width, even) \
-    { \
-      if (even) \
-      { \
-        for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(sp); \
-          __m256 b = _mm256_load_ps(sp + 8); \
-          __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); \
-          __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); \
-          __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
-          __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
-          _mm256_store_ps(dpl, e); \
-          _mm256_store_ps(dph, f); \
-        } \
-      } \
-      else \
-      { \
-        for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(sp); \
-          __m256 b = _mm256_load_ps(sp + 8); \
-          __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0)); \
-          __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1)); \
-          __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
-          __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
-          _mm256_store_ps(dpl, f); \
-          _mm256_store_ps(dph, e); \
-        } \
-      } \
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    #define AVX_INTERLEAVE(dp, spl, sph, width, even) \
-    { \
-      if (even) \
-      { \
-        for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(spl); \
-          __m256 b = _mm256_load_ps(sph); \
-          __m256 c = _mm256_unpacklo_ps(a, b); \
-          __m256 d = _mm256_unpackhi_ps(a, b); \
-          __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); \
-          __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); \
-          _mm256_store_ps(dp, e); \
-          _mm256_store_ps(dp + 8, f); \
-        } \
-      } \
-      else \
-      { \
-        for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8) \
-        { \
-          __m256 a = _mm256_load_ps(spl); \
-          __m256 b = _mm256_load_ps(sph); \
-          __m256 c = _mm256_unpacklo_ps(b, a); \
-          __m256 d = _mm256_unpackhi_ps(b, a); \
-          __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0)); \
-          __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1)); \
-          _mm256_store_ps(dp, e); \
-          _mm256_store_ps(dp + 8, f); \
-        } \
-      } \
-    }
-
     //////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/transform/ojph_transform_sse.cpp b/src/core/transform/ojph_transform_sse.cpp
index 897a193..dcb5e53 100644
--- a/src/core/transform/ojph_transform_sse.cpp
+++ b/src/core/transform/ojph_transform_sse.cpp
@@ -50,6 +50,36 @@ namespace ojph {
 
   namespace local {
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m128 a = _mm_load_ps(sp);
+        __m128 b = _mm_load_ps(sp + 4);
+        __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+        __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm_store_ps(dpl, c);
+        _mm_store_ps(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m128 a = _mm_load_ps(spl);
+        __m128 b = _mm_load_ps(sph);
+        __m128 c = _mm_unpacklo_ps(a, b);
+        __m128 d = _mm_unpackhi_ps(a, b);
+        _mm_store_ps(dp, c);
+        _mm_store_ps(dp + 4, d);
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     static inline void sse_multiply_const(float* p, float f, int width)
     {
@@ -100,11 +130,11 @@ namespace ojph {
       {
         // split src into ldst and hdst
         {
-          float* dpl = ldst->f32;
-          float* dph = hdst->f32;
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
           float* sp = src->f32;
           int w = (int)width;
-          SSE_DEINTERLEAVE(dpl, dph, sp, w, even);
+          sse_deinterleave32(dpl, dph, sp, w);
         }
 
         // the actual horizontal transform
@@ -235,10 +265,10 @@ namespace ojph {
         // combine both lsrc and hsrc into dst
         {
           float* dp = dst->f32;
-          float* spl = lsrc->f32;
-          float* sph = hsrc->f32;
+          float* spl = even ? lsrc->f32 : hsrc->f32;
+          float* sph = even ? hsrc->f32 : lsrc->f32;
           int w = (int)width;
-          SSE_INTERLEAVE(dp, spl, sph, w, even);
+          sse_interleave32(dp, spl, sph, w);
         }
       }
       else {
diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp
index 8328842..a69b1fb 100644
--- a/src/core/transform/ojph_transform_sse2.cpp
+++ b/src/core/transform/ojph_transform_sse2.cpp
@@ -35,6 +35,7 @@
 // Date: 28 August 2019
 //***************************************************************************/
 
+#include <cassert>
 #include <cstdio>
 
 #include "ojph_defs.h"
@@ -52,13 +53,86 @@ namespace ojph {
 
   namespace local {
 
     /////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                            const line_buf* other, const line_buf* aug,
-                            ui32 repeat, bool synthesis)
+    // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
+    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
+    {
+      // note that m must be obtained using
+      // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
+      __m128i x = _mm_srli_epi64(a, amt);
+      x = _mm_xor_si128(x, m);
+      __m128i result = _mm_sub_epi64(x, m);
+      return result;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_deinterleave32(float* dpl, float* dph, float* sp, int width)
+    {
+      for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)
+      {
+        __m128 a = _mm_load_ps(sp);
+        __m128 b = _mm_load_ps(sp + 4);
+        __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+        __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+        _mm_store_ps(dpl, c);
+        _mm_store_ps(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_interleave32(float* dp, float* spl, float* sph, int width)
+    {
+      for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)
+      {
+        __m128 a = _mm_load_ps(spl);
+        __m128 b = _mm_load_ps(sph);
+        __m128 c = _mm_unpacklo_ps(a, b);
+        __m128 d = _mm_unpackhi_ps(a, b);
+        _mm_store_ps(dp, c);
+        _mm_store_ps(dp + 4, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_deinterleave64(double* dpl, double* dph, double* sp, int width)
+    {
+      for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2)
+      {
+        __m128d a = _mm_load_pd(sp);
+        __m128d b = _mm_load_pd(sp + 2);
+        __m128d c = _mm_shuffle_pd(a, b, 0);
+        __m128d d = _mm_shuffle_pd(a, b, 3);
+        _mm_store_pd(dpl, c);
+        _mm_store_pd(dph, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    void sse2_interleave64(double* dp, double* spl, double* sph, int width)
+    {
+      for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2)
+      {
+        __m128d a = _mm_load_pd(spl);
+        __m128d b = _mm_load_pd(sph);
+        __m128d c = _mm_unpacklo_pd(a, b);
+        __m128d d = _mm_unpackhi_pd(a, b);
+        _mm_store_pd(dp, c);
+        _mm_store_pd(dp + 2, d);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const si32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       __m128i vb = _mm_set1_epi32(b);
 
       si32* dst = aug->i32;
@@ -162,19 +236,153 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src,
-                           ui32 width, bool even)
+    static
+    void sse2_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
+    {
+      const si64 a = s->rev.Aatk;
+      const si64 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      __m128i vb = _mm_set1_epi64x(b);
+      __m128i ve = _mm_set1_epi64x(1LL << (63 - e));
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1, although the two are mathematically
+      // equivalent; here, we identify the simpler Part 1 forms and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_add_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_add_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i w = sse2_mm_srai_epi64(t, e, ve);
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i w = sse2_mm_srai_epi64(t, e, ve);
+            d = _mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_sub_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_sub_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi64(s1, s2);
+            __m128i v = _mm_sub_epi64(vb, t);
+            __m128i w = sse2_mm_srai_epi64(v, e, ve);
+            d = _mm_add_epi64(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else { // general case
+        // 64bit multiplication is not supported in sse2
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                            const line_buf* other, const line_buf* aug,
+                            ui32 repeat, bool synthesis)
+    {
+      if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) ||
+          ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) ||
+          ((other != NULL) && (other->flags & line_buf::LFT_32BIT)))
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_32BIT) &&
+               (aug == NULL || aug->flags & line_buf::LFT_32BIT));
+        sse2_rev_vert_step32(s, sig, other, aug, repeat, synthesis);
+      }
+      else
+      {
+        assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) &&
+               (other == NULL || other->flags & line_buf::LFT_64BIT) &&
+               (aug == NULL || aug->flags & line_buf::LFT_64BIT));
+        sse2_rev_vert_step64(s, sig, other, aug, repeat, synthesis);
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_horz_ana32(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src,
+                             ui32 width, bool even)
     {
       if (width > 1)
       {
-        // combine both lsrc and hsrc into dst
+        // split src into ldst and hdst
         {
-          float* dpl = ldst->f32;
-          float* dph = hdst->f32;
+          float* dpl = even ? ldst->f32 : hdst->f32;
+          float* dph = even ? hdst->f32 : ldst->f32;
           float* sp = src->f32;
           int w = (int)width;
-          SSE_DEINTERLEAVE(dpl, dph, sp, w, even);
+          sse2_deinterleave32(dpl, dph, sp, w);
         }
 
         si32* hp = hdst->i32, * lp = ldst->i32;
@@ -187,7 +395,7 @@ namespace ojph {
         const lifting_step* s = atk->get_step(j - 1);
         const si32 a = s->rev.Aatk;
         const si32 b = s->rev.Batk;
-        const si32 e = s->rev.Eatk;
+        const ui8 e = s->rev.Eatk;
 
         __m128i vb = _mm_set1_epi32(b);
 
         // extension
@@ -284,9 +492,7 @@ namespace ojph {
         }
         else {
           // general case
-          // 32bit multiplication is not supported in sse2; we need sse4.1,
-          // where we can use _mm_mullo_epi32, which multiplies
-          // 32bit x 32bit, keeping the LSBs
+          // keeping the low 32 bits of a product requires sse4.1's _mm_mullo_epi32
          if (even)
           for (ui32 i = h_width; i > 0; --i, sp++, dp++)
             *dp += (b + a * (sp[0] + sp[1])) >> e;
@@ -308,11 +514,179 @@ namespace ojph {
           hdst->i32[0] = src->i32[0] << 1;
       }
     }
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void sse2_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src,
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        {
+          double* dpl = (double*)(even ? ldst->p : hdst->p);
+          double* dph = (double*)(even ? hdst->p : ldst->p);
+          double* sp = (double*)src->p;
+          int w = (int)width;
+          sse2_deinterleave64(dpl, dph, sp, w);
+        }
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ?
1 : 0)) >> 1; // low pass + ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = num_steps; j > 0; --j) + { + // first lifting step + const lifting_step* s = atk->get_step(j - 1); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m128i vb = _mm_set1_epi64x(b); + __m128i ve = _mm_set1_epi64x(1LL << (63 - e)); + + // extension + lp[-1] = lp[0]; + lp[l_width] = lp[l_width - 1]; + // lifting step + const si64* sp = lp; + si64* dp = hp; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)h_width; + if (even) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in sse2 + if (even) + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[0] + sp[1])) >> e; + else + for (ui32 i = h_width; i > 0; --i, sp++, dp++) + *dp += (b + a * (sp[-1] + sp[0])) >> e; + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + 
} + + ///////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + sse2_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + sse2_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } ////////////////////////////////////////////////////////////////////////// - void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst, - const line_buf* lsrc, const line_buf* hsrc, - ui32 width, bool even) + void sse2_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -326,7 +700,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const si32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; __m128i vb = _mm_set1_epi32(b); // extension @@ -443,10 +817,10 @@ namespace ojph { // combine both lsrc and hsrc into dst { float* dp = dst->f32; - float* spl = lsrc->f32; - float* sph = hsrc->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; int w = (int)width; - SSE_INTERLEAVE(dp, spl, sph, w, even); + sse2_interleave32(dp, spl, sph, w); } } else { @@ -457,5 +831,172 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 
0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + __m128i vb = _mm_set1_epi64x(b); + __m128i ve = _mm_set1_epi64x(1LL << (63 - e)); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_add_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i w = sse2_mm_srai_epi64(t, e, ve); + d = _mm_add_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + __m128i s1 = _mm_load_si128((__m128i*)sp); + __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1)); + __m128i d = _mm_load_si128((__m128i*)dp); + __m128i t = _mm_add_epi64(s1, s2); + __m128i v = _mm_sub_epi64(vb, t); + __m128i w = sse2_mm_srai_epi64(v, e, ve); + d = _mm_sub_epi64(d, w); + _mm_store_si128((__m128i*)dp, d); + } + } + else { + // general case + // 64bit multiplication is not supported in sse2 + if (ev) + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[-1] + sp[0])) >> e; + else + for (ui32 i = aug_width; i > 0; --i, sp++, dp++) + *dp -= (b + a * (sp[0] + sp[1])) >> e; + } + + // swap buffers + si64* t = aug; aug = oth; oth = t; + ev = !ev; + ui32 w = aug_width; aug_width = oth_width; oth_width = w; + } + + // combine both lsrc and hsrc into dst + { + double* dp = (double*)dst->p; + double* spl = (double*)(even ? lsrc->p : hsrc->p); + double* sph = (double*)(even ? 
hsrc->p : lsrc->p); + int w = (int)width; + sse2_interleave64(dp, spl, sph, w); + } + } + else { + if (even) + dst->i64[0] = lsrc->i64[0]; + else + dst->i64[0] = hsrc->i64[0] >> 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (dst->flags & line_buf::LFT_32BIT) + { + assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT)); + sse2_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even); + } + else + { + assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) && + (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) && + (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT)); + sse2_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even); + } + } + } // !local } // !ojph diff --git a/src/core/transform/ojph_transform_wasm.cpp b/src/core/transform/ojph_transform_wasm.cpp index bd652df..341cfc3 100644 --- a/src/core/transform/ojph_transform_wasm.cpp +++ b/src/core/transform/ojph_transform_wasm.cpp @@ -51,65 +51,69 @@ namespace ojph { namespace local { ////////////////////////////////////////////////////////////////////////// - void wasm_deinterleave(float* dpl, float* dph, float* sp, - int width, bool even) + static inline + void wasm_deinterleave32(float* dpl, float* dph, float* sp, int width) { - if (even) - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) - { - v128_t a = wasm_v128_load(sp); - v128_t b = wasm_v128_load(sp + 4); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); - v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); - // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); - // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); - wasm_v128_store(dpl, c); - wasm_v128_store(dph, d); - } - else - for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) - { - v128_t a = wasm_v128_load(sp); - v128_t b = wasm_v128_load(sp + 4); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); - v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); - // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); - // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); - wasm_v128_store(dpl, d); - wasm_v128_store(dph, c); - } + for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) + { + v128_t a = wasm_v128_load(sp); + v128_t b = wasm_v128_load(sp + 4); + v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); + v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); + // v128_t c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0)); + // v128_t d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1)); + wasm_v128_store(dpl, c); + wasm_v128_store(dph, d); + } } ////////////////////////////////////////////////////////////////////////// - void wasm_interleave(float* dp, float* spl, float* sph, - int width, bool even) + static inline + void wasm_interleave32(float* dp, float* spl, float* sph, int width) { - if (even) - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) - { - v128_t a = wasm_v128_load(spl); - v128_t b = wasm_v128_load(sph); - v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1); - v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3); - // v128_t c = _mm_unpacklo_ps(a, b); - // v128_t d = _mm_unpackhi_ps(a, b); - wasm_v128_store(dp, c); - wasm_v128_store(dp + 4, d); - } - else - for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) - { - v128_t a = wasm_v128_load(spl); - v128_t b = 
wasm_v128_load(sph); - v128_t c = wasm_i32x4_shuffle(b, a, 0, 4 + 0, 1, 4 + 1); - v128_t d = wasm_i32x4_shuffle(b, a, 2, 4 + 2, 3, 4 + 3); - // v128_t c = _mm_unpacklo_ps(b, a); - // v128_t d = _mm_unpackhi_ps(b, a); - wasm_v128_store(dp, c); - wasm_v128_store(dp + 4, d); - } + for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) + { + v128_t a = wasm_v128_load(spl); + v128_t b = wasm_v128_load(sph); + v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1); + v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3); + // v128_t c = _mm_unpacklo_ps(a, b); + // v128_t d = _mm_unpackhi_ps(a, b); + wasm_v128_store(dp, c); + wasm_v128_store(dp + 4, d); + } } + ////////////////////////////////////////////////////////////////////////// + static inline + void wasm_deinterleave64(double* dpl, double* dph, double* sp, int width) + { + for (; width > 0; width -= 4, sp += 4, dpl += 2, dph += 2) + { + v128_t a = wasm_v128_load(sp); + v128_t b = wasm_v128_load(sp + 2); + v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0); + v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1); + wasm_v128_store(dpl, c); + wasm_v128_store(dph, d); + } + } + + ////////////////////////////////////////////////////////////////////////// + static inline + void wasm_interleave64(double* dp, double* spl, double* sph, int width) + { + for (; width > 0; width -= 4, dp += 4, spl += 2, sph += 2) + { + v128_t a = wasm_v128_load(spl); + v128_t b = wasm_v128_load(sph); + v128_t c = wasm_i64x2_shuffle(a, b, 0, 2 + 0); + v128_t d = wasm_i64x2_shuffle(a, b, 1, 2 + 1); + wasm_v128_store(dp, c); + wasm_v128_store(dp + 2, d); + } + } + ////////////////////////////////////////////////////////////////////////// static inline void wasm_multiply_const(float* p, float f, int width) { @@ -159,7 +163,13 @@ namespace ojph { if (width > 1) { // split src into ldst and hdst - wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + wasm_deinterleave32(dpl, dph, sp, w); + } // the actual horizontal transform float* hp = hdst->f32, * lp = ldst->f32; @@ -287,7 +297,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? 
hsrc->f32 : lsrc->f32;
+        int w = (int)width;
+        wasm_interleave32(dp, spl, sph, w);
+      }
       }
       else {
         if (even)
@@ -298,13 +314,13 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig,
-                            const line_buf* other, const line_buf* aug,
-                            ui32 repeat, bool synthesis)
+    void wasm_rev_vert_step32(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const ui32 e = s->rev.Eatk;
+      const ui8 e = s->rev.Eatk;
 
       v128_t va = wasm_i32x4_splat(a);
       v128_t vb = wasm_i32x4_splat(b);
@@ -428,14 +444,174 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src,
-                           ui32 width, bool even)
+    void wasm_rev_vert_step64(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug,
+                              ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui8 e = s->rev.Eatk;
+      v128_t va = wasm_i64x2_splat(a);
+      v128_t vb = wasm_i64x2_splat(b);
+
+      si64* dst = aug->i64;
+      const si64* src1 = sig->i64, * src2 = other->i64;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1, although the two are mathematically
+      // equivalent; here, we identify the simpler Part 1 forms and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_add(vb, t);
+            v128_t w = wasm_i64x2_shr(v, e);
+            d = wasm_i64x2_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_add(vb, t);
+            v128_t w = wasm_i64x2_shr(v, e);
+            d = wasm_i64x2_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t w = wasm_i64x2_shr(t, e);
+            d = wasm_i64x2_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t w = wasm_i64x2_shr(t, e);
+            d = wasm_i64x2_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i64x2_add(s1, s2);
+            v128_t v = wasm_i64x2_sub(vb, t);
+
v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dst, d); + } + } + else + { // general case + int i = (int)repeat; + if (synthesis) + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dst, d); + } + else + for (; i > 0; i -= 2, dst += 2, src1 += 2, src2 += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)src1); + v128_t s2 = wasm_v128_load((v128_t*)src2); + v128_t d = wasm_v128_load((v128_t*)dst); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dst, d); + } + } + } + + ///////////////////////////////////////////////////////////////////////// + void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig, + const line_buf* other, const line_buf* aug, + ui32 repeat, bool synthesis) + { + if (((sig != NULL) && (sig->flags & line_buf::LFT_32BIT)) || + ((aug != NULL) && (aug->flags & line_buf::LFT_32BIT)) || + ((other != NULL) && (other->flags & line_buf::LFT_32BIT))) + { + assert((sig == NULL || sig->flags & line_buf::LFT_32BIT) && + (other == NULL || other->flags & line_buf::LFT_32BIT) && + (aug == NULL || aug->flags & line_buf::LFT_32BIT)); + wasm_rev_vert_step32(s, sig, other, aug, repeat, synthesis); + } + else + { + assert((sig == NULL || sig->flags & line_buf::LFT_64BIT) && + (other == NULL || other->flags & line_buf::LFT_64BIT) && + (aug == NULL || aug->flags & line_buf::LFT_64BIT)); + wasm_rev_vert_step64(s, sig, other, aug, repeat, synthesis); + } + } + + ///////////////////////////////////////////////////////////////////////// + static + void wasm_rev_horz_ana32(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, + ui32 width, bool even) { if (width > 1) { // combine both lsrc and hsrc into dst - wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even); + { + float* dpl = even ? ldst->f32 : hdst->f32; + float* dph = even ? hdst->f32 : ldst->f32; + float* sp = src->f32; + int w = (int)width; + wasm_deinterleave32(dpl, dph, sp, w); + } si32* hp = hdst->i32, * lp = ldst->i32; ui32 l_width = (width + (even ? 
1 : 0)) >> 1; // low pass
@@ -447,7 +623,7 @@ namespace ojph {
         const lifting_step* s = atk->get_step(j - 1);
         const si32 a = s->rev.Aatk;
         const si32 b = s->rev.Batk;
-        const ui32 e = s->rev.Eatk;
+        const ui8 e = s->rev.Eatk;
 
         v128_t va = wasm_i32x4_splat(a);
         v128_t vb = wasm_i32x4_splat(b);
@@ -587,11 +763,199 @@ namespace ojph {
           hdst->i32[0] = src->i32[0] << 1;
       }
     }
-
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
-                           const line_buf* lsrc, const line_buf* hsrc,
+
+    /////////////////////////////////////////////////////////////////////////
+    static
+    void wasm_rev_horz_ana64(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src,
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        {
+          double* dpl = (double*)(even ? ldst->p : hdst->p);
+          double* dph = (double*)(even ? hdst->p : ldst->p);
+          double* sp = (double*)src->p;
+          int w = (int)width;
+          wasm_deinterleave64(dpl, dph, sp, w);
+        }
+
+        si64* hp = hdst->i64, * lp = ldst->i64;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1; // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1; // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // first lifting step
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui8 e = s->rev.Eatk;
+          v128_t va = wasm_i64x2_splat(a);
+          v128_t vb = wasm_i64x2_splat(b);
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si64* sp = lp;
+          si64* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_add(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t v = wasm_i64x2_add(vb, t);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          { // 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t w = wasm_i64x2_shr(t, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t w = wasm_i64x2_shr(t, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 2, sp += 2, dp += 2)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t =
wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)h_width; + if (even) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + + // swap buffers + si64* t = lp; lp = hp; hp = t; + even = !even; + ui32 w = l_width; l_width = h_width; h_width = w; + } + } + else { + if (even) + ldst->i64[0] = src->i64[0]; + else + hdst->i64[0] = src->i64[0] << 1; + } + } + + ///////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst, + const line_buf* hdst, const line_buf* src, ui32 width, bool even) + { + if (src->flags & line_buf::LFT_32BIT) + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_32BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_32BIT)); + wasm_rev_horz_ana32(atk, ldst, hdst, src, width, even); + } + else + { + assert((ldst == NULL || ldst->flags & line_buf::LFT_64BIT) && + (hdst == NULL || hdst->flags & line_buf::LFT_64BIT) && + (src == NULL || src->flags & line_buf::LFT_64BIT)); + wasm_rev_horz_ana64(atk, ldst, hdst, src, width, even); + } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_syn32(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) { if (width > 1) { @@ -605,7 +969,7 @@ namespace ojph { const lifting_step* s = atk->get_step(j); const si32 a = s->rev.Aatk; const si32 b = s->rev.Batk; - const ui32 e = s->rev.Eatk; + const ui8 e = s->rev.Eatk; v128_t va = wasm_i32x4_splat(a); v128_t vb = wasm_i32x4_splat(b); @@ -739,7 +1103,13 @@ namespace ojph { } // combine both lsrc and hsrc into dst - wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even); + { + float* dp = dst->f32; + float* spl = even ? lsrc->f32 : hsrc->f32; + float* sph = even ? hsrc->f32 : lsrc->f32; + int w = (int)width; + wasm_interleave32(dp, spl, sph, w); + } } else { if (even) @@ -749,5 +1119,192 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void wasm_rev_horz_syn64(const param_atk* atk, const line_buf* dst, + const line_buf* lsrc, const line_buf* hsrc, + ui32 width, bool even) + { + if (width > 1) + { + bool ev = even; + si64* oth = hsrc->i64, * aug = lsrc->i64; + ui32 aug_width = (width + (even ? 
1 : 0)) >> 1; // low pass + ui32 oth_width = (width + (even ? 0 : 1)) >> 1; // high pass + ui32 num_steps = atk->get_num_steps(); + for (ui32 j = 0; j < num_steps; ++j) + { + const lifting_step* s = atk->get_step(j); + const si32 a = s->rev.Aatk; + const si32 b = s->rev.Batk; + const ui8 e = s->rev.Eatk; + v128_t va = wasm_i64x2_splat(a); + v128_t vb = wasm_i64x2_splat(b); + + // extension + oth[-1] = oth[0]; + oth[oth_width] = oth[oth_width - 1]; + // lifting step + const si64* sp = oth; + si64* dp = aug; + if (a == 1) + { // 5/3 update and any case with a == 1 + int i = (int)aug_width; + if (ev) + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_add(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + } + else if (a == -1 && b == 1 && e == 1) + { // 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t w = wasm_i64x2_shr(t, e); + d = wasm_i64x2_add(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else if (a == -1) + { // any case with a == -1, which is not 5/3 predict + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t v = wasm_i64x2_sub(vb, t); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + } + else + { // general case + int i = (int)aug_width; + if (ev) + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp - 1)); + v128_t d = wasm_v128_load((v128_t*)dp); + v128_t t = wasm_i64x2_add(s1, s2); + v128_t u = wasm_i64x2_mul(va, t); + v128_t v = wasm_i64x2_add(vb, u); + v128_t w = wasm_i64x2_shr(v, e); + d = wasm_i64x2_sub(d, w); + wasm_v128_store((v128_t*)dp, d); + } + else + for (; i > 0; i -= 2, sp += 2, dp += 2) + { + v128_t s1 = wasm_v128_load((v128_t*)sp); + v128_t s2 = wasm_v128_load((v128_t*)(sp + 1)); + v128_t d = 
wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i64x2_add(s1, s2);
+                v128_t u = wasm_i64x2_mul(va, t);
+                v128_t v = wasm_i64x2_add(vb, u);
+                v128_t w = wasm_i64x2_shr(v, e);
+                d = wasm_i64x2_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si64* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        {
+          double* dp = (double*)dst->p;
+          double* spl = (double*)(even ? lsrc->p : hsrc->p);
+          double* sph = (double*)(even ? hsrc->p : lsrc->p);
+          int w = (int)width;
+          wasm_interleave64(dp, spl, sph, w);
+        }
+      }
+      else {
+        if (even)
+          dst->i64[0] = lsrc->i64[0];
+        else
+          dst->i64[0] = hsrc->i64[0] >> 1;
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                           const line_buf* lsrc, const line_buf* hsrc,
+                           ui32 width, bool even)
+    {
+      if (dst->flags & line_buf::LFT_32BIT)
+      {
+        assert((lsrc == NULL || lsrc->flags & line_buf::LFT_32BIT) &&
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_32BIT));
+        wasm_rev_horz_syn32(atk, dst, lsrc, hsrc, width, even);
+      }
+      else
+      {
+        assert((dst == NULL || dst->flags & line_buf::LFT_64BIT) &&
+               (lsrc == NULL || lsrc->flags & line_buf::LFT_64BIT) &&
+               (hsrc == NULL || hsrc->flags & line_buf::LFT_64BIT));
+        wasm_rev_horz_syn64(atk, dst, lsrc, hsrc, width, even);
+      }
+    }
+
   } // !local
 } // !ojph
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 000409f..8cc1d72 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -3,7 +3,7 @@ include(FetchContent)
 
 FetchContent_Declare(
   googletest
-  URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.tar.gz
+  URL https://github.com/google/googletest/archive/refs/tags/v1.14.0.tar.gz
   EXCLUDE_FROM_ALL
 )
 # For Windows: Prevent overriding the parent project's compiler/linker settings
diff --git a/tests/test_executables.cpp b/tests/test_executables.cpp
index 9f77f75..22f148e 100644
--- a/tests/test_executables.cpp
+++ b/tests/test_executables.cpp
@@ -107,8 +107,27 @@ int execute(const std::string& cmd, std::string& result)
 #define REF_FILE_DIR "./jp2k_test_codestreams/openjph/references/"
 #define MSE_PAE_PATH "./mse_pae"
 #define COMPARE_FILES_PATH "./compare_files"
+
+// This is a comment to me, to help with emscripten testing.
+// This is written after the completion of the tests.
+// 1. Compile for the target platform (Linux), selecting from the following
+//    code the version that suits you; in particular, it should be the one
+//    that uses node. Ideally, create two versions of test_executables, one
+//    for WASM SIMD and one for WASM without SIMD -- use the linux cp command
+//    to create test_executables_simd and test_executables_no_simd.
+// 2. Compile again, without deleting what was compiled; this time, compile
+//    using emscripten, targeting WASM. The compilation is very finicky, so do
+//    'make clean && make' after every change in the code.
+// 3. cd to tests, and run test_executables_simd or test_executables_no_simd.
+ #define EXPAND_EXECUTABLE "./ojph_expand" #define COMPRESS_EXECUTABLE "./ojph_compress" +//#define EXPAND_EXECUTABLE "20.18.0_64bit/bin/node ./ojph_expand.js" +//#define COMPRESS_EXECUTABLE "20.18.0_64bit/bin/node ./ojph_compress.js" +//#define EXPAND_EXECUTABLE "node-v18.7.0-linux-x64/bin/node ./ojph_expand_simd.js" +//#define COMPRESS_EXECUTABLE "node-v18.7.0-linux-x64/bin/node ./ojph_compress_simd.js" +//#define EXPAND_EXECUTABLE "./../../../sde/sde64 -skx -- ./ojph_expand" +//#define COMPRESS_EXECUTABLE "./../../../sde/sde64 -skx -- ./ojph_compress" #endif #define TOL_DOUBLE 0.01 #define TOL_INTEGER 1
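Reviewer note, not part of the patch: every reversible SIMD branch above specializes one scalar recurrence. Analysis adds (b + a * (s1 + s2)) >> e to each destination sample, and synthesis subtracts the identical quantity, which is what makes the integer transform exactly invertible; the a == 1 branch and the a == -1, b == 1, e == 1 branch are just the multiply-free 5/3 update and predict special cases. A minimal scalar sketch with illustrative names (rev_lift_scalar is not a function in this patch):

    // Scalar reference for one reversible lifting step; src1/src2 are the
    // two neighbors feeding each destination sample.
    #include <cstdint>

    static void rev_lift_scalar(int64_t* dst, const int64_t* src1,
                                const int64_t* src2, uint32_t repeat,
                                int64_t a, int64_t b, uint8_t e, bool synthesis)
    {
      for (uint32_t i = repeat; i > 0; --i)
      {
        int64_t v = (b + a * (*src1++ + *src2++)) >> e; // generic lifting term
        if (synthesis)
          *dst++ -= v;  // synthesis undoes exactly what analysis added
        else
          *dst++ += v;
      }
    }

    int main()
    {
      int64_t d[2] = { 10, 20 }, s1[2] = { 3, 5 }, s2[2] = { 7, 9 };
      rev_lift_scalar(d, s1, s2, 2, -1, 1, 1, false); // 5/3 predict: a=-1,b=1,e=1
      rev_lift_scalar(d, s1, s2, 2, -1, 1, 1, true);  // inverse step
      return (d[0] == 10 && d[1] == 20) ? 0 : 1;      // round trip restores input
    }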
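Reviewer note, not part of the patch: sse2_mm_srai_epi64 works around SSE2's missing 64-bit arithmetic right shift. A logical shift leaves the original sign bit at position 63 - amt; XORing with m = 1 << (63 - amt) and then subtracting m smears that bit across all higher positions. A scalar sketch of the identity (srai64_emulated is illustrative; it assumes >> on a signed 64-bit value is arithmetic, which holds on the compilers this code targets):

    #include <cassert>
    #include <cstdint>

    static int64_t srai64_emulated(int64_t a, int amt)
    {
      uint64_t m = 1ULL << (63 - amt); // position of the shifted sign bit
      uint64_t x = (uint64_t)a >> amt; // logical shift: zero fill from the top
      return (int64_t)((x ^ m) - m);   // extend the sign bit downwards
    }

    int main()
    {
      for (int amt = 1; amt < 63; ++amt)
      {
        assert(srai64_emulated(-1000000007LL, amt) == (-1000000007LL >> amt));
        assert(srai64_emulated( 1000000007LL, amt) == ( 1000000007LL >> amt));
      }
      return 0;
    }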
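Reviewer note, not part of the patch: the interleave/deinterleave helpers introduced here (sse_deinterleave32, sse2_deinterleave64, wasm_interleave64, and the AVX-512 counterparts) drop the old even parameter; the callers now swap the low/high buffer pointers instead, so each helper always sends even-indexed samples to its first output. A scalar model of the split, with illustrative names:

    // Scalar model of what *_deinterleave32/64 do: even-indexed samples go
    // to the first buffer, odd-indexed samples to the second.
    static void deinterleave_scalar(double* dpl, double* dph,
                                    const double* sp, int width)
    {
      for (int i = 0; i < width; i += 2)
      {
        *dpl++ = sp[i];                        // even phase
        if (i + 1 < width) *dph++ = sp[i + 1]; // odd phase
      }
    }

    // Caller-side phase selection, mirroring the patch: when 'even' is false,
    // the odd-phase samples are the low-pass ones, so the pointers swap.
    //   double* dpl = even ? ldst : hdst;
    //   double* dph = even ? hdst : ldst;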
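Reviewer note, not part of the patch: the width == 1 branches pair up across analysis and synthesis: a lone odd-phase (high-pass) sample is stored doubled (<< 1) by rev_horz_ana and halved (>> 1) by rev_horz_syn, so a single-sample line still round-trips losslessly. A tiny check of that round trip:

    #include <cassert>
    #include <cstdint>

    int main()
    {
      int64_t src = 37;
      int64_t h   = src << 1; // analysis, width == 1, even == false
      int64_t dst = h >> 1;   // synthesis, width == 1, even == false
      assert(dst == src);     // the doubling and halving cancel exactly
      return 0;
    }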