diff --git a/src/gpujpeg_common.c b/src/gpujpeg_common.c
index cd06e8e..4732a44 100644
--- a/src/gpujpeg_common.c
+++ b/src/gpujpeg_common.c
@@ -916,11 +916,15 @@ gpujpeg_coder_init_image(struct gpujpeg_coder * coder, const struct gpujpeg_para
 
     // (Re)allocate huffman coder data in host memory
     if (coder->data_compressed != NULL) {
-        cudaFreeHost(coder->data_compressed);
+        cudaHostUnregister(coder->data_compressed);
+        free(coder->data_compressed);
         coder->data_compressed = NULL;
     }
-    cudaMallocHost((void**)&coder->data_compressed, max_compressed_data_size * sizeof(uint8_t));
-    gpujpeg_cuda_check_error("Coder data compressed host allocation", return 0);
+    coder->data_compressed = malloc(max_compressed_data_size);
+    coder->data_compressed_pinned_sz = max_compressed_data_size / (GPUJPEG_MAX_BLOCK_COMPRESSED_SIZE
+        / GPUJPEG_BLOCK_SQUARED_SIZE); // divide by 8 to get the WxHxCH bytes
+    cudaHostRegister(coder->data_compressed, coder->data_compressed_pinned_sz, cudaHostRegisterDefault);
+    gpujpeg_cuda_check_error("Coder data compressed host registration", return 0);
 
     // (Re)allocate huffman coder data in device memory
     if (coder->d_data_compressed != NULL) {
@@ -1088,8 +1092,10 @@ gpujpeg_coder_deinit(struct gpujpeg_coder* coder)
         cudaFreeHost(coder->data_quantized);
     if ( coder->d_data_quantized != NULL )
         cudaFree(coder->d_data_quantized);
-    if ( coder->data_compressed != NULL )
-        cudaFreeHost(coder->data_compressed);
+    if ( coder->data_compressed != NULL ) {
+        cudaHostUnregister(coder->data_compressed);
+        free(coder->data_compressed);
+    }
     if ( coder->d_data_compressed != NULL )
         cudaFree(coder->d_data_compressed);
     if ( coder->segment != NULL )
diff --git a/src/gpujpeg_common_internal.h b/src/gpujpeg_common_internal.h
index 29fbc1b..e557e5e 100644
--- a/src/gpujpeg_common_internal.h
+++ b/src/gpujpeg_common_internal.h
@@ -339,6 +339,7 @@ struct gpujpeg_coder
     /// Huffman coder data in host memory (output/input for encoder/decoder)
     uint8_t* data_compressed;
+    size_t data_compressed_pinned_sz; ///< amount of pinned memory from data_compressed
     /// Huffman coder data in device memory (output/input for encoder/decoder)
     uint8_t* d_data_compressed;
     /// Huffman coder temporary data (in device memory only)
diff --git a/src/gpujpeg_encoder.c b/src/gpujpeg_encoder.c
index c3048ab..4916bf1 100644
--- a/src/gpujpeg_encoder.c
+++ b/src/gpujpeg_encoder.c
@@ -537,8 +537,14 @@ gpujpeg_encoder_encode(struct gpujpeg_encoder* encoder, const struct gpujpeg_par
     GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_from, coder->param.perf_stats, encoder->stream, return -1);
 
     // Copy compressed data from device memory to cpu memory
-    if ( cudaSuccess != cudaMemcpyAsync(coder->data_compressed, coder->d_data_compressed, output_size, cudaMemcpyDeviceToHost, encoder->stream) ) {
-        return -1;
+    cudaMemcpyAsync(coder->data_compressed, coder->d_data_compressed,
+        MIN(output_size, coder->data_compressed_pinned_sz), cudaMemcpyDeviceToHost, encoder->stream);
+    gpujpeg_cuda_check_error("Encoder copy compressed data to pinned memory", return -1);
+    if ( output_size > coder->data_compressed_pinned_sz ) {
+        cudaMemcpyAsync(coder->data_compressed + coder->data_compressed_pinned_sz,
+            coder->d_data_compressed + coder->data_compressed_pinned_sz,
+            output_size - coder->data_compressed_pinned_sz, cudaMemcpyDeviceToHost, encoder->stream);
+        gpujpeg_cuda_check_error("Encoder copy compressed data to pageable memory", return -1);
     }
     // Copy segments from device memory
     if ( cudaSuccess != cudaMemcpyAsync(coder->segment, coder->d_segment, coder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyDeviceToHost, encoder->stream) ) {
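
Note on the approach: the hunks above replace a single `cudaMallocHost()` allocation with plain `malloc()` plus `cudaHostRegister()` over only a prefix of the buffer (1/8 of the worst-case compressed size), so the common case still gets a truly asynchronous device-to-host copy while the rarely reached overflow bytes fall back to a copy into pageable memory. The sketch below is a minimal, self-contained illustration of that split-copy pattern, not GPUJPEG code; the names `total_sz`, `pinned_sz`, `output_size`, and the `CHECK` macro are illustrative, and the sizes are made up.

```c
#include <cuda_runtime.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define CHECK(call) do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "%s: %s\n", #call, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

int main(void)
{
    const size_t total_sz  = 8 * 1024 * 1024; // whole host buffer (pageable)
    const size_t pinned_sz = total_sz / 8;    // prefix that gets page-locked

    uint8_t *host = (uint8_t *) malloc(total_sz);
    if (host == NULL) {
        return EXIT_FAILURE;
    }
    // Page-lock only the first pinned_sz bytes of the malloc'd buffer.
    CHECK(cudaHostRegister(host, pinned_sz, cudaHostRegisterDefault));

    uint8_t *dev = NULL;
    CHECK(cudaMalloc((void **) &dev, total_sz));
    CHECK(cudaMemset(dev, 0xAB, total_sz));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    const size_t output_size = total_sz / 2; // stand-in for the actual output

    // Head of the copy lands in the pinned prefix -> truly asynchronous.
    const size_t head = output_size < pinned_sz ? output_size : pinned_sz;
    CHECK(cudaMemcpyAsync(host, dev, head, cudaMemcpyDeviceToHost, stream));

    // Tail (only if the output overflows the pinned prefix) goes to pageable
    // memory; for device-to-pageable transfers the runtime stages the copy
    // internally and returns only once it has completed.
    if (output_size > pinned_sz) {
        CHECK(cudaMemcpyAsync(host + pinned_sz, dev + pinned_sz,
                              output_size - pinned_sz,
                              cudaMemcpyDeviceToHost, stream));
    }
    CHECK(cudaStreamSynchronize(stream));
    printf("first byte: 0x%02X\n", host[0]); // expect 0xAB

    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(dev));
    CHECK(cudaHostUnregister(host));
    free(host);
    return 0;
}
```

Pinning just a prefix keeps the `cudaHostRegister()` cost and the amount of locked physical memory proportional to the typical compressed size rather than the worst case, at the price of a second, host-synchronizing copy whenever the output spills past the pinned region.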