diff --git a/src/gpujpeg_common.c b/src/gpujpeg_common.c
index cd06e8e..4732a44 100644
--- a/src/gpujpeg_common.c
+++ b/src/gpujpeg_common.c
@@ -916,11 +916,15 @@ gpujpeg_coder_init_image(struct gpujpeg_coder * coder, const struct gpujpeg_para
 
     // (Re)allocate huffman coder data in host memory
     if (coder->data_compressed != NULL) {
-        cudaFreeHost(coder->data_compressed);
+        cudaHostUnregister(coder->data_compressed);
+        free(coder->data_compressed);
         coder->data_compressed = NULL;
     }
-    cudaMallocHost((void**)&coder->data_compressed, max_compressed_data_size * sizeof(uint8_t));
-    gpujpeg_cuda_check_error("Coder data compressed host allocation", return 0);
+    coder->data_compressed = malloc(max_compressed_data_size);
+    coder->data_compressed_pinned_sz = max_compressed_data_size / (GPUJPEG_MAX_BLOCK_COMPRESSED_SIZE
+        / GPUJPEG_BLOCK_SQUARED_SIZE); // divide by 8 to get the WxHxCH bytes
+    cudaHostRegister(coder->data_compressed, coder->data_compressed_pinned_sz, cudaHostRegisterDefault);
+    gpujpeg_cuda_check_error("Coder data compressed host registration", return 0);
 
     // (Re)allocate huffman coder data in device memory
     if (coder->d_data_compressed != NULL) {
@@ -1088,8 +1092,10 @@ gpujpeg_coder_deinit(struct gpujpeg_coder* coder)
         cudaFreeHost(coder->data_quantized);
     if ( coder->d_data_quantized != NULL )
         cudaFree(coder->d_data_quantized);
-    if ( coder->data_compressed != NULL )
-        cudaFreeHost(coder->data_compressed);
+    if ( coder->data_compressed != NULL ) {
+        cudaHostUnregister(coder->data_compressed);
+        free(coder->data_compressed);
+    }
     if ( coder->d_data_compressed != NULL )
         cudaFree(coder->d_data_compressed);
     if ( coder->segment != NULL )
diff --git a/src/gpujpeg_common_internal.h b/src/gpujpeg_common_internal.h
index 29fbc1b..e557e5e 100644
--- a/src/gpujpeg_common_internal.h
+++ b/src/gpujpeg_common_internal.h
@@ -339,6 +339,7 @@ struct gpujpeg_coder
     /// Huffman coder data in host memory (output/input for encoder/decoder)
     uint8_t* data_compressed;
+    size_t data_compressed_pinned_sz; ///< amount of pinned memory from data_compressed
     /// Huffman coder data in device memory (output/input for encoder/decoder)
     uint8_t* d_data_compressed;
     /// Huffman coder temporary data (in device memory only)
diff --git a/src/gpujpeg_encoder.c b/src/gpujpeg_encoder.c
index c3048ab..4916bf1 100644
--- a/src/gpujpeg_encoder.c
+++ b/src/gpujpeg_encoder.c
@@ -537,8 +537,14 @@ gpujpeg_encoder_encode(struct gpujpeg_encoder* encoder, const struct gpujpeg_par
     GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_from, coder->param.perf_stats, encoder->stream, return -1);
 
     // Copy compressed data from device memory to cpu memory
-    if ( cudaSuccess != cudaMemcpyAsync(coder->data_compressed, coder->d_data_compressed, output_size, cudaMemcpyDeviceToHost, encoder->stream) ) {
-        return -1;
+    cudaMemcpyAsync(coder->data_compressed, coder->d_data_compressed,
+        MIN(output_size, coder->data_compressed_pinned_sz), cudaMemcpyDeviceToHost, encoder->stream);
+    gpujpeg_cuda_check_error("Encoder copy compressed data to pinned memory", return -1);
+    if ( output_size > coder->data_compressed_pinned_sz ) {
+        cudaMemcpyAsync(coder->data_compressed + coder->data_compressed_pinned_sz,
+            coder->d_data_compressed + coder->data_compressed_pinned_sz,
+            output_size - coder->data_compressed_pinned_sz, cudaMemcpyDeviceToHost, encoder->stream);
+        gpujpeg_cuda_check_error("Encoder copy compressed data to pageable memory", return -1);
     }
     // Copy segments from device memory
     if ( cudaSuccess != cudaMemcpyAsync(coder->segment, coder->d_segment, coder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyDeviceToHost, encoder->stream) ) {
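
Note on the approach: the hunks above replace a single `cudaMallocHost()` allocation with plain `malloc()` plus `cudaHostRegister()` over only a prefix of the buffer (1/8 of the worst-case compressed size), so the common case still gets a truly asynchronous device-to-host copy while the rarely reached overflow bytes fall back to a copy into pageable memory. The sketch below is a minimal, self-contained illustration of that split-copy pattern, not GPUJPEG code; the names `total_sz`, `pinned_sz`, `output_size`, and the `CHECK` macro are illustrative, and the sizes are made up.

```c
#include <cuda_runtime.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define CHECK(call) do { \
        cudaError_t err_ = (call); \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "%s: %s\n", #call, cudaGetErrorString(err_)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

int main(void)
{
    const size_t total_sz  = 8 * 1024 * 1024; // whole host buffer (pageable)
    const size_t pinned_sz = total_sz / 8;    // prefix that gets page-locked

    uint8_t *host = (uint8_t *) malloc(total_sz);
    if (host == NULL) {
        return EXIT_FAILURE;
    }
    // Page-lock only the first pinned_sz bytes of the malloc'd buffer.
    CHECK(cudaHostRegister(host, pinned_sz, cudaHostRegisterDefault));

    uint8_t *dev = NULL;
    CHECK(cudaMalloc((void **) &dev, total_sz));
    CHECK(cudaMemset(dev, 0xAB, total_sz));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    const size_t output_size = total_sz / 2; // stand-in for the actual output

    // Head of the copy lands in the pinned prefix -> truly asynchronous.
    const size_t head = output_size < pinned_sz ? output_size : pinned_sz;
    CHECK(cudaMemcpyAsync(host, dev, head, cudaMemcpyDeviceToHost, stream));

    // Tail (only if the output overflows the pinned prefix) goes to pageable
    // memory; for device-to-pageable transfers the runtime stages the copy
    // internally and returns only once it has completed.
    if (output_size > pinned_sz) {
        CHECK(cudaMemcpyAsync(host + pinned_sz, dev + pinned_sz,
                              output_size - pinned_sz,
                              cudaMemcpyDeviceToHost, stream));
    }
    CHECK(cudaStreamSynchronize(stream));
    printf("first byte: 0x%02X\n", host[0]); // expect 0xAB

    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(dev));
    CHECK(cudaHostUnregister(host));
    free(host);
    return 0;
}
```

Pinning just a prefix keeps the `cudaHostRegister()` cost and the amount of locked physical memory proportional to the typical compressed size rather than the worst case, at the price of a second, host-synchronizing copy whenever the output spills past the pinned region.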