diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po index 6ef9804027d8f..acc7e8b4943f7 100644 --- a/addons/resource.language.en_gb/resources/strings.po +++ b/addons/resource.language.en_gb/resources/strings.po @@ -7284,6 +7284,16 @@ msgctxt "#13467" msgid "Unlimited / 1080 (>30Hz)" msgstr "" +#: system/settings/settings.xml +msgctxt "#13500" +msgid "Only allow acceleration for HEVC" +msgstr "" + +#: system/settings/settings.xml +msgctxt "#13501" +msgid "This option disables acceleration for other codecs as they don't currently support seeking with V4L2" +msgstr "" + #empty strings from id 13468 to 13504 #: system/settings/settings.xml diff --git a/cmake/modules/FindFFMPEG.cmake b/cmake/modules/FindFFMPEG.cmake index 81d26979e3d1b..9ce807c3fdd29 100644 --- a/cmake/modules/FindFFMPEG.cmake +++ b/cmake/modules/FindFFMPEG.cmake @@ -277,7 +277,10 @@ if(NOT FFMPEG_FOUND) && ${CMAKE_COMMAND} -E copy ${CMAKE_SOURCE_DIR}/tools/depends/target/ffmpeg/FindGnuTls.cmake - ) + && + patch -p1 < ${CMAKE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-rpi-Add-hevc-acceleration.patch && + echo "########################################## patched ffmpeg ##############################" + ) if (ENABLE_INTERNAL_DAV1D) add_dependencies(ffmpeg dav1d) diff --git a/system/settings/appliance.xml b/system/settings/appliance.xml new file mode 100644 index 0000000000000..640f1758ae4ae --- /dev/null +++ b/system/settings/appliance.xml @@ -0,0 +1,29 @@ + + +
+ + + + true + + + true + + + false + + + 0 + + + + + + + true + + + +
+
+ diff --git a/system/settings/linux.xml b/system/settings/linux.xml index 6d1fb9cd49ed1..691a84e65eeab 100644 --- a/system/settings/linux.xml +++ b/system/settings/linux.xml @@ -168,6 +168,18 @@ true + + HAS_GLES + false + + + true + + + 3 + true + + HAS_GLES false diff --git a/tools/depends/target/ffmpeg/0001-rpi-Add-hevc-acceleration.patch b/tools/depends/target/ffmpeg/0001-rpi-Add-hevc-acceleration.patch new file mode 100644 index 0000000000000..af887b3e384e5 --- /dev/null +++ b/tools/depends/target/ffmpeg/0001-rpi-Add-hevc-acceleration.patch @@ -0,0 +1,58100 @@ +diff --git a/.gitignore b/.gitignore +index 2450ee8fc5..4bcc3ae643 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -1,6 +1,7 @@ + *.a + *.o + *.o.* ++*.bin + *.d + *.def + *.dll +@@ -26,6 +27,7 @@ + .\#* + /.config + /.version ++/build/ + /ffmpeg + /ffplay + /ffprobe +diff --git a/BUILD.txt b/BUILD.txt +new file mode 100644 +index 0000000000..49ed1f119d +--- /dev/null ++++ b/BUILD.txt +@@ -0,0 +1,55 @@ ++# Setup & Build instructions for testing Argon30 mesa support (on Pi4) ++ ++# These assume that the drm_mmal test for Sand8 has been built on this Pi ++# as build relies on many of the same files ++ ++# 1st get everything required to build ffmpeg ++# If sources aren't already enabled on your Pi then enable them ++sudo su ++sed "s/#deb-src/deb-src/" /etc/apt/sources.list > /tmp/sources.list ++sed "s/#deb-src/deb-src/" /etc/apt/sources.list.d/raspi.list > /tmp/raspi.list ++mv /tmp/sources.list /etc/apt/ ++mv /tmp/raspi.list /etc/apt/sources.list.d/ ++apt update ++ ++# Get dependencies ++sudo apt build-dep ffmpeg ++ ++# Enable H265 V4L2 request decoder ++sudo su ++echo dtoverlay=rpivid-v4l2 >> /boot/config.txt ++reboot ++# Check it has turned up ++ls -la /dev/video* ++# This should include video19 ++# crw-rw----+ 1 root video 81, 7 Aug 4 17:25 /dev/video19 ++ ++# Config ++pi-util/conf_native.sh ++ ++# Build (this is a bit dull) ++# If you want to poke the source the libavdevice/egl_vout.c contains the ++# 
output code - ++make -j6 ++ ++# Grab test streams ++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-h264.mkv ++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc.mkv ++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc-10bit.mkv ++ ++# Test i420 output (works currently) ++./ffmpeg -no_cvt_hw -vcodec h264_v4l2m2m -i jellyfish-3-mbps-hd-h264.mkv -f vout_egl - ++ ++# Test Sand8 output - doesn't currently work but should once you have ++# Sand8 working in drm_mmal. I can't guarantee that this will work as ++# I can't test this path with a known working format, but the debug looks ++# good. If this doesn't work & drm_mmal does with sand8 then come back to me ++./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc.mkv -f vout_egl - ++ ++# Test Sand30 - doesn't currently work ++# (Beware that when FFmpeg errors out it often leaves your terminal window ++# in a state where you need to reset it) ++./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc-10bit.mkv -f vout_egl - ++ ++ ++ +diff --git a/configure b/configure +index 8569a60bf8..277d36cf9a 100755 +--- a/configure ++++ b/configure +@@ -274,6 +274,7 @@ External library support: + --enable-libtls enable LibreSSL (via libtls), needed for https support + if openssl, gnutls or mbedtls is not used [no] + --enable-libtwolame enable MP2 encoding via libtwolame [no] ++ --enable-libudev enable libudev [no] + --enable-libv4l2 enable libv4l2/v4l-utils [no] + --enable-libvidstab enable video stabilization using vid.stab [no] + --enable-libvmaf enable vmaf filter via libvmaf [no] +@@ -336,12 +337,17 @@ External library support: + --enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no] + --enable-libnpp enable Nvidia Performance Primitives-based code [no] + --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no] ++ --enable-rpi enable other rpi specific stuff [no] ++ --enable-sand enable sand video formats 
[rpi] ++ --enable-vout-drm enable the vout_drm module - for internal testing only [no] ++ --enable-vout-egl enable the vout_egl module - for internal testing only [no] + --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect] + --disable-nvenc disable Nvidia video encoding code [autodetect] + --enable-omx enable OpenMAX IL code [no] + --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no] + --enable-rkmpp enable Rockchip Media Process Platform code [no] + --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect] ++ --enable-v4l2-request enable V4L2 request API code [no] + --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect] + --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect] + --disable-videotoolbox disable VideoToolbox code [autodetect] +@@ -1771,6 +1777,7 @@ EXTERNAL_LIBRARY_LIST=" + libdav1d + libdc1394 + libdrm ++ epoxy + libflite + libfontconfig + libfreetype +@@ -1807,6 +1814,7 @@ EXTERNAL_LIBRARY_LIST=" + libtesseract + libtheora + libtwolame ++ libudev + libv4l2 + libvorbis + libvpx +@@ -1861,7 +1869,10 @@ HWACCEL_LIBRARY_LIST=" + mmal + omx + opencl ++ v4l2_request + vulkan ++ rpi4_8 ++ rpi4_10 + " + + DOCUMENT_LIST=" +@@ -1877,12 +1888,16 @@ FEATURE_LIST=" + gray + hardcoded_tables + omx_rpi ++ rpi + runtime_cpudetect + safe_bitstream_reader ++ sand + shared + small + static + swscale_alpha ++ vout_drm ++ vout_egl + " + + # this list should be kept in linking order +@@ -1923,6 +1938,7 @@ SUBSYSTEM_LIST=" + pixelutils + network + rdft ++ rpi + " + + # COMPONENT_LIST needs to come last to ensure correct dependency checking +@@ -2405,9 +2421,11 @@ CONFIG_EXTRA=" + rangecoder + riffdec + riffenc ++ rpi + rtpdec + rtpenc_chain + rv34dsp ++ sand + scene_sad + sinewin + snappy +@@ -2737,6 +2755,8 @@ hap_decoder_select="snappy texturedsp" + hap_encoder_deps="libsnappy" + hap_encoder_select="texturedspenc" + hevc_decoder_select="bswapdsp cabac golomb 
hevcparse videodsp" ++hevc_rpi_decoder_deps="rpi" ++hevc_rpi_decoder_select="hevc_decoder sand" + huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp" + huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp" + hymt_decoder_select="huffyuv_decoder" +@@ -2903,6 +2923,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext" + dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32" + ffnvcodec_deps_any="libdl LoadLibrary" + nvdec_deps="ffnvcodec" ++v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev" + vaapi_x11_deps="xlib" + videotoolbox_hwaccel_deps="videotoolbox pthreads" + videotoolbox_hwaccel_extralibs="-framework QuartzCore" +@@ -2920,6 +2941,8 @@ h264_dxva2_hwaccel_deps="dxva2" + h264_dxva2_hwaccel_select="h264_decoder" + h264_nvdec_hwaccel_deps="nvdec" + h264_nvdec_hwaccel_select="h264_decoder" ++h264_v4l2request_hwaccel_deps="v4l2_request" ++h264_v4l2request_hwaccel_select="h264_decoder" + h264_vaapi_hwaccel_deps="vaapi" + h264_vaapi_hwaccel_select="h264_decoder" + h264_vdpau_hwaccel_deps="vdpau" +@@ -2934,6 +2957,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC" + hevc_dxva2_hwaccel_select="hevc_decoder" + hevc_nvdec_hwaccel_deps="nvdec" + hevc_nvdec_hwaccel_select="hevc_decoder" ++hevc_v4l2request_hwaccel_deps="v4l2_request" ++hevc_v4l2request_hwaccel_select="hevc_decoder" ++hevc_rpi4_10_hwaccel_deps="rpi" ++hevc_rpi4_10_hwaccel_select="hevc_decoder" ++hevc_rpi4_8_hwaccel_deps="rpi" ++hevc_rpi4_8_hwaccel_select="hevc_decoder" + hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC" + hevc_vaapi_hwaccel_select="hevc_decoder" + hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC" +@@ -2962,6 +2991,8 @@ mpeg2_dxva2_hwaccel_deps="dxva2" + mpeg2_dxva2_hwaccel_select="mpeg2video_decoder" + mpeg2_nvdec_hwaccel_deps="nvdec" + mpeg2_nvdec_hwaccel_select="mpeg2video_decoder" ++mpeg2_v4l2request_hwaccel_deps="v4l2_request mpeg2_v4l2_request" ++mpeg2_v4l2request_hwaccel_select="mpeg2video_decoder" 
+ mpeg2_vaapi_hwaccel_deps="vaapi" + mpeg2_vaapi_hwaccel_select="mpeg2video_decoder" + mpeg2_vdpau_hwaccel_deps="vdpau" +@@ -2992,6 +3023,8 @@ vc1_vdpau_hwaccel_deps="vdpau" + vc1_vdpau_hwaccel_select="vc1_decoder" + vp8_nvdec_hwaccel_deps="nvdec" + vp8_nvdec_hwaccel_select="vp8_decoder" ++vp8_v4l2request_hwaccel_deps="v4l2_request" ++vp8_v4l2request_hwaccel_select="vp8_decoder" + vp8_vaapi_hwaccel_deps="vaapi" + vp8_vaapi_hwaccel_select="vp8_decoder" + vp9_d3d11va_hwaccel_deps="d3d11va DXVA_PicParams_VP9" +@@ -3002,6 +3035,8 @@ vp9_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_VP9" + vp9_dxva2_hwaccel_select="vp9_decoder" + vp9_nvdec_hwaccel_deps="nvdec" + vp9_nvdec_hwaccel_select="vp9_decoder" ++vp9_v4l2request_hwaccel_deps="v4l2_request" ++vp9_v4l2request_hwaccel_select="vp9_decoder" + vp9_vaapi_hwaccel_deps="vaapi VADecPictureParameterBufferVP9_bit_depth" + vp9_vaapi_hwaccel_select="vp9_decoder" + vp9_vdpau_hwaccel_deps="vdpau VdpPictureInfoVP9" +@@ -3401,8 +3436,14 @@ sndio_indev_deps="sndio" + sndio_outdev_deps="sndio" + v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h" + v4l2_indev_suggest="libv4l2" ++v4l2_outdev_deps="libdrm" + v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h" + v4l2_outdev_suggest="libv4l2" ++vout_drm_outdev_deps="libdrm vout_drm" ++vout_egl_outdev_deps="vout_egl" ++vout_egl_outdev_select="epoxy" ++vout_rpi_outdev_deps="rpi" ++vout_rpi_outdev_select="sand" + vfwcap_indev_deps="vfw32 vfwcap_defines" + xcbgrab_indev_deps="libxcb" + xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes" +@@ -3618,6 +3659,8 @@ tonemap_vaapi_filter_deps="vaapi VAProcFilterParameterBufferHDRToneMapping" + tonemap_opencl_filter_deps="opencl const_nan" + transpose_opencl_filter_deps="opencl" + transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags" ++unsand_filter_deps="rpi" ++unsand_filter_select="sand" + unsharp_opencl_filter_deps="opencl" + uspp_filter_deps="gpl avcodec" + vaguedenoiser_filter_deps="gpl" +@@ -6299,6 +6342,7 @@ enabled 
libdav1d && require_pkg_config libdav1d "dav1d >= 0.4.0" "dav1d + enabled libdavs2 && require_pkg_config libdavs2 "davs2 >= 1.6.0" davs2.h davs2_decoder_open + enabled libdc1394 && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new + enabled libdrm && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion ++enabled epoxy && require_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version + enabled libfdk_aac && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen || + { require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac && + warn "using libfdk without pkg-config"; } } +@@ -6376,6 +6420,7 @@ enabled libtls && require_pkg_config libtls libtls tls.h tls_configur + enabled libtwolame && require libtwolame twolame.h twolame_init -ltwolame && + { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame || + die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; } ++enabled libudev && require_pkg_config libudev libudev libudev.h udev_new + enabled libv4l2 && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl + enabled libvidstab && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit + enabled libvmaf && require_pkg_config libvmaf "libvmaf >= 1.3.9" libvmaf.h compute_vmaf +@@ -6430,11 +6475,12 @@ enabled mbedtls && { check_pkg_config mbedtls mbedtls mbedtls/x509_crt + check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto || + die "ERROR: mbedTLS not found"; } + enabled mediacodec && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; } +-enabled mmal && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || ++( enabled rpi || ++ enabled mmal ) && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host || + { ! 
enabled cross_compile && + add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline && + add_ldflags -L/opt/vc/lib/ && +- check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } || ++ check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } || + die "ERROR: mmal not found" && + check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; } + enabled openal && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do +@@ -6475,6 +6521,10 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r + { enabled libdrm || + die "ERROR: rkmpp requires --enable-libdrm"; } + } ++enabled v4l2_request && { enabled libdrm || ++ die "ERROR: v4l2-request requires --enable-libdrm"; } && ++ { enabled libudev || ++ die "ERROR: v4l2-request requires --enable-libudev"; } + enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init + + +@@ -6556,6 +6606,13 @@ if enabled v4l2_m2m; then + check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;" + fi + ++check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns ++check_cc h264_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_H264_SLICE;" ++check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;" ++check_cc mpeg2_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_MPEG2_SLICE;" ++check_cc vp8_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_VP8_FRAME;" ++check_cc vp9_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_VP9_FRAME;" ++ + check_headers sys/videoio.h + test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete + +diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c +index 
2e9448ea2b..faa8501dd0 100644 +--- a/fftools/ffmpeg.c ++++ b/fftools/ffmpeg.c +@@ -2118,8 +2118,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame) + ifilter->channel_layout != frame->channel_layout; + break; + case AVMEDIA_TYPE_VIDEO: +- need_reinit |= ifilter->width != frame->width || +- ifilter->height != frame->height; ++ need_reinit |= ifilter->width != av_frame_cropped_width(frame) || ++ ifilter->height != av_frame_cropped_height(frame); + break; + } + +@@ -2367,6 +2367,8 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ + if (ist->dec_ctx->codec_id == AV_CODEC_ID_H264) { + ist->st->codecpar->video_delay = ist->dec_ctx->has_b_frames; + } else ++ { ++#if 0 + av_log(ist->dec_ctx, AV_LOG_WARNING, + "video_delay is larger in decoder than demuxer %d > %d.\n" + "If you want to help, upload a sample " +@@ -2374,6 +2376,8 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ + "and contact the ffmpeg-devel mailing list. 
(ffmpeg-devel@ffmpeg.org)\n", + ist->dec_ctx->has_b_frames, + ist->st->codecpar->video_delay); ++#endif ++ } + } + + if (ret != AVERROR_EOF) +@@ -2400,8 +2404,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_ + decoded_frame->top_field_first = ist->top_field_first; + + ist->frames_decoded++; +- +- if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { ++ if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) { + err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame); + if (err < 0) + goto fail; +@@ -2913,6 +2916,15 @@ static int init_input_stream(int ist_index, char *error, int error_len) + return ret; + } + ++#if CONFIG_HEVC_RPI_DECODER ++ ret = -1; ++ if (strcmp(codec->name, "hevc_rpi") == 0 && ++ (ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) { ++ ist->dec = codec = avcodec_find_decoder_by_name("hevc"); ++ av_log(NULL, AV_LOG_INFO, "Failed to open hevc_rpi - trying hevc\n"); ++ } ++ if (ret < 0) ++#endif + if ((ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) { + if (ret == AVERROR_EXPERIMENTAL) + abort_codec_experimental(codec, 0); +diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h +index 828cb2a4ff..55d4db293e 100644 +--- a/fftools/ffmpeg.h ++++ b/fftools/ffmpeg.h +@@ -61,6 +61,7 @@ enum HWAccelID { + HWACCEL_GENERIC, + HWACCEL_VIDEOTOOLBOX, + HWACCEL_QSV, ++ HWACCEL_RPI, + }; + + typedef struct HWAccel { +@@ -590,6 +591,7 @@ extern int video_sync_method; + extern float frame_drop_threshold; + extern int do_benchmark; + extern int do_benchmark_all; ++extern int no_cvt_hw; + extern int do_deinterlace; + extern int do_hex_dump; + extern int do_pkt_dump; +@@ -653,6 +655,7 @@ int ffmpeg_parse_options(int argc, char **argv); + + int videotoolbox_init(AVCodecContext *s); + int qsv_init(AVCodecContext *s); ++int rpi_init(AVCodecContext *s); + + HWDevice *hw_device_get_by_name(const char *name); + int 
hw_device_init_from_string(const char *arg, HWDevice **dev); +diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c +index 422e1268e9..deb89c076d 100644 +--- a/fftools/ffmpeg_filter.c ++++ b/fftools/ffmpeg_filter.c +@@ -1186,8 +1186,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame) + + ifilter->format = frame->format; + +- ifilter->width = frame->width; +- ifilter->height = frame->height; ++ ifilter->width = av_frame_cropped_width(frame); ++ ifilter->height = av_frame_cropped_height(frame); + ifilter->sample_aspect_ratio = frame->sample_aspect_ratio; + + ifilter->sample_rate = frame->sample_rate; +diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c +index 2eb4e1c973..98207be2e2 100644 +--- a/fftools/ffmpeg_opt.c ++++ b/fftools/ffmpeg_opt.c +@@ -130,12 +130,22 @@ static const char *opt_name_enc_time_bases[] = {"enc_time_base", NULL + }\ + } + ++#if CONFIG_RPI ++int rpi_init(AVCodecContext *avctx) { ++ return 0; ++} ++#endif ++ + const HWAccel hwaccels[] = { + #if CONFIG_VIDEOTOOLBOX + { "videotoolbox", videotoolbox_init, HWACCEL_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX }, + #endif + #if CONFIG_LIBMFX + { "qsv", qsv_init, HWACCEL_QSV, AV_PIX_FMT_QSV }, ++#endif ++#if CONFIG_RPI ++ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_8 }, ++ { "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_10 }, + #endif + { 0 }, + }; +@@ -155,6 +165,7 @@ float frame_drop_threshold = 0; + int do_deinterlace = 0; + int do_benchmark = 0; + int do_benchmark_all = 0; ++int no_cvt_hw = 0; + int do_hex_dump = 0; + int do_pkt_dump = 0; + int copy_ts = 0; +@@ -755,7 +766,9 @@ static AVCodec *choose_decoder(OptionsContext *o, AVFormatContext *s, AVStream * + st->codecpar->codec_id = codec->id; + return codec; + } else ++ { + return avcodec_find_decoder(st->codecpar->codec_id); ++ } + } + + /* Add all the streams from the given input file to the global +@@ -3460,6 +3473,8 @@ const OptionDef options[] = { + "add timings for benchmarking" }, + { 
"benchmark_all", OPT_BOOL | OPT_EXPERT, { &do_benchmark_all }, + "add timings for each task" }, ++ { "no_cvt_hw", OPT_BOOL | OPT_EXPERT, { &no_cvt_hw }, ++ "do not auto-convert hw frames to sw" }, + { "progress", HAS_ARG | OPT_EXPERT, { .func_arg = opt_progress }, + "write program-readable progress information", "url" }, + { "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction }, +diff --git a/libavcodec/Makefile b/libavcodec/Makefile +index 5a6ea59715..c9d056101d 100644 +--- a/libavcodec/Makefile ++++ b/libavcodec/Makefile +@@ -19,6 +19,7 @@ HEADERS = ac3_parser.h \ + mediacodec.h \ + packet.h \ + qsv.h \ ++ rpi_zc.h \ + vaapi.h \ + vdpau.h \ + version.h \ +@@ -138,6 +139,7 @@ OBJS-$(CONFIG_QSVDEC) += qsvdec.o + OBJS-$(CONFIG_QSVENC) += qsvenc.o + OBJS-$(CONFIG_RANGECODER) += rangecoder.o + OBJS-$(CONFIG_RDFT) += rdft.o ++OBJS-$(CONFIG_RPI) += rpi_qpu.o rpi_mailbox.o rpi_zc.o + OBJS-$(CONFIG_RV34DSP) += rv34dsp.o + OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o + OBJS-$(CONFIG_SINEWIN) += sinewin.o sinewin_fixed.o +@@ -153,6 +155,7 @@ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o + OBJS-$(CONFIG_VP56DSP) += vp56dsp.o + OBJS-$(CONFIG_VP8DSP) += vp8dsp.o + OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o ++OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_request.o v4l2_phase.o + OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o + OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o + +@@ -381,6 +384,15 @@ OBJS-$(CONFIG_HCOM_DECODER) += hcom.o + OBJS-$(CONFIG_HEVC_DECODER) += hevcdec.o hevc_mvs.o \ + hevc_cabac.o hevc_refs.o hevcpred.o \ + hevcdsp.o hevc_filter.o hevc_data.o ++OBJS-$(CONFIG_RPI) += rpi_mem.o \ ++ rpi_mailbox.o rpi_zc.o ++OBJS-$(CONFIG_HEVC_RPI_DECODER) += rpi_hevcdec.o rpi_hevc_mvs.o \ ++ rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o \ ++ rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o \ ++ rpi_hevc_shader.o rpi_hevc_shader_template.o \ ++ rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \ ++ rpi_hevc_sei.o rpi_hevc_data.o rpi_qpu.o rpi_mem.o 
++OBJS-$(CONFIG_HEVC_CUVID_DECODER) += cuvid.o + OBJS-$(CONFIG_HEVC_AMF_ENCODER) += amfenc_hevc.o + OBJS-$(CONFIG_HEVC_CUVID_DECODER) += cuviddec.o + OBJS-$(CONFIG_HEVC_MEDIACODEC_DECODER) += mediacodecdec.o +@@ -902,6 +914,7 @@ OBJS-$(CONFIG_H264_D3D11VA_HWACCEL) += dxva2_h264.o + OBJS-$(CONFIG_H264_DXVA2_HWACCEL) += dxva2_h264.o + OBJS-$(CONFIG_H264_NVDEC_HWACCEL) += nvdec_h264.o + OBJS-$(CONFIG_H264_QSV_HWACCEL) += qsvdec_h2645.o ++OBJS-$(CONFIG_H264_V4L2REQUEST_HWACCEL) += v4l2_request_h264.o + OBJS-$(CONFIG_H264_VAAPI_HWACCEL) += vaapi_h264.o + OBJS-$(CONFIG_H264_VDPAU_HWACCEL) += vdpau_h264.o + OBJS-$(CONFIG_H264_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o +@@ -909,8 +922,11 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o + OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o + OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec_h2645.o ++OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o + OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o + OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o ++OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL) += rpivid_hevc.o ++OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL) += rpivid_hevc.o + OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o + OBJS-$(CONFIG_MJPEG_VAAPI_HWACCEL) += vaapi_mjpeg.o + OBJS-$(CONFIG_MPEG1_NVDEC_HWACCEL) += nvdec_mpeg12.o +@@ -921,6 +937,7 @@ OBJS-$(CONFIG_MPEG2_D3D11VA_HWACCEL) += dxva2_mpeg2.o + OBJS-$(CONFIG_MPEG2_DXVA2_HWACCEL) += dxva2_mpeg2.o + OBJS-$(CONFIG_MPEG2_NVDEC_HWACCEL) += nvdec_mpeg12.o + OBJS-$(CONFIG_MPEG2_QSV_HWACCEL) += qsvdec_other.o ++OBJS-$(CONFIG_MPEG2_V4L2REQUEST_HWACCEL) += v4l2_request_mpeg2.o + OBJS-$(CONFIG_MPEG2_VAAPI_HWACCEL) += vaapi_mpeg2.o + OBJS-$(CONFIG_MPEG2_VDPAU_HWACCEL) += vdpau_mpeg12.o + OBJS-$(CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL) += videotoolbox.o +@@ -936,10 +953,12 @@ OBJS-$(CONFIG_VC1_QSV_HWACCEL) += qsvdec_other.o + OBJS-$(CONFIG_VC1_VAAPI_HWACCEL) += vaapi_vc1.o + OBJS-$(CONFIG_VC1_VDPAU_HWACCEL) += 
vdpau_vc1.o + OBJS-$(CONFIG_VP8_NVDEC_HWACCEL) += nvdec_vp8.o ++OBJS-$(CONFIG_VP8_V4L2REQUEST_HWACCEL) += v4l2_request_vp8.o + OBJS-$(CONFIG_VP8_VAAPI_HWACCEL) += vaapi_vp8.o + OBJS-$(CONFIG_VP9_D3D11VA_HWACCEL) += dxva2_vp9.o + OBJS-$(CONFIG_VP9_DXVA2_HWACCEL) += dxva2_vp9.o + OBJS-$(CONFIG_VP9_NVDEC_HWACCEL) += nvdec_vp9.o ++OBJS-$(CONFIG_VP9_V4L2REQUEST_HWACCEL) += v4l2_request_vp9.o + OBJS-$(CONFIG_VP9_VAAPI_HWACCEL) += vaapi_vp9.o + OBJS-$(CONFIG_VP9_VDPAU_HWACCEL) += vdpau_vp9.o + OBJS-$(CONFIG_VP8_QSV_HWACCEL) += qsvdec_other.o +@@ -1261,3 +1280,31 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h + $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h + $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h + endif ++ ++ifdef CONFIG_HEVC_RPI_DECODER ++QASM_PY := ../local/bin/qasm.py ++VASMVIDCORE := ../local/bin/vasmvidcore_std ++ ++ifneq ("$(wildcard $(QASM_PY))","") ++$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm ++ $(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@ ++ ++$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm ++ $(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@ ++endif ++ ++ifneq ("$(wildcard $(VASMVIDCORE))","") ++$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@ ++$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s ++ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@ ++ ++$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin ++ python pi-util/make_array.py $< ++$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin ++ python pi-util/make_array.py $< ++endif ++ ++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h ++$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h ++endif +diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c +index 80f128cade..ac4cf9a90e 100644 +--- a/libavcodec/allcodecs.c ++++ 
b/libavcodec/allcodecs.c +@@ -149,6 +149,7 @@ extern AVCodec ff_hap_decoder; + extern AVCodec ff_hevc_decoder; + extern AVCodec ff_hevc_qsv_decoder; + extern AVCodec ff_hevc_rkmpp_decoder; ++extern AVCodec ff_hevc_rpi_decoder; + extern AVCodec ff_hevc_v4l2m2m_decoder; + extern AVCodec ff_hnm4_video_decoder; + extern AVCodec ff_hq_hqa_decoder; +@@ -890,6 +891,41 @@ static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id) + } + } + ++static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt) ++{ ++ const enum AVPixelFormat *pf = p->pix_fmts; ++ ++ // Assume good if we lack info ++ if (pf == NULL) ++ return 1; ++ if (fmt == AV_PIX_FMT_NONE) ++ return 0; ++ ++ for (; *pf != AV_PIX_FMT_NONE; ++pf) { ++ if (*pf == fmt) ++ return 1; ++ } ++ return 0; ++} ++ ++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt) ++{ ++ const AVCodec *p, *experimental = NULL; ++ void *i = 0; ++ ++ id= remap_deprecated_codec_id(id); ++ while ((p = av_codec_iterate(&i))) { ++ if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) { ++ if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) { ++ experimental = p; ++ } else ++ return (AVCodec *)p; ++ } ++ p = p->next; ++ } ++ return (AVCodec *)experimental; ++} ++ + static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *)) + { + const AVCodec *p, *experimental = NULL; +diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile +index c6be814153..442d60efe4 100644 +--- a/libavcodec/arm/Makefile ++++ b/libavcodec/arm/Makefile +@@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \ + arm/sbrdsp_init_arm.o + OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o + OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o ++OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \ ++ arm/rpi_hevcpred_init_arm.o + OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o + OBJS-$(CONFIG_RV40_DECODER) += 
arm/rv40dsp_init_arm.o + OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o +@@ -140,10 +142,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \ + NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o + NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o + NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \ ++ arm/hevcdsp_idct_neon.o \ + arm/hevcdsp_deblock_neon.o \ + arm/hevcdsp_idct_neon.o \ + arm/hevcdsp_qpel_neon.o \ + arm/hevcdsp_sao_neon.o ++NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_neon.o \ ++ arm/rpi_hevc_misc_neon.o \ ++ arm/rpi_hevcdsp_deblock_neon.o \ ++ arm/rpi_hevcdsp_idct_neon.o \ ++ arm/rpi_hevcdsp_res8_neon.o \ ++ arm/rpi_hevcdsp_res16_neon.o \ ++ arm/rpi_hevcdsp_sao_neon.o \ ++ arm/rpi_hevcpred_init_neon.o \ ++ arm/rpi_hevcpred_intra_angular_neon.o \ ++ arm/rpi_hevcpred_intra_dc_neon.o \ ++ arm/rpi_hevcpred_intra_filter_neon.o \ ++ arm/rpi_hevcpred_intra_hv_neon.o \ ++ arm/rpi_hevcpred_intra_planar_neon.o + NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o + NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \ + arm/rv40dsp_neon.o +diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h +index fdbf86b45e..4755f20e2e 100644 +--- a/libavcodec/arm/cabac.h ++++ b/libavcodec/arm/cabac.h +@@ -26,83 +26,209 @@ + #include "libavutil/internal.h" + #include "libavcodec/cabac.h" + ++ + #define get_cabac_inline get_cabac_inline_arm + static av_always_inline int get_cabac_inline_arm(CABACContext *c, +- uint8_t *const state) ++ uint8_t *state) + { +- int bit; +- void *reg_b, *reg_c, *tmp; ++ const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128; ++ int bit, ptr, low, tmp1, tmp2; ++ __asm__ volatile ( ++ "ldr %[bit], [%[c], %[range_off]] \n\t" ++ "ldrb %[ptr], [%[state]] \n\t" ++ "sub %[tmp1], %[mlps_tables], %[lps_off] \n\t" ++ "and %[tmp2], %[bit], #0xc0 \n\t" ++ "add %[tmp1], %[tmp1], %[ptr] \n\t" ++ "ldr %[low], [%[c], %[low_off]] \n\t" ++ "ldrb %[tmp2], [%[tmp1], 
%[tmp2], lsl #1] \n\t" ++ "sub %[bit], %[bit], %[tmp2] \n\t" ++ "mov %[tmp1], %[bit] \n\t" ++ "cmp %[low], %[bit], lsl #17 \n\t" ++ "itt ge \n\t" ++ "movge %[tmp1], %[tmp2] \n\t" ++ "mvnge %[ptr], %[ptr] \n\t" ++ "clz %[tmp2], %[tmp1] \n\t" ++ "it ge \n\t" ++ "subge %[low], %[low], %[bit], lsl #17 \n\t" ++ "sub %[tmp2], %[tmp2], #23 \n\t" ++ "and %[bit], %[ptr], #1 \n\t" ++ "ldrb %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t" ++ "lsl %[low], %[low], %[tmp2] \n\t" ++ "lsls %[ptr], %[low], #16 \n\t" ++ "bne 1f \n\t" ++ "ldr %[ptr], [%[c], %[ptr_off]] \n\t" ++ "lsl %[tmp2], %[tmp1], %[tmp2] \n\t" ++#if UNCHECKED_BITSTREAM_READER ++ "strb %[mlps_tables], [%[state]] \n\t" ++ "rbit %[state], %[low] \n\t" ++ "ldrh %[tmp1], [%[ptr]], #2 \n\t" ++#else ++ "ldr %[tmp1], [%[c], %[end_off]] \n\t" ++ "strb %[mlps_tables], [%[state]] \n\t" ++ "rbit %[state], %[low] \n\t" ++ "cmp %[tmp1], %[ptr] \n\t" ++#if CONFIG_THUMB ++ "it cs \n\t" ++ "ldrhcs %[tmp1], [%[ptr]], #2 \n\t" ++#else ++ "ldrcsh %[tmp1], [%[ptr]], #2 \n\t" ++#endif ++#endif ++ "clz %[state], %[state] \n\t" ++ "movw %[mlps_tables], #0xffff \n\t" ++ "sub %[state], %[state], #16 \n\t" ++ "str %[tmp2], [%[c], %[range_off]] \n\t" ++ "rev %[tmp1], %[tmp1] \n\t" ++ "str %[ptr], [%[c], %[ptr_off]] \n\t" ++ "lsr %[tmp1], %[tmp1], #15 \n\t" ++ "sub %[tmp1], %[tmp1], %[mlps_tables] \n\t" ++#if CONFIG_THUMB ++ "lsl %[tmp1], %[tmp1], %[state] \n\t" ++ "add %[low], %[low], %[tmp1] \n\t" ++#else ++ "add %[low], %[low], %[tmp1], lsl %[state] \n\t" ++#endif ++ "str %[low], [%[c], %[low_off]] \n\t" ++ "b 2f \n\t" ++ "1: \n\t" ++ "strb %[mlps_tables], [%[state]] \n\t" ++ "lsl %[tmp1], %[tmp1], %[tmp2] \n\t" ++ "str %[low], [%[c], %[low_off]] \n\t" ++ "str %[tmp1], [%[c], %[range_off]] \n\t" ++ "2: \n\t" ++ : // Outputs ++ [state]"+r"(state), ++ [mlps_tables]"+r"(mlps_tables), ++ [bit]"=&r"(bit), ++ [ptr]"=&r"(ptr), ++ [low]"=&r"(low), ++ [tmp1]"=&r"(tmp1), ++ [tmp2]"=&r"(tmp2) ++ : // Inputs ++ [c]"r"(c), ++ 
[low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)), ++ [end_off]"J"(offsetof(CABACContext, bytestream_end)), ++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ return bit; ++} + +- __asm__ volatile( +- "ldrb %[bit] , [%[state]] \n\t" +- "add %[r_b] , %[tables] , %[lps_off] \n\t" +- "mov %[tmp] , %[range] \n\t" +- "and %[range] , %[range] , #0xC0 \n\t" +- "add %[r_b] , %[r_b] , %[bit] \n\t" +- "ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t" +- "add %[r_b] , %[tables] , %[norm_off] \n\t" +- "sub %[r_c] , %[tmp] , %[range] \n\t" +- "lsl %[tmp] , %[r_c] , #17 \n\t" +- "cmp %[tmp] , %[low] \n\t" +- "it gt \n\t" +- "movgt %[range] , %[r_c] \n\t" +- "itt cc \n\t" +- "mvncc %[bit] , %[bit] \n\t" +- "subcc %[low] , %[low] , %[tmp] \n\t" +- "add %[r_c] , %[tables] , %[mlps_off] \n\t" +- "ldrb %[tmp] , [%[r_b], %[range]] \n\t" +- "ldrb %[r_b] , [%[r_c], %[bit]] \n\t" +- "lsl %[low] , %[low] , %[tmp] \n\t" +- "lsl %[range] , %[range] , %[tmp] \n\t" +- "uxth %[r_c] , %[low] \n\t" +- "strb %[r_b] , [%[state]] \n\t" +- "tst %[r_c] , %[r_c] \n\t" +- "bne 2f \n\t" +- "ldr %[r_c] , [%[c], %[byte]] \n\t" ++#define get_cabac_bypass get_cabac_bypass_arm ++static inline int get_cabac_bypass_arm(CABACContext * const c) ++{ ++ uint32_t low = c->low, range, ptr, tmp; ++ int rv; ++ __asm volatile ( ++ "ldr %[range] , [%[c], %[range_off]] \n\t" ++ "mov %[rv] , #0 \n\t" ++ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t" ++ "lsl %[low] , #1 \n\t" ++#if !UNCHECKED_BITSTREAM_READER ++ "ldr %[tmp] , [%[c], %[end_off]] \n\t" ++#endif ++ "cmp %[low] , %[range], lsl #17 \n\t" ++ "itt cs \n\t" ++ "subcs %[low] , %[low], %[range], lsl #17 \n\t" ++ "movcs %[rv] , #1 \n\t" + #if UNCHECKED_BITSTREAM_READER +- "ldrh %[tmp] , [%[r_c]] \n\t" +- "add %[r_c] , %[r_c] , #2 \n\t" +- "str %[r_c] , [%[c], %[byte]] \n\t" ++ "ldrh %[tmp] , [%[ptr]], #2 \n\t" ++#else 
++ "cmp %[tmp] , %[ptr] \n\t" ++#if CONFIG_THUMB ++ "it cs \n\t" ++ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" + #else +- "ldr %[r_b] , [%[c], %[end]] \n\t" +- "ldrh %[tmp] , [%[r_c]] \n\t" +- "cmp %[r_c] , %[r_b] \n\t" +- "itt lt \n\t" +- "addlt %[r_c] , %[r_c] , #2 \n\t" +- "strlt %[r_c] , [%[c], %[byte]] \n\t" ++ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" ++#endif + #endif +- "sub %[r_c] , %[low] , #1 \n\t" +- "add %[r_b] , %[tables] , %[norm_off] \n\t" +- "eor %[r_c] , %[low] , %[r_c] \n\t" +- "rev %[tmp] , %[tmp] \n\t" +- "lsr %[r_c] , %[r_c] , #15 \n\t" +- "lsr %[tmp] , %[tmp] , #15 \n\t" +- "ldrb %[r_c] , [%[r_b], %[r_c]] \n\t" +- "movw %[r_b] , #0xFFFF \n\t" +- "sub %[tmp] , %[tmp] , %[r_b] \n\t" +- "rsb %[r_c] , %[r_c] , #7 \n\t" +- "lsl %[tmp] , %[tmp] , %[r_c] \n\t" +- "add %[low] , %[low] , %[tmp] \n\t" +- "2: \n\t" +- : [bit]"=&r"(bit), +- [low]"+&r"(c->low), +- [range]"+&r"(c->range), +- [r_b]"=&r"(reg_b), +- [r_c]"=&r"(reg_c), +- [tmp]"=&r"(tmp) +- : [c]"r"(c), +- [state]"r"(state), +- [tables]"r"(ff_h264_cabac_tables), +- [byte]"M"(offsetof(CABACContext, bytestream)), +- [end]"M"(offsetof(CABACContext, bytestream_end)), +- [norm_off]"I"(H264_NORM_SHIFT_OFFSET), +- [lps_off]"I"(H264_LPS_RANGE_OFFSET), +- [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128) +- : "memory", "cc" +- ); ++ "lsls %[range] , %[low], #16 \n\t" ++ "bne 1f \n\t" + +- return bit & 1; ++ "str %[ptr] , [%[c], %[ptr_off]] \n\t" ++ "rev %[tmp] , %[tmp] \n\t" ++ "add %[low] , %[low], %[tmp], lsr #15 \n\t" ++ "movw %[tmp] , 0xFFFF \n\t" ++ "sub %[low] , %[tmp] \n\t" ++ "1: \n\t" ++ "str %[low] , [%[c], %[low_off]] \n\t" ++ : // Outputs ++ [rv]"=&r"(rv), ++ [low]"+r"(low), ++ [range]"=&r"(range), ++ [ptr]"=&r"(ptr), ++ [tmp]"=&r"(tmp) ++ : // Inputs ++ [c]"r"(c), ++ [low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)), ++ [end_off]"J"(offsetof(CABACContext, bytestream_end)) ++ : // Clobbers ++ "memory", "cc" ++ ); ++ 
return rv; + } ++ ++ ++#define get_cabac_bypass_sign get_cabac_bypass_sign_arm ++static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv) ++{ ++ uint32_t low = c->low, range, ptr, tmp; ++ __asm volatile ( ++ "ldr %[range] , [%[c], %[range_off]] \n\t" ++ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t" ++ "lsl %[low] , #1 \n\t" ++#if !UNCHECKED_BITSTREAM_READER ++ "ldr %[tmp] , [%[c], %[end_off]] \n\t" ++#endif ++ "cmp %[low] , %[range], lsl #17 \n\t" ++ "it cs \n\t" ++ "subcs %[low] , %[low], %[range], lsl #17 \n\t" ++ "it cc \n\t" ++ "rsbcc %[rv] , %[rv], #0 \n\t" ++#if UNCHECKED_BITSTREAM_READER ++ "ldrh %[tmp] , [%[ptr]], #2 \n\t" ++#else ++ "cmp %[tmp] , %[ptr] \n\t" ++#if CONFIG_THUMB ++ "it cs \n\t" ++ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t" ++#else ++ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t" ++#endif ++#endif ++ "lsls %[range] , %[low], #16 \n\t" ++ "bne 1f \n\t" ++ ++ "str %[ptr] , [%[c], %[ptr_off]] \n\t" ++ "rev %[tmp] , %[tmp] \n\t" ++ "add %[low] , %[low], %[tmp], lsr #15 \n\t" ++ "movw %[tmp] , 0xFFFF \n\t" ++ "sub %[low] , %[tmp] \n\t" ++ "1: \n\t" ++ "str %[low] , [%[c], %[low_off]] \n\t" ++ : // Outputs ++ [rv]"+r"(rv), ++ [low]"+r"(low), ++ [range]"=&r"(range), ++ [ptr]"=&r"(ptr), ++ [tmp]"=&r"(tmp) ++ : // Inputs ++ [c]"r"(c), ++ [low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)), ++ [end_off]"J"(offsetof(CABACContext, bytestream_end)) ++ : // Clobbers ++ "memory", "cc" ++ ); ++ return rv; ++} ++ + #endif /* HAVE_ARMV6T2_INLINE */ + + #endif /* AVCODEC_ARM_CABAC_H */ +diff --git a/libavcodec/arm/rpi_hevc_cabac.h b/libavcodec/arm/rpi_hevc_cabac.h +new file mode 100644 +index 0000000000..c88dec6eff +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_cabac.h +@@ -0,0 +1,607 @@ ++/* ++ * This file is part of FFmpeg. 
++ * ++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_HEVC_CABAC_H ++#define AVCODEC_ARM_HEVC_CABAC_H ++ ++#include "config.h" ++#if HAVE_ARMV6T2_INLINE ++ ++#define hevc_mem_bits32 hevc_mem_bits32_arm ++static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits) ++{ ++ unsigned int n; ++ __asm__ ( ++ "rev %[n], %[x] \n\t" ++ : [n]"=r"(n) ++ : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3))) ++ : ++ ); ++ return n << (bits & 7); ++} ++ ++ ++// --------------------------------------------------------------------------- ++// ++// Helper fns - little bits of code where ARM has an instruction that the ++// compiler doesn't know about / use ++ ++#define trans_scale_sat trans_scale_sat_arm ++static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) ++{ ++ int rv; ++ int t = ((level * (int)(scale * scale_m)) >> shift) + 1; ++ ++ __asm__ ( ++ "ssat %[rv], #16, %[t], ASR #1 \n\t" ++ : [rv]"=r"(rv) ++ : [t]"r"(t) ++ : ++ ); ++ return rv; ++} ++ ++#define update_rice update_rice_arm ++static inline void update_rice_arm(uint8_t * const stat_coeff, ++ const unsigned int
last_coeff_abs_level_remaining, ++ const unsigned int c_rice_param) ++{ ++ int t = last_coeff_abs_level_remaining << 1; ++ __asm__ ( ++ "lsrs %[t], %[t], %[shift] \n\t" ++ ++ "it eq \n\t" ++ "subeq %[stat], %[stat], #1 \n\t" ++ "cmp %[t], #6 \n\t" ++ "adc %[stat], %[stat], #0 \n\t" ++ "usat %[stat], #8, %[stat] \n\t" ++ : [stat]"+r"(*stat_coeff), ++ [t]"+r"(t) ++ : [shift]"r"(c_rice_param) ++ : "cc" ++ ); ++} ++ ++// --------------------------------------------------------------------------- ++// ++// CABAC get loops ++// ++// Where the loop is simple enough we can normally do 10-30% better than the ++// compiler ++ ++// Get the residual greater than 1 bits ++ ++#define get_cabac_greater1_bits get_cabac_greater1_bits_arm ++static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n, ++ uint8_t * const state0) ++{ ++ unsigned int i, reg_b, st, tmp, bit, rv; ++ __asm__ ( ++ "mov %[i] , #0 \n\t" ++ "mov %[rv] , #0 \n\t" ++ "1: \n\t" ++ "add %[i] , %[i] , #1 \n\t" ++ "cmp %[rv] , #0 \n\t" ++ "ite eq \n\t" ++ "usateq %[st] , #2 , %[i] \n\t" ++ "movne %[st] , #0 \n\t" ++ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" ++ "and %[tmp] , %[range] , #0xC0 \n\t" ++ ++ "ldrb %[bit] , [%[state0], %[st]] \n\t" ++ "add %[r_b] , %[r_b] , %[bit] \n\t" ++ "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t" ++ "sub %[range] , %[range] , %[tmp] \n\t" ++ ++ "cmp %[low] , %[range], lsl #17 \n\t" ++ "ittt ge \n\t" ++ "subge %[low] , %[low] , %[range], lsl #17 \n\t" ++ "movge %[range] , %[tmp] \n\t" ++ "mvnge %[bit] , %[bit] \n\t" ++ ++ "clz %[tmp] , %[range] \n\t" ++ "sub %[tmp] , #23 \n\t" ++ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" ++ "and %[bit] , %[bit] , #1 \n\t" ++ "strb %[r_b] , [%[state0], %[st]] \n\t" ++ "lsl %[low] , %[low] , %[tmp] \n\t" ++ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t" ++ "lsl %[range] , %[range] , %[tmp] \n\t" ++ ++// There is a small speed gain from combining both conditions, using a single ++// branch and then working out 
what that meant later ++ "lsls %[tmp] , %[low] , #16 \n\t" ++ "it ne \n\t" ++ "cmpne %[n] , %[i] \n\t" ++ "bne 1b \n\t" ++ ++// If reload is not required then we must have run out of flags to decode ++ "tst %[tmp] , %[tmp] \n\t" ++ "bne 2f \n\t" ++ ++// Do reload ++ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" ++ "rbit %[bit] , %[low] \n\t" ++ "movw %[r_b] , #0xFFFF \n\t" ++ "clz %[bit] , %[bit] \n\t" ++ "rev %[tmp] , %[tmp] \n\t" ++ "sub %[bit] , %[bit] , #16 \n\t" ++ "cmp %[n] , %[i] \n\t" ++ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" ++ ++#if CONFIG_THUMB ++ "lsl %[tmp] , %[tmp] , %[bit] \n\t" ++ "add %[low] , %[low] , %[tmp] \n\t" ++#else ++ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" ++#endif ++ ++ "bne 1b \n\t" ++ "2: \n\t" ++ : [bit]"=&r"(bit), ++ [low]"+r"(c->low), ++ [range]"+r"(c->range), ++ [r_b]"=&r"(reg_b), ++ [bptr]"+r"(c->bytestream), ++ [i]"=&r"(i), ++ [tmp]"=&r"(tmp), ++ [st]"=&r"(st), ++ [rv]"=&r"(rv) ++ : [state0]"r"(state0), ++ [n]"r"(n), ++ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), ++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) ++ : "memory", "cc" ++ ); ++ return rv; ++} ++ ++ ++// n must be > 0 on entry ++#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm ++static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0, ++ unsigned int n, ++ const uint8_t * ctx_map, ++ uint8_t * p) ++{ ++ unsigned int reg_b, tmp, st, bit; ++ __asm__ ( ++// Get bin from map ++#if CONFIG_THUMB ++ "add %[ctx_map] , %[n] \n\t" ++ "ldrb %[st] , [%[ctx_map]] \n\t" ++#else ++ "ldrb %[st] , [%[ctx_map], %[n]]! 
\n\t" ++#endif ++ "1: \n\t" ++ ++// Load state & ranges ++ "ldrb %[bit] , [%[state0], %[st]] \n\t" ++ "and %[tmp] , %[range] , #0xC0 \n\t" ++ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t" ++ "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t" ++ "ldrb %[tmp] , [%[r_b], %[bit]] \n\t" ++ "sub %[range] , %[range] , %[tmp] \n\t" ++ ++ "cmp %[low] , %[range], lsl #17 \n\t" ++ "ittt ge \n\t" ++ "mvnge %[bit] , %[bit] \n\t" ++ "subge %[low] , %[low] , %[range], lsl #17 \n\t" ++ "movge %[range] , %[tmp] \n\t" ++ ++// Renorm ++ "clz %[tmp] , %[range] \n\t" ++ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t" ++ "sub %[tmp] , #23 \n\t" ++ "strb %[r_b] , [%[state0], %[st]] \n\t" ++ "tst %[bit] , #1 \n\t" ++ "ldrb %[st] , [%[ctx_map], #-1]! \n\t" ++ "lsl %[low] , %[low] , %[tmp] \n\t" ++// GCC asm seems to need strbne written differently for thumb and arm ++#if CONFIG_THUMB ++ "it ne \n\t" ++ "strbne %[n] , [%[idx]] , #1 \n\t" ++#else ++ "strneb %[n] , [%[idx]] , #1 \n\t" ++#endif ++ ++// There is a small speed gain from combining both conditions, using a single ++// branch and then working out what that meant later ++ "subs %[n] , %[n] , #1 \n\t" ++ "lsl %[range] , %[range] , %[tmp] \n\t" ++#if CONFIG_THUMB ++ "itt ne \n\t" ++ "lslsne %[tmp] , %[low] , #16 \n\t" ++#else ++ "lslnes %[tmp] , %[low] , #16 \n\t" ++#endif ++ "bne 1b \n\t" ++ ++// If we have bits left then n must be 0 so give up now ++ "lsls %[tmp] , %[low] , #16 \n\t" ++ "bne 2f \n\t" ++ ++// Do reload ++ "ldrh %[tmp] , [%[bptr]] , #2 \n\t" ++ "rbit %[bit] , %[low] \n\t" ++ "movw %[r_b] , #0xFFFF \n\t" ++ "clz %[bit] , %[bit] \n\t" ++ "cmp %[n] , #0 \n\t" ++ "rev %[tmp] , %[tmp] \n\t" ++ "sub %[bit] , %[bit] , #16 \n\t" ++ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t" ++ ++#if CONFIG_THUMB ++ "lsl %[tmp] , %[tmp] , %[bit] \n\t" ++ "add %[low] , %[low] , %[tmp] \n\t" ++#else ++ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t" ++#endif ++ ++// Check to see if we still have more to do ++ "bne 1b \n\t" ++ "2: \n\t" ++ : 
[bit]"=&r"(bit), ++ [low]"+r"(c->low), ++ [range]"+r"(c->range), ++ [r_b]"=&r"(reg_b), ++ [bptr]"+r"(c->bytestream), ++ [idx]"+r"(p), ++ [n]"+r"(n), ++ [tmp]"=&r"(tmp), ++ [st]"=&r"(st), ++ [ctx_map]"+r"(ctx_map) ++ : [state0]"r"(state0), ++ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128), ++ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET) ++ : "memory", "cc" ++ ); ++ ++ return p; ++} ++ ++// --------------------------------------------------------------------------- ++// ++// CABAC_BY22 functions ++ ++ ++#define get_cabac_by22_start get_cabac_by22_start_arm ++static inline void get_cabac_by22_start_arm(CABACContext * const c) ++{ ++ const uint8_t *ptr = c->bytestream; ++ register uint32_t low __asm__("r1"), range __asm__("r2"); ++ uint32_t m, range8, bits; ++#if !USE_BY22_DIV ++ uintptr_t inv; ++#endif ++ ++ av_assert2(offsetof (CABACContext, low) == 0); ++ av_assert2(offsetof (CABACContext, range) == 4); ++ av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2); ++ __asm__ volatile ( ++ "ldmia %[c], {%[low], %[range]} \n\t" ++ : // Outputs ++ [low]"=r"(low), ++ [range]"=r"(range) ++ : // Inputs ++ [c]"r"(c) ++ : // Clobbers ++ ); ++#if !USE_BY22_DIV ++ inv = (uintptr_t)cabac_by22_inv_range; ++#endif ++ __asm__ volatile ( ++ "ldr %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t" ++#if !USE_BY22_DIV ++ "uxtb %[range8], %[range] \n\t" ++#endif ++ "rbit %[bits], %[low] \n\t" ++ "lsl %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t" ++ "clz %[bits], %[bits] \n\t" ++ "str %[ptr], [%[c], %[ptr_off]] \n\t" ++ "rev %[m], %[m] \n\t" ++ "rsb %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t" ++ "eor %[m], %[m], #0x80000000 \n\t" ++#if !USE_BY22_DIV ++ "ldr %[inv], [%[inv], %[range8], lsl #2] \n\t" ++ "pkhbt %[range], %[bits], %[range], lsl #16 \n\t" ++ "str %[range], [%[c], %[bits_off]] \n\t" ++#else ++ "strh %[bits], [%[c], %[bits_off]] \n\t" ++#endif ++#if CONFIG_THUMB ++ "lsr 
%[m], %[ptr] \n\t" ++ "eor %[range], %[low], %[m] \n\t" ++#else ++ "eor %[range], %[low], %[m], lsr %[ptr] \n\t" ++#endif ++ : // Outputs ++ [ptr]"+&r"(ptr), ++ [low]"+&r"(low), ++ [range]"+&r"(range), ++#if !USE_BY22_DIV ++ [inv]"+&r"(inv), ++#endif ++ [m]"=&r"(m), ++ [range8]"=&r"(range8), ++ [bits]"=&r"(bits) ++ : // Inputs ++ [c]"r"(c), ++ [bits_off]"J"(offsetof (CABACContext, by22.bits)), ++ [ptr_off]"J"(offsetof (CABACContext, bytestream)) ++ : // Clobbers ++ "memory" ++ ); ++ c->low = range; ++#if !USE_BY22_DIV ++ c->range = inv; ++#endif ++} ++ ++#define get_cabac_by22_peek get_cabac_by22_peek_arm ++static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c) ++{ ++ uint32_t rv = c->low &~ 1, tmp; ++ __asm__ ( ++ "cmp %[inv] , #0 \n\t" ++ "it ne \n\t" ++ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t" ++ : // Outputs ++ [rv]"+r"(rv), ++ [tmp]"=r"(tmp) ++ : // Inputs ++ [inv]"r"(c->range) ++ : // Clobbers ++ "cc" ++ ); ++ return rv << 1; ++} ++ ++#define get_cabac_by22_flush get_cabac_by22_flush_arm ++static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val) ++{ ++ uint32_t bits, ptr, tmp1, tmp2; ++ __asm__ volatile ( ++ "ldrh %[bits], [%[cc], %[bits_off]] \n\t" ++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" ++ "rsb %[tmp1], %[n], #32 \n\t" ++ "add %[bits], %[bits], %[n] \n\t" ++ "ldrh %[tmp2], [%[cc], %[range_off]] \n\t" ++ "lsr %[tmp1], %[val], %[tmp1] \n\t" ++ "ldr %[val], [%[cc], %[low_off]] \n\t" ++#if CONFIG_THUMB ++ "add %[ptr], %[ptr], %[bits], lsr #3 \n\t" ++ "ldr %[ptr], [%[ptr]] \n\t" ++#else ++ "ldr %[ptr], [%[ptr], %[bits], lsr #3] \n\t" ++#endif ++ "mul %[tmp1], %[tmp2], %[tmp1] \n\t" ++ "and %[tmp2], %[bits], #7 \n\t" ++ "strh %[bits], [%[cc], %[bits_off]] \n\t" ++ "rev %[ptr], %[ptr] \n\t" ++ "lsl %[tmp1], %[tmp1], #23 \n\t" ++#if CONFIG_THUMB ++ "lsl %[val], %[n] \n\t" ++ "sub %[val], %[tmp1] \n\t" ++#else ++ "rsb %[val], %[tmp1], %[val], lsl %[n] \n\t" ++#endif ++ "lsl %[ptr], %[ptr], 
%[tmp2] \n\t" ++ "orr %[val], %[val], %[ptr], lsr #9 \n\t" ++ "str %[val], [%[cc], %[low_off]] \n\t" ++ : // Outputs ++ [val]"+r"(val), ++ [bits]"=&r"(bits), ++ [ptr]"=&r"(ptr), ++ [tmp1]"=&r"(tmp1), ++ [tmp2]"=&r"(tmp2) ++ : // Inputs ++ [cc]"r"(c), ++ [n]"r"(n), ++ [bits_off]"J"(offsetof(CABACContext, by22.bits)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)), ++ [range_off]"J"(offsetof(CABACContext, by22.range)), ++ [low_off]"J"(offsetof(CABACContext, low)) ++ : // Clobbers ++ "memory" ++ ); ++} ++ ++#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm ++static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param) ++{ ++ uint32_t last_coeff_abs_level_remaining; ++ uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2; ++ __asm__ volatile ( ++ "ldr %[remain], [%[cc], %[low_off]] \n\t" ++ "ldr %[prefix], [%[cc], %[range_off]] \n\t" ++ "bic %[remain], %[remain], #1 \n\t" ++ "ldrh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" ++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" ++ "cmp %[prefix], #0 \n\t" ++ "it ne \n\t" ++ "umullne %[prefix], %[remain], %[prefix], %[remain] \n\t" ++ "ldrh %[range], [%[cc], %[by22_range_off]] \n\t" ++ "lsl %[remain], %[remain], #1 \n\t" ++ "mvn %[prefix], %[remain] \n\t" ++ "clz %[prefix], %[prefix] \n\t" ++ "rsbs %[n1], %[prefix], #2 \n\t" ++ "bcc 1f \n\t" ++ "adc %[n1], %[rice], %[prefix] \n\t" ++ "add %[tmp2], %[tmp2], %[n1] \n\t" ++ "rsb %[n2], %[n1], #32 \n\t" ++ "and %[tmp1], %[tmp2], #7 \n\t" ++ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" ++ "lsr %[tmp2], %[tmp2], #3 \n\t" ++ "lsr %[n2], %[remain], %[n2] \n\t" ++ "mul %[n2], %[range], %[n2] \n\t" ++ "ldr %[range], [%[cc], %[low_off]] \n\t" ++ "ldr %[ptr], [%[ptr], %[tmp2]] \n\t" ++ "rsb %[tmp2], %[rice], #31 \n\t" ++ "lsl %[remain], %[remain], %[prefix] \n\t" ++ "lsl %[n2], %[n2], #23 \n\t" ++#if CONFIG_THUMB ++ "lsl %[range], %[n1] \n\t" ++ "sub %[range], %[n2] \n\t" ++#else ++ "rsb %[range], %[n2], 
%[range], lsl %[n1] \n\t" ++#endif ++ "rev %[ptr], %[ptr] \n\t" ++ "lsl %[n2], %[prefix], %[rice] \n\t" ++#if CONFIG_THUMB ++ "lsr %[remain], %[tmp2] \n\t" ++ "add %[remain], %[n2] \n\t" ++#else ++ "add %[remain], %[n2], %[remain], lsr %[tmp2] \n\t" ++#endif ++ "b 3f \n\t" ++ "1: \n\t" ++ "add %[n2], %[rice], %[prefix], lsl #1 \n\t" ++ "cmp %[n2], %[peek_bits_plus_2] \n\t" ++ "bhi 2f \n\t" ++ "sub %[n1], %[n2], #2 \n\t" ++ "add %[tmp2], %[tmp2], %[n1] \n\t" ++ "rsb %[n2], %[n1], #32 \n\t" ++ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t" ++ "lsr %[tmp1], %[tmp2], #3 \n\t" ++ "lsr %[n2], %[remain], %[n2] \n\t" ++ "mul %[n2], %[range], %[n2] \n\t" ++ "rsb %[range], %[rice], #34 \n\t" ++ "ldr %[ptr], [%[ptr], %[tmp1]] \n\t" ++ "and %[tmp1], %[tmp2], #7 \n\t" ++ "lsl %[remain], %[remain], %[prefix] \n\t" ++ "ldr %[tmp2], [%[cc], %[low_off]] \n\t" ++ "rsb %[prefix], %[prefix], %[range] \n\t" ++ "orr %[remain], %[remain], #0x80000000 \n\t" ++ "rev %[ptr], %[ptr] \n\t" ++ "lsl %[n2], %[n2], #23 \n\t" ++ "mov %[range], #2 \n\t" ++#if CONFIG_THUMB ++ "lsl %[tmp2], %[n1] \n\t" ++ "sub %[tmp2], %[n2] \n\t" ++#else ++ "rsb %[tmp2], %[n2], %[tmp2], lsl %[n1] \n\t" ++#endif ++ "lsl %[ptr], %[ptr], %[tmp1] \n\t" ++ "lsl %[rice], %[range], %[rice] \n\t" ++ "orr %[range], %[tmp2], %[ptr], lsr #9 \n\t" ++#if CONFIG_THUMB ++ "lsr %[remain], %[prefix] \n\t" ++ "add %[remain], %[rice] \n\t" ++#else ++ "add %[remain], %[rice], %[remain], lsr %[prefix] \n\t" ++#endif ++ "b 4f \n\t" ++ "2: \n\t" ++ "add %[n1], %[tmp2], %[prefix] \n\t" ++#if CONFIG_THUMB ++ "add %[tmp2], %[ptr], %[n1], lsr #3 \n\t" ++ "ldr %[tmp2], [%[tmp2]] \n\t" ++#else ++ "ldr %[tmp2], [%[ptr], %[n1], lsr #3] \n\t" ++#endif ++ "rsb %[tmp1], %[prefix], #32 \n\t" ++ "push {%[rice]} \n\t" ++ "and %[rice], %[n1], #7 \n\t" ++ "lsr %[tmp1], %[remain], %[tmp1] \n\t" ++ "ldr %[ptr], [%[cc], %[low_off]] \n\t" ++ "mul %[remain], %[range], %[tmp1] \n\t" ++ "rev %[tmp2], %[tmp2] \n\t" ++ "rsb %[n2], %[prefix], %[n2] \n\t" ++ "ldr 
%[tmp1], [%[cc], %[range_off]] \n\t" ++ "lsl %[rice], %[tmp2], %[rice] \n\t" ++ "sub %[tmp2], %[n2], #2 \n\t" ++ "lsl %[remain], %[remain], #23 \n\t" ++#if CONFIG_THUMB ++ "lsl %[ptr], %[prefix] \n\t" ++ "rsb %[remain], %[ptr] \n\t" ++#else ++ "rsb %[remain], %[remain], %[ptr], lsl %[prefix] \n\t" ++#endif ++ "orr %[remain], %[remain], %[rice], lsr #9 \n\t" ++ "add %[prefix], %[n1], %[tmp2] \n\t" ++ "bic %[n1], %[remain], #1 \n\t" ++ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t" ++ "cmp %[tmp1], #0 \n\t" ++ "rsb %[rice], %[tmp2], #32 \n\t" ++ "it ne \n\t" ++ "umullne %[tmp1], %[n1], %[tmp1], %[n1] \n\t" ++ "and %[tmp1], %[prefix], #7 \n\t" ++#if CONFIG_THUMB ++ "add %[ptr], %[ptr], %[prefix], lsr #3 \n\t" ++ "ldr %[ptr], [%[ptr]] \n\t" ++#else ++ "ldr %[ptr], [%[ptr], %[prefix], lsr #3] \n\t" ++#endif ++ "lsl %[n1], %[n1], #1 \n\t" ++ "lsr %[rice], %[n1], %[rice] \n\t" ++ "rsb %[n2], %[n2], #34 \n\t" ++ "mul %[range], %[range], %[rice] \n\t" ++ "pop {%[rice]} \n\t" ++ "rev %[ptr], %[ptr] \n\t" ++ "orr %[n1], %[n1], #0x80000000 \n\t" ++ "strh %[prefix], [%[cc], %[by22_bits_off]] \n\t" ++ "mov %[prefix], #2 \n\t" ++ "lsl %[range], %[range], #23 \n\t" ++#if CONFIG_THUMB ++ "lsl %[remain], %[tmp2] \n\t" ++ "rsb %[range], %[remain] \n\t" ++#else ++ "rsb %[range], %[range], %[remain], lsl %[tmp2] \n\t" ++#endif ++ "lsl %[remain], %[prefix], %[rice] \n\t" ++#if CONFIG_THUMB ++ "lsr %[n1], %[n2] \n\t" ++ "add %[remain], %[n1] \n\t" ++#else ++ "add %[remain], %[remain], %[n1], lsr %[n2] \n\t" ++#endif ++ "3: \n\t" ++ "lsl %[ptr], %[ptr], %[tmp1] \n\t" ++ "orr %[range], %[range], %[ptr], lsr #9 \n\t" ++ "4: \n\t" ++ "str %[range], [%[cc], %[low_off]] \n\t" ++ : // Outputs ++ [remain]"=&r"(last_coeff_abs_level_remaining), ++ [rice]"+r"(rice_param), ++ [prefix]"=&r"(prefix), ++ [n1]"=&r"(n1), ++ [range]"=&r"(range), ++ [n2]"=&r"(n2), ++ [ptr]"=&r"(ptr), ++ [tmp1]"=&r"(tmp1), ++ [tmp2]"=&r"(tmp2) ++ : // Inputs ++ [cc]"r"(c), ++ [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2), ++ 
[low_off]"J"(offsetof(CABACContext, low)), ++ [range_off]"J"(offsetof(CABACContext, range)), ++ [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)), ++ [by22_range_off]"J"(offsetof(CABACContext, by22.range)), ++ [ptr_off]"J"(offsetof(CABACContext, bytestream)) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ return last_coeff_abs_level_remaining; ++} ++ ++#endif /* HAVE_ARMV6T2_INLINE */ ++ ++#endif /* AVCODEC_ARM_HEVC_CABAC_H */ +diff --git a/libavcodec/arm/rpi_hevc_idct_fn_neon.S b/libavcodec/arm/rpi_hevc_idct_fn_neon.S +new file mode 100644 +index 0000000000..978b7b6947 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S +@@ -0,0 +1,183 @@ ++/* ++ * ARM NEON optimised IDCT functions for HEVC decoding ++ * Copyright (c) 2014 Seppo Tomperi ++ * Copyright (C) 2018 John Cox, ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++@ Included multiple times from hevc_idct_neon.S ++@ Macros defined there ++ ++#define DC_SHIFT (15 - BIT_DEPTH) ++#define DC_ADD (1 | (1 << (14 - BIT_DEPTH))) ++#define TRN_SHIFT (20 - BIT_DEPTH) ++ ++function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q0, r1 ++ vdup.16 q1, r1 ++ vst1.16 {q0, q1}, [r0] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r2, r0, #32 ++ mov r3, #64 ++ add r1, #DC_ADD ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++ vst1.16 {q8, q9}, [r0], r3 ++ vst1.16 {q8, q9}, [r2], r3 ++ vst1.16 {q8, q9}, [r0] ++ vst1.16 {q8, q9}, [r2] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r2, r0, #32 ++ mov r3, #64 ++ add r1, #DC_ADD ++ mov ip, #16*16 ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++1: vst1.16 {q8, q9}, [r0], r3 ++ subs ip, ip, #32 ++ vst1.16 {q8, q9}, [r2], r3 ++ bhi 1b ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1 ++ ldrsh r1, [r0] ++ add r2, r0, #32 ++ mov r3, #64 ++ add r1, #DC_ADD ++ mov ip, #32*32 ++ asr r1, #DC_SHIFT ++ vdup.16 q8, r1 ++ vdup.16 q9, r1 ++1: vst1.16 {q8, q9}, [r0], r3 ++ subs ip, ip, #32 ++ vst1.16 {q8, q9}, [r2], r3 ++ bhi 1b ++ bx lr ++endfunc ++ ++ ++function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1 ++ vldr.i32 s0, =0x00240053 // 36 and 83 ++ vld1.16 {q14, q15}, [r0 :256] // coeffs ++ ++ tr4_shift #7 ++ ++ vzip.16 d28, d29 ++ vzip.16 d30, d31 ++ vzip.32 q14, q15 ++ ++ tr4_shift #TRN_SHIFT ++ ++ vst4.16 {q14, q15}, [r0 :256] ++ bx lr ++ ++ .ltorg ++endfunc ++ ++ ++ ++function 
JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1 ++ vmov.i32 d0, #0x4a // 74 ++ vld1.16 {q14, q15}, [r0 :256] // coeffs ++ vmov.i32 d1, #0x1d // 29 ++ vmov.i32 d2, #0x37 // 55 ++ ++ tr4_luma_shift #7 ++ ++ vzip.16 d28, d29 ++ vzip.16 d30, d31 ++ vzip.32 q14, q15 ++ ++ tr4_luma_shift #TRN_SHIFT ++ ++ vst4.16 {q14, q15}, [r0 :256] ++ bx lr ++endfunc ++ ++function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1 ++ add r2, r0, #16 ++ adr r3, tr4f ++ vpush {d8-d15} ++ vld1.16 {d0, d1}, [r3] ++ mov r3, #32 ++ ++ tr8_vert d16, d17, d18, d19, d24, d25, d26, d27, q8, q9, \ ++ "sub r0, r0, #128-8", \ ++ "sub r2, r2, #128-8", \ ++ "cmp r1, #4" ++ ble 2f ++ ++ tr8_vert d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \ ++ "sub r0, r0, #128+8", \ ++ "sub r2, r2, #128+8+16-32", \ ++ "mov r3, #64" ++ ++ vzip.16 d16, d17 ++ vzip.16 d18, d19 ++ ++ vzip.16 d20, d21 ++ vzip.16 d22, d23 ++ vzip.16 d28, d29 ++ vzip.16 d30, d31 ++ vzip.32 q10, q11 ++ vzip.32 q14, q15 ++1: ++ vzip.16 d24, d25 ++ vzip.16 d26, d27 ++ vzip.32 q8, q9 ++ vzip.32 q12, q13 ++ ++ tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8, q9, TRN_SHIFT ++ tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT ++ ++ vpop {d8-d15} ++ bx lr ++ ++2: vmov.i64 q10, #0 ++ sub r0, r0, #8 ++ vmov.i64 q11, #0 ++ sub r2, r2, #8+16-32 ++ vmov.i64 q14, #0 ++ mov r3, #64 ++ vmov.i64 q15, #0 ++ ++ vzip.16 d16, d17 ++ vzip.16 d18, d19 ++ ++ b 1b ++ ++endfunc ++ ++#undef DC_SHIFT ++#undef DC_ADD ++#undef TRN_SHIFT ++ +diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S +new file mode 100644 +index 0000000000..161bb0d7c9 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_misc_neon.S +@@ -0,0 +1,267 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. 
++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Written by John Cox, Ben Avison ++*/ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ rpi_zap_coeff_vals_neon( ++@ uint16_t * buf, [r0] ++@ unsigned int log_n_m2) [r1] ++ ++function rpi_zap_coeff_vals_neon, export=1 ++ mov ip, #1 ++ vmov.i64 q0, #0 ++ teq r1, #0 ++ vmov.i64 q1, #0 ++ beq 2f ++ ++ lsl ip, r1 @ 2, 4 or 8 ++ add r2, r0, #32 ++ lsl ip, r1 @ 4, 16 or 64 = number of 32-byte blocks to zero ++ mov r3, #64 ++1: vst1.8 {q0,q1}, [r0:256], r3 ++ subs ip, #2 ++ vst1.8 {q0,q1}, [r2:256], r3 ++ bne 1b ++ bx lr ++ ++2: vst1.8 {q0,q1}, [r0:256] ++ bx lr ++endfunc ++ ++@ PIC jump tables are more expensive than absolute for A32 code ++.set jent_pic, CONFIG_PIC || CONFIG_THUMB ++ ++@ Jump table entry - if in Thumb mode the bottom bit must be set ++@ ? There is probably a real asm instruction to do this but I haven't found it ++.macro jent lab ++.if jent_pic ++T .short ((0 + \lab) - (0 + 98b)) / 2 ++A .short (0 + \lab) - (4 + 98b) ++.else ++T .word 1 + \lab ++A .word \lab ++.endif ++.endm ++ ++.set expected_next, 0 ++ ++.macro cpy_compound val, p1, p2, drop_thru=0 ++.if \p1 + \p2 != \val ++.error "Bad addition!
\p1 + \p2 != \val" ++.endif ++.if expected_next != 0 && expected_next != \val ++.error "Drop thru failure" ++.endif ++\val\(): ++ push {r0-r3} ++ bl 100\p1\()b ++ pop {r0-r3} ++ add r0, #\p1 ++ add r2, #\p1 ++.if \drop_thru == 0 ++ b \p2\()b ++.set expected_next, 0 ++.else ++.set expected_next, \p2 ++.endif ++.endm ++ ++@ ff_hevc_rpi_cpy_blks8x4_neon( ++@ dst [r0] ++@ dst_stride [r1] ++@ src [r2] ++@ src_stride [r3] ++@ width [sp, #0] (bytes) ++@ height) [sp, #4] ++@ ++@ Power of 2 widths are directly coded, all others are done in stripes ++@ We expect the vast majority of calls to be power of 2 ++@ ++@ Currently has min width of 8, but we could make that 4 without issue ++@ Min height is 4 ++ ++function ff_hevc_rpi_cpy_blks8x4_neon, export=1 ++ ldr r12, [sp, #0] ++ push {r11, lr} ++.if jent_pic ++A adr lr, 98f - 2 ++.else ++A adr lr, 98f - 4 ++.endif ++ lsr r12, #3 ++ ldr r11, [sp, #(8 + 4)] ++.if jent_pic ++A lsl r12, #1 ++A ldrsh lr, [lr, r12] ++A add pc, lr ++T tbh [pc, r12, lsl #1] ++.else ++ @ A32 only, Thumb is always PIC ++ ldr pc, [lr, r12, lsl #2] ++.endif ++ ++98: ++T .short 0 @ unused ++ jent 8f ++ jent 16f ++ jent 24f ++ jent 32f ++ jent 40f ++ jent 48f ++ jent 56f ++ jent 64f ++ jent 72f ++ jent 80f ++ jent 88f ++ jent 96f ++ jent 104f ++ jent 112f ++ jent 120f ++ jent 128f ++ ++1008: ++ push {r11, lr} ++8: ++ add lr, r2, r3 ++ lsl r3, #1 ++ add r12, r0, r1 ++ lsl r1, #1 ++1: ++ vld1.32 {d0 }, [r2], r3 ++ vld1.32 {d1 }, [lr], r3 ++ vld1.32 {d2 }, [r2], r3 ++ vld1.32 {d3 }, [lr], r3 ++ subs r11, #4 ++ vst1.32 {d0 }, [r0], r1 ++ vst1.32 {d1 }, [r12], r1 ++ vst1.32 {d2 }, [r0], r1 ++ vst1.32 {d3 }, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++10016: ++ push {r11, lr} ++16: ++ add lr, r2, r3 ++ lsl r3, #1 ++ add r12, r0, r1 ++ lsl r1, #1 ++1: ++ vld1.32 {q0 }, [r2], r3 ++ vld1.32 {q1 }, [lr], r3 ++ vld1.32 {q2 }, [r2], r3 ++ vld1.32 {q3 }, [lr], r3 ++ subs r11, #4 ++ vst1.32 {q0 }, [r0], r1 ++ vst1.32 {q1 }, [r12], r1 ++ vst1.32 {q2 }, [r0], r1 ++ vst1.32 {q3 },
[r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++10032: ++ push {r11, lr} ++32: ++ add lr, r2, r3 ++ lsl r3, #1 ++ add r12, r0, r1 ++ lsl r1, #1 ++1: ++ vld1.32 {q8, q9 }, [r2], r3 ++ vld1.32 {q10, q11}, [lr], r3 ++ vld1.32 {q12, q13}, [r2], r3 ++ vld1.32 {q14, q15}, [lr], r3 ++ subs r11, #4 ++ vst1.32 {q8, q9 }, [r0], r1 ++ vst1.32 {q10, q11}, [r12], r1 ++ vst1.32 {q12, q13}, [r0], r1 ++ vst1.32 {q14, q15}, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++10064: ++ push {r11, lr} ++64: ++ add lr, r2, #32 ++ add r12, r0, #32 ++1: ++ vld1.32 {q8, q9 }, [r2], r3 ++ vld1.32 {q10, q11}, [lr], r3 ++ vld1.32 {q12, q13}, [r2], r3 ++ vld1.32 {q14, q15}, [lr], r3 ++ subs r11, #2 ++ vst1.32 {q8, q9 }, [r0], r1 ++ vst1.32 {q10, q11}, [r12], r1 ++ vst1.32 {q12, q13}, [r0], r1 ++ vst1.32 {q14, q15}, [r12], r1 ++ bgt 1b ++ pop {r11, pc} ++ ++128: ++ push {r4, r5} ++ @ We could do this with fewer registers if we jump around but I ++ @ have a primative urge to load sequentially ++ mov r4, #64 ++ add lr, r2, #32 ++ add r12, r0, #32 ++ sub r3, r4 ++ sub r1, r4 ++1: ++ vld1.32 {q8, q9 }, [r2], r4 ++ vld1.32 {q10, q11}, [lr], r4 ++ vld1.32 {q12, q13}, [r2], r3 ++ vld1.32 {q14, q15}, [lr], r3 ++ subs r11, #1 ++ vst1.32 {q8, q9 }, [r0], r4 ++ vst1.32 {q10, q11}, [r12], r4 ++ vst1.32 {q12, q13}, [r0], r1 ++ vst1.32 {q14, q15}, [r12], r1 ++ bgt 1b ++ pop {r4, r5, r11, pc} ++ ++@ Use drop_thru where we can ++cpy_compound 104, 64, 40, 1 ++cpy_compound 40, 32, 8 ++ ++cpy_compound 112, 64, 48, 1 ++cpy_compound 48, 32, 16 ++ ++cpy_compound 120, 64, 56, 1 ++cpy_compound 56, 32, 24, 1 ++cpy_compound 24, 16, 8 ++ ++cpy_compound 72, 64, 8 ++cpy_compound 80, 64, 16 ++cpy_compound 88, 64, 24 ++cpy_compound 96, 64, 32 ++ ++ ++endfunc ++ +diff --git a/libavcodec/arm/rpi_hevc_misc_neon.h b/libavcodec/arm/rpi_hevc_misc_neon.h +new file mode 100644 +index 0000000000..9d21f6a882 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_misc_neon.h +@@ -0,0 +1,438 @@ ++/* ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H ++#define AVCODEC_ARM_RPI_HEVC_MISC_H ++ ++#include "config.h" ++#if HAVE_NEON_INLINE && !CONFIG_THUMB ++ ++static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_src) ++{ ++ const uint8_t *src2 = src + stride_src; // src2 walks the odd rows, src the even rows ++ stride_src <<= 1; // so each pointer steps two rows per load ++ switch (pixel_shift) ++ { ++ case 2: // 32-bit elements ++ __asm__ volatile ( ++ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.32 {d2[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d2[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.32 {d3[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d3[1]}, [%[src2]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {q0}, [%[dst]]! \n\t" ++ "beq 3f \n\t" ++ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {q1}, [%[dst]]! 
\n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.32 {q0}, [%[dst]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.32 {q1}, [%[dst]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [src]"+r"(src), ++ [src2]"+r"(src2), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src) ++ : // Clobbers (the NEON scratch registers written above must be declared) ++ "cc", "memory", "d0", "d1", "d2", "d3" ++ ); ++ break; ++ case 1: // 16-bit elements ++ __asm__ volatile ( ++ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "subs %[height], #4 \n\t" ++ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.16 {d2[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d3[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.16 {d2[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d3[1]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.16 d0, d1 \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d0}, [%[dst]]! \n\t" ++ "beq 3f \n\t" ++ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.16 d2, d3 \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d2}, [%[dst]]! 
\n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vzip.16 d0, d1 \n\t" ++ "vst1.16 {d0}, [%[dst]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vzip.16 d2, d3 \n\t" ++ "vst1.16 {d2}, [%[dst]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [src]"+r"(src), ++ [src2]"+r"(src2), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src) ++ : // Clobbers (the NEON scratch registers written above must be declared) ++ "cc", "memory", "d0", "d1", "d2", "d3" ++ ); ++ break; ++ default: // 8-bit elements ++ __asm__ volatile ( ++ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" ++ "subs %[height], #8 \n\t" ++ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.8 {d2[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d2[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d2[2]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[2]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d2[3]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d3[3]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.8 d0, d1 \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d0}, [%[dst]]! \n\t" ++ "beq 3f \n\t" ++ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t" ++ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t" ++ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t" ++ "vzip.8 d2, d3 \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d2}, [%[dst]]! 
\n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vzip.8 d0, d1 \n\t" ++ "vst1.8 {d0}, [%[dst]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vzip.8 d2, d3 \n\t" ++ "vst1.8 {d2}, [%[dst]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [src]"+r"(src), ++ [src2]"+r"(src2), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src) ++ : // Clobbers (the NEON scratch registers written above must be declared) ++ "cc", "memory", "d0", "d1", "d2", "d3" ++ ); ++ break; ++ } ++} ++ ++static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst) ++{ ++ uint8_t *dst2 = dst + stride_dst; // dst2 takes the odd rows, dst the even rows ++ stride_dst <<= 1; // so each pointer steps two rows per store ++ switch (pixel_shift) ++ { ++ case 2: // 32-bit elements ++ __asm__ volatile ( ++ "subs %[height], #4 \n\t" ++ "vld1.32 {q0}, [%[src]]! \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.32 {q1}, [%[src]]! \n\t" ++ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d1[0]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {d1[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "beq 3f \n\t" ++ "vld1.32 {q0}, [%[src]]! 
\n\t" ++ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d3[0]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.32 {d3[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d1[0]}, [%[dst]] \n\t" ++ "vst1.32 {d1[1]}, [%[dst2]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.32 {d3[0]}, [%[dst]] \n\t" ++ "vst1.32 {d3[1]}, [%[dst2]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [dst]"+r"(dst), ++ [dst2]"+r"(dst2), ++ [src]"+r"(src), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers (q0/q1 are loaded above, i.e. d0-d3) ++ "cc", "memory", "d0", "d1", "d2", "d3" ++ ); ++ break; ++ case 1: // 16-bit elements ++ __asm__ volatile ( ++ "subs %[height], #4 \n\t" ++ "vld1.16 {d0}, [%[src]]! \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.16 {d2}, [%[src]]! \n\t" ++ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d0[2]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "beq 3f \n\t" ++ "vld1.16 {d0}, [%[src]]! 
\n\t" ++ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d2[2]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #4 \n\t" ++ "vst1.16 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d0[2]}, [%[dst]] \n\t" ++ "vst1.16 {d0[3]}, [%[dst2]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.16 {d2[2]}, [%[dst]] \n\t" ++ "vst1.16 {d2[3]}, [%[dst2]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [dst]"+r"(dst), ++ [dst2]"+r"(dst2), ++ [src]"+r"(src), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers (d0 and d2 are loaded above) ++ "cc", "memory", "d0", "d2" ++ ); ++ break; ++ default: // 8-bit elements ++ __asm__ volatile ( ++ "subs %[height], #8 \n\t" ++ "vld1.8 {d0}, [%[src]]! \n\t" ++ "beq 2f \n\t" ++ "1: \n\t" ++ "vld1.8 {d2}, [%[src]]! \n\t" ++ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[6]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d0[7]}, [%[dst2]], %[stride_dst] \n\t" ++ "beq 3f \n\t" ++ "vld1.8 {d0}, [%[src]]! 
\n\t" ++ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[6]}, [%[dst]], %[stride_dst] \n\t" ++ "subs %[height], #8 \n\t" ++ "vst1.8 {d2[7]}, [%[dst2]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "2: \n\t" ++ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d0[6]}, [%[dst]] \n\t" ++ "vst1.8 {d0[7]}, [%[dst2]] \n\t" ++ "b 4f \n\t" ++ "3: \n\t" ++ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t" ++ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t" ++ "vst1.8 {d2[6]}, [%[dst]] \n\t" ++ "vst1.8 {d2[7]}, [%[dst2]] \n\t" ++ "4: \n\t" ++ : // Outputs ++ [dst]"+r"(dst), ++ [dst2]"+r"(dst2), ++ [src]"+r"(src), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers (d0 and d2 are loaded above) ++ "cc", "memory", "d0", "d2" ++ ); ++ break; ++ } ++} ++ ++static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src) ++{ ++ int x, y; ++ switch (pixel_shift) ++ { ++ case 2: ++ __asm__ volatile ( ++ "ldr %[x], [%[src]], %[stride_src] \n\t" ++ "ldr %[y], [%[src]], %[stride_src] \n\t" ++ "str %[x], [%[dst]], %[stride_dst] \n\t" ++ "sub %[height], #2 \n\t" ++ "1: \n\t" ++ "ldr %[x], [%[src]], %[stride_src] \n\t" ++ "str %[y], [%[dst]], %[stride_dst] \n\t" ++ "ldr %[y], [%[src]], 
%[stride_src] \n\t" ++ "subs %[height], #2 \n\t" ++ "str %[x], [%[dst]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "str %[y], [%[dst]] \n\t" ++ : // Outputs ++ [x]"=&r"(x), ++ [y]"=&r"(y), ++ [src]"+r"(src), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src), ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ case 1: ++ __asm__ volatile ( ++ "ldrh %[x], [%[src]], %[stride_src] \n\t" ++ "ldrh %[y], [%[src]], %[stride_src] \n\t" ++ "strh %[x], [%[dst]], %[stride_dst] \n\t" ++ "sub %[height], #2 \n\t" ++ "1: \n\t" ++ "ldrh %[x], [%[src]], %[stride_src] \n\t" ++ "strh %[y], [%[dst]], %[stride_dst] \n\t" ++ "ldrh %[y], [%[src]], %[stride_src] \n\t" ++ "subs %[height], #2 \n\t" ++ "strh %[x], [%[dst]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "strh %[y], [%[dst]] \n\t" ++ : // Outputs ++ [x]"=&r"(x), ++ [y]"=&r"(y), ++ [src]"+r"(src), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src), ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ default: ++ __asm__ volatile ( ++ "ldrb %[x], [%[src]], %[stride_src] \n\t" ++ "ldrb %[y], [%[src]], %[stride_src] \n\t" ++ "strb %[x], [%[dst]], %[stride_dst] \n\t" ++ "sub %[height], #2 \n\t" ++ "1: \n\t" ++ "ldrb %[x], [%[src]], %[stride_src] \n\t" ++ "strb %[y], [%[dst]], %[stride_dst] \n\t" ++ "ldrb %[y], [%[src]], %[stride_src] \n\t" ++ "subs %[height], #2 \n\t" ++ "strb %[x], [%[dst]], %[stride_dst] \n\t" ++ "bne 1b \n\t" ++ "strb %[y], [%[dst]] \n\t" ++ : // Outputs ++ [x]"=&r"(x), ++ [y]"=&r"(y), ++ [src]"+r"(src), ++ [dst]"+r"(dst), ++ [height]"+r"(height) ++ : // Inputs ++ [stride_src]"r"(stride_src), ++ [stride_dst]"r"(stride_dst) ++ : // Clobbers ++ "cc", "memory" ++ ); ++ break; ++ } ++} ++ ++#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon ++static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst, 
ptrdiff_t stride_src) ++{ ++ if (stride_dst == 1 << pixel_shift) ++ ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src); ++ else if (stride_src == 1 << pixel_shift) ++ ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst); ++ else ++ ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src); ++} ++ ++#endif /* HAVE_NEON_INLINE */ ++ ++#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */ +diff --git a/libavcodec/arm/rpi_hevc_mv_arm.h b/libavcodec/arm/rpi_hevc_mv_arm.h +new file mode 100644 +index 0000000000..325c26a49b +--- /dev/null ++++ b/libavcodec/arm/rpi_hevc_mv_arm.h +@@ -0,0 +1,93 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Written by John Cox, Ben Avison ++*/ ++ ++#ifndef AVCODEC_ARM_RPI_HEVC_MV_H ++#define AVCODEC_ARM_RPI_HEVC_MV_H ++ ++#if HAVE_ARMV6T2_INLINE ++static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b) ++{ ++ MvXY r; ++ __asm__ ( ++ "sadd16 %[r], %[a], %[b] \n\t" ++ : [r]"=r"(r) ++ : [a]"r"(a), ++ [b]"r"(b) ++ : ++ ); ++ return r; ++} ++#define mvxy_add mvxy_add_arm ++#endif ++ ++#if HAVE_ARMV6T2_INLINE ++#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV)) ++static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb) ++{ ++ int t; ++ __asm__ ( ++ "ssat %[td], #8, %[td] \n\t" ++ "ssat %[tb], #8, %[tb] \n\t" ++ "eor %[t], %[td], %[td], asr #31 \n\t" ++ "adds %[t], %[t], %[td], lsr #31 \n\t" ++ "asr %[t], #1 \n\t" ++ "add %[t], #0x4000 \n\t" ++ "it ne \n\t" ++ "sdivne %[t], %[t], %[td] \n\t" ++ "mov %[td], #32 \n\t" ++ "smlabb %[td], %[t], %[tb], %[td] \n\t" ++ "ssat %[td], #13, %[td], asr #6 \n\t" ++ "mov %[tb], #127 \n\t" ++ "smlatb %[t], %[xy], %[td], %[tb] \n\t" ++ "smlabb %[tb], %[xy], %[td], %[tb] \n\t" ++// This takes the sign of x & y for rounding at the "wrong" point ++// (i.e. 
after adding 127) but for the range of values (-1,-127) ++// where it does the wrong thing you get the right answer (0) anyway ++ "add %[t], %[t], %[t], lsr #31 \n\t" ++ "add %[xy], %[tb], %[tb], lsr #31 \n\t" ++ "ssat %[t], #16, %[t], asr #8 \n\t" ++ "ssat %[xy], #16, %[xy], asr #8 \n\t" ++ "pkhbt %[xy], %[xy], %[t], lsl #16 \n\t" ++ : ++ [t]"=&r"(t), ++ [xy]"+r"(xy), ++ [td]"+r"(td), ++ [tb]"+r"(tb) ++ : ++ : ++ "cc" ++ ); ++ return xy; ++} ++#define mv_scale_xy mv_scale_xy_arm ++#endif ++#endif ++ ++#endif // AVCODEC_ARM_RPI_HEVC_MV_H ++ +diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h +new file mode 100644 +index 0000000000..62b9326532 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_arm.h +@@ -0,0 +1,26 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_RPI_HEVCDSP_ARM_H ++#define AVCODEC_ARM_RPI_HEVCDSP_ARM_H ++ ++#include "libavcodec/rpi_hevcdsp.h" ++ ++void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth); // rpi-specific guard above: must not share a guard macro with another header ++ ++#endif /* AVCODEC_ARM_RPI_HEVCDSP_ARM_H */ +diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S +new file mode 100644 +index 0000000000..88a3b4e5e7 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S +@@ -0,0 +1,1634 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8 ++ vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a ++ vsubl.u8 q1, \P1a, \Q1a @ p1a - q1a ++ vdup.16 d4, r2 @ tc bytes (low 16 bits of r2 in every lane) ++ \I1 ++ vshl.i16 q0, #2 @ (q0a - p0a) * 4 ++ \I2 ++ vadd.i16 q0, q1 @ ((q0a - p0a) * 4) + p1a - q1a ++ \I3 ++ vmovl.u8 q2, d4 @ tc widened to 16 bits ++ \I4 ++ vneg.s16 q1, q2 @ -tc ++ \I5 ++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 ++ \I6 ++ \I7 ++ \I8 ++ vmin.s16 q0, q2 ++ vmovl.u8 q2, \Q0a ++ vmax.s16 q0, q1 @ delta0, now limited to the range -tc .. tc ++ vaddw.u8 q1, q0, \P0a @ p0a + delta0 ++ vsub.i16 q0, q2, q0 @ q0a - delta0 ++ vqmovun.s16 \P0a, q1 ++ vqmovun.s16 \Q0a, q0 ++.endm ++ ++ ++.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7 ++ vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a ++ lsr r12, r2, #16 ++ vsubl.u8 q1, \Q0b, \P0b @ q0b - p0b ++ vsubl.u8 q2, \P1a, \Q1a @ p1a - q1a ++ vsubl.u8 q3, \P1b, \Q1b @ p1b - q1b ++ vshl.i16 q0, #2 @ (q0a - p0a) * 4 ++ vshl.i16 q1, #2 @ (q0b - p0b) * 4 ++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a ++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b ++ vdup.16 d4, r2 @ tc0a, tc0b ++ vdup.16 d6, r12 @ tc1a, tc1b ++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 ++ \I1 ++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 ++ \I2 ++ vmovl.u8 q2, d4 @ tc0a, tc0b ++ \I3 ++ vmovl.u8 q3, d6 @ tc1a, tc1b ++ \I4 ++ vmin.s16 q0, q2 ++ \I5 ++ vneg.s16 q2, q2 @ -tc0a, -tc0b ++ \I6 ++ vmin.s16 q1, q3 ++ \I7 ++ vneg.s16 q3, q3 @ -tc1a, -tc1b ++ vmax.s16 q0, q2 @ delta0a ++ vmovl.u8 q2, \Q0a ++ vmax.s16 q1, q3 @ delta0b ++ vaddw.u8 q3, q0, \P0a @ p0a + delta0a ++ vsub.i16 q0, q2, q0 @ q0a - delta0a ++ vmovl.u8 q2, \Q0b ++ vsub.i16 q2, q1 @ q0b - delta0b ++ vaddw.u8 q1, \P0b @ p0b + delta0b ++ vqmovun.s16 \Q0a, q0 ++ vqmovun.s16 \P0a, q3 ++ vqmovun.s16 \Q0b, q2 ++ vqmovun.s16 \P0b, q1 ++.endm ++ ++ ++@ 
Preserves r12 ++@ Clobbers r2 ++@ P0a et al all contain UVUVUVUV ++@ r2 (tc4) contains ++@ [0..7] tc U a ++@ [8..15] tc V a ++ ++.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8 ++ vsub.i16 q0, \Q0a, \P0a ++ vsub.i16 q1, \P1a, \Q1a ++ vdup.16 d4, r2 ++ \I1 ++ vshl.i16 q0, #2 ++ \I2 ++ vadd.i16 q0, q1 ++ \I3 ++ vshll.u8 q2, d4, #\bit_depth - 8 ++ \I4 ++ vneg.s16 q1, q2 ++ \I5 ++ vrshr.s16 q0, #3 ++ \I6 ++ \I7 ++ \I8 ++ vmin.s16 q0, q2 ++ vmov.i16 q2, #0 ++ vmax.s16 q0, q1 ++ vadd.i16 \P0a, q0 ++ vsub.i16 \Q0a, q0 ++ vmov.i16 q1, #(1 << \bit_depth) - 1 ++ vmax.s16 \P0a, q2 ++ vmax.s16 \Q0a, q2 ++ vmin.s16 \P0a, q1 ++ vmin.s16 \Q0a, q1 ++.endm ++ ++@ Clobbers r2, r12 ++@ P0a et al all contain UVUVUVUV ++@ r2 (tc4) contains ++@ [0..7] tc U a ++@ [8..15] tc V a ++@ [16..23] tc U b ++@ [24..31] tc V b ++ ++.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7 ++ vsub.i16 q0, \Q0a, \P0a @ q0a - p0a ++ lsr r12, r2, #16 ++ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b ++ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a ++ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b ++ vshl.i16 q0, #2 @ (q0a - p0a) * 4 ++ vshl.i16 q1, #2 @ (q0b - p0b) * 4 ++ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a ++ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b ++ vdup.16 d4, r2 @ tc0a, tc0b ++ vdup.16 d6, r12 @ tc1a, tc1b ++ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3 ++ \I1 ++ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3 ++ \I2 ++ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b ++ \I3 ++ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b ++ \I4 ++ vmin.s16 q0, q2 ++ \I5 ++ vneg.s16 q2, q2 @ -tc0a, -tc0b ++ \I6 ++ vmin.s16 q1, q3 ++ \I7 ++ vneg.s16 q3, q3 @ -tc1a, -tc1b ++ vmax.s16 q0, q2 @ delta0a ++ vadd.i16 \P0a, q0 @ p0a + delta0a ++ vsub.i16 \Q0a, q0 @ q0a - delta0a ++ vmax.s16 q1, q3 @ delta0b ++ vadd.i16 \P0b, q1 @ p0b + delta0b ++ vsub.i16 \Q0b, q1 @ q0b - delta0b ++ vmov.i16 q2, #0 ++ 
vmov.i16 q3, #(1 << \bit_depth) - 1 ++ vmax.s16 \P0a, q2 ++ vmax.s16 \Q0a, q2 ++ vmax.s16 \P0b, q2 ++ vmax.s16 \Q0b, q2 ++ vmin.s16 \P0a, q3 ++ vmin.s16 \Q0a, q3 ++ vmin.s16 \P0b, q3 ++ vmin.s16 \Q0b, q3 ++.endm ++ ++ ++ ++@ uint8_t *_no_p, [sp+0] ++@ uint8_t *_no_q) [sp+4] ++ ++.macro hevc_loop_filter_luma_start ++ ldr r12, [r3] ++ ldr r3, [r3, #4] ++ orrs r3, r12, r3, lsl #16 ++ it eq ++ bxeq lr ++ push {r4-r10,lr} @ 32 bytes ++ ldrd r4, r5, [sp, #32] @ &_no_p ++ ldrb r4, [r4] ++ ldrb r5, [r5] ++ movs r10, r4 ++ it ne ++ movne r10, #1 ++ cmp r5, #0 ++ it ne ++ orrne r10, #2 ++.endm ++ ++@ Input: ++@ r2 beta (raw: needs shift for bitdepth > 8) ++@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8) ++@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8) ++@ ++@ Input & output ++@ 8-bit: d16-d23 (Q3,Q2,Q1,Q0,P0,P1,P2,P3) ++@ 16-bit: q8-q15 ++@ ++@ r1 -r1 ++@ r10 b1->C, b0->N (r10 junk) ++@ ++@ Junks: ++@ r5, r6, r7, r8, r9 ++ ++.macro m_filter_luma bit_depth, Q11, Q15 ++.if \bit_depth == 8 ++ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2 ++ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1 ++ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0 ++ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0 ++ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1 ++ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2 ++.endif ++ vadd.i16 q0, q9, \Q11 @ P2 + P0 ++.if \bit_depth > 8 ++ lsl r3, r3, #(\bit_depth - 8) ++.endif ++ vadd.i16 q1, q14, q12 @ Q2 + Q0 ++.if \bit_depth > 8 ++ lsl r2, r2, #(\bit_depth - 8) ++.endif ++ vsub.i16 q0, q10 @ P2 - P1 + P0 ++ lsr r5, r3, #16 ++ vsub.i16 q1, q13 @ Q2 - Q1 + Q0 ++.if \bit_depth == 8 ++ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3 ++ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... 
Q3 ++.endif ++ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0) ++ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0) ++ vmov.i64 q2, #0xffffffff0000 ++ vbic q0, q2 @ only dp0(') and dp3(') ++ vbic q1, q2 @ only dq0(') and dq3(') ++ vsra.u64 q0, #16 ++ vsra.u64 q1, #16 ++ vdup.16 q3, r2 @ beta ++ vdup.16 d14, r3 @ tC[0] ++ vdup.16 d15, r5 @ tC[1] ++ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0) ++ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0 ++ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0 ++ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0 ++ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0) ++ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0) ++ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3 ++ vshl.s16 q6, q7, #2 @ tC[] * 4 ++ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1 ++ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta) ++ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block) ++ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3 ++ cmp r7, #0 ++ beq .Lbypasswrite ++ ++ vcgt.s16 q5, q6, q5 @ if < tc25 ++ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3) ++ vand q4, q5 ++ vbic d8, d4 ++ vbic d9, d4 ++ vshr.s16 q3, #2 @ beta_2 = beta >> 2 ++ vsra.u64 q4, #16 ++ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1 ++ vshl.i16 q7, #1 @ tc2 = tC[] << 1 ++ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc ++ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half ++ vand d6, d8 @ && beta_2 tests, prime in ms half ++ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3 ++ vneg.s16 q6, q7 @ -tc2 ++ vmovn.i32 d8, q3 ++ vshrn.i32 d6, q3, #16 ++ vand d6, d8 ++ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3 ++ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block) ++ vadd.i16 q0, \Q11, q12 @ p0 + q0 ++ ands r9, r7, r8 ++ beq 1f ++ ++ vadd.i16 q2, q0, q10 @ p1 + p0 + q0 ++ vadd.i16 q3, q0, q13 @ p0 + q0 + q1 ++ lsr r3, r9, #16 ++ 
vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping) ++ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping) ++ vadd.i16 q0, q8, q9 @ p3 + p2 ++ vadd.i16 q5, \Q15, q14 @ q2 + q3 ++ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 ++ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2 ++ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2 ++ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3 ++ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping) ++ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping) ++ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping) ++ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping) ++ vrshr.s16 q0, #3 @ scale, with rounding ++ vrshr.s16 q5, #3 ++ vrshr.s16 q1, #2 ++ vrshr.s16 q4, #2 ++ vrshr.s16 q2, #3 ++ vrshr.s16 q3, #3 ++ vsub.i16 q0, q9 @ find difference ++ vsub.i16 q5, q14 ++ vsub.i16 q1, q10 ++ vsub.i16 q4, q13 ++ vsub.i16 q2, \Q11 ++ vsub.i16 q3, q12 ++ vmax.s16 q0, q6 @ clip difference to -tc2 .. 
tc2 ++ vmax.s16 q5, q6 ++ vmax.s16 q1, q6 ++ vmax.s16 q4, q6 ++ vmax.s16 q2, q6 ++ vmax.s16 q3, q6 ++ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure ++ vdup.16 d13, r3 ++ vmin.s16 q0, q7 ++ vmin.s16 q5, q7 ++ vmin.s16 q1, q7 ++ vmin.s16 q4, q7 ++ vmin.s16 q2, q7 ++ vmin.s16 q3, q7 ++ vadd.i16 q0, q9 @ apply difference ++ vadd.i16 q5, q14 ++ vadd.i16 q1, q10 ++ vadd.i16 q4, q13 ++ vadd.i16 q2, \Q11 ++ vadd.i16 q3, q12 ++ vbit q9, q0, q6 @ apply filtered values according to mask ++ vbit q14, q5, q6 ++ vbit q10, q1, q6 ++ vbit q13, q4, q6 ++ vbit \Q11, q2, q6 ++ vbit q12, q3, q6 ++ vneg.s16 q6, q7 @ restore -tc2 ++ ++1: ++ bics r9, r7, r8 ++ beq 2f ++ ++ vsub.i16 q0, q12, \Q11 @ q0 - p0 ++ vsub.i16 q1, q13, q10 @ q1 - p1 ++ lsr r3, r9, #16 ++ vshl.i16 q2, q0, #3 ++ lsr r7, r5, #16 ++ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0) ++ lsr r8, r6, #16 ++ vshl.i16 q2, q1, #1 ++ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1) ++ vshr.s16 q6, #1 @ -tc = -tc2 >> 1 ++ vsub.i16 q5, q3, q4 ++ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1 ++ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1 ++ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4 ++ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1 ++ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1 ++ vmax.s16 q6, q5 @ ++ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1 ++ vdup.16 q0, r2 @ beta ++ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc] ++ vshr.s16 q4, #1 @ tc_2 = tc >> 1 ++ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1 ++ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1 ++ vshr.s16 q2, q0, #1 @ beta >> 1 ++ vadd.i16 q2, q0 @ beta + (beta >> 1) ++ vneg.s16 q0, q4 @ -tc_2 ++ vabs.s16 q5, q5 @ abs(original delta0) ++ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3 ++ vmax.s16 q1, q0 ++ vmax.s16 q3, q0 ++ vshl.s16 q0, q7, #2 @ 8 * tc ++ vadd.i16 q7, q0 @ 10 * tc ++ vdup.16 d0, r9 ++ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering ++ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, 
-tc_2, tc_2) ++ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2) ++ vdup.16 d8, r5 @ dp0 + dp3 ++ vdup.16 d9, r7 @ dp0' + dp3' ++ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0)) ++ vdup.16 d10, r6 @ dq0 + dq3 ++ vdup.16 d11, r8 @ dq0' + dq3' ++ vand q7, q0 @ AND block and line masks ++ vcgt.s16 q4, q2, q4 @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1) ++ vadd.i16 q0, q1, q10 @ p1 + deltap1 ++ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1) ++ vadd.i16 q3, q3, q13 @ q1 + deltaq1 ++ vadd.i16 q1, \Q11, q6 @ p0 + delta0 ++ vsub.i16 q2, q12, q6 @ q0 - delta0 ++ vand q4, q7 @ AND nd_p test with block/line masks ++ vand q5, q7 @ AND nd_q test with block/line masks ++ vbit q10, q0, q4 ++ vbit \Q11, q1, q7 ++ vbit q12, q2, q7 ++ vbit q13, q3, q5 ++ ++2: ++.if \bit_depth == 8 ++ vmovn.i16 d16, q8 ++ vmovn.i16 d23, \Q15 ++ neg r1, r1 ++ vqmovun.s16 d17, q9 ++ vqmovun.s16 d18, q10 ++ vqmovun.s16 d19, \Q11 ++ lsls r10, #31 @ r10 bit 0 -> N flag, bit 1 -> C flag ++ vqmovun.s16 d20, q12 ++ vqmovun.s16 d21, q13 ++ vqmovun.s16 d22, q14 ++.else ++ vmov.i16 q0, #0 ++ vmov.i16 q1, #(1 << \bit_depth - 1) ++ @ q8 & q15 should be unaltered and so don't require clipping ++ neg r1, r1 ++ vmax.s16 q9, q0 ++ vmax.s16 q10, q0 ++ vmax.s16 q11, q0 ++ vmax.s16 q12, q0 ++ vmax.s16 q13, q0 ++ vmax.s16 q14, q0 ++ lsls r10, #31 @ r10 bit 0 -> N flag, bit 1 -> C flag ++ vmin.s16 q9, q1 ++ vmin.s16 q10, q1 ++ vmin.s16 q11, q1 ++ vmin.s16 q12, q1 ++ vmin.s16 q13, q1 ++ vmin.s16 q14, q1 ++.endif ++ bx lr ++.endm ++ ++function hevc_loop_filter_luma_body ++ m_filter_luma 8, q15, q11 @ 8-bit: macro arg Q11 is register q15 and arg Q15 is register q11 ++endfunc ++ ++@ void ff_hevc_rpi_v_loop_filter_luma_neon_8( ++@ uint8_t *_pix, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int _beta, [r2] ++@ int *_tc, [r3] ++@ uint8_t *_no_p, [sp+0] ++@ uint8_t *_no_q) [sp+4] ++ ++function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1 ++ hevc_loop_filter_luma_start ++ ++ sub r4, r0, #4 @ r4 = pix - 4 bytes ++ b .Lv_loop_luma_common ++endfunc ++ ++@ void ff_hevc_rpi_v_loop_filter_luma2_neon_8( 
++@ uint8_t * pix_r, [r0] ++@ ptrdiff_t _stride, [r1] ++@ int _beta, [r2] ++@ int tc2, [r3] ++@ int no_f, [sp+0] ++@ uint8_t * pix_l) [sp+4] ++ ++function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1 ++ cmp r3, #0 ++ it eq ++ bxeq lr ++ push {r4-r10,lr} @ 32 bytes ++ ldr r4, [sp, #36] ++ ldr r10, [sp, #32] ++ ++.Lv_loop_luma_common: ++ vpush {d8-d15} ++ ++ @ It's slightly faster to do unlaned loads and transpose in the ++ @ 8-bit case, even though it needs more instructions, because ++ @ VLD4.8 is a really slow way to read from memory. ++ vld1.32 {d16[0]}, [r4:32], r1 ++ vld1.32 {d20[0]}, [r0:32], r1 ++ vld1.32 {d16[1]}, [r4:32], r1 ++ vld1.32 {d20[1]}, [r0:32], r1 ++ vld1.32 {d17[0]}, [r4:32], r1 ++ vld1.32 {d21[0]}, [r0:32], r1 ++ vld1.32 {d17[1]}, [r4:32], r1 ++ vld1.32 {d21[1]}, [r0:32], r1 ++ vld1.32 {d18[0]}, [r4:32], r1 ++ vld1.32 {d22[0]}, [r0:32], r1 ++ vld1.32 {d18[1]}, [r4:32], r1 ++ vld1.32 {d22[1]}, [r0:32], r1 ++ vld1.32 {d19[0]}, [r4:32], r1 ++ vld1.32 {d23[0]}, [r0:32], r1 ++ vld1.32 {d19[1]}, [r4:32] ++ vld1.32 {d23[1]}, [r0:32] ++ vuzp.16 q8, q9 ++ vuzp.16 q10, q11 ++ vuzp.8 q8, q9 ++ vuzp.8 q10, q11 ++ vswp d17, d18 ++ vswp d21, d22 ++ ++ bl hevc_loop_filter_luma_body ++ ++ add r6, r4, r1 ++ add r2, r0, r1 ++ lsl r1, #1 ++ ++ vpop {d8-d15} ++ ++ @ no_p[1] ++ bmi 1f ++ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1 ++ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1 ++ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1 ++ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1 ++ ++ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1 ++ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1 ++ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1 ++ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32] ++1: ++ @ no_q[1] ++ bcs 1f ++ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1 ++ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1 ++ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1 ++ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1 ++ ++ vst4.8 
{d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1 ++ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1 ++ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1 ++ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32] ++1: ++ pop {r4-r10,pc} ++ ++.Lbypasswrite: ++ vpop {d8-d15} ++ pop {r4-r10,pc} ++endfunc ++ ++.macro m_filter_v_luma_16 bit_depth ++ vpush {d8-d15} ++ ++ @ Uses slightly fewer instructions to do laned loads than unlaned ++ @ and transpose. This also means that we can use the same code for ++ @ both split & unsplit deblock ++ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1 ++ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1 ++ ++ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 ++ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ ++ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1 ++ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1 ++ ++ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 ++ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ ++ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1 ++ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1 ++ ++ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ ++ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1 ++ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1 ++ ++ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4] ++ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ add r6, r4, r1 ++ add r2, r0, r1 ++ lsl r1, #1 ++ ++ vpop {d8-d15} ++ ++ @ p[1] ++ bmi 1f ++ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1 ++ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1 ++ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1 ++ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1 ++ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1 ++ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1 ++ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1 ++ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6] ++1: ++ @ q[1] ++ bcs 1f ++ 
vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1 ++ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1 ++ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1 ++ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1 ++ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1 ++ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1 ++ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1 ++ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2] ++1: ++ pop {r4-r10,pc} ++.endm ++ ++ ++ ++ ++@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0] ++@ ptrdiff_t stride, [r1] ++@ int beta, [r2] ++@ int32_t *tc, [r3] ++@ uint8_t *no_p, sp[0] ++@ uint8_t *no_q); sp[4] ++@ ++@ Src should always be on 8 byte boundary & all in the same slice ++ ++function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1 ++ hevc_loop_filter_luma_start ++ b .Lh_loop_filter_luma_common_8 ++endfunc ++ ++function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1 ++ cmp r3, #0 ++ it eq ++ bxeq lr ++ push {r4-r10,lr} @ 32 bytes ++ ldr r10, [sp, #32] ++ ++.Lh_loop_filter_luma_common_8: ++ sub r4, r0, r1, lsl #2 ++ add r0, r4, r1 ++ lsl r1, #1 ++ vpush {d8-d15} ++ ++ vld1.8 {d16}, [r4], r1 ++ vld1.8 {d17}, [r0], r1 ++ vld1.8 {d18}, [r4], r1 ++ vld1.8 {d19}, [r0], r1 ++ vld1.8 {d20}, [r4], r1 ++ vld1.8 {d21}, [r0], r1 ++ vld1.8 {d22}, [r4] ++ vld1.8 {d23}, [r0] ++ ++ bl hevc_loop_filter_luma_body ++ ++ add r0, r0, r1, lsl #1 ++ add r2, r4, r1, lsl #1 ++ add r6, r4, r1, asr #1 ++ vpop {d8-d15} ++ ++ @ Q2-Q0 ++ bcs 1f ++ vst1.8 {d22}, [r4], r1 ++ vst1.8 {d21}, [r6] ++ vst1.8 {d20}, [r4] ++1: ++ @ P0-P2 ++ bmi 1f ++ vst1.8 {d19}, [r0], r1 ++ vst1.8 {d18}, [r2] ++ vst1.8 {d17}, [r0] ++1: ++ pop {r4-r10,pc} ++endfunc ++ ++ ++.macro m_filter_h_luma_16 bit_depth ++ sub r4, r0, r1, lsl #2 ++ add r0, r4, r1 ++ lsl r1, #1 ++ vpush {d8-d15} ++ ++ vld1.16 { q8}, [r4], r1 ++ vld1.16 { q9}, [r0], r1 ++ vld1.16 {q10}, [r4], r1 ++ vld1.16 {q11}, [r0], r1 ++ vld1.16 {q12}, [r4], r1 ++ vld1.16 {q13}, [r0], r1 ++ vld1.16 {q14}, [r4] ++ vld1.16
{q15}, [r0] ++ ++ bl hevc_loop_filter_luma_body_\bit_depth ++ ++ add r0, r0, r1, lsl #1 ++ add r2, r4, r1, lsl #1 ++ add r6, r4, r1, asr #1 ++ vpop {d8-d15} ++ ++ @ Q2-Q0 ++ bcs 1f ++ vst1.16 {q14}, [r4], r1 ++ vst1.16 {q13}, [r6] ++ vst1.16 {q12}, [r4] ++1: ++ bmi 1f ++ vst1.16 {q11}, [r0], r1 ++ vst1.16 {q10}, [r2] ++ vst1.16 { q9}, [r0] ++1: ++ pop {r4-r10,pc} ++.endm ++ ++ ++@ void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no_f ++@ 0 tl P0 ++@ 1 tr P1 ++@ 2 bl Q0 ++@ 3 br Q1 ++@ ++@ Probably not worth having the P/Qa only special case in this direction ++@ Given layout we won't save any memory reads or avoid any cache dirtying ++@ We would save a bit of computation but I expect the partials to be less ++@ common in the H direction than V due to how we arrange deblock. ++ ++function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1 ++ sub r12, r0, r1 ++ cmp r2, #0 ++ it eq ++ bxeq lr ++ vld1.8 {d26,d27}, [r0] ++ lsl r1, #1 ++ sub r0, r1 ++ vld1.8 {d18,d19}, [r12], r1 ++ vld1.8 {d16,d17}, [r0], r1 ++ vld1.8 {d28,d29}, [r12] ++ ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \ ++ "sub r12, r0, r1, asr #1" ++ ++ lsls r3, #29 @ b2 -> N, b3 -> C ++ it pl ++ vstrpl d26, [r0, #0] ++ it cc ++ vstrcc d27, [r0, #8] ++ lsls r3, #2 @ b0 -> N, b1 -> C ++ it pl ++ vstrpl d18, [r12, #0] ++ it cc ++ vstrcc d19, [r12, #8] ++ bx lr ++ ++endfunc ++ ++ ++@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ unsigned int no_f); // r3 ++@ ++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1] ++@ ++@ Macro here actual function near bottom ++ ++.macro m_filter_h_uv_16 bit_depth ++ sub r12, r0, r1 ++ cmp r2, #0 ++ it eq ++ bxeq lr ++ vld1.16 {q12, q13}, [r0] ++ lsl r1, #1 ++ sub r0, r1 ++ vld1.16 {q10, q11}, [r12], r1 ++ vld1.16 {q8, q9 }, [r0], r1 ++ vld1.16 {q14, q15}, [r12] ++ ++
hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \ ++ "sub r12, r0, r1, asr #1", \ ++ "cmp r3, #0" ++ ++ bne 1f ++ vst1.16 {q10, q11}, [r12] ++ vst1.16 {q12, q13}, [r0] ++ bx lr ++ ++ @ At least one no_f bit is set ++ @ Which means we need to break this apart in an ugly fashion ++1: ++ lsls r3, #29 @ b2 -> N, b3 -> C ++ itt pl ++ vstrpl d24, [r0, #0] ++ vstrpl d25, [r0, #8] ++ itt cc ++ vstrcc d26, [r0, #16] ++ vstrcc d27, [r0, #24] ++ lsls r3, #2 @ b0 -> N, b1 -> C ++ itt pl ++ vstrpl d20, [r12, #0] ++ vstrpl d21, [r12, #8] ++ itt cc ++ vstrcc d22, [r12, #16] ++ vstrcc d23, [r12, #24] ++ bx lr ++.endm ++ ++ ++@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++@ no_f: ++@ 0 tl P0 ++@ 1 tr Q0 ++@ 2 bl P1 ++@ 3 br Q1 ++ ++function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1 ++ cmp r2, #0 ++ it eq ++ bxeq lr ++ push {lr} ++ vld2.16 {d16[0], d18[0]}, [r3], r1 ++ vld2.16 {d20[0], d22[0]}, [r0], r1 ++ ++ cmp r2, #0x10000 ++ vld2.16 {d16[1], d18[1]}, [r3], r1 ++ vld2.16 {d20[1], d22[1]}, [r0], r1 ++ ++ vld2.16 {d16[2], d18[2]}, [r3], r1 ++ vld2.16 {d20[2], d22[2]}, [r0], r1 ++ ++ vld2.16 {d16[3], d18[3]}, [r3], r1 ++ vld2.16 {d20[3], d22[3]}, [r0], r1 ++ blo 10f ++ ++ vld2.16 {d17[0], d19[0]}, [r3], r1 ++ vld2.16 {d21[0], d23[0]}, [r0], r1 ++ ++ sub ip, r0, r3 ++ vld2.16 {d17[1], d19[1]}, [r3], r1 ++ vld2.16 {d21[1], d23[1]}, [r0], r1 ++ ++ cmp ip, #4 ++ vld2.16 {d17[2], d19[2]}, [r3], r1 ++ vld2.16 {d21[2], d23[2]}, [r0], r1 ++ ++ vld2.16 {d17[3], d19[3]}, [r3] ++ vld2.16 {d21[3], d23[3]}, [r0] ++ ++ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \ ++ "ldr lr, [sp, #4]", \ ++ "neg r1, r1", \ ++ "it eq; cmpeq lr, #0", \ ++ "add r3, #2", \ ++ "add ip, r3, r1", \ ++ "add r2, r0, r1", \ ++ "lsl r1, #1" ++ ++ bne 1f ++ ++@ Much/most of the time r0 == r3 + 4 and no_f == 0 ++@ so it is worth 
having this special case ++ vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b ++ vst2.16 {d19[2], d21[2]}, [ip], r1 ++ vst2.16 {d19[1], d21[1]}, [r3], r1 ++ vst2.16 {d19[0], d21[0]}, [ip], r1 ++ vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a ++ vst2.16 {d18[2], d20[2]}, [ip], r1 ++ vst2.16 {d18[1], d20[1]}, [r3] ++ vst2.16 {d18[0], d20[0]}, [ip] ++ pop {pc} ++ ++@ Either split or partial ++1: ++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ ittt cs ++ addcs r0, r0, r1, lsl #1 ++ addcs r2, r2, r1, lsl #1 ++ bcs 1f ++ @ Q0b ++ vst1.16 {d21[3]}, [r0], r1 ++ vst1.16 {d21[2]}, [r2], r1 ++ vst1.16 {d21[1]}, [r0], r1 ++ vst1.16 {d21[0]}, [r2], r1 ++1: ++ ittt mi ++ addmi r3, r3, r1, lsl #1 ++ addmi ip, ip, r1, lsl #1 ++ bmi 1f ++ @ P0b ++ vst1.16 {d19[3]}, [r3], r1 ++ vst1.16 {d19[2]}, [ip], r1 ++ vst1.16 {d19[1]}, [r3], r1 ++ vst1.16 {d19[0]}, [ip], r1 ++1: ++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 ++ bcs 1f ++ @ Q0a ++ vst1.16 {d20[3]}, [r0], r1 ++ vst1.16 {d20[2]}, [r2], r1 ++ vst1.16 {d20[1]}, [r0] ++ vst1.16 {d20[0]}, [r2] ++1: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.16 {d18[3]}, [r3], r1 ++ vst1.16 {d18[2]}, [ip], r1 ++ vst1.16 {d18[1]}, [r3] ++ vst1.16 {d18[0]}, [ip] ++ pop {pc} ++ ++@ Single lump (rather than double) ++10: ++ @ As we have post inced r0/r3 in the load the easiest thing to do is ++ @ to subtract and write forwards, rather than backwards (as above) ++ @ b0 (P0a) -> N, b1 (Q0a) -> C ++ ++ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \ ++ "ldr lr, [sp, #4]", \ ++ "add r3, #2", \ ++ "sub r0, r0, r1, lsl #2", \ ++ "sub r3, r3, r1, lsl #2", \ ++ "lsls lr, #31", \ ++ "add r2, r0, r1", \ ++ "add ip, r3, r1", \ ++ "lsl r1, #1" ++ ++ bcs 3f ++ @ Q0a ++ vst1.16 {d20[0]}, [r0], r1 ++ vst1.16 {d20[1]}, [r2], r1 ++ vst1.16 {d20[2]}, [r0] ++ vst1.16 {d20[3]}, [r2] ++3: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.16 {d18[0]}, [r3], r1 ++ vst1.16 {d18[1]}, [ip], r1 ++ vst1.16 {d18[2]}, [r3] ++ vst1.16 {d18[3]}, [ip] ++ pop 
{pc} ++ ++endfunc ++ ++ ++@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0 ++@ unsigned int stride, // r1 ++@ uint32_t tc4, // r2 ++@ uint8_t * src_l, // r3 ++@ unsigned int no_f); // sp[0] ++@ ++ ++@ no_f ++@ 0 tl P0a ++@ 1 tr Q0a ++@ 2 bl P0b ++@ 3 br Q0b ++ ++@ P1: q8, q12 ++@ P0: q9, q13 ++@ Q0: q10, q14 ++@ Q1: q11, q15 ++ ++.macro m_filter_v_uv2_16 bit_depth ++ cmp r2, #0 ++ it eq ++ bxeq lr ++ push {lr} ++ vld2.32 {d16[0], d18[0]}, [r3], r1 ++ vld2.32 {d20[0], d22[0]}, [r0], r1 ++ ++ cmp r2, #0x10000 ++ vld2.32 {d16[1], d18[1]}, [r3], r1 ++ vld2.32 {d20[1], d22[1]}, [r0], r1 ++ ++ vld2.32 {d17[0], d19[0]}, [r3], r1 ++ vld2.32 {d21[0], d23[0]}, [r0], r1 ++ ++ vld2.32 {d17[1], d19[1]}, [r3], r1 ++ vld2.32 {d21[1], d23[1]}, [r0], r1 ++ blo 10f ++ ++ vld2.32 {d24[0], d26[0]}, [r3], r1 ++ vld2.32 {d28[0], d30[0]}, [r0], r1 ++ ++ sub ip, r0, r3 ++ vld2.32 {d24[1], d26[1]}, [r3], r1 ++ vld2.32 {d28[1], d30[1]}, [r0], r1 ++ ++ cmp ip, #8 ++ vld2.32 {d25[0], d27[0]}, [r3], r1 ++ vld2.32 {d29[0], d31[0]}, [r0], r1 ++ ++ vld2.32 {d25[1], d27[1]}, [r3] ++ vld2.32 {d29[1], d31[1]}, [r0] ++ ++ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \ ++ "ldr lr, [sp, #4]", \ ++ "neg r1, r1", \ ++ "it eq; cmpeq lr, #0", \ ++ "add r3, #4", \ ++ "add ip, r3, r1", \ ++ "add r2, r0, r1", \ ++ "lsl r1, #1" ++ ++ bne 1f ++ ++@ Much/most of the time r0 == r3 + 8 and no_f == 0 ++@ so it is worth having this special case ++ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b ++ vst2.32 {d27[0], d29[0]}, [ip], r1 ++ vst2.32 {d26[1], d28[1]}, [r3], r1 ++ vst2.32 {d26[0], d28[0]}, [ip], r1 ++ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a ++ vst2.32 {d19[0], d21[0]}, [ip], r1 ++ vst2.32 {d18[1], d20[1]}, [r3] ++ vst2.32 {d18[0], d20[0]}, [ip] ++ pop {pc} ++ ++@ Either split or partial ++1: ++ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29 ++ ittt cs ++ addcs r0, r0, r1, lsl #1 ++ addcs r2, r2, r1, lsl #1 ++ bcs 1f ++ 
@ Q0b ++ vst1.32 {d29[1]}, [r0], r1 ++ vst1.32 {d29[0]}, [r2], r1 ++ vst1.32 {d28[1]}, [r0], r1 ++ vst1.32 {d28[0]}, [r2], r1 ++1: ++ ittt mi ++ addmi r3, r3, r1, lsl #1 ++ addmi ip, ip, r1, lsl #1 ++ bmi 1f ++ @ P0b ++ vst1.32 {d27[1]}, [r3], r1 ++ vst1.32 {d27[0]}, [ip], r1 ++ vst1.32 {d26[1]}, [r3], r1 ++ vst1.32 {d26[0]}, [ip], r1 ++1: ++ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31 ++ bcs 1f ++ @ Q0a ++ vst1.32 {d21[1]}, [r0], r1 ++ vst1.32 {d21[0]}, [r2], r1 ++ vst1.32 {d20[1]}, [r0] ++ vst1.32 {d20[0]}, [r2] ++1: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.32 {d19[1]}, [r3], r1 ++ vst1.32 {d19[0]}, [ip], r1 ++ vst1.32 {d18[1]}, [r3] ++ vst1.32 {d18[0]}, [ip] ++ pop {pc} ++ ++@ Single lump (rather than double) ++10: ++ @ As we have post inced r0/r3 in the load the easiest thing to do is ++ @ to subtract and write forwards, rather than backwards (as above) ++ @ b0 (P0a) -> N, b1 (Q0a) -> C ++ ++ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \ ++ "ldr lr, [sp, #4]", \ ++ "add r3, #4", \ ++ "sub r0, r0, r1, lsl #2", \ ++ "sub r3, r3, r1, lsl #2", \ ++ "lsls lr, #31", \ ++ "add r2, r0, r1", \ ++ "add ip, r3, r1", \ ++ "lsl r1, #1" ++ ++ bcs 3f ++ @ Q0a ++ vst1.32 {d20[0]}, [r0], r1 ++ vst1.32 {d20[1]}, [r2], r1 ++ vst1.32 {d21[0]}, [r0] ++ vst1.32 {d21[1]}, [r2] ++3: ++ it mi ++ popmi {pc} ++ @ P0a ++ vst1.32 {d18[0]}, [r3], r1 ++ vst1.32 {d18[1]}, [ip], r1 ++ vst1.32 {d19[0]}, [r3] ++ vst1.32 {d19[1]}, [ip] ++ pop {pc} ++.endm ++ ++ ++@ The NEON version is faster under ideal circumstances (i.e. 
everything in L1) ++@ But in real world testing it is ~20% slower, presumably due to code size ++ ++#if 0 // NEON version ++ ++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, ++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ * int in_inc0, int in_inc1) ++ */ ++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 ++ mov ip, sp ++ push {a1-a3,v1-v8,lr} ++ ldm ip, {v1-v6} ++ cmp a1, #2 ++ bls 2f ++ vpush {d8-d13} ++ sub v5, v5, #10 ++ sub v6, v6, #10 ++1: ++ vld2.32 {d0[0], d2[0]}, [a3]! ++ vld2.32 {d4[0], d6[0]}, [a4]! ++ vmov.u8 q12, #0 ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb v8, [a3], #1 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[0]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[0]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d16[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d20[0]}, [ip] ++ vld1.32 {d18[0]}, [v8] ++ vld1.32 {d22[0]}, [lr] ++ ++ vld2.32 {d0[1], d2[1]}, [a3]! ++ vld2.32 {d4[1], d6[1]}, [a4]! ++ ldrb a2, [a3], #1 ++ vmov.u16 d12, #1 ++ ldrb ip, [a4], #1 ++ vmov.u16 d13, #2 ++ ldrb v8, [a3], #1 ++ vmov.u16 d27, #4 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[2]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[2]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d16[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d20[1]}, [ip] ++ vld1.32 {d18[1]}, [v8] ++ vld1.32 {d22[1]}, [lr] ++ ++ vld2.32 {d1[0], d3[0]}, [a3]! ++ vld2.32 {d5[0], d7[0]}, [a4]! ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb lr, [a4], #1 ++ ldrb v8, [a3], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[4]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[4]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d17[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d21[0]}, [ip] ++ vld1.32 {d19[0]}, [v8] ++ vld1.32 {d23[0]}, [lr] ++ ++ vld2.32 {d1[1], d3[1]}, [a3]! 
++ vld2.32 {d5[1], d7[1]}, [a4]! ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb v8, [a3], #1 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d24[6]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d25[6]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d17[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d21[1]}, [ip] ++ vld1.32 {d19[1]}, [v8] ++ vld1.32 {d23[1]}, [lr] ++ ++ @ So now we have: ++ @ q0.32[i] = curr[i].mv[0] ++ @ q1.32[i] = curr[i].mv[1] ++ @ q2.32[i] = neigh[i].mv[0] ++ @ q3.32[i] = neigh[i].mv[1] ++ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]] ++ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]] ++ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] ++ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] ++ @ d24.16[i] = curr[i].pred_flag ++ @ d25.16[i] = neigh[i].pred_flag ++ ++ vtst.16 d28, d24, d12 ++ vtst.16 d29, d24, d13 ++ vadd.i16 d8, d24, d12 ++ vadd.i16 d9, d25, d12 ++ vtst.16 d30, d25, d12 ++ vtst.16 d31, d25, d13 ++ veor d26, d8, d9 ++ ldr lr, [sp, 6*8 + 1*4] ++ vmovl.s16 q4, d28 ++ vmovl.s16 q5, d29 ++ teq lr, #1 ++ vmovl.s16 q14, d30 ++ it ne ++ lslne v1, lr, #1 ++ vmovl.s16 q15, d31 ++ it ne ++ rsbne v2, v1, #32 ++ vbif q0, q1, q4 ++ vbif q2, q3, q14 ++ vbif q1, q0, q5 ++ vbif q3, q2, q15 ++ vabd.s16 q12, q0, q2 ++ vabd.s16 q2, q1 ++ vabd.s16 q0, q3 ++ vabd.s16 q1, q3 ++ vbif q8, q9, q4 ++ vbif q10, q11, q14 ++ vbif q9, q8, q5 ++ vbif q11, q10, q15 ++ vclt.u16 d6, d24, d27 ++ vclt.u16 d8, d2, d27 ++ vclt.u16 d7, d25, d27 ++ vclt.u16 d9, d3, d27 ++ vclt.u16 d2, d0, d27 ++ vclt.u16 d0, d4, d27 ++ vclt.u16 d3, d1, d27 ++ vclt.u16 d1, d5, d27 ++ vceq.i32 q12, q10, q8 ++ vceq.i32 q10, q9 ++ vceq.i32 q8, q11 ++ vceq.i32 q9, q11 ++ vshrn.i32 d6, q3, #8 ++ vshrn.i32 d7, q4, #8 ++ vshrn.i32 d8, q1, #8 ++ vshrn.i32 d9, q0, #8 ++ vmovn.i32 d4, q12 ++ vmovn.i32 d2, q10 ++ vmovn.i32 d3, q8 ++ vmovn.i32 d5, q9 ++ vand q2, q3 ++ vrev16.8 q3, q3 ++ vand q2, q3 ++ vand q1, q4 ++ vrev16.8 q4, q4 ++ vand q1, q4 ++ vand d4, d5 ++ vand d2, d3 ++ vbic d0, d12, d4 ++ 
vshr.u16 d26, #2 ++ vbic d0, d2 ++ vmov.i16 d1, #0x5555 ++ vorr d0, d26 ++ bne 10f ++ ++ @ Merge results into result word, no duplicates ++ vmov a2, s0 ++ vmov v8, s1 ++ vmov.u16 ip, d0[1] ++ vmov.u16 lr, d0[3] ++ lsl a2, #30 ++ lsl v8, #30 ++ lsl ip, #30 ++ lsl lr, #30 ++ orr a2, ip, a2, lsr #2 ++ orr v8, lr, v8, lsr #2 ++ orr a2, v8, a2, lsr #4 ++ subs a1, #4 ++ orr v7, a2, v7, lsr #8 ++ bhi 1b ++ ++ mov a1, #32 ++ ldr a3, [sp, #6*8] ++ vpop {d8-d13} ++ sub a1, a1, a3, lsl #1 ++ mov a1, v7, lsr a1 ++ pop {a2-a4,v1-v8,pc} ++10: ++ @ Merge results into result word, with duplicates ++ vmul.i16 d0, d1 ++ vmov a2, s0 ++ vmov v8, s1 ++ vmov.u16 ip, d0[1] ++ vmov.u16 lr, d0[3] ++ lsl a2, v2 ++ subs a1, #4 ++ lsl v8, v2 ++ lsl ip, v2 ++ lsl lr, v2 ++ ldr v2, [sp, #6*8 + 12*4 + 1*4] ++T lsr a2, v1 ++T orr a2, ip, a2 ++A orr a2, ip, a2, lsr v1 ++ lsl ip, v1, #1 ++T lsr v8, v1 ++T orr v8, lr, v8 ++A orr v8, lr, v8, lsr v1 ++ lsl lr, v1, #2 ++T lsr a2, ip ++T orr a2, v8, a2 ++A orr a2, v8, a2, lsr ip ++ ldr v1, [sp, #6*8 + 12*4] ++T lsr v7, lr ++T orr v7, a2, v7 ++A orr v7, a2, v7, lsr lr ++ bhi 1b ++ ++ mov a1, #32 ++ ldrd a3, a4, [sp, #6*8] ++ vpop {d8-d13} ++ mls a1, a3, a4, a1 ++ mls a1, a3, a4, a1 ++ mov a1, v7, lsr a1 ++ pop {a2-a4,v1-v8,pc} ++ ++ ++2: ++ sub v5, v5, #10 ++ sub v6, v6, #10 ++ vmov.u8 d16, #0 ++ blo 3f ++ vld2.32 {d0[0], d1[0]}, [a3]! ++ vld2.32 {d2[0], d3[0]}, [a4]! ++ ldrb a2, [a3], #1 ++ ldrb ip, [a4], #1 ++ ldrb lr, [a4], #1 ++ ldrb v8, [a3], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d16[0]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d16[4]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d4[0]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d5[0]}, [ip] ++ vld1.32 {d6[0]}, [v8] ++ vld1.32 {d7[0]}, [lr] ++ ++3: ++ vld2.32 {d0[1], d1[1]}, [a3]! ++ vld2.32 {d2[1], d3[1]}, [a4]! 
++ ldrb a2, [a3], #1 ++ vmov.u16 d17, #1 ++ ldrb ip, [a4], #1 ++ vmov.u16 d18, #2 ++ ldrb v8, [a3], #1 ++ vmov.u16 d19, #4 ++ ldrb lr, [a4], #1 ++ add a2, v1, a2, lsl #2 ++ vld1.8 {d16[2]}, [a3], v5 ++ add ip, v3, ip, lsl #2 ++ vld1.8 {d16[6]}, [a4], v6 ++ add v8, v2, v8, lsl #2 ++ vld1.32 {d4[1]}, [a2] ++ add lr, v4, lr, lsl #2 ++ vld1.32 {d5[1]}, [ip] ++ vld1.32 {d6[1]}, [v8] ++ vld1.32 {d7[1]}, [lr] ++ ++ @ So now we have: ++ @ d0.32[i] = curr[i].mv[0] ++ @ d1.32[i] = curr[i].mv[1] ++ @ d2.32[i] = neigh[i].mv[0] ++ @ d3.32[i] = neigh[i].mv[1] ++ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]] ++ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]] ++ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]] ++ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]] ++ @ d16.16[i] = curr[i].pred_flag ++ @ d16.16[2+i] = neigh[i].pred_flag ++ ++ vtst.16 d20, d16, d17 ++ vtst.16 d22, d16, d18 ++ vadd.i16 d30, d16, d17 ++ vswp d2, d3 ++ ldr lr, [sp, #1*4] ++ vmovl.s16 q10, d20 ++ teq lr, #1 ++ vmovl.s16 q11, d22 ++ it ne ++ lslne v1, lr, #1 ++ vbif d0, d1, d20 ++ vbif d4, d6, d20 ++ vbif d3, d2, d21 ++ vbif d5, d7, d21 ++ vbif d1, d0, d22 ++ vbif d6, d4, d22 ++ vbif d2, d3, d23 ++ vbif d7, d5, d23 ++ vshr.u16 d30, #2 ++ vabd.s16 d24, d0, d3 ++ vabd.s16 d25, d1, d2 ++ vabd.s16 q0, q0, q1 ++ vceq.i32 d2, d4, d5 ++ vceq.i32 d20, d5, d6 ++ vceq.i32 d21, d4, d7 ++ vceq.i32 d3, d6, d7 ++ vclt.u16 d6, d24, d19 ++ vclt.u16 d7, d25, d19 ++ vclt.u16 d22, d1, d19 ++ vclt.u16 d23, d0, d19 ++ vshrn.i32 d6, q3, #8 ++ vmovn.i32 d2, q1 ++ vshrn.i32 d7, q11, #8 ++ vmovn.i32 d3, q10 ++ vand q0, q3, q1 ++ it ne ++ rsbne v2, v1, #32 ++ vrev16.8 q3, q3 ++ vand q0, q3 ++ vsra.u64 d30, #32 ++ vshr.u64 q1, q0, #32 ++ vand q0, q1 ++ vbic d0, d17, d0 ++ vand d30, d30, d17 ++ vbic d0, d1 ++ vmov.i16 d1, #0x5555 ++ vorr d0, d30 ++ bne 10f ++ ++ @ Construct result word, no duplicates ++ cmp a1, #2 ++ vmov.u16 a1, d0[1] ++ vmov.u16 a2, d0[0] ++ it eq ++ orreq a1, a2, a1, lsl #2 ++ pop {a2-a4,v1-v8,pc} ++10: ++ @ Construct result word, 
with duplicates ++ cmp a1, #2 ++ vmul.i16 d0, d1 ++ vmov a2, s0 ++ vmov.u16 a1, d0[1] ++ lsl a2, #16 ++ pkhbt a1, a1, a1, lsl #16 ++ lsr a2, v2 ++ lsr a1, v2 ++T itt eq ++T lsleq a1, v1 ++T orreq a1, a2, a1 ++A orreq a1, a2, a1, lsl v1 ++ pop {a2-a4,v1-v8,pc} ++endfunc ++ ++ ++ ++#else // non-NEON version ++ ++ ++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, ++ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ * int in_inc0, int in_inc1) ++ */ ++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1 ++ add ip, sp, #4*4 ++ push {a2-a4,v1-v8,lr} ++ mov v6, #32 ++1: ldmdb ip, {v1-v4} ++ ldrsb v5, [a3, #8] @ curr->ref_idx ++ ldrsb v8, [a3, #9] ++ ldrsb ip, [a4, #8] @ neigh->ref_idx ++ ldrsb lr, [a4, #9] ++ ldr v1, [v1, v5, lsl #2] ++ ldrb v5, [a3, #10] @ curr->pred_flag ++ ldr v2, [v2, v8, lsl #2] ++ ldrb v8, [a4, #10] @ neigh->pred_flag ++ ldr v3, [v3, ip, lsl #2] ++ ldr v4, [v4, lr, lsl #2] ++ teq v5, #3 ++ beq 20f ++ teq v8, #3 ++ beq 90f ++ ++ tst v5, #1 ++ itee ne ++ ldrne v5, [a3, #0] @ curr->mv[0] ++ moveq v1, v2 ++ ldreq v5, [a3, #4] @ curr->mv[1] ++ tst v8, #1 ++ itee ne ++ ldrne v8, [a4, #0] @ neigh->mv[0] ++ moveq v3, v4 ++ ldreq v8, [a4, #4] @ neigh->mv[1] ++ teq v1, v3 ++ bne 10f ++ ldr lr, =0xFFFCFFFC ++ ssub16 ip, v8, v5 ++ ssub16 v5, v5, v8 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ @ drop through ++10: it ne ++ movne v5, #1<<30 ++11: ++ sub v6, v6, #2 ++T mov v7, v7, lsr #2 ++ subs a2, a2, #1 ++A orr v7, v5, v7, lsr #2 ++T orr v7, v5, v7 ++ bhi 11b ++ ++ ldrd v3, v4, [sp, #16*4] ++ ldr a2, [sp] ++ add ip, sp, #16*4 ++ subs a1, a1, #1 ++ add a3, a3, v3 ++ add a4, a4, v4 ++ bhi 1b ++ mov a1, v7, lsr v6 ++ pop {a2-a4,v1-v8,pc} ++ ++20: teq v8, #3 ++ bne 10b ++ ++ teq v1, v3 ++ it eq ++ teqeq v2, v4 ++ bne 40f ++ teq v1, v2 ++ bne 30f ++ ++ ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv ++ ldr lr,
=0xFFFCFFFC ++ ssub16 ip, v3, v1 ++ ssub16 v5, v1, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ bne 25f ++ ssub16 ip, v4, v2 ++ ssub16 v5, v2, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ beq 11b ++ @ drop through ++25: ssub16 ip, v4, v1 ++ ssub16 v5, v1, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ bne 10b ++ ssub16 ip, v3, v2 ++ ssub16 v5, v2, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ b 10b ++ ++30: ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv ++ ldr lr, =0xFFFCFFFC ++ ssub16 ip, v3, v1 ++ ssub16 v5, v1, v3 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ bne 10b ++ ssub16 ip, v4, v2 ++ ssub16 v5, v2, v4 ++ sel v5, v5, ip ++ ands v5, v5, lr ++ b 10b ++ ++40: teq v1, v4 ++ ite eq ++ teqeq v2, v3 ++ bne 10b ++ ++ ldrd v1, v2, [a3] @ curr->mv ++ ldrd v3, v4, [a4] @ neigh->mv ++ ldr lr, =0xFFFCFFFC ++ b 25b ++ ++90: ++ mov v5, #1<<30 ++ b 11b ++endfunc ++ ++ ++#endif ++ ++ ++@ ============================================================================= ++@ ++@ 10 bit ++ ++function hevc_loop_filter_luma_body_10 ++ m_filter_luma 10, q11, q15 ++endfunc ++ ++function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ b .Lh_loop_luma_common_10 ++endfunc ++ ++function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1 ++ cmp r3, #0 ++ it eq ++ bxeq lr ++ push {r4-r10,lr} @ 32 bytes ++ ldr r10, [sp, #32] ++.Lh_loop_luma_common_10: ++ m_filter_h_luma_16 10 ++endfunc ++ ++function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1 ++ hevc_loop_filter_luma_start ++ sub r4, r0, #8 ++ b .Lv_loop_luma_common_10 ++endfunc ++ ++function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1 ++ cmp r3, #0 ++ it eq ++ bxeq lr ++ push {r4-r10,lr} @ 32 bytes ++ ldr r4, [sp, #36] ++ ldr r10, [sp, #32] ++ ++.Lv_loop_luma_common_10: ++ m_filter_v_luma_16 10 ++endfunc ++ ++function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1 ++ m_filter_h_uv_16 10 ++endfunc ++ ++function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1 ++ m_filter_v_uv2_16 10 ++endfunc ++ 
+diff --git a/libavcodec/arm/rpi_hevcdsp_idct_neon.S b/libavcodec/arm/rpi_hevcdsp_idct_neon.S +new file mode 100644 +index 0000000000..7ed5c7dc52 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S +@@ -0,0 +1,184 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++/* uses registers q8 - q13 for temp values */ ++.macro tr4_luma_shift shift ++ vaddl.s16 q8, d28, d30 // c0 = src0 + src2 ++ vaddl.s16 q9, d30, d31 // c1 = src2 + src3 ++ vsubl.s16 q10, d28, d31 // c2 = src0 - src3 ++ vaddl.s16 q11, d28, d31 // src0 + src3 ++ ++ vmul.i32 q12, q8, d1[0] // 29 * c0 ++ vmul.i32 q13, q10, d2[0] // 55 * c2 ++ vmul.i32 q8, q8, d2[0] // 55 * c0 ++ vmull.s16 q14, d29, d0[0] // c3 = 74 * src1 ++ ++ vsubw.s16 q11, q11, d30 // src0 - src2 + src3 ++ vmla.i32 q12, q9, d2[0] // 29 * c0 + 55 * c1 ++ vmls.i32 q13, q9, d1[0] // 55 * c2 - 29 * c1 ++ vmla.i32 q8, q10, d1[0] // 55 * c0 + 29 * c2 ++ ++ vmul.i32 q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3) ++ vadd.i32 q12, q12, q14 // dst0 = 29 * c0 + 55 * c1 + c3 ++ vadd.i32 q13, q13, q14 // dst1 = 55 * c2 - 29 * c1 + c3 
++ vsub.i32 q8, q8, q14 // dst3 = 55 * c0 + 29 * c2 - c3 ++ ++ vqrshrn.s32 d28, q12, \shift ++ vqrshrn.s32 d29, q13, \shift ++ vqrshrn.s32 d30, q11, \shift ++ vqrshrn.s32 d31, q8, \shift ++.endm ++ ++/* uses registers q8 - q11 for temp values */ ++.macro tr4_shift shift ++ vmull.s16 q9, d29, d0[0] // 83 * src1 ++ vmull.s16 q8, d29, d0[1] // 36 * src1 ++ vshll.s16 q14, d28, #6 // 64 * src0 ++ vshll.s16 q10, d30, #6 // 64 * src2 ++ vmlal.s16 q9, d31, d0[1] // 83 * src1 + 36 * src3 o0 ++ vmlsl.s16 q8, d31, d0[0] // 36 * src1 - 83 * src3 o1 ++ vadd.s32 q11, q14, q10 // 64 * (src0 + src2) e0 ++ vsub.s32 q10, q14, q10 // 64 * (src0 - src2) e1 ++ vadd.s32 q14, q11, q9 // e0 + o0 ++ vadd.s32 q15, q10, q8 // e1 + o1 ++ vsub.s32 q8, q10, q8 // e1 - o1 ++ vsub.s32 q9, q11, q9 // e0 - o0 ++ ++ vqrshrn.s32 d28, q14, \shift ++ vqrshrn.s32 d29, q15, \shift ++ vqrshrn.s32 d30, q8, \shift ++ vqrshrn.s32 d31, q9, \shift ++.endm ++ ++.macro tr8_process d0, d1, d2, d3, d4, d5, d6, d7, \ ++ tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \ ++ tmp1, /* Q reg which doesn't alias with d7 or d0 */ \ ++ shift, I1, I2, I3 ++ ++ vmull.s16 q4, \d1, d1[1] // 89 * src1 ++ \I1 ++ vmull.s16 q5, \d1, d1[0] // 75 * src1 ++ \I2 ++ vmull.s16 q6, \d1, d1[3] // 50 * src1 ++ \I3 ++ vmull.s16 q7, \d1, d1[2] // 18 * src1 ++ vmlal.s16 q4, \d3, d1[0] // 75 * src3 ++ vmlsl.s16 q5, \d3, d1[2] //-18 * src3 ++ vmlsl.s16 q6, \d3, d1[1] //-89 * src3 ++ vmlsl.s16 q7, \d3, d1[3] //-50 * src3 ++ ++ // tr4 ++ vmull.s16 q1, \d2, d0[0] // 83 * src(1*2) ++ vmull.s16 q2, \d2, d0[1] // 36 * src(1*2) ++ ++ vmlal.s16 q4, \d5, d1[3] // 50 * src5 ++ vmlsl.s16 q5, \d5, d1[1] //-89 * src5 ++ vmlal.s16 q6, \d5, d1[2] // 18 * src5 ++ vmlal.s16 q7, \d5, d1[0] // 75 * src5 ++ ++ vshll.s16 q3, \d0, #6 // 64 * src(0*2) ++ vshll.s16 \tmp0, \d4, #6 // 64 * src(2*2) ++ vmlal.s16 q1, \d6, d0[1] // 83 * src(1*2) + 36 * src(3*2) o0 ++ vmlsl.s16 q2, \d6, d0[0] // 36 * src(1*2) - 83 * src(3*2) o1 ++ vadd.i32 \tmp1, q3, \tmp0 // 64 * 
(src(0*2) + src(2*2)) e0 ++ vsub.i32 \tmp0, q3, \tmp0 // 64 * (src(0*2) - src(2*2)) e1 ++ ++ vmlal.s16 q4, \d7, d1[2] // 18 * src7 ++ vmlsl.s16 q5, \d7, d1[3] //-50 * src7 ++ vmlal.s16 q6, \d7, d1[0] // 75 * src7 ++ vmlsl.s16 q7, \d7, d1[1] //-89 * src7 ++ ++ vsub.i32 q3, \tmp1, q1 // e0 - o0 ++ vadd.i32 \tmp1, \tmp1, q1 // e0 + o0 ++ vadd.i32 q1, \tmp0, q2 // e1 + o1 ++ vsub.i32 q2, \tmp0, q2 // e1 - o1 ++ ++ vadd.i32 \tmp0, \tmp1, q4 // e_8[0] + o_8[0], dst[0] ++ vsub.i32 q4, \tmp1, q4 // e_8[0] - o_8[0], dst[7] ++ vsub.i32 \tmp1, q3, q7 // e_8[3] - o_8[3], dst[4] ++ vadd.i32 q7, q3, q7 // e_8[3] + o_8[3], dst[3] ++ vadd.i32 q3, q1, q5 // e_8[1] + o_8[1], dst[1] ++ vsub.i32 q5, q1, q5 // e_8[1] - o_8[1], dst[6] ++ vsub.i32 q1, q2, q6 // e_8[2] - o_8[2], dst[5] ++ vadd.i32 q6, q2, q6 // e_8[2] + o_8[2], dst[2] ++ vqrshrn.s32 \d0, \tmp0, #\shift ++ vqrshrn.s32 \d4, \tmp1, #\shift ++ vqrshrn.s32 \d1, q3, #\shift ++ vqrshrn.s32 \d5, q1, #\shift ++ vqrshrn.s32 \d2, q6, #\shift ++ vqrshrn.s32 \d6, q5, #\shift ++ vqrshrn.s32 \d3, q7, #\shift ++ vqrshrn.s32 \d7, q4, #\shift ++.endm ++ ++.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3 ++ vld1.16 {\d0}, [r0 :64], r3 ++ vld1.16 {\d1}, [r2 :64], r3 ++ vld1.16 {\d2}, [r0 :64], r3 ++ vld1.16 {\d3}, [r2 :64], r3 ++ vld1.16 {\d4}, [r0 :64], r3 ++ vld1.16 {\d5}, [r2 :64], r3 ++ vld1.16 {\d6}, [r0 :64], r3 ++ vld1.16 {\d7}, [r2 :64], r3 ++ ++ tr8_process \ ++ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \ ++ \q01, \q23, 7, "\I1", "\I2", "\I3" ++.endm ++ ++.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift ++ tr8_process \ ++ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \ ++ \q01, \q23, \shift ++ ++ vzip.16 \d0, \d4 ++ vzip.16 \d1, \d5 ++ vzip.16 \d2, \d6 ++ vzip.16 \d3, \d7 ++ vst4.16 {\d0-\d3}, [r0 :128], r3 ++ vst4.16 {\d4-\d7}, [r2 :128], r3 ++.endm ++ ++#define BIT_DEPTH 8 ++#include "rpi_hevc_idct_fn_neon.S" ++ ++.text ++ ++.align 4 ++tr4f: ++.word 0x00240053 // 36 and d1[0] = 83 ++.word 0x00000000 
++tr8f: ++.word 0x0059004b // d1[1] = 89, d1[0] = 75 ++.word 0x00320012 // d1[3] = 50, d1[2] = 18 ++tr16: ++.word 0x005a0057 // d2[1] = 90, d2[0] = 87 ++.word 0x00500046 // d2[3] = 80, d2[2] = 70 ++.word 0x0039002b // d3[1] = 57, d3[0] = 43 (assuming tr16 is loaded into d2-d3) ++.word 0x00190009 // d3[3] = 25, d3[2] = 9 ++ ++#undef BIT_DEPTH ++#define BIT_DEPTH 10 ++#include "rpi_hevc_idct_fn_neon.S" ++ +diff --git a/libavcodec/arm/rpi_hevcdsp_init_arm.c b/libavcodec/arm/rpi_hevcdsp_init_arm.c +new file mode 100644 +index 0000000000..109fa98c29 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c +@@ -0,0 +1,32 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/attributes.h" ++#include "libavutil/arm/cpu.h" ++#include "libavcodec/rpi_hevcdsp.h" ++#include "rpi_hevcdsp_arm.h" ++/* ARM entry point for the RPi HEVC DSP table: queries the runtime CPU flags and forwards c and bit_depth to the NEON initialiser only when NEON is reported; otherwise c is left untouched. */ ++av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth) ++{ ++ int cpu_flags = av_get_cpu_flags(); ++ ++ if (have_neon(cpu_flags)) ++ ff_hevcdsp_rpi_init_neon(c, bit_depth); ++} +diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c +new file mode 100644 +index 0000000000..9294ab8010 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c +@@ -0,0 +1,467 @@ ++/* ++ * Copyright (c) 2014 Seppo Tomperi ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "config.h" ++#include "libavutil/attributes.h" ++#include "libavutil/arm/cpu.h" ++#include "libavcodec/rpi_hevcdsp.h" ++#include "rpi_hevcdsp_arm.h" ++#include "libavcodec/avcodec.h" ++#include "libavcodec/bit_depth_template.c" ++ ++// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but ++// have been removed from head as we never use them. ++ ++void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ ++void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q); ++ ++void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, ++ uint8_t * _pix_l); ++void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++ ++void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, 
unsigned int no_f, ++ uint8_t * _pix_l); ++void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4, ++ unsigned int no_f); ++void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f); ++ ++void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs); ++void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs); ++ ++void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit); ++void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs); ++void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs); ++ ++void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++ ++void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ ++void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t 
*_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs, ++ ptrdiff_t stride); ++ ++void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc); ++ ++ ++void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t 
*_dst, ptrdiff_t stride, int32_t dc); ++ ++ ++void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_v); ++void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride, int dc_u); ++void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual, ++ ptrdiff_t stride); ++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc); ++ ++void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++ ++void 
ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height); ++ ++void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++ ++void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height); ++ ++void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int 
sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++ ++void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height); ++ ++void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void 
ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ ++void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height); ++ ++ ++uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh, ++ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ int in_inc0, int in_inc1); ++void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height); ++ ++/* 48-wide SAO wrappers: there is no 48-wide NEON kernel, so each wrapper runs the 32-wide kernel and then the 16-wide kernel on the remainder. _dst/_src are byte pointers: the second call starts 32 bytes in for the _8 variants and 64 bytes in for the _10 variants. The incoming width argument is ignored; fixed widths 32 and 16 are passed instead. */ ++static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); ++} ++static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); ++ 
ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); /* +64 bytes: presumably 32 samples of 2 bytes each at 10-bit */ ++} ++/* Band-filter counterparts of the 48-wide edge wrappers: same 32 + 16 split and same byte offsets. */ ++static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); ++ ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++} ++/* 24-wide wrappers, only built when SAO_FILTER_N == 6: a 16-wide call followed by an 8-wide call on the remainder; the width argument is again ignored. */ ++#if SAO_FILTER_N == 6 ++static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); ++} ++static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height) ++{ ++ ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); ++ ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); ++} ++ ++static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, 
sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++} ++static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, int width, int height) ++{ ++ ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); ++ ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); ++} ++/* Chroma (_c_) 24-wide wrappers: the second call is offset by 32 bytes (_8) or 64 bytes (_10), i.e. twice the non-chroma offset for the same width - presumably because U and V samples are interleaved; TODO confirm against the .S kernels. */ ++static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v, ++ int eo, int width, int height) ++{ ++ ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); ++ ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); ++} ++ ++static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ 
ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); ++} ++static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); ++ ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, ++ sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); ++} ++#endif ++ ++ ++ ++#if RPI_HEVC_SAO_BUF_STRIDE != 160 ++#error SAO edge src stride not 160 - value used in .S ++#endif ++/* Installs the NEON implementations into *c. One table is installed for bit_depth == 8 and one for bit_depth == 10; for any other depth only the two depth-independent pointers set at the end (hevc_deblocking_boundary_strengths, cpy_blk) are assigned. Slots [3] and [4] of sao_band_filter_c / sao_edge_filter_c are not assigned here. */ ++av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth) ++{ ++ if (bit_depth == 8) { ++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8; ++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8; ++ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8; ++ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8; ++ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_8; ++ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_8; ++ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_8; ++ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_8; ++ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_8; ++ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_8; ++ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_8; ++ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_8; ++ c->add_residual[1] = 
ff_hevc_rpi_add_residual_8x8_neon_8; ++ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_8; ++ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_8; ++ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_8; ++ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_8; ++ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_8; ++ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_8; ++ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_8; ++ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_8; ++ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_8; ++ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_8; ++ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_8; ++ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_8; ++ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_8; ++ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_8; ++ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_8; ++ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8; ++ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8; ++ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8; ++ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_8; ++ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_8; ++ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_8; ++ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_8; ++ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_8; ++ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_8; ++ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_8; ++ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_8; ++ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_8; ++ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_8; ++ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_8; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_8; ++ 
c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_8; ++#endif ++ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_8; ++ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_8; ++ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_8; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_8; ++ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_8; ++ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_8; ++ ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_8; ++ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_8; ++#endif ++ } ++ else if (bit_depth == 10) { ++ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_10; ++ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_10; ++ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_10; ++ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_10; ++ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_10; ++ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_10; ++ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_10; ++ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_10; ++ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_10; ++ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_10; ++ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_10; ++ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_10; ++ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_10; ++ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_10; ++ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_10; ++ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_10; ++ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_10; ++ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_10; ++ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_10; ++ 
c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_10; ++ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_10; ++ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_10; ++ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_10; ++ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_10; ++ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_10; ++ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_10; ++ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_10; ++ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_10; ++ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_10; ++ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10; ++ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10; ++ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10; ++ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_10; ++ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_10; ++ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_10; ++ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_10; ++ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_10; ++ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_10; ++ ++ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_10; ++ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_10; ++ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_10; ++ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_10; ++ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_10; ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_10; ++ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_10; ++#endif ++ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_10; ++ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_10; ++ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_10; ++ ++ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_10; ++ c->sao_edge_filter_c[1] = 
ff_hevc_rpi_sao_edge_c_16_neon_10; ++ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_10; ++ ++#if SAO_FILTER_N == 6 ++ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_10; ++ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_10; ++#endif ++ } ++ ++ assert(offsetof(HEVCRpiMvField, mv) == 0); ++ assert(offsetof(HEVCRpiMvField, ref_idx) == 8); ++ assert(offsetof(HEVCRpiMvField, pred_flag) == 10); ++ c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; ++ c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon; ++} +diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S +new file mode 100644 +index 0000000000..93876d14c0 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S +@@ -0,0 +1,620 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++ .arch_extension mp @ enable PLDW ++ ++#define BIT_DEPTH 10 ++/* clip16_4: clamps each of the four registers Q0-Q3 to [Q_MIN, Q_MAX] as signed 16-bit lanes (vmax against Q_MIN, then vmin against Q_MAX) */ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ add_residual4x4( ++@ uint16_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1 ++ add ip, r0, r2 ++ vld1.16 {q10, q11}, [r1] ++ lsl r2, #1 ++ vld1.16 {d0}, [r0 :64], r2 ++ vld1.16 {d1}, [ip :64], r2 ++ vld1.16 {d2}, [r0 :64] ++ vld1.16 {d3}, [ip :64] ++ sub r0, r2 ++ vqadd.s16 q0, q10 ++ sub ip, r2 ++ vqadd.s16 q1, q11 ++ vmov.i16 q8, #0 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {d0}, [r0 :64], r2 ++ vst1.16 {d1}, [ip :64], r2 ++ vst1.16 {d2}, [r0 :64] ++ vst1.16 {d3}, [ip :64] ++ bx lr ++ ++endfunc ++ ++@ add_residual4x4_dc( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1 ++ add ip, r0, r1 ++ vdup.16 q15, r2 ++ lsl r1, #1 ++ vld1.16 {d0}, [r0 :64], r1 ++ vld1.16 {d1}, [ip :64], r1 ++ vld1.16 {d2}, [r0 :64] ++ vld1.16 {d3}, [ip :64] ++ sub r0, r1 ++ 
vqadd.s16 q0, q15 ++ sub ip, r1 ++ vqadd.s16 q1, q15 ++ vmov.i16 q8, #0 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ vmax.s16 q0, q0, q8 ++ vmax.s16 q1, q1, q8 ++ vmin.s16 q0, q0, q9 ++ vmin.s16 q1, q1, q9 ++ vst1.16 {d0}, [r0 :64], r1 ++ vst1.16 {d1}, [ip :64], r1 ++ vst1.16 {d2}, [r0 :64] ++ vst1.16 {d3}, [ip :64] ++ bx lr ++ ++endfunc ++ ++ ++@ add_residual8x8( ++@ uint16_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1 ++ mov r3, #8 ++ vmov.i64 q8, #0 ++ add ip, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r2, #1 ++1: ++ vldm r1!, {q10-q13} ++ vld1.16 {q0}, [r0 :128], r2 ++ vld1.16 {q1}, [ip :128], r2 ++ vld1.16 {q2}, [r0 :128] ++ vld1.16 {q3}, [ip :128] ++ sub r0, r2 ++ vqadd.s16 q0, q10 ++ sub ip, r2 ++ vqadd.s16 q1, q11 ++ subs r3, #4 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0}, [r0 :128], r2 ++ vst1.16 {q1}, [ip :128], r2 ++ vst1.16 {q2}, [r0 :128], r2 ++ vst1.16 {q3}, [ip :128], r2 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual4x4_dc_c( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r3, #4 ++ vdup.32 q15, r2 /* r2 replicated as 32-bit words, i.e. a repeating pair of 16-bit DC values (presumably U then V - confirm with caller) */ ++ b 9f /* continues at label 9 inside the 8x8_dc function below, sharing its loop with r3 = 4 rows */ ++endfunc ++ ++@ add_residual8x8_dc( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r2 ++ mov r3, #8 ++9: ++ vmov.i16 q8, #0 ++ add ip, r0, r1 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r1, #1 ++1: ++ vld1.16 {q0}, [r0 :128], r1 ++ vld1.16 {q1}, [ip :128], r1 ++ vld1.16 {q2}, [r0 :128] ++ vld1.16 {q3}, [ip :128] ++ sub r0, r1 ++ vqadd.s16 q0, q15 ++ sub ip, r1 ++ vqadd.s16 q1, q15 ++ subs r3, #4 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0}, [r0 :128], r1 ++ vst1.16 {q1}, [ip :128], r1 ++ vst1.16 {q2}, [r0 :128], r1 ++ 
vst1.16 {q3}, [ip :128], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ add_residual16x16( ++@ uint16_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1 ++ add ip, r0, r2 ++ vmov.i16 q8, #0 ++ lsl r2, #1 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ mov r3, #16 ++1: ++ vldm r1!, {q10-q13} ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation. :128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0 :128] ++ subs r3, #2 ++ vld1.16 {q2, q3}, [ip :128] ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q11 ++ vqadd.s16 q2, q12 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0, q1}, [r0 :128], r2 ++ vst1.16 {q2, q3}, [ip :128], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual8x8_dc_c( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r3, #8 ++ vdup.32 q15, r2 ++ b 9f /* continues at label 9 inside the 16x16_dc function below, with r3 = 8 rows */ ++endfunc ++ ++@ add_residual16x16_dc( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1 ++ vdup.i16 q15, r2 ++ mov r3, #16 ++9: ++ vmov.i16 q8, #0 ++ add ip, r0, r1 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r1, #1 ++1: ++ @ For RPI Sand we could guarantee :256 but not for general ++ @ non-RPI allocation. 
:128 is as good as we can claim ++ vld1.16 {q0, q1}, [r0 :128] ++ subs r3, #2 ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q15 ++ vld1.16 {q2, q3}, [ip :128] ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0, q1}, [r0 :128], r1 ++ vst1.16 {q2, q3}, [ip :128], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++ ++@ add_residual32x32( ++@ uint16_t *_dst, [r0] ++@ int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1 ++ push {lr} ++ mov r3, #32 ++ vmov.i16 q8, #0 ++ add lr, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++1: ++ vldm r1!, {q10-q13} ++ vldm r0, {q0-q3} ++ vqadd.s16 q0, q10 ++ pldw [lr] ++ vqadd.s16 q1, q11 ++ add lr, r2 ++ vqadd.s16 q2, q12 ++ subs r3, #1 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0-q1}, [r0], r2 ++ vst1.16 {q2-q3}, [ip], r2 ++ bne 1b ++ pop {pc} ++ ++endfunc ++ ++@ add_residual16x16_dc_c( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc_uv) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1 ++ mov r3, #16 ++ vdup.32 q15, r2 ++ b 9f ++endfunc ++ ++@ add_residual32x32_dc( ++@ uint16_t *_dst, [r0] ++@ ptrdiff_t stride, [r1] ++@ int dc) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r2 ++ mov r3, #32 ++9: ++ vmov.i16 q8, #0 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++1: ++ vldm r0, {q0-q3} ++ vqadd.s16 q0, q15 ++ subs r3, #1 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst1.16 {q0-q1}, [r0], r1 ++ vst1.16 {q2-q3}, [ip], r1 ++ bne 1b ++ bx lr ++ ++endfunc ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_u( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, 
BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ vld1.16 {q10, q11}, [r1 :256] ++ lsl r2, #1 ++ vld2.16 {d0, d2}, [r0 :128], r2 ++ vld2.16 {d1, d3}, [ip :128], r2 ++ vld2.16 {d4, d6}, [r0 :128] ++ vld2.16 {d5, d7}, [ip :128] ++ sub r0, r2 ++ vmov.i16 q8, #0 ++ sub ip, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0 :128], r2 ++ vst2.16 {d1, d3}, [ip :128], r2 ++ vst2.16 {d4, d6}, [r0 :128] ++ vst2.16 {d5, d7}, [ip :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_u( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ mov r3, #8 ++ vmov.i16 q8, #0 ++ add ip, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r2, #1 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ subs r3, #2 ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]! ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q15 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_u( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1 ++ push {lr} ++ vdup.16 q15, r3 ++ mov r3, #16 ++ vmov.i16 q8, #0 ++ add lr, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]! 
++ vqadd.s16 q0, q10 ++ pldw [lr] ++ vqadd.s16 q1, q15 ++ add lr, r2 ++ vqadd.s16 q2, q11 ++ subs r3, #1 ++ vqadd.s16 q3, q15 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ pop {pc} ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ vld1.16 {q10, q11}, [r1 :256] ++ lsl r2, #1 ++ vld2.16 {d0, d2}, [r0 :128], r2 ++ vld2.16 {d1, d3}, [ip :128], r2 ++ vld2.16 {d4, d6}, [r0 :128] ++ vld2.16 {d5, d7}, [ip :128] ++ sub r0, r2 ++ vmov.i16 q8, #0 ++ sub ip, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ ++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0 :128], r2 ++ vst2.16 {d1, d3}, [ip :128], r2 ++ vst2.16 {d4, d6}, [r0 :128] ++ vst2.16 {d5, d7}, [ip :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1 ++ vdup.16 q15, r3 ++ mov r3, #8 ++ vmov.i16 q8, #0 ++ add ip, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r2, #1 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ subs r3, #2 ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]! 
++ vqadd.s16 q0, q15 ++ vqadd.s16 q1, q10 ++ vqadd.s16 q2, q15 ++ vqadd.s16 q3, q11 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ bx lr ++endfunc ++ ++@ add_residual16x16_v( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc) [r3] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1 ++ push {lr} ++ vdup.16 q15, r3 ++ mov r3, #16 ++ vmov.i16 q8, #0 ++ add lr, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]! ++ vqadd.s16 q0, q15 ++ pldw [lr] ++ vqadd.s16 q1, q10 ++ add lr, r2 ++ vqadd.s16 q2, q15 ++ subs r3, #1 ++ vqadd.s16 q3, q11 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ pop {pc} ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1 ++ vmov.i16 q8, #0 ++ add ip, r0, r2 ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ lsl r2, #1 ++ vldm r1, {q10-q13} ++ vld2.16 {d0, d2}, [r0 :128], r2 ++ vld2.16 {d1, d3}, [ip :128], r2 ++ vld2.16 {d4, d6}, [r0 :128] ++ vld2.16 {d5, d7}, [ip :128] ++ ++ sub r0, r2 ++ vqadd.s16 q0, q10 ++ sub ip, r2 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ ++ vst2.16 {d0, d2}, [r0 :128], r2 ++ vst2.16 {d1, d3}, [ip :128], r2 ++ vst2.16 {d4, d6}, [r0 :128] ++ vst2.16 {d5, d7}, [ip :128] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1 ++ push {lr} ++ add ip, r0, r2 ++ lsl r2, #1 ++ 
vmov.i16 q8, #0 ++ add r3, r1, #(8*8*2) @ Offset to V ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ mov lr, #8 ++1: ++ vld1.16 {q10, q11}, [r1 :256]! ++ subs lr, #2 ++ vld2.16 {q0, q1}, [r0 :256] ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q12, q13}, [r3 :256]! ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ pop {pc} ++endfunc ++ ++@ add_residual16x16_c( ++@ uint16_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1 ++ push {r4, lr} ++ vmov.i16 q8, #0 ++ add r3, r1, #(16*16*2) @ Offset to V ++ vmov.i16 q9, #(1 << BIT_DEPTH) - 1 ++ add ip, r0, #32 ++ add r4, r0, r2 ++ mov lr, #16 ++1: ++ vld2.16 {q0, q1}, [r0 :256] ++ vld2.16 {q2, q3}, [ip :256] ++ vld1.16 {q10, q11}, [r1 :256]! ++ vld1.16 {q12, q13}, [r3 :256]! ++ vqadd.s16 q0, q10 ++ pldw [r4] ++ vqadd.s16 q1, q12 ++ add r4, r2 ++ vqadd.s16 q2, q11 ++ subs lr, #1 ++ vqadd.s16 q3, q13 ++ clip16_4 q0, q1, q2, q3, q8, q9 ++ vst2.16 {q0, q1}, [r0 :256], r2 ++ vst2.16 {q2, q3}, [ip :256], r2 ++ bne 1b ++ pop {r4,pc} ++endfunc ++ +diff --git a/libavcodec/arm/rpi_hevcdsp_res8_neon.S b/libavcodec/arm/rpi_hevcdsp_res8_neon.S +new file mode 100644 +index 0000000000..d9a1d7d98c +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S +@@ -0,0 +1,741 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. 
++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++ .arch_extension mp @ enable PLDW ++ ++@ General notes: ++@ ++@ Residual is generally only guaranteed to be clipped to 16 bits. ++@ This means that we do need to do vmovl, vqadd, vqmovun ++@ rather than vaddw, vqmovun (if we were clipped to 15 then we could get away ++@ with this). ++@ ++@ There is an exception for the DC case because its transform is guaranteed ++@ to be small enough that overflow cannot occur during the first add. 
++ ++@ ============================================================================ ++@ Y add ++ ++function ff_hevc_rpi_add_residual_4x4_neon_8, export=1 ++ add ip, r0, r2 ++ vld1.16 {q0, q1}, [r1] ++ lsl r2, #1 ++ vld1.32 d4[0], [r0], r2 ++ rsb r3, r2, #0 ++ vld1.32 d4[1], [ip], r2 ++ vld1.32 d5[0], [r0], r3 ++ vld1.32 d5[1], [ip], r3 ++ vmovl.u8 q8, d4 ++ vmovl.u8 q9, d5 ++ vqadd.s16 q0, q8 ++ vqadd.s16 q1, q9 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.32 d0[0], [r0], r2 ++ vst1.32 d0[1], [ip], r2 ++ vst1.32 d1[0], [r0] ++ vst1.32 d1[1], [ip] ++ bx lr ++endfunc ++ ++function ff_hevc_rpi_add_residual_8x8_neon_8, export=1 ++ push {r4, lr} ++ vld1.16 {q0, q1}, [r1]! ++ add ip, r0, r2 ++ vld1.8 {d6}, [r0] ++ add r4, r0, r2, lsl #1 ++ vld1.8 {d7}, [ip] ++ add lr, ip, r2, lsl #1 ++ lsl r2, #1 ++ mov r3, #8-2 ++ vmovl.u8 q2, d6 ++ vmovl.u8 q3, d7 ++ vqadd.s16 q2, q0 ++ vqadd.s16 q3, q1 ++1: ++ vld1.16 {q0, q1}, [r1]! ++ subs r3, #2 ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d5, q3 ++ vld1.8 {d6}, [r4], r2 ++ vld1.8 {d7}, [lr], r2 ++ vst1.8 {d4}, [r0], r2 ++ vst1.8 {d5}, [ip], r2 ++ vmovl.u8 q2, d6 ++ pldw [r4] ++ vmovl.u8 q3, d7 ++ vqadd.s16 q2, q0 ++ vqadd.s16 q3, q1 ++ bne 1b ++ ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d5, q3 ++ vst1.8 {d4}, [r0] ++ vst1.8 {d5}, [ip] ++ pop {r4, pc} ++endfunc ++ ++function ff_hevc_rpi_add_residual_16x16_neon_8, export=1 ++ vld1.16 {q0, q1}, [r1]! ++ add ip, r0, r2 ++ vld1.8 {q3}, [r0] ++ mov r3, #16-1 ++ vmovl.u8 q2, d6 ++ vmovl.u8 q3, d7 ++ vqadd.s16 q2, q0 ++ vqadd.s16 q3, q1 ++1: ++ vld1.16 {q0, q1}, [r1]! 
++ subs r3, #1 ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d5, q3 ++ vld1.8 {q3}, [ip], r2 ++ vst1.8 {q2}, [r0], r2 ++ vmovl.u8 q2, d6 ++ pldw [ip] ++ vmovl.u8 q3, d7 ++ vqadd.s16 q2, q0 ++ vqadd.s16 q3, q1 ++ bne 1b ++ ++ vqmovun.s16 d4, q2 ++ vqmovun.s16 d5, q3 ++ vst1.8 {q2}, [r0] ++ bx lr ++endfunc ++ ++function ff_hevc_rpi_add_residual_32x32_neon_8, export=1 ++ vldm r1!, {q0-q3} ++ vld1.8 {q8, q9}, [r0] ++ add ip, r0, r2 ++ vmovl.u8 q10, d16 ++ mov r3, #32-1 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vqadd.s16 q10, q0 ++ vqadd.s16 q11, q1 ++ vqadd.s16 q12, q2 ++ vqadd.s16 q13, q3 ++1: ++ vldm r1!, {q0-q3} ++ vqmovun.s16 d20, q10 ++ vqmovun.s16 d21, q11 ++ vqmovun.s16 d22, q12 ++ vqmovun.s16 d23, q13 ++ vld1.8 {q8, q9}, [ip], r2 ++ subs r3, #1 ++ vst1.8 {q10, q11}, [r0], r2 ++ vmovl.u8 q10, d16 ++ pldw [ip] ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vqadd.s16 q10, q0 ++ vqadd.s16 q11, q1 ++ vqadd.s16 q12, q2 ++ vqadd.s16 q13, q3 ++ bne 1b ++ ++ vqmovun.s16 d20, q10 ++ vqmovun.s16 d21, q11 ++ vqmovun.s16 d22, q12 ++ vqmovun.s16 d23, q13 ++ vst1.8 {q10, q11}, [r0] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_add_residual_4x4_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1 ++ add ip, r0, r1 ++ vdup.16 q15, r2 ++ lsl r1, #1 ++ vld1.32 d4[0], [r0], r1 ++ rsb r3, r1, #0 ++ vld1.32 d4[1], [ip], r1 ++ vld1.32 d5[0], [r0], r3 ++ vld1.32 d5[1], [ip], r3 ++ vaddw.u8 q0, q15, d4 ++ vaddw.u8 q1, q15, d5 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q1 ++ vst1.32 d0[0], [r0], r1 ++ vst1.32 d0[1], [ip], r1 ++ vst1.32 d1[0], [r0] ++ vst1.32 d1[1], [ip] ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ DC Y or C add ++ ++@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function 
ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1 ++ mov r3, #4-2 ++ vdup.32 q15, r2 ++ b 1f @ no return here: enters ff_hevc_rpi_add_residual_8x8_dc_neon_8 at its first 1: label ++endfunc ++ ++@ ff_hevc_rpi_add_residual_8x8_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #8-2 ++1: vld1.8 d16, [r0] ++ add ip, r0, r1 ++ push {r4, lr} ++ vld1.8 d17, [ip] ++ add r4, r0, r1, lsl #1 ++ vaddw.u8 q0, q15, d16 ++ lsl r1, #1 ++ vaddw.u8 q1, q15, d17 ++ add lr, ip, r1 ++1: ++ vld1.8 {d16}, [r4], r1 ++ vld1.8 {d17}, [lr], r1 ++ subs r3, #2 ++ vqmovun.s16 d4, q0 ++ vqmovun.s16 d5, q1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vst1.8 {d4}, [r0], r1 ++ vst1.8 {d5}, [ip], r1 ++ bne 1b ++ ++ vqmovun.s16 d4, q0 ++ vqmovun.s16 d5, q1 ++ vst1.8 {d4}, [r0] ++ vst1.8 {d5}, [ip] ++ pop {r4, pc} ++endfunc ++ ++ ++@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1 ++ mov r3, #8-1 ++ vdup.32 q15, r2 ++ b 1f @ no return here: enters ff_hevc_rpi_add_residual_16x16_dc_neon_8 at its first 1: label ++endfunc ++ ++@ ff_hevc_rpi_add_residual_16x16_dc_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #16-1 ++1: vld1.8 {q8}, [r0] ++ add ip, r0, r1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++1: ++ vld1.8 {q8}, [ip], r1 ++ subs r3, #1 ++ vqmovun.s16 d4, q0 ++ vqmovun.s16 d5, q1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vst1.8 {q2}, [r0], r1 ++ bne 1b ++ ++ vqmovun.s16 d4, q0 ++ vqmovun.s16 d5, q1 ++ vst1.8 {q2}, [r0] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8( ++@ uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1 ++ mov r3, #16-1 ++ vdup.32 q15, r2 ++ b 1f @ no return here: enters ff_hevc_rpi_add_residual_32x32_dc_neon_8 at its first 1: label ++endfunc ++ ++@ ff_hevc_rpi_add_residual_32x32_dc_neon_8( ++@
uint8_t * dst, // [r0] ++@ unsigned int stride, // [r1] ++@ int dc) // [r2] ++ ++function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1 ++ vdup.16 q15, r2 ++ mov r3, #32-1 ++1: vld1.8 {q8, q9}, [r0] ++ add ip, r0, r1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++1: ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d22, q2 ++ vqmovun.s16 d23, q3 ++ vld1.8 {q8, q9}, [ip], r1 ++ subs r3, #1 ++ vaddw.u8 q0, q15, d16 ++ vaddw.u8 q1, q15, d17 ++ vaddw.u8 q2, q15, d18 ++ vaddw.u8 q3, q15, d19 ++ vst1.8 {q10, q11}, [r0], r1 ++ bne 1b ++ ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d22, q2 ++ vqmovun.s16 d23, q3 ++ vst1.8 {q10, q11}, [r0] ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U add ++ ++@ add_residual4x4_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1 ++ add ip, r0, r2 ++ vld1.16 {q0, q1}, [r1] ++ lsl r2, #1 ++ vld1.8 {d16}, [r0 :64], r2 ++ vld1.8 {d17}, [ip :64], r2 ++ vld1.8 {d18}, [r0 :64] ++ sub r0, r2 ++ vld1.8 {d19}, [ip :64] ++ sub ip, r2 ++ vdup.16 q2, r3 ++ vdup.16 q3, r3 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0 :64], r2 ++ vst1.8 {d1}, [ip :64], r2 ++ vst1.8 {d2}, [r0 :64] ++ vst1.8 {d3}, [ip :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ push {r4, lr} ++ vld2.8 {d16, d17}, [r0 :128] ++ lsl r2, #1 ++ vld2.8 {d18, d19}, [ip :128]
++ mov r3, #8-2 ++ vld1.16 {q0, q1}, [r1 :256]! ++ add r4, r0, r2 ++ vmovl.u8 q10, d16 ++ add lr, ip, r2 ++ vmovl.u8 q11, d18 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d17 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d19 ++1: ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d21, q2 ++ vld2.8 {d16, d17}, [r4 :128], r2 ++ subs r3, #2 ++ vqmovun.s16 d22, q1 ++ vqmovun.s16 d23, q3 ++ vst2.8 {d20, d21}, [r0 :128], r2 ++ vld2.8 {d18, d19}, [lr :128], r2 ++ vst2.8 {d22, d23}, [ip :128], r2 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d18 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d17 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d19 ++ bne 1b ++ ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d21, q2 ++ vqmovun.s16 d22, q1 ++ vqmovun.s16 d23, q3 ++ vst2.8 {d20, d21}, [r0 :128] ++ vst2.8 {d22, d23}, [ip :128] ++ pop {r4, pc} ++endfunc ++ ++@ add_residual16x16_u( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2] ++@ int dc_v) [r3] ++ ++function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ vld2.8 {q8, q9}, [r0 :256] ++ mov r3, #16-1 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vmovl.u8 q11, d16 ++ vmovl.u8 q12, d17 ++ vqadd.s16 q0, q11 ++ vaddw.u8 q11, q15, d18 ++ vqadd.s16 q1, q12 ++ vaddw.u8 q12, q15, d19 ++1: ++ vld2.8 {q8, q9}, [ip :256], r2 ++ subs r3, #1 ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d22, q11 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d23, q12 ++ vld1.16 {q0, q1}, [r1 :256]!
++ vst2.8 {q10, q11}, [r0 :256], r2 ++ vmovl.u8 q11, d16 ++ pldw [ip] ++ vmovl.u8 q12, d17 ++ vqadd.s16 q0, q11 ++ vaddw.u8 q11, q15, d18 ++ vqadd.s16 q1, q12 ++ vaddw.u8 q12, q15, d19 ++ bne 1b ++ ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d22, q11 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d23, q12 ++ vst2.8 {q10, q11}, [r0 :256] ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ V add ++ ++@ add_residual4x4_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2]; int dc_u) [r3] ++ ++function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1 ++ add ip, r0, r2 ++ vld1.16 {q2, q3}, [r1] ++ lsl r2, #1 ++ vld1.8 {d16}, [r0 :64], r2 ++ vld1.8 {d17}, [ip :64], r2 ++ vld1.8 {d18}, [r0 :64] ++ sub r0, r2 ++ vld1.8 {d19}, [ip :64] ++ sub ip, r2 ++ vdup.16 q0, r3 ++ vdup.16 q1, r3 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0 :64], r2 ++ vst1.8 {d1}, [ip :64], r2 ++ vst1.8 {d2}, [r0 :64] ++ vst1.8 {d3}, [ip :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2]; int dc_u) [r3] ++ ++function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ push {r4, lr} ++ vld2.8 {d16, d17}, [r0 :128] ++ lsl r2, #1 ++ vld2.8 {d18, d19}, [ip :128] ++ mov r3, #8-2 ++ vld1.16 {q0, q1}, [r1 :256]!
++ add r4, r0, r2 ++ vmovl.u8 q10, d17 ++ add lr, ip, r2 ++ vmovl.u8 q11, d19 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d16 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d18 ++1: ++ vqmovun.s16 d20, q2 ++ vqmovun.s16 d21, q0 ++ vld2.8 {d16, d17}, [r4 :128], r2 ++ subs r3, #2 ++ vqmovun.s16 d22, q3 ++ vqmovun.s16 d23, q1 ++ vst2.8 {d20, d21}, [r0 :128], r2 ++ vld2.8 {d18, d19}, [lr :128], r2 ++ vst2.8 {d22, d23}, [ip :128], r2 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vmovl.u8 q10, d17 ++ vmovl.u8 q11, d19 ++ vqadd.s16 q0, q10 ++ vaddw.u8 q2, q15, d16 ++ vqadd.s16 q1, q11 ++ vaddw.u8 q3, q15, d18 ++ bne 1b ++ ++ vqmovun.s16 d20, q2 ++ vqmovun.s16 d21, q0 ++ vqmovun.s16 d22, q3 ++ vqmovun.s16 d23, q1 ++ vst2.8 {d20, d21}, [r0 :128] ++ vst2.8 {d22, d23}, [ip :128] ++ pop {r4, pc} ++endfunc ++ ++@ add_residual16x16_v( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride, [r2]; int dc_u) [r3] ++ ++function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1 ++ vdup.16 q15, r3 ++ add ip, r0, r2 ++ vld2.8 {q8, q9}, [r0 :256] ++ mov r3, #16-1 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vmovl.u8 q11, d18 ++ vmovl.u8 q12, d19 ++ vqadd.s16 q0, q11 ++ vaddw.u8 q11, q15, d16 ++ vqadd.s16 q1, q12 ++ vaddw.u8 q12, q15, d17 ++1: ++ vld2.8 {q8, q9}, [ip :256], r2 ++ subs r3, #1 ++ vqmovun.s16 d20, q11 ++ vqmovun.s16 d22, q0 ++ vqmovun.s16 d21, q12 ++ vqmovun.s16 d23, q1 ++ vld1.16 {q0, q1}, [r1 :256]!
++ vst2.8 {q10, q11}, [r0 :256], r2 ++ vmovl.u8 q11, d18 ++ pldw [ip] ++ vmovl.u8 q12, d19 ++ vqadd.s16 q0, q11 ++ vaddw.u8 q11, q15, d16 ++ vqadd.s16 q1, q12 ++ vaddw.u8 q12, q15, d17 ++ bne 1b ++ ++ vqmovun.s16 d20, q11 ++ vqmovun.s16 d22, q0 ++ vqmovun.s16 d21, q12 ++ vqmovun.s16 d23, q1 ++ vst2.8 {q10, q11}, [r0 :256] ++ bx lr ++endfunc ++ ++@ ============================================================================ ++@ U & V add ++ ++@ add_residual4x4_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1 ++ add ip, r0, r2 ++ vld1.16 {q0, q1}, [r1]! @ all of U ++ lsl r2, #1 ++ vld1.8 {d16}, [r0 :64], r2 ++ rsb r3, r2, #0 ++ vld1.8 {d17}, [ip :64], r2 ++ vld1.16 {q2, q3}, [r1] @ all of V ++ vld1.8 {d18}, [r0 :64], r3 ++ vld1.8 {d19}, [ip :64], r3 ++ vmovl.u8 q10, d16 ++ vmovl.u8 q11, d17 ++ vmovl.u8 q12, d18 ++ vmovl.u8 q13, d19 ++ vzip.16 q0, q2 ++ vzip.16 q1, q3 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q1, q12 ++ vqadd.s16 q3, q13 ++ vqmovun.s16 d0, q0 ++ vqmovun.s16 d1, q2 ++ vqmovun.s16 d2, q1 ++ vqmovun.s16 d3, q3 ++ vst1.8 {d0}, [r0 :64], r2 ++ vst1.8 {d1}, [ip :64], r2 ++ vst1.8 {d2}, [r0 :64] ++ vst1.8 {d3}, [ip :64] ++ bx lr ++endfunc ++ ++@ add_residual8x8_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1 ++ vld2.8 {d16, d17}, [r0 :128] ++ add r3, r1, #(8*8*2) @ Offset to V ++ vld1.16 {q0}, [r1 :128]! ++ add ip, r0, r2 ++ vld1.16 {q1}, [r3 :128]! ++ vmovl.u8 q10, d16 ++ push {lr} ++ vmovl.u8 q8, d17 ++ mov lr, #8-1 ++ vqadd.s16 q10, q0 ++ vqadd.s16 q1, q8 ++1: ++ vld2.8 {d16, d17}, [ip :128], r2 ++ subs lr, #1 ++ vld1.16 {q0}, [r1 :128]! ++ vqmovun.s16 d20, q10 ++ vqmovun.s16 d21, q1 ++ vld1.16 {q1}, [r3 :128]! 
++ vst2.8 {d20, d21}, [r0 :128], r2 ++ vmovl.u8 q10, d16 ++ pldw [ip] ++ vmovl.u8 q8, d17 ++ vqadd.s16 q10, q0 ++ vqadd.s16 q1, q8 ++ bne 1b ++ ++ vqmovun.s16 d20, q10 ++ vqmovun.s16 d21, q1 ++ vst2.8 {d20, d21}, [r0 :128] ++ pop {pc} ++endfunc ++ ++@ add_residual16x16_c( ++@ uint8_t *_dst, [r0] ++@ const int16_t *res, [r1] ++@ ptrdiff_t stride) [r2] ++ ++function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1 ++ vld2.8 {q8, q9}, [r0 :256] ++ add r3, r1, #(16*16*2) @ Offset to V ++ vld1.16 {q0, q1}, [r1 :256]! ++ add ip, r0, r2 ++ vld1.16 {q2, q3}, [r3 :256]! ++ vmovl.u8 q10, d16 ++ push {lr} ++ vmovl.u8 q8, d17 ++ mov lr, #16-1 ++ vmovl.u8 q11, d18 ++ vmovl.u8 q9, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q8 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q9 ++1: ++ vld2.8 {q8, q9}, [ip :256], r2 ++ subs lr, #1 ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d22, q2 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d23, q3 ++ vld1.16 {q0, q1}, [r1 :256]! ++ vst2.8 {d20-d23}, [r0 :256], r2 ++ vld1.16 {q2, q3}, [r3 :256]! ++ vmovl.u8 q10, d16 ++ pldw [ip] ++ vmovl.u8 q8, d17 ++ vmovl.u8 q11, d18 ++ vmovl.u8 q9, d19 ++ vqadd.s16 q0, q10 ++ vqadd.s16 q1, q8 ++ vqadd.s16 q2, q11 ++ vqadd.s16 q3, q9 ++ bne 1b ++ ++ vqmovun.s16 d20, q0 ++ vqmovun.s16 d22, q2 ++ vqmovun.s16 d21, q1 ++ vqmovun.s16 d23, q3 ++ vst2.8 {d20-d23}, [r0 :256] ++ pop {pc} ++endfunc ++ ++@ 32x32 chroma never occurs so NIF ++ ++@ ============================================================================ +diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S +new file mode 100644 +index 0000000000..b56e0f9644 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S +@@ -0,0 +1,2245 @@ ++/* ++ * Copyright (c) 2014 - 2015 Seppo Tomperi ++ * 2017 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++.set EDGE_SRC_STRIDE, 160 ++ ++@ PIC jump tables are fractionally more expensive than absolute in our code ++.set jent_pic, CONFIG_PIC ++ ++ ++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4 ++ vshr.u8 q12, q8, #3 ++ \I1 ++ vadd.i8 q8, \Q_K128 ++ \I2 ++ vshr.u8 q13, q9, #3 ++ \I3 ++ vadd.i8 q9, \Q_K128 ++ \I4 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ ++ vqadd.s8 q8, q12 ++ vshr.u8 q12, q10, #3 ++ vadd.i8 q10, \Q_K128 ++ vqadd.s8 q9, q13 ++ vshr.u8 q13, q11, #3 ++ vadd.i8 q11, \Q_K128 ++ ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT0, d25 ++ vtbl.8 d26, \XLAT1, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vqadd.s8 q10, q12 ++ vsub.i8 q8, \Q_K128 ++ vqadd.s8 q11, q13 ++ vsub.i8 q9, \Q_K128 ++ vsub.i8 q10, \Q_K128 ++ vsub.i8 q11, \Q_K128 ++.endm ++ ++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4 ++ \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vadd.i8 q12, q8, \Q_K128 ++ vshr.u8 q8, #3 ++ vtbl.8 d16, \XLAT0, d16 ++ vtbl.8 d17, \XLAT1, d17 ++ vqadd.s8 q12, q8 ++ bmi 2f ++1: \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vsub.i8 q13, q12, \Q_K128 ++ vadd.i8 q12, q8, \Q_K128 ++ vshr.u8 q8, 
#3 ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++ vtbl.8 d16, \XLAT0, d16 ++ vtbl.8 d17, \XLAT1, d17 ++ vqadd.s8 q12, q8 ++ bpl 1b ++2: vsub.i8 q13, q12, \Q_K128 ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++.endm ++ ++ ++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX ++ vmax.s16 \Q0, \Q_MIN ++ vmax.s16 \Q1, \Q_MIN ++ vmax.s16 \Q2, \Q_MIN ++ vmax.s16 \Q3, \Q_MIN ++ vmin.s16 \Q0, \Q_MAX ++ vmin.s16 \Q1, \Q_MAX ++ vmin.s16 \Q2, \Q_MAX ++ vmin.s16 \Q3, \Q_MAX ++.endm ++ ++@ Clobbers q12, q13 ++.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2 ++ vshrn.i16 d24, \Q0, #(\bit_depth - 5) ++ vshrn.i16 d25, \Q1, #(\bit_depth - 5) ++ vshrn.i16 d26, \Q2, #(\bit_depth - 5) ++ \I1 ++ vtbl.8 d24, \XLAT0, d24 ++ vshrn.i16 d27, \Q3, #(\bit_depth - 5) ++ vtbl.8 d25, \XLAT1, d25 ++ \I2 ++ vtbl.8 d26, \XLAT0, d26 ++ vtbl.8 d27, \XLAT1, d27 ++ vaddw.s8 \Q0, d24 ++ vaddw.s8 \Q1, d25 ++ vaddw.s8 \Q2, d26 ++ vaddw.s8 \Q3, d27 ++ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX ++.endm ++ ++@ Clobbers q10, q11, q12 ++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4 ++ \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vshrn.i16 d24, \Q0, #\bit_depth - 5 ++ vshrn.i16 d25, \Q1, #\bit_depth - 5 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vaddw.s8 q10, \Q0, d24 ++ vaddw.s8 q11, \Q1, d25 ++ bmi 2f ++1: \L1 ++ \L2 ++ \L3 ++ \L4 ++ \L5 ++ vmax.s16 q10, \Q_MIN ++ vmax.s16 q11, \Q_MIN ++ vshrn.i16 d24, \Q0, #\bit_depth - 5 ++ vshrn.i16 d25, \Q1, #\bit_depth - 5 ++ vmin.s16 q10, \Q_MAX ++ vmin.s16 q11, \Q_MAX ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++ vtbl.8 d24, \XLAT0, d24 ++ vtbl.8 d25, \XLAT1, d25 ++ vaddw.s8 q10, \Q0, d24 ++ vaddw.s8 q11, \Q1, d25 ++ bpl 1b ++2: vmax.s16 q10, \Q_MIN ++ vmax.s16 q11, \Q_MIN ++ vmin.s16 q10, \Q_MAX ++ vmin.s16 q11, \Q_MAX ++ \S1 ++ \S2 ++ \S3 ++ \S4 ++.endm ++ ++ ++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38) ++@ so we are quite safe stuffing it into a byte array ++@ There may be a subsequent shl by 
log2_sao_offset_scale_luma/chroma ++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of ++@ precision ++ ++@ This, somewhat nasty, bit of code builds the {d0-d3} translation ++@ array via the stack ++@ Given that sao_left_class > 28 can cause wrap we can't just poke ++@ all 4 bytes in at once ++@ ++@ It also loads other common regs ++ ++@ Beware that the offset read here overreads by 6 bytes so source must be sized appropriately ++function band_load_y ++ ldr ip, [sp, #16] @ &sao_offset_val[0] ++ ldr r4, [sp, #20] @ sao_left_class ++ vmov.i64 d4, #0 ++ vmov.i64 q0, #0 ++ pld [r1] ++ vld2.8 {q8}, [ip] ++ sub ip, sp, #8*5 ++ vmov.i64 q1, #0 ++ add r4, ip, r4 ++ vpush {d0-d4} @ Put zero array on stack ++ vshr.u64 d16, d16, #8 @ 1st interesting val is [1] ++ ldr ip, [ip, #8*5 + 28] @ height ++ vst1.32 {d16[0]}, [r4] ++ add r4, r1, r3 ++ vpop {d0-d4} @ Pop modified array ++ sub ip, ip, #1 ++ vorr d0, d0, d4 ++ bx lr ++endfunc ++ ++@ Beware that the offset reads here overread by 6 bytes so source must be sized appropriately ++function band_load_c ++ ldr ip, [sp, #16] @ &sao_offset_val1[0] ++ ldr r4, [sp, #20] @ sao_left_class1 ++ vmov.i64 d24, #0 ++ vmov.i64 q10, #0 ++ pld [r1] ++ vld2.8 {q8}, [ip] ++ sub ip, sp, #8*5 ++ vmov.i64 q11, #0 ++ add r4, ip, r4 ++ ldr ip, [sp, #24] @ &sao_offset_val2[0] ++ vpush {d20-d24} @ Put zero array on stack ++ vld2.8 {q9}, [ip] ++ vshr.u64 d16, d16, #8 @ 1st interesting val is [1] ++ ldr ip, [sp, #8*5 + 28] @ sao_left_class2 ++ vst1.32 {d16[0]}, [r4] ++ add ip, sp, ip ++ vshr.u64 d18, d18, #8 @ 1st interesting val is [1] ++ vldmia sp, {d0-d3} @ Load modified array ++ vldr d16, [sp, #8*4] ++ add r4, r1, r3 ++ vstmia sp, {d20-d24} @ Put zero array on stack (again) ++ vst1.32 {d18[0]}, [ip] ++ vorr d0, d0, d16 ++ vldmia sp, {d4-d7} @ Load modified array ++ vldr d18, [sp, #8*4] ++ ldr ip, [sp, #8*5 + 36] @ height ++ add sp, sp, #8*5 ++ vorr d4, d4, d18 ++ sub ip, ip, #1 ++ bx lr ++endfunc ++ ++ ++@ 
ff_hevc_rpi_sao_band_64_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_64_neon_8, export=1 ++ push {r4-r6, lr} ++ vmov.u8 q15, #128 ++ bl band_load_y ++ ++1: vldmia r1, {q8-q11} ++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \ ++ "pld [r4]", \ ++ "subs ip, #1", \ ++ "it ne; addne r4, r3", \ ++ "add r1, r3" ++ vstmia r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_32_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_32_neon_8, export=1 ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ vmov.u8 q15, #128 ++ bl band_load_y ++ ++1: vld1.8 { q8, q9 }, [r1, :128], r3 ++ subs ip, #2 ++ vld1.8 {q10, q11}, [r6, :128], r3 ++ ++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 ++ ++ vst1.8 { q8, q9 }, [r0, :128], r2 ++ vst1.8 {q10, q11}, [r5, :128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_16_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_16_neon_8, export=1 ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ vmov.u8 q15, #128 ++ bl band_load_y ++ ++1: vld1.8 { q8}, [r1, :128], r3 ++ subs ip, #4 ++ vld1.8 { q9}, [r6, :128], r3 ++ vld1.8 {q10}, [r1, :128], r3 ++ vld1.8 {q11}, [r6, :128], r3 ++ ++ sao_band_64b_8 {d0-d3}, {d0-d3}, q15 ++ ++ vst1.8 { q8}, [r0, :128], r2 ++ vst1.8 
{ q9}, [r5, :128], r2 ++ vst1.8 {q10}, [r0, :128], r2 ++ vst1.8 {q11}, [r5, :128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_8_neon_8 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_band_8_neon_8, export=1 ++ ldr ip, [sp, #8] @ width ++ push {r4-r6, lr} ++ vmov.u8 q15, #128 ++ cmp ip, #8 ++ bl band_load_y ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ blt 4f ++ ++ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ ++ "vld1.8 {d16}, [r1, :64], r3", \ ++ "subs ip, #2", \ ++ "vld1.8 {d17}, [r6, :64], r3", \ ++ "", \ ++ "", \ ++ "vst1.8 {d26}, [r0, :64], r2", \ ++ "vst1.8 {d27}, [r5, :64], r2" ++ pop {r4-r6, pc} ++4: ++ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \ ++ "vld1.32 {d16[0]}, [r1, :32], r3", \ ++ "subs ip, #4", \ ++ "vld1.32 {d16[1]}, [r6, :32], r3", \ ++ "vld1.32 {d17[0]}, [r1, :32], r3", \ ++ "vld1.32 {d17[1]}, [r6, :32], r3", \ ++ "vst1.32 {d26[0]}, [r0, :32], r2", \ ++ "vst1.32 {d26[1]}, [r5, :32], r2", \ ++ "vst1.32 {d27[0]}, [r0, :32], r2", \ ++ "vst1.32 {d27[1]}, [r5, :32], r2" ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_c_32_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_rpi_sao_band_c_32_neon_8, export=1 ++ push {r4-r6, lr} ++ add r5, r0, #32 ++ add r6, r1, #32 ++ vmov.u8 q15, #128 ++ bl band_load_c ++ ++1: vld2.8 { q8, q9 }, [r1, :128], r3 ++ subs ip, #1 ++ vld2.8 {q10, q11}, [r6, :128], r3 ++ ++ sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \ ++ "pld [r4]", \ ++ "it ne; addne r4, r3" ++ ++ vst2.8 { q8, q9 }, [r0, :128], r2 ++ vst2.8 {q10, q11}, [r5, 
:128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_c_16_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_rpi_sao_band_c_16_neon_8, export=1 ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ vmov.u8 q15, #128 ++ bl band_load_c ++ ++1: vld2.8 { q8, q9 }, [r1, :128], r3 ++ subs ip, #2 ++ vld2.8 {q10, q11}, [r6, :128], r3 ++ ++ sao_band_64b_8 {d0-d3}, {d4-d7}, q15 ++ ++ vst2.8 { q8, q9 }, [r0, :128], r2 ++ vst2.8 {q10, q11}, [r5, :128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++endfunc ++ ++@ ff_hevc_rpi_sao_band_c_8_neon_8( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++function ff_hevc_rpi_sao_band_c_8_neon_8, export=1 ++ ldr ip, [sp, #16] @ width ++ push {r4-r6, lr} ++ vmov.u8 q15, #128 ++ cmp ip, #8 ++ bl band_load_c ++ blt 4f ++ ++ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ ++ "vld2.8 {d16-d17}, [r1, :128], r3", \ ++ "subs ip, #1", \ ++ "", \ ++ "", \ ++ "", \ ++ "vst2.8 {d26-d27}, [r0, :128], r2" ++ pop {r4-r6, pc} ++4: ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \ ++ "vld1.8 {d16}, [r1, :64], r3", \ ++ "subs ip, #2", \ ++ "vld1.8 {d17}, [r6, :64], r3", \ ++ "vuzp.8 d16, d17", \ ++ "", \ ++ "vzip.8 d26, d27", \ ++ "vst1.8 {d26}, [r0, :64], r2", \ ++ "vst1.8 {d27}, [r5, :64], r2" ++ pop {r4-r6, pc} ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_band_64_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t 
*sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_64_16 bit_depth ++ push {r4-r6, lr} ++ vmov.i64 q2, #0 ++ vmov.i16 q3, #(1 << \bit_depth) - 1 ++ bl band_load_y ++ vpush {q4-q7} ++ ++1: vldm r1, {q4-q11} ++ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ ++ "subs ip, #1", \ ++ "add r1, r3" ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth ++ vstm r0, {q4-q11} ++ add r0, r2 ++ bpl 1b ++ ++ vpop {q4-q7} ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_64_neon_10, export=1 ++ band_64_16 10 ++endfunc ++ ++@ ff_hevc_rpi_sao_band_32_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_32_16 bit_depth ++ push {r4-r6, lr} ++ vmov.i64 q2, #0 ++ vmov.i16 q3, #(1 << \bit_depth) - 1 ++ bl band_load_y ++ ++1: vldm r1, {q8-q11} ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \ ++ "subs ip, #1", \ ++ "add r1, r3" ++ vstm r0, {q8-q11} ++ add r0, r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_32_neon_10, export=1 ++ band_32_16 10 ++endfunc ++ ++@ ff_hevc_rpi_sao_band_16_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_16_16 bit_depth ++ push {r4-r6, lr} ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ vmov.i64 q14, #0 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_y ++ ++1: vld1.16 { q8, q9 }, [r1, :128], r3 ++ subs r12, #2 ++ vld1.16 {q10, q11}, [r6, :128], r3 ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth ++ vst1.16 { q8, q9 
}, [r0, :128], r2 ++ vst1.16 {q10, q11}, [r5, :128], r2 ++ bpl 1b ++ ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_16_neon_10, export=1 ++ band_16_16 10 ++endfunc ++ ++@ ff_hevc_rpi_sao_band_8_neon_10 ( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ ptrdiff_t stride_src, [r3] ++@ int16_t *sao_offset_val, [sp, #0] ++@ int sao_left_class, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++.macro band_8_16 bit_depth ++ ldr ip, [sp, #8] @ width ++ push {r4-r6, lr} ++ vmov.i64 q14, #0 ++ cmp ip, #8 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_y ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ blt 4f ++ ++ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ ++ "vld1.16 {q8}, [r1, :128], r3", \ ++ "subs ip, #2", \ ++ "vld1.16 {q9}, [r6, :128], r3", \ ++ "", \ ++ "", \ ++ "vst1.16 {q10}, [r0, :128], r2", \ ++ "vst1.16 {q11}, [r5, :128], r2" ++ pop {r4-r6, pc} ++4: ++ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \ ++ "vld1.16 {d16}, [r1, :64], r3", \ ++ "subs ip, #4", \ ++ "vld1.16 {d17}, [r6, :64], r3", \ ++ "vld1.16 {d18}, [r1, :64], r3", \ ++ "vld1.16 {d19}, [r6, :64], r3", \ ++ "vst1.16 {d20}, [r0, :64], r2", \ ++ "vst1.16 {d21}, [r5, :64], r2", \ ++ "vst1.16 {d22}, [r0, :64], r2", \ ++ "vst1.16 {d23}, [r5, :64], r2" ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_8_neon_10, export=1 ++ band_8_16 10 ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_band_c_32_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_32_16 bit_depth ++ push {r4-r6, lr} ++ add r5, r0, #32 ++ add r6, r1, #32 ++ sub r2, #64 ++ sub r3, #64 ++ vmov.i64 q14, #0 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_c ++ mov lr, #64 ++ 
vpush {q4-q7} ++ ++1: vld2.16 { q4, q5 }, [r1, :128], lr ++ subs ip, #1 ++ vld2.16 { q6, q7 }, [r6, :128], lr ++ vld2.16 { q8, q9 }, [r1, :128], r3 ++ vld2.16 {q10, q11}, [r6, :128], r3 ++ ++ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ ++ "pld [r4]", \ ++ "it ne; addne r4, r3" ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth ++ ++ vst2.16 { q4, q5 }, [r0, :128], lr ++ vst2.16 { q6, q7 }, [r5, :128], lr ++ vst2.16 { q8, q9 }, [r0, :128], r2 ++ vst2.16 {q10, q11}, [r5, :128], r2 ++ ++ bpl 1b ++ ++ vpop {q4-q7} ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_c_32_neon_10, export=1 ++ band_c_32_16 10 ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_band_c_16_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_16_16 bit_depth ++ push {r4-r6, lr} ++ add r5, r0, #32 ++ add r6, r1, #32 ++ vmov.i64 q14, #0 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_c ++ ++1: vld2.16 { q8, q9 }, [r1, :128], r3 ++ subs ip, #1 ++ vld2.16 {q10, q11}, [r6, :128], r3 ++ ++ @ Only q8-q11 hold pixels here; q4-q7 (callee-saved d8-d15) are neither loaded nor saved so must not be touched ++ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth ++ ++ vst2.16 { q8, q9 }, [r0, :128], r2 ++ vst2.16 {q10, q11}, [r5, :128], r2 ++ ++ bpl 1b ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_c_16_neon_10, export=1 ++ band_c_16_16 10 ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_band_c_8_neon_10( ++@ uint8_t * dst [r0] ++@ uint8_t * src [r1] ++@ uint32_t dst_stride [r2] ++@ uint32_t src_stride [r3] ++@ const int16_t * table1 sp[0] ++@ uint32_t offset1 sp[4] ++@ const int16_t * table2 sp[8] ++@ uint32_t offset2 sp[12] ++@ int width sp[16] ++@ int height sp[20] ++ ++.macro band_c_8_16 bit_depth ++ ldr ip, [sp, #16] @ width 
++ push {r4-r6, lr} ++ vmov.i64 q14, #0 ++ cmp ip, #8 ++ vmov.i16 q15, #(1 << \bit_depth) - 1 ++ bl band_load_c ++ blt 4f ++ ++ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ ++ "vld2.16 {q8,q9}, [r1, :128], r3", \ ++ "subs ip, #1", \ ++ "", \ ++ "", \ ++ "", \ ++ "vst2.16 {q10,q11}, [r0, :128], r2" ++ pop {r4-r6, pc} ++4: ++ add r5, r0, r2 ++ add r6, r1, r3 ++ lsl r2, #1 ++ lsl r3, #1 ++ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \ ++ "vld2.16 {d16,d18}, [r1, :128], r3", \ ++ "subs ip, #2", \ ++ "vld2.16 {d17,d19}, [r6, :128], r3", \ ++ "", \ ++ "", \ ++ "vst2.16 {d20,d22}, [r0, :128], r2", \ ++ "vst2.16 {d21,d23}, [r5, :128], r2" ++ pop {r4-r6, pc} ++.endm ++ ++function ff_hevc_rpi_sao_band_c_8_neon_10, export=1 ++ band_c_8_16 10 ++endfunc ++ ++ ++@ ============================================================================= ++@ SAO EDGE ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ [r5] translate values ++@ ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" separately via d26/d27. 
For Y d26=d27 ++ ++function edge_64b_body_8 ++ ++ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q1 ++ vcgt.u8 q14, q6, q2 ++ vcgt.u8 q15, q7, q3 ++ ++ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0 ++ vcgt.u8 q1, q5 ++ vcgt.u8 q2, q6 ++ vcgt.u8 q3, q7 ++ ++ vsub.s8 q0, q12 @ a = sign(c-a) ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 ++ ++ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0 ++ vcgt.u8 q13, q5, q9 ++ vcgt.u8 q14, q6, q10 ++ vcgt.u8 q15, q7, q11 ++ ++ vsub.s8 q0, q12 ++ vsub.s8 q1, q13 ++ vsub.s8 q2, q14 ++ vsub.s8 q3, q15 ++ ++ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0 ++ vcgt.u8 q13, q9, q5 ++ vcgt.u8 q14, q10, q6 ++ vcgt.u8 q15, q11, q7 ++ ++ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b) ++ vadd.s8 q1, q13 ++ vmov.u8 q12, #2 ++ vadd.s8 q2, q14 ++ vadd.s8 q3, q15 ++ ++ vadd.s8 q0, q12 ++ vadd.s8 q1, q12 ++ ++ vld1.8 {d26, d27}, [r5] ++ ++ vadd.s8 q2, q12 ++ vuzp.8 q0, q1 ++ vmov.u8 q15, #128 ++ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b) ++ ++ vtbl.8 d0, {d26}, d0 ++ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add ++ ++ vtbl.8 d1, {d26}, d1 ++ vadd.s8 q14, q5, q15 ++ ++ vtbl.8 d2, {d27}, d2 ++ vuzp.8 q2, q3 ++ ++ vtbl.8 d3, {d27}, d3 ++ ++ vtbl.8 d4, {d26}, d4 ++ vzip.8 q0, q1 ++ ++ vtbl.8 d5, {d26}, d5 ++ vqadd.s8 q0, q12 ++ vqadd.s8 q1, q14 ++ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add ++ ++ vtbl.8 d6, {d27}, d6 ++ vtbl.8 d7, {d27}, d7 ++ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add ++ vzip.8 q2, q3 ++ ++ vsub.s8 q0, q15 ++ vqadd.s8 q2, q12 ++ vqadd.s8 q3, q14 ++ vsub.s8 q1, q15 ++ vsub.s8 q2, q15 ++ vsub.s8 q3, q15 ++ ++ bx lr ++endfunc ++ ++@ r0 destination address ++@ r2 stride to post-increment r0 with ++@ r4 upper clip value ++@ [r5] translate values ++@ ++@ a <- c <- b ++@ a in q0 - q3 ++@ c in q4 - q7 ++@ b in q8 - q11 ++@ ++@ q12-15 used as temp ++@ ++@ Can be used for both Y & C as we unzip/zip the deltas and ++@ transform "u/v" 
separately via d26/d27. For Y d26=d27 ++ ++function edge_64b_body_16 ++ ++ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q1 ++ vcgt.u16 q14, q6, q2 ++ vcgt.u16 q15, q7, q3 ++ ++ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0 ++ vcgt.u16 q1, q1, q5 ++ vcgt.u16 q2, q2, q6 ++ vcgt.u16 q3, q3, q7 ++ ++ vsub.s16 q0, q0, q12 // a = sign(c-a) ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0 ++ vcgt.u16 q13, q5, q9 ++ vcgt.u16 q14, q6, q10 ++ vcgt.u16 q15, q7, q11 ++ ++ vsub.s16 q0, q0, q12 ++ vsub.s16 q1, q1, q13 ++ vsub.s16 q2, q2, q14 ++ vsub.s16 q3, q3, q15 ++ ++ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0 ++ vcgt.u16 q13, q9, q5 ++ vcgt.u16 q14, q10, q6 ++ vcgt.u16 q15, q11, q7 ++ ++ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b) ++ vadd.s16 q1, q1, q13 ++ vadd.s16 q2, q2, q14 ++ vadd.s16 q3, q3, q15 ++ ++ vmov.u8 q12, #2 ++ ++ vmovn.s16 d0, q0 ++ vmovn.s16 d1, q1 ++ vmovn.s16 d2, q2 ++ vmovn.s16 d3, q3 ++ ++ vldr d26, [r5] ++ ++ vuzp.8 q0, q1 ++ ++ vldr d27, [r5, #8] ++ ++ vadd.s8 q0, q0, q12 ++ vadd.s8 q1, q1, q12 ++ ++ vmov.i64 q12, #0 ++ ++ vtbl.8 d0, {d26}, d0 ++ vtbl.8 d1, {d26}, d1 ++ vtbl.8 d2, {d27}, d2 ++ vtbl.8 d3, {d27}, d3 ++ ++ vdup.i16 q13, r4 ++ ++ vzip.8 q0, q1 ++ ++ @ Avoid overwrite whilst widening ++ vaddw.s8 q2, q6, d2 ++ vaddw.s8 q3, q7, d3 ++ vaddw.s8 q1, q5, d1 ++ vaddw.s8 q0, q4, d0 ++ ++ @ now clip ++ clip16_4 q2, q3, q1, q0, q12, q13 ++ ++ bx lr ++endfunc ++ ++ ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q3, q9, q10 ++@ ++@ d16, d17 (q8) xlat U, V ++@ q14.u8 #2 ++@ q15.u8 #128 ++ ++function edge_16b_body_8 ++ vcgt.u8 q9, q0, q1 @ a > c -> -1 , otherwise 0 ++ vadd.u8 q9, q14, q9 ++ vcgt.u8 q0, q1, q0 @ c > a -> -1 , otherwise 0 ++ vsub.u8 q9, q9, q0 ++ vcgt.u8 q0, q2, q1 @ c < b -> -1 , otherwise 0 ++ vadd.u8 q9, q9, q0 ++ vcgt.u8 q0, q1, q2 @ c > b -> -1 , otherwise 0 ++ vsub.u8 q0, q9, q0 ++ ++ vadd.s8 q3, q1, 
q15 @ Add -128 so we can use saturating signed add ++ ++ vuzp.8 d0, d1 ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ vqadd.s8 q0, q3 ++ vsub.s8 q0, q15 ++ ++ bx lr ++endfunc ++ ++@ a <- c <- b ++@ a in q0 ++@ c in q1 ++@ b in q2 ++@ Temp q9 ++@ ++@ q12, #0 ++@ d16, d17 xlat U, V ++@ q14.u16 #2 ++@ q15.u16 max ++function edge_16b_body_16 ++ vcgt.u16 q9, q0, q1 @ a > c -> -1 , otherwise 0 ++ vadd.u16 q9, q14, q9 ++ vcgt.u16 q0, q1, q0 @ c > a -> -1 , otherwise 0 ++ vsub.u16 q9, q9, q0 ++ vcgt.u16 q0, q2, q1 @ c < b -> -1 , otherwise 0 ++ vadd.u16 q9, q9, q0 ++ vcgt.u16 q0, q1, q2 @ c > b -> -1 , otherwise 0 ++ vsub.u16 q0, q9, q0 ++ ++ vmovn.s16 d0, q0 ++ @ d1 will have random contents that we transform but ++ @ that doesn't matter as we then discard them ++ vuzp.8 d0, d1 ++ ++ vtbl.8 d0, {d16}, d0 ++ vtbl.8 d1, {d17}, d1 ++ ++ vzip.8 d0, d1 ++ ++ vaddw.s8 q0, q1, d0 ++ ++ @ now clip ++ vmax.s16 q0, q12 ++ vmin.s16 q0, q15 ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_edge_[c_]xx_neon( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only ++@ int eo, [sp, #sp_base + 0] ++@ int width, [sp, #sp_base + 4] ++@ int height) [sp, #sp_base + 8] ++ ++@ Jumps via jump_tab with ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ EDGE_SRC_STRIDE [r3] ++@ (1 << \bit_depth) - 1 [r4] ++@ * xlat_table [r5] // setup_64b only ++@ int height [r12] ++@ ++@ 0 [q12] // > 8 bit ++@ 2 [q14] ++@ 128 [q15] // = 8 bit ++@ r4 [q15] // > 8 bit ++ ++.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0 ++ ++@ Build translate registers ++@ As translate values can only be 0-4 we don't care about junk in the rest ++@ of the register ++.if \is_chroma ++ ldr ip, [sp, #0] ++ push {r4-r6, lr} @ 16 bytes ++ vld1.8 {d16[2]}, [r3] ++ add r3, r3, #2 
++ vld1.8 {d17[2]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[0]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[0]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[1]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[1]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[3]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[3]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[4]}, [r3] ++ vld1.8 {d17[4]}, [ip] ++ movw r3, EDGE_SRC_STRIDE ++.set sp_base, 20 ++.else ++ add ip, r3, #4 ++ vld1.8 {d16[1]}, [r3] ++ add r3, r3, #2 ++ vld1.8 {d17[0]}, [ip] ++ add ip, ip, #2 ++ vld1.8 {d16[0]}, [r3] ++ add r3, r3, #6 ++ vld1.8 {d17[1]}, [ip] ++ vld1.8 {d16[2]}, [r3] ++ movw r3, EDGE_SRC_STRIDE ++ push {r4-r6, lr} @ 16 bytes ++ vzip.8 d16, d17 ++ vmov d17, d16 ++.set sp_base, 16 ++.endif ++ ++@ If setup_64b we need the xlat table on the stack ++.if \setup_64b ++ sub r5, sp, #16 ++.endif ++ ++@ Get jump address ++@ We have a special case for width 4 as the calling code doesn't detect it ++@ If we may have w4 then we add a 2nd jump table after the 1st ++.if \check_w4 ++ ldr r12, [sp, #sp_base + 4] @ width ++ adr r6, \jump_tab ++ ldr lr, [sp, #sp_base + 0] @ e0 ++ cmp r12, #8 ++ it lt ++ addlt r6, #16 ++.else ++ ldr lr, [sp, #sp_base + 0] @ e0 ++ adr r6, \jump_tab ++.endif ++ ++ ldr r12, [sp, #sp_base + 8] @ height ++ ++.if \bit_depth > 8 ++ movw r4, (1 << \bit_depth) - 1 ++.endif ++.if \setup_16b ++.if \bit_depth > 8 ++ vmov.i64 q12, #0 ++ vdup.16 q15, r4 ++ vmov.u16 q14, #2 ++.else ++ vmov.u8 q15, #128 ++ vmov.u8 q14, #2 ++.endif ++.endif ++ ++@ If setup_64b we need q4-q7 saved. 
++.if \setup_64b ++ vpush {q4-q8} @ 80 bytes, q8 pushed first ++.set sp_base, sp_base + 80 ++.endif ++ ++ ldr r6, [r6, lr, lsl #2] ++ ++@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes ++.if \do2 ++ push {r0, r1, r6, r12} ++.if jent_pic ++ bl 98f ++.else ++ blx r6 ++.endif ++ pop {r0, r1, r6, r12} ++ ++ add r0, #64 ++ add r1, #64 ++.endif ++ ++.if jent_pic ++ bl 98f ++.else ++ blx r6 ++.endif ++ ++@ Tidy up & return ++.if \setup_64b ++ vpop {q4-q8} @ spurious but harmless load of q8 ++.endif ++ pop {r4-r6, pc} ++ ++.if jent_pic && !\xjump ++@ Magic label - used as 98b in jent macro ++98: ++ add pc, r6 ++.endif ++.endm ++ ++ ++.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1 ++.endm ++ ++.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0 ++ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump ++.endm ++ ++ ++.macro edge_64b_e0, body_fn, pb ++ sub r1, #8 ++ mov r6, lr ++1: vldm r1, {d7-d16} ++ // load a ++ vext.8 q0, q3, q4, #(16 - \pb) ++ add r1, r3 ++ vext.8 q1, q4, q5, #(16 - \pb) ++ subs r12, #1 ++ vext.8 q2, q5, q6, #(16 - \pb) ++ vext.8 q3, q6, q7, #(16 - \pb) ++ pld [r1] ++ // load b ++ vext.8 q11, q7, q8, #\pb @ Avoid overwrite ++ pld [r1, #64] ++ vext.8 q8, q4, q5, #\pb ++ vext.8 q9, q5, q6, #\pb ++ vext.8 q10, q6, q7, #\pb ++ bl \body_fn ++ vstm r0, {q0-q3} ++ add r0, r0, r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_32bx2_e0, body_fn, pb ++ add r6, r1, r3 ++ push {r7,lr} ++ sub r1, #8 ++ add r7, r0, r2 ++ lsl r2, #1 ++1: vldmia r1, {d7-d12} ++ // load a ++ vext.8 q0, q3, q4, #16 - \pb ++ add r1, r1, r3, lsl #1 ++ vext.8 q1, q4, q5, #16 - \pb ++ subs r12, #2 ++ // load b ++ vext.8 q8, q4, q5, #\pb ++ vext.8 q9, q5, q6, #\pb ++ vldr d25, [r6, #-8] ++ vldmia r6, {d12-d15} ++ vldr d26, [r6, #32] ++ // load a ++ vext.8 q2, q12, q6, #16 - \pb ++ add r6, r6, r3, lsl #1 ++ vext.8 q3, q6, q7, #16 - \pb 
++ // load b ++ vext.8 q10, q6, q7, #\pb ++ vext.8 q11, q7, q13, #\pb ++ bl \body_fn ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ vst1.8 {q2-q3}, [r7, :256], r2 ++ bgt 1b ++ pop {r7,pc} ++.endm ++ ++.macro edge_16b_e0, body_fn, pb ++ sub r1, #8 ++ mov r6, lr ++1: vldmia r1, {d1-d4} ++ add r1, r3 ++ subs r12, #1 ++ vext.8 q0, q0, q1, #16 - \pb ++ vext.8 q2, q1, q2, #\pb ++ ++ bl \body_fn ++ vst1.8 {q0}, [r0, :128], r2 ++ bgt 1b ++ bx r6 ++.endm ++ ++.macro edge_8bx2_e0, body_fn, pb ++ add r6, r1, r3 ++ push {r7,lr} ++ sub r1, #8 ++ add r7, r0, r2 ++ lsl r2, #1 ++1: vldmia r1, {d1-d2} ++ vldmia r6, {d3-d4} ++ vldr d6, [r1, #16] ++ subs r12, #2 ++ vldr d7, [r6, #-8] ++ add r1, r1, r3, lsl #1 ++ vext.8 d0, d1, d2, #8 - \pb ++ add r6, r6, r3, lsl #1 ++ vext.8 d5, d3, d4, #\pb ++ vext.8 d4, d2, d6, #\pb ++ vext.8 d1, d7, d3, #8 - \pb ++ ++ bl \body_fn ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ bgt 1b ++ pop {r7,pc} ++.endm ++ ++.macro edge_4bx4_e0, body_fn, pb ++ add r6, r1, r3 ++ push {r7,lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ ++ tst r1, #4 ++ bne 2f ++1: // r1 (and assumed r6) are 64-bit aligned ++ vldr d2, [r1] ++ vldr d0, [r1, #-8] ++ add r1, r1, r3, lsl #1 ++ vldr d20, [r6] ++ subs r12, #4 ++ vldr d18, [r6, #-8] ++ add r6, r6, r3, lsl #1 ++ vldr d3, [r1] ++ vshr.u64 d4, d2, #\pb * 8 ++ vldr d1, [r1, #-8] ++ add r1, r1, r3, lsl #1 ++ vldr d21, [r6] ++ vext.8 d0, d0, d2, #8 - \pb ++ vldr d19, [r6,#-8] ++ add r6, r6, r3, lsl #1 ++ vshr.u64 d22, d20, #\pb * 8 ++ vext.8 d18, d18, d20, #8 - \pb ++ vshr.u64 d5, d3, #\pb * 8 ++ vext.8 d1, d1, d3, #8 - \pb ++ vshr.u64 d23, d21, #\pb * 8 ++ vext.8 d19, d19, d21, #8 - \pb ++ vsli.64 q1, q10, #32 ++ vsli.64 q2, q11, #32 ++ vsli.64 q0, q9, #32 ++ ++ bl \body_fn ++ vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ bgt 1b ++ pop {r7,pc} ++ ++2: // r1 (and assumed r6) are 32-bit but not 64-bit aligned ++ vldr d20, [r1, #-4] ++ vldr 
d22, [r1, #4] ++ add r1, r1, r3, lsl #1 ++ vldr d2, [r6, #-4] ++ subs r12, #4 ++ vldr d4, [r6, #4] ++ add r6, r6, r3, lsl #1 ++ vldr d21, [r1, #-4] ++ vshl.i64 d18, d20, #\pb * 8 ++ vldr d23, [r1, #4] ++ add r1, r1, r3, lsl #1 ++ vldr d3, [r6, #-4] ++ vext.8 d22, d20, d22, #\pb ++ vldr d5, [r6, #4] ++ add r6, r6, r3, lsl #1 ++ vshl.i64 d0, d2, #\pb * 8 ++ vext.8 d4, d2, d4, #\pb ++ vshl.i64 d19, d21, #\pb * 8 ++ vext.8 d23, d21, d23, #\pb ++ vshl.i64 d1, d3, #\pb * 8 ++ vext.8 d5, d3, d5, #\pb ++ vsri.64 q1, q10, #32 ++ vsri.64 q0, q9, #32 ++ vsri.64 q2, q11, #32 ++ ++ bl \body_fn ++ vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ bgt 2b ++ pop {r7,pc} ++.endm ++ ++ ++.macro edge_64b_e1, body_fn ++ sub r1, r3 ++ push {lr} ++ add r6, r1, #32 ++ // load a ++ vld1.8 {q0-q1}, [r1, :256], r3 ++ vld1.8 {q2-q3}, [r6, :256], r3 ++ // load c ++ vld1.8 {q4-q5}, [r1, :256], r3 ++ vld1.8 {q6-q7}, [r6, :256], r3 ++1: // load b ++ vld1.8 {q8-q9}, [r1, :256], r3 ++ subs r12, #1 ++ vld1.8 {q10-q11}, [r6, :256], r3 ++ bl \body_fn ++ vstm r0, {q0-q3} ++ // copy c to a ++ vmov.64 q0, q4 ++ pld [r1, r3] ++ vmov.64 q1, q5 ++ it le ++ pople {lr} ++ vmov.64 q2, q6 ++ it le ++ bxle lr ++ vmov.64 q3, q7 ++ add r0, r0, r2 ++ // copy b to c ++ vmov.64 q4, q8 ++ vmov.64 q5, q9 ++ vmov.64 q6, q10 ++ vmov.64 q7, q11 ++ b 1b ++.endm ++ ++.macro edge_32bx2_e1, body_fn ++ sub r6, r1, r3 ++ vld1.8 {q2-q3}, [r1, :256], r3 ++ vld1.8 {q0-q1}, [r6, :256] ++ mov r6, lr ++ ++1: @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vld1.8 {q8-q9}, [r1, :256], r3 ++ subs r12, #2 ++ vmov q4, q2 ++ vmov q5, q3 ++ vld1.8 {q10-q11}, [r1, :256], r3 ++ vmov q6, q8 ++ vmov q7, q9 ++ ++ bl \body_fn ++ ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ // copy b to a ++ vmov q0, q8 ++ vmov q1, q9 ++ vst1.8 {q2-q3}, [r0, :256], r2 ++ vmov q2, q10 ++ it le ++ 
bxle r6 ++ vmov q3, q11 ++ b 1b ++.endm ++ ++.macro edge_16b_e1, body_fn ++ sub r6, r1, r3 ++ // load c ++ vld1.8 {q1}, [r1, :128], r3 ++ // load a ++ vld1.8 {q0}, [r6, :128] ++ mov r6, lr ++1: // load b ++ vld1.8 {q2}, [r1, :128], r3 ++ bl \body_fn ++ vst1.8 {q0}, [r0, :128], r2 ++ subs r12, #1 ++ // copy c to a ++ vmov.64 q0, q1 ++ it le ++ bxle r6 ++ // copy b to c ++ vmov.64 q1, q2 ++ b 1b ++.endm ++ ++.macro edge_8bx2_e1, body_fn ++ sub r6, r1, r3 ++ lsl r3, #1 ++ push {r7, lr} ++ vld1.8 {d1}, [r1, :64], r3 ++ vld1.8 {d0}, [r6, :64], r3 ++ add r7, r0, r2 ++ lsl r2, #1 ++1: @ Given the data duplication here we could obviously do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ vld1.8 {d4}, [r6, :64], r3 ++ vmov d2, d1 ++ vld1.8 {d5}, [r1, :64], r3 ++ subs r12, #2 ++ vmov d3, d4 ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ ++ // copy b to a ++ vmov q0, q2 ++ bgt 1b ++ pop {r7, pc} ++.endm ++ ++.macro edge_4bx4_e1, body_fn ++ sub r6, r1, r3 ++ lsl r3, #1 ++ push {r7, lr} ++ vld1.32 {d0[1]}, [r1, :32], r3 ++ add r7, r0, r2 ++ vld1.32 {d0[0]}, [r6, :32], r3 ++ lsl r2, #1 ++ vld1.32 {d4[1]}, [r1, :32], r3 ++ vld1.32 {d4[0]}, [r6, :32], r3 ++ vld1.32 {d5[1]}, [r1, :32], r3 ++ vld1.32 {d5[0]}, [r6, :32], r3 ++ vmov d1, d4 ++ vext.32 d2, d0, d4, #1 ++ subs r12, #4 ++ vmov d22, d5 ++ vext.32 d3, d4, d5, #1 ++ b 2f ++ ++1: vst1.32 {d0[0]}, [r0, :32], r2 ++ vext.32 d2, d22, d4, #1 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vmov d0, d22 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vext.32 d3, d4, d5, #1 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ vmov d1, d4 ++ vmov d22, d5 ++2: @ Given the data duplication here we could probably do better than ++ @ using the generic body_fn but it almost certainly isn't worth it ++ bl \body_fn ++ ble 3f ++ vld1.32 {d4[0]}, [r6, :32], r3 ++ subs r12, #4 ++ vld1.32 {d4[1]}, [r1, :32], r3 ++ vld1.32 {d5[0]}, [r6, :32], r3 ++ vld1.32 {d5[1]}, [r1, :32], r3 ++ b 1b ++ ++3: vst1.32 {d0[0]}, 
[r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32] ++ vst1.32 {d1[1]}, [r7, :32] ++ pop {r7, pc} ++.endm ++ ++.macro edge_64b_e2, body_fn, pb ++ push {lr} ++ sub r6, r1, r3 ++ // load c and a ++ vld1.8 {q4-q5}, [r1, :128] ++ vldr d25, [r6, #-8] ++ vldmia r6, {d16-d23} ++ vext.8 q0, q12, q8, #16 - \pb ++ add r6, r1, #32 ++ vext.8 q1, q8, q9, #16 - \pb ++ add r1, r1, r3 ++ vext.8 q2, q9, q10, #16 - \pb ++ vld1.8 {q6-q7}, [r6, :128] ++ sub r6, r1, r3 ++ vext.8 q3, q10, q11, #16 - \pb ++ ++1: // load b ++ vldmia r1, {d16-d24} ++ vext.8 q8, q8, q9, #\pb ++ pld [r1, r3] ++ vext.8 q9, q9, q10, #\pb ++ subs r12, #1 ++ vext.8 q10, q10, q11, #\pb ++ vext.8 q11, q11, q12, #\pb ++ bl \body_fn ++ // next a is mostly available in c ++ vldr d25, [r6, #-8] ++ vstmia r0, {q0-q3} ++ vext.8 q3, q6, q7, #16 - \pb ++ it le ++ pople {lr} ++ vext.8 q2, q5, q6, #16 - \pb ++ it le ++ bxle lr ++ vext.8 q1, q4, q5, #16 - \pb ++ add r6, r6, r3 ++ vext.8 q0, q12, q4, #16 - \pb ++ add r0, r0, r2 ++ // next c is mostly available in b ++ vldr d8, [r1] ++ vext.8 d9, d16, d17, #8 - \pb ++ vext.8 q5, q8, q9, #16 - \pb ++ add r1, r1, r3 ++ vext.8 q6, q9, q10, #16 - \pb ++ pld [r6, #-8] ++ vext.8 q7, q10, q11, #16 - \pb ++ b 1b ++.endm ++ ++.macro edge_32bx2_e2, body_fn, pb ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ // load a and first 32b of c ++ vld1.8 {q4-q5}, [r1, :256] ++ vldr d25, [r6, #-8] ++ vld1.8 {q13-q14}, [r6, :256] ++ vldr d31, [r1, #-8] ++ add r6, r6, r3, lsl #1 ++ vext.8 q0, q12, q13, #16 - \pb ++ add r1, r1, r3, lsl #1 ++ vext.8 q1, q13, q14, #16 - \pb ++ vext.8 q2, q15, q4, #16 - \pb ++ vext.8 q3, q4, q5, #16 - \pb ++1: ++ // load second 32b of c and second 32b of b ++ vldmia r6, {d12-d16} ++ vldmia r1, {d20-d24} ++ // first 32b of b is mostly available in second 32b of c ++ vext.8 q9, q7, q8, #\pb ++ subs r12, #2 ++ vext.8 q8, q6, q7, #\pb ++ vext.8 q10, q10, q11, #\pb ++ vext.8 q11, q11, q12, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 
{q0-q1}, [r0, :256], r2 ++ vst1.8 {q2-q3}, [r7, :256], r2 ++ ble 2f ++ ++ vldr d25, [r6, #-8] ++ add r6, r6, r3, lsl #1 ++ vldr d8, [r1] ++ vext.8 d9, d20, d21, #8 - \pb ++ vldr d31, [r1, #-8] ++ add r1, r1, r3, lsl #1 ++ // first 32b of a is mostly available in second 32b of c ++ vext.8 q1, q6, q7, #16 - \pb ++ vext.8 q0, q12, q6, #16 - \pb ++ // first 32b of c is mostly available in second 32b of b ++ vext.8 q5, q10, q11, #16 - \pb ++ // second 32b of a is mostly available in first 32b of c ++ vext.8 q2, q15, q4, #16 - \pb ++ vext.8 q3, q4, q5, #16 - \pb ++ b 1b ++ ++2: pop {r7, pc} ++.endm ++ ++.macro edge_16b_e2, body_fn, pb ++ push {lr} ++ sub r6, r1, r3 ++ vld1.8 {q1}, [r1, :128], r3 ++ vldr d19, [r6, #-8] ++ vld1.8 {q10}, [r6, :128], r3 ++ ++1: vldmia r1, {d4-d6} ++ vext.8 q0, q9, q10, #16 - \pb ++ subs r12, #1 ++ vext.8 q2, q2, q3, #\pb ++ bl \body_fn ++ vst1.8 {q0}, [r0, :128], r2 ++ ble 2f ++ vmov q10, q1 ++ vldr d2, [r1] ++ add r1, r1, r3 ++ vldr d19, [r6, #-8] ++ add r6, r6, r3 ++ vext.8 d3, d4, d5, #8 - \pb ++ b 1b ++ ++2: pop {pc} ++.endm ++ ++.macro edge_8bx2_e2, body_fn, pb ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ vldr d18, [r6, #-8] ++ vldr d19, [r6] ++ add r6, r6, r3, lsl #1 ++ vldr d20, [r1, #-8] ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldmia r6, {d3-d4} ++ vld1.8 {d21-d22}, [r1, :128] ++ ++1: vext.8 d0, d18, d19, #8 - \pb ++ vext.8 d4, d3, d4, #\pb ++ vext.8 d1, d20, d2, #8 - \pb ++ subs r12, #2 ++ vext.8 d5, d21, d22, #\pb ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ ble 2f ++ ++ vldr d18, [r6, #-8] ++ add r6, r6, r3, lsl #1 ++ vldr d20, [r1, #-8] ++ vmov d19, d3 ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldmia r6, {d3-d4} ++ vld1.8 {d21-d22}, [r1, :128] ++ b 1b ++ ++2: pop {r7, pc} ++.endm ++ ++.macro edge_4bx4_e2, body_fn, pb ++ sub r6, r1, r3 ++ push {r7-r9, lr} ++ add r8, r1, r3 ++ sub r6, r6, #\pb ++ add r8, r8, #\pb ++ add r7, r0, r2 ++ lsl r2, #1 ++ ++1: 
vld1.32 {d0[0]}, [r6], r3 ++ subs r12, #4 ++ vld1.32 {d2[0]}, [r1], r3 ++ vld1.32 {d4[0]}, [r8], r3 ++ vld1.32 {d0[1]}, [r6], r3 ++ vld1.32 {d2[1]}, [r1], r3 ++ vld1.32 {d4[1]}, [r8], r3 ++ vld1.32 {d1[0]}, [r6], r3 ++ vld1.32 {d3[0]}, [r1], r3 ++ vld1.32 {d5[0]}, [r8], r3 ++ vld1.32 {d1[1]}, [r6], r3 ++ vld1.32 {d3[1]}, [r1], r3 ++ vld1.32 {d5[1]}, [r8], r3 ++ ++ bl \body_fn ++ ++ vst1.32 {d0[0]}, [r0, :32], r2 ++ vst1.32 {d0[1]}, [r7, :32], r2 ++ vst1.32 {d1[0]}, [r0, :32], r2 ++ vst1.32 {d1[1]}, [r7, :32], r2 ++ bgt 1b ++ ++ pop {r7-r9,pc} ++.endm ++ ++.macro edge_64b_e3, body_fn, pb ++ push {lr} ++ sub r6, r1, r3 ++ // load c and a ++ vld1.8 {q4-q5}, [r1, :128] ++ vldmia r6, {d16-d24} ++ vext.8 q0, q8, q9, #\pb ++ add r6, r1, #32 ++ vext.8 q1, q9, q10, #\pb ++ add r1, r1, r3 ++ vext.8 q2, q10, q11, #\pb ++ vld1.8 {q6-q7}, [r6, :128] ++ sub r6, r1, r3 ++ vext.8 q3, q11, q12, #\pb ++ ++1: // load b ++ vldr d17, [r1, #-8] ++ vldmia r1, {d18-d25} ++ vext.8 q8, q8, q9, #16 - \pb ++ pld [r1, r3] ++ vext.8 q9, q9, q10, #16 - \pb ++ subs r12, #1 ++ vext.8 q10, q10, q11, #16 - \pb ++ vext.8 q11, q11, q12, #16 - \pb ++ bl \body_fn ++ // next a is mostly available in c ++ vldr d24, [r6, #64] ++ vstmia r0, {q0-q3} ++ vext.8 q0, q4, q5, #\pb ++ it le ++ pople {lr} ++ vext.8 q1, q5, q6, #\pb ++ it le ++ bxle lr ++ vext.8 q2, q6, q7, #\pb ++ add r6, r6, r3 ++ vext.8 q3, q7, q12, #\pb ++ add r0, r0, r2 ++ // next c is mostly available in b ++ vext.8 d14, d22, d23, #\pb ++ vldr d15, [r1, #56] ++ vext.8 q4, q8, q9, #\pb ++ add r1, r1, r3 ++ vext.8 q5, q9, q10, #\pb ++ vext.8 q6, q10, q11, #\pb ++ b 1b ++.endm ++ ++.macro edge_32bx2_e3, body_fn, pb ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ // load a and first 32b of c ++ vldmia r1, {d8-d12} ++ vldmia r6, {d24-d28} ++ vext.8 q2, q4, q5, #\pb ++ add r6, r6, r3, lsl #1 ++ vext.8 q3, q5, q6, #\pb ++ add r1, r1, r3, lsl #1 ++ vext.8 q0, q12, q13, #\pb ++ vext.8 q1, q13, q14, #\pb ++1: ++ // load second 32b 
of c and second 32b of b ++ vldr d25, [r6, #-8] ++ subs r12, #2 ++ vldmia r6, {d12-d15} ++ vldr d27, [r1, #-8] ++ vldmia r1, {d20-d23} ++ // first 32b of b is mostly available in second 32b of c ++ vext.8 q8, q12, q6, #16 - \pb ++ vext.8 q9, q6, q7, #16 - \pb ++ vext.8 q11, q10, q11, #16 - \pb ++ vext.8 q10, q13, q10, #16 - \pb ++ ++ bl \body_fn ++ ++ vst1.8 {q0-q1}, [r0, :256], r2 ++ vst1.8 {q2-q3}, [r7, :256], r2 ++ ble 2f ++ ++ vldr d24, [r6, #32] ++ add r6, r6, r3, lsl #1 ++ vldr d11, [r1, #24] ++ vext.8 d10, d22, d23, #\pb ++ vldr d30, [r1, #32] ++ add r1, r1, r3, lsl #1 ++ // first 32b of a is mostly available in second 32b of c ++ vext.8 q0, q6, q7, #\pb ++ vext.8 q1, q7, q12, #\pb ++ // first 32b of c is mostly available in second 32b of b ++ vext.8 q4, q10, q11, #\pb ++ // second 32b of a is mostly available in first 32b of c ++ vext.8 q3, q5, q15, #\pb ++ vext.8 q2, q4, q5, #\pb ++ b 1b ++ ++2: pop {r7, pc} ++.endm ++ ++.macro edge_16b_e3, body_fn, pb ++ push {lr} ++ sub r6, r1, r3 ++ vld1.8 {q1}, [r1, :128], r3 ++ vldmia r6, {d18-d20} ++ add r6, r6, r3 ++ ++1: vldr d5, [r1, #-8] ++ vld1.8 {q3}, [r1, :128] ++ subs r12, #1 ++ vext.8 q0, q9, q10, #\pb ++ vext.8 q2, q2, q3, #16 - \pb ++ bl \body_fn ++ vst1.8 {q0}, [r0, :128], r2 ++ ble 2f ++ vmov q9, q1 ++ vldr d3, [r1, #8] ++ add r1, r1, r3 ++ vldr d20, [r6, #16] ++ add r6, r6, r3 ++ vext.8 d2, d4, d5, #\pb ++ b 1b ++ ++2: pop {pc} ++.endm ++ ++.macro edge_8bx2_e3, body_fn, pb ++ sub r6, r1, r3 ++ push {r7, lr} ++ add r7, r0, r2 ++ lsl r2, #1 ++ vld1.8 {d18-d19}, [r6] ++ add r6, r6, r3, lsl #1 ++ vldr d20, [r1, #8] ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldr d4, [r6, #-8] ++ vldr d3, [r6] ++ vldr d21, [r1, #-8] ++ vldr d22, [r1] ++ ++1: vext.8 d0, d18, d19, #\pb ++ vext.8 d4, d4, d3, #8 - \pb ++ vext.8 d1, d2, d20, #\pb ++ subs r12, #2 ++ vext.8 d5, d21, d22, #8 - \pb ++ ++ bl \body_fn ++ ++ vst1.8 {d0}, [r0, :64], r2 ++ vst1.8 {d1}, [r7, :64], r2 ++ ble 2f ++ ++ vldr d19, [r6, #8] ++ add r6, r6, r3, 
lsl #1 ++ vldr d20, [r1, #8] ++ vmov d18, d3 ++ vldr d2, [r1] ++ add r1, r1, r3, lsl #1 ++ vldr d4, [r6, #-8] ++ vldr d3, [r6] ++ vldr d21, [r1, #-8] ++ vldr d22, [r1] ++ b 1b ++ ++2: pop {r7, pc} ++.endm ++ ++.macro edge_4bx4_e3, body_fn, pb ++ @ e3 is the same as e2 but with the X offset reversed ++ edge_4bx4_e2 \body_fn, (-\pb) ++.endm ++ ++@ Jump table entry - if in thumb mode the bottom bit must be set ++@ ? There is probably a real asm instruction to do this but I haven't found it ++.macro jent lab ++.if jent_pic ++@ Could use .short here but due to A32 not supporting ldrh [lsl#1] it is ++@ simpler and clearer in the code to stick with .word ++T .word (0 + \lab) - (4 + 98b) ++A .word (0 + \lab) - (8 + 98b) ++.else ++T .word 1 + \lab ++A .word \lab ++.endif ++.endm ++ ++.macro edge_64b_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ ++0: edge_64b_e0 \body_fn, \pb ++10: edge_64b_e1 \body_fn ++20: edge_64b_e2 \body_fn, \pb ++30: edge_64b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ ++0: edge_32bx2_e0 \body_fn, \pb ++10: edge_32bx2_e1 \body_fn ++20: edge_32bx2_e2 \body_fn, \pb ++30: edge_32bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_16b_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++.endm ++ ++.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ jent 5f ++ jent 15f ++ jent 25f ++ jent 35f ++ ++0: edge_32bx2_e0 \body_fn_64b, \pb ++10: edge_32bx2_e1 \body_fn_64b ++20: edge_32bx2_e2 \body_fn_64b, \pb ++30: edge_32bx2_e3 \body_fn_64b, \pb ++5: edge_16b_e0 \body_fn_16b, \pb ++15: edge_16b_e1 \body_fn_16b ++25: edge_16b_e2 \body_fn_16b, \pb ++35: edge_16b_e3 \body_fn_16b, \pb ++.endm ++ ++.macro edge_16b_8bx2_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f
++ jent 5f ++ jent 15f ++ jent 25f ++ jent 35f ++ ++0: edge_16b_e0 \body_fn, \pb ++10: edge_16b_e1 \body_fn ++20: edge_16b_e2 \body_fn, \pb ++30: edge_16b_e3 \body_fn, \pb ++5: edge_8bx2_e0 \body_fn, \pb ++15: edge_8bx2_e1 \body_fn ++25: edge_8bx2_e2 \body_fn, \pb ++35: edge_8bx2_e3 \body_fn, \pb ++.endm ++ ++.macro edge_8bx2_4bx4_bodies, body_fn, pb ++ jent 0f ++ jent 10f ++ jent 20f ++ jent 30f ++ jent 5f ++ jent 15f ++ jent 25f ++ jent 35f ++ ++0: edge_8bx2_e0 \body_fn, \pb ++10: edge_8bx2_e1 \body_fn ++20: edge_8bx2_e2 \body_fn, \pb ++30: edge_8bx2_e3 \body_fn, \pb ++5: edge_4bx4_e0 \body_fn, \pb ++15: edge_4bx4_e1 \body_fn ++25: edge_4bx4_e2 \body_fn, \pb ++35: edge_4bx4_e3 \body_fn, \pb ++.endm ++ ++@ void ff_hevc_rpi_sao_edge_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_8_neon_8, export=1 ++ edge_16b_init 8, 0, 1, 99f ++99: ++ edge_8bx2_4bx4_bodies edge_16b_body_8, 1 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_16_neon_8, export=1 ++ edge_16b_init 8, 0, 0, 99f ++99: ++ edge_16b_bodies edge_16b_body_8, 1 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_32_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 1 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_64_neon_8( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ 
int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_64_neon_8, export=1 ++ edge_64b_init 8, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 1 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_8_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1 ++ edge_16b_init 8, 1, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_8, 2 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_16_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_32_neon_8( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1 ++ edge_64b_init 8, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_8, 2 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_8_neon_10, export=1 ++ edge_16b_init 10, 0, 1, 99f ++99: ++ edge_16b_8bx2_bodies edge_16b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_16_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int 
width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_16_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_32bx2_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_64_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++@ We simply split the 64 case into 2 vertical stripes ++@ and call the fns for w32 ++@ ++@ Calling code will always have src != dst so we don't have to worry ++@ about edge effects ++ ++function ff_hevc_rpi_sao_edge_64_neon_10, export=1 ++ edge_64b_init 10, 0, 1, 99f, xjump=1 ++endfunc ++ ++@ void ff_hevc_rpi_sao_edge_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ uint8_t *_src, [r1] ++@ int stride_dst, [r2] ++@ int16_t *_sao_offset_val, [r3] ++@ int eo, [sp, #0] ++@ int width, [sp, #4] ++@ int height) [sp, #8] ++ ++function ff_hevc_rpi_sao_edge_32_neon_10, export=1 ++ edge_64b_init 10, 0, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 2 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_8_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1 ++ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1 ++99: ++ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4 ++endfunc ++ ++@ ff_hevc_rpi_sao_edge_c_32_neon_10( ++@ uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1 ++ edge_64b_init 10, 1, 1, 99f, xjump=1 ++endfunc ++ ++ ++@ ff_hevc_rpi_sao_edge_c_16_neon_10( ++@
uint8_t *_dst, [r0] ++@ const uint8_t *_src, [r1] ++@ ptrdiff_t stride_dst, [r2] ++@ const int16_t *_sao_offset_val_u, [r3] ++@ const int16_t *_sao_offset_val_v, [sp, #0] ++@ int eo, [sp, #4] ++@ int width, [sp, #8] ++@ int height) [sp, #12] ++ ++function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1 ++ edge_64b_init 10, 1, 0, 99f ++99: ++ edge_64b_bodies edge_64b_body_16, 4 ++endfunc ++ +diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h +new file mode 100644 +index 0000000000..36a23a5bf9 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_arm.h +@@ -0,0 +1,28 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_ARM_HEVCPRED_ARM_H ++#define AVCODEC_ARM_HEVCPRED_ARM_H ++ ++#include "libavcodec/rpi_hevcpred.h" ++ ++void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth); ++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth); ++ ++#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c +new file mode 100644 +index 0000000000..80724d4cf3 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_init_arm.c +@@ -0,0 +1,35 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/attributes.h" ++#include "libavutil/cpu.h" ++#include "libavutil/arm/cpu.h" ++ ++#include "libavcodec/rpi_hevcpred.h" ++#include "rpi_hevcpred_arm.h" ++ ++av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth) ++{ ++ int cpu_flags = av_get_cpu_flags(); ++ ++ if (have_neon(cpu_flags)) ++ ff_hevc_rpi_pred_init_neon(c, bit_depth); ++} ++ +diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c +new file mode 100644 +index 0000000000..21e7700174 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_init_neon.c +@@ -0,0 +1,210 @@ ++/* ++ * Copyright (c) 2018 John Cox (for Raspberry Pi) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "rpi_hevcpred_arm.h" ++ ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32; ++ ++void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void 
ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const 
uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++void 
ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode); ++ ++void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++ ++void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void 
ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride); ++ ++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth) /* Fill the prediction function tables in *c with the NEON routines for bit_depth; depths other than 8 and 10 leave *c unchanged */ ++{ ++ switch (bit_depth) ++ { ++ case 8: /* NOTE(review): intra_filter[2] is assigned in the 10-bit case but not here - confirm that is intentional */ ++ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8; ++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8; ++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8 ++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16; ++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16; ++ ++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8; ++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8; ++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8; ++ 
c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8; ++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8; ++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8; ++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8; ++ ++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8; ++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8; ++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8; ++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8; ++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8; ++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8; ++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8; ++ ++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8; ++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8; ++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8; ++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8; ++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8; ++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8; ++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8; ++ ++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8; ++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8; ++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8; ++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8; ++ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8; ++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8; ++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8; ++ ++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8; ++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8; ++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8; ++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8; ++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8; ++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8; ++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8; ++ break; ++ case 10: /* intra_filter takes the _neon_16 routines and intra_filter_c the _neon_32 ones; every pred_* table takes the _neon_10 routines */ ++ c->intra_filter[0] = 
ff_hevc_rpi_intra_filter_4_neon_16; ++ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16; ++ c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16; ++ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32; ++ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32; ++ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32; ++ ++ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10; ++ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10; ++ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10; ++ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10; ++ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10; ++ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10; ++ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10; ++ ++ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10; ++ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10; ++ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10; ++ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10; ++ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10; ++ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10; ++ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10; ++ ++ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10; ++ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10; ++ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10; ++ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10; ++ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10; ++ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10; ++ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10; ++ ++ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10; ++ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10; ++ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10; ++ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10; ++ c->pred_planar_c[0] = 
ff_hevc_rpi_pred_planar_c_4_neon_10; ++ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10; ++ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10; ++ ++ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10; ++ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10; ++ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10; ++ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10; ++ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10; ++ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10; ++ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10; ++ break; ++ default: /* any other bit depth: this function assigns nothing */ ++ break; ++ } ++} ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S +new file mode 100644 +index 0000000000..fa8f67cf03 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S +@@ -0,0 +1,2984 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++/* ++ * General angular pred ++ * ++ * Horizontal (10) & Vertical (26) cases have their own file ++ * and are not dealt with properly here (luma filtering is missing) ++ * ++ * The inv_angle calculations are annoying - if it wasn't for the +128 ++ * rounding step then the result would simply be the loop counter :-( ++ */ ++ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++.text ++ ++@ Horizontal Patch functions ++@ These need a transpose before store so exist as smaller patches ++@ Patches can be called repeatedly without any intermediate setup ++@ to generate a horizontal block ++@ ++@ It is almost certainly the case that larger patch fns can be built ++@ and they would be a little faster, but we would still need the small ++@ fns and code size (or at least instruction cache size) is an issue ++@ given how much code we already have here ++ ++@ Generate 8x8 luma 8 patch ++@ ++@ r3 Out stride ++@ r4 Angle add ++@ r7 Inv angle (_up only) ++@ ++@ In/Out (updated) ++@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. 
r0 + patch width) ++@ r2 Left ptr - updated ++@ r10 Inv angle accumulator (_up only) ++@ r12 32 - angle frac (_down) or angle frac (_up) ++@ d0 Older reference samples ++@ d1=r8+r9 Newer reference samples ++@ d2 32 - angle frac ++@ d3 Angle frac ++@ q2 Partially computed next result (_up only) ++@ ++@ Temps ++@ r5 Loop counter ++@ r6 ++@ r7 (_down only) ++@ r11 (_up only) ++@ q2, q8-q11 ++ ++patch_h_down_8x8_8: ++ ldrd r8, r9, [r2] @ Left ++ rsb r12, r6, #32 /* r6 is read before it is written here: it must hold the angle frac on entry */ ++ vmov d0, r8, r9 ++ vdup.8 d3, r6 ++ lsr r8, #8 ++ vdup.8 d2, r12 ++ orr r8, r8, r9, lsl #24 ++ ldr r9, [r2, #5]! /* pre-indexed with writeback: r2 += 5 */ ++ vmov d1, r8, r9 ++ // drop through... ++patch_h_down_8x8_8_continue: ++ mov r5, #8 ++1: ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ vext.8 q8, q8, q9, #8 ++ itt mi ++ lsrmi r7, r8, #8 ++ vmovmi d0, r8, r9 ++ vdup.8 d2, r12 ++ vext.8 q9, q9, q10, #8 ++ it mi ++ orrmi r8, r7, r9, lsl #24 ++ vext.8 q10, q10, q11, #8 ++ it mi ++ ldrmi r9, [r2, #1]! ++ vmov d22, d23 ++ vrshrn.u16 d23, q2, #5 ++ it mi ++ vmovmi d1, r8, r9 ++ subs r5, #1 ++ vdup.8 d3, r6 ++ bne 1b ++ // drop through... ++store_tran_8x8_8: ++ vzip.8 d16, d17 ++ add r6, r0, r3 ++ vzip.8 d18, d19 ++ lsl r3, #1 /* stride doubled while storing; undone by the asr below */ ++ vzip.8 d20, d21 ++ add r5, r0, r3 ++ vzip.8 d22, d23 ++ vzip.16 q8, q9 ++ vzip.16 q10, q11 ++ vzip.32 q8, q10 ++ vzip.32 q9, q11 ++ vst1.8 {d16}, [r0]! /* writeback leaves r0 8 bytes on, at the start of the next patch */ ++ vst1.8 {d17}, [r6], r3 ++ vst1.8 {d20}, [r5], r3 ++ vst1.8 {d21}, [r6], r3 ++ vst1.8 {d18}, [r5], r3 ++ vst1.8 {d19}, [r6], r3 ++ vst1.8 {d22}, [r5] ++ asr r3, #1 ++ vst1.8 {d23}, [r6] ++ ++ bx lr ++ ++patch_h_up_8x8_8: ++ ldrd r8, r9, [r2] ++ rsb r6, r4, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r4 ++ lsr r11, r8, #24 ++ vdup.8 d2, r6 ++ ldr r8, [r2, #-1]! /* pre-indexed with writeback: r2 -= 1 */ 
++ orr r9, r11, r9, lsl #8 ++ vmov d1, r8, r9 ++ mov r12, r4 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++patch_h_up_8x8_8_continue: ++ mov r5, #8 ++1: ++ add r12, r4 ++ mov r11, #0 ++ cmp r12, #33 ++ it cs ++ addcs r10, r7 ++ vext.8 q8, q8, q9, #8 ++ itt cs ++ subcs r12, #32 ++ tstcs r10, #1<<31 ++ rsb r6, r12, #32 ++ it eq ++ asreq r11, r10, #8 ++ it cs ++ vmovcs d0, r8, r9 ++ vdup.8 d2, r6 ++ it cs ++ lsrcs r6, r8, #24 ++ vext.8 q9, q9, q10, #8 ++ itt cs ++ orrcs r9, r6, r9, lsl #8 ++ ldrbcs r11, [r1, r11] ++ vdup.8 d3, r12 ++ vext.8 q10, q10, q11, #8 ++ it hi ++ ldrbhi r11, [r2, #-1]! ++ vmov d22, d23 ++ vrshrn.u16 d23, q2, #5 ++ itt cs ++ orrcs r8, r11, r8, lsl #8 ++ vmovcs d1, r8, r9 ++ vmull.u8 q2, d0, d2 ++ subs r5, #1 ++ vmlal.u8 q2, d1, d3 ++ bne 1b ++ ++ b store_tran_8x8_8 /* plain branch, not bl: the bx lr in store_tran_8x8_8 returns to our caller */ ++ ++ ++.macro ADRT reg, val ++@ adr in T32 has enough range but not in A32 ++A adrl \reg, \val ++T adr \reg, \val ++.endm ++ ++@ ff_hevc_rpi_pred_angular_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_4_neon_8, export=1 ++ ldr r12, [sp] /* r12 = mode, read before the push below moves sp */ ++ push {r4-r8, lr} ++ ADRT r4, angle_2 - 2 /* byte table biased by the lowest mode (2) so [r4, r12] indexes by mode */ ++ ADRT r7, inv_angle - 11*2 /* halfword table biased by 11 entries; mode*2 is added next */ ++ add r7, r7, r12, lsl #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] /* r4 and r6 both load the same angle_2 entry */ ++ bge 26f /* mode >= 26 */ ++ cmp r12, #18 ++ bge 18f /* mode 18..25 */ ++ cmp r12, #10 ++ bge 10f /* mode 10..17; modes below 10 fall through */ ++ ++@ Down of Horizontal - works down left ++ ldr lr, [r2], #1 /* NOTE(review): r2 is _left per the header above, yet this load is tagged Top - confirm */ @ Top ++ rsb r12, r6, #32 ++ vmov s0, lr ++ vdup.8 d3, r6 ++ ldr lr, [r2], #1 ++ vdup.8 d2, r12 ++ vmov s2, lr ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi s0, lr ++ ldrmi lr, [r2], #1 ++ vdup.8 d2, r12 ++ it mi ++ vmovmi s2, lr ++ vdup.8 d3, r6 ++ mov r5, #2 ++1: ++ vrshrn.u16 d20, q2, #5 ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ vext.64 q8, q8, q9, #1 ++ it mi ++ 
vmovmi s0, lr ++ vext.64 q9, q9, q10, #1 ++ it mi ++ ldrmi lr, [r2], #1 ++ vdup.8 d2, r12 ++ it mi ++ vmovmi s2, lr ++ subs r5, #1 ++ vdup.8 d3, r6 ++ bne 1b ++ ++ vrshrn.u16 d20, q2, #5 ++ vmull.u8 q2, d0, d2 ++ add r12, r0, r3 ++ vmlal.u8 q2, d1, d3 ++ lsl r3, #1 ++ vext.64 q8, q8, q9, #1 ++ vext.64 q9, q9, q10, #1 ++ vrshrn.u16 d20, q2, #5 ++ ++98: ++ vst4.8 {d17[0], d18[0], d19[0], d20[0]}, [r0], r3 ++ vst4.8 {d17[1], d18[1], d19[1], d20[1]}, [r12], r3 ++ vst4.8 {d17[2], d18[2], d19[2], d20[2]}, [r0] ++ vst4.8 {d17[3], d18[3], d19[3], d20[3]}, [r12] ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ rsb r12, r6, #32 ++ ldr lr, [r2] @ Left ++ ldrb r2, [r2, #-1] @ Top-left ++ vmov s0, lr ++ vdup.8 d2, r12 ++ vdup.8 d3, r6 ++ orr lr, r2, lr, lsl #8 ++ vmov s2, lr ++ sub r8, r7, #128 ++ mov r5, #3 ++2: ++ vmull.u8 q2, d0, d2 ++ subs r12, r4 ++ vmlal.u8 q2, d1, d3 ++T it mi ++ addmi r12, #32 ++T asr r6, r8, #8 ++T it mi ++T ldrbmi r2, [r1, r6] ++A ldrbmi r2, [r1, r8, asr #8] ++ rsb r6, r12, #32 ++ vdup.8 d2, r12 ++ ittt mi ++ vmovmi s0, lr ++ orrmi lr, r2, lr, lsl #8 ++ vmovmi s2, lr ++ vrshrn.u16 d20, q2, #5 ++ vdup.8 d3, r6 ++ it mi ++ addmi r8, r7 ++ subs r5, #1 ++ vext.64 q8, q8, q9, #1 ++ vext.64 q9, q9, q10, #1 ++ bne 2b ++ ++ vmull.u8 q2, d0, d2 ++ add r12, r0, r3 ++ vmlal.u8 q2, d1, d3 ++ lsl r3, #1 ++ vrshrn.u16 d20, q2, #5 ++ b 98b ++ ++@ Left of vertical - works down left ++18: ++ ldrh r7, [r7] ++ rsb r12, r6, #32 ++ ldr lr, [r1] @ Top ++ ldrb r1, [r2, #-1] @ Top-left ++ vmov s0, lr ++ vdup.8 d2, r12 ++ vdup.8 d3, r6 ++ orr lr, r1, lr, lsl #8 ++ vmov s2, lr ++ sub r8, r7, #128 ++ mov r5, #3 ++2: ++ vmull.u8 q2, d0, d2 ++ subs r12, r4 ++ vmlal.u8 q2, d1, d3 ++T it mi ++ addmi r12, #32 ++T asr r6, r8, #8 ++T it mi ++T ldrbmi r1, [r2, r6] ++A ldrbmi r1, [r2, r8, asr #8] ++ rsb r6, r12, #32 ++ vdup.8 d2, r12 ++ ittt mi ++ vmovmi s0, lr ++ orrmi lr, r1, lr, lsl #8 ++ vmovmi s2, lr ++ vrshrn.u16 d4, q2, #5 ++ vdup.8 d3, r6 ++ 
it mi ++ addmi r8, r7 ++ subs r5, #1 ++ vst1.32 {d4[0]}, [r0], r3 ++ bne 2b ++ ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d4, q2, #5 ++ vst1.32 {d4[0]}, [r0] ++ ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ ldr lr, [r1], #1 @ Top ++ rsb r12, r6, #32 ++ vmov s0, lr ++ vdup.8 d3, r6 ++ ldr lr, [r1], #1 ++ vdup.8 d2, r12 ++ vmov s2, lr ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi s0, lr ++ ldrmi lr, [r1], #1 ++ vdup.8 d2, r12 ++ it mi ++ vmovmi s2, lr ++ vdup.8 d3, r6 ++ mov r5, #2 ++1: ++ vrshrn.u16 d6, q2, #5 ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ vst1.32 {d6[0]}, [r0], r3 ++ itt mi ++ vmovmi s0, lr ++ ldrmi lr, [r1], #1 ++ vdup.8 d2, r12 ++ it mi ++ vmovmi s2, lr ++ subs r5, #1 ++ vdup.8 d3, r6 ++ bne 1b ++ ++ vrshrn.u16 d6, q2, #5 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vst1.32 {d6[0]}, [r0], r3 ++ vrshrn.u16 d6, q2, #5 ++ vst1.32 {d6[0]}, [r0] ++ ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++ ++@ ff_hevc_rpi_pred_angular_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_8_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ bl patch_h_down_8x8_8 ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ bl patch_h_up_8x8_8 ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ ldrb lr, [r2, #-1] @ Top-left ++ ldrh r7, [r7] ++ vmov d0, r8, r9 ++ lsl 
r9, r9, #8 ++ vdup.8 d2, r12 ++ orr r9, r9, r8, lsr #24 ++ orr r8, lr, r8, lsl #8 ++ vmov d1, r8, r9 ++ sub r1, r7, #128 ++ mov r5, #7 ++1: ++ vdup.8 d3, r6 ++ vmull.u8 q2, d0, d2 ++ subs r12, r12, r4 ++ vmlal.u8 q2, d1, d3 ++ ittt mi ++ addmi lr, r2, r1, asr #8 ++ addmi r12, r12, #32 ++ vmovmi d0, r8, r9 ++ rsb r6, r12, #32 ++ itt mi ++ lslmi r9, r9, #8 ++ ldrbmi lr, [lr] ++ vdup.8 d2, r12 ++ vrshrn.u16 d4, q2, #5 ++ itttt mi ++ orrmi r9, r9, r8, lsr #24 ++ orrmi r8, lr, r8, lsl #8 ++ vmovmi d1, r8, r9 ++ addmi r1, r1, r7 ++ subs r5, r5, #1 ++ vst1.8 {d4}, [r0], r3 ++ bne 1b ++ ++ vdup.8 d3, r6 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d4, q2, #5 ++ vst1.8 {d4}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r6 ++ mov r5, #7 ++ lsr r8, #8 ++ vdup.8 d2, r12 ++ orr r8, r8, r9, lsl #24 ++ ldr r9, [r1, #5]! ++ vmov d1, r8, r9 ++1: ++ vmull.u8 q2, d0, d2 ++ subs r12, r4 ++ vmlal.u8 q2, d1, d3 ++ it mi ++ addmi r12, #32 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi d0, r8, r9 ++ lsrmi r8, #8 ++ vdup.8 d2, r12 ++ itt mi ++ orrmi r8, r8, r9, lsl #24 ++ ldrmi r9, [r1, #1]! 
++ vrshrn.u16 d6, q2, #5 ++ it mi ++ vmovmi d1, r8, r9 ++ vdup.8 d3, r6 ++ subs r5, #1 ++ vst1.8 {d6}, [r0], r3 ++ bne 1b ++ ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d6, q2, #5 ++ vst1.8 {d6}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_16_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8_continue ++ ++ add r2, r1, #8 @ restore r2, but 8 rows further down left ++ sub r0, #16 ++ mov r6, r4 ++ add r0, r0, r3, lsl #3 ++ ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8_continue ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ ++ push {r2} ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8_continue ++ pop {r2} ++ ++ sub r0, #16 ++ mov r10, #-128 ++ add r2, #8 ++ add r0, r0, r3, lsl #3 ++ sub r10, r10, r7, lsl #3 ++ ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8_continue ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.8 {q9}, [r1] ++ sub r1, r2, #1 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ vdup.8 d6, r6 ++ vext.8 q8, q9, q9, #15 ++ sub r8, r7, #128 ++ vld1.8 {d16[0]}, [r1] ++ vdup.8 d7, r12 ++ mov r5, #15 ++1: ++ vmull.u8 q0, d18, d7 ++ subs r12, r4 ++ vmlal.u8 q0, d16, d6 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d19, d7 ++ it cc ++ addcc r1, r2, r8, asr #8 ++ vmlal.u8 q1, d17, d6 ++ rsb r6, r12, #32 ++ vext.8 q10, q8, q8, #15 ++ sub r5, #1 ++ vld1.8 {d20[0]}, [r1] ++ it cc ++ addcc r8, r7 ++ 
vmov q11, q8 ++ teq r5, #0 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmull.u8 q0, d22, d7 ++ subs r12, r4 ++ vmlal.u8 q0, d20, d6 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d23, d7 ++ it cc ++ addcc r1, r2, r8, asr #8 ++ vmlal.u8 q1, d21, d6 ++ rsb r6, r12, #32 ++ vext.8 q8, q10, q10, #15 ++ sub r5, #1 ++ vld1.8 {d16[0]}, [r1] ++ it cc ++ addcc r8, r7 ++ vmov q9, q10 ++ teq r5, #0 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmull.u8 q0, d22, d7 ++ vmlal.u8 q0, d20, d6 ++ vmull.u8 q1, d23, d7 ++ vmlal.u8 q1, d21, d6 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmull.u8 q0, d18, d7 ++ vmlal.u8 q0, d16, d6 ++ vmull.u8 q1, d19, d7 ++ vmlal.u8 q1, d17, d6 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q9}, [r1]! ++ rsb r12, r6, #32 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vext.8 q8, q9, q9, #1 ++ vld1.8 {d17[7]}, [r1]! 
++ mov r5, #15 ++1: ++ vmull.u8 q0, d16, d6 ++ subs r12, r4 ++ vmlal.u8 q0, d18, d7 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d17, d6 ++ rsb r6, r12, #32 ++ vmlal.u8 q1, d19, d7 ++ sub r5, #1 ++ vext.8 q10, q8, q8, #1 ++ teq r5, #0 ++ vld1.8 {d21[7]}, [r1] ++ it cc ++ addcc r1, #1 ++ vmov q11, q8 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmull.u8 q0, d20, d6 ++ subs r12, r4 ++ vmlal.u8 q0, d22, d7 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d21, d6 ++ rsb r6, r12, #32 ++ vmlal.u8 q1, d23, d7 ++ sub r5, #1 ++ vext.8 q8, q10, q10, #1 ++ teq r5, #0 ++ vld1.8 {d17[7]}, [r1] ++ it cc ++ addcc r1, #1 ++ vmov q9, q10 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmull.u8 q0, d20, d6 ++ vmlal.u8 q0, d22, d7 ++ vmull.u8 q1, d21, d6 ++ vmlal.u8 q1, d23, d7 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmull.u8 q0, d16, d6 ++ vmlal.u8 q0, d18, d7 ++ vmull.u8 q1, d17, d6 ++ vmlal.u8 q1, d19, d7 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_32_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r10, #4 ++ mov r1, r2 ++1: ++ bl patch_h_down_8x8_8 ++ bl patch_h_down_8x8_8_continue ++ bl patch_h_down_8x8_8_continue ++ bl patch_h_down_8x8_8_continue ++ ++ add r2, r1, #8 @ 
restore r2, but 8 rows further down left ++ add r1, r1, #8 ++ mov r6, r4 ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #3 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ vmov.i8 d6, #1<<2 ++1: ++ push {r2,r10} ++ bl patch_h_up_8x8_8 ++ bl patch_h_up_8x8_8_continue ++ bl patch_h_up_8x8_8_continue ++ bl patch_h_up_8x8_8_continue ++ pop {r2,r10} ++ ++ vmov r8, s12 ++ sub r0, #32 ++ add r2, #8 ++ add r0, r0, r3, lsl #3 ++ sub r10, r10, r7, lsl #3 ++ vshr.u8 d6, #1 ++ teq r8, #0 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.8 {q0-q1}, [r1] ++ sub r9, r2, #1 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ mov r5, #32 ++1: ++ vld1.8 {d17[7]}, [r9] ++ add r8, r7 ++ vmov q2, q0 ++ vmov q3, q1 ++ add r9, r2, r8, asr #8 ++ vext.8 q1, q0, q1, #15 ++ vext.8 q0, q8, q0, #15 ++2: ++ vmull.u8 q10, d4, d19 ++ subs r12, r4 ++ vmlal.u8 q10, d0, d18 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q11, d5, d19 ++ rsb r6, r12, #32 ++ vmlal.u8 q11, d1, d18 ++ sub r5, #1 ++ vmull.u8 q12, d6, d19 ++ teq r5, #0 ++ vmlal.u8 q12, d2, d18 ++ vmull.u8 q13, d7, d19 ++ vmlal.u8 q13, d3, d18 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vrshrn.u16 d22, q12, #5 ++ vrshrn.u16 d23, q13, #5 ++ vst1.8 {q10-q11}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.8 {q0-q1}, [r1]! 
++ rsb r12, r6, #32 ++ vld1.8 {d16[0]}, [r5] ++ mov r5, #32 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++1: ++ vmov q2, q0 ++ add r1, #1 ++ vmov q3, q1 ++ vext.8 q0, q0, q1, #1 ++ vext.8 q1, q1, q8, #1 ++2: ++ vmull.u8 q10, d0, d18 ++ subs r12, r4 ++ vmlal.u8 q10, d4, d19 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q11, d1, d18 ++ rsb r6, r12, #32 ++ vmlal.u8 q11, d5, d19 ++ sub r5, #1 ++ vmull.u8 q12, d2, d18 ++ teq r5, #0 ++ vmlal.u8 q12, d6, d19 ++ vmull.u8 q13, d3, d18 ++ vmlal.u8 q13, d7, d19 ++ vld1.8 {d16[0]}, [r1] ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vrshrn.u16 d22, q12, #5 ++ vrshrn.u16 d23, q13, #5 ++ vst1.8 {q10-q11}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ Chroma 8 bit 4x4 patch fns ++ .text ++ ++patch_h_down_c_4x4_8: ++ ldrd r8, r9, [r2] @ Left ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r6 ++ lsr r8, #16 ++ vdup.8 d2, r12 ++ orr r8, r8, r9, lsl #16 ++ ldr r9, [r2, #6]! ++ vmov d1, r8, r9 ++ // drop through... ++patch_h_down_c_4x4_8_continue: ++ mov r5, #4 ++1: ++ subs r12, r4 ++ vmull.u8 q2, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmlal.u8 q2, d1, d3 ++ rsb r6, r12, #32 ++ vext.8 q8, q8, q9, #8 ++ it mi ++ lsrmi r7, r8, #16 ++ vmov d18, d19 ++ it mi ++ vmovmi d0, r8, r9 ++ vdup.8 d2, r12 ++ it mi ++ orrmi r8, r7, r9, lsl #16 ++ vrshrn.u16 d19, q2, #5 ++ itt mi ++ ldrmi r9, [r2, #2]! ++ vmovmi d1, r8, r9 ++ subs r5, #1 ++ vdup.8 d3, r6 ++ bne 1b ++ // drop through... ++store_tran_c_4x4_8: ++ vzip.16 d16, d17 ++ add r6, r0, r3 ++ vzip.16 d18, d19 ++ lsl r3, #1 ++ vzip.32 q8, q9 ++ add r5, r0, r3 ++ vst1.16 {d16}, [r0]! ++ vst1.16 {d17}, [r6], r3 ++ vst1.16 {d18}, [r5] ++ asr r3, #1 ++ vst1.16 {d19}, [r6] ++ ++ bx lr ++ ++patch_h_up_c_4x4_8: ++ ldrd r8, r9, [r2] ++ rsb r6, r4, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r4 ++ lsr r11, r8, #16 ++ vdup.8 d2, r6 ++ ldr r8, [r2, #-2]! 
++ orr r9, r11, r9, lsl #16 ++ vmov d1, r8, r9 ++ mov r12, r4 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++patch_h_up_c_4x4_8_continue: ++ mov r5, #4 ++1: ++ add r12, r4 ++ cmp r12, #33 ++ it cs ++ addcs r10, r7 ++ mov r11, #0 ++ itt cs ++ subcs r12, #32 ++ tstcs r10, #1<<31 ++ rsb r6, r12, #32 ++ it eq ++ asreq r11, r10, #7 ++ it cs ++ vmovcs d0, r8, r9 ++ it eq ++ biceq r11, #1 ++ vdup.8 d2, r6 ++ it cs ++ lsrcs r6, r8, #16 ++ vdup.8 d3, r12 ++ vext.8 q8, q8, q9, #8 ++ itt cs ++ orrcs r9, r6, r9, lsl #16 ++ ldrhcs r11, [r1, r11] ++ vmov d18, d19 ++ it hi ++ ldrhhi r11, [r2, #-2]! ++ vrshrn.u16 d19, q2, #5 ++ itt cs ++ orrcs r8, r11, r8, lsl #16 ++ vmovcs d1, r8, r9 ++ vmull.u8 q2, d0, d2 ++ subs r5, #1 ++ vmlal.u8 q2, d1, d3 ++ bne 1b ++ ++ b store_tran_c_4x4_8 ++ ++ ++@ ff_hevc_rpi_pred_angular_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ bl patch_h_down_c_4x4_8 ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ bl patch_h_up_c_4x4_8 ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ ldrh lr, [r2, #-2] @ Top-left ++ ldrh r7, [r7] ++ vmov d0, r8, r9 ++ lsl r9, r9, #16 ++ vdup.8 d2, r12 ++ orr r9, r9, r8, lsr #16 ++ orr r8, lr, r8, lsl #16 ++ vmov d1, r8, r9 ++ sub r1, r7, #128 ++ mov r5, #3 ++1: ++ vdup.8 d3, r6 ++ vmull.u8 q2, d0, d2 ++ subs r12, r12, r4 ++ vmlal.u8 q2, d1, d3 ++ itttt mi ++ addmi lr, r2, r1, asr #7 ++ bicmi lr, #1 ++ addmi r12, r12, #32 ++ vmovmi d0, 
r8, r9 ++ rsb r6, r12, #32 ++ itt mi ++ lslmi r9, r9, #16 ++ ldrhmi lr, [lr] ++ vdup.8 d2, r12 ++ vrshrn.u16 d4, q2, #5 ++ itttt mi ++ orrmi r9, r9, r8, lsr #16 ++ orrmi r8, lr, r8, lsl #16 ++ vmovmi d1, r8, r9 ++ addmi r1, r1, r7 ++ subs r5, r5, #1 ++ vst1.16 {d4}, [r0], r3 ++ bne 1b ++ ++ vdup.8 d3, r6 ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d4, q2, #5 ++ vst1.16 {d4}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.8 d3, r6 ++ mov r5, #3 ++ lsr r8, #16 ++ vdup.8 d2, r12 ++ orr r8, r8, r9, lsl #16 ++ ldr r9, [r1, #6]! ++ vmov d1, r8, r9 ++1: ++ vmull.u8 q2, d0, d2 ++ subs r12, r4 ++ vmlal.u8 q2, d1, d3 ++ it mi ++ addmi r12, #32 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi d0, r8, r9 ++ lsrmi r8, #16 ++ vdup.8 d2, r12 ++ itt mi ++ orrmi r8, r8, r9, lsl #16 ++ ldrmi r9, [r1, #2]! ++ vrshrn.u16 d6, q2, #5 ++ it mi ++ vmovmi d1, r8, r9 ++ vdup.8 d3, r6 ++ subs r5, #1 ++ vst1.16 {d6}, [r0], r3 ++ bne 1b ++ ++ vmull.u8 q2, d0, d2 ++ vmlal.u8 q2, d1, d3 ++ vrshrn.u16 d6, q2, #5 ++ vst1.16 {d6}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ sub r0, #16 ++ mov r6, r4 ++ add r0, r0, r3, lsl #2 ++ ++ bl 
patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8_continue ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ ++ push {r2} ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8_continue ++ pop {r2} ++ ++ sub r0, #16 ++ mov r10, #-128 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8_continue ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.8 {q9}, [r1] ++ sub r1, r2, #2 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ vdup.8 d6, r6 ++ vext.8 q8, q9, q9, #14 ++ sub r8, r7, #128 ++ vld1.16 {d16[0]}, [r1] ++ vdup.8 d7, r12 ++ mov r5, #7 ++1: ++ subs r12, r4 ++ vmull.u8 q0, d18, d7 ++ it cc ++ asrcc r1, r8, #8 ++ vmlal.u8 q0, d16, d6 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d19, d7 ++ it cc ++ addcc r1, r2, r1, lsl #1 ++ vmlal.u8 q1, d17, d6 ++ rsb r6, r12, #32 ++ vext.8 q10, q8, q8, #14 ++ sub r5, #1 ++ vld1.16 {d20[0]}, [r1] ++ it cc ++ addcc r8, r7 ++ vmov q11, q8 ++ teq r5, #0 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ subs r12, r4 ++ vmull.u8 q0, d22, d7 ++ it cc ++ asrcc r1, r8, #8 ++ vmlal.u8 q0, d20, d6 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d23, d7 ++ it cc ++ addcc r1, r2, r1, lsl #1 ++ vmlal.u8 q1, d21, d6 ++ rsb r6, r12, #32 ++ vext.8 q8, q10, q10, #14 ++ sub r5, #1 ++ vld1.16 {d16[0]}, [r1] ++ it cc ++ addcc r8, r7 ++ vmov q9, q10 ++ teq r5, #0 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmull.u8 q0, d22, d7 ++ vmlal.u8 q0, d20, d6 ++ vmull.u8 q1, d23, d7 ++ vmlal.u8 q1, d21, d6 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmull.u8 q0, d18, d7 ++ vmlal.u8 q0, d16, d6 ++ vmull.u8 q1, d19, d7 ++ vmlal.u8 q1, d17, d6 ++ vrshrn.u16 d0, q0, #5 ++ 
vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.8 {q9}, [r1]! ++ rsb r12, r6, #32 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vext.8 q8, q9, q9, #2 ++ vld1.16 {d17[3]}, [r1]! ++ mov r5, #7 ++1: ++ vmull.u8 q0, d16, d6 ++ subs r12, r4 ++ vmlal.u8 q0, d18, d7 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d17, d6 ++ rsb r6, r12, #32 ++ vmlal.u8 q1, d19, d7 ++ sub r5, #1 ++ vext.8 q10, q8, q8, #2 ++ teq r5, #0 ++ vld1.16 {d21[3]}, [r1] ++ it cc ++ addcc r1, #2 ++ vmov q11, q8 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmull.u8 q0, d20, d6 ++ subs r12, r4 ++ vmlal.u8 q0, d22, d7 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q1, d21, d6 ++ rsb r6, r12, #32 ++ vmlal.u8 q1, d23, d7 ++ sub r5, #1 ++ vext.8 q8, q10, q10, #2 ++ teq r5, #0 ++ vld1.16 {d17[3]}, [r1] ++ it cc ++ addcc r1, #2 ++ vmov q9, q10 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vdup.8 d6, r6 ++ vdup.8 d7, r12 ++ vst1.8 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmull.u8 q0, d20, d6 ++ vmlal.u8 q0, d22, d7 ++ vmull.u8 q1, d21, d6 ++ vmlal.u8 q1, d23, d7 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmull.u8 q0, d16, d6 ++ vmlal.u8 q0, d18, d7 ++ vmull.u8 q1, d17, d6 ++ vmlal.u8 q1, d19, d7 ++ vrshrn.u16 d0, q0, #5 ++ vrshrn.u16 d1, q1, #5 ++ vst1.8 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp 
r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r10, #4 ++ mov r1, r2 ++1: ++ bl patch_h_down_c_4x4_8 ++ bl patch_h_down_c_4x4_8_continue ++ bl patch_h_down_c_4x4_8_continue ++ bl patch_h_down_c_4x4_8_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ add r1, r1, #4*2 ++ mov r6, r4 ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ vmov.i8 d6, #1<<2 ++1: ++ push {r2, r10} ++ bl patch_h_up_c_4x4_8 ++ bl patch_h_up_c_4x4_8_continue ++ bl patch_h_up_c_4x4_8_continue ++ bl patch_h_up_c_4x4_8_continue ++ pop {r2, r10} ++ ++ vmov r8, s12 ++ sub r0, #32 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ vshr.u8 d6, #1 ++ teq r8, #0 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.8 {q0-q1}, [r1] ++ sub r9, r2, #2 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ mov r5, #16 ++1: ++ vld1.16 {d17[3]}, [r9] ++ add r8, r7 ++ vmov q2, q0 ++ vmov q3, q1 ++ asr r9, r8, #8 ++ vext.8 q1, q0, q1, #14 ++ add r9, r2, r9, lsl #1 ++ vext.8 q0, q8, q0, #14 ++2: ++ vmull.u8 q10, d4, d19 ++ subs r12, r4 ++ vmlal.u8 q10, d0, d18 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q11, d5, d19 ++ rsb r6, r12, #32 ++ vmlal.u8 q11, d1, d18 ++ sub r5, #1 ++ vmull.u8 q12, d6, d19 ++ teq r5, #0 ++ vmlal.u8 q12, d2, d18 ++ vmull.u8 q13, d7, d19 ++ vmlal.u8 q13, d3, d18 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vrshrn.u16 d22, q12, #5 ++ vrshrn.u16 d23, q13, #5 ++ vst1.8 {q10-q11}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.8 {q0-q1}, [r1]! 
++ rsb r12, r6, #32 ++ vld1.16 {d16[0]}, [r5] ++ mov r5, #16 ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++1: ++ vmov q2, q0 ++ add r1, #2 ++ vmov q3, q1 ++ vext.8 q0, q0, q1, #2 ++ vext.8 q1, q1, q8, #2 ++2: ++ vmull.u8 q10, d0, d18 ++ subs r12, r4 ++ vmlal.u8 q10, d4, d19 ++ it cc ++ addcc r12, #32 ++ vmull.u8 q11, d1, d18 ++ rsb r6, r12, #32 ++ vmlal.u8 q11, d5, d19 ++ sub r5, #1 ++ vmull.u8 q12, d2, d18 ++ teq r5, #0 ++ vmlal.u8 q12, d6, d19 ++ vmull.u8 q13, d3, d18 ++ vmlal.u8 q13, d7, d19 ++ vld1.16 {d16[0]}, [r1] ++ vdup.8 d18, r6 ++ vdup.8 d19, r12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vrshrn.u16 d22, q12, #5 ++ vrshrn.u16 d23, q13, #5 ++ vst1.8 {q10-q11}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++@------------------------------------------------------------------------------ ++@ Data ++ ++ .text ++ .balign 64 ++angle_2: ++ .byte 32 ++ .byte 26, 21, 17, 13, 9, 5, 2, 0 ++ @ Sign inverted from standards table ++ .byte 2, 5, 9, 13, 17, 21, 26, 32 ++ .byte 26, 21, 17, 13, 9, 5, 2, 0 ++ @ Standard sign ++ .byte 2, 5, 9, 13, 17, 21, 26, 32 ++ ++ .balign 2 ++ ++ @ Sign inverted from standards table ++inv_angle: ++ .short 4096, 1638, 910, 630, 482, 390, 315 ++ .short 256 ++ .short 315, 390, 482, 630, 910, 1638, 4096 ++ ++@------------------------------------------------------------------------------ ++@ ++@ 10 bit fns ++@ Should work for 9 & 11 bit as there is no actual bit-depth specific code ++@ but runs out of register width for 12+ bit ++ ++ .text ++ .balign 64 ++ ++patch_h_down_4x4_10: ++ ldrd r8, r9, [r2] @ Left ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.16 d3, r6 ++ lsr r8, #16 ++ vdup.16 d2, r12 ++ orr r8, r8, r9, lsl #16 ++ ldr r9, [r2, #6]! ++ vmov d1, r8, r9 ++ // drop through... 
++patch_h_down_4x4_10_continue: ++ mov r5, #4 ++1: ++ subs r12, r4 ++ vmul.u16 d4, d0, d2 ++ it mi ++ addmi r12, #32 ++ vmla.u16 d4, d1, d3 ++ rsb r6, r12, #32 ++ vext.16 q8, q8, q9, #4 ++ it mi ++ lsrmi r7, r8, #16 ++ vmov d18, d19 ++ it mi ++ vmovmi d0, r8, r9 ++ vdup.16 d2, r12 ++ it mi ++ orrmi r8, r7, r9, lsl #16 ++ vrshr.u16 d19, d4, #5 ++ itt mi ++ ldrmi r9, [r2, #2]! ++ vmovmi d1, r8, r9 ++ subs r5, #1 ++ vdup.16 d3, r6 ++ bne 1b ++ // drop through... ++store_tran_4x4_10: ++ vzip.16 d16, d17 ++ add r6, r0, r3 ++ vzip.16 d18, d19 ++ lsl r3, #1 ++ vzip.32 q8, q9 ++ add r5, r0, r3 ++ vst1.16 {d16}, [r0]! ++ vst1.16 {d17}, [r6], r3 ++ vst1.16 {d18}, [r5] ++ asr r3, #1 ++ vst1.16 {d19}, [r6] ++ ++ bx lr ++ ++patch_h_up_4x4_10: ++ ldrd r8, r9, [r2] ++ rsb r6, r4, #32 ++ vmov d0, r8, r9 ++ vdup.16 d3, r4 ++ lsr r11, r8, #16 ++ vdup.16 d2, r6 ++ ldr r8, [r2, #-2]! ++ orr r9, r11, r9, lsl #16 ++ vmov d1, r8, r9 ++ mov r12, r4 ++ vmul.u16 d4, d0, d2 ++ vmla.u16 d4, d1, d3 ++patch_h_up_4x4_10_continue: ++ mov r5, #4 ++1: ++ add r12, r4 ++ cmp r12, #33 ++ it cs ++ addcs r10, r7 ++ mov r11, #0 ++ itt cs ++ subcs r12, #32 ++ tstcs r10, #1<<31 ++ rsb r6, r12, #32 ++ it eq ++ asreq r11, r10, #7 ++ it cs ++ vmovcs d0, r8, r9 ++ it eq ++ biceq r11, #1 ++ vdup.16 d2, r6 ++ it cs ++ lsrcs r6, r8, #16 ++ vdup.16 d3, r12 ++ vext.16 q8, q8, q9, #4 ++ itt cs ++ orrcs r9, r6, r9, lsl #16 ++ ldrhcs r11, [r1, r11] ++ vmov d18, d19 ++ it hi ++ ldrhhi r11, [r2, #-2]! 
++ vrshr.u16 d19, d4, #5 ++ itt cs ++ orrcs r8, r11, r8, lsl #16 ++ vmovcs d1, r8, r9 ++ vmul.u16 d4, d0, d2 ++ subs r5, #1 ++ vmla.u16 d4, d1, d3 ++ bne 1b ++ ++ b store_tran_4x4_10 ++ ++ ++@ ff_hevc_rpi_pred_angular_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_4_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ bl patch_h_down_4x4_10 ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ bl patch_h_up_4x4_10 ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ ldrh lr, [r2, #-2] @ Top-left ++ ldrh r7, [r7] ++ vmov d0, r8, r9 ++ lsl r9, r9, #16 ++ vdup.16 d2, r12 ++ orr r9, r9, r8, lsr #16 ++ orr r8, lr, r8, lsl #16 ++ vmov d1, r8, r9 ++ sub r1, r7, #128 ++ mov r5, #3 ++1: ++ sel lr, lr, lr @ force pipeline 0 on Cortex-A53 ++ vdup.16 d3, r6 ++ vmul.u16 d4, d0, d2 ++ subs r12, r12, r4 ++ vmla.u16 d4, d1, d3 ++ itttt mi ++ addmi lr, r2, r1, asr #7 ++ bicmi lr, #1 ++ addmi r12, r12, #32 ++ vmovmi d0, r8, r9 ++ rsb r6, r12, #32 ++ itt mi ++ lslmi r9, r9, #16 ++ ldrhmi lr, [lr] ++ vdup.16 d2, r12 ++ vrshr.u16 d4, d4, #5 ++ itttt mi ++ orrmi r9, r9, r8, lsr #16 ++ orrmi r8, lr, r8, lsl #16 ++ vmovmi d1, r8, r9 ++ addmi r1, r1, r7 ++ subs r5, r5, #1 ++ vst1.16 {d4}, [r0], r3 ++ bne 1b ++ ++ vdup.16 d3, r6 ++ nop @ force next insn into pipeline 0 to enable ++ vmul.u16 d4, d0, d2 @ vmla to execute back-to-back on Cortex-A53 ++ vmla.u16 d4, d1, d3 ++ vrshr.u16 d4, d4, #5 ++ vst1.16 {d4}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - 
works along top - left unused ++26: ++ ldrd r8, r9, [r1] @ Top ++ rsb r12, r6, #32 ++ vmov d0, r8, r9 ++ vdup.16 d3, r6 ++ lsr r8, #16 ++ vdup.16 d2, r12 ++ orr r8, r8, r9, lsl #16 ++ ldr r9, [r1, #6]! ++ vmov d1, r8, r9 ++ mov r5, #3 ++1: ++ vmul.u16 d4, d0, d2 ++ subs r12, r4 ++ vmla.u16 d4, d1, d3 ++ it mi ++ addmi r12, #32 ++ rsb r6, r12, #32 ++ itt mi ++ vmovmi d0, r8, r9 ++ lsrmi r8, #16 ++ vdup.16 d2, r12 ++ itt mi ++ orrmi r8, r8, r9, lsl #16 ++ ldrmi r9, [r1, #2]! ++ vrshr.u16 d4, d4, #5 ++ it mi ++ vmovmi d1, r8, r9 ++ vdup.16 d3, r6 ++ subs r5, #1 ++ vst1.16 {d4}, [r0], r3 ++ bne 1b ++ ++ vmul.u16 d4, d0, d2 ++ vmla.u16 d4, d1, d3 ++ vrshr.u16 d4, d4, #5 ++ vst1.16 {d4}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_8_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ sub r0, #16 ++ mov r6, r4 ++ add r0, r0, r3, lsl #2 ++ ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10_continue ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ ++ push {r2} ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10_continue ++ pop {r2} ++ ++ sub r0, #16 ++ mov r10, #-128 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10_continue ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down 
left ++18: ++ vld1.16 {q9}, [r1] ++ sub r1, r2, #2 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ vdup.16 q2, r6 ++ vext.16 q8, q9, q9, #7 ++ sub r8, r7, #128 ++ vld1.16 {d16[0]}, [r1] ++ vdup.16 q3, r12 ++ mov r5, #7 ++1: ++ vmul.u16 q0, q9, q3 ++ subs r12, r4 ++ vmla.u16 q0, q8, q2 ++ ittt cc ++ asrcc r1, r8, #8 ++ addcc r12, #32 ++ addcc r1, r2, r1, lsl #1 ++ vext.16 q10, q8, q8, #7 ++ rsb r6, r12, #32 ++ vmov q11, q8 ++ sub r5, #1 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r8, r7 ++ vld1.16 {d20[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmul.u16 q0, q11, q3 ++ subs r12, r4 ++ vmla.u16 q0, q10, q2 ++ ittt cc ++ asrcc r1, r8, #8 ++ addcc r12, #32 ++ addcc r1, r2, r1, lsl #1 ++ vext.16 q8, q10, q10, #7 ++ rsb r6, r12, #32 ++ vmov q9, q10 ++ sub r5, #1 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r8, r7 ++ vld1.16 {d16[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmul.u16 q0, q11, q3 ++ vmla.u16 q0, q10, q2 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmul.u16 q0, q9, q3 ++ vmla.u16 q0, q8, q2 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q9}, [r1]! ++ rsb r12, r6, #32 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vext.16 q8, q9, q9, #1 ++ vld1.16 {d17[3]}, [r1]! 
++ mov r5, #7 ++1: ++ vmul.u16 q0, q8, q2 ++ subs r12, r4 ++ vmla.u16 q0, q9, q3 ++ it cc ++ addcc r12, #32 ++ vext.16 q10, q8, q8, #1 ++ rsb r6, r12, #32 ++ vld1.16 {d21[3]}, [r1] ++ sub r5, #1 ++ vmov q11, q8 ++ teq r5, #0 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r1, #2 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmul.u16 q0, q10, q2 ++ subs r12, r4 ++ vmla.u16 q0, q11, q3 ++ it cc ++ addcc r12, #32 ++ vext.16 q8, q10, q10, #1 ++ rsb r6, r12, #32 ++ vld1.16 {d17[3]}, [r1] ++ sub r5, #1 ++ vmov q9, q10 ++ teq r5, #0 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r1, #2 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmul.u16 q0, q10, q2 ++ vmla.u16 q0, q11, q3 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++4: ++ bcc 3b ++5: ++ vmul.u16 q0, q8, q2 ++ vmla.u16 q0, q9, q3 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_16_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r10, #4 ++ mov r1, r2 ++1: ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ add r1, r1, #4*2 ++ mov r6, r4 ++ sub r0, #32 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r10, #-128 ++ vmov.i8 d6, #1<<2 
++1: ++ push {r2, r10} ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ pop {r2, r10} ++ ++ vmov r8, s12 ++ sub r0, #32 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ vshr.u8 d6, #1 ++ teq r8, #0 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.16 {q0-q1}, [r1] ++ sub r9, r2, #2 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ mov r5, #16 ++1: ++ vld1.16 {d17[3]}, [r9] ++ add r8, r7 ++ vmov q2, q0 ++ vmov q3, q1 ++ asr r9, r8, #8 ++ vext.16 q1, q0, q1, #7 ++ add r9, r2, r9, lsl #1 ++ vext.16 q0, q8, q0, #7 ++2: ++ vmul.u16 q11, q2, q10 ++ subs r12, r4 ++ vmla.u16 q11, q0, q9 ++ it cc ++ addcc r12, #32 ++ vmul.u16 q12, q3, q10 ++ rsb r6, r12, #32 ++ vmla.u16 q12, q1, q9 ++ sub r5, #1 ++ teq r5, #0 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ vrshr.u16 q11, q11, #5 ++ vrshr.u16 q12, q12, #5 ++ vst1.16 {q11-q12}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.16 {q0-q1}, [r1]! 
++ rsb r12, r6, #32 ++ vld1.16 {d16[0]}, [r5] ++ mov r5, #16 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++1: ++ vmov q2, q0 ++ add r1, #2 ++ vmov q3, q1 ++ vext.16 q0, q0, q1, #1 ++ vext.16 q1, q1, q8, #1 ++2: ++ vmul.u16 q11, q0, q9 ++ subs r12, r4 ++ vmla.u16 q11, q2, q10 ++ it cc ++ addcc r12, #32 ++ vmul.u16 q12, q1, q9 ++ rsb r6, r12, #32 ++ vmla.u16 q12, q3, q10 ++ sub r5, #1 ++ vld1.16 {d16[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ vrshr.u16 q11, q11, #5 ++ vrshr.u16 q12, q12, #5 ++ vst1.16 {q11-q12}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_32_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r11, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #1 ++ vpush {d8} ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ add sp, #8 ++ mov r10, #8 ++ mov r1, r2 ++1: ++ bl patch_h_down_4x4_10 ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ bl patch_h_down_4x4_10_continue ++ ++ add r2, r1, #4*2 @ restore r2, but 4 rows further down left ++ add r1, r1, #4*2 ++ mov r6, r4 ++ sub r0, #64 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ add sp, #8 ++ ldrh r7, [r7] ++ mov r10, #-128 ++ vmov.i8 d6, #1<<6 ++1: ++ push {r2, r10} ++ bl patch_h_up_4x4_10 ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl 
patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ bl patch_h_up_4x4_10_continue ++ pop {r2, r10} ++ ++ vmov r8, s12 ++ sub r0, #64 ++ add r2, #8 ++ add r0, r0, r3, lsl #2 ++ sub r10, r10, r7, lsl #2 ++ vshr.u8 d6, #1 ++ teq r8, #0 ++ bne 1b ++ ++ pop {r4-r11, pc} ++ ++@ Left of vertical - works down left ++18: ++ add r5, r1, #32 ++ vmov d0, d9 @ stash callee-saved d9 before q4 (d8:d9) is loaded ++ vld1.16 {q1-q2}, [r1] ++ rsb r12, r6, r6, lsl #16 ++ vld1.16 {q3-q4}, [r5] ++ sub r9, r2, #2 ++ rsb r4, r12, #0 ++ rsb r12, r12, #32 << 16 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vmov s2, r12 ++ add r10, r0, #32 ++ mov r5, #32 ++1: ++ vld1.16 {d1[3]}, [r9] ++ add r8, r7 ++ vmov q11, q4 ++ vmov q10, q3 ++ asr r9, r8, #8 ++ vmov q9, q2 ++ add r9, r2, r9, lsl #1 ++ vmov q8, q1 ++ vext.16 q4, q3, q4, #7 ++ vext.16 q3, q2, q3, #7 ++ vext.16 q2, q1, q2, #7 ++ vext.16 q1, q0, q1, #7 ++2: ++ vmul.u16 q12, q8, d1[1] ++ adds r12, r4 ++ vmla.u16 q12, q1, d1[0] ++ it cc ++ addcc r12, #32 << 16 ++ vmul.u16 q13, q9, d1[1] ++ it cc ++ subcc r12, #32 ++ vmla.u16 q13, q2, d1[0] ++ sub r5, #1 ++ vmul.u16 q14, q10, d1[1] ++ teq r5, #0 ++ vmla.u16 q14, q3, d1[0] ++ vmul.u16 q15, q11, d1[1] ++ vmla.u16 q15, q4, d1[0] ++ vmov s2, r12 ++ vrshr.u16 q12, q12, #5 ++ vrshr.u16 q13, q13, #5 ++ vrshr.u16 q14, q14, #5 ++ vrshr.u16 q15, q15, #5 ++ vst1.16 {q12-q13}, [r0], r3 ++ vst1.16 {q14-q15}, [r10], r3 ++ bhi 2b ++ bne 1b ++ ++ vpop {d8} ++ vmov d9, d0 @ restore d9 from its stash (d0 is not written by the loop) ++ pop {r4-r11, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vmov d1, d9 @ stash callee-saved d9 before q4 (d8:d9) is loaded ++ vld1.16 {q1-q2}, [r1] ++ rsb r12, r6, r6, lsl #16 ++ vld1.16 {q3-q4}, [r5] ++ add r1, r1, #64 ++ rsb r4, r12, #0 ++ rsb r12, r12, #32 << 16 ++ vmov s1, r12 ++ add r10, r0, #32 ++ mov r5, #32 ++1: ++ vld1.16 {d0[0]}, [r1]!
++ vmov q8, q1 ++ vmov q9, q2 ++ vmov q10, q3 ++ vmov q11, q4 ++ vext.16 q1, q1, q2, #1 ++ vext.16 q2, q2, q3, #1 ++ vext.16 q3, q3, q4, #1 ++ vext.16 q4, q4, q0, #1 ++2: ++ vmul.u16 q12, q1, d0[2] ++ adds r12, r4 ++ vmla.u16 q12, q8, d0[3] ++ it cc ++ addcc r12, #32 << 16 ++ vmul.u16 q13, q2, d0[2] ++ it cc ++ subcc r12, #32 ++ vmla.u16 q13, q9, d0[3] ++ sub r5, #1 ++ vmul.u16 q14, q3, d0[2] ++ teq r5, #0 ++ vmla.u16 q14, q10, d0[3] ++ vmul.u16 q15, q4, d0[2] ++ vmla.u16 q15, q11, d0[3] ++ vmov s1, r12 ++ vrshr.u16 q12, q12, #5 ++ vrshr.u16 q13, q13, #5 ++ vrshr.u16 q14, q14, #5 ++ vrshr.u16 q15, q15, #5 ++ vst1.16 {q12-q13}, [r0], r3 ++ vst1.16 {q14-q15}, [r10], r3 ++ bhi 2b ++ bne 1b ++ ++ vpop {d8} ++ vmov d9, d1 ++ pop {r4-r11, pc} ++ ++endfunc ++ ++ ++ ++@ Generate 4x4 chroma patch ++@ ++@ In (const) ++@ r1 Up ptr (_up only) ++@ r3 Out stride ++@ r4 Angle add ++@ r7 Inv angle (_up only) ++@ ++@ In/Out (updated) ++@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width) ++@ r2 Left ptr - updated ++@ r6 Angle frac (init to r4 + 32) ++@ r8 Inv angle accumulator ++@ q2 Cur Line - load before 1st call for down - set by _up ++@ q8 Cur Line - load before 1st call for up - set by _down ++@ ++@ Temps ++@ r5 Loop counter ++@ r12 ++@ d0, q1, q12-q15 ++ ++patch_h_down_c_4x4_10: ++ vld1.16 {q12}, [r2]! ++ rsb r12, r6, #32 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ mov r5, #4 ++1: ++ vmov q13, q12 ++ vext.16 q12, q12, q12, #2 ++ vld1.32 {d25[1]}, [r2]! ++patch_h_down_c_4x4_10_continue: ++2: ++ vmov q8, q9 ++ subs r12, r4 ++ vmul.u16 q0, q13, q3 ++ it cc ++ addcc r12, #32 ++ vmla.u16 q0, q12, q2 ++ rsb r6, r12, #32 ++ vmov q9, q10 ++ sub r5, #1 ++ vmov q10, q11 ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vrshr.u16 q11, q0, #5 ++ bhi 2b ++ bne 1b ++ ++ bcs 3f ++ vmov q13, q12 ++ vext.16 q12, q12, q12, #2 ++ vld1.32 {d25[1]}, [r2]! 
++3: ++ ++store_tran_c_4x4_10: ++T add r6, r0, r3 ++ vzip.32 q8, q10 ++A add r6, r0, r3 ++T lsl r3, #1 ++ vzip.32 q9, q11 ++A add r5, r0, r3, lsl #1 ++T add r5, r0, r3 ++ vst2.32 {d16,d18}, [r0]! ++A lsl r3, #1 ++ vst2.32 {d17,d19}, [r6], r3 ++ asr r3, #1 ++ vst2.32 {d20,d22}, [r5] ++ mov r5, #4 ++ vst2.32 {d21,d23}, [r6] ++ bx lr ++ ++patch_h_up_c_4x4_10: ++ vld1.16 {q1}, [r2] ++ rsb r12, r6, #32 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ mov r5, #4 ++1: ++ adds r8, r7 ++ vmov q12, q1 ++ it mi ++ ldrmi r6, [r2, #-4]! ++ vext.16 q1, q1, q1, #6 ++ itt pl ++ asrpl r6, r8, #8 ++ ldrpl r6, [r1, r6, lsl #2] ++ vmov s4, r6 ++patch_h_up_c_4x4_10_continue: ++2: ++ vmov q8, q9 ++ subs r12, r4 ++ vmul.u16 q0, q12, q3 ++ it cc ++ addcc r12, #32 ++ vmla.u16 q0, q1, q2 ++ rsb r6, r12, #32 ++ vmov q9, q10 ++ sub r5, #1 ++ vmov q10, q11 ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vrshr.u16 q11, q0, #5 ++ bhi 2b ++ bne 1b ++ ++ bcs store_tran_c_4x4_10 ++ adds r8, r7 ++ vmov q12, q1 ++ it mi ++ ldrmi r6, [r2, #-4]! 
++ vext.16 q1, q1, q1, #6 ++ itt pl ++ asrpl r6, r8, #8 ++ ldrpl r6, [r1, r6, lsl #2] ++ vmov s4, r6 ++ b store_tran_c_4x4_10 ++ ++ ++@ ff_hevc_rpi_pred_angular_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r8, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #2 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ bl patch_h_down_c_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r8, #-128 ++ sub r8, r7 ++ bl patch_h_up_c_4x4_10 ++ pop {r4-r8, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.16 {q9}, [r1] ++ sub r1, r2, #4 ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ vdup.16 q2, r6 ++ vext.16 q8, q9, q9, #6 ++ sub r8, r7, #128 ++ vld1.32 {d16[0]}, [r1] ++ vdup.16 q3, r12 ++ mov r5, #3 ++1: ++ vmul.u16 q0, q9, q3 ++ subs r12, r4 ++ vmla.u16 q0, q8, q2 ++ ittt cc ++ asrcc r1, r8, #8 ++ addcc r12, #32 ++ addcc r1, r2, r1, lsl #2 ++ vext.16 q10, q8, q8, #6 ++ rsb r6, r12, #32 ++ vmov q11, q8 ++ sub r5, #1 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r8, r7 ++ vld1.32 {d20[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmul.u16 q0, q11, q3 ++ subs r12, r4 ++ vmla.u16 q0, q10, q2 ++ ittt cc ++ asrcc r1, r8, #8 ++ addcc r12, #32 ++ addcc r1, r2, r1, lsl #2 ++ vext.16 q8, q10, q10, #6 ++ rsb r6, r12, #32 ++ vmov q9, q10 ++ sub r5, #1 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r8, r7 ++ vld1.32 {d16[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmul.u16 q0, q11, q3 ++ vmla.u16 q0, q10, q2 ++ vrshr.u16 q0, q0, #5 ++ 
vst1.16 {q0}, [r0] ++ ++ pop {r4-r8, pc} ++4: ++ bcc 3b ++5: ++ vmul.u16 q0, q9, q3 ++ vmla.u16 q0, q8, q2 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ vld1.16 {q9}, [r1]! ++ rsb r12, r6, #32 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vext.16 q8, q9, q9, #2 ++ vld1.32 {d17[1]}, [r1]! ++ mov r5, #3 ++1: ++ vmul.u16 q0, q8, q2 ++ subs r12, r4 ++ vmla.u16 q0, q9, q3 ++ it cc ++ addcc r12, #32 ++ vext.16 q10, q8, q8, #2 ++ rsb r6, r12, #32 ++ vld1.32 {d21[1]}, [r1] ++ sub r5, #1 ++ vmov q11, q8 ++ teq r5, #0 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r1, #4 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 1b ++ beq 4f ++2: ++ vmul.u16 q0, q10, q2 ++ subs r12, r4 ++ vmla.u16 q0, q11, q3 ++ it cc ++ addcc r12, #32 ++ vext.16 q8, q10, q10, #2 ++ rsb r6, r12, #32 ++ vld1.32 {d17[1]}, [r1] ++ sub r5, #1 ++ vmov q9, q10 ++ teq r5, #0 ++ vrshr.u16 q0, q0, #5 ++ it cc ++ addcc r1, #4 ++ vdup.16 q2, r6 ++ vdup.16 q3, r12 ++ vst1.16 {q0}, [r0], r3 ++ bhi 2b ++ bne 1b ++ bcc 5f ++3: ++ vmul.u16 q0, q10, q2 ++ vmla.u16 q0, q11, q3 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r8, pc} ++4: ++ bcc 3b ++5: ++ vmul.u16 q0, q8, q2 ++ vmla.u16 q0, q9, q3 ++ vrshr.u16 q0, q0, #5 ++ vst1.16 {q0}, [r0] ++ ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r8, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #2 ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ mov r1, r2 @ save r2 - r1 unused by patch_down ++ ++ bl patch_h_down_c_4x4_10 ++ bl 
patch_h_down_c_4x4_10_continue ++ ++ add r2, r1, #4*4 @ restore r2, but 4 rows further down left ++ sub r0, #32 ++ mov r6, r4 ++ add r0, r0, r3, lsl #2 ++ ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10_continue ++ ++ pop {r4-r8, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ ldrh r7, [r7] ++ mov r8, #-128 ++ sub r8, r7 ++ ++ push {r2, r8} ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10_continue ++ pop {r2, r8} ++ ++ sub r0, #32 ++ mov r6, r4 ++ add r2, #16 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10_continue ++ ++ pop {r4-r8, pc} ++ ++@ Left of vertical - works down left ++18: ++ vld1.16 {q0-q1}, [r1] ++ sub r1, r2, #4 @ r1 (top ptr) is dead after the load above; reuse it as the left ptr so callee-saved r9 (not in the push list) is untouched ++ rsb r12, r6, #32 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ mov r5, #8 ++1: ++ vld1.32 {d17[1]}, [r1] ++ add r8, r7 ++ vmov q2, q0 ++ vmov q3, q1 ++ asr r1, r8, #8 ++ vext.16 q1, q0, q1, #6 ++ add r1, r2, r1, lsl #2 ++ vext.16 q0, q8, q0, #6 ++2: ++ vmul.u16 q11, q2, q10 ++ subs r12, r4 ++ vmla.u16 q11, q0, q9 ++ it cc ++ addcc r12, #32 ++ vmul.u16 q12, q3, q10 ++ rsb r6, r12, #32 ++ vmla.u16 q12, q1, q9 ++ sub r5, #1 ++ teq r5, #0 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ vrshr.u16 q11, q11, #5 ++ vrshr.u16 q12, q12, #5 ++ vst1.16 {q11-q12}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r8, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vld1.16 {q0-q1}, [r1]!
++ rsb r12, r6, #32 ++ vld1.32 {d16[0]}, [r5] ++ mov r5, #8 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++1: ++ vmov q2, q0 ++ add r1, #4 ++ vmov q3, q1 ++ vext.16 q0, q0, q1, #2 ++ vext.16 q1, q1, q8, #2 ++2: ++ vmul.u16 q11, q0, q9 ++ subs r12, r4 ++ vmla.u16 q11, q2, q10 ++ it cc ++ addcc r12, #32 ++ vmul.u16 q12, q1, q9 ++ rsb r6, r12, #32 ++ vmla.u16 q12, q3, q10 ++ sub r5, #1 ++ vld1.32 {d16[0]}, [r1] ++ teq r5, #0 ++ vdup.16 q9, r6 ++ vdup.16 q10, r12 ++ vrshr.u16 q11, q11, #5 ++ vrshr.u16 q12, q12, #5 ++ vst1.16 {q11-q12}, [r0], r3 ++ bhi 2b ++ bne 1b ++ ++ pop {r4-r8, pc} ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_angular_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride [r3] ++@ unsigned int mode [sp, #0] 2..34 ++ ++function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1 ++ ldr r12, [sp] ++ push {r4-r10, lr} ++ ADRT r4, angle_2 - 2 ++ ADRT r7, inv_angle - 11*2 ++ add r7, r7, r12, lsl #1 ++ lsl r3, #2 ++ vpush {d8} ++ ldrsb r6, [r4, r12] ++ cmp r12, #26 ++ ldrsb r4, [r4, r12] ++ bge 26f ++ cmp r12, #18 ++ bge 18f ++ cmp r12, #10 ++ bge 10f ++ ++@ Down of Horizontal - works down left ++ add sp, #8 ++ mov r10, #4 ++ mov r1, r2 ++1: ++ bl patch_h_down_c_4x4_10 ++ bl patch_h_down_c_4x4_10_continue ++ bl patch_h_down_c_4x4_10_continue ++ bl patch_h_down_c_4x4_10_continue ++ ++ add r2, r1, #4*4 @ restore r2, but 4 rows further down left ++ add r1, r1, #4*4 ++ mov r6, r4 ++ sub r0, #64 ++ subs r10, #1 ++ add r0, r0, r3, lsl #2 ++ bne 1b ++ ++ pop {r4-r10, pc} ++ ++@ Up of Horizontal - works down up ++10: ++ add sp, #8 ++ mov r10, #4 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ sub r8, r7 ++2: ++ push {r2, r8} ++ bl patch_h_up_c_4x4_10 ++ bl patch_h_up_c_4x4_10_continue ++ bl patch_h_up_c_4x4_10_continue ++ bl patch_h_up_c_4x4_10_continue ++ pop {r2, r8} ++ ++ sub r0, #64 ++ mov r6, r4 ++ add r2, #16 ++ sub r8, r8, r7, lsl #2 ++ add r0, r0, r3, lsl #2 ++ subs r10, #1 ++ bne 2b ++ ++ pop {r4-r10, pc} ++ ++@ Left 
of vertical - works down left ++18: ++ add r5, r1, #32 ++ vmov d0, d9 @ stash callee-saved d9 before q4 (d8:d9) is loaded ++ vld1.16 {q1-q2}, [r1] ++ rsb r12, r6, r6, lsl #16 ++ vld1.16 {q3-q4}, [r5] ++ sub r9, r2, #4 ++ rsb r4, r12, #0 ++ rsb r12, r12, #32 << 16 ++ ldrh r7, [r7] ++ mov r8, #-128 ++ vmov s2, r12 ++ add r10, r0, #32 ++ mov r5, #16 ++1: ++ vld1.32 {d1[1]}, [r9] ++ add r8, r7 ++ vmov q11, q4 ++ vmov q10, q3 ++ asr r9, r8, #8 ++ vmov q9, q2 ++ add r9, r2, r9, lsl #2 ++ vmov q8, q1 ++ vext.16 q4, q3, q4, #6 ++ vext.16 q3, q2, q3, #6 ++ vext.16 q2, q1, q2, #6 ++ vext.16 q1, q0, q1, #6 ++2: ++ vmul.u16 q12, q8, d1[1] ++ adds r12, r4 ++ vmla.u16 q12, q1, d1[0] ++ it cc ++ addcc r12, #32 << 16 ++ vmul.u16 q13, q9, d1[1] ++ it cc ++ subcc r12, #32 ++ vmla.u16 q13, q2, d1[0] ++ sub r5, #1 ++ vmul.u16 q14, q10, d1[1] ++ teq r5, #0 ++ vmla.u16 q14, q3, d1[0] ++ vmul.u16 q15, q11, d1[1] ++ vmla.u16 q15, q4, d1[0] ++ vmov s2, r12 ++ vrshr.u16 q12, q12, #5 ++ vrshr.u16 q13, q13, #5 ++ vrshr.u16 q14, q14, #5 ++ vrshr.u16 q15, q15, #5 ++ vst1.16 {q12-q13}, [r0], r3 ++ vst1.16 {q14-q15}, [r10], r3 ++ bhi 2b ++ bne 1b ++ ++ vpop {d8} ++ vmov d9, d0 @ restore d9 from its stash (d0 is not written by the loop) ++ pop {r4-r10, pc} ++ ++@ Right of vertical - works along top - left unused ++26: ++ add r5, r1, #32 ++ vmov d1, d9 @ stash callee-saved d9 before q4 (d8:d9) is loaded ++ vld1.16 {q1-q2}, [r1] ++ rsb r12, r6, r6, lsl #16 ++ vld1.16 {q3-q4}, [r5] ++ add r1, r1, #64 ++ rsb r4, r12, #0 ++ rsb r12, r12, #32 << 16 ++ vmov s1, r12 ++ add r10, r0, #32 ++ mov r5, #16 ++1: ++ vld1.32 {d0[0]}, [r1]!
++ vmov q8, q1 ++ vmov q9, q2 ++ vmov q10, q3 ++ vmov q11, q4 ++ vext.16 q1, q1, q2, #2 ++ vext.16 q2, q2, q3, #2 ++ vext.16 q3, q3, q4, #2 ++ vext.16 q4, q4, q0, #2 ++2: ++ vmul.u16 q12, q1, d0[2] ++ adds r12, r4 ++ vmla.u16 q12, q8, d0[3] ++ it cc ++ addcc r12, #32 << 16 ++ vmul.u16 q13, q2, d0[2] ++ it cc ++ subcc r12, #32 ++ vmla.u16 q13, q9, d0[3] ++ sub r5, #1 ++ vmul.u16 q14, q3, d0[2] ++ teq r5, #0 ++ vmla.u16 q14, q10, d0[3] ++ vmul.u16 q15, q4, d0[2] ++ vmla.u16 q15, q11, d0[3] ++ vmov s1, r12 ++ vrshr.u16 q12, q12, #5 ++ vrshr.u16 q13, q13, #5 ++ vrshr.u16 q14, q14, #5 ++ vrshr.u16 q15, q15, #5 ++ vst1.16 {q12-q13}, [r0], r3 ++ vst1.16 {q14-q15}, [r10], r3 ++ bhi 2b ++ bne 1b ++ ++ vpop {d8} ++ vmov d9, d1 ++ pop {r4-r10, pc} ++ ++endfunc +diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S +new file mode 100644 +index 0000000000..df8c1c25b9 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S +@@ -0,0 +1,705 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. 
++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++ ++@ ff_hevc_rpi_pred_dc_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_4_neon_8, export=1 ++ ++ @ Average the els of top & left ++ ldr r2, [r2] ++ vld1.32 {d0[0]}, [r1] ++ mov r1, #2 ++ vmov s1, r2 ++ vmov s2, r2 ++ vmov.i16 q2, #3 ++ add r2, r0, r3 ++ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0] ++ lsl r3, #1 ++ vmovl.u8 q0, d0 ++ vmov.i64 d7, #0xffff ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vmov.i64 d7, #0xff ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #3 ++ vmla.i16 q0, q2, d6[0] ++ vdup.8 d6, d6[0] ++ vrshrn.i16 d0, q0, #2 ++ ++ @ Store top line ++ vst1.32 {d0[0]}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d1, d0, #5*8 ++ vshr.u64 d2, d0, #6*8 ++ vshr.u64 d3, d0, #7*8 ++ vbif d1, d6, d7 ++ vbif d2, d6, d7 ++ vst1.32 {d1[0]}, [r2], r3 ++ vbif d3, d6, d7 ++ vst1.32 {d2[0]}, [r0] ++ vst1.32 {d3[0]}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {d0}, [r1] ++ vld1.8 {d1}, [r2] ++A add r2, r0, r3, lsl #1 ++A lsl r3, #2 ++T lsl r3, #1 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vaddl.u8 q0, d0, d1 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d2, d0, d0 @ This adds U & V separately ++ vpadd.i32 d3, d0, d0 ++ vrshrn.u16 d0, q1, #3 ++ ++ @ Store ++ vst1.8 {d0}, [r0], r3 ++ vst1.8 {d0}, [r2], r3 ++ vst1.8 {d0}, [r0] ++ vst1.8 {d0}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_8_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {d0}, [r1] ++ mov r1, #2 ++ vld1.8 {d16}, [r2] ++ vmov.i16 q2, #3 ++ vmov.i64 d7, #0xffff ++ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0] ++ vmovl.u8 q0, d0 ++ vadd.i16 d6, d2, d3 @ d6 has 4 vals ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vmov.i64 d7, #0xff ++ vmovl.u8 q1, d16 ++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #4 ++ vmla.i16 q1, q2, d6[0] ++ vmla.i16 q0, q2, d6[0] ++ vdup.8 d6, d6[0] ++ vrshrn.i16 d2, q1, #2 ++ vrshrn.i16 d0, q0, #2 ++ ++ @ Store top line ++ vst1.8 {d0}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d2, #8 ++ vbit d6, d2, d7 ++ vshr.u64 d2, #8 ++ vst1.8 {d6}, [r0], r3 ++ mov r1, #6 ++1: ++ vbit d6, d2, d7 ++ vshr.u64 d2, #8 ++ vst1.8 {d6}, [r0], r3 ++ subs r1, #2 ++ vbit d6, d2, d7 ++ vshr.u64 d2, #8 ++ vst1.8 {d6}, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q0}, [r1] ++ mov r1, #8 ++ vld1.8 {q1}, [r2] ++T lsl r3, #1 ++ vaddl.u8 q0, d0, d1 ++A add r2, r0, r3, lsl #1 ++A lsl r3, #2 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vaddl.u8 q1, d2, d3 ++ vadd.i16 q1, q0 ++ vadd.i16 d3, d2 @ d3 has 2 val pairs ++ vpadd.i32 d2, d3, d3 @ This add U & V separately ++ vpadd.i32 d3, d3, d3 ++ vrshrn.u16 d0, q1, #4 ++ vrshrn.u16 d1, q1, #4 ++ ++ @ Store ++1: ++ vst1.8 {q0}, [r0], r3 ++ subs r1, #4 ++ vst1.8 {q0}, [r2], r3 ++ vst1.8 {q0}, [r0], r3 ++ vst1.8 {q0}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_16_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q8}, [r1] ++ mov r1, #2 ++ vld1.8 {q9}, [r2] ++ vaddl.u8 q10, d16, d17 ++ vaddl.u8 q11, d16, d18 ++ vaddl.u8 q0, 
d18, d19 ++ vmov.i16 q1, #3 ++ vadd.i16 q10, q0 ++ vmovl.u8 q0, d18 ++ vadd.i16 d20, d21 ++ vmov.i16 d2[0], r1 @ 2, 3, 3, 3... ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vmovl.u8 q2, d16 ++ vmovl.u8 q9, d19 ++ vpadd.i16 d20, d20 @ 2 (top & bottom of vector the same) ++ vmov.i64 d7, #0xffff ++ vmovl.u8 q8, d17 ++ vbit d4, d22, d7 @ q2 = top[0]+left[0], top[1..7] ++ vmov.i64 d7, #0xff ++ vpadd.i16 d20, d20 @ 1 (all the same) ++ vrshr.u16 d21, d20, #5 ++ vrshr.u16 d20, d20, #5 ++ vmla.i16 q0, q10, d2[1] ++ vmla.i16 q9, q10, d2[1] ++ vmla.i16 q2, q10, q1 ++ vmla.i16 q8, q10, d2[1] ++ vdup.8 q1, d20[0] ++ vrshrn.i16 d0, q0, #2 ++ vrshrn.i16 d1, q9, #2 ++ vrshrn.i16 d4, q2, #2 ++ vrshrn.i16 d5, q8, #2 ++ vext.8 q0, q0, q0, #1 ++ ++ @ Store top line ++ vst1.8 {q2}, [r0], r3 ++ ++ @ Store the rest ++ mov r1, #15 ++1: ++ vbit d2, d0, d7 ++ vext.8 q0, q0, q0, #1 ++ subs r1, #1 ++ vst1.8 {q1}, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q0-q1}, [r1] ++ mov r1, #16 ++ vld1.8 {q2-q3}, [r2] ++T lsl r3, #1 ++ vaddl.u8 q0, d0, d1 ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vaddl.u8 q1, d2, d3 ++A lsl r3, #2 ++T lsl r3, #1 ++ vaddl.u8 q2, d4, d5 ++ vaddl.u8 q3, d6, d7 ++ vadd.i16 q0, q1 ++ vadd.i16 q2, q3 ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d4, d0, d0 @ This adds U & V separately ++ vpadd.i32 d5, d0, d0 ++ vrshrn.u16 d0, q2, #5 ++ vrshrn.u16 d1, q2, #5 ++ vrshrn.u16 d2, q2, #5 ++ vrshrn.u16 d3, q2, #5 ++ ++ @ Store ++1: ++ vst1.8 {q0-q1}, [r0], r3 ++ subs r1, #2 ++ vst1.8 {q0-q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_32_neon_8 ++@ 
uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_32_neon_8, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q0-q1}, [r1] ++ mov r1, #32 ++ vld1.8 {q2-q3}, [r2] ++ add r2, r0, r3 ++ vaddl.u8 q0, d0, d1 ++ lsl r3, #1 ++ vaddl.u8 q1, d2, d3 ++ vaddl.u8 q2, d4, d5 ++ vaddl.u8 q3, d6, d7 ++ vadd.i16 q0, q1 ++ vadd.i16 q2, q3 ++ vadd.i16 q0, q2 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ vpadd.i16 d4, d0, d0 @ 1 (all the same) ++ vpadd.i16 d5, d0, d0 ++ vrshrn.u16 d0, q2, #6 ++ vrshrn.u16 d1, q2, #6 ++ vrshrn.u16 d2, q2, #6 ++ vrshrn.u16 d3, q2, #6 ++ ++ @ Store ++1: ++ vst1.8 {q0-q1}, [r0], r3 ++ subs r1, #2 ++ vst1.8 {q0-q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ----------------------------------------------------------------------------- ++@ ++@ 10 Bit versions ++@ ++@ There is no actual bit depth dependency in this code except that our ++@ intermediate results will overflow the 16 bits they are stored in ++@ All these functions are good to 10 bits - with the worst case being ++@ in dc_32 where we use all 16 bits. ++ ++ ++@ ff_hevc_rpi_pred_dc_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_4_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {d0}, [r1] ++ mov r1, #2 ++ vld1.16 {d1}, [r2] ++T lsl r3, #1 ++ vmov.i16 q2, #3 ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0] ++A lsl r3, #2 ++T lsl r3, #1 ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vmov.i64 d7, #0xffff ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #3 ++ vmla.i16 q0, q2, d6[0] ++ vrshr.u16 q0, #2 ++ ++ @ Store top line ++ vst1.16 {d0}, [r0], r3 ++ ++ @ Store the rest ++ vshr.u64 d3, d1, #1*16 ++ vshr.u64 d4, d1, #2*16 ++ vshr.u64 d5, d1, #3*16 ++ vbif d3, d6, d7 ++ vbif d4, d6, d7 ++ vst1.16 {d3}, [r2], r3 ++ vbif d5, d6, d7 ++ vst1.16 {d4}, [r0] ++ vst1.16 {d5}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.8 {q0}, [r1] ++ vld1.8 {q1}, [r2] ++A add r2, r0, r3, lsl #2 ++A lsl r3, #3 ++T lsl r3, #2 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vadd.i16 q0, q1 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d2, d0, d0 @ This adds U & V separately ++ vpadd.i32 d3, d0, d0 ++ vrshr.u16 q0, q1, #3 ++ ++ vst1.16 {q0}, [r0], r3 ++ vst1.16 {q0}, [r2], r3 ++ vst1.16 {q0}, [r0] ++ vst1.16 {q0}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_8_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {q0}, [r1] ++ mov r1, #2 ++ vld1.16 {q8}, [r2] ++T lsl r3, #1 ++ vmov.i16 q2, #3 ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0] ++A lsl r3, #2 ++T lsl r3, #1 ++ vmov.i64 d7, #0xffff ++ vmov.16 d4[0], r1 @ 2, 3, 3, 3... 
++ vadd.i16 d6, d2, d3 @ d6 has 4 vals ++ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7] ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ top_line[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d6, d6 @ 1 (all the same) ++ vrshr.u16 d6, #4 ++ vmla.i16 q8, q2, d6[0] ++ vmla.i16 q0, q2, d6[0] ++ vdup.16 q2, d6[0] ++ vdup.16 q9, d6[0] ++ vrshr.u16 q8, q8, #2 ++ vrshr.u16 q0, q0, #2 ++ vext.16 q1, q8, q8, #1 ++ ++ @ Store top line ++ vst1.16 {q0}, [r0], r3 ++ ++ @ Store the rest ++ vbit d18, d2, d7 ++ vst1.16 {q9}, [r2], r3 ++ mov r1, #6 ++1: ++ vext.16 q8, q8, q8, #2 ++ subs r1, #2 ++ vext.16 q1, q1, q1, #2 ++ vbit d4, d16, d7 ++ vst1.16 {q2}, [r0], r3 ++ vbit d18, d2, d7 ++ vst1.16 {q9}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {q0-q1}, [r1] ++ mov r1, #8 ++ vld1.16 {q2-q3}, [r2] ++T lsl r3, #2 ++ vadd.i16 q1, q0 ++A add r2, r0, r3, lsl #2 ++A lsl r3, #3 ++T add r2, r0, r3 ++T lsl r3, #1 ++ vadd.i16 q2, q3 ++ vadd.i16 q1, q2 ++ vadd.i16 d3, d2 @ d3 has 2 val pairs ++ vpadd.i32 d2, d3, d3 @ This add U & V separately ++ vpadd.i32 d3, d3, d3 ++ vrshr.u16 q0, q1, #4 ++ vrshr.u16 q1, q1, #4 ++ ++ @ Store ++1: ++ vst1.8 {q0-q1}, [r0], r3 ++ subs r1, #2 ++ vst1.8 {q0-q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_dc_16_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vld1.16 {q8-q9}, [r1] ++ mov r1, #2 ++ vld1.16 {q10-q11}, [r2] ++ lsl r3, #1 @ stride given in pels ++ vadd.i16 q0, 
q8, q9 ++ vadd.i16 q1, q10, q11 ++ vmov.i16 q3, #3 ++ vadd.i16 q1, q0 ++ vadd.i16 d0, d16, d20 ++ vmov.i64 d31, #0xffff ++ vadd.i16 d3, d2 ++ vmov.16 d6[0], r1 @ 2, 3, 3, 3... ++ ++ @ top line gets some smoothing ++ @ (top[i] + 3*dc + 2) >> 2 ++ @ as does left ++ @ topline[0] is extra special ++ @ (top[0] + left[0] + 2*dc + 2) >> 2 ++ ++ vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7] ++ vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same) ++ vpadd.i16 d3, d3 @ 1 (all the same) ++ vrshr.u16 d2, d3, #5 ++ vrshr.u16 d3, d3, #5 ++ vmov q0, q1 ++ vmla.i16 q10, q1, d6[1] ++ vmla.i16 q11, q1, d6[1] ++ vmla.i16 q8, q1, q3 ++ vmla.i16 q9, q1, d6[1] ++ vrshr.u16 q2, q10, #2 ++ vrshr.u16 q3, q11, #2 ++ vrshr.u16 q8, #2 ++ vrshr.u16 q9, #2 ++ vext.16 q2, q2, q2, #1 ++ mov r1, #7<<29 ++ ++ @ Store top line ++ vst1.16 {q8-q9}, [r0], r3 ++ ++ @ Store the rest ++1: ++ vbit d0, d4, d31 ++ vext.16 q2, q2, q2, #1 ++ subs r1, #1<<29 ++ vst1.16 {q0-q1}, [r0], r3 ++ bne 1b ++1: ++ vbit d0, d6, d31 ++ vext.16 q3, q3, q3, #1 ++ subs r1, #1<<29 ++ vst1.16 {q0-q1}, [r0], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels - needs * 4) ++ ++function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1 ++ ++ @ Average the els of top & left ++ vldm r1, {q0-q3} ++ vldm r2, {q8-q11} ++ vadd.i16 q0, q1 ++ mov r1, #16 ++ vadd.i16 q2, q3 ++ add r2, r0, #32 ++ vadd.i16 q8, q9 ++ lsl r3, #2 ++ vadd.i16 q10, q11 ++ vadd.u16 q0, q2 ++ vadd.u16 q8, q10 ++ vadd.i16 q0, q8 ++ vadd.i16 d0, d1 @ d0 has 2 val pairs ++ vpadd.i32 d4, d0, d0 @ This adds U & V separately ++ vpadd.i32 d5, d0, d0 ++ vrshr.u16 q0, q2, #5 ++ vrshr.u16 q1, q2, #5 ++ ++ @ Store ++1: ++ vst1.16 {q0-q1}, [r0], r3 ++ subs r1, #1 ++ vst1.16 {q0-q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_dc_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const 
uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] (In pels) ++ ++function ff_hevc_rpi_pred_dc_32_neon_10, export=1 ++ ++ @ Average the els of top & left ++ @ With 10 bits we are (just) safe from overflow in i16 ++ vldm r1, {q0-q3} ++ vldm r2, {q8-q11} ++ vadd.i16 q0, q1 ++ mov r1, #32 ++ vadd.i16 q2, q3 ++ add r2, r0, #32 ++ vadd.i16 q8, q9 ++ lsl r3, #1 ++ vadd.i16 q10, q11 ++ vadd.u16 q0, q2 ++ vadd.u16 q8, q10 ++ vadd.i16 q0, q8 ++ vadd.i16 d0, d1 @ d0 has 4 vals ++ vpadd.i16 d0, d0 @ 2 (top & bottom the same) ++ vpadd.i16 d4, d0, d0 @ 1 (all the same) ++ vpadd.i16 d5, d0, d0 ++ vrshr.u16 q0, q2, #6 ++ vrshr.u16 q1, q2, #6 ++ ++ @ Store ++1: ++ vst1.16 {q0-q1}, [r0], r3 ++ subs r1, #1 ++ vst1.16 {q0-q1}, [r2], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S +new file mode 100644 +index 0000000000..f6969d3591 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S +@@ -0,0 +1,881 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. 
++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ All functions have the call ++@ ++@ int ff_hevc_rpi_intra_filter_N_neon_PW( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++@ ++@ Assumptions: ++@ (that wouldn't apply to all frame layouts but do apply to sand, so beware ++@ if reusing this code) ++@ ++@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for ++@ N==4, but do for chroma N>=8. As we share Y/C fns that means we can ignore ++@ N==8,PW=8 (chroma always PW>8) but have to cope for larger ++@ ++@ We always have at least 64 pixel H frame width rounding - this lets us ++@ load UR without having to worry about exactly how many pixels are actually ++@ within the frame. As partial loads will only occur very occasionally this ++@ should be a win in nearly all cases. 
++@ ++@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters ++@ so we do no maths on the contents ++@ ++@ No filtering in 32bit fns as they are chroma only ++ ++ ++.equ AVAIL_UR, 1 ++.equ AVAIL_U, 2 ++.equ AVAIL_UL, 4 ++.equ AVAIL_L, 8 ++.equ AVAIL_DL, 16 ++ ++.equ FILTER_LIGHT, 0x40 ++.equ FILTER_STRONG, 0x80 ++ ++.equ AVAIL_S_UR_N_U_C, 32 - 1 ++.equ AVAIL_S_U_N_UL_C, 32 - 2 ++.equ AVAIL_S_UL_N_L_C, 32 - 3 ++.equ AVAIL_S_L_N_DL_C, 32 - 4 ++ ++.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr ++ ++@ On entry ++@ r2 req ++@ r3 avail ++@ [sp, #sp_offset...] args ++@ ++@ On Exit: ++@ ++@ Extend values: ++@ d_l scalar contains value for L & DL ++@ if DL avail then this is DL[0] so we don't need to load that ++@ d_ul scalar containing value for UL ++@ d_u scalar containing value for U ++@ d_ur scalar containing value for UR ++@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else... ++@ This means that L-light-filter works even if nreq DL (we never filter ++@ req-DL without req-L, but we do filter req-L without req-DL) ++@ If UR avail then d_ur == a_ur so U-filter good too ++@ ++@ Data load pointers (only load if req & avail): ++@ r4 DL + stride ++@ r10 L ++@ r6 U ++@ r5 UR ++@ ++@ Others: ++@ r2 req ++@ r7 req & avail ++@ r3 L + stride ++@ r8 DL + stride * 2 ++@ r9 stride * 2 ++@ cs Load U ++@ mi Load UR ++@ ++@ Clobbered: ++@ r12 ++ ++.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur ++ ++.equ src_l\@, \sp_offset + 0 ++.equ src_u\@, \sp_offset + 4 ++.equ src_ur\@, \sp_offset + 8 ++.equ stride\@, \sp_offset + 12 ++.equ pw\@, (1 << \pw_s) @ pel width in bytes ++.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes ++ ++@ r9 stride ++@ r7 = ab_ul, r6 = a_u, r5 = a_ur ++@ r4 = b_dl, r10 = b_l, r8 = b_u ++ ++ ldr r5, [sp, #src_ur\@] ++ lsl r12, r3, #AVAIL_S_U_DL_CPSR ++ ldr r10, [sp, #src_l\@] ++ ldr r9, [sp, #stride\@] ++ ldr r6, [sp, #src_u\@] ++ ++ @ This is quite a slow instruction but it 
replaces ++ @ a decent number of tests that yield a max of 2 flags/op ++ @ It is annoying we can't branch on Q! ++ @ If L navail (ne) then DL must be navail (pl) ++ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur ++ ++ mov r4, r5 ++ sub r7, r10, r9 ++ it vs ++ movvs r4, r6 ++ add r8, r6, #b_size\@ - pw\@ ++ it cs ++ movcs r4, r7 ++ ite ne ++ movne r10, r4 ++ addeq r4, r7, r9, lsl #\log2_s ++ it cc ++ movcc r7, r10 ++ it mi ++ addmi r4, r10, r9, lsl #\log2_s ++ vld1.\d_type {\d_ul}, [r7] ++ itt vc ++ movvc r8, r7 ++ movvc r6, r7 ++ vld1.\d_type {\d_l }, [r4], r9 ++ tst r3, #AVAIL_UR ++ vld1.\d_type {\d_u }, [r6] ++ it eq ++ moveq r5, r8 ++ and r7, r2, r3 ++ add r8, r4, r9 ++ vld1.\d_type {\d_ur}, [r5] ++ lsls r12, r7, #AVAIL_S_UR_N_U_C ++ add r3, r10, r9 ++ lsl r9, #1 ++.endm ++ ++ ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_8( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 0 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_8, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[] ++ ++ it cs ++ vldrcs s2, [r6] ++ ite pl ++ vmovpl s3, s4 ++ vldrmi s3, [r5] ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ add r12, r0, #-pw ++ bpl 1f ++ ++ vld1.8 {d0[0]}, [r10], r9 ++ vld1.8 {d0[1]}, [r3], r9 ++ vld1.8 {d0[2]}, [r10] ++ vld1.8 {d0[3]}, [r3] ++1: ++ bcc 1f ++ vld1.8 {d0[5]}, [r4], r9 ++ vld1.8 {d0[6]}, [r8] ++ vld1.8 {d0[7]}, [r4] ++1: ++ vstr d1, [r1] @ Up ++ vst1.8 {d31[7]}, [r12] ++ vstr d0, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_16( ++@ pixel * const left, [r0] ++@ pixel * 
const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_16, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[] ++ ++ it cs ++ vldrcs d2, [r6] ++ it mi ++ vldrmi d3, [r5] ++ lsls r7, #AVAIL_S_L_N_DL_C ++ add r12, r0, #-pw ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10] ++ vld1.16 {d0[3]}, [r3] ++1: ++ bcc 1f ++ vld1.16 {d1[1]}, [r4], r9 ++ vld1.16 {d1[2]}, [r8] ++ vld1.16 {d1[3]}, [r4] ++1: ++ vst1.16 {q1}, [r1] @ Up ++ vst1.16 {d31[3]}, [r12] ++ vst1.16 {q0}, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_8( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 0 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++ ++function ff_hevc_rpi_intra_filter_8_neon_8, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[] ++ ++ it cs ++ vldrcs d4, [r6] ++ it mi ++ vldrmi d5, [r5] ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ bpl 1f ++ vld1.8 {d0[0]}, [r10], r9 ++ vld1.8 {d0[1]}, [r3], r9 ++ vld1.8 {d0[2]}, [r10], r9 ++ vld1.8 {d0[3]}, [r3], r9 ++ vld1.8 {d0[4]}, [r10], r9 ++ vld1.8 {d0[5]}, [r3], r9 ++ vld1.8 {d0[6]}, [r10] 
++ vld1.8 {d0[7]}, [r3] ++1: ++ bcc 1f ++ vld1.8 {d1[1]}, [r4], r9 ++ vld1.8 {d1[2]}, [r8], r9 ++ vld1.8 {d1[3]}, [r4], r9 ++ vld1.8 {d1[4]}, [r8], r9 ++ vld1.8 {d1[5]}, [r4], r9 ++ vld1.8 {d1[6]}, [r8] ++ vld1.8 {d1[7]}, [r4] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ @ Luma light filter ++ vext.8 q8, q15, q2, #15 ++ vext.8 q12, q15, q0, #15 ++ vaddl.u8 q9, d17, d5 ++ vaddl.u8 q8, d16, d4 ++ vaddl.u8 q13, d25, d1 ++ vaddl.u8 q12, d24, d0 ++ vmov.u8 r3, d5[7] @ Save final pel ++ vmov.u8 r2, d1[7] @ Save final pel ++ ++ vext.16 q2, q8, q9, #1 ++ vext.16 q3, q9, q9, #1 ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q13, #1 ++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q2, q8 ++ vadd.u16 q3, q9 ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ ++ vrshrn.u16 d4, q2, #2 ++ vrshrn.u16 d5, q3, #2 ++ vrshrn.u16 d0, q0, #2 ++ vrshrn.u16 d1, q1, #2 ++ vrshr.u16 d30, #2 ++ vmov.u8 d5[7], r3 @ Restore final pel ++ vmov.u8 d1[7], r2 @ Restore final pel ++ vdup.u8 d31, d30[0] @ d31[3] = d30[0] ++ ++10: ++ vst1.8 {q2 }, [r1] @ Up ++ vst1.8 {d31[7]}, [r12] @ Up-left ++ vst1.8 {q0 }, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_16( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_8_neon_16, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]" ++ ++ it cs ++ vldmcs r6, {d4, d5} ++ ldr 
r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #4 ++ vldm r5, {d6, d7} ++ bgt 1f ++ vdup.16 d7, d6[3] ++1: ++ lsls r12, r7, #AVAIL_S_L_N_DL_C ++ vdup.16 q1, d0[0] ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10], r9 ++ vld1.16 {d0[3]}, [r3], r9 ++ vld1.16 {d1[0]}, [r10], r9 ++ vld1.16 {d1[1]}, [r3], r9 ++ vld1.16 {d1[2]}, [r10] ++ vld1.16 {d1[3]}, [r3] ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ vld1.16 {d2[1]}, [r4], r9 ++ cmp r12, #p_size ++ vld1.16 {d2[2]}, [r8], r9 ++ vld1.16 {d2[3]}, [r4], r9 ++ blt 2f ++ vld1.16 {d3[0]}, [r8], r9 ++ vld1.16 {d3[1]}, [r4], r9 ++ vld1.16 {d3[2]}, [r8] ++ vld1.16 {d3[3]}, [r4] ++ b 1f ++2: ++ vdup.16 d3, d2[3] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ @ Luma light filter ++ vext.16 q9, q2, q3, #7 ++ vext.16 q8, q15, q2, #7 ++ vext.16 q13, q0, q1, #7 ++ vext.16 q12, q15, q0, #7 ++ vadd.u16 q9, q3 ++ vadd.u16 q8, q2 ++ vadd.u16 q13, q1 ++ vadd.u16 q12, q0 ++ vmov.u16 r3, d7[3] @ Save final pel ++ vmov.u16 r2, d3[3] @ Save final pel ++ ++ vext.16 q2, q8, q9, #1 ++ vext.16 q3, q9, q9, #1 ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q13, #1 ++ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q2, q8 ++ vadd.u16 q3, q9 ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ ++ vrshr.u16 q2, #2 ++ vrshr.u16 q3, #2 ++ vrshr.u16 q0, #2 ++ vrshr.u16 q1, #2 ++ vrshr.u16 d30, #2 ++ vmov.u16 d7[3], r3 @ Restore final pel ++ vmov.u16 d3[3], r2 @ Restore final pel ++ vdup.u16 d31, d30[0] @ d31[3] = d30[0] ++ ++10: ++ vst1.16 {q2, q3}, [r1] @ Up ++ vst1.16 {d31[3]}, [r12] @ Up-left ++ vst1.16 {q0, q1}, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++@ int ff_hevc_rpi_intra_filter_16_neon_16( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] 
(pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 1 ++.set pw, (1 << pw_s) ++.set log2_s, 4 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_16_neon_16, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]" ++ ++ vdup.16 q9, d16[0] ++ vdup.16 q11, d20[0] ++ ++ it cs ++ vldmcs r6, {d16-d19} ++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #12 ++ @ Given chroma frame layout, if UR exists then it is always legit to ++ @ load all of it even if most of it is outside the frame. ++ vldm r5, {d20-d23} ++ bgt 1f ++ bge 4f ++ cmp r12, #8 ++ bge 3f ++ vdup.16 d21, d20[3] ++3: vdup.16 d22, d21[3] ++4: vdup.16 d23, d22[3] ++ ++1: ++ lsls r7, #AVAIL_S_L_N_DL_C ++ ldr r12, [sp, #dl_size] ++ vdup.16 q1, d0[0] ++ vdup.16 q2, d0[0] ++ vdup.16 q3, d0[0] ++ bpl 1f ++ vld1.16 {d0[0]}, [r10], r9 ++ vld1.16 {d0[1]}, [r3], r9 ++ vld1.16 {d0[2]}, [r10], r9 ++ vld1.16 {d0[3]}, [r3], r9 ++ vld1.16 {d1[0]}, [r10], r9 ++ vld1.16 {d1[1]}, [r3], r9 ++ vld1.16 {d1[2]}, [r10], r9 ++ vld1.16 {d1[3]}, [r3], r9 ++ vld1.16 {d2[0]}, [r10], r9 ++ vld1.16 {d2[1]}, [r3], r9 ++ vld1.16 {d2[2]}, [r10], r9 ++ vld1.16 {d2[3]}, [r3], r9 ++ vld1.16 {d3[0]}, [r10], r9 ++ vld1.16 {d3[1]}, [r3], r9 ++ vld1.16 {d3[2]}, [r10] ++ vld1.16 {d3[3]}, [r3] ++1: ++ bcc 1f ++ vld1.16 {d4[1]}, [r4], r9 ++ cmp r12, #4 ++ vld1.16 {d4[2]}, [r8], r9 ++ vld1.16 {d4[3]}, [r4], r9 ++ ble 2f ++ vld1.16 {d5[0]}, [r8], r9 ++ vld1.16 {d5[1]}, [r4], r9 ++ cmp r12, #12 ++ vld1.16 {d5[2]}, [r8], r9 ++ vld1.16 {d5[3]}, [r4], r9 ++ blt 3f ++ vld1.16 {d6[0]}, [r8], r9 ++ vld1.16 {d6[1]}, [r4], r9 ++ vld1.16 {d6[2]}, [r8], r9 ++ vld1.16 {d6[3]}, [r4], r9 ++ ble 4f ++ vld1.16 {d7[0]}, [r8], r9 ++ vld1.16 {d7[1]}, [r4], r9 ++ vld1.16 {d7[2]}, [r8] ++ vld1.16 {d7[3]}, [r4] ++ b 1f ++2: vdup.16 d5, 
d4[3] ++3: vdup.16 d6, d5[3] ++4: vdup.16 d7, d6[3] ++1: ++ tst r2, #FILTER_LIGHT ++ add r12, r0, #-pw ++ beq 10f ++ ++ vpush {q5} ++ @ Luma light filter ++ @ Left ++ vext.16 q5, q2, q3, #7 ++ vext.16 q14, q1, q2, #7 ++ vext.16 q13, q0, q1, #7 ++ vext.16 q12, q15, q0, #7 ++ ++ vadd.u16 q5, q3 ++ vadd.u16 q14, q2 ++ vadd.u16 q13, q1 ++ vadd.u16 q12, q0 ++ vmov.u16 r2, d7[3] @ Save final pel ++ ++ vext.16 q0, q12, q13, #1 ++ vext.16 q1, q13, q14, #1 ++ vext.16 q2, q14, q5, #1 ++ vext.16 q3, q5, q5, #1 ++ ++ vmov d30, d24 @ d30[0] = l[0] + ul ++ vadd.u16 q0, q12 ++ vadd.u16 q1, q13 ++ vadd.u16 q2, q14 ++ vadd.u16 q3, q5 ++ ++ vrshr.u16 q0, #2 ++ vrshr.u16 q1, #2 ++ vrshr.u16 q2, #2 ++ vrshr.u16 q3, #2 ++ ++ @ Up ++ vext.16 q5, q10, q11, #7 ++ vext.16 q14, q9, q10, #7 ++ vext.16 q13, q8, q9, #7 ++ vext.16 q12, q15, q8, #7 ++ ++ vadd.u16 q5, q11 ++ vadd.u16 q14, q10 ++ vadd.u16 q13, q9 ++ vadd.u16 q12, q8 ++ vmov.u16 r3, d23[3] @ Save final pel ++ ++ vext.16 q8, q12, q13, #1 ++ vext.16 q9, q13, q14, #1 ++ vext.16 q10, q14, q5, #1 ++ vext.16 q11, q5, q5, #1 ++ ++ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0] ++ vadd.u16 q8, q12 ++ vadd.u16 q9, q13 ++ vadd.u16 q10, q14 ++ vadd.u16 q11, q5 ++ ++ vrshr.u16 q8, #2 ++ vrshr.u16 q9, #2 ++ vrshr.u16 q10, #2 ++ vrshr.u16 q11, #2 ++ ++ @ Misc ++ vrshr.u16 d30, #2 ++ vmov.u16 d7[3], r2 @ Restore final pel ++ vmov.u16 d23[3], r3 @ Restore final pel ++ vdup.u16 d31, d30[0] @ d31[3] = d30[0] ++ vpop {q5} ++ ++10: ++ vstm r1, {d16-d23} @ Up ++ vst1.16 {d31[3]}, [r12] @ Up-left ++ vstm r0, { d0-d7 } @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++@ int ff_hevc_rpi_intra_filter_4_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const 
unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 2 ++ ++function ff_hevc_rpi_intra_filter_4_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]" ++ ++ it cs ++ vldmcs r6, {d4, d5} ++ it mi ++ vldmmi r5, {d6, d7} ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, d0[0] ++ add r12, r0, #-pw ++ bpl 1f ++ vld1.32 {d0[0]}, [r10], r9 ++ vld1.32 {d0[1]}, [r3], r9 ++ vld1.32 {d1[0]}, [r10] ++ vld1.32 {d1[1]}, [r3] ++1: ++ bcc 1f ++ vld1.32 {d2[1]}, [r4], r9 ++ vld1.32 {d3[0]}, [r8] ++ vld1.32 {d3[1]}, [r4] ++1: ++ vst1.32 {q2, q3 }, [r1] @ Up ++ vst1.32 {d31[1]}, [r12] ++ vst1.32 {q0, q1 }, [r0] @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++ ++@ int ff_hevc_rpi_intra_filter_8_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 3 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_8_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]" ++ ++ vdup.32 q9, d16[0] ++ vdup.32 q11, d20[0] ++ ++ it cs ++ vldmcs r6, {q8, q9 } ++ ldr r12, [sp, #ur_size] ++ bpl 1f ++ cmp r12, #p_size ++ vldm r5, {q10, q11} ++ bge 1f ++ vdup.32 q11, d21[1] ++1: ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q1, d0[0] ++ vdup.32 q2, d0[0] ++ vdup.32 q3, d0[0] ++ bpl 1f ++ vld1.32 {d0[0]}, [r10], r9 ++ vld1.32 {d0[1]}, [r3], r9 ++ vld1.32 {d1[0]}, [r10], r9 ++ vld1.32 {d1[1]}, [r3], r9 ++ vld1.32 
{d2[0]}, [r10], r9 ++ vld1.32 {d2[1]}, [r3], r9 ++ vld1.32 {d3[0]}, [r10] ++ vld1.32 {d3[1]}, [r3] ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ vld1.32 {d4[1]}, [r4], r9 ++ cmp r12, #p_size ++ vld1.32 {d5[0]}, [r8], r9 ++ vld1.32 {d5[1]}, [r4], r9 ++ blt 2f ++ vld1.32 {d6[0]}, [r8], r9 ++ vld1.32 {d6[1]}, [r4], r9 ++ vld1.32 {d7[0]}, [r8] ++ vld1.32 {d7[1]}, [r4] ++ b 1f ++2: ++ vdup.32 q3, d5[1] ++1: ++ add r12, r0, #-pw ++ vstm r1, { q8-q11} @ Up ++ vst1.32 {d31[1]}, [r12] ++ vstm r0, { q0-q3 } @ Left ++ pop {r4-r10, pc} ++endfunc ++ ++@ Gathers the available up / up-right / left / down-left neighbours (32-bit pels) into top[] and left[]; short UR/DL runs are padded with their last valid pel ++@ int ff_hevc_rpi_intra_filter_16_neon_32( ++@ pixel * const left, [r0] ++@ pixel * const top, [r1] ++@ const unsigned int req, [r2] ++@ const unsigned int avail, [r3] ++@ const pixel * const src_l, [sp, #0] ++@ const pixel * const src_u, [sp, #4] ++@ const pixel * const src_ur, [sp, #8] ++@ const unsigned int stride, [sp, #12] (pels) ++@ const unsigned int top_right_size, [sp, #16] ++@ const unsigned int down_left_size) [sp, #20] ++ ++.set sp_base, 8*4 ++.set ur_size, sp_base + 16 ++.set dl_size, sp_base + 20 ++.set pw_s, 2 ++.set pw, (1 << pw_s) ++.set log2_s, 4 ++.set p_size, (1 << log2_s) @ size in pels ++ ++function ff_hevc_rpi_intra_filter_16_neon_32, export=1 ++ push {r4-r10, lr} ++ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1] ++ ++ @ Once we get this big we have run out of neon regs to store ++ @ everything at once so do in pieces ++ ++ @ Up (have) ++ it cs ++ vldmcs r6, { q0-q3 } ++ ldr r12, [sp, #ur_size] ++ it mi ++ vldmmi r5, { q8-q11} ++ it cs ++ vstmcs r1, { q0-q3 } ++ bpl 1f ++ cmp r12, #12 ++ add lr, r1, #(pw << log2_s) ++ bgt 2f @ ur_size 16: q8-q11 are all real data ++ beq 3f @ ur_size 12: only q11 needs padding ++ cmp r12, #8 ++ bge 4f @ ur_size 8: pad q10-q11 ++ vdup.32 q9, d17[1] @ ur_size 4: pad q9-q11; pels are 32 bits wide so replicate .32 lanes ++4: vdup.32 q10, d19[1] @ q10, not d10: d8-d15 are callee-saved and are not pushed here ++3: vdup.32 q11, d21[1] ++2: vstm lr, { q8-q11} ++1: ++ @ Left (have) ++ add lr, r0, #-pw ++ lsls r12, r7, #AVAIL_S_L_N_DL_C ++ vst1.32 {d30[1]}, [lr] @ UL ++ bpl 1f ++ vld1.32 { d0[0]}, [r10], r9 ++ vld1.32 { d0[1]}, [r3], r9 ++ vld1.32 { d1[0]}, [r10], r9 ++ vld1.32 { d1[1]}, [r3], r9 
++ vld1.32 { d2[0]}, [r10], r9 ++ vld1.32 { d2[1]}, [r3], r9 ++ vld1.32 { d3[0]}, [r10], r9 ++ vld1.32 { d3[1]}, [r3], r9 ++ vld1.32 { d4[0]}, [r10], r9 ++ vld1.32 { d4[1]}, [r3], r9 ++ vld1.32 { d5[0]}, [r10], r9 ++ vld1.32 { d5[1]}, [r3], r9 ++ vld1.32 { d6[0]}, [r10], r9 ++ vld1.32 { d6[1]}, [r3], r9 ++ vld1.32 { d7[0]}, [r10] ++ vld1.32 { d7[1]}, [r3] ++ vstm r0, { q0-q3 } ++1: ++ bcc 1f ++ ldr r12, [sp, #dl_size] ++ vdup.32 d16, d30[0] @ d16[0] = d30[0] ++ add lr, r0, #(pw << log2_s) ++ vld1.32 {d16[1]}, [r4], r9 ++ cmp r12, #4 ++ vld1.32 {d17[0]}, [r8], r9 ++ vld1.32 {d17[1]}, [r4], r9 ++ ble 2f ++ vld1.32 {d18[0]}, [r8], r9 ++ vld1.32 {d18[1]}, [r4], r9 ++ cmp r12, #12 ++ vld1.32 {d19[0]}, [r8], r9 ++ vld1.32 {d19[1]}, [r4], r9 ++ blt 3f ++ vld1.32 {d20[0]}, [r8], r9 ++ vld1.32 {d20[1]}, [r4], r9 ++ vld1.32 {d21[0]}, [r8], r9 ++ vld1.32 {d21[1]}, [r4], r9 ++ ble 4f ++ vld1.32 {d22[0]}, [r8], r9 ++ vld1.32 {d22[1]}, [r4], r9 ++ vld1.32 {d23[0]}, [r8] ++ vld1.32 {d23[1]}, [r4] ++ b 5f ++2: vdup.32 q9, d17[1] ++3: vdup.32 q10, d19[1] ++4: vdup.32 q11, d21[1] ++5: vstm lr, { q8-q11} ++1: ++ eors r7, r2 ++ beq 99f ++ ++ lsls r12, r7, #AVAIL_S_UR_N_U_C ++ vdup.32 q0, d31[0] ++ vdup.32 q1, d31[0] ++ vdup.32 q2, d31[0] ++ vdup.32 q3, d31[0] ++ add lr, r1, #(pw << log2_s) ++ vdup.32 q8, d31[1] ++ vdup.32 q9, d31[1] ++ vdup.32 q10, d31[1] ++ vdup.32 q11, d31[1] ++ it cs ++ vstmcs r1, { q0-q3 } ++ it mi ++ vstmmi lr, { q8-q11} ++ ++ lsls r7, #AVAIL_S_L_N_DL_C ++ vdup.32 q0, d30[0] ++ vdup.32 q1, d30[0] ++ vdup.32 q2, d30[0] ++ vdup.32 q3, d30[0] ++ add lr, r0, #(pw << log2_s) ++ it mi ++ vstmmi r0, { q0-q3 } ++ it cs ++ vstmcs lr, { q0-q3 } ++ ++99: ++ pop {r4-r10, pc} ++endfunc ++ ++ ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S +new file mode 100644 +index 0000000000..56819ae439 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S +@@ -0,0 +1,920 @@ ++/* ++Copyright (c) 2018 Raspberry Pi 
(Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++/* ++ * Horizontal & Vertical special cases of angular intra pred ++ * ++ * Split out because: ++ * Vertical, at least, is relatively common ++ * Much simpler code than the general angular case ++ * Luma with size < 32 has extra filtering that doesn't happen anywhere else ++ * ++ * *** Currently luma filtering is mandatory where it occurs, but there are ++ * cases where it should be turned off (rdpcm & an extension sps flag). 
++ * These don't occur in the standard conformance suite for Main Profile ++ */ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ ff_hevc_rpi_pred_vertical_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_4_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.32 {d0[0]}, [r2 :32] @ Left ++ add r2, r0, r3 ++ vld1.8 {d1[]}, [r1] ++ lsl r3, #1 ++ vdup.8 d4, ip ++ vmov.i8 d2, #128 ++ vhsub.u8 d4, d0, d4 ++ veor d1, d2 ++ vld1.32 {d0[0]}, [r1 :32] @ Top ++ vqadd.s8 d1, d4 ++ vmov.i64 d3, #0xff ++ vmov d4, d0 ++ veor d5, d1, d2 ++ veor d1, d1, d2 ++ vbit d0, d1, d3 ++ vshr.u64 d5, #8 ++ vst1.32 {d0[0]}, [r0], r3 ++ vshr.u64 d1, #16 ++ vbit d4, d5, d3 ++ vshr.u64 d5, #16 ++ vst1.32 {d4[0]}, [r2], r3 ++ vbit d0, d1, d3 ++ vst1.32 {d0[0]}, [r0] ++ vbit d4, d5, d3 ++ vst1.32 {d4[0]}, [r2] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_8_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.8 {d0}, [r2 :64] @ Left ++ vmov.i8 d1, #128 ++ vld1.8 {d2[]}, [r1] ++ vld1.8 {d3}, [r1 :64] @ Top ++ vdup.8 d4, ip ++ vhsub.u8 d4, d0, d4 ++ veor d2, d1 ++ vmov.i64 d0, #0xff ++ mov r1, #8 ++ vqadd.s8 d2, d4, d2 ++ veor d1, d2, d1 ++1: ++ vbit d3, d1, d0 ++ vshr.u64 d1, #8 ++ vst1.8 {d3}, [r0 :64], r3 ++ subs r1, #2 ++ vbit d3, d1, d0 ++ vshr.u64 d1, #8 ++ vst1.8 {d3}, [r0 :64], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_16_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.8 {q0}, [r2 :128] @ Left ++ vdup.8 q1, ip ++ vld1.8 {d4[],d5[]}, [r1] ++ vhsub.u8 q0, q1 ++ vmov.i8 q1, #128 ++ veor q2, q1 
++ vmov.i64 d16, #0xff ++ vqadd.s8 q0, q2 ++ vld1.8 {q3}, [r1 :128] @ Top ++ mov r1, #16 ++ veor q0, q1 ++ vmov q1, q3 ++ vext.8 q2, q0, q0, #1 ++1: ++ vbit d2, d0, d16 ++ vbit d6, d4, d16 ++ vext.8 q0, q0, q0, #2 ++ subs r1, #2 ++ vst1.8 {q1}, [r0 :128], r3 ++ vext.8 q2, q2, q2, #2 ++ vst1.8 {q3}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_32_neon_8, export=1 ++ vld1.8 {q0, q1 }, [r1 :128] @ Up ++ add r2, r0, r3 ++ lsl r3, #1 ++ mov r1, #16 ++1: ++ vst1.8 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.8 {q0, q1 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1 ++ vld1.16 {d0 }, [r1 :64] @ Up ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ ++ vst1.16 {d0 }, [r0 :64], r3 ++ vst1.16 {d0 }, [r2 :64], r3 ++ vst1.16 {d0 }, [r0 :64] ++ vst1.16 {d0 }, [r2 :64] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1 ++ vld1.16 {q0 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ mov r1, #4 ++1: ++ vst1.16 {q0 }, [r0 :128], r3 ++ subs r1, #2 ++ vst1.16 {q0 }, [r2 :128], r3 ++ vst1.16 {q0 }, [r0 :128], r3 ++ vst1.16 {q0 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1 ++ vld1.16 {q0, q1 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #1 ++ lsl r3, #2 ++ mov r1, #8 
++1: ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++@ ? Might be faster as simple arm ++ ++function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.32 {d0[0]}, [r1 :32] @ Top ++ add r1, r2, #3 ++ vld1.8 {d1[]}, [r2]! ++ vdup.8 d2, ip ++ vmov.i8 d3, #128 ++ vhsub.u8 d0, d2 ++ veor d1, d3 ++ vld1.8 {d2[]}, [r2]! ++ add ip, r0, r3 ++ vqadd.s8 d0, d0, d1 ++ lsl r3, #1 ++ vld1.8 {d1[]}, [r2] ++ vld1.8 {d4[]}, [r1] ++ veor d0, d3 ++ vst1.32 {d0[0]}, [r0 :32], r3 ++ vst1.32 {d2[0]}, [ip :32], r3 ++ vst1.32 {d1[0]}, [r0 :32] ++ vst1.32 {d4[0]}, [ip :32] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.8 {d0}, [r1 :64] @ Top ++ vmov.i8 d1, #128 ++ vld1.8 {d2[]}, [r2]! ++ mov r1, #8-2 ++ vdup.8 d3, ip ++ vhsub.u8 d0, d3 ++ veor d2, d1 ++ vqadd.s8 d0, d2 ++ vld1.8 {d2[]}, [r2]! ++ veor d0, d1 ++ vst1.8 {d0}, [r0], r3 ++1: ++ vld1.8 {d0[]}, [r2]! ++ subs r1, #2 ++ vst1.8 {d2}, [r0 :64], r3 ++ vld1.8 {d2[]}, [r2]! ++ vst1.8 {d0}, [r0 :64], r3 ++ bne 1b ++ ++ vst1.8 {d2}, [r0 :64] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1 ++ ldrb ip, [r2, #-1] @ Top-left ++ vld1.8 {q0}, [r1 :64] @ Top ++ mov r1, #16-2 ++ vld1.8 {d4[],d5[]}, [r2]! ++ vdup.8 q3, ip ++ vhsub.u8 q0, q3 ++ vmov.i8 q1, #128 ++ veor q2, q1 ++ vqadd.s8 q0, q2 ++ vld1.8 {d4[],d5[]}, [r2]! 
++ veor q0, q1 ++ vst1.8 {q0}, [r0], r3 ++1: ++ vld1.8 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.8 {q2}, [r0 :64], r3 ++ vld1.8 {d4[],d5[]}, [r2]! ++ vst1.8 {q0}, [r0 :64], r3 ++ bne 1b ++ ++ vst1.8 {q2}, [r0 :64] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1 ++ vld1.8 {d0[],d1[]}, [r2]! ++ add ip, r0, #16 ++ mov r1, #32-2 ++ vld1.8 {d2[],d3[]}, [r2]! ++ vst1.8 {q0}, [r0 :128], r3 ++ vst1.8 {q0}, [ip :128], r3 ++1: ++ vld1.8 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.8 {q1}, [r0 :128], r3 ++ vst1.8 {q1}, [ip :128], r3 ++ vld1.8 {d2[],d3[]}, [r2]! ++ vst1.8 {q0}, [r0 :128], r3 ++ vst1.8 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.8 {q1}, [r0 :128] ++ vst1.8 {q1}, [ip :128] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1 ++ add r1, r2, #2 ++ vld1.16 {d0[]}, [r2] ++ add r2, #4 ++ vld1.16 {d1[]}, [r1] ++ add r1, #4 ++ vld1.16 {d2[]}, [r2] ++A add r2, r0, r3, lsl #1 ++T lsl r3, #1 ++T add r2, r0, r3 ++ vld1.16 {d3[]}, [r1] ++A lsl r3, #2 ++T lsl r3, #1 ++ vst1.16 {d0}, [r0 :64], r3 ++ vst1.16 {d1}, [r2 :64], r3 ++ vst1.16 {d2}, [r0 :64] ++ vst1.16 {d3}, [r2 :64] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1 ++ vld1.16 {d0[],d1[]}, [r2]! ++ lsl r3, #1 ++ vld1.16 {d2[],d3[]}, [r2]! ++ mov r1, #8-2 ++ vst1.16 {q0}, [r0 :64], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q1}, [r0 :64], r3 ++ vld1.16 {d2[],d3[]}, [r2]! 
++ vst1.16 {q0}, [r0 :64], r3 ++ bne 1b ++ ++ vst1.16 {q1}, [r0 :64] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1 ++ vld1.16 {d0[],d1[]}, [r2]! ++ lsl r3, #1 ++ add ip, r0, #16 ++ mov r1, #16-2 ++ vld1.16 {d2[],d3[]}, [r2]! ++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q1}, [r0 :128], r3 ++ vst1.16 {q1}, [ip :128], r3 ++ vld1.16 {d2[],d3[]}, [r2]! ++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.16 {q1}, [r0 :128] ++ vst1.16 {q1}, [ip :128] ++ bx lr ++endfunc ++ ++ ++@------------------------------------------------------------------------------ ++@ ++@ 10 Bit ++@ Has clipping constants so 10-bit only but could easily be macroed up to ++@ 14-bit before we run out of bits ++ ++ ++@ ff_hevc_rpi_pred_vertical_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_4_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {d0}, [r2 :64] @ Left ++ vmov.i16 d2, #0 ++ vld1.16 {d1[]}, [r1] ++T lsl r3, #1 ++ vdup.16 d4, ip ++ vmov.i16 d3, #0x3ff ++ vld1.16 {d5}, [r1 :64] @ Top ++ vhsub.u16 d4, d0, d4 ++ vmov.i64 d0, #0xffff ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vadd.i16 d1, d1, d4 ++ vmov d6, d5 ++ vmax.s16 d1, d1, d2 ++ vmin.s16 d2, d1, d3 ++ vmin.s16 d1, d1, d3 ++ vbit d5, d1, d0 ++A lsl r3, #2 ++T lsl r3, #1 ++ vshr.u64 d2, #16 ++ vshr.u64 d1, #32 ++ vbit d6, d2, d0 ++ vst1.16 {d5}, [r0], r3 ++ vshr.u64 d2, #32 ++ vst1.16 {d6}, [r2], r3 ++ vbit d5, d1, d0 ++ vst1.16 {d5}, [r0] ++ vbit d6, d2, d0 ++ vst1.16 {d6}, [r2] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const 
uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_8_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {q0}, [r2 :128] @ Left ++ lsl r3, #1 ++ vdup.16 q1, ip ++ vld1.16 {d4[],d5[]}, [r1] ++ vhsub.u16 q0, q0, q1 ++ vmov.i16 q1, #0 ++ vadd.i16 q0, q2 ++ vmov.i16 q2, #0x3ff ++ vld1.16 {q3}, [r1 :128] @ Top ++ mov r1, #8 ++ vmax.s16 q0, q1 ++ vmov q1, q3 ++ vmin.s16 q0, q2 ++ vmov.i64 d16, #0xffff ++ vext.16 q2, q0, q0, #1 ++1: ++ vbit d2, d0, d16 ++ vbit d6, d4, d16 ++ vext.16 q0, q0, q0, #2 ++ subs r1, #2 ++ vst1.16 {q1}, [r0 :128], r3 ++ vext.16 q2, q2, q2, #2 ++ vst1.16 {q3}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_16_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {q0-q1}, [r2 :128] @ Left ++T lsl r3, #1 ++ vdup.16 q2, ip ++A add r2, r0, r3, lsl #1 ++T add r2, r0, r3 ++ vld1.16 {d6[],d7[]}, [r1] ++A lsl r3, #2 ++T lsl r3, #1 ++ vhsub.u16 q0, q2 ++ vhsub.u16 q1, q2 ++ vadd.i16 q0, q3 ++ vadd.i16 q1, q3 ++ vmov.i16 q2, #0 ++ vld1.16 {q8-q9}, [r1 :128] @ Top ++ mov r1, #0 ++ vmov.i16 q3, #0x3ff ++ vmax.s16 q0, q2 ++ vmax.s16 q1, q2 ++ vmin.s16 q0, q3 ++ vmin.s16 q1, q3 ++ vmov q10, q8 ++ vmov q11, q9 ++ vext.16 q2, q0, q1, #1 ++ vext.16 q3, q1, q1, #1 ++ vmov.i64 d24, #0xffff ++1: ++ vbit d16, d0, d24 ++ vbit d20, d4, d24 ++ vext.16 q0, q0, q0, #2 ++ subs r1, #1<<30 ++ vst1.16 {q8-q9}, [r0 :128], r3 ++ vext.16 q2, q2, q2, #2 ++ vst1.16 {q10-q11}, [r2 :128], r3 ++ bne 1b ++1: ++ vbit d16, d2, d24 ++ vbit d20, d6, d24 ++ vext.16 q1, q1, q1, #2 ++ subs r1, #1<<30 ++ vst1.16 {q8-q9}, [r0 :128], r3 ++ vext.16 q3, q3, q3, #2 ++ vst1.16 {q10-q11}, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t 
*_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_32_neon_10, export=1 ++ vldm r1, { q0-q3 } @ Up ++ lsl r3, #1 ++ mov r1, #32 ++ add r2, r0, #32 ++1: ++ vst1.16 {q0-q1}, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q2-q3}, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1 ++ vld1.16 {q0 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #2 ++ lsl r3, #3 ++ ++ vst1.16 {q0 }, [r0 :128], r3 ++ vst1.16 {q0 }, [r2 :128], r3 ++ vst1.16 {q0 }, [r0 :128] ++ vst1.16 {q0 }, [r2 :128] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1 ++ vld1.16 {q0, q1 }, [r1 :128] @ Up ++ add r2, r0, r3, lsl #2 ++ lsl r3, #3 ++ mov r1, #4 ++1: ++ vst1.16 {q0, q1 }, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q0, q1 }, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_vertical_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1 ++ vldm r1, { q0-q3 } @ Up ++ lsl r3, #2 ++ mov r1, #16 ++ add r2, r0, #32 ++1: ++ vst1.16 {q0-q1}, [r0 :128], r3 ++ subs r1, #1 ++ vst1.16 {q2-q3}, [r2 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++@ ff_hevc_rpi_pred_horizontal_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {d0}, [r1 :64] @ Top ++ vmov.i16 d1, #0 ++ vld1.16 {d2[]}, [r2]! 
++T lsl r3, #1 ++ vdup.16 d3, ip ++ vmov.i16 d4, #0x3ff ++ vhsub.u16 d0, d3 ++A add ip, r0, r3, lsl #1 ++T add ip, r0, r3 ++ vld1.16 {d3[]}, [r2]! ++A lsl r3, #2 ++T lsl r3, #1 ++ vadd.i16 d0, d2 ++ vld1.16 {d2[]}, [r2]! ++ vmax.s16 d0, d1 ++ vld1.16 {d1[]}, [r2] ++ vmin.s16 d0, d4 ++ vst1.16 {d0}, [r0 :64], r3 ++ vst1.16 {d3}, [ip :64], r3 ++ vst1.16 {d2}, [r0 :64] ++ vst1.16 {d1}, [ip :64] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {q0}, [r1 :128] @ Top ++ lsl r3, #1 ++ vdup.16 q1, ip ++ mov r1, #8-2 ++ vhsub.u16 q0, q1 ++ vld1.16 {d2[],d3[]}, [r2]! ++ vmov.i16 q2, #0 ++ vadd.i16 q0, q1 ++ vmov.i16 q1, #0x3ff ++ vmax.s16 q0, q2 ++ vld1.16 {d4[],d5[]}, [r2]! ++ vmin.s16 q0, q1 ++ vst1.16 {q0}, [r0 :128], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q2}, [r0 :128], r3 ++ vld1.16 {d4[],d5[]}, [r2]! ++ vst1.16 {q0}, [r0 :128], r3 ++ bne 1b ++ ++ vst1.16 {q2}, [r0 :128] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1 ++ ldrh ip, [r2, #-2] @ Top-left ++ vld1.16 {q0-q1}, [r1 :128] @ Top ++ lsl r3, #1 ++ vdup.16 q2, ip ++ add ip, r0, r3 ++ vhsub.u16 q0, q2 ++ add ip, #16 ++ vhsub.u16 q1, q2 ++ mov r1, #16-2 ++ vld1.16 {d4[],d5[]}, [r2]! ++ vmov.i16 q3, #0 ++ vadd.u16 q0, q2 ++ vadd.i16 q1, q2 ++ vmov.i16 q2, #0x3ff ++ vmax.s16 q0, q3 ++ vmax.s16 q1, q3 ++ vld1.16 {d6[],d7[]}, [r2]! ++ vmin.s16 q0, q2 ++ vmin.s16 q1, q2 ++ vst1.16 {q0-q1}, [r0 :128], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q3}, [r0 :128], r3 ++ vst1.16 {q3}, [ip :128], r3 ++ vld1.16 {d6[],d7[]}, [r2]! 
++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.16 {q3}, [r0 :128] ++ vst1.16 {q3}, [ip :128] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1 ++ vld1.16 {d0[],d1[]}, [r2]! ++ add ip, r0, #16 ++ push {lr} ++ mov lr, #32 ++ vld1.16 {d2[],d3[]}, [r2]! ++ lsl r3, #1 ++ vst1.16 {q0}, [r0 :128], lr ++ sub r3, #32 ++ vst1.16 {q0}, [ip :128], lr ++ mov r1, #32-2 ++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++1: ++ vld1.16 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.16 {q1}, [r0 :128], lr ++ vst1.16 {q1}, [ip :128], lr ++ vst1.16 {q1}, [r0 :128], r3 ++ vst1.16 {q1}, [ip :128], r3 ++ vld1.16 {d2[],d3[]}, [r2]! ++ vst1.16 {q0}, [r0 :128], lr ++ vst1.16 {q0}, [ip :128], lr ++ vst1.16 {q0}, [r0 :128], r3 ++ vst1.16 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.16 {q1}, [r0 :128], lr ++ vst1.16 {q1}, [ip :128], lr ++ vst1.16 {q1}, [r0 :128] ++ vst1.16 {q1}, [ip :128] ++ pop {pc} ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1 ++ add r1, r2, #4 ++ vld1.32 {d0[],d1[]}, [r2] ++ add r2, #8 ++ vld1.32 {d2[],d3[]}, [r1] ++ add r1, #8 ++ vld1.32 {d4[],d5[]}, [r2] ++A add r2, r0, r3, lsl #2 ++T lsl r3, #2 ++T add r2, r0, r3 ++ vld1.32 {d6[],d7[]}, [r1] ++A lsl r3, #3 ++T lsl r3, #1 ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q1}, [r2 :128], r3 ++ vst1.32 {q2}, [r0 :128] ++ vst1.32 {q3}, [r2 :128] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1 ++ vld1.32 {d0[],d1[]}, 
[r2]! ++ lsl r3, #2 ++ add ip, r0, #16 ++ mov r1, #8-2 ++ vld1.32 {d2[],d3[]}, [r2]! ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q0}, [ip :128], r3 ++1: ++ vld1.32 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.32 {q1}, [r0 :128], r3 ++ vst1.32 {q1}, [ip :128], r3 ++ vld1.32 {d2[],d3[]}, [r2]! ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.32 {q1}, [r0 :128] ++ vst1.32 {q1}, [ip :128] ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_horizontal_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1 ++ vld1.32 {d0[],d1[]}, [r2]! ++ add ip, r0, #16 ++ push {lr} ++ mov lr, #32 ++ vld1.32 {d2[],d3[]}, [r2]! ++ lsl r3, #2 ++ vst1.32 {q0}, [r0 :128], lr ++ sub r3, #32 ++ vst1.32 {q0}, [ip :128], lr ++ mov r1, #16-2 ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q0}, [ip :128], r3 ++1: ++ vld1.32 {d0[],d1[]}, [r2]! ++ subs r1, #2 ++ vst1.32 {q1}, [r0 :128], lr ++ vst1.32 {q1}, [ip :128], lr ++ vst1.32 {q1}, [r0 :128], r3 ++ vst1.32 {q1}, [ip :128], r3 ++ vld1.32 {d2[],d3[]}, [r2]! ++ vst1.32 {q0}, [r0 :128], lr ++ vst1.32 {q0}, [ip :128], lr ++ vst1.32 {q0}, [r0 :128], r3 ++ vst1.32 {q0}, [ip :128], r3 ++ bne 1b ++ ++ vst1.32 {q1}, [r0 :128], lr ++ vst1.32 {q1}, [ip :128], lr ++ vst1.32 {q1}, [r0 :128] ++ vst1.32 {q1}, [ip :128] ++ pop {pc} ++endfunc ++ ++ ++ +diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S +new file mode 100644 +index 0000000000..af8c4c03f0 +--- /dev/null ++++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S +@@ -0,0 +1,1043 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. 
++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#include "libavutil/arm/asm.S" ++#include "neon.S" ++ ++@ Planar intra pred (8.4.4.2.4) ++@ ++@ predSamples[ x ][ y ] = ++@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] + ++@ ( x + 1 ) * p[ nTbS ][ -1 ] + ++@ ( nTbS - 1 - y ) * p[ x ][ -1 ] + ++@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 ) ++ ++@ All 10-bit functions would work with 9 ++ ++ ++@ ff_hevc_rpi_pred_planar_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_4_neon_8, export=1 ++ ++ vld1.8 {d0}, [r1] @ Top ++ adr ip, nb_3_0_1_4 ++ vld1.8 {d1}, [r2] @ Left ++ vmov.i64 d2, #0xffffffff ++ vldr d3, [ip, #8] @ {1,2,3,4,1,2,3,4} ++ add r1, r0, r3 ++ vdup.32 d4, d0[0] @ {t0,t1,t2,t3,t0,t1,t2,t3} ++ vdup.8 d0, d0[4] @ {t4,t4,t4,t4,t4,t4,t4,t4} ++ vdup.8 d5, d1[4] @ {l4,l4,l4,l4,l4,l4,l4,l4} ++ vdup.8 d6, d1[0] @ {l0,l0,l0,l0,l0,l0,l0,l0} ++ vshll.u8 q8, d4, #2 ++ lsl r3, #1 ++ vsubl.u8 q2, d5, d4 ++ vmlal.u8 q8, d0, d3 ++ vld1.8 {d0}, [ip] @ {3,2,1,0,3,2,1,0} ++ vdup.8 d7, d1[1] @ {l1,l1,l1,l1,l1,l1,l1,l1} ++ vshl.s16 q9, q2, #1 ++ vbif d6, d7, d2 @ {l0,l0,l0,l0,l1,l1,l1,l1} ++ vadd.i16 d16, d4 ++ vdup.8 d7, d1[2] @ {l2,l2,l2,l2,l2,l2,l2,l2} ++ vadd.i16 d17, d18 ++ vdup.8 d1, d1[3] @ {l3,l3,l3,l3,l3,l3,l3,l3} ++ vadd.i16 q2, q8, q9 ++ vmlal.u8 q8, d0, d6 ++ vbif d7, d1, d2 @ {l2,l2,l2,l2,l3,l3,l3,l3} ++ vmlal.u8 q2, d0, d7 ++ vrshrn.i16 d0, q8, #3 ++ vst1.32 d0[0], [r0 :32], r3 ++ vst1.32 d0[1], [r1 :32], r3 ++ vrshrn.i16 d0, q2, #3 ++ vst1.32 d0[0], [r0 :32] ++ vst1.32 d0[1], [r1 :32] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_4_neon_10, export=1 ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ vld1.16 {q0}, [r1 :64] @ Top 
++ adr ip, nbh_3_0_1_4 ++ vldr d2, [r2, #8] @ Left (lower) ++ vldr d3, [ip, #8] @ {1,2,3,4} ++T lsl r3, #1 ++ vshl.s16 d4, d0, #2 ++ vdup.16 d1, d1[0] @ {t4,t4,t4,t4} ++ vldr d5, [r2] @ Left (upper) ++ vdup.16 d2, d2[0] @ {l4,l4,l4,l4} ++ vldr d6, [ip] @ {3,2,1,0} ++ vmla.i16 d4, d3, d1 @ Acc set up ++ vsub.i16 d0, d2, d0 @ Add set up ++ vmov d7, d6 ++ vdup.16 d2, d5[0] ++ vdup.16 d3, d5[1] ++ vdup.16 d16, d5[2] ++ vadd.i16 d18, d0, d4 ++ vshl.s16 d0, #1 @ x2 ++ vadd.i16 d19, d0, d4 ++ vdup.16 d17, d5[3] ++ vadd.i16 d4, d0, d18 ++A add r1, r0, r3, lsl #1 ++T add r1, r0, r3 ++ vadd.i16 d5, d0, d19 ++A lsl r3, #2 ++T lsl r3, #1 ++ vmla.i16 q9, q1, q3 ++ vmla.i16 q2, q8, q3 ++ vrshr.u16 q0, q9, #3 ++ vst1.16 {d0}, [r0], r3 ++ vrshr.u16 d2, d4, #3 ++ vst1.16 {d1}, [r1], r3 ++ vrshr.u16 d3, d5, #3 ++ vst1.16 {d2}, [r0] ++ vst1.16 {d3}, [r1] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_8_neon_8, export=1 ++ ++ vld1.8 {q0}, [r1] @ Top ++ adr ip, nb_7_0_1_8 ++ vldr d2, [r2, #8] @ Left (lower) ++ mov r1, #8 ++ vldr d3, [ip, #8] @ {1,2,3,4,5,6,7,8} ++ vshll.u8 q2, d0, #3 ++ vdup.8 d1, d1[0] @ {t8,t8,t8,t8,t8,t8,t8,t8} ++ vdup.8 d2, d2[0] @ {l8,l8,l8,l8,l8,l8,l8,l8} ++ vldr d6, [r2] @ Left (upper) ++ vmlal.u8 q2, d3, d1 ++ vsubl.u8 q0, d2, d0 ++ vldr d7, [ip] @ {7,6,5,4,3,2,1,0} ++ ++@ u8 7..0 [1] d7 ++@ u8 left[y] [1] d6 ++@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1] ++ ++ vdup.8 d2, d6[0] ++ vadd.i16 q2, q0 ++ vdup.8 d3, d6[1] ++ vadd.i16 q8, q2, q0 ++1: ++ vmlal.u8 q2, d7, d2 ++ subs r1, #2 ++ vadd.i16 q9, q8, q0 ++ vmlal.u8 q8, d7, d3 ++ vdup.8 d2, d6[2] ++ vdup.8 d3, d6[3] ++ vrshrn.i16 d20, q2, #4 ++ vshr.u64 d6, #16 ++ vmov q2, q9 ++ vst1.8 {d20}, [r0], r3 ++ vrshrn.i16 d20, q8, #4 ++ vadd.i16 q8, q2, q0 ++ vst1.8 
{d20}, [r0], r3 ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_8_neon_10, export=1 ++ ++ adr ip, nb_7_0_1_8 ++ vld1.16 {q0}, [r1 :128]! @ Top (left) ++ lsl r3, #1 ++ vld1.16 {q1}, [ip :128] @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8} ++ add ip, r2, #16 ++ vld1.16 {d4[],d5[]}, [r1] @ Top (right) ++ mov r1, #8-2 ++ vshl.s16 q3, q0, #3 ++ vmovl.u8 q8, d3 @ {1,2,3,4,5,6,7,8} ++ vld1.16 {d18[],d19[]}, [ip] @ Left (lower) ++ vmla.i16 q3, q8, q2 @ Acc set up ++ vsub.i16 q0, q9, q0 @ Add set up ++ vmovl.u8 q1, d2 @ {7,6,5,4,3,2,1,0} ++ vadd.i16 q2, q3, q0 ++ ++@ u16 7..0 [1] q1 ++@ u32 left[y] [1] [r2] ++@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1] ++ ++ vld1.16 {d6[],d7[]}, [r2]! ++ vadd.i16 q8, q2, q0 ++ vld1.16 {d18[],d19[]}, [r2]! ++ vmla.i16 q2, q1, q3 ++ vadd.i16 q3, q8, q0 ++ vmla.i16 q8, q1, q9 ++1: ++ vrshr.u16 q9, q2, #4 ++ subs r1, #2 ++ vmov q2, q3 ++ vrshr.u16 q10, q8, #4 ++ vld1.16 {d6[],d7[]}, [r2]! ++ vst1.16 {q9}, [r0 :128], r3 ++ vadd.i16 q8, q2, q0 ++ vld1.16 {d18[],d19[]}, [r2]! 
++ vmla.i16 q2, q1, q3 ++ vadd.i16 q3, q8, q0 ++ vmla.i16 q8, q1, q9 ++ vst1.16 {q10}, [r0 :128], r3 ++ bne 1b ++ ++ vrshr.u16 q9, q2, #4 ++ add r3, r0 ++ vrshr.u16 q10, q8, #4 ++ vst1.16 {q9}, [r0 :128] ++ vst1.16 {q10}, [r3 :128] ++ ++ bx lr ++endfunc ++ ++ ++@------------------------------------------------------------------------------ ++@ ++@ Data - has to be in two lumps to ensure we can always reach using adr ++ ++ .balign 64 ++ ++nb_31_0_1_32: ++ .byte 31, 30, 29, 28, 27, 26, 25, 24 ++ .byte 23, 22, 21, 20, 19, 18, 17, 16 ++nb_15_0_1_16: ++ .byte 15, 14, 13, 12, 11, 10, 9, 8 ++ .byte 7, 6, 5, 4, 3, 2, 1, 0 ++ .byte 1, 2, 3, 4, 5, 6, 7, 8 ++ .byte 9, 10, 11, 12, 13, 14, 15, 16 ++ .byte 17, 18, 19, 20, 21, 22, 23, 24 ++ .byte 25, 26, 27, 28, 29, 30, 31, 32 ++ ++ @ should be back on a 64-byte boundary here ++ ++ @ These could be extracted from the above array, but are separated ++ @ out for better (16 byte) alignment ++nb_3_0_1_4: ++ .byte 3, 2, 1, 0, 3, 2, 1, 0 ++ .byte 1, 2, 3, 4, 1, 2, 3, 4 ++nb_7_0_1_8: ++ .byte 7, 6, 5, 4, 3, 2, 1, 0 ++ .byte 1, 2, 3, 4, 5, 6, 7, 8 ++nbh_3_0_1_4: ++ .short 3, 2, 1, 0, 1, 2, 3, 4 ++ ++@------------------------------------------------------------------------------ ++ ++ ++@ ff_hevc_rpi_pred_planar_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_16_neon_8, export=1 ++ ++ adr ip, nb_15_0_1_16 + 16 ++ vld1.8 {q0}, [r1 :128]! 
@ Top (left) ++ add r2, #16 ++ vld1.8 {q1}, [ip: 128] @ {1,2,3...16} ++ vld1.8 {d4[]}, [r1] @ Top (right) ++ sub ip, #16 ++ vshll.u8 q3, d0, #4 ++ mov r1, #16 ++ vshll.u8 q8, d1, #4 ++ vld1.8 {d5[]}, [r2] @ Left (lower) ++ sub r2, #16 ++ vmlal.u8 q3, d2, d4 ++ vmlal.u8 q8, d3, d4 @ Acc set up ++ vsubl.u8 q1, d5, d0 ++ vsubl.u8 q0, d5, d1 @ Add set up ++ vld1.8 {q2}, [ip :128] @ {15,14,13...0} ++ ++@ u8 15..0 [1] q2 ++@ u8 left[y] [1] [r2] ++@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] ++ ++ vadd.i16 q3, q1 ++ vadd.i16 q8, q0 ++1: ++ vadd.i16 q10, q3, q1 ++ subs r1, #2 ++ vld1.8 {d18[]}, [r2]! ++ vadd.i16 q11, q8, q0 ++ vld1.8 {d19[]}, [r2]! ++ vmlal.u8 q3, d4, d18 ++ vmlal.u8 q8, d5, d18 ++ vadd.i16 q12, q10, q1 ++ vmlal.u8 q10, d4, d19 ++ vadd.i16 q13, q11, q0 ++ vmlal.u8 q11, d5, d19 ++ vrshrn.u16 d18, q3, #5 ++ vrshrn.u16 d19, q8, #5 ++ vmov q3, q12 ++ vst1.8 {q9}, [r0 :128], r3 ++ vrshrn.u16 d18, q10, #5 ++ vrshrn.u16 d19, q11, #5 ++ vmov q8, q13 ++ vst1.8 {q9}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_16_neon_10, export=1 ++ ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr ip, nb_15_0_1_16 + 16 ++ vld1.16 {q0-q1}, [r1 :128]! 
@ Top (left) ++ add r2, #32 ++ vld1.8 {q2}, [ip :128] @ {1,2,3...16} ++ lsl r3, #1 ++ vld1.16 {d6[],d7[]}, [r1] @ Top (right) ++ sub ip, #16 ++ vmovl.u8 q8, d4 ++ mov r1, #16 ++ vshl.i16 q9, q0, #4 ++ vmovl.u8 q2, d5 ++ vshl.i16 q10, q1, #4 ++ vld1.16 {d22[],d23[]}, [r2] @ Left (lower) ++ sub r2, #32 ++ vld1.8 {q12}, [ip] @ {15,14,13...0} ++ vmla.i16 q9, q8, q3 ++ vmla.i16 q10, q2, q3 @ Acc set up ++ vsub.i16 q0, q11, q0 ++ vsub.i16 q1, q11, q1 @ Add set up ++ vadd.i16 q2, q9, q0 ++ vadd.i16 q3, q10, q1 ++ vmovl.u8 q8, d24 ++ vmovl.u8 q9, d25 ++ ++@ u16 15..0 [2] q8,q9 ++@ u16 left[y] [2] [r2] ++@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] ++ ++1: ++ vadd.i16 q10, q2, q0 ++ subs r1, #2 ++ vld1.16 {d24[],d25[]}, [r2]! ++ vadd.i16 q11, q3, q1 ++ vld1.16 {d28[],d29[]}, [r2]! ++ vmla.i16 q2, q8, q12 ++ vmla.i16 q3, q9, q12 ++ vadd.i16 q12, q10, q0 ++ vmla.i16 q10, q8, q14 ++ vadd.i16 q13, q11, q1 ++ vmla.i16 q11, q9, q14 ++ vrshr.u16 q14, q2, #5 ++ vrshr.u16 q15, q3, #5 ++ vmov q2, q12 ++ vst1.16 {q14-q15}, [r0 :128], r3 ++ vrshr.u16 q14, q10, #5 ++ vrshr.u16 q15, q11, #5 ++ vmov q3, q13 ++ vst1.16 {q14-q15}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_32_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_32_neon_8, export=1 ++ ++ vld1.8 {q0-q1}, [r1 :128]! 
@ Top (left) ++ adr ip, nb_31_0_1_32 + 32 ++ vpush {d8-d12} ++ vld1.8 {q2-q3}, [ip :128] @ {1,2,3...32} ++ add r2, #32 ++ vld1.8 {d8[]}, [r1] @ Top (right) ++ sub ip, #32 ++ vshll.u8 q8, d0, #5 ++ mov r1, #32 ++ vld1.8 {d9[]}, [r2] @ Left (lower) ++ sub r2, #32 ++ vshll.u8 q9, d1, #5 ++ vshll.u8 q10, d2, #5 ++ vshll.u8 q11, d3, #5 ++ vmlal.u8 q8, d4, d8 ++ vsubl.u8 q12, d9, d0 ++ vmlal.u8 q9, d5, d8 ++ vsubl.u8 q13, d9, d1 ++ vmlal.u8 q10, d6, d8 ++ vsubl.u8 q14, d9, d2 ++ vmlal.u8 q11, d7, d8 @ Acc set up ++ vsubl.u8 q15, d9, d3 @ Add set up ++ vadd.i16 q8, q12 ++ vadd.i16 q9, q13 ++ vadd.i16 q10, q14 ++ vadd.i16 q11, q15 ++ vld1.8 {q4-q5}, [ip :128] @ {31,30,29...0} ++ ++@ u8 31..0 [2] q4,q5 ++@ u8 left[y] [2] [r2] ++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] ++ ++ vld1.8 {d12[]}, [r2]! ++ vadd.i16 q0, q8, q12 ++ b 2f ++1: ++ vld1.8 {d12[]}, [r2]! ++ vrshrn.u16 d3, q1, #6 ++ vrshrn.u16 d2, q0, #6 ++ vadd.i16 q0, q8, q12 ++ vrshrn.u16 d4, q2, #6 ++ vrshrn.u16 d5, q3, #6 ++ vst1.8 {q1-q2}, [r0 :128], r3 ++2: vadd.i16 q1, q9, q13 ++ subs r1, #2 ++ vadd.i16 q2, q10, q14 ++ vadd.i16 q3, q11, q15 ++ vmlal.u8 q8, d8, d12 ++ vmlal.u8 q9, d9, d12 ++ vmlal.u8 q10, d10, d12 ++ vmlal.u8 q11, d11, d12 ++ vld1.8 {d12[]}, [r2]! 
++ vrshrn.u16 d19, q9, #6 ++ vrshrn.u16 d18, q8, #6 ++ vadd.i16 q8, q0, q12 ++ vrshrn.u16 d20, q10, #6 ++ vrshrn.u16 d21, q11, #6 ++ vst1.8 {q9-q10}, [r0 :128], r3 ++ vadd.i16 q9, q1, q13 ++ vadd.i16 q10, q2, q14 ++ vadd.i16 q11, q3, q15 ++ vmlal.u8 q0, d8, d12 ++ vmlal.u8 q1, d9, d12 ++ vmlal.u8 q2, d10, d12 ++ vmlal.u8 q3, d11, d12 ++ ++ bne 1b ++ ++ vpop {d8-d12} ++ ++ vrshrn.u16 d3, q1, #6 ++ vrshrn.u16 d2, q0, #6 ++ vrshrn.u16 d4, q2, #6 ++ vrshrn.u16 d5, q3, #6 ++ vst1.8 {q1-q2}, [r0 :128] ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_32_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_32_neon_10, export=1 ++ ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) ++ adr ip, nb_31_0_1_32 + 32 ++ vpush {q4-q7} ++ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre) ++ add r2, #64 ++ vld1.8 {q14-q15}, [ip :128] @ {1,2,3...32} ++T lsl r3, #1 ++ vld1.16 {d8[],d9[]}, [r1] @ Top (right) ++ sub ip, #32 ++ vmovl.u8 q12, d28 ++ mov r1, #32 ++ vmovl.u8 q13, d29 ++ vld1.8 {q6-q7}, [ip :128] @ {31,30,29...0} ++ vmovl.u8 q14, d30 ++ vmovl.u8 q15, d31 ++ vld1.16 {d10[],d11[]}, [r2] @ Left (lower) ++ sub r2, #64 ++ vshl.i16 q8, q0, #5 ++ vshl.i16 q9, q1, #5 ++ vshl.i16 q10, q2, #5 ++ vshl.i16 q11, q3, #5 ++ vmla.i16 q8, q12, q4 ++ vsub.i16 q0, q5, q0 ++ vmla.i16 q9, q13, q4 ++ vsub.i16 q1, q5, q1 ++ vmla.i16 q10, q14, q4 ++ vmov.u16 ip, d0[0] ++ vsub.i16 q2, q5, q2 ++ vmla.i16 q11, q15, q4 @ Acc set up ++ vsub.i16 q3, q5, q3 @ Add set up ++ vadd.i16 q8, q0 ++ vadd.i16 q9, q1 ++ vadd.i16 q10, q2 ++ vadd.i16 q11, q3 ++ vmovl.u8 q4, d12 ++ vmovl.u8 q5, d13 ++ vmovl.u8 q6, d14 ++ vmovl.u8 q7, d15 ++ ++@ u16 31..0 [4] q4-q7 ++@ u16 left[y] [4] [r2] ++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1] ++ ++ 
vadd.i16 q12, q8, q0 ++A sub r0, r0, r3, lsl #1 ++T sub r0, r3 ++1: ++ vld1.16 {d0[0]}, [r2]! ++A add r0, r0, r3, lsl #1 ++T add r0, r3 ++ vadd.i16 q13, q9, q1 ++ subs r1, #2 ++ vadd.i16 q14, q10, q2 ++ vadd.i16 q15, q11, q3 ++ vmla.i16 q8, q4, d0[0] ++ vmla.i16 q9, q5, d0[0] ++ vmla.i16 q10, q6, d0[0] ++ vmla.i16 q11, q7, d0[0] ++ vmov.16 d0[0], ip ++ vrshr.u16 q8, #6 ++ vrshr.u16 q9, #6 ++ vrshr.u16 q10, #6 ++ vrshr.u16 q11, #6 ++ vstm r0, {q8-q11} ++ vadd.i16 q8, q12, q0 ++A add r0, r0, r3, lsl #1 ++T add r0, r3 ++ vld1.16 {d0[0]}, [r2]! ++ vadd.i16 q9, q13, q1 ++ vadd.i16 q10, q14, q2 ++ vadd.i16 q11, q15, q3 ++ vmla.i16 q12, q4, d0[0] ++ vmla.i16 q13, q5, d0[0] ++ vmla.i16 q14, q6, d0[0] ++ vmla.i16 q15, q7, d0[0] ++ vmov.16 d0[0], ip ++ vrshr.u16 q12, #6 ++ vrshr.u16 q13, #6 ++ vrshr.u16 q14, #6 ++ vrshr.u16 q15, #6 ++ vstm r0, {q12-q15} ++ vadd.i16 q12, q8, q0 ++ bne 1b ++ ++ vpop {q4-q7} ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_4_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1 ++ ++ vld1.8 {q0}, [r1] @ Top ++ adr ip, nbx2_3_0_1_4 ++ vldr d2, [r2, #8] @ Left (lower) ++ mov r1, #4 ++ vldr d3, [ip, #8] @ {1,1,2,2,3,3,4,4} ++ lsl r3, #1 ++ vshll.u8 q2, d0, #2 ++ vdup.16 d1, d1[0] @ {t4,t4,t4,t4,t4,t4,t4,t4} ++ vdup.16 d2, d2[0] @ {l4,l4,l4,l4,l4,l4,l4,l4} ++ vldr d6, [r2] @ Left (upper) ++ vmlal.u8 q2, d3, d1 ++ vsubl.u8 q0, d2, d0 ++ vldr d7, [ip] @ {3,3,2,2,1,1,0,0} ++ ++@ u8 3..0 [1] d7 ++@ u8 left[y] [1] d6 ++@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1] ++ ++ vdup.16 d2, d6[0] ++ vadd.i16 q2, q0 ++ vdup.16 d3, d6[1] ++ vadd.i16 q8, q2, q0 ++1: ++ vmlal.u8 q2, d7, d2 ++ subs r1, #2 ++ vadd.i16 q9, q8, q0 ++ vmlal.u8 q8, d7, d3 ++ vdup.16 d2, d6[2] ++ vdup.16 d3, d6[3] ++ vrshrn.i16 d20, q2, #3 ++ vmov q2, q9 ++ vst1.8 {d20}, 
[r0], r3 ++ vrshrn.i16 d20, q8, #3 ++ vadd.i16 q8, q2, q0 ++ vst1.8 {d20}, [r0], r3 ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_4_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1 ++ ++ adr ip, nbx2_3_0_1_4 ++ vld1.16 {q0}, [r1 :128]! @ Top (left) ++ lsl r3, #2 ++ vld1.16 {q1}, [ip :128] @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4} ++ add ip, r2, #16 ++ vld1.32 {d4[],d5[]}, [r1] @ Top (right) ++ vshl.s16 q3, q0, #2 ++ vmovl.u8 q8, d3 @ {1,1,2,2,3,3,4,4} ++ vld1.32 {d18[],d19[]}, [ip] @ Left (lower) ++ vmla.i16 q3, q8, q2 @ Acc set up ++ vsub.i16 q0, q9, q0 @ Add set up ++ vmovl.u8 q1, d2 @ {3,3,2,2,1,1,0,0} ++ vadd.i16 q2, q3, q0 ++ ++@ u16 3..0 [1] q1 ++@ u32 left[y] [1] [r2] ++@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1] ++ ++ vld1.32 {d6[],d7[]}, [r2]! ++ vadd.i16 q8, q2, q0 ++ vld1.32 {d18[],d19[]}, [r2]! ++ vmla.i16 q2, q1, q3 ++ vadd.i16 q3, q8, q0 ++ vmla.i16 q8, q1, q9 ++ ++ vrshr.u16 q9, q2, #3 ++ vmov q2, q3 ++ vrshr.u16 q10, q8, #3 ++ vld1.32 {d6[],d7[]}, [r2]! ++ vst1.16 {q9}, [r0 :128], r3 ++ vadd.i16 q8, q2, q0 ++ vld1.32 {d18[],d19[]}, [r2]! ++ vmla.i16 q2, q1, q3 ++ vadd.i16 q3, q8, q0 ++ vmla.i16 q8, q1, q9 ++ vst1.16 {q10}, [r0 :128], r3 ++ ++ vrshr.u16 q9, q2, #3 ++ add r3, r0 ++ vrshr.u16 q10, q8, #3 ++ vst1.16 {q9}, [r0 :128] ++ vst1.16 {q10}, [r3 :128] ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_8_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1 ++ ++ adr ip, nbx2_7_0_1_8 + 16 ++ vld1.8 {q0}, [r1 :128]! 
@ Top (left) ++ add r2, #16 ++ vld1.8 {q1}, [ip: 128] @ {1,1,2,2,3,3...8,8} ++ lsl r3, #1 ++ vld1.16 {d4[]}, [r1] @ Top (right) ++ sub ip, #16 ++ vshll.u8 q3, d0, #3 ++ mov r1, #8 ++ vshll.u8 q8, d1, #3 ++ vld1.16 {d5[]}, [r2] @ Left (lower) ++ sub r2, #16 ++ vmlal.u8 q3, d2, d4 ++ vmlal.u8 q8, d3, d4 @ Acc set up ++ vsubl.u8 q1, d5, d0 ++ vsubl.u8 q0, d5, d1 @ Add set up ++ vld1.8 {q2}, [ip :128] @ {7,7,6,6,5,5...0,0} ++ ++@ u8 7..0 [1] q2 ++@ u8 left[y] [1] [r2] ++@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1] ++ ++ vadd.i16 q3, q1 ++ vadd.i16 q8, q0 ++1: ++ vadd.i16 q10, q3, q1 ++ subs r1, #2 ++ vld1.16 {d18[]}, [r2]! ++ vadd.i16 q11, q8, q0 ++ vld1.16 {d19[]}, [r2]! ++ vmlal.u8 q3, d4, d18 ++ vmlal.u8 q8, d5, d18 ++ vadd.i16 q12, q10, q1 ++ vmlal.u8 q10, d4, d19 ++ vadd.i16 q13, q11, q0 ++ vmlal.u8 q11, d5, d19 ++ vrshrn.u16 d18, q3, #4 ++ vrshrn.u16 d19, q8, #4 ++ vmov q3, q12 ++ vst1.8 {q9}, [r0 :128], r3 ++ vrshrn.u16 d18, q10, #4 ++ vrshrn.u16 d19, q11, #4 ++ vmov q8, q13 ++ vst1.8 {q9}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++ ++endfunc ++ ++ ++@------------------------------------------------------------------------------ ++@ ++@ Data - has to be in two lumps to ensure we can always reach using adr ++ ++ .balign 64 ++ ++nbx2_15_0_1_16: ++ .byte 15, 15, 14, 14, 13, 13, 12, 12 ++ .byte 11, 11, 10, 10, 9, 9, 8, 8 ++nbx2_7_0_1_8: ++ .byte 7, 7, 6, 6, 5, 5, 4, 4 ++ .byte 3, 3, 2, 2, 1, 1, 0, 0 ++ .byte 1, 1, 2, 2, 3, 3, 4, 4 ++ .byte 5, 5, 6, 6, 7, 7, 8, 8 ++ .byte 9, 9, 10, 10, 11, 11, 12, 12 ++ .byte 13, 13, 14, 14, 15, 15, 16, 16 ++ ++ @ should be back on a 64-byte boundary here ++ ++nbx2_3_0_1_4: ++ .byte 3, 3, 2, 2, 1, 1, 0, 0 ++ .byte 1, 1, 2, 2, 3, 3, 4, 4 ++ ++@------------------------------------------------------------------------------ ++ ++ ++@ ff_hevc_rpi_pred_planar_c_8_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) 
[r3] ++ ++function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1 ++ ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ adr ip, nbx2_7_0_1_8 + 16 ++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) ++ add r2, #32 ++ vld1.8 {q2}, [ip :128] @ {1,1,2,2,3,3...8,8} ++ lsl r3, #2 ++ vld1.32 {d6[],d7[]}, [r1] @ Top (right) ++ sub ip, #16 ++ vmovl.u8 q8, d4 ++ mov r1, #8 ++ vshl.i16 q9, q0, #3 ++ vmovl.u8 q2, d5 ++ vshl.i16 q10, q1, #3 ++ vld1.32 {d22[],d23[]}, [r2] @ Left (lower) ++ sub r2, #32 ++ vld1.8 {q12}, [ip] @ {7,7,6,6,5,5...0,0} ++ vmla.i16 q9, q8, q3 ++ vmla.i16 q10, q2, q3 @ Acc set up ++ vsub.i16 q0, q11, q0 ++ vsub.i16 q1, q11, q1 @ Add set up ++ vadd.i16 q2, q9, q0 ++ vadd.i16 q3, q10, q1 ++ vmovl.u8 q8, d24 ++ vmovl.u8 q9, d25 ++ ++@ u16 7..0 [2] q8,q9 ++@ u32 left[y] [2] [r2] ++@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1] ++ ++1: ++ vadd.i16 q10, q2, q0 ++ subs r1, #2 ++ vld1.32 {d24[],d25[]}, [r2]! ++ vadd.i16 q11, q3, q1 ++ vld1.32 {d28[],d29[]}, [r2]! ++ vmla.i16 q2, q8, q12 ++ vmla.i16 q3, q9, q12 ++ vadd.i16 q12, q10, q0 ++ vmla.i16 q10, q8, q14 ++ vadd.i16 q13, q11, q1 ++ vmla.i16 q11, q9, q14 ++ vrshr.u16 q14, q2, #4 ++ vrshr.u16 q15, q3, #4 ++ vmov q2, q12 ++ vst1.16 {q14-q15}, [r0 :128], r3 ++ vrshr.u16 q14, q10, #4 ++ vrshr.u16 q15, q11, #4 ++ vmov q3, q13 ++ vst1.16 {q14-q15}, [r0 :128], r3 ++ bne 1b ++ ++ bx lr ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_16_neon_8 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1 ++ ++ vld1.8 {q0-q1}, [r1 :128]! 
@ Top (left) ++ adr ip, nbx2_15_0_1_16 + 32 ++ vpush {d8-d12} ++ vld1.8 {q2-q3}, [ip :128] @ {1,1,2,2,3,3...16,16} ++ add r2, #32 ++ vld1.16 {d8[]}, [r1] @ Top (right) ++ sub ip, #32 ++ vshll.u8 q8, d0, #4 ++ mov r1, #16 ++ vld1.16 {d9[]}, [r2] @ Left (lower) ++ sub r2, #32 ++ vshll.u8 q9, d1, #4 ++ lsl r3, #1 ++ vshll.u8 q10, d2, #4 ++ vshll.u8 q11, d3, #4 ++ vmlal.u8 q8, d4, d8 ++ vsubl.u8 q12, d9, d0 ++ vmlal.u8 q9, d5, d8 ++ vsubl.u8 q13, d9, d1 ++ vmlal.u8 q10, d6, d8 ++ vsubl.u8 q14, d9, d2 ++ vmlal.u8 q11, d7, d8 @ Acc set up ++ vsubl.u8 q15, d9, d3 @ Add set up ++ vadd.i16 q8, q12 ++ vadd.i16 q9, q13 ++ vadd.i16 q10, q14 ++ vadd.i16 q11, q15 ++ vld1.8 {q4-q5}, [ip :128] @ {15,15,14,14,13,13...0,0} ++ ++@ u8 15..0 [2] q4,q5 ++@ u8 left[y] [2] [r2] ++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1] ++ ++ vld1.16 {d12[]}, [r2]! ++ vadd.i16 q0, q8, q12 ++ b 2f ++1: ++ vld1.16 {d12[]}, [r2]! ++ vrshrn.u16 d3, q1, #5 ++ vrshrn.u16 d2, q0, #5 ++ vadd.i16 q0, q8, q12 ++ vrshrn.u16 d4, q2, #5 ++ vrshrn.u16 d5, q3, #5 ++ vst1.8 {q1-q2}, [r0 :128], r3 ++2: vadd.i16 q1, q9, q13 ++ subs r1, #2 ++ vadd.i16 q2, q10, q14 ++ vadd.i16 q3, q11, q15 ++ vmlal.u8 q8, d8, d12 ++ vmlal.u8 q9, d9, d12 ++ vmlal.u8 q10, d10, d12 ++ vmlal.u8 q11, d11, d12 ++ vld1.16 {d12[]}, [r2]! 
++ vrshrn.u16 d19, q9, #5 ++ vrshrn.u16 d18, q8, #5 ++ vadd.i16 q8, q0, q12 ++ vrshrn.u16 d20, q10, #5 ++ vrshrn.u16 d21, q11, #5 ++ vst1.8 {q9-q10}, [r0 :128], r3 ++ vadd.i16 q9, q1, q13 ++ vadd.i16 q10, q2, q14 ++ vadd.i16 q11, q3, q15 ++ vmlal.u8 q0, d8, d12 ++ vmlal.u8 q1, d9, d12 ++ vmlal.u8 q2, d10, d12 ++ vmlal.u8 q3, d11, d12 ++ ++ bne 1b ++ ++ vpop {d8-d12} ++ ++ vrshrn.u16 d3, q1, #5 ++ vrshrn.u16 d2, q0, #5 ++ vrshrn.u16 d4, q2, #5 ++ vrshrn.u16 d5, q3, #5 ++ vst1.8 {q1-q2}, [r0 :128] ++ ++ bx lr ++ ++endfunc ++ ++ ++@ ff_hevc_rpi_pred_planar_c_16_neon_10 ++@ uint8_t *_src, [r0] ++@ const uint8_t *_top, [r1] ++@ const uint8_t *_left, [r2] ++@ ptrdiff_t stride) [r3] ++ ++function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1 ++ ++ @ Load from bytes & expand later - at the very least this uses less ++ @ memory than having a short table ++ vld1.16 {q0-q1}, [r1 :128]! @ Top (left) ++ adr ip, nbx2_15_0_1_16 + 32 ++ vpush {q4-q7} ++ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre) ++ add r2, #64 ++ vld1.8 {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16} ++T lsl r3, #2 ++ vld1.32 {d8[],d9[]}, [r1] @ Top (right) ++ sub ip, #32 ++ vmovl.u8 q12, d28 ++ mov r1, #16 ++ vmovl.u8 q13, d29 ++ vld1.8 {q6-q7}, [ip :128] @ {15,15,14,14,13,13...0,0} ++ vmovl.u8 q14, d30 ++ vmovl.u8 q15, d31 ++ vld1.32 {d10[],d11[]}, [r2] @ Left (lower) ++ sub r2, #64 ++ vshl.i16 q8, q0, #4 ++ vshl.i16 q9, q1, #4 ++ vshl.i16 q10, q2, #4 ++ vshl.i16 q11, q3, #4 ++ vmla.i16 q8, q12, q4 ++ vsub.i16 q0, q5, q0 ++ vmla.i16 q9, q13, q4 ++ vpush {q0} ++ vsub.i16 q1, q5, q1 ++ vmla.i16 q10, q14, q4 ++ vsub.i16 q2, q5, q2 ++ vmla.i16 q11, q15, q4 @ Acc set up ++ vsub.i16 q3, q5, q3 @ Add set up ++ vadd.i16 q8, q0 ++ vadd.i16 q9, q1 ++ vadd.i16 q10, q2 ++ vadd.i16 q11, q3 ++ vmovl.u8 q4, d12 ++ vmovl.u8 q5, d13 ++ vmovl.u8 q6, d14 ++ vmovl.u8 q7, d15 ++ ++@ u16 15..0 [4] q4-q7 ++@ u32 left[y] [4] [r2] ++@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially ++@ u16 add [4] q0-q3 = p[-1][nTbs] - 
p[x][-1] ++ ++ vadd.i16 q12, q8, q0 ++A sub r0, r0, r3, lsl #2 ++T sub r0, r3 ++1: ++ vld1.32 {d0[],d1[]}, [r2]! ++A add r0, r0, r3, lsl #2 ++T add r0, r3 ++ vadd.i16 q13, q9, q1 ++ subs r1, #2 ++ vadd.i16 q14, q10, q2 ++ vadd.i16 q15, q11, q3 ++ vmla.i16 q8, q4, q0 ++ vmla.i16 q9, q5, q0 ++ vmla.i16 q10, q6, q0 ++ vmla.i16 q11, q7, q0 ++ vld1.16 {q0}, [sp] ++ vrshr.u16 q8, #5 ++ vrshr.u16 q9, #5 ++ vrshr.u16 q10, #5 ++ vrshr.u16 q11, #5 ++ vstm r0, {q8-q11} ++ vadd.i16 q8, q12, q0 ++A add r0, r0, r3, lsl #2 ++T add r0, r3 ++ vld1.32 {d0[],d1[]}, [r2]! ++ vadd.i16 q9, q13, q1 ++ vadd.i16 q10, q14, q2 ++ vadd.i16 q11, q15, q3 ++ vmla.i16 q12, q4, q0 ++ vmla.i16 q13, q5, q0 ++ vmla.i16 q14, q6, q0 ++ vmla.i16 q15, q7, q0 ++ vld1.16 {q0}, [sp] ++ vrshr.u16 q12, #5 ++ vrshr.u16 q13, #5 ++ vrshr.u16 q14, #5 ++ vrshr.u16 q15, #5 ++ vstm r0, {q12-q15} ++ vadd.i16 q12, q8, q0 ++ bne 1b ++ ++ vpop {q3-q7} ++ bx lr ++ ++endfunc +diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h +index c91b2fd169..d6e019bbe1 100644 +--- a/libavcodec/avcodec.h ++++ b/libavcodec/avcodec.h +@@ -2236,8 +2236,7 @@ typedef struct AVCodecContext { + #define FF_SUB_TEXT_FMT_ASS_WITH_TIMINGS 1 + #endif + +- /** +- * Audio only. The amount of padding (in samples) appended by the encoder to ++ /* Audio only. The amount of padding (in samples) appended by the encoder to + * the end of the audio. I.e. this number of decoded samples must be + * discarded by the caller from the end of the stream to get the original + * audio without any trailing padding. +@@ -2567,6 +2566,17 @@ typedef struct AVHWAccel { + * that avctx->hwaccel_priv_data is invalid. 
+ */ + int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); ++ ++ /** ++ * Called if parsing fails ++ * ++ * An error has occurred, end_frame will not be called ++ * start_frame & decode_slice may or may not have been called ++ * Optional ++ * ++ * @param avctx the codec context ++ */ ++ void (*abort_frame)(AVCodecContext *avctx); + } AVHWAccel; + + /** +diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h +index 1bf1c620d6..ccfa991f60 100644 +--- a/libavcodec/cabac.h ++++ b/libavcodec/cabac.h +@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63]; + typedef struct CABACContext{ + int low; + int range; +- int outstanding_count; ++ union ++ { ++ int outstanding_count; ++ struct { ++ uint16_t bits; ++ uint16_t range; ++ } by22; ++ }; + const uint8_t *bytestream_start; + const uint8_t *bytestream; + const uint8_t *bytestream_end; +diff --git a/libavcodec/codec.h b/libavcodec/codec.h +index 1fda619ee7..b4650f9ec9 100644 +--- a/libavcodec/codec.h ++++ b/libavcodec/codec.h +@@ -349,6 +349,17 @@ const AVCodec *av_codec_iterate(void **opaque); + */ + AVCodec *avcodec_find_decoder(enum AVCodecID id); + ++/** ++ * Find a registered decoder with a matching codec ID and pix_fmt. ++ * A decoder with pix_fmts set to NULL will match any fmt. ++ * A fmt of AV_PIX_FMT_NONE will only match a decoder with pix_fmts NULL. ++ * ++ * @param id AVCodecID of the requested decoder ++ * @param fmt AVPixelFormat that must be supported by the decoder ++ * @return A decoder if one was found, NULL otherwise. ++ */ ++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt); ++ + /** + * Find a registered decoder with the specified name. 
+ * +diff --git a/libavcodec/h264-ctrls.h b/libavcodec/h264-ctrls.h +new file mode 100644 +index 0000000000..ec47991544 +--- /dev/null ++++ b/libavcodec/h264-ctrls.h +@@ -0,0 +1,231 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the H.264 state controls for use with stateless H.264 ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _H264_CTRLS_H_ ++#define _H264_CTRLS_H_ ++ ++#include <linux/videodev2.h> ++ ++/* ++ * Maximum DPB size, as specified by section 'A.3.1 Level limits ++ * common to the Baseline, Main, and Extended profiles'. ++ */ ++#define V4L2_H264_NUM_DPB_ENTRIES 16 ++ ++#define V4L2_H264_REF_LIST_LEN (2 * V4L2_H264_NUM_DPB_ENTRIES) ++ ++/* Our pixel format isn't stable at the moment */ ++#define V4L2_PIX_FMT_H264_SLICE v4l2_fourcc('S', '2', '6', '4') /* H264 parsed slices */ ++ ++/* ++ * This is put insanely high to avoid conflicting with controls that ++ * would be added during the phase where those controls are not ++ * stable. It should be fixed eventually. 
++ */ ++#define V4L2_CID_MPEG_VIDEO_H264_SPS (V4L2_CID_MPEG_BASE+1000) ++#define V4L2_CID_MPEG_VIDEO_H264_PPS (V4L2_CID_MPEG_BASE+1001) ++#define V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX (V4L2_CID_MPEG_BASE+1002) ++#define V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS (V4L2_CID_MPEG_BASE+1003) ++#define V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS (V4L2_CID_MPEG_BASE+1004) ++#define V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE (V4L2_CID_MPEG_BASE+1005) ++#define V4L2_CID_MPEG_VIDEO_H264_START_CODE (V4L2_CID_MPEG_BASE+1006) ++#define V4L2_CID_MPEG_VIDEO_H264_PRED_WEIGHTS (V4L2_CID_MPEG_BASE+1007) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_H264_SPS 0x0110 ++#define V4L2_CTRL_TYPE_H264_PPS 0x0111 ++#define V4L2_CTRL_TYPE_H264_SCALING_MATRIX 0x0112 ++#define V4L2_CTRL_TYPE_H264_SLICE_PARAMS 0x0113 ++#define V4L2_CTRL_TYPE_H264_DECODE_PARAMS 0x0114 ++#define V4L2_CTRL_TYPE_H264_PRED_WEIGHTS 0x0115 ++ ++enum v4l2_mpeg_video_h264_decode_mode { ++ V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED, ++ V4L2_MPEG_VIDEO_H264_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_mpeg_video_h264_start_code { ++ V4L2_MPEG_VIDEO_H264_START_CODE_NONE, ++ V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_H264_SPS_CONSTRAINT_SET0_FLAG 0x01 ++#define V4L2_H264_SPS_CONSTRAINT_SET1_FLAG 0x02 ++#define V4L2_H264_SPS_CONSTRAINT_SET2_FLAG 0x04 ++#define V4L2_H264_SPS_CONSTRAINT_SET3_FLAG 0x08 ++#define V4L2_H264_SPS_CONSTRAINT_SET4_FLAG 0x10 ++#define V4L2_H264_SPS_CONSTRAINT_SET5_FLAG 0x20 ++ ++#define V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE 0x01 ++#define V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS 0x02 ++#define V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO 0x04 ++#define V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED 0x08 ++#define V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY 0x10 ++#define V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD 0x20 ++#define V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE 0x40 ++ ++struct v4l2_ctrl_h264_sps { ++ __u8 profile_idc; ++ __u8 constraint_set_flags; ++ __u8 
level_idc; ++ __u8 seq_parameter_set_id; ++ __u8 chroma_format_idc; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_frame_num_minus4; ++ __u8 pic_order_cnt_type; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 max_num_ref_frames; ++ __u8 num_ref_frames_in_pic_order_cnt_cycle; ++ __s32 offset_for_ref_frame[255]; ++ __s32 offset_for_non_ref_pic; ++ __s32 offset_for_top_to_bottom_field; ++ __u16 pic_width_in_mbs_minus1; ++ __u16 pic_height_in_map_units_minus1; ++ __u32 flags; ++}; ++ ++#define V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE 0x0001 ++#define V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT 0x0002 ++#define V4L2_H264_PPS_FLAG_WEIGHTED_PRED 0x0004 ++#define V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT 0x0008 ++#define V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED 0x0010 ++#define V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT 0x0020 ++#define V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE 0x0040 ++#define V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT 0x0080 ++ ++struct v4l2_ctrl_h264_pps { ++ __u8 pic_parameter_set_id; ++ __u8 seq_parameter_set_id; ++ __u8 num_slice_groups_minus1; ++ __u8 num_ref_idx_l0_default_active_minus1; ++ __u8 num_ref_idx_l1_default_active_minus1; ++ __u8 weighted_bipred_idc; ++ __s8 pic_init_qp_minus26; ++ __s8 pic_init_qs_minus26; ++ __s8 chroma_qp_index_offset; ++ __s8 second_chroma_qp_index_offset; ++ __u16 flags; ++}; ++ ++struct v4l2_ctrl_h264_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++}; ++ ++struct v4l2_h264_weight_factors { ++ __s16 luma_weight[32]; ++ __s16 luma_offset[32]; ++ __s16 chroma_weight[32][2]; ++ __s16 chroma_offset[32][2]; ++}; ++ ++#define V4L2_H264_CTRL_PRED_WEIGHTS_REQUIRED(pps, slice) \ ++ ((((pps)->flags & V4L2_H264_PPS_FLAG_WEIGHTED_PRED) && \ ++ ((slice)->slice_type == V4L2_H264_SLICE_TYPE_P || \ ++ (slice)->slice_type == V4L2_H264_SLICE_TYPE_SP)) || \ ++ ((pps)->weighted_bipred_idc == 1 && \ ++ (slice)->slice_type == 
V4L2_H264_SLICE_TYPE_B)) ++ ++struct v4l2_ctrl_h264_pred_weights { ++ __u16 luma_log2_weight_denom; ++ __u16 chroma_log2_weight_denom; ++ struct v4l2_h264_weight_factors weight_factors[2]; ++}; ++ ++#define V4L2_H264_SLICE_TYPE_P 0 ++#define V4L2_H264_SLICE_TYPE_B 1 ++#define V4L2_H264_SLICE_TYPE_I 2 ++#define V4L2_H264_SLICE_TYPE_SP 3 ++#define V4L2_H264_SLICE_TYPE_SI 4 ++ ++#define V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED 0x01 ++#define V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH 0x02 ++ ++#define V4L2_H264_TOP_FIELD_REF 0x1 ++#define V4L2_H264_BOTTOM_FIELD_REF 0x2 ++#define V4L2_H264_FRAME_REF 0x3 ++ ++struct v4l2_h264_reference { ++ __u8 fields; ++ ++ /* Index into v4l2_ctrl_h264_decode_params.dpb[] */ ++ __u8 index; ++}; ++ ++struct v4l2_ctrl_h264_slice_params { ++ /* Offset in bits to slice_data() from the beginning of this slice. */ ++ __u32 header_bit_size; ++ ++ __u32 first_mb_in_slice; ++ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __u8 redundant_pic_cnt; ++ __u8 cabac_init_idc; ++ __s8 slice_qp_delta; ++ __s8 slice_qs_delta; ++ __u8 disable_deblocking_filter_idc; ++ __s8 slice_alpha_c0_offset_div2; ++ __s8 slice_beta_offset_div2; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ ++ __u8 reserved; ++ ++ struct v4l2_h264_reference ref_pic_list0[V4L2_H264_REF_LIST_LEN]; ++ struct v4l2_h264_reference ref_pic_list1[V4L2_H264_REF_LIST_LEN]; ++ ++ __u32 flags; ++}; ++ ++#define V4L2_H264_DPB_ENTRY_FLAG_VALID 0x01 ++#define V4L2_H264_DPB_ENTRY_FLAG_ACTIVE 0x02 ++#define V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM 0x04 ++#define V4L2_H264_DPB_ENTRY_FLAG_FIELD 0x08 ++ ++struct v4l2_h264_dpb_entry { ++ __u64 reference_ts; ++ __u32 pic_num; ++ __u16 frame_num; ++ __u8 fields; ++ __u8 reserved[5]; ++ /* Note that field is indicated by v4l2_buffer.field */ ++ __s32 top_field_order_cnt; ++ __s32 bottom_field_order_cnt; ++ __u32 flags; /* V4L2_H264_DPB_ENTRY_FLAG_* */ ++}; ++ ++#define V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC 0x01 ++#define 
V4L2_H264_DECODE_PARAM_FLAG_FIELD_PIC 0x02 ++#define V4L2_H264_DECODE_PARAM_FLAG_BOTTOM_FIELD 0x04 ++ ++struct v4l2_ctrl_h264_decode_params { ++ struct v4l2_h264_dpb_entry dpb[V4L2_H264_NUM_DPB_ENTRIES]; ++ __u16 nal_ref_idc; ++ __u16 frame_num; ++ __s32 top_field_order_cnt; ++ __s32 bottom_field_order_cnt; ++ __u16 idr_pic_id; ++ __u16 pic_order_cnt_lsb; ++ __s32 delta_pic_order_cnt_bottom; ++ __s32 delta_pic_order_cnt0; ++ __s32 delta_pic_order_cnt1; ++ /* Size in bits of dec_ref_pic_marking() syntax element. */ ++ __u32 dec_ref_pic_marking_bit_size; ++ /* Size in bits of pic order count syntax. */ ++ __u32 pic_order_cnt_bit_size; ++ __u32 slice_group_change_cycle; ++ ++ __u32 reserved; ++ __u32 flags; /* V4L2_H264_DECODE_PARAM_FLAG_* */ ++}; ++ ++#endif +diff --git a/libavcodec/h264_slice.c b/libavcodec/h264_slice.c +index db8363e4cc..39ae8fabfd 100644 +--- a/libavcodec/h264_slice.c ++++ b/libavcodec/h264_slice.c +@@ -759,6 +759,7 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) + #define HWACCEL_MAX (CONFIG_H264_DXVA2_HWACCEL + \ + (CONFIG_H264_D3D11VA_HWACCEL * 2) + \ + CONFIG_H264_NVDEC_HWACCEL + \ ++ CONFIG_H264_V4L2REQUEST_HWACCEL + \ + CONFIG_H264_VAAPI_HWACCEL + \ + CONFIG_H264_VIDEOTOOLBOX_HWACCEL + \ + CONFIG_H264_VDPAU_HWACCEL) +@@ -784,10 +785,17 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) + *fmt++ = AV_PIX_FMT_GBRP10; + } else + *fmt++ = AV_PIX_FMT_YUV444P10; +- } else if (CHROMA422(h)) ++ } else if (CHROMA422(h)) { ++#if CONFIG_H264_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; ++#endif + *fmt++ = AV_PIX_FMT_YUV422P10; +- else ++ } else { ++#if CONFIG_H264_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; ++#endif + *fmt++ = AV_PIX_FMT_YUV420P10; ++ } + break; + case 12: + if (CHROMA444(h)) { +@@ -826,6 +834,9 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) + else + *fmt++ = AV_PIX_FMT_YUV444P; + } else if (CHROMA422(h)) { ++#if 
CONFIG_H264_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; ++#endif + if (h->avctx->color_range == AVCOL_RANGE_JPEG) + *fmt++ = AV_PIX_FMT_YUVJ422P; + else +@@ -843,6 +854,9 @@ static enum AVPixelFormat get_pixel_format(H264Context *h, int force_callback) + #endif + #if CONFIG_H264_VIDEOTOOLBOX_HWACCEL + *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; ++#endif ++#if CONFIG_H264_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; + #endif + if (h->avctx->codec->pix_fmts) + choices = h->avctx->codec->pix_fmts; +@@ -1736,7 +1750,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, + unsigned int slice_type, tmp, i; + int field_pic_flag, bottom_field_flag; + int first_slice = sl == h->slice_ctx && !h->current_slice; +- int picture_structure; ++ int picture_structure, pos; + + if (first_slice) + av_assert0(!h->setup_finished); +@@ -1818,8 +1832,9 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, + } + + if (nal->type == H264_NAL_IDR_SLICE) +- get_ue_golomb_long(&sl->gb); /* idr_pic_id */ ++ sl->idr_pic_id = get_ue_golomb_long(&sl->gb); + ++ pos = sl->gb.index; + if (sps->poc_type == 0) { + sl->poc_lsb = get_bits(&sl->gb, sps->log2_max_poc_lsb); + +@@ -1833,6 +1848,7 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, + if (pps->pic_order_present == 1 && picture_structure == PICT_FRAME) + sl->delta_poc[1] = get_se_golomb(&sl->gb); + } ++ sl->pic_order_cnt_bit_size = sl->gb.index - pos; + + sl->redundant_pic_count = 0; + if (pps->redundant_pic_cnt_present) +@@ -1872,9 +1888,11 @@ static int h264_slice_header_parse(const H264Context *h, H264SliceContext *sl, + + sl->explicit_ref_marking = 0; + if (nal->ref_idc) { ++ int bit_pos = sl->gb.index; + ret = ff_h264_decode_ref_pic_marking(sl, &sl->gb, nal, h->avctx); + if (ret < 0 && (h->avctx->err_recognition & AV_EF_EXPLODE)) + return AVERROR_INVALIDDATA; ++ sl->ref_pic_marking_size_in_bits = sl->gb.index - bit_pos; + } + + if 
(sl->slice_type_nos != AV_PICTURE_TYPE_I && pps->cabac) { +diff --git a/libavcodec/h264dec.c b/libavcodec/h264dec.c +index 5eedeb3c27..a504c89565 100644 +--- a/libavcodec/h264dec.c ++++ b/libavcodec/h264dec.c +@@ -1102,6 +1102,9 @@ AVCodec ff_h264_decoder = { + #endif + #if CONFIG_H264_VIDEOTOOLBOX_HWACCEL + HWACCEL_VIDEOTOOLBOX(h264), ++#endif ++#if CONFIG_H264_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(h264), + #endif + NULL + }, +diff --git a/libavcodec/h264dec.h b/libavcodec/h264dec.h +index a419615124..b3dcd6e7da 100644 +--- a/libavcodec/h264dec.h ++++ b/libavcodec/h264dec.h +@@ -190,6 +190,8 @@ typedef struct H264SliceContext { + int slice_type_nos; ///< S free slice type (SI/SP are remapped to I/P) + int slice_type_fixed; + ++ int idr_pic_id; ++ + int qscale; + int chroma_qp[2]; // QPc + int qp_thresh; ///< QP threshold to skip loopfilter +@@ -328,11 +330,13 @@ typedef struct H264SliceContext { + MMCO mmco[MAX_MMCO_COUNT]; + int nb_mmco; + int explicit_ref_marking; ++ int ref_pic_marking_size_in_bits; + + int frame_num; + int poc_lsb; + int delta_poc_bottom; + int delta_poc[2]; ++ int pic_order_cnt_bit_size; + int curr_pic_num; + int max_pic_num; + } H264SliceContext; +diff --git a/libavcodec/hevc-ctrls.h b/libavcodec/hevc-ctrls.h +new file mode 100644 +index 0000000000..13698d3f33 +--- /dev/null ++++ b/libavcodec/hevc-ctrls.h +@@ -0,0 +1,230 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the HEVC state controls for use with stateless HEVC ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _HEVC_CTRLS_H_ ++#define _HEVC_CTRLS_H_ ++ ++#include <linux/videodev2.h> ++ ++/* The pixel format isn't stable at the moment and will likely be renamed. 
*/ ++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */ ++ ++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008) ++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010) ++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011) ++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015) ++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120 ++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121 ++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122 ++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123 ++ ++enum v4l2_mpeg_video_hevc_decode_mode { ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED, ++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED, ++}; ++ ++enum v4l2_mpeg_video_hevc_start_code { ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE, ++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B, ++}; ++ ++#define V4L2_HEVC_SLICE_TYPE_B 0 ++#define V4L2_HEVC_SLICE_TYPE_P 1 ++#define V4L2_HEVC_SLICE_TYPE_I 2 ++ ++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0) ++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1) ++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2) ++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3) ++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4) ++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5) ++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6) ++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7) ++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8) ++ ++/* The controls are not stable at the moment and will likely be reworked. */ ++struct v4l2_ctrl_hevc_sps { ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ ++ __u16 pic_width_in_luma_samples; ++ __u16 pic_height_in_luma_samples; ++ __u8 bit_depth_luma_minus8; ++ __u8 bit_depth_chroma_minus8; ++ __u8 log2_max_pic_order_cnt_lsb_minus4; ++ __u8 sps_max_dec_pic_buffering_minus1; ++ __u8 sps_max_num_reorder_pics; ++ __u8 sps_max_latency_increase_plus1; ++ __u8 log2_min_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_luma_coding_block_size; ++ __u8 log2_min_luma_transform_block_size_minus2; ++ __u8 log2_diff_max_min_luma_transform_block_size; ++ __u8 max_transform_hierarchy_depth_inter; ++ __u8 max_transform_hierarchy_depth_intra; ++ __u8 pcm_sample_bit_depth_luma_minus1; ++ __u8 pcm_sample_bit_depth_chroma_minus1; ++ __u8 log2_min_pcm_luma_coding_block_size_minus3; ++ __u8 log2_diff_max_min_pcm_luma_coding_block_size; ++ __u8 num_short_term_ref_pic_sets; ++ __u8 num_long_term_ref_pics_sps; ++ __u8 chroma_format_idc; ++ ++ __u8 padding; ++ ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0) ++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1) ++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2) ++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3) ++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4) ++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5) ++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6) ++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8) ++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9) ++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10) ++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11) ++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12) ++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13) ++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14) ++#define 
V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15) ++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16) ++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17) ++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18) ++ ++struct v4l2_ctrl_hevc_pps { ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */ ++ __u8 num_extra_slice_header_bits; ++ __s8 init_qp_minus26; ++ __u8 diff_cu_qp_delta_depth; ++ __s8 pps_cb_qp_offset; ++ __s8 pps_cr_qp_offset; ++ __u8 num_tile_columns_minus1; ++ __u8 num_tile_rows_minus1; ++ __u8 column_width_minus1[20]; ++ __u8 row_height_minus1[22]; ++ __s8 pps_beta_offset_div2; ++ __s8 pps_tc_offset_div2; ++ __u8 log2_parallel_merge_level_minus2; ++ ++ __u8 padding[4]; ++ __u64 flags; ++}; ++ ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01 ++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02 ++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03 ++ ++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16 ++ ++struct v4l2_hevc_dpb_entry { ++ __u64 timestamp; ++ __u8 rps; ++ __u8 field_pic; ++ __u16 pic_order_cnt[2]; ++ __u8 padding[2]; ++}; ++ ++struct v4l2_hevc_pred_weight_table { ++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2]; ++ ++ __u8 padding[6]; ++ ++ __u8 luma_log2_weight_denom; ++ __s8 delta_chroma_log2_weight_denom; ++}; ++ ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2) ++#define 
V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8) ++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9) ++ ++struct v4l2_ctrl_hevc_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u32 slice_segment_addr; ++ __u32 num_entry_point_offsets; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ __u8 nal_unit_type; ++ __u8 nuh_temporal_id_plus1; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 slice_type; ++ __u8 colour_plane_id; ++ __u16 slice_pic_order_cnt; ++ __u8 num_ref_idx_l0_active_minus1; ++ __u8 num_ref_idx_l1_active_minus1; ++ __u8 collocated_ref_idx; ++ __u8 five_minus_max_num_merge_cand; ++ __s8 slice_qp_delta; ++ __s8 slice_cb_qp_offset; ++ __s8 slice_cr_qp_offset; ++ __s8 slice_act_y_qp_offset; ++ __s8 slice_act_cb_qp_offset; ++ __s8 slice_act_cr_qp_offset; ++ __s8 slice_beta_offset_div2; ++ __s8 slice_tc_offset_div2; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ __u8 pic_struct; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ __u8 num_active_dpb_entries; ++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ __u8 num_rps_poc_st_curr_before; ++ __u8 num_rps_poc_st_curr_after; ++ __u8 num_rps_poc_lt_curr; ++ ++ __u8 padding; ++ ++ __u32 entry_point_offset_minus1[256]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ ++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX]; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */ ++ struct v4l2_hevc_pred_weight_table pred_weight_table; ++ ++ __u64 flags; ++}; ++ ++struct v4l2_ctrl_hevc_scaling_matrix { ++ __u8 scaling_list_4x4[6][16]; ++ __u8 scaling_list_8x8[6][64]; ++ __u8 scaling_list_16x16[6][64]; ++ __u8 scaling_list_32x32[2][64]; ++ __u8 scaling_list_dc_coef_16x16[6]; ++ __u8 scaling_list_dc_coef_32x32[2]; ++}; ++ ++#endif +diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c +index 0772608a30..91a7536ee5 100644 +--- a/libavcodec/hevcdec.c ++++ b/libavcodec/hevcdec.c +@@ -372,14 +372,20 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \ + CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \ + CONFIG_HEVC_NVDEC_HWACCEL + \ ++ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \ + CONFIG_HEVC_VAAPI_HWACCEL + \ + CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \ ++ CONFIG_HEVC_RPI4_8_HWACCEL + \ ++ CONFIG_HEVC_RPI4_10_HWACCEL + \ + CONFIG_HEVC_VDPAU_HWACCEL) + enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts; + + switch (sps->pix_fmt) { + case AV_PIX_FMT_YUV420P: + case AV_PIX_FMT_YUVJ420P: ++#if CONFIG_HEVC_RPI4_8_HWACCEL ++ *fmt++ = AV_PIX_FMT_RPI4_8; ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -398,9 +404,15 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #endif + #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX; ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + case AV_PIX_FMT_YUV420P10: ++#if CONFIG_HEVC_RPI4_10_HWACCEL ++ *fmt++ = AV_PIX_FMT_RPI4_10; ++#endif + #if CONFIG_HEVC_DXVA2_HWACCEL + *fmt++ = AV_PIX_FMT_DXVA2_VLD; + #endif +@@ -416,6 +428,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps) + #endif + #if 
CONFIG_HEVC_NVDEC_HWACCEL + *fmt++ = AV_PIX_FMT_CUDA; ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ *fmt++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + case AV_PIX_FMT_YUV444P: +@@ -3225,7 +3240,14 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output, + s->ref = NULL; + ret = decode_nal_units(s, avpkt->data, avpkt->size); + if (ret < 0) ++ { ++ // Ensure that hwaccel knows this frame is over ++ if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) { ++ s->avctx->hwaccel->abort_frame(s->avctx); ++ } ++ + return ret; ++ } + + if (avctx->hwaccel) { + if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) { +@@ -3588,6 +3610,15 @@ AVCodec ff_hevc_decoder = { + #endif + #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + HWACCEL_VIDEOTOOLBOX(hevc), ++#endif ++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(hevc), ++#endif ++#if CONFIG_HEVC_RPI4_8_HWACCEL ++ HWACCEL_RPI4_8(hevc), ++#endif ++#if CONFIG_HEVC_RPI4_10_HWACCEL ++ HWACCEL_RPI4_10(hevc), + #endif + NULL + }, +diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h +index 6109c89bd6..30927fda99 100644 +--- a/libavcodec/hwaccels.h ++++ b/libavcodec/hwaccels.h +@@ -27,6 +27,7 @@ extern const AVHWAccel ff_h264_d3d11va_hwaccel; + extern const AVHWAccel ff_h264_d3d11va2_hwaccel; + extern const AVHWAccel ff_h264_dxva2_hwaccel; + extern const AVHWAccel ff_h264_nvdec_hwaccel; ++extern const AVHWAccel ff_h264_v4l2request_hwaccel; + extern const AVHWAccel ff_h264_vaapi_hwaccel; + extern const AVHWAccel ff_h264_vdpau_hwaccel; + extern const AVHWAccel ff_h264_videotoolbox_hwaccel; +@@ -34,6 +35,7 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel; + extern const AVHWAccel ff_hevc_d3d11va2_hwaccel; + extern const AVHWAccel ff_hevc_dxva2_hwaccel; + extern const AVHWAccel ff_hevc_nvdec_hwaccel; ++extern const AVHWAccel ff_hevc_v4l2request_hwaccel; + extern const AVHWAccel ff_hevc_vaapi_hwaccel; + extern const AVHWAccel ff_hevc_vdpau_hwaccel; + extern const AVHWAccel 
ff_hevc_videotoolbox_hwaccel; +@@ -47,6 +49,7 @@ extern const AVHWAccel ff_mpeg2_d3d11va_hwaccel; + extern const AVHWAccel ff_mpeg2_d3d11va2_hwaccel; + extern const AVHWAccel ff_mpeg2_nvdec_hwaccel; + extern const AVHWAccel ff_mpeg2_dxva2_hwaccel; ++extern const AVHWAccel ff_mpeg2_v4l2request_hwaccel; + extern const AVHWAccel ff_mpeg2_vaapi_hwaccel; + extern const AVHWAccel ff_mpeg2_vdpau_hwaccel; + extern const AVHWAccel ff_mpeg2_videotoolbox_hwaccel; +@@ -62,11 +65,13 @@ extern const AVHWAccel ff_vc1_nvdec_hwaccel; + extern const AVHWAccel ff_vc1_vaapi_hwaccel; + extern const AVHWAccel ff_vc1_vdpau_hwaccel; + extern const AVHWAccel ff_vp8_nvdec_hwaccel; ++extern const AVHWAccel ff_vp8_v4l2request_hwaccel; + extern const AVHWAccel ff_vp8_vaapi_hwaccel; + extern const AVHWAccel ff_vp9_d3d11va_hwaccel; + extern const AVHWAccel ff_vp9_d3d11va2_hwaccel; + extern const AVHWAccel ff_vp9_dxva2_hwaccel; + extern const AVHWAccel ff_vp9_nvdec_hwaccel; ++extern const AVHWAccel ff_vp9_v4l2request_hwaccel; + extern const AVHWAccel ff_vp9_vaapi_hwaccel; + extern const AVHWAccel ff_vp9_vdpau_hwaccel; + extern const AVHWAccel ff_wmv3_d3d11va_hwaccel; +@@ -75,5 +80,7 @@ extern const AVHWAccel ff_wmv3_dxva2_hwaccel; + extern const AVHWAccel ff_wmv3_nvdec_hwaccel; + extern const AVHWAccel ff_wmv3_vaapi_hwaccel; + extern const AVHWAccel ff_wmv3_vdpau_hwaccel; ++extern const AVHWAccel ff_hevc_rpi4_8_hwaccel; ++extern const AVHWAccel ff_hevc_rpi4_10_hwaccel; + + #endif /* AVCODEC_HWACCELS_H */ +diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h +index f421dc909f..ed44e01de4 100644 +--- a/libavcodec/hwconfig.h ++++ b/libavcodec/hwconfig.h +@@ -24,6 +24,7 @@ + + + #define HWACCEL_CAP_ASYNC_SAFE (1 << 0) ++#define HWACCEL_CAP_MT_SAFE (1 << 1) + + + typedef struct AVCodecHWConfigInternal { +@@ -80,6 +81,12 @@ typedef struct AVCodecHWConfigInternal { + HW_CONFIG_HWACCEL(0, 0, 1, D3D11VA_VLD, NONE, ff_ ## codec ## _d3d11va_hwaccel) + #define HWACCEL_XVMC(codec) \ + 
HW_CONFIG_HWACCEL(0, 0, 1, XVMC, NONE, ff_ ## codec ## _xvmc_hwaccel) ++#define HWACCEL_V4L2REQUEST(codec) \ ++ HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel) ++#define HWACCEL_RPI4_8(codec) \ ++ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_8, NONE, ff_ ## codec ## _rpi4_8_hwaccel) ++#define HWACCEL_RPI4_10(codec) \ ++ HW_CONFIG_HWACCEL(0, 0, 1, RPI4_10, NONE, ff_ ## codec ## _rpi4_10_hwaccel) + + #define HW_CONFIG_ENCODER(device, frames, ad_hoc, format, device_type_) \ + &(const AVCodecHWConfigInternal) { \ +diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c +index 547bece576..bfd1083c16 100644 +--- a/libavcodec/mmaldec.c ++++ b/libavcodec/mmaldec.c +@@ -24,6 +24,9 @@ + * MMAL Video Decoder + */ + ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" + #include + #include + #include +@@ -31,6 +34,7 @@ + #include + #include + #include ++#pragma GCC diagnostic pop + #include + + #include "avcodec.h" +diff --git a/libavcodec/mpeg12dec.c b/libavcodec/mpeg12dec.c +index 99e56532a5..15aaf97a34 100644 +--- a/libavcodec/mpeg12dec.c ++++ b/libavcodec/mpeg12dec.c +@@ -1154,6 +1154,9 @@ static const enum AVPixelFormat mpeg2_hwaccel_pixfmt_list_420[] = { + #endif + #if CONFIG_MPEG2_VIDEOTOOLBOX_HWACCEL + AV_PIX_FMT_VIDEOTOOLBOX, ++#endif ++#if CONFIG_MPEG2_V4L2REQUEST_HWACCEL ++ AV_PIX_FMT_DRM_PRIME, + #endif + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NONE +@@ -2952,6 +2955,9 @@ AVCodec ff_mpeg2video_decoder = { + #endif + #if CONFIG_MPEG2_XVMC_HWACCEL + HWACCEL_XVMC(mpeg2), ++#endif ++#if CONFIG_MPEG2_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(mpeg2), + #endif + NULL + }, +diff --git a/libavcodec/mpeg2-ctrls.h b/libavcodec/mpeg2-ctrls.h +new file mode 100644 +index 0000000000..6601455b3d +--- /dev/null ++++ b/libavcodec/mpeg2-ctrls.h +@@ -0,0 +1,82 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the MPEG2 state controls for use with stateless MPEG-2 ++ * codec 
drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _MPEG2_CTRLS_H_ ++#define _MPEG2_CTRLS_H_ ++ ++#define V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS (V4L2_CID_MPEG_BASE+250) ++#define V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION (V4L2_CID_MPEG_BASE+251) ++ ++/* enum v4l2_ctrl_type type values */ ++#define V4L2_CTRL_TYPE_MPEG2_SLICE_PARAMS 0x0103 ++#define V4L2_CTRL_TYPE_MPEG2_QUANTIZATION 0x0104 ++ ++#define V4L2_MPEG2_PICTURE_CODING_TYPE_I 1 ++#define V4L2_MPEG2_PICTURE_CODING_TYPE_P 2 ++#define V4L2_MPEG2_PICTURE_CODING_TYPE_B 3 ++#define V4L2_MPEG2_PICTURE_CODING_TYPE_D 4 ++ ++struct v4l2_mpeg2_sequence { ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence header */ ++ __u16 horizontal_size; ++ __u16 vertical_size; ++ __u32 vbv_buffer_size; ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence extension */ ++ __u16 profile_and_level_indication; ++ __u8 progressive_sequence; ++ __u8 chroma_format; ++}; ++ ++struct v4l2_mpeg2_picture { ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture header */ ++ __u8 picture_coding_type; ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture coding extension */ ++ __u8 f_code[2][2]; ++ __u8 intra_dc_precision; ++ __u8 picture_structure; ++ __u8 top_field_first; ++ __u8 frame_pred_frame_dct; ++ __u8 concealment_motion_vectors; ++ __u8 q_scale_type; ++ __u8 intra_vlc_format; ++ __u8 alternate_scan; ++ __u8 repeat_first_field; ++ __u16 progressive_frame; ++}; ++ ++struct v4l2_ctrl_mpeg2_slice_params { ++ __u32 bit_size; ++ __u32 data_bit_offset; ++ __u64 backward_ref_ts; ++ __u64 forward_ref_ts; ++ ++ struct v4l2_mpeg2_sequence sequence; ++ struct v4l2_mpeg2_picture picture; ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Slice */ ++ __u32 quantiser_scale_code; ++}; ++ ++struct v4l2_ctrl_mpeg2_quantization { ++ /* ISO/IEC 13818-2, ITU-T Rec. 
H.262: Quant matrix extension */ ++ __u8 load_intra_quantiser_matrix; ++ __u8 load_non_intra_quantiser_matrix; ++ __u8 load_chroma_intra_quantiser_matrix; ++ __u8 load_chroma_non_intra_quantiser_matrix; ++ ++ __u8 intra_quantiser_matrix[64]; ++ __u8 non_intra_quantiser_matrix[64]; ++ __u8 chroma_intra_quantiser_matrix[64]; ++ __u8 chroma_non_intra_quantiser_matrix[64]; ++}; ++ ++#endif +diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c +index 601f170447..f890f99931 100644 +--- a/libavcodec/pthread_frame.c ++++ b/libavcodec/pthread_frame.c +@@ -191,7 +191,8 @@ static attribute_align_arg void *frame_worker_thread(void *arg) + + /* if the previous thread uses hwaccel then we take the lock to ensure + * the threads don't run concurrently */ +- if (avctx->hwaccel) { ++ if (avctx->hwaccel && ++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) { + pthread_mutex_lock(&p->parent->hwaccel_mutex); + p->hwaccel_serializing = 1; + } +@@ -614,7 +615,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) { + + if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return; + +- if (avctx->hwaccel && !p->hwaccel_serializing) { ++ if (avctx->hwaccel && ++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) && ++ !p->hwaccel_serializing) { + pthread_mutex_lock(&p->parent->hwaccel_mutex); + p->hwaccel_serializing = 1; + } +diff --git a/libavcodec/raw.c b/libavcodec/raw.c +index b6fb91c1c6..7b2770e780 100644 +--- a/libavcodec/raw.c ++++ b/libavcodec/raw.c +@@ -289,10 +289,20 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = { + { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') }, + { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') }, + ++ /* RPI (Might as well define for everything) */ ++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, ++ + /* special */ + { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */ + { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV 
*/ + ++ /* RPI (Might as well define for everything) */ ++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') }, ++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') }, ++ { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') }, ++ + { AV_PIX_FMT_NONE, 0 }, + }; + +diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c +index d181b74570..3fe2ab445f 100644 +--- a/libavcodec/rawenc.c ++++ b/libavcodec/rawenc.c +@@ -24,6 +24,7 @@ + * Raw Video Encoder + */ + ++#include "config.h" + #include "avcodec.h" + #include "raw.h" + #include "internal.h" +@@ -31,6 +32,10 @@ + #include "libavutil/intreadwrite.h" + #include "libavutil/imgutils.h" + #include "libavutil/internal.h" ++#include "libavutil/avassert.h" ++#if CONFIG_SAND ++#include "libavutil/rpi_sand_fns.h" ++#endif + + static av_cold int raw_encode_init(AVCodecContext *avctx) + { +@@ -49,12 +54,95 @@ FF_ENABLE_DEPRECATION_WARNINGS + return 0; + } + ++#if CONFIG_SAND ++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const int width = av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3 / 2; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); ++ dst += width * height; ++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2); ++ return 0; ++} ++ ++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const int width = av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ 
const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height); ++ dst += width * height * 2; ++ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2); ++ return 0; ++} ++ ++static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt, ++ const AVFrame *frame) ++{ ++ const int width = av_frame_cropped_width(frame); ++ const int height = av_frame_cropped_height(frame); ++ const int x0 = frame->crop_left; ++ const int y0 = frame->crop_top; ++ const int size = width * height * 3; ++ uint8_t * dst; ++ int ret; ++ ++ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0) ++ return ret; ++ ++ dst = pkt->data; ++ ++ av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height); ++ dst += width * height * 2; ++ av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width, ++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2); ++ return 0; ++} ++#endif ++ ++ + static int raw_encode(AVCodecContext *avctx, AVPacket *pkt, + const AVFrame *frame, int *got_packet) + { +- int ret = av_image_get_buffer_size(frame->format, +- frame->width, frame->height, 1); ++ int ret; ++ ++#if CONFIG_SAND ++ if (av_rpi_is_sand_frame(frame)) { ++ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : ++ av_rpi_is_sand16_frame(frame) ? raw_sand16_as_yuv420(avctx, pkt, frame) : ++ av_rpi_is_sand30_frame(frame) ? 
raw_sand30_as_yuv420(avctx, pkt, frame) : -1; ++ *got_packet = (ret == 0); ++ return ret; ++ } ++#endif + ++ ret = av_image_get_buffer_size(frame->format, ++ frame->width, frame->height, 1); + if (ret < 0) + return ret; + +diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c +new file mode 100644 +index 0000000000..58c094c5f8 +--- /dev/null ++++ b/libavcodec/rpi_hevc_cabac.c +@@ -0,0 +1,2257 @@ ++/* ++ * HEVC CABAC decoding ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#define UNCHECKED_BITSTREAM_READER 1 ++ ++#include "libavutil/attributes.h" ++#include "libavutil/common.h" ++ ++#include "cabac_functions.h" ++#include "rpi_hevc_data.h" ++#include "hevc.h" ++#include "rpi_hevcdec.h" ++#include "rpi_hevc_cabac_fns.h" ++ ++#include "libavutil/rpi_sand_fns.h" ++ ++// BY22 is probably faster than simple bypass if the processor has ++// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction ++// x86 has fast int divide ++// Arm doesn't have divide or general fast 64 bit, but does have the multiply ++// * Beware: ARCH_xxx isn't set if configure --disable-asm is used ++#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86) ++// Use native divide if we have a fast one - otherwise use mpy 1/x ++// x86 has a fast integer divide - arm doesn't - unsure about other ++// architectures ++#define USE_BY22_DIV ARCH_X86 ++ ++// Special case blocks with a single significant coeff ++// Decreases the complexity of the code for a common case but increases the ++// code size.
++#define USE_N_END_1 1 ++ ++#if !USE_BY22_DIV ++// * 1/x @ 32 bits gets us 22 bits of accuracy ++#define CABAC_BY22_PEEK_BITS 22 ++#else ++// A real 32-bit divide gets us another bit ++// If we have a 64 bit int & a unit time divider then we should get a lot ++// of bits (55) but that is untested and it is unclear if it would give ++// us a large advantage ++#define CABAC_BY22_PEEK_BITS 23 ++#endif ++ ++#define CABAC_MAX_BIN 31 ++ ++ ++#if USE_BY22 && !USE_BY22_DIV ++#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL) ++ ++static const uint32_t cabac_by22_inv_range[256] = { ++ 0, I(257), I(258), I(259), ++ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269), ++ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279), ++ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289), ++ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299), ++ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309), ++ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319), ++ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329), ++ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339), ++ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349), ++ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359), ++ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369), ++ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379), ++ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389), ++ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399), ++ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409), ++ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419), ++ I(420), I(421), I(422), I(423), I(424), 
I(425), I(426), I(427), I(428), I(429), ++ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439), ++ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449), ++ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459), ++ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469), ++ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479), ++ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489), ++ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499), ++ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509), ++ I(510), I(511) ++}; ++#undef I ++#endif // USE_BY22 ++ ++#if ARCH_ARM ++#include "arm/rpi_hevc_cabac.h" ++#endif ++ ++/** ++ * number of bin by SyntaxElement. ++ */ ++static const int8_t num_bins_in_se[] = { ++ 1, // sao_merge_flag ++ 1, // sao_type_idx ++ 0, // sao_eo_class ++ 0, // sao_band_position ++ 0, // sao_offset_abs ++ 0, // sao_offset_sign ++ 0, // end_of_slice_flag ++ 3, // split_coding_unit_flag ++ 1, // cu_transquant_bypass_flag ++ 3, // skip_flag ++ 3, // cu_qp_delta ++ 1, // pred_mode ++ 4, // part_mode ++ 0, // pcm_flag ++ 1, // prev_intra_luma_pred_mode ++ 0, // mpm_idx ++ 0, // rem_intra_luma_pred_mode ++ 2, // intra_chroma_pred_mode ++ 1, // merge_flag ++ 1, // merge_idx ++ 5, // inter_pred_idc ++ 2, // ref_idx_l0 ++ 2, // ref_idx_l1 ++ 2, // abs_mvd_greater0_flag ++ 2, // abs_mvd_greater1_flag ++ 0, // abs_mvd_minus2 ++ 0, // mvd_sign_flag ++ 1, // mvp_lx_flag ++ 1, // no_residual_data_flag ++ 3, // split_transform_flag ++ 2, // cbf_luma ++ 4, // cbf_cb, cbf_cr ++ 2, // transform_skip_flag[][] ++ 2, // explicit_rdpcm_flag[][] ++ 2, // explicit_rdpcm_dir_flag[][] ++ 18, // last_significant_coeff_x_prefix ++ 18, // last_significant_coeff_y_prefix ++ 0, // last_significant_coeff_x_suffix ++ 0, // last_significant_coeff_y_suffix ++ 4, // 
significant_coeff_group_flag ++ 44, // significant_coeff_flag ++ 24, // coeff_abs_level_greater1_flag ++ 6, // coeff_abs_level_greater2_flag ++ 0, // coeff_abs_level_remaining ++ 0, // coeff_sign_flag ++ 8, // log2_res_scale_abs ++ 2, // res_scale_sign_flag ++ 1, // cu_chroma_qp_offset_flag ++ 1, // cu_chroma_qp_offset_idx ++}; ++ ++/** ++ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement. ++ */ ++static const int elem_offset[sizeof(num_bins_in_se)] = { ++ 0, // sao_merge_flag ++ 1, // sao_type_idx ++ 2, // sao_eo_class ++ 2, // sao_band_position ++ 2, // sao_offset_abs ++ 2, // sao_offset_sign ++ 2, // end_of_slice_flag ++ 2, // split_coding_unit_flag ++ 5, // cu_transquant_bypass_flag ++ 6, // skip_flag ++ 9, // cu_qp_delta ++ 12, // pred_mode ++ 13, // part_mode ++ 17, // pcm_flag ++ 17, // prev_intra_luma_pred_mode ++ 18, // mpm_idx ++ 18, // rem_intra_luma_pred_mode ++ 18, // intra_chroma_pred_mode ++ 20, // merge_flag ++ 21, // merge_idx ++ 22, // inter_pred_idc ++ 27, // ref_idx_l0 ++ 29, // ref_idx_l1 ++ 31, // abs_mvd_greater0_flag ++ 33, // abs_mvd_greater1_flag ++ 35, // abs_mvd_minus2 ++ 35, // mvd_sign_flag ++ 35, // mvp_lx_flag ++ 36, // no_residual_data_flag ++ 37, // split_transform_flag ++ 40, // cbf_luma ++ 42, // cbf_cb, cbf_cr ++ 46, // transform_skip_flag[][] ++ 48, // explicit_rdpcm_flag[][] ++ 50, // explicit_rdpcm_dir_flag[][] ++ 52, // last_significant_coeff_x_prefix ++ 70, // last_significant_coeff_y_prefix ++ 88, // last_significant_coeff_x_suffix ++ 88, // last_significant_coeff_y_suffix ++ 88, // significant_coeff_group_flag ++ 92, // significant_coeff_flag ++ 136, // coeff_abs_level_greater1_flag ++ 160, // coeff_abs_level_greater2_flag ++ 166, // coeff_abs_level_remaining ++ 166, // coeff_sign_flag ++ 166, // log2_res_scale_abs ++ 174, // res_scale_sign_flag ++ 176, // cu_chroma_qp_offset_flag ++ 177, // cu_chroma_qp_offset_idx ++}; ++ ++#define CNU 154 ++/** ++ * Indexed by init_type ++ */ ++static const 
uint8_t init_values[3][HEVC_CONTEXTS] = { ++ { // sao_merge_flag ++ 153, ++ // sao_type_idx ++ 200, ++ // split_coding_unit_flag ++ 139, 141, 157, ++ // cu_transquant_bypass_flag ++ 154, ++ // skip_flag ++ CNU, CNU, CNU, ++ // cu_qp_delta ++ 154, 154, 154, ++ // pred_mode ++ CNU, ++ // part_mode ++ 184, CNU, CNU, CNU, ++ // prev_intra_luma_pred_mode ++ 184, ++ // intra_chroma_pred_mode ++ 63, 139, ++ // merge_flag ++ CNU, ++ // merge_idx ++ CNU, ++ // inter_pred_idc ++ CNU, CNU, CNU, CNU, CNU, ++ // ref_idx_l0 ++ CNU, CNU, ++ // ref_idx_l1 ++ CNU, CNU, ++ // abs_mvd_greater1_flag ++ CNU, CNU, ++ // abs_mvd_greater1_flag ++ CNU, CNU, ++ // mvp_lx_flag ++ CNU, ++ // no_residual_data_flag ++ CNU, ++ // split_transform_flag ++ 153, 138, 138, ++ // cbf_luma ++ 111, 141, ++ // cbf_cb, cbf_cr ++ 94, 138, 182, 154, ++ // transform_skip_flag ++ 139, 139, ++ // explicit_rdpcm_flag ++ 139, 139, ++ // explicit_rdpcm_dir_flag ++ 139, 139, ++ // last_significant_coeff_x_prefix ++ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, ++ 79, 108, 123, 63, ++ // last_significant_coeff_y_prefix ++ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111, ++ 79, 108, 123, 63, ++ // significant_coeff_group_flag ++ 91, 171, 134, 141, ++ // significant_coeff_flag ++ 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153, ++ 125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140, ++ 139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111, ++ 141, 111, ++ // coeff_abs_level_greater1_flag ++ 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92, 139, 107, ++ 122, 152, 140, 179, 166, 182, 140, 227, 122, 197, ++ // coeff_abs_level_greater2_flag ++ 138, 153, 136, 167, 152, 152, ++ // log2_res_scale_abs ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ // res_scale_sign_flag ++ 154, 154, ++ // cu_chroma_qp_offset_flag ++ 154, ++ // cu_chroma_qp_offset_idx ++ 154, ++ }, ++ { // sao_merge_flag ++ 153, ++ // sao_type_idx ++ 185, ++ 
// split_coding_unit_flag ++ 107, 139, 126, ++ // cu_transquant_bypass_flag ++ 154, ++ // skip_flag ++ 197, 185, 201, ++ // cu_qp_delta ++ 154, 154, 154, ++ // pred_mode ++ 149, ++ // part_mode ++ 154, 139, 154, 154, ++ // prev_intra_luma_pred_mode ++ 154, ++ // intra_chroma_pred_mode ++ 152, 139, ++ // merge_flag ++ 110, ++ // merge_idx ++ 122, ++ // inter_pred_idc ++ 95, 79, 63, 31, 31, ++ // ref_idx_l0 ++ 153, 153, ++ // ref_idx_l1 ++ 153, 153, ++ // abs_mvd_greater1_flag ++ 140, 198, ++ // abs_mvd_greater1_flag ++ 140, 198, ++ // mvp_lx_flag ++ 168, ++ // no_residual_data_flag ++ 79, ++ // split_transform_flag ++ 124, 138, 94, ++ // cbf_luma ++ 153, 111, ++ // cbf_cb, cbf_cr ++ 149, 107, 167, 154, ++ // transform_skip_flag ++ 139, 139, ++ // explicit_rdpcm_flag ++ 139, 139, ++ // explicit_rdpcm_dir_flag ++ 139, 139, ++ // last_significant_coeff_x_prefix ++ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, ++ 94, 108, 123, 108, ++ // last_significant_coeff_y_prefix ++ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95, ++ 94, 108, 123, 108, ++ // significant_coeff_group_flag ++ 121, 140, 61, 154, ++ // significant_coeff_flag ++ 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153, ++ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, ++ 153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140, ++ 140, 140, ++ // coeff_abs_level_greater1_flag ++ 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, ++ 136, 137, 169, 194, 166, 167, 154, 167, 137, 182, ++ // coeff_abs_level_greater2_flag ++ 107, 167, 91, 122, 107, 167, ++ // log2_res_scale_abs ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ // res_scale_sign_flag ++ 154, 154, ++ // cu_chroma_qp_offset_flag ++ 154, ++ // cu_chroma_qp_offset_idx ++ 154, ++ }, ++ { // sao_merge_flag ++ 153, ++ // sao_type_idx ++ 160, ++ // split_coding_unit_flag ++ 107, 139, 126, ++ // cu_transquant_bypass_flag ++ 154, ++ // skip_flag ++ 197, 185, 
201, ++ // cu_qp_delta ++ 154, 154, 154, ++ // pred_mode ++ 134, ++ // part_mode ++ 154, 139, 154, 154, ++ // prev_intra_luma_pred_mode ++ 183, ++ // intra_chroma_pred_mode ++ 152, 139, ++ // merge_flag ++ 154, ++ // merge_idx ++ 137, ++ // inter_pred_idc ++ 95, 79, 63, 31, 31, ++ // ref_idx_l0 ++ 153, 153, ++ // ref_idx_l1 ++ 153, 153, ++ // abs_mvd_greater1_flag ++ 169, 198, ++ // abs_mvd_greater1_flag ++ 169, 198, ++ // mvp_lx_flag ++ 168, ++ // no_residual_data_flag ++ 79, ++ // split_transform_flag ++ 224, 167, 122, ++ // cbf_luma ++ 153, 111, ++ // cbf_cb, cbf_cr ++ 149, 92, 167, 154, ++ // transform_skip_flag ++ 139, 139, ++ // explicit_rdpcm_flag ++ 139, 139, ++ // explicit_rdpcm_dir_flag ++ 139, 139, ++ // last_significant_coeff_x_prefix ++ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, ++ 79, 108, 123, 93, ++ // last_significant_coeff_y_prefix ++ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111, ++ 79, 108, 123, 93, ++ // significant_coeff_group_flag ++ 121, 140, 61, 154, ++ // significant_coeff_flag ++ 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153, ++ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170, ++ 153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140, ++ 140, 140, ++ // coeff_abs_level_greater1_flag ++ 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121, ++ 136, 122, 169, 208, 166, 167, 154, 152, 167, 182, ++ // coeff_abs_level_greater2_flag ++ 107, 167, 91, 107, 107, 167, ++ // log2_res_scale_abs ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ // res_scale_sign_flag ++ 154, 154, ++ // cu_chroma_qp_offset_flag ++ 154, ++ // cu_chroma_qp_offset_idx ++ 154, ++ }, ++}; ++ ++static const uint8_t scan_1x1[1] = { ++ 0, ++}; ++ ++static const uint8_t horiz_scan2x2_x[4] = { ++ 0, 1, 0, 1, ++}; ++ ++static const uint8_t horiz_scan2x2_y[4] = { ++ 0, 0, 1, 1 ++}; ++ ++static const uint8_t horiz_scan4x4_x[16] = { ++ 0, 1, 2, 3, ++ 0, 1, 2, 3, ++ 0, 1, 
2, 3, ++ 0, 1, 2, 3, ++}; ++ ++static const uint8_t horiz_scan4x4_y[16] = { ++ 0, 0, 0, 0, ++ 1, 1, 1, 1, ++ 2, 2, 2, 2, ++ 3, 3, 3, 3, ++}; ++ ++static const uint8_t horiz_scan8x8_inv[8][8] = { ++ { 0, 1, 2, 3, 16, 17, 18, 19, }, ++ { 4, 5, 6, 7, 20, 21, 22, 23, }, ++ { 8, 9, 10, 11, 24, 25, 26, 27, }, ++ { 12, 13, 14, 15, 28, 29, 30, 31, }, ++ { 32, 33, 34, 35, 48, 49, 50, 51, }, ++ { 36, 37, 38, 39, 52, 53, 54, 55, }, ++ { 40, 41, 42, 43, 56, 57, 58, 59, }, ++ { 44, 45, 46, 47, 60, 61, 62, 63, }, ++}; ++ ++static const uint8_t diag_scan2x2_x[4] = { ++ 0, 0, 1, 1, ++}; ++ ++static const uint8_t diag_scan2x2_y[4] = { ++ 0, 1, 0, 1, ++}; ++ ++static const uint8_t diag_scan2x2_inv[2][2] = { ++ { 0, 2, }, ++ { 1, 3, }, ++}; ++ ++static const uint8_t diag_scan4x4_inv[4][4] = { ++ { 0, 2, 5, 9, }, ++ { 1, 4, 8, 12, }, ++ { 3, 7, 11, 14, }, ++ { 6, 10, 13, 15, }, ++}; ++ ++static const uint8_t diag_scan8x8_inv[8][8] = { ++ { 0, 2, 5, 9, 14, 20, 27, 35, }, ++ { 1, 4, 8, 13, 19, 26, 34, 42, }, ++ { 3, 7, 12, 18, 25, 33, 41, 48, }, ++ { 6, 11, 17, 24, 32, 40, 47, 53, }, ++ { 10, 16, 23, 31, 39, 46, 52, 57, }, ++ { 15, 22, 30, 38, 45, 51, 56, 60, }, ++ { 21, 29, 37, 44, 50, 55, 59, 62, }, ++ { 28, 36, 43, 49, 54, 58, 61, 63, }, ++}; ++ ++ ++typedef struct ++{ ++ uint16_t coeff; ++ uint16_t scale; ++} xy_off_t; ++ ++#define XYT_C(x,y,t) ((x) + ((y) << (t))) ++#define SCALE_TRAFO(t) ((t) > 3 ? 
3 : (t)) ++#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t)) ++#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t))) ++ ++#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)} ++ ++#define OFF_DIAG(t) {\ ++ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\ ++ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\ ++ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\ ++ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\ ++} ++ ++#define OFF_HORIZ(t) {\ ++ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\ ++ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\ ++ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\ ++ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\ ++} ++ ++#define OFF_VERT(t) {\ ++ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\ ++ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\ ++ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\ ++ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\ ++} ++ ++static const xy_off_t off_xys[3][4][16] = ++{ ++ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)}, ++ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)}, ++ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)} ++}; ++ ++ ++// Helper fns ++#ifndef hevc_mem_bits32 ++static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset) ++{ ++ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7); ++} ++#endif ++ ++#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32) ++#define hevc_clz32 hevc_clz32_builtin ++static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x) ++{ ++ // __builtin_clz says it works on ints - so adjust if int is >32 bits long ++ return __builtin_clz(x) - (sizeof(int) * 8 - 32); ++} ++#endif ++ ++// It is unlikely that we will ever need this but include for completeness ++#ifndef hevc_clz32 ++static inline unsigned int hevc_clz32(unsigned int x) ++{ ++ unsigned int n = 1; ++ if ((x & 0xffff0000) == 0) { ++ n += 16; ++ x <<= 16; ++ } ++ if ((x & 0xff000000) == 0) { ++ n += 8; ++ 
x <<= 8; ++ } ++ if ((x & 0xf0000000) == 0) { ++ n += 4; ++ x <<= 4; ++ } ++ if ((x & 0xc0000000) == 0) { ++ n += 2; ++ x <<= 2; ++ } ++ return n - ((x >> 31) & 1); ++} ++#endif ++ ++static inline int cabac_overflow(const CABACContext * const cc) ++{ ++ av_assert0(cc->bytestream >= cc->bytestream_start); ++ return cc->bytestream >= cc->bytestream_end + 4; ++} ++ ++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc) ++{ ++ return cabac_overflow(&lc->cc); ++} ++ ++#if !USE_BY22 ++// If no by22 then _by22 functions will revert to normal and so _peek/_flush ++// will no longer be called but the setup calls will still exist and we want ++// to null them out ++#define bypass_start(s) ++#define bypass_finish(s) ++#else ++// Use BY22 for residual bypass block ++ ++#define bypass_start(cc) get_cabac_by22_start(cc) ++#define bypass_finish(cc) get_cabac_by22_finish(cc) ++ ++// BY22 notes that bypass is simply a divide into the bitstream and so we ++// can peek out large quantities of bits at once and treat the result as if ++// it was VLC. In many cases this will lead to O(1) processing rather than ++// O(n) though the setup and teardown is sufficiently expensive that it is ++// only worth using if we expect to be dealing with more than a few bits ++// The definition of "a few bits" will vary from platform to platform but ++// tests on ARM show that it probably isn't worth it for a single coded ++// residual, but is for >1 - it also seems likely that if there are ++// more residuals then they are likely to be bigger and this will make the ++// O(1) nature of the code more worthwhile. ++ ++ ++// Bypass block start ++// Must be called before _by22_peek is used as it sets the CABAC environment ++// into the correct state. _by22_finish must be called to return to 'normal' ++// (i.e. 
// non-bypass) cabac decoding
//
// get_cabac_by22_start rewrites the decoder state in place:
//  * c->by22.bits records the number of trailing zero bits found in c->low
//  * c->bytestream is moved back by CABAC_BITS / 8 bytes
//  * c->low is rebuilt from the old low (shifted up by 22 - CABAC_BITS)
//    combined with 32 bits fetched at the current bytestream position
//  * when USE_BY22_DIV is 0 the live range is parked in c->by22.range and
//    c->range is replaced by the cabac_by22_inv_range[] entry selected by the
//    low 8 bits of the range, which is what get_cabac_by22_peek multiplies by
#ifndef get_cabac_by22_start
static inline void get_cabac_by22_start(CABACContext * const c)
{
    // NOTE(review): __builtin_ctz is undefined for 0 - presumably c->low
    // always carries a set marker bit at this point; confirm against cabac.h
    const unsigned int bits = __builtin_ctz(c->low);
    // Stream bits are fetched before the pointer is adjusted below
    const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
    uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
#if !USE_BY22_DIV
    // Must be looked up from the real range before c->range is overwritten
    const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
#endif

    c->bytestream -= (CABAC_BITS / 8);
    c->by22.bits = bits;
#if !USE_BY22_DIV
    c->by22.range = c->range;
    c->range = inv;
#endif
    c->low = x;
}
#endif

// Bypass block finish
// Must be called at the end of the bypass block to return to normal operation
//
// Advances c->bytestream by the whole CABAC_BITS-sized units counted in
// c->by22.bits plus the CABAC_BITS / 8 bytes that get_cabac_by22_start
// stepped back, shifts c->low back down and re-inserts a 1 bit below the
// remaining data, and (when USE_BY22_DIV is 0) restores the range that was
// parked in c->by22.range.
static inline void get_cabac_by22_finish(CABACContext * const c)
{
    unsigned int used = c->by22.bits;
    // Whole refill units consumed, expressed in bytes
    unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
    // Bits consumed within the current (partial) refill unit
    unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);

    c->bytestream += bytes_used + (CABAC_BITS / 8);
    c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
#if !USE_BY22_DIV
    c->range = c->by22.range;
#endif
}

// Peek bypass bits
// _by22_start must be called before _by22_peek is called and _by22_flush
// must be called afterwards to flush any used bits
// The actual number of valid bits returned is
// min(coded bypass block length, CABAC_BY22_PEEK_BITS).
CABAC_BY22_PEEK_BITS ++// will be at least 22 which should be long enough for any prefix or suffix ++// though probably not long enough for the worst case combination ++#ifndef get_cabac_by22_peek ++static inline uint32_t get_cabac_by22_peek(const CABACContext * const c) ++{ ++#if USE_BY22_DIV ++ return ((unsigned int)c->low / (unsigned int)c->range) << 9; ++#else ++ uint32_t x = c->low & ~1U; ++ const uint32_t inv = c->range; ++ ++ if (inv != 0) ++ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32); ++ ++ return x << 1; ++#endif ++} ++#endif ++ ++// Flush bypass bits peeked by _by22_peek ++// Flush n bypass bits. n must be >= 1 to guarantee correct operation ++// val is an unmodified copy of whatever _by22_peek returned ++#ifndef get_cabac_by22_flush ++static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val) ++{ ++ // Subtract the bits used & reshift up to the top of the word ++#if USE_BY22_DIV ++ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23)); ++#else ++ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23)); ++#endif ++ ++ // and refill lower bits ++ // We will probably OR over some existing bits but that doesn't matter ++ c->by22.bits += n; ++ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9); ++} ++#endif ++ ++#endif // USE_BY22 ++ ++ ++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc) ++{ ++ memcpy(s->cabac_save->rice, lc->stat_coeff, 4); ++ memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS); ++} ++ ++static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ memcpy(lc->stat_coeff, s->cabac_save->rice, 4); ++ memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS); ++} ++ ++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc) ++{ ++ GetBitContext * const gb = &lc->gb; ++ skip_bits(gb, 1); ++ 
align_get_bits(gb); ++ return ff_init_cabac_decoder(&lc->cc, ++ gb->buffer + get_bits_count(gb) / 8, ++ (get_bits_left(gb) + 7) / 8); ++} ++ ++static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int init_type = 2 - s->sh.slice_type; ++ int i; ++ ++ if (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ++ init_type ^= 3; ++ ++ for (i = 0; i < HEVC_CONTEXTS; i++) { ++ int init_value = init_values[init_type][i]; ++ int m = (init_value >> 4) * 5 - 45; ++ int n = ((init_value & 15) << 3) - 16; ++ int pre = 2 * (((m * av_clip(s->sh.slice_qp, 0, 51)) >> 4) + n) - 127; ++ ++ pre ^= pre >> 31; ++ if (pre > 124) ++ pre = 124 + (pre & 1); ++ lc->cabac_state[i] = pre; ++ } ++ ++ for (i = 0; i < 4; i++) ++ lc->stat_coeff[i] = 0; ++} ++ ++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags) ++{ ++ if (lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0) ++ { ++ lc->qPy_pred = s->sh.slice_qp; ++ cabac_init_state(s, lc); ++ } ++ else if ((ctb_flags & CTB_TS_FLAGS_CLOAD) != 0) ++ { ++ lc->qPy_pred = s->sh.slice_qp; ++ load_states(s, lc); ++ } ++ lc->cabac_init_req = 0; ++} ++ ++#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx)) ++ ++int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state) ++{ ++ return get_cabac_inline(c, state); ++} ++ ++int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c) ++{ ++ return get_cabac_terminate(c); ++} ++ ++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc) ++{ ++ if (!GET_CABAC_LC(elem_offset[SAO_TYPE_IDX])) ++ return 0; ++ ++ if (!get_cabac_bypass(&lc->cc)) ++ return SAO_BAND; ++ return SAO_EDGE; ++} ++ ++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc) ++{ ++ int i; ++ int value = get_cabac_bypass(&lc->cc); ++ ++ for (i = 0; i < 4; i++) ++ value = (value << 1) | get_cabac_bypass(&lc->cc); ++ return value; ++} ++ ++int 
ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int i = 0; ++ int length = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1; ++ ++ while (i < length && get_cabac_bypass(&lc->cc)) ++ i++; ++ return i; ++} ++ ++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc) ++{ ++ return get_cabac_bypass(&lc->cc); ++} ++ ++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc) ++{ ++ int ret = get_cabac_bypass(&lc->cc) << 1; ++ ret |= get_cabac_bypass(&lc->cc); ++ return ret; ++} ++ ++int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc) ++{ ++ int val = 1; ++ ++ if (get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA) == 0) ++ return 0; ++ ++ while (val < 5 && ++ get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA + 1) != 0) ++ val++; ++ ++ if (val >= 5) { ++ unsigned int k = 0; ++ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { ++ val += 1 << k; ++ k++; ++ } ++// if (k == CABAC_MAX_BIN) ++// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); ++ ++ while (k--) ++ val += get_cabac_bypass(&lc->cc) << k; ++ } ++ return get_cabac_bypass(&lc->cc) ? 
-val : val; ++} ++ ++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int c_max= FFMAX(5, s->ps.pps->chroma_qp_offset_list_len_minus1); ++ int i = 0; ++ ++ while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX])) ++ i++; ++ ++ return i; ++} ++ ++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size) ++{ ++ if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1 ++ return PART_2Nx2N; ++ if (log2_cb_size == s->ps.sps->log2_min_cb_size) { ++ if (lc->cu.pred_mode == MODE_INTRA) // 0 ++ return PART_NxN; ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 ++ return PART_2NxN; ++ if (log2_cb_size == 3) // 00 ++ return PART_Nx2N; ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001 ++ return PART_Nx2N; ++ return PART_NxN; // 000 ++ } ++ ++ if (!s->ps.sps->amp_enabled_flag) { ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01 ++ return PART_2NxN; ++ return PART_Nx2N; ++ } ++ ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011 ++ return PART_2NxN; ++ if (get_cabac_bypass(&lc->cc)) // 0101 ++ return PART_2NxnD; ++ return PART_2NxnU; // 0100 ++ } ++ ++ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001 ++ return PART_Nx2N; ++ if (get_cabac_bypass(&lc->cc)) // 0001 ++ return PART_nRx2N; ++ return PART_nLx2N; // 0000 ++} ++ ++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc) ++{ ++ int i = 0; ++ while (i < 2 && get_cabac_bypass(&lc->cc)) ++ i++; ++ return i; ++} ++ ++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc) ++{ ++ int i; ++ int value = get_cabac_bypass(&lc->cc); ++ ++ for (i = 0; i < 4; i++) ++ value = (value << 1) | get_cabac_bypass(&lc->cc); ++ return value; ++} ++ ++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc) ++{ ++ int ret; ++ if 
(!GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE])) ++ return 4; ++ ++ ret = get_cabac_bypass(&lc->cc) << 1; ++ ret |= get_cabac_bypass(&lc->cc); ++ return ret; ++} ++ ++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ int i = GET_CABAC_LC(elem_offset[MERGE_IDX]); ++ ++ if (i != 0) { ++ while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc)) ++ i++; ++ } ++ return i; ++} ++ ++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH) ++{ ++ if (nPbW + nPbH == 12) ++ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); ++ if (GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + lc->ct_depth)) ++ return PRED_BI; ++ ++ return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4); ++} ++ ++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx) ++{ ++ int i = 0; ++ int max = num_ref_idx_lx - 1; ++ int max_ctx = FFMIN(max, 2); ++ ++ while (i < max_ctx && GET_CABAC_LC(elem_offset[REF_IDX_L0] + i)) ++ i++; ++ if (i == 2) { ++ while (i < max && get_cabac_bypass(&lc->cc)) ++ i++; ++ } ++ ++ return i; ++} ++ ++static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]); ++} ++ ++static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1); ++} ++ ++#if !USE_BY22 ++static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc) ++{ ++ int ret = 2; ++ int k = 1; ++ ++ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) { ++ ret += 1U << k; ++ k++; ++ } ++ if (k == CABAC_MAX_BIN) { ++ av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k); ++ return 0; ++ } ++ ++ while (k--) ++ ret += get_cabac_bypass(&lc->cc) << k; ++ return get_cabac_bypass_sign(&lc->cc, -ret); ++} ++#endif ++ ++static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ 
return get_cabac_bypass_sign(&lc->cc, -1); ++} ++ ++static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) ++{ ++ return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz); ++} ++ ++static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) ++{ ++ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz); ++} ++ ++static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz) ++{ ++ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz); ++} ++ ++ ++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) { ++ int i =0; ++ ++ while (i < 4 && GET_CABAC_LC(elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx + i)) ++ i++; ++ ++ return i; ++} ++ ++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, ++ int log2_size, int *last_scx_prefix, int *last_scy_prefix) ++{ ++ int i = 0; ++ int max = (log2_size << 1) - 1; ++ int ctx_offset, ctx_shift; ++ ++ if (!c_idx_nz) { ++ ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2); ++ ctx_shift = (log2_size + 1) >> 2; ++ } else { ++ ctx_offset = 15; ++ ctx_shift = log2_size - 2; ++ } ++ while (i < max && ++ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset)) ++ i++; ++ *last_scx_prefix = i; ++ ++ i = 0; ++ while (i < max && ++ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset)) ++ i++; ++ *last_scy_prefix = i; ++} ++ ++static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc, ++ int last_significant_coeff_prefix) ++{ ++ int i; ++ int length = (last_significant_coeff_prefix >> 1) - 1; ++ int value = get_cabac_bypass(&lc->cc); ++ ++ for (i = 1; i < length; i++) ++ value = (value << 1) | get_cabac_bypass(&lc->cc); ++ return value; ++} ++ ++static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * 
const lc, int c_idx_nz, int ctx_cg) ++{ ++ int inc; ++ ++ inc = (ctx_cg != 0) + (c_idx_nz << 1); ++ ++ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + inc); ++} ++ ++static av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset) ++{ ++ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset); ++} ++ ++#if !USE_BY22 ++#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r) ++#endif ++ ++ ++#ifndef coeff_abs_level_remaining_decode_bypass ++static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param) ++{ ++ uint32_t y; ++ unsigned int prefix; ++ unsigned int last_coeff_abs_level_remaining; ++ unsigned int n; ++ ++ y = get_cabac_by22_peek(c); ++ prefix = hevc_clz32(~y); ++ // y << prefix will always have top bit 0 ++ ++ if (prefix < 3) { ++ const unsigned int suffix = (y << prefix) >> (31 - rice_param); ++ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix; ++ n = prefix + 1 + rice_param; ++ } ++ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2) ++ { ++ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param)); ++ ++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; ++ n = prefix * 2 + rice_param - 2; ++ } ++ else { ++ unsigned int suffix; ++ ++ get_cabac_by22_flush(c, prefix, y); ++ y = get_cabac_by22_peek(c); ++ ++ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param)); ++ last_coeff_abs_level_remaining = (2 << rice_param) + suffix; ++ n = prefix + rice_param - 2; ++ } ++ ++ get_cabac_by22_flush(c, n, y); ++ ++ return last_coeff_abs_level_remaining; ++} ++#endif ++ ++static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param) ++{ ++ int prefix = 0; ++ int suffix = 0; ++ int last_coeff_abs_level_remaining; ++ int i; ++ ++ while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c)) ++ prefix++; ++ if (prefix == CABAC_MAX_BIN) { ++// 
av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix); ++ return 0; ++ } ++ ++ if (prefix < 3) { ++ for (i = 0; i < rc_rice_param; i++) ++ suffix = (suffix << 1) | get_cabac_bypass(c); ++ last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix; ++ } else { ++ int prefix_minus3 = prefix - 3; ++ for (i = 0; i < prefix_minus3 + rc_rice_param; i++) ++ suffix = (suffix << 1) | get_cabac_bypass(c); ++ last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1) ++ << rc_rice_param) + suffix; ++ } ++ ++ return last_coeff_abs_level_remaining; ++} ++ ++#if !USE_BY22 ++#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode ++static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb) ++{ ++ unsigned int i; ++ uint32_t ret = 0; ++ ++ for (i = 0; i < nb; i++) ++ ret = (ret << 1) | get_cabac_bypass(c); ++ ++ return ret << (32 - nb); ++} ++#endif ++ ++#ifndef coeff_sign_flag_decode_bypass ++static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb) ++{ ++ uint32_t y; ++ y = get_cabac_by22_peek(c); ++ get_cabac_by22_flush(c, nb, y); ++ return y & ~(0xffffffffU >> nb); ++} ++#endif ++ ++ ++#ifndef get_cabac_greater1_bits ++static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n, ++ uint8_t * const state0) ++{ ++ unsigned int i; ++ unsigned int rv = 0; ++ for (i = 0; i != n; ++i) { ++ const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3; ++ const unsigned int b = get_cabac(c, state0 + idx); ++ rv = (rv << 1) | b; ++ } ++ return rv; ++} ++#endif ++ ++ ++// N.B. levels returned are the values assuming coeff_abs_level_remaining ++// is uncoded, so 1 must be added if it is coded. sum_abs also reflects ++// this version of events. 
++static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels, ++ int * const pprev_subset_coded, int * const psum, ++ const unsigned int idx0_gt1, const unsigned int idx_gt2) ++{ ++ CABACContext * const c = &lc->cc; ++ uint8_t * const state0 = lc->cabac_state + idx0_gt1; ++ uint8_t * const state_gt2 = lc->cabac_state + idx_gt2; ++ unsigned int rv; ++ unsigned int i; ++ const unsigned int n = FFMIN(n_end, 8); ++ ++ // Really this is i != n but the simple unconditional loop is cheaper ++ // and faster ++ for (i = 0; i != 8; ++i) ++ levels[i] = 1; ++ ++ rv = get_cabac_greater1_bits(c, n, state0); ++ ++ *pprev_subset_coded = 0; ++ *psum = n; ++ ++ rv <<= (32 - n); ++ if (rv != 0) ++ { ++ *pprev_subset_coded = 1; ++ *psum = n + 1; ++ i = hevc_clz32(rv); ++ levels[i] = 2; ++ if (get_cabac(c, state_gt2) == 0) ++ { ++ // Unset first coded bit ++ rv &= ~(0x80000000U >> i); ++ } ++ } ++ ++ if (n_end > 8) { ++ const unsigned int g8 = n_end - 8; ++ rv |= ((1 << g8) - 1) << (24 - g8); ++ for (i = 0; i != g8; ++i) { ++ levels[i + 8] = 0; ++ } ++ } ++ ++ return rv; ++} ++ ++// extended_precision_processing_flag must be false given we are ++// putting the result into a 16-bit array ++// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining) ++// scale_m is uint8_t ++// ++// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12) ++// or it can be 2 (if we have transquant_bypass) ++// shift is set to one less than we really want but would normally be ++// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5? 
++// however the scale shift is substracted from shift to a min 0 so scale_m worst = 45 << 6 ++// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient) ++// to achieve it ++ ++#ifndef trans_scale_sat ++static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift) ++{ ++ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1); ++} ++#endif ++ ++ ++#ifndef update_rice ++static inline void update_rice(uint8_t * const stat_coeff, ++ const unsigned int last_coeff_abs_level_remaining, ++ const unsigned int c_rice_param) ++{ ++ const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param; ++ if (x >= 6) ++ (*stat_coeff)++; ++ else if (x == 0 && *stat_coeff > 0) ++ (*stat_coeff)--; ++} ++#endif ++ ++ ++// n must be > 0 on entry ++#ifndef get_cabac_sig_coeff_flag_idxs ++static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, ++ unsigned int n, ++ const uint8_t const * ctx_map, ++ uint8_t * p) ++{ ++ do { ++ if (get_cabac(c, state0 + ctx_map[n])) ++ *p++ = n; ++ } while (--n != 0); ++ return p; ++} ++#endif ++ ++ ++static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0, ++ unsigned int n, ++ const uint8_t * ctx_map, // const ptr here but not in asm ++ uint8_t * const flag_idx) ++{ ++ int rv; ++ ++ rv = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx; ++ ++ return rv; ++} ++ ++#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ ++ x0, x1, x2, x3,\ ++ x4, x5, x6, x7,\ ++ x8, x9, x10, x11,\ ++ x12, x13, x14, x15} ++ ++#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ ++ x0, x4, x8, x12,\ ++ x1, x5, x9, x13,\ ++ x2, x6, x10, x14,\ ++ x3, x7, x11, x15} ++ ++#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\ ++ x0, x4, x1, x8,\ 
++ x5, x2, x12, x9,\ ++ x6, x3, x13, x10,\ ++ x7, x14, x11, x15} ++ ++ ++static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz, ++ uint8_t * const significant_coeff_group_flag, ++ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg, ++ int * const pPrev_sig) ++{ ++ while (--i >= 0) { ++ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag; ++ const unsigned int x_cg = scan_x_cg[i]; ++ ++ // For the flag decode we only care about Z/NZ but ++ // we use the full Right * 2 + Down when calculating ++ // significant coeff flags so we obtain it here. ++ // ++ // The group flag array is one longer than it needs to ++ // be so we don't need to check for y_cg limits ++ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1); ++ ++ if (i == 0 || ++ significant_coeff_group_flag_decode(lc, c_idx_nz, prev_sig)) ++ { ++ gf_y[0] |= (1 << x_cg); ++ *pPrev_sig = prev_sig; ++ break; ++ } ++ } ++ ++ return i; ++} ++ ++static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ const unsigned int stride = frame_stride1(s->frame, c_idx); ++ const unsigned int x = x0 >> ctx_hshift(s, c_idx); ++ const unsigned int y = y0 >> ctx_vshift(s, c_idx); ++ const int is_sliced = 1; // av_rpi_is_sand_frame(frame); ++ uint8_t * const dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? 
++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); ++ ++ const unsigned int i = jb->intra.n; ++ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->type = RPI_PRED_ADD_RESIDUAL_C; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride); ++ ++ // Rewrite as add residual - must rewrite all fields as different union member ++ pc->type = RPI_PRED_ADD_RESIDUAL_V; ++ pc->ta.buf = coeffs; ++ pc->ta.dst = dst; ++ pc->ta.stride = stride; ++ pc->ta.dc = dc; ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ jb->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0); ++ cmd->size = log2_trafo_size; ++ cmd->ta.buf = coeffs; ++ cmd->ta.dst = dst; ++ cmd->ta.stride = stride; ++ cmd->ta.dc = 0; ++ } ++} ++ ++ ++static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const unsigned int log2_trafo_size, const unsigned int c_idx, ++ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs) ++{ ++ const AVFrame * const frame = s->frame; ++ const unsigned int stride = frame_stride1(s->frame, c_idx); ++ const unsigned int x = x0 >> ctx_hshift(s, c_idx); ++ const unsigned int y = y0 >> ctx_vshift(s, c_idx); ++ const int is_sliced = 1; ++ uint8_t * const dst = !is_sliced ? ++ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) : ++ c_idx == 0 ? 
++ av_rpi_sand_frame_pos_y(frame, x, y) : ++ av_rpi_sand_frame_pos_c(frame, x, y); ++ ++ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0); ++ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1); ++ ++ const unsigned int i = jb->intra.n; ++ HEVCPredCmd *const pc = jb->intra.cmds + i - 1; ++ ++ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U && ++ pc->ta.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->ta.stride == stride); ++ ++ pc->ta.dc = (int16_t)coeff; ++ } ++ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U && ++ pc->dc.dst == dst) ++ { ++ av_assert1(pc->size == log2_trafo_size && ++ pc->c_idx == 1 && ++ pc->dc.stride == stride && ++ (pc->dc.dc & ~0xffff) == 0); ++ ++ pc->dc.dc |= (coeff << 16); ++ } ++ else ++ { ++ HEVCPredCmd * const cmd = pc + 1; ++ jb->intra.n = i + 1; ++ ++ cmd->type = RPI_PRED_ADD_DC + c_idx; ++ cmd->size = log2_trafo_size; ++ cmd->dc.dst = dst; ++ cmd->dc.stride = stride; ++ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? 
coeff << 16 : coeff & 0xffff; ++ } ++} ++ ++ ++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, ++ const int log2_trafo_size, const enum ScanType scan_idx, ++ const int c_idx) ++{ ++ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag; ++ ++ int last_significant_coeff_x, last_significant_coeff_y; ++ int num_coeff = 0; ++ int prev_subset_coded = 0; ++ ++ int num_last_subset; ++ int x_cg_last_sig, y_cg_last_sig; ++ ++ const uint8_t *scan_x_cg, *scan_y_cg; ++ const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2]; ++ ++ int use_vpu; ++#if RPI_COMPRESS_COEFFS ++ int num_nonzero = 0; ++ int use_compress = 0; ++ int *coeffs32; ++#endif ++ int use_dc = 0; ++ int16_t *coeffs; ++ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero ++ int explicit_rdpcm_flag = 0; ++ int explicit_rdpcm_dir_flag; ++ ++ int i; ++ int shift,scale; ++ const uint8_t *scale_matrix = NULL; ++ uint8_t dc_scale; ++ const int c_idx_nz = (c_idx != 0); ++ const int pred_mode_intra = c_idx_nz ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ int prev_sig = 0; ++ int may_hide_sign; ++ ++ int16_t dummy_coeffs[16]; ++ ++ // Derive QP for dequant ++ if (!lc->cu.cu_transquant_bypass_flag) { ++ may_hide_sign = s->ps.pps->sign_data_hiding_flag; ++ ++ if (s->ps.pps->transform_skip_enabled_flag && ++ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) { ++ int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz); ++ if (transform_skip_flag) { ++ trans_skip_or_bypass = 1; ++ if (lc->cu.pred_mode == MODE_INTRA && ++ s->ps.sps->implicit_rdpcm_enabled_flag && ++ (pred_mode_intra == 10 || pred_mode_intra == 26)) { ++ may_hide_sign = 0; ++ } ++ } ++ } ++ ++ { ++ static const uint8_t level_scale[8] = { ++ 40, 45, 51, 57, 64, 72, 0, 0 // Pad to 8 ++ }; ++ const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y]; ++ ++ // Shift is set to one less than will actually occur as the scale ++ // and saturate step adds 1 and then shifts right again ++ scale = level_scale[qp6 & 7]; ++// shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3); ++ shift = log2_trafo_size - (qp6 >> 3); ++ ++ if (shift < 0) { ++ scale <<= -shift; ++ shift = 0; ++ } ++ } ++ ++ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) { ++ const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ? ++ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list; ++ const unsigned int matrix_id = ++ lc->cu.pred_mode != MODE_INTRA ? 
3 + c_idx : c_idx; ++ ++ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id]; ++ dc_scale = scale_matrix[0]; ++ if (log2_trafo_size >= 4) ++ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id]; ++ } ++ else ++ { ++ static const uint8_t sixteen_scale[64] = { ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16, ++ 16, 16, 16, 16, 16, 16, 16, 16 ++ }; ++ scale_matrix = sixteen_scale; ++ dc_scale = 16; ++ } ++ } else { ++ static const uint8_t unit_scale[64] = { ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ 1, 1, 1, 1, 1, 1, 1, 1, ++ }; ++ scale_matrix = unit_scale; ++ shift = 0; ++ scale = 2; // We will shift right to kill this ++ dc_scale = 1; ++ ++ may_hide_sign = 0; ++ } ++ ++ ++ ++ ++ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag && ++ trans_skip_or_bypass) { ++ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz); ++ if (explicit_rdpcm_flag) { ++ may_hide_sign = 0; ++ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz); ++ } ++ } ++ ++ last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size, ++ &last_significant_coeff_x, &last_significant_coeff_y); ++ ++ if (last_significant_coeff_x > 3) { ++ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x); ++ last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) * ++ (2 + (last_significant_coeff_x & 1)) + ++ suffix; ++ } ++ ++ if (last_significant_coeff_y > 3) { ++ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y); ++ last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) * ++ (2 + (last_significant_coeff_y & 1)) + ++ suffix; 
++ } ++ ++ if (scan_idx == SCAN_VERT) ++ FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y); ++ ++ x_cg_last_sig = last_significant_coeff_x >> 2; ++ y_cg_last_sig = last_significant_coeff_y >> 2; ++ ++ switch (scan_idx) { ++ case SCAN_DIAG: { ++ int last_x_c = last_significant_coeff_x & 3; ++ int last_y_c = last_significant_coeff_y & 3; ++ ++ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c]; ++ ++ switch (log2_trafo_size) { ++ case 2: ++ scan_x_cg = scan_1x1; ++ scan_y_cg = scan_1x1; ++ break; ++ case 3: ++ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = diag_scan2x2_x; ++ scan_y_cg = diag_scan2x2_y; ++ break; ++ case 4: ++ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = ff_hevc_rpi_diag_scan4x4_x; ++ scan_y_cg = ff_hevc_rpi_diag_scan4x4_y; ++ break; ++ case 5: ++ default: ++ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4; ++ scan_x_cg = ff_hevc_rpi_diag_scan8x8_x; ++ scan_y_cg = ff_hevc_rpi_diag_scan8x8_y; ++ break; ++ } ++ break; ++ } ++ case SCAN_HORIZ: ++ scan_x_cg = horiz_scan2x2_x; ++ scan_y_cg = horiz_scan2x2_y; ++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x]; ++ break; ++ default: //SCAN_VERT ++ scan_x_cg = horiz_scan2x2_y; ++ scan_y_cg = horiz_scan2x2_x; ++ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y]; ++ break; ++ } ++ num_coeff++; ++ num_last_subset = (num_coeff - 1) >> 4; ++ ++ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant ++ ++ { ++ const unsigned int ccount = 1 << (log2_trafo_size * 2); ++ const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */; // These need special processing ++ use_vpu = 0; ++ use_dc = (num_coeff == 1) && !special && ++ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2); ++ ++ if (use_dc) { ++ // Just need a little empty space ++ coeffs = dummy_coeffs; ++ // No need to clear ++ } 
++ else ++ { ++ use_vpu = !special && log2_trafo_size >= 4; ++#if RPI_COMPRESS_COEFFS ++ use_compress = use_vpu && lc->jb0->coeffs.s[log2_trafo_size - 2].packed; ++#endif ++ coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount); ++#if RPI_COMPRESS_COEFFS ++ coeffs32 = (int*)coeffs; ++ if (!use_compress) ++#endif ++#if HAVE_NEON ++ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2); ++#else ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++#endif ++ } ++ } ++ ++ i = num_last_subset; ++ do { ++ int implicit_non_zero_coeff = 0; ++ int n_end; ++ ++ uint8_t significant_coeff_flag_idx[16]; ++ unsigned int nb_significant_coeff_flag = 0; ++ ++ if (i == num_last_subset) { ++ // First time through ++ int last_scan_pos = num_coeff - (i << 4) - 1; ++ n_end = last_scan_pos - 1; ++ significant_coeff_flag_idx[0] = last_scan_pos; ++ nb_significant_coeff_flag = 1; ++ } else { ++ n_end = 15; ++ implicit_non_zero_coeff = (i != 0); ++ } ++ ++ if (n_end >= 0) { ++ static const uint8_t ctx_idx_maps_ts2[3][16] = { ++ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 ++ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2 ++ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2 ++ }; ++ // N.B. 
prev_sig = Right * 2 + Down ++ static const uint8_t ctx_idx_maps[3][4][16] = { ++ { ++ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 ++ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 ++ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default ++ }, ++ { ++ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 ++ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 ++ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default ++ }, ++ { ++ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0 ++ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1 ++ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2 ++ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default ++ } ++ }; ++ const uint8_t *ctx_idx_map_p; ++ int scf_offset = 0; ++ ++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { ++ ctx_idx_map_p = ctx_idx_maps[0][3]; ++ scf_offset = 40 + c_idx_nz; ++ } else { ++ if (c_idx_nz != 0) ++ scf_offset = 27; ++ ++ if (log2_trafo_size == 2) { ++ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx]; ++ } else { ++ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig]; ++ if (!c_idx_nz) { ++ if (i != 0) ++ scf_offset += 3; ++ ++ if (log2_trafo_size == 3) { ++ scf_offset += (scan_idx == SCAN_DIAG) ? 
9 : 15; ++ } else { ++ scf_offset += 21; ++ } ++ } else { ++ if (log2_trafo_size == 3) ++ scf_offset += 9; ++ else ++ scf_offset += 12; ++ } ++ } ++ } ++ ++ if (n_end > 0) { ++ int cnt = get_sig_coeff_flag_idxs(&lc->cc, ++ lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset, ++ n_end, ctx_idx_map_p, ++ significant_coeff_flag_idx + nb_significant_coeff_flag); ++ ++ nb_significant_coeff_flag += cnt; ++ if (cnt != 0) { ++ implicit_non_zero_coeff = 0; ++ } ++ } ++ ++ if (implicit_non_zero_coeff == 0) { ++ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) { ++ scf_offset = 42 + c_idx_nz; ++ } else { ++ if (i == 0) { ++ scf_offset = c_idx_nz ? 27 : 0; ++ } else { ++ scf_offset = 2 + scf_offset; ++ } ++ } ++ if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) { ++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; ++ nb_significant_coeff_flag++; ++ } ++ } else { ++ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0; ++ nb_significant_coeff_flag++; ++ } ++ } ++#if RPI_COMPRESS_COEFFS ++ if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full! ++ int16_t temp[32*32]; ++ const unsigned int ccount = 1 << (log2_trafo_size * 2); ++ lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0; ++ lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer ++ memcpy(temp, coeffs, sizeof(int)*num_nonzero); ++ coeffs32 = (int *)temp; ++ memset(coeffs, 0, ccount * sizeof(int16_t)); ++ num_nonzero--; ++ while (num_nonzero >= 0) { ++ const unsigned int res = coeffs32[num_nonzero]; ++ const unsigned int offset = res & 0xffff; ++ coeffs[ offset ] = res >> 16; ++ num_nonzero--; ++ } ++ use_compress = 0; ++ } ++#endif ++ ++ if (nb_significant_coeff_flag != 0) { ++ const unsigned int gt1_idx_delta = (c_idx_nz << 2) | ++ ((i != 0 && !c_idx_nz) ? 
2 : 0) | ++ prev_subset_coded; ++ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] + ++ (gt1_idx_delta << 2); ++ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] + ++ gt1_idx_delta; ++ ++ const unsigned int x_cg = scan_x_cg[i]; ++ const unsigned int y_cg = scan_y_cg[i]; ++ int16_t * const blk_coeffs = coeffs + ++ ((x_cg + (y_cg << log2_trafo_size)) << 2); ++ // This calculation is 'wrong' for log2_traffo_size == 2 ++ // but that doesn't matter as in this case x_cg & y_cg ++ // are always 0 so result is correct (0) anyway ++ const uint8_t * const blk_scale = scale_matrix + ++ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size))); ++ ++ // * The following code block doesn't deal with these flags: ++ // (nor did the one it replaces) ++ // ++ // cabac_bypass_alignment_enabled_flag ++ // This should be easy but I can't find a test case ++ // extended_precision_processing_flag ++ // This can extend the required precision past 16bits ++ // so is probably tricky - also no example found yet ++ ++#if USE_N_END_1 ++ if (nb_significant_coeff_flag == 1) { ++ // There is a small gain to be had from special casing the single ++ // transform coefficient case. The reduction in complexity ++ // makes up for the code duplicatioon. 
++ ++ int trans_coeff_level = 1; ++ int coeff_sign_flag; ++ int coded_val = 0; ++ ++ // initialize first elem of coeff_bas_level_greater1_flag ++ prev_subset_coded = 0; ++ ++ if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) { ++ trans_coeff_level = 2; ++ prev_subset_coded = 1; ++ coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2); ++ } ++ ++ // Probably not worth the overhead of starting by22 for just one value ++ coeff_sign_flag = get_cabac_bypass(&lc->cc); ++ ++ if (coded_val) ++ { ++ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) { ++ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0); ++ } else { ++ uint8_t * const stat_coeff = ++ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); ++ const unsigned int c_rice_param = *stat_coeff >> 2; ++ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param); ++ ++ trans_coeff_level = 3 + last_coeff_abs_level_remaining; ++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); ++ } ++ } ++ ++ { ++ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0]; ++ const int k = (int32_t)(coeff_sign_flag << 31) >> 31; ++ const unsigned int scale_m = blk_scale[xy_off->scale]; ++ const int res = trans_scale_sat( ++ (trans_coeff_level ^ k) - k, // Apply sign ++ scale, ++ i == 0 && xy_off->coeff == 0 ? 
dc_scale : scale_m, ++ shift); ++#if RPI_COMPRESS_COEFFS ++ if (use_compress) ++ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); ++ else ++#endif ++ blk_coeffs[xy_off->coeff] = res; ++ } ++ } ++ else ++#endif ++ { ++ int sign_hidden = may_hide_sign; ++ int levels[16]; // Should be able to get away with int16_t but that fails some tests ++ uint32_t coeff_sign_flags; ++ uint32_t coded_vals = 0; ++ // Sum(abs(level[])) ++ // In fact we only need the bottom bit and in some future ++ // version that may be all we calculate ++ unsigned int sum_abs; ++ ++ coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels, ++ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2); ++ ++ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3) ++ sign_hidden = 0; ++ ++ // -- Start bypass block ++ ++ bypass_start(&lc->cc); ++ ++ coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden); ++ ++ if (coded_vals != 0) ++ { ++ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag; ++ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL : ++ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1); ++ int c_rice_param = !rice_adaptation_enabled ? 
0 : *stat_coeff >> 2; ++ int * level = levels - 1; ++ ++ do { ++ { ++ const unsigned int z = hevc_clz32(coded_vals) + 1; ++ level += z; ++ coded_vals <<= z; ++ } ++ ++ { ++ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param); ++ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1; ++ ++ sum_abs += last_coeff_abs_level_remaining + 1; ++ *level = trans_coeff_level; ++ ++ if (stat_coeff != NULL) ++ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param); ++ stat_coeff = NULL; ++ ++ if (trans_coeff_level > (3 << c_rice_param) && ++ (c_rice_param < 4 || rice_adaptation_enabled)) ++ ++c_rice_param; ++ } ++ } while (coded_vals != 0); ++ } ++ ++ // sign_hidden = 0 or 1 so we can combine the tests ++ if ((sign_hidden & sum_abs) != 0) { ++ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1]; ++ } ++ ++ bypass_finish(&lc->cc); ++ ++ // -- Finish bypass block ++ ++ // Scale loop ++ { ++ int m = nb_significant_coeff_flag - 1; ++ ++ // Deal with DC component (if any) first ++ if (i == 0 && significant_coeff_flag_idx[m] == 0) ++ { ++ const int k = (int32_t)(coeff_sign_flags << m) >> 31; ++ const int res = trans_scale_sat( ++ (levels[m] ^ k) - k, scale, dc_scale, shift); ++#if RPI_COMPRESS_COEFFS ++ if (use_compress) ++ { ++ coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs); ++ } ++ else ++#endif ++ { ++ blk_coeffs[0] = res; ++ } ++ --m; ++ } ++ ++#if !USE_N_END_1 ++ // If N_END_1 set then m was at least 1 initially ++ if (m >= 0) ++#endif ++ { ++ do { ++ const xy_off_t * const xy_off = scan_xy_off + ++ significant_coeff_flag_idx[m]; ++ const int k = (int32_t)(coeff_sign_flags << m) >> 31; ++ const int res = trans_scale_sat( ++ (levels[m] ^ k) - k, ++ scale, ++ blk_scale[xy_off->scale], ++ shift); ++#if RPI_COMPRESS_COEFFS ++ if (use_compress) { ++ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs); ++ } else ++#endif ++ 
blk_coeffs[xy_off->coeff] = res; ++ } while (--m >= 0); ++ } ++ } ++ ++ } ++ } ++ } while ((i = next_subset(lc, i, c_idx_nz, ++ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 && ++ !cabac_overflow(&lc->cc)); ++ ++ if (lc->cu.cu_transquant_bypass_flag) { ++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && ++ (pred_mode_intra == 10 || pred_mode_intra == 26))) { ++ int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag; ++ ++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); ++ } ++ } else { ++ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass ++ int rot = s->ps.sps->transform_skip_rotation_enabled_flag && ++ log2_trafo_size == 2 && ++ lc->cu.pred_mode == MODE_INTRA; ++ if (rot) { ++ for (i = 0; i < 8; i++) ++ FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]); ++ } ++ ++ s->hevcdsp.dequant(coeffs, log2_trafo_size); ++ ++ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag && ++ lc->cu.pred_mode == MODE_INTRA && ++ (pred_mode_intra == 10 || pred_mode_intra == 26))) { ++ int mode = explicit_rdpcm_flag ? 
explicit_rdpcm_dir_flag : (pred_mode_intra == 26); ++ ++ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode); ++ } ++ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) { ++ s->hevcdsp.transform_4x4_luma(coeffs); ++ } ++ else if (!use_vpu) ++ { ++ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y); ++ if (max_xy == 0) ++ { ++ if (use_dc) ++ rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); ++ else ++ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs); ++ } ++ else { ++ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4; ++ if (max_xy < 4) ++ col_limit = FFMIN(4, col_limit); ++ else if (max_xy < 8) ++ col_limit = FFMIN(8, col_limit); ++ else if (max_xy < 12) ++ col_limit = FFMIN(24, col_limit); ++ s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit); ++ } ++ } ++ } ++ ++#if 0 ++ // Mildly rotted - we support no mode where cross is valid ++ if (lc->tu.cross_pf) { ++ int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer; ++ const int ccount = 1 << (log2_trafo_size * 2); ++ ++ for (i = 0; i < ccount; i++) { ++ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3); ++ } ++ } ++#endif ++ ++ if (!use_dc) { ++#if RPI_COMPRESS_COEFFS ++ if (use_compress) { ++ coeffs32[num_nonzero] = 0; ++ } ++#endif ++ rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs); ++ } ++} ++ ++#if !USE_BY22 ++// Stores results to lc ++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) ++{ ++ int x = abs_mvd_greater0_flag_decode(lc); ++ int y = abs_mvd_greater0_flag_decode(lc); ++ ++ if (x) ++ x += abs_mvd_greater1_flag_decode(lc); ++ if (y) ++ y += abs_mvd_greater1_flag_decode(lc); ++ ++ switch (x) { ++ case 2: x = mvd_decode(lc); break; ++ case 1: x = mvd_sign_flag_decode(lc); break; ++ case 0: x = 0; break; ++ } ++ ++ switch (y) { ++ case 2: y = mvd_decode(lc); break; ++ case 1: y = mvd_sign_flag_decode(lc); break; ++ case 0: y = 0; break; ++ } ++ return 
MV_XY(x,y); ++} ++#else ++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc) ++{ ++ int x = abs_mvd_greater0_flag_decode(lc); ++ int y = abs_mvd_greater0_flag_decode(lc); ++ ++ if ((x | y) == 0) ++ return 0; ++ ++ if (x != 0) ++ x += abs_mvd_greater1_flag_decode(lc); ++ if (y != 0) ++ y += abs_mvd_greater1_flag_decode(lc); ++ ++ if ((x | y) == 1) ++ { ++ // Not worth starting BY22 ++ if (x != 0) ++ x = mvd_sign_flag_decode(lc); ++ if (y != 0) ++ y = mvd_sign_flag_decode(lc); ++ } ++ else ++ { ++ CABACContext * const cc = &lc->cc; ++ uint32_t val; ++ uint32_t b; ++ unsigned int n = 0; ++ ++ bypass_start(cc); ++ b = val = get_cabac_by22_peek(cc); ++ ++ if (x == 1) { ++ x = ((int32_t)b >> 31) | 1; ++ n = 1; ++ b <<= 1; ++ } ++ else if (x == 2) { ++ // EG1 so we have (leading one bits + 1) of suffix ++ // This makes prefix & suffix lengths the same ++ const unsigned int k = hevc_clz32(~b) + 1; ++ int s; ++ ++ av_assert2(k <= 15); ++ ++ b <<= k; ++ n = 2 * k + 1; // Includes suffix & sign ++ ++ // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked ++ // if we are going to do this without a flush ++ if (k > CABAC_BY22_PEEK_BITS / 2 - 1) ++ { ++ // Need too many bits - flush ++ // n = k ++ get_cabac_by22_flush(cc, k, val); ++ b = val = get_cabac_by22_peek(cc); ++ n = k + 1; ++ } ++ ++ x = (b >> (32 - k)) + (1 << k); ++ b <<= k; ++ s = (int32_t)b >> 31; ++ x = (x ^ s) - s; ++ b <<= 1; ++ ++ // Max abs value of an mv is 2^15 - 1 (i.e. 
a prefix len of 15 bits) ++ if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15) ++ { ++ get_cabac_by22_flush(cc, n, val); ++ b = val = get_cabac_by22_peek(cc); ++ n = 0; ++ } ++ } ++ ++ if (y == 1) { ++ y = ((int32_t)b >> 31) | 1; ++ ++n; ++ // don't care about b anymore ++ } ++ else if (y == 2) { ++ const unsigned int k = hevc_clz32(~b) + 1; ++ int s; ++ ++ av_assert2(k <= 15); ++ ++ // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked ++ // if we are going to do this without a flush ++ b <<= k; ++ n += 2 * k + 1; ++ ++ if (n > CABAC_BY22_PEEK_BITS) ++ { ++ // Need too many bits - flush ++ get_cabac_by22_flush(cc, n - (k + 1), val); ++ b = val = get_cabac_by22_peek(cc); ++ n = k + 1; ++ } ++ ++ y = (b >> (32 - k)) + (1 << k); ++ s = (int32_t)(b << k) >> 31; ++ y = (y ^ s) - s; ++ // don't care about b anymore ++ } ++ ++ get_cabac_by22_flush(cc, n, val); ++ bypass_finish(cc); ++ } ++ ++ return MV_XY(x, y); ++} ++#endif +diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h +new file mode 100644 +index 0000000000..ca191f00d9 +--- /dev/null ++++ b/libavcodec/rpi_hevc_cabac_fns.h +@@ -0,0 +1,217 @@ ++/* ++ * HEVC CABAC decoding ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2018 John Cox ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H ++#define AVCODEC_RPI_HEVC_CABAC_FNS_H ++ ++#include "config.h" ++#include "rpi_hevcdec.h" ++ ++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc); ++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags); ++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size); ++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH); ++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx); ++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx); ++ ++//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, 
HEVCRpiLocalContext * const lc); ++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, ++ const int log2_trafo_size, const enum ScanType scan_idx, ++ const int c_idx); ++ ++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc); ++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc); ++ ++#define HEVC_BIN_SAO_MERGE_FLAG 0 ++#define HEVC_BIN_SAO_TYPE_IDX 1 ++#define HEVC_BIN_SAO_EO_CLASS 2 ++#define HEVC_BIN_SAO_BAND_POSITION 2 ++#define HEVC_BIN_SAO_OFFSET_ABS 2 ++#define HEVC_BIN_SAO_OFFSET_SIGN 2 ++#define HEVC_BIN_END_OF_SLICE_FLAG 2 ++#define HEVC_BIN_SPLIT_CODING_UNIT_FLAG 2 ++#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG 5 ++#define HEVC_BIN_SKIP_FLAG 6 ++#define HEVC_BIN_CU_QP_DELTA 9 ++#define HEVC_BIN_PRED_MODE 12 ++#define HEVC_BIN_PART_MODE 13 ++#define HEVC_BIN_PCM_FLAG 17 ++#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE 17 ++#define HEVC_BIN_MPM_IDX 18 ++#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE 18 ++#define HEVC_BIN_INTRA_CHROMA_PRED_MODE 18 ++#define HEVC_BIN_MERGE_FLAG 20 ++#define HEVC_BIN_MERGE_IDX 21 ++#define HEVC_BIN_INTER_PRED_IDC 22 ++#define HEVC_BIN_REF_IDX_L0 27 ++#define HEVC_BIN_REF_IDX_L1 29 ++#define HEVC_BIN_ABS_MVD_GREATER0_FLAG 31 ++#define HEVC_BIN_ABS_MVD_GREATER1_FLAG 33 ++#define HEVC_BIN_ABS_MVD_MINUS2 35 ++#define HEVC_BIN_MVD_SIGN_FLAG 35 ++#define HEVC_BIN_MVP_LX_FLAG 35 ++#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG 36 ++#define HEVC_BIN_SPLIT_TRANSFORM_FLAG 37 ++#define HEVC_BIN_CBF_LUMA 40 ++#define HEVC_BIN_CBF_CB_CR 42 ++#define HEVC_BIN_TRANSFORM_SKIP_FLAG 46 ++#define HEVC_BIN_EXPLICIT_RDPCM_FLAG 48 ++#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG 50 ++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX 52 ++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX 70 ++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX 88 ++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX 88 ++#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG 88 ++#define 
HEVC_BIN_SIGNIFICANT_COEFF_FLAG 92 ++#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG 136 ++#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG 160 ++#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING 166 ++#define HEVC_BIN_COEFF_SIGN_FLAG 166 ++#define HEVC_BIN_LOG2_RES_SCALE_ABS 166 ++#define HEVC_BIN_RES_SCALE_SIGN_FLAG 174 ++#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG 176 ++#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX 177 ++ ++ ++int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state); ++int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c); ++ ++static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) { ++ const uint8_t *ptr = c->bytestream; ++ ++ if (c->low & 0x1) ++ ptr--; ++#if CABAC_BITS == 16 ++ if (c->low & 0x1FF) ++ ptr--; ++#endif ++ if ((int) (c->bytestream_end - ptr) < n) ++ return NULL; ++ if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0) ++ return NULL; ++ ++ return ptr; ++} ++ ++static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SAO_MERGE_FLAG); ++} ++ ++static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG); ++} ++ ++static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG); ++} ++ ++static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int ct_depth, ++ const unsigned int x0, const unsigned int y0) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_CODING_UNIT_FLAG + ++ ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) + ++ ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth)); ++} ++ ++static inline int 
ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, const int x_cb, const int y_cb) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG + ++ (s->cabac_stash_left[y0 >> 3] & 1) + ++ (s->cabac_stash_up[x0 >> 3] & 1)); ++} ++ ++static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PRED_MODE); ++} ++ ++static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac_terminate(&lc->cc); ++} ++ ++static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE); ++} ++ ++static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MERGE_FLAG); ++} ++ ++static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MVP_LX_FLAG); ++} ++ ++static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_NO_RESIDUAL_DATA_FLAG); ++} ++ ++static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_CB_CR + trafo_depth); ++} ++ ++static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_LUMA + !trafo_depth); ++} ++ ++static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + 
HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size); ++} ++ ++static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx) ++{ ++ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_RES_SCALE_SIGN_FLAG + idx); ++} ++ ++ ++ ++#endif ++ +diff --git a/libavcodec/rpi_hevc_data.c b/libavcodec/rpi_hevc_data.c +new file mode 100644 +index 0000000000..341bb77d9d +--- /dev/null ++++ b/libavcodec/rpi_hevc_data.c +@@ -0,0 +1,75 @@ ++/* ++ * HEVC shared tables ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include <stdint.h> /* uint8_t, the element type of the scan tables below */ ++ ++#include "rpi_hevc_data.h" ++ ++const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = { ++ 0, 0, 1, 0, ++ 1, 2, 0, 1, ++ 2, 3, 1, 2, ++ 3, 2, 3, 3, ++}; ++ ++const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = { ++ 0, 1, 0, 2, ++ 1, 0, 3, 2, ++ 1, 0, 3, 2, ++ 1, 3, 2, 3, ++}; ++ ++const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = { ++ 0, 0, 1, 0, ++ 1, 2, 0, 1, ++ 2, 3, 0, 1, ++ 2, 3, 4, 0, ++ 1, 2, 3, 4, ++ 5, 0, 1, 2, ++ 3, 4, 5, 6, ++ 0, 1, 2, 3, ++ 4, 5, 6, 7, ++ 1, 2, 3, 4, ++ 5, 6, 7, 2, ++ 3, 4, 5, 6, ++ 7, 3, 4, 5, ++ 6, 7, 4, 5, ++ 6, 7, 5, 6, ++ 7, 6, 7, 7, ++}; ++ ++const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = { ++ 0, 1, 0, 2, ++ 1, 0, 3, 2, ++ 1, 0, 4, 3, ++ 2, 1, 0, 5, ++ 4, 3, 2, 1, ++ 0, 6, 5, 4, ++ 3, 2, 1, 0, ++ 7, 6, 5, 4, ++ 3, 2, 1, 0, ++ 7, 6, 5, 4, ++ 3, 2, 1, 7, ++ 6, 5, 4, 3, ++ 2, 7, 6, 5, ++ 4, 3, 7, 6, ++ 5, 4, 7, 6, ++ 5, 7, 6, 7, ++}; +diff --git a/libavcodec/rpi_hevc_data.h b/libavcodec/rpi_hevc_data.h +new file mode 100644 +index 0000000000..0aee673d8b +--- /dev/null ++++ b/libavcodec/rpi_hevc_data.h +@@ -0,0 +1,31 @@ ++/* ++ * HEVC shared data tables ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVC_DATA_H ++#define AVCODEC_RPI_HEVC_DATA_H ++ ++#include <stdint.h> /* uint8_t for the table declarations below */ ++ ++extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16]; ++extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16]; ++extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64]; ++extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64]; ++ ++#endif /* AVCODEC_RPI_HEVC_DATA_H */ +diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c +new file mode 100644 +index 0000000000..5125d1eb6b +--- /dev/null ++++ b/libavcodec/rpi_hevc_filter.c +@@ -0,0 +1,1210 @@ ++/* ++ * HEVC video decoder ++ * ++ * Originally by: ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2013 Seppo Tomperi ++ * Copyright (C) 2013 Wassim Hamidouche ++ * ++ * Substantially rewritten: ++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++//#define DISABLE_SAO ++//#define DISABLE_DEBLOCK ++//#define DISABLE_STRENGTHS ++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames) ++//#define DISABLE_DEBLOCK_NONREF ++ ++#include "libavutil/common.h" ++#include "libavutil/internal.h" ++ ++#include "rpi_hevcdec.h" ++ ++#include "bit_depth_template.c" ++ ++#include "rpi_qpu.h" ++#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#define LUMA 0 ++#define CB 1 ++#define CR 2 ++ ++// tcoffset: -12,12; qp: 0,51; (bs-1)*2: 0,2 ++// so -12,65 overall ++static const uint8_t tctablex[] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding (4 rows of 12 = the 6*8 in tctable) ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // QP 0...18 ++ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // QP 19...37 ++ 5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24, // QP 38...53 ++ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 // 54..65 ++}; ++#define tctable (tctablex + 12 + 6*8) ++ ++static const uint8_t betatablex[] = { ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding (4 rows of 12 = the 6*8 in betatable) ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ++ ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1 ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, // QP 0...18 ++ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37 ++ 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, // QP 38...51 ++ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 // 52..63 ++}; ++#define betatable (betatablex 
+ 12 + 6*8) ++ ++static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y, ++ const int c_idx, const int tc_offset) ++{ ++ return tctable[(int)s->ps.pps->qp_dblk_x[c_idx][qp_y] + tc_offset + 2]; ++} ++ ++static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int xBase, const unsigned int yBase) ++{ ++ const unsigned int ctb_size_mask = (1 << s->ps.sps->log2_ctb_size) - 1; ++ const unsigned int MinCuQpDeltaSizeMask = ~0U << s->ps.pps->log2_min_cu_qp_delta_size; ++ const unsigned int xQgBase = xBase & MinCuQpDeltaSizeMask; ++ const unsigned int yQgBase = yBase & MinCuQpDeltaSizeMask; ++ const unsigned int min_cb_width = s->ps.sps->min_cb_width; ++ const unsigned int x_cb = xQgBase >> s->ps.sps->log2_min_cb_size; ++ const unsigned int y_cb = yQgBase >> s->ps.sps->log2_min_cb_size; ++ const int qPy_pred = lc->qPy_pred; ++ ++ return (((xQgBase & ctb_size_mask) == 0 ? qPy_pred : ++ s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) + ++ ((yQgBase & ctb_size_mask) == 0 ? qPy_pred : ++ s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1; ++} ++ ++// * Only called from bitstream decode in foreground ++// so should be safe ++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase) ++{ ++ const int qp_y = get_qPy_pred(s, lc, xBase, yBase); ++ ++ if (lc->tu.cu_qp_delta != 0) { ++ // ?? I suspect that the -bd_offset here leads to us adding it elsewhere ++ int off = s->ps.sps->qp_bd_offset; ++ lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off, ++ 52 + off) - off; ++ } else ++ lc->qp_y = qp_y; ++} ++ ++static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx) ++{ ++ return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift; ++} ++ ++// "DSP" these? 
++static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift) ++{ ++ switch (pixel_shift) ++ { ++ case 2: ++ *(uint32_t *)dst = *(uint32_t *)src; ++ break; ++ case 1: ++ *(uint16_t *)dst = *(uint16_t *)src; ++ break; ++ default: ++ *dst = *src; ++ break; ++ } ++} ++ ++static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src, ++ ptrdiff_t stride_src, int x, int y, int width, int height, ++ int c_idx, int x_ctb, int y_ctb) ++{ ++ const unsigned int sh = pixel_shift(s, c_idx); ++ const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx); ++ const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx); ++ ++ /* copy horizontal edges: the first and last rows of this CTB go to rows 2*y_ctb and 2*y_ctb+1 of sao_pixel_buffer_h */ ++ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh), ++ src, width << sh); ++ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh), ++ src + stride_src * (height - 1), width << sh); ++ ++ /* copy vertical edges: the first and last columns of this CTB go to columns 2*x_ctb and 2*x_ctb+1 of sao_pixel_buffer_v */ ++ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src); ++ ++ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src); ++} ++ ++// N.B. Src & dst are swapped as this is a restore! ++// x0 & y0 are in luma coords ++// Width & height are in Y/C pels as appropriate ++// * Clear scope for optimisation here but not used enough to be worth it ++static void restore_tqb_pixels(const HEVCRpiContext * const s, ++ uint8_t *src1, const uint8_t *dst1, ++ const ptrdiff_t stride_src, const ptrdiff_t stride_dst, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int width, const int height, ++ const int c_idx) ++{ ++ if (s->ps.pps->transquant_bypass_enable_flag || ++ s->ps.sps->pcm.loop_filter_disable_flag) ++ { ++ const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width; ++ int blks_y = height >> (c_idx == 0 ? 
3 : 2); ++ const unsigned int bwidth = 8 << s->ps.sps->pixel_shift; // Y & C have the same width in sand ++ const unsigned int bheight = (c_idx == 0) ? 8 : 4; ++ const unsigned int sh = ((x0 >> 3) & 7); ++ const unsigned int mask = (1 << (width >> (c_idx == 0 ? 3 : 2))) - 1; ++ ++ do { ++ unsigned int m = (*pcm >> sh) & mask; ++ uint8_t * bd = src1; ++ const uint8_t * bs = dst1; ++ while (m != 0) { ++ if ((m & 1) != 0) { ++ s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight); ++ } ++ m >>= 1; ++ bs += bwidth; ++ bd += bwidth; ++ } ++ src1 += stride_src * bheight; ++ dst1 += stride_dst * bheight; ++ pcm += s->ps.sps->pcm_width; ++ } while (--blks_y > 0); ++ } ++} ++ ++#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)]) ++ ++static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y) ++{ ++#if SAO_FILTER_N == 5 ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#elif SAO_FILTER_N == 6 ++ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */}; ++#else ++#error Confused by size of sao fn array ++#endif ++ int c_idx; ++ int edges[4]; // 0 left 1 top 2 right 3 bottom ++ int x_ctb = x >> s->ps.sps->log2_ctb_size; ++ int y_ctb = y >> s->ps.sps->log2_ctb_size; ++ int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb; ++ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs]; ++ RpiSAOParams *sao = &CTB(s->sao, x_ctb, y_ctb); ++ // flags indicating unfilterable edges ++ uint8_t vert_edge[] = { 0, 0 }; ++ uint8_t horiz_edge[] = { 0, 0 }; ++ uint8_t diag_edge[] = { 0, 0, 0, 0 }; ++ uint8_t lfase = CTB(s->filter_slice_edges, x_ctb, y_ctb); ++ uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag && ++ !s->ps.pps->loop_filter_across_tiles_enabled_flag; ++ uint8_t restore = no_tile_filter || !lfase; ++ uint8_t left_tile_edge = 0; ++ uint8_t right_tile_edge = 0; ++ uint8_t up_tile_edge = 0; ++ 
uint8_t bottom_tile_edge = 0; ++ const int sliced = 1; ++ const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 3 : 1); ++ ++ edges[0] = x_ctb == 0; ++ edges[1] = y_ctb == 0; ++ edges[2] = x_ctb == s->ps.sps->ctb_width - 1; ++ edges[3] = y_ctb == s->ps.sps->ctb_height - 1; ++ ++#ifdef DISABLE_SAO ++ return; ++#endif ++ ++ if (restore) { ++ if (!edges[0]) { ++ left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]]; ++ vert_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge; ++ } ++ if (!edges[2]) { ++ right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]]; ++ vert_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge; ++ } ++ if (!edges[1]) { ++ up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]]; ++ horiz_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge; ++ } ++ if (!edges[3]) { ++ bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]]; ++ horiz_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge; ++ } ++ if (!edges[0] && !edges[1]) { ++ diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge; ++ } ++ if (!edges[1] && !edges[2]) { ++ diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge; ++ } ++ if (!edges[2] && 
!edges[3]) { ++ diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge; ++ } ++ if (!edges[0] && !edges[3]) { ++ diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge; ++ } ++ } ++ ++ for (c_idx = 0; c_idx < plane_count; c_idx++) { ++ const unsigned int vshift = ctx_vshift(s, c_idx); ++ const unsigned int hshift = ctx_hshift(s, c_idx); ++ const int x0 = x >> hshift; ++ const int y0 = y >> vshift; ++ const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx); ++ const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift; ++ const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift; ++ const int width = FFMIN(ctb_size_h, (s->ps.sps->width >> hshift) - x0); ++ const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0); ++ int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1]; ++ ptrdiff_t stride_dst; ++ uint8_t *dst; ++ ++ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0); ++ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */; ++ uint8_t * const src = !sliced ? ++ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] : ++ c_idx == 0 ? ++ av_rpi_sand_frame_pos_y(s->frame, x0, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0, y0); ++ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL : ++ !sliced ? src - (1 << sh) : ++ c_idx == 0 ? ++ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0); ++ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL : ++ !sliced ? src + (width << sh) : ++ c_idx == 0 ? 
++ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) : ++ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0); ++ ++ if (sliced && c_idx > 1) { ++ break; ++ } ++ ++// if (c_idx == 1) ++// printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr); ++ ++ switch (sao->type_idx[c_idx]) { ++ case SAO_BAND: ++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, ++ x_ctb, y_ctb); ++ if (s->ps.pps->transquant_bypass_enable_flag || ++ s->ps.sps->pcm.loop_filter_disable_flag) ++ { ++ // Can't use the edge buffer here as it may be in use by the foreground ++ DECLARE_ALIGNED(64, uint8_t, dstbuf) ++ [2*MAX_PB_SIZE*MAX_PB_SIZE]; ++ dst = dstbuf; ++ stride_dst = 2*MAX_PB_SIZE; ++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++ { ++ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } ++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++ x, y, width, height, c_idx); ++ } else { ++ if (sliced && c_idx != 0) ++ { ++ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src, ++ sao->offset_val[1], sao->band_position[1], ++ sao->offset_val[2], sao->band_position[2], ++ width, height); ++ } ++ else ++ { ++ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src, ++ sao->offset_val[c_idx], sao->band_position[c_idx], ++ width, height); ++ } ++ } ++ sao->type_idx[c_idx] = SAO_APPLIED; ++ break; ++ case SAO_EDGE: ++ { ++ const int w = s->ps.sps->width >> hshift; ++ const int h = s->ps.sps->height >> vshift; ++ int top_edge = edges[1]; ++ int bottom_edge = edges[3]; ++ // Can't use the edge buffer here as it may be in use by the foreground ++ DECLARE_ALIGNED(64, uint8_t, dstbuf) ++ 
[RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64]; ++ ++ stride_dst = RPI_HEVC_SAO_BUF_STRIDE; ++ dst = dstbuf + stride_dst + 32; ++ ++ if (!top_edge) { ++ uint8_t *dst1; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh); ++ ++ dst1 = dst - stride_dst; ++ ++ if (src_l != NULL) { ++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh); ++ } ++ ++ src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh); ++ ++ if (src_r != NULL) { ++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] == ++ SAO_APPLIED); ++ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh); ++ } ++ } ++ if (!bottom_edge) { ++ uint8_t * const dst1 = dst + height * stride_dst; ++ int src_idx; ++ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh); ++ const unsigned int hoff = height * stride_src; ++ ++ if (src_l != NULL) { ++ src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh); ++ } ++ ++ src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh); ++ ++ if (src_r != NULL) { ++ src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] == ++ SAO_APPLIED); ++ copy_pixel(dst1 + (width << sh), src_idx ? 
src_spb + (width << sh) : src_r + hoff, sh); ++ } ++ } ++ if (src_l != NULL) { ++ if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { ++ ff_hevc_rpi_copy_vert(dst - (1 << sh), ++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh), ++ sh, height, stride_dst, 1 << sh); ++ } else { ++ ff_hevc_rpi_copy_vert(dst - (1 << sh), ++ src_l, ++ sh, height, stride_dst, stride_src); ++ } ++ } ++ if (src_r != NULL) { ++ if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) { ++ ff_hevc_rpi_copy_vert(dst + (width << sh), ++ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh), ++ sh, height, stride_dst, 1 << sh); ++ } else { ++ ff_hevc_rpi_copy_vert(dst + (width << sh), ++ src_r, ++ sh, height, stride_dst, stride_src); ++ } ++ } ++ ++ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height); ++ ++ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx, ++ x_ctb, y_ctb); ++ if (sliced && c_idx != 0) ++ { ++ // Class always the same for both U & V (which is just as well :-)) ++ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src, ++ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1], ++ width, height); ++ s->hevcdsp.sao_edge_restore_c[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } ++ else ++ { ++ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx], ++ sao->eo_class[c_idx], width, height); ++ s->hevcdsp.sao_edge_restore[restore](src, dst, ++ stride_src, stride_dst, ++ sao, ++ edges, width, ++ height, c_idx, ++ vert_edge, ++ horiz_edge, ++ diag_edge); ++ } ++ restore_tqb_pixels(s, src, dst, stride_src, stride_dst, ++ x, y, width, height, c_idx); ++ sao->type_idx[c_idx] = SAO_APPLIED; ++ break; ++ } ++ } ++ } ++ ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL && ++ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 
255) == 0 || edges[2])) ++ { ++ const unsigned int stride1 = frame_stride1(s->frame, 1); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame); ++ const unsigned int xoff = (x >> 8) * stride2 * stride1; ++ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size); ++ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1; ++ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1; ++ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1; ++ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1; ++ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255); ++ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y; ++ ++// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size); ++ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3); ++ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3); ++ } ++#endif ++} ++ ++// When bits are delivered to deblock we want them ++//#define TL 1 ++//#define TR 2 ++//#define BL 4 ++//#define BR 8 ++ ++// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br ++// so we need to rearrange before passing on. Bytes are widened to uint32_t before shifting so that << 24 of a byte >= 0x80 cannot overflow a signed int ++ ++static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) ++{ ++ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; ++ return ((uint32_t)pcm[0] | ++ ((uint32_t)pcm[1] << 8) | ++ ((uint32_t)pcm[s->ps.sps->pcm_width] << 16) | ++ ((uint32_t)pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7); ++} ++ ++static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) ++{ ++ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width; ++ return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7); ++} ++ ++// We cast away const here as we 
want this to work for both get and set ++static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) ++{ ++ return (uint32_t *)(bs + ++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 ++#warning Unexpected masks ++ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes ++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & ++ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) + ++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 ++#error Stride1 < return size ++#endif ++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + ++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); ++} ++ ++static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y) ++{ ++ return (uint8_t *)(bs + ++ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) & ++ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) + ++ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) + ++ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2); ++} ++ ++ ++// Get block strength for pels xl up to (not including) xr on row y; an empty range (xr <= xl) yields 0 ++// Given how we call we will always get within the 32bit boundaries ++static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2, ++ unsigned int xl, unsigned int xr, const unsigned int y) ++{ ++ if (xr <= xl) { ++ return 0; ++ } ++ else ++ { ++#if HAVE_ARMV6T2_INLINE ++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0 ++#error This case not yet handled in bs_get32 ++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4 ++#error Stride1 < return size ++#endif ++ uint32_t tmp; ++ __asm__ ( ++ "lsr %[tmp], %[xl], %[xl_shift] \n\t" ++ "rsb %[xr], %[xl], %[xr] \n\t" ++ "mla %[stride2], %[stride2], %[tmp], %[bs] \n\t" ++ "add %[xr], %[xr], #7 \n\t" ++ "lsr %[bs], %[y], %[y_shift1] \n\t" ++ "bic %[xr], %[xr], #7 \n\t" ++ "ubfx %[xl], %[xl], #1, #5 \n\t" ++ "lsr %[xr], %[xr], #1 \n\t" ++ "cmp %[xr], #32 \n\t" ++ 
"mvn %[tmp], #0 \n\t" ++ "ldr %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t" ++ "lsl %[tmp], %[tmp], %[xr] \n\t" ++ "lsr %[xl], %[bs], %[xl] \n\t" ++ "it ne \n\t" ++ "bicne %[bs], %[xl], %[tmp] \n\t" ++ : // Outputs ++ [bs]"+r"(bs), ++ [stride2]"+r"(stride2), ++ [xl]"+r"(xl), ++ [xr]"+r"(xr), ++ [tmp]"=&r"(tmp) ++ : // Inputs ++ [y]"r"(y), ++ [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT), ++ [y_shift1]"M"(HEVC_RPI_BS_Y_SHR), ++ [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++ : // Clobbers ++ "cc" ++ ); ++ return (uint32_t) bs; ++#else ++ const uint32_t a = *bs_ptr32(bs, stride2, xl, y); ++ const unsigned int n = ((xr - xl + 7) & ~7) >> 1; ++ ++ return n == 32 ? a : ++ (a >> ((xl >> 1) & 31)) & ~(~0U << n); ++#endif ++ } ++} ++ ++static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++{ ++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); ++ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y); ++} ++ ++static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y) ++{ ++ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0); ++ return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y); ++} ++ ++ ++static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) ++{ ++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; ++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ const unsigned int ctb_size = (1 << log2_ctb_size); ++ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 1); ++ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; ++ const DBParams * cb_dbp = s->deblock + ctb_n; ++ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 
0 : 8); ++ ++ unsigned int cb_x; ++ ++ // Do in CTB-shaped blocks ++ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp) ++ { ++ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); ++ const unsigned int bv_l = FFMAX(cb_x, 8); ++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r - 8 : cb_x + ctb_size - 9; ++ const unsigned int bh_l = bv_l - 8; ++ unsigned int y; ++ ++ // Main body ++ for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8) ++ { ++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y); ++ ++ const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp; ++ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ ++ if (vbs != 0) ++ { ++ const uint8_t * const tcv = tctable + dbp->tc_offset; ++ const uint8_t * const betav = betatable + dbp->beta_offset; ++ unsigned int pcmfa = pcm2(s, bv_l - 1, y); ++ unsigned int x; ++ ++ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1) ++ { ++ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3) ++ { ++ const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ betav[qp], ++ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) | ++ (((vbs & 0xc) == 0 ? 
0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16), ++ pcmfa & 3, ++ av_rpi_sand_frame_pos_y(s->frame, x - 4, y)); ++ } ++ } ++ } ++ ++ if (y != 0) ++ { ++ uint32_t hbs; ++ ++ // H left - mostly separated out so we only need a uint32_t hbs ++ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0) ++ { ++ const unsigned int x = bh_l; ++ const unsigned int pcmfa = pcm4(s, bh_l, y - 1); ++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const DBParams * const dbph = dbp - 1; ++ const uint8_t * const tc = tctable + dbph->tc_offset + qp; ++ ++ av_assert2(cb_x - bh_l == 8); ++ ++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ betatable[qp + dbph->beta_offset], ++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | ++ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16), ++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); ++ } ++ ++ // H ++ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop ++ { ++ unsigned int x; ++ unsigned int pcmfa = pcm4(s, cb_x, y - 1); ++ ++ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1) ++ { ++ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0) ++ { ++ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const uint8_t * const tc = tctable + dbp->tc_offset + qp; ++ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y), ++ frame_stride1(s->frame, LUMA), ++ betatable[qp + dbp->beta_offset], ++ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) | ++ (((hbs & 0xc) == 0 ? 
0 : tc[(hbs >> 2) & 2]) << 16), ++ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15)); ++ } ++ } ++ } ++ } ++ ++ } ++ } ++} ++ ++static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y) ++{ ++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ const int8_t * const qt = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ return (qt[(x - 1) >> log2_min_cb_size] + qt[x >> log2_min_cb_size] + 1) >> 1; ++} ++ ++static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y) ++{ ++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; ++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ const unsigned int ctb_size = (1 << log2_ctb_size); ++ const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 : 8); ++ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size; ++ const DBParams * dbp = s->deblock + ctb_n; ++ const unsigned int b_b = bounds.y + bounds.h - (end_y ? 
0 : 8); ++ const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1]; ++ const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2]; ++ ++ unsigned int cb_x; ++ ++ av_assert1((bounds.x & (ctb_size - 1)) == 0); ++ av_assert1((bounds.y & (ctb_size - 1)) == 0); ++ av_assert1(bounds.h <= ctb_size); ++ ++ // Do in CTB-shaped blocks ++ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) { ++ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r); ++ const unsigned int bv_l = FFMAX(cb_x, 16); ++ unsigned int y; ++ ++ // V above ++ if (bounds.y != 0) { ++ // Deblock V up 8 ++ // CTB above current ++ // Top-half only (tc4 & ~0xffff == 0) is special cased in asm ++ const unsigned int y = bounds.y - 8; ++ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U; ++ ++ if (vbs != 0) ++ { ++ unsigned int pcmfa = pcm2(s, bv_l - 1, y); ++ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset; ++ unsigned int x; ++ ++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) ++ { ++ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0) ++ { ++ const int qp0 = q2h(s, x, y); ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8), ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ pcmfa & 3); ++ } ++ } ++ } ++ } ++ ++ for (y = bounds.y; y < b_b; y += 16) ++ { ++ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) | ++ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4); ++ ++ // V ++ if (vbs != 0) ++ { ++ unsigned int x; ++ unsigned int pcmfa = ++ (y + 16 > b_b ? 
++ pcm2(s, bv_l - 1, y) | 0xffff0000 : ++ pcm4(s, bv_l - 1, y)); ++ const uint8_t * const tc = tctable + 2 + dbp->tc_offset; ++ ++ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2) ++ { ++ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) ++ { ++ const int qp0 = q2h(s, x, y); ++ const int qp1 = q2h(s, x, y + 8); ++ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | ++ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), ++ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1), ++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); ++ } ++ } ++ } ++ ++ // H ++ if (y != 0) ++ { ++ uint32_t hbs; ++ const unsigned int bh_l = bv_l - 16; ++ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16; ++ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; ++ ++ // H left - mostly separated out so we only need a uint32_t hbs ++ // Stub is width 8 to the left of bounds, but width 16 internally ++ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0) ++ { ++ unsigned int pcmfa = pcm4(s, bh_l, y - 1); ++ ++ // Chop off bits we don't want... ++ if (bh_l < bounds.x) { ++ pcmfa |= 0x10001; // TL|BL pre rearrangement ++ hbs &= ~3; // Make BS 0 ++ } ++ ++ // Double check we still want this ++ if (hbs != 0 && (~pcmfa & 0x30003) != 0) ++ { ++ const unsigned int x = bh_l; ++ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; ++ const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset; ++ ++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ ((hbs & 2) == 0 ? 
0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | ++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), ++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); ++ } ++ } ++ ++ // H main ++ if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0) ++ { ++ unsigned int x; ++ unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it ++ ++ for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2) ++ { ++ if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0) ++ { ++ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1; ++ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1; ++ const uint8_t * const tc = tctable + 2 + dbp->tc_offset; ++ ++ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1), ++ frame_stride1(s->frame, 1), ++ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) | ++ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)), ++ (pcmfa & 3) | ((pcmfa >> 14) & 0xc)); ++ } ++ } ++ } ++ } ++ } ++ } ++} ++ ++static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n) ++{ ++ return x & ~(~0U << log2_n); ++} ++ ++static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) ++{ ++ av_assert2((y & 7) == 0); ++ ++ // This doesn't have the same simultainious update issues that bsf_stash ++ // does (other threads will have a different y) so we can do it the easy way ++ if ((bsf &= mask) != 0) ++ *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31); ++} ++ ++ ++static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf) ++{ ++ // We arrange this in a slightly odd fashion but it lines up with ++ // how we are going to use it in the actual deblock code & it is easier ++ // to 
do the contortions here than there ++ // ++ // Arrange (LE) {x0y0, x0y4, x8y0, x8,y4}, {x16y0, x16y4, x24y0, x24y4},... ++ ++ av_assert2((x & 7) == 0); ++ ++ if ((bsf &= mask) != 0) ++ { ++ uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y); ++ const unsigned int sh = ((x & 8) | (y & 4)) >> 1; ++ ++ if (mask <= 0xf) ++ { ++ *p |= (bsf << sh); ++ } ++ else ++ { ++ do { ++ *p |= (bsf & 0xf) << sh; ++ p += HEVC_RPI_BS_STRIDE1_BYTES; ++ } while ((bsf >>= 4) != 0); ++ } ++ } ++} ++ ++static inline uint32_t bsf_mv(const HEVCRpiContext * const s, ++ const unsigned int rep, const unsigned int dup, ++ const unsigned int mvf_stride0, ++ const unsigned int mvf_stride1, ++ const RefPicList * const rpl_p, const RefPicList * const rpl_q, ++ const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q) ++{ ++ return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup, ++ mvf_p, mvf_q, ++ rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list, ++ sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1); ++} ++ ++ ++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, ++ const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_trafo_size, ++ const int is_coded_block) ++{ ++ const HEVCRpiMvField * const mvf_curr = mvf_stash_ptr(s, lc, x0, y0); ++ const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE; ++ const RefPicList * const rpl = s->refPicList; ++ // Rep count for bsf_mv when running with min_pu chuncks ++ const unsigned int log2_rep_min_pu = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size; ++ const unsigned int boundary_flags = s->sh.no_dblk_boundary_flags & lc->boundary_flags; ++ const unsigned int trafo_size = (1U << log2_trafo_size); ++ const uint32_t bsf_mask = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1; ++ const uint32_t bsf_cbf = (bsf_mask & 0x55555555); ++ ++ // Do we cover a pred split line? 
++ const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split; ++ const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split; ++ ++ uint32_t bsf_h; ++ uint32_t bsf_v; ++ ++#ifdef DISABLE_STRENGTHS ++ return; ++#endif ++ ++ // We are always on a size boundary ++ av_assert2((x0 & (trafo_size - 1)) == 0); ++ av_assert2((y0 & (trafo_size - 1)) == 0); ++ // log2_trafo_size not really a transform size; we can have to deal ++ // with size 2^6 blocks ++ av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6); ++ ++ // Retrieve and update coded (b0), intra (b1) bs flags ++ // ++ // Store on min width (rather than uint32_t) to avoid possible issues ++ // with another thread on another core running wpp using the same ++ // memory (min CTB = 16 pels = 4 bsf els = 8 bits) ++ // ++ // In bsf BS=2 is represented by 3 as it is much easier to test & set ++ // and the actual deblock code tests for 0 and b1 set/not-set so 2 and ++ // 3 will work the same ++ { ++ // Given where we are called from is_cbf_luma & is_intra will be constant over the block ++ const uint32_t bsf0 = (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask : is_coded_block ? 
bsf_cbf : 0; ++ uint8_t *const p = s->bsf_stash_up + (x0 >> 4); ++ uint8_t *const q = s->bsf_stash_left + (y0 >> 4); ++ ++ switch (log2_trafo_size) ++ { ++ case 2: ++ case 3: ++ { ++ const unsigned int sh_h = (x0 >> 1) & 7; ++ const unsigned int sh_v = (y0 >> 1) & 7; ++ bsf_h = *p; ++ bsf_v = *q; ++ *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h); ++ *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v); ++ bsf_h >>= sh_h; ++ bsf_v >>= sh_v; ++ break; ++ } ++ case 4: ++ bsf_h = *p; ++ bsf_v = *q; ++ *p = bsf0; ++ *q = bsf0; ++ break; ++ case 5: ++ bsf_h = *(uint16_t *)p; ++ bsf_v = *(uint16_t *)q; ++ *(uint16_t *)p = bsf0; ++ *(uint16_t *)q = bsf0; ++ break; ++ case 6: ++ default: ++ bsf_h = *(uint32_t *)p; ++ bsf_v = *(uint32_t *)q; ++ *(uint32_t *)p = bsf0; ++ *(uint32_t *)q = bsf0; ++ break; ++ } ++ ++ bsf_h |= bsf0; ++ bsf_v |= bsf0; ++ } ++ ++ // Do Horizontal ++ if ((y0 & 7) == 0) ++ { ++ // Boundary upper ++ if (y0 != 0 && ++ (off_boundary(y0, s->ps.sps->log2_ctb_size) || ++ (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0)) ++ { ++ // Look at MVs (BS=1) if we don't already has a full set of bs bits ++ if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split)) ++ { ++ // If we aren't on the top boundary we must be in the middle ++ // and in that case we know where mvf can change ++ const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0; ++ const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ? 
++ s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] : ++ rpl; ++ ++ bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), ++ trafo_size >> (log2_min_pu_size + log2_rep), ++ trafo_size >> (log2_min_pu_size + log2_rep), ++ rpl, rpl_top, ++ mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1)); ++ } ++ ++ // Finally put the results into bs ++ hbs_set(s, x0, y0, bsf_mask, bsf_h); ++ } ++ ++ // Max of 1 pu internal split - ignore if not on 8pel boundary ++ if (has_y_split && !off_boundary(lc->cu.y_split, 3)) ++ { ++ const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split); ++ // If we have the x split as well then it must be in the middle ++ const unsigned int log2_rep = has_x_split ? 1 : 0; ++ ++ hbs_set(s, x0, lc->cu.y_split, bsf_mask, ++ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), ++ trafo_size >> (log2_min_pu_size + log2_rep), ++ trafo_size >> (log2_min_pu_size + log2_rep), ++ rpl, rpl, ++ mvf, mvf - MVF_STASH_WIDTH_PU)); ++ } ++ } ++ ++ // And again for vertical - same logic as horizontal just in the other direction ++ if ((x0 & 7) == 0) ++ { ++ // Boundary left ++ if (x0 != 0 && ++ (off_boundary(x0, s->ps.sps->log2_ctb_size) || ++ (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0)) ++ { ++ if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split)) ++ { ++ const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0; ++ const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ? 
++ s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] : ++ rpl; ++ ++ bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), ++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), ++ (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep), ++ rpl, rpl_left, ++ mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0)); ++ } ++ ++ vbs_set(s, x0, y0, bsf_mask, bsf_v); ++ } ++ ++ if (has_x_split && !off_boundary(lc->cu.x_split, 3)) ++ { ++ const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0); ++ const unsigned int log2_rep = has_y_split ? 1 : 0; ++ ++ vbs_set(s, lc->cu.x_split, y0, bsf_mask, ++ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep), ++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), ++ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep), ++ rpl, rpl, ++ mvf, mvf - 1)); ++ } ++ } ++} ++ ++#undef LUMA ++#undef CB ++#undef CR ++ ++static inline unsigned int ussub(const unsigned int a, const unsigned int b) ++{ ++ return a < b ? 
0 : a - b; ++} ++ ++static inline int cache_boundry(const AVFrame * const frame, const unsigned int x) ++{ ++ return ((x >> av_rpi_sand_frame_xshl(frame)) & ~63) == 0; ++} ++ ++int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot) ++{ ++ const int ctb_size = (1 << s->ps.sps->log2_ctb_size); ++ int x, y; ++ ++ const unsigned int br = bounds.x + bounds.w; ++ const unsigned int bb = bounds.y + bounds.h; ++ ++ const int x_end = (br >= s->ps.sps->width); ++ const int y_end = (bb >= s->ps.sps->height); ++ ++ // Deblock may not touch the edges of the bound as they are still needed ++ // for Intra pred ++ // ++ // Deblock is disabled with a per-slice flag ++ // Given that bounds may cover multiple slices & we dblock outside bounds ++ // anyway we can't avoid deblock using that flag - about the only thing we ++ // could do is have a "no deblock seen yet" flag but it doesn't really ++ // seem worth the effort ++ ++ deblock_y_blk(s, bounds, x_end, y_end); ++ deblock_uv_blk(s, bounds, x_end, y_end); ++ ++ // SAO needs ++ // (a) CTB alignment ++ // (b) Valid pixels all the way around the CTB in particular it needs the DR pixel ++ { ++ const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1)); ++ const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1)); ++ const unsigned int yt = ussub(bounds.y, yo); ++ const unsigned int yb = y_end ? bb : ussub(bb, yo); ++ const unsigned int xl = ussub(bounds.x, xo); ++ const unsigned int xr = x_end ? br : ussub(br, xo); ++ ++ if (s->ps.sps->sao_enabled) ++ { ++ for (y = yt; y < yb; y += ctb_size) { ++ for (x = xl; x < xr; x += ctb_size) { ++ sao_filter_CTB(s, x, y); ++ } ++ } ++ } ++ ++ // Cache invalidate ++ y = 0; ++ if (xr != 0 && yb != 0) ++ { ++ const unsigned int llen = ++ (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame)); ++ const unsigned int mask = ~(llen - 1); ++ const unsigned int il = (xl == 0) ? 
0 : (xl - 1) & mask; ++ const unsigned int ir = x_end || !cache_boundry(s->frame, br) ? br : (xr - 1) & mask; ++ const unsigned int it = ussub(yt, 1); ++ const unsigned int ib = y_end ? bb : yb - 1; ++ ++ if (il < ir) { ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf); ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ il, it, ir - il, ib - it, ++ ctx_vshift(s, 1), 1, 1); ++ ++ // If we have to commit the right hand tile boundry due to ++ // cache boundry considerations then at EoTile we must commit ++ // that boundry to bottom of tile (bounds) ++ if (ib != bb && ir == br && eot) { ++ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, ++ br - 1, ib, 1, bb - ib, ++ ctx_vshift(s, 1), 1, 1); ++ } ++ ++ rpi_cache_flush_finish(rfe); ++ ++ if (x_end) ++ y = y_end ? INT_MAX : ib; ++ ++// printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1); ++ } ++ } ++ } ++ ++ return y; ++} ++ +diff --git a/libavcodec/rpi_hevc_mv.h b/libavcodec/rpi_hevc_mv.h +new file mode 100644 +index 0000000000..6b36f5e737 +--- /dev/null ++++ b/libavcodec/rpi_hevc_mv.h +@@ -0,0 +1,71 @@ ++#ifndef AVCODEC_RPI_HEVC_MV_H ++#define AVCODEC_RPI_HEVC_MV_H ++ ++#include "config.h" ++ ++typedef int32_t MvXY; ++ ++typedef struct HEVCRpiMvField { ++ MvXY xy[2]; ++ int8_t ref_idx[2]; ++ int8_t pred_flag; ++ int8_t dummy; // To 12 bytes ++} HEVCRpiMvField; ++ ++ ++#define MV_X(xy) (((xy) << 16) >> 16) ++#define MV_Y(xy) ((xy) >> 16) ++#define MV_XY(x, y) ((x & 0xffff) | ((y) << 16)) ++ ++#if ARCH_ARM ++#include "arm/rpi_hevc_mv_arm.h" ++#endif ++ ++#ifndef mvxy_add ++static inline MvXY mvxy_add(const MvXY a, const MvXY b) ++{ ++ return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b)); ++} ++#endif ++ ++ ++#ifndef mv_scale_xy ++static inline MvXY mv_scale_xy(const MvXY const src, int td, int tb) ++{ ++ int tx, scale_factor; ++ ++ td = td == 0 ? 
1 : av_clip_int8(td); ++ tb = av_clip_int8(tb); ++ tx = (0x4000 + (abs(td) >> 1)) / td; ++ scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12); ++ return MV_XY( ++ av_clip_int16((scale_factor * MV_X(src) + 127 + ++ (scale_factor * MV_X(src) < 0)) >> 8), ++ av_clip_int16((scale_factor * MV_Y(src) + 127 + ++ (scale_factor * MV_Y(src) < 0)) >> 8)); ++} ++#endif ++ ++// 8.3.1 states that the bitstream may not contain poc diffs that do not ++// fit in 16 bits, so given that we don't care about the high bits we only ++// store the low 16 + LT & Inter flags ++ ++#define COL_POC_INTRA 0 ++#define COL_POC_INTER (1 << 16) ++#define COL_POC_LT (1 << 17) ++#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y))) ++#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff)) ++#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0) ++ ++typedef struct ColMv_s { ++ int32_t poc; ++ int32_t xy; ++} ColMv; ++ ++typedef struct ColMvField_s { ++ ColMv L[2]; ++} ColMvField; ++ ++ ++ ++#endif // AVCODEC_RPI_HEVC_MV_H +diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c +new file mode 100644 +index 0000000000..27a9f69525 +--- /dev/null ++++ b/libavcodec/rpi_hevc_mvs.c +@@ -0,0 +1,487 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2013 Anand Meher Kotra ++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "hevc.h" ++#include "rpi_hevcdec.h" ++ ++static av_always_inline int ++is_eq_mer(const unsigned int plevel, ++ const unsigned int xN, const unsigned int yN, ++ const unsigned int xP, const unsigned int yP) ++{ ++ return (((xN ^ xP) | (yN ^ yP)) >> plevel) == 0; ++} ++ ++// check if the mv's and refidx are the same between A and B ++static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) ++{ ++ return a->pred_flag == b->pred_flag && ++ ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) && ++ ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1])); ++ return 0; ++} ++ ++/* ++ * 8.5.3.1.7 temporal luma motion vector prediction ++ */ ++static int temporal_luma_motion_vector(const HEVCRpiContext * const s, ++ const HEVCRpiLocalContext * const lc, const int x0, const int y0, ++ const int nPbW, const int nPbH, const int refIdxLx, ++ MvXY * const mvLXCol, const int X) ++{ ++ int x, y; ++ const ColMv * cmv = NULL; ++ ++ HEVCRpiFrame * const col_ref = s->ref->collocated_ref; ++ const RefPicList * const refPicList = s->refPicList + X; ++ const int cur_lt = refPicList->isLongTerm[refIdxLx]; ++ ++ *mvLXCol = 0; ++ // Unlikely but we might have a col_ref IDR frame! 
++ if (col_ref->col_mvf == NULL) ++ return 0; ++ ++ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH); ++ ++ //bottom right collocated motion vector ++ x = x0 + nPbW; ++ y = y0 + nPbH; ++ ++ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && ++ y < s->ps.sps->height && ++ x < s->ps.sps->width) ++ { ++ const ColMvField * const col = col_ref->col_mvf + (x >> 4) + ++ (y >> 4) * s->col_mvf_stride; ++ ++ if (col->L[0].poc != COL_POC_INTRA && ++ (col->L[1].poc == COL_POC_INTRA || ++ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) ++ { ++ cmv = col->L + 0; ++ } ++ else if (col->L[1].poc != COL_POC_INTRA) ++ { ++ cmv = col->L + 1; ++ } ++ } ++ ++ // derive center collocated motion vector ++ if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt) ++ { ++ cmv = NULL; ++ x = x0 + (nPbW >> 1); ++ y = y0 + (nPbH >> 1); ++ ++ { ++ const ColMvField * const col = col_ref->col_mvf + (x >> 4) + ++ (y >> 4) * s->col_mvf_stride; ++ ++ if (col->L[0].poc != COL_POC_INTRA && ++ (col->L[1].poc == COL_POC_INTRA || ++ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0))) ++ { ++ cmv = col->L + 0; ++ } ++ else if (col->L[1].poc != COL_POC_INTRA) ++ { ++ cmv = col->L + 1; ++ } ++ } ++ } ++ ++ if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc)) ++ return 0; ++ ++ { ++ const int col_poc = col_ref->poc; ++ const int ref_poc = refPicList->list[refIdxLx]; ++ ++ *mvLXCol = (cur_lt || ++ cmv->poc == col_poc || ++ COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ? 
++ cmv->xy : ++ mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc); ++ } ++ ++ return cmv != NULL; ++} ++ ++static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b) ++{ ++ return b != NULL && compare_mv_ref_idx(a, b); ++} ++ ++ ++ ++/* ++ * 8.5.3.1.2 Derivation process for spatial merging candidates ++ */ ++static inline const HEVCRpiMvField * ++derive_spatial_merge_candidates( ++ const HEVCRpiContext * const s, ++ const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int avail, ++ const unsigned int part_idx, ++ const unsigned int merge_idx, ++ HEVCRpiMvField * const mvf_t) ++{ ++ const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N); ++ const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD); ++ ++ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); ++ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); ++ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); ++ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; ++ const unsigned int plevel = s->ps.pps->log2_parallel_merge_level; ++ const unsigned int part_mode = lc->cu.part_mode; ++ ++ const HEVCRpiMvField * perm[4]; ++ unsigned int nb_merge_cand = 0; ++ ++ // singleMCLFlag => part_idx == 0 so no need to test for it ++ if ((avail & AVAIL_L) == 0 || ++ (part_idx == 1 && ++ ((parts_a1 >> part_mode) & 1) != 0 || ++ is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) || ++ mvf_a1->pred_flag == PF_INTRA) ++ { ++ mvf_a1 = NULL; ++ } ++ else ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_a1; ++ perm[nb_merge_cand++] = mvf_a1; ++ } ++ ++ if ((avail & AVAIL_U) == 0 || ++ (part_idx == 1 && ++ ((parts_b1 >> part_mode) & 1) != 0 || ++ is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) || ++ mvf_b1->pred_flag == PF_INTRA) ++ { ++ mvf_b1 = 
NULL; ++ } ++ else if (!mvf_eq(mvf_b1, mvf_a1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_b1; ++ perm[nb_merge_cand++] = mvf_b1; ++ } ++ ++ // above right spatial merge candidate ++ // Never need mvf_b0 again so don't bother zeroing if navail ++ if ((avail & AVAIL_UR) != 0 && ++ !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) && ++ mvf_b0->pred_flag != PF_INTRA && ++ !mvf_eq(mvf_b0, mvf_b1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_b0; ++ perm[nb_merge_cand++] = mvf_b0; ++ } ++ ++ // left bottom spatial merge candidate ++ // Never need mvf_a0 again so don't bother zeroing if navail ++ if ((avail & AVAIL_DL) != 0 && ++ !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) && ++ mvf_a0->pred_flag != PF_INTRA && ++ !mvf_eq(mvf_a0, mvf_a1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_a0; ++ perm[nb_merge_cand++] = mvf_a0; ++ } ++ ++ // above left spatial merge candidate ++ if (nb_merge_cand != 4 && ++ (avail & AVAIL_UL) != 0 && ++ !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0)) ++ { ++ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL ++ ++ if (mvf_b2->pred_flag != PF_INTRA && ++ !mvf_eq(mvf_b2, mvf_a1) && ++ !mvf_eq(mvf_b2, mvf_b1)) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_b2; ++ perm[nb_merge_cand++] = mvf_b2; ++ } ++ } ++ ++ // temporal motion vector candidate ++ if (s->sh.slice_temporal_mvp_enabled_flag) ++ { ++ static const HEVCRpiMvField mvf_z = {{0}}; ++ ++ *mvf_t = mvf_z; ++ ++ if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, ++ 0, mvf_t->xy + 0, 0)) ++ mvf_t->pred_flag = PF_L0; ++ ++ if (s->sh.slice_type == HEVC_SLICE_B && ++ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH, ++ 0, mvf_t->xy + 1, 1)) ++ mvf_t->pred_flag |= PF_L1; ++ ++ if (mvf_t->pred_flag != 0) ++ { ++ if (merge_idx == nb_merge_cand) ++ return mvf_t; ++ perm[nb_merge_cand++] = mvf_t; ++ } ++ } ++ ++ // combined bi-predictive merge candidates (applies for B slices) ++ if (s->sh.slice_type == HEVC_SLICE_B && 
nb_merge_cand > 1) ++ { ++ unsigned int comb_idx = 0; ++ const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1); ++ const RefPicList * const refPicList = s->refPicList; ++ ++ for (comb_idx = 0; comb_idx < cand_count; comb_idx++) ++ { ++ static const uint8_t l0_l1_cand_idx[12][2] = { ++ { 0, 1, }, ++ { 1, 0, }, ++ { 0, 2, }, ++ { 2, 0, }, ++ { 1, 2, }, ++ { 2, 1, }, ++ { 0, 3, }, ++ { 3, 0, }, ++ { 1, 3, }, ++ { 3, 1, }, ++ { 2, 3, }, ++ { 3, 2, }, ++ }; ++ ++ const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0]; ++ const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1]; ++ const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx]; ++ const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx]; ++ ++ if ((mvf_c0->pred_flag & PF_L0) != 0 && ++ (mvf_c1->pred_flag & PF_L1) != 0 && ++ (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] || ++ mvf_c0->xy[0] != mvf_c1->xy[1])) ++ { ++ if (merge_idx == nb_merge_cand++) ++ { ++ // Need to be a bit careful as we will construct mvf_t and we ++ // may already be using that as one of our condidates ++ // so build & copy rather than build in place ++ const HEVCRpiMvField mvf_m = { ++ .xy = { ++ mvf_c0->xy[0], ++ mvf_c1->xy[1]}, ++ .ref_idx = { ++ mvf_c0->ref_idx[0], ++ mvf_c1->ref_idx[1]}, ++ .pred_flag = PF_BI ++ }; ++ *mvf_t = mvf_m; ++ return mvf_t; ++ } ++ } ++ } ++ } ++ ++ // "append" Zero motion vector candidates ++ { ++ const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ? ++ FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0]; ++ const unsigned int zero_idx = merge_idx - nb_merge_cand; ++ ++ const HEVCRpiMvField mvf_m = { ++ .xy = {0, 0}, ++ .ref_idx = { ++ zero_idx < nb_refs ? zero_idx : 0, ++ (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0}, ++ .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? 
PF_BI : PF_L0 ++ }; ++ ++ *mvf_t = mvf_m; ++ return mvf_t; ++ } ++} ++ ++ ++// 8.5.3.1.1 Derivation process of luma Mvs for merge mode ++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, ++ int nPbH, int log2_cb_size, int part_idx, ++ int merge_idx, HEVCRpiMvField * const mv) ++{ ++ const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ? ++ derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8, ++ ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8), ++ 0, merge_idx, mv) : ++ derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH), ++ part_idx, merge_idx, mv); ++ ++ if (mvf_m != mv) ++ *mv = *mvf_m; ++ ++ if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12) ++ mv->pred_flag = PF_L0; ++} ++ ++ ++static av_always_inline const MvXY * ++mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf) ++{ ++ if (mvf != NULL) ++ { ++ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0) ++ return mvf->xy + pfi0; ++ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0) ++ return mvf->xy + pfi1; ++ } ++ return NULL; ++} ++ ++static av_always_inline const MvXY * ++mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, ++ const int islt0, const int poc0, const int poc_cur, ++ MvXY * const mv_t, const HEVCRpiMvField * const mvf) ++{ ++ if (mvf != NULL) ++ { ++ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0) ++ { ++ const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]]; ++ if (islt0 || poc1 == poc0) { ++ return mvf->xy + pfi0; ++ } ++ *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0); ++ return mv_t; ++ } ++ if (((mvf->pred_flag >> pfi1) & 1) != 0 
&& rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0) ++ { ++ const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]]; ++ if (islt0 || poc1 == poc0) { ++ return mvf->xy + pfi1; ++ } ++ *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0); ++ return mv_t; ++ } ++ } ++ return NULL; ++} ++ ++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int avail, ++ HEVCRpiMvField * const mv, ++ const unsigned int mvp_lx_flag, const unsigned int LX) ++{ ++ const unsigned int pfi0 = LX; ++ const unsigned int pfi1 = LX == 0 ? 1 : 0; ++ const RefPicList * const rpl = s->refPicList; ++ const int poc0 = rpl[LX].list[mv->ref_idx[LX]]; ++ const int poc_cur = s->poc; ++ const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]]; ++ ++ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1); ++ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1); ++ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL ++ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1); ++ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1; ++ const MvXY * mva = NULL; ++ const MvXY * mvb; ++ MvXY * const mv_rv = mv->xy + LX; ++ MvXY mvt_a, mvt_b; ++ ++ *mv_rv = 0; ++ ++ if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA) ++ mvf_a0 = NULL; ++ else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0) ++ goto use_mva; ++ ++ if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA) ++ mvf_a1 = NULL; ++ ++ if (mva == NULL && ++ (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL && ++ (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL) ++ mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1); ++ ++ if (mvp_lx_flag == 0 && mva != NULL) ++ goto use_mva; ++ ++ if 
((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA) ++ mvf_b0 = NULL; ++ if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA) ++ mvf_b1 = NULL; ++ if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA) ++ mvf_b2 = NULL; ++ ++ if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL && ++ (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL) ++ mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2); ++ ++ if (mvf_a0 == NULL && mvf_a1 == NULL) { ++ mva = mvb; ++ if (mvp_lx_flag == 0 && mva != NULL) ++ goto use_mva; ++ ++ if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL && ++ (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL) ++ mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2); ++ } ++ ++ if (mva == NULL) { ++ mva = mvb; ++ mvb = NULL; ++ } ++ ++ if (mvb != NULL && *mva == *mvb) // If A == B then ignore B ++ mvb = NULL; ++ ++ if (mvp_lx_flag == 0 && mva != NULL) { ++ goto use_mva; ++ } ++ else if (mvp_lx_flag != 0 && mvb != NULL) { ++ *mv_rv = *mvb; ++ } ++ else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) { ++ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, ++ nPbH, mv->ref_idx[LX], ++ mv_rv, LX); ++ } ++ return; ++ ++use_mva: ++ *mv_rv = *mva; ++ return; ++} ++ +diff --git a/libavcodec/rpi_hevc_parse.c b/libavcodec/rpi_hevc_parse.c +new file mode 100644 +index 0000000000..e58a59ce5e +--- /dev/null ++++ b/libavcodec/rpi_hevc_parse.c +@@ -0,0 +1,143 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. 
++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "bytestream.h" ++#include "h2645_parse.h" ++#include "hevc.h" ++#include "rpi_hevc_parse.h" ++ ++static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps, ++ HEVCSEIContext *sei, int is_nalff, int nal_length_size, ++ int err_recognition, int apply_defdispwin, void *logctx) ++{ ++ int i; ++ int ret = 0; ++ H2645Packet pkt = { 0 }; ++ ++ ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff, ++ nal_length_size, AV_CODEC_ID_HEVC, 1, 0); ++ if (ret < 0) { ++ goto done; ++ } ++ ++ for (i = 0; i < pkt.nb_nals; i++) { ++ H2645NAL *nal = &pkt.nals[i]; ++ ++ /* ignore everything except parameter sets and VCL NALUs */ ++ switch (nal->type) { ++ case HEVC_NAL_VPS: ++ ret = ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps); ++ if (ret < 0) ++ goto done; ++ break; ++ case HEVC_NAL_SPS: ++ ret = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin); ++ if (ret < 0) ++ goto done; ++ break; ++ case HEVC_NAL_PPS: ++ ret = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps); ++ if (ret < 0) ++ goto done; ++ break; ++ case HEVC_NAL_SEI_PREFIX: ++ case HEVC_NAL_SEI_SUFFIX: ++ ret = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type); ++ if (ret < 0) ++ goto done; ++ break; ++ default: ++ av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type); ++ break; ++ } ++ } ++ ++done: ++ ff_h2645_packet_uninit(&pkt); ++ if (err_recognition & AV_EF_EXPLODE) ++ return ret; ++ ++ return 0; ++} ++ ++int 
ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps, ++ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size, ++ int err_recognition, int apply_defdispwin, void *logctx) ++{ ++ int ret = 0; ++ GetByteContext gb; ++ ++ bytestream2_init(&gb, data, size); ++ ++ if (size > 3 && (data[0] || data[1] || data[2] > 1)) { ++ /* It seems the extradata is encoded as hvcC format. ++ * Temporarily, we support configurationVersion==0 until 14496-15 3rd ++ * is finalized. When finalized, configurationVersion will be 1 and we ++ * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */ ++ int i, j, num_arrays, nal_len_size; ++ ++ *is_nalff = 1; ++ ++ bytestream2_skip(&gb, 21); ++ nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1; ++ num_arrays = bytestream2_get_byte(&gb); ++ ++ /* nal units in the hvcC always have length coded with 2 bytes, ++ * so put a fake nal_length_size = 2 while parsing them */ ++ *nal_length_size = 2; ++ ++ /* Decode nal units from hvcC. 
*/ ++ for (i = 0; i < num_arrays; i++) { ++ int type = bytestream2_get_byte(&gb) & 0x3f; ++ int cnt = bytestream2_get_be16(&gb); ++ ++ for (j = 0; j < cnt; j++) { ++ // +2 for the nal size field ++ int nalsize = bytestream2_peek_be16(&gb) + 2; ++ if (bytestream2_get_bytes_left(&gb) < nalsize) { ++ av_log(logctx, AV_LOG_ERROR, ++ "Invalid NAL unit size in extradata.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff, ++ *nal_length_size, err_recognition, apply_defdispwin, ++ logctx); ++ if (ret < 0) { ++ av_log(logctx, AV_LOG_ERROR, ++ "Decoding nal unit %d %d from hvcC failed\n", ++ type, i); ++ return ret; ++ } ++ bytestream2_skip(&gb, nalsize); ++ } ++ } ++ ++ /* Now store right nal length size, that will be used to parse ++ * all other nals */ ++ *nal_length_size = nal_len_size; ++ } else { ++ *is_nalff = 0; ++ ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size, ++ err_recognition, apply_defdispwin, logctx); ++ if (ret < 0) ++ return ret; ++ } ++ ++ return ret; ++} +diff --git a/libavcodec/rpi_hevc_parse.h b/libavcodec/rpi_hevc_parse.h +new file mode 100644 +index 0000000000..4b4d032a16 +--- /dev/null ++++ b/libavcodec/rpi_hevc_parse.h +@@ -0,0 +1,36 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * H.265 parser code ++ */ ++ ++#ifndef AVCODEC_RPI_HEVC_PARSE_H ++#define AVCODEC_RPI_HEVC_PARSE_H ++ ++#include ++ ++#include "rpi_hevc_ps.h" ++#include "rpi_hevc_sei.h" ++ ++int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps, ++ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size, ++ int err_recognition, int apply_defdispwin, void *logctx); ++ ++#endif /* AVCODEC_RPI_HEVC_PARSE_H */ +diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c +new file mode 100644 +index 0000000000..f4e31f7d1d +--- /dev/null ++++ b/libavcodec/rpi_hevc_ps.c +@@ -0,0 +1,1938 @@ ++/* ++ * HEVC Parameter Set decoding ++ * ++ * Copyright (C) 2012 - 2103 Guillaume Martres ++ * Copyright (C) 2012 - 2103 Mickael Raulet ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2013 Vittorio Giovara ++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/imgutils.h" ++#include "golomb.h" ++#include "rpi_hevc_data.h" ++#include "rpi_hevc_ps.h" ++#include "rpi_hevcdec.h" ++ ++static const uint8_t default_scaling_list_intra[] = { ++ 16, 16, 16, 16, 17, 18, 21, 24, ++ 16, 16, 16, 16, 17, 19, 22, 25, ++ 16, 16, 17, 18, 20, 22, 25, 29, ++ 16, 16, 18, 21, 24, 27, 31, 36, ++ 17, 17, 20, 24, 30, 35, 41, 47, ++ 18, 19, 22, 27, 35, 44, 54, 65, ++ 21, 22, 25, 31, 41, 54, 70, 88, ++ 24, 25, 29, 36, 47, 65, 88, 115 ++}; ++ ++static const uint8_t default_scaling_list_inter[] = { ++ 16, 16, 16, 16, 17, 18, 20, 24, ++ 16, 16, 16, 17, 18, 20, 24, 25, ++ 16, 16, 17, 18, 20, 24, 25, 28, ++ 16, 17, 18, 20, 24, 25, 28, 33, ++ 17, 18, 20, 24, 25, 28, 33, 41, ++ 18, 20, 24, 25, 28, 33, 41, 54, ++ 20, 24, 25, 28, 33, 41, 54, 71, ++ 24, 25, 28, 33, 41, 54, 71, 91 ++}; ++ ++static const AVRational vui_sar[] = { ++ { 0, 1 }, ++ { 1, 1 }, ++ { 12, 11 }, ++ { 10, 11 }, ++ { 16, 11 }, ++ { 40, 33 }, ++ { 24, 11 }, ++ { 20, 11 }, ++ { 32, 11 }, ++ { 80, 33 }, ++ { 18, 11 }, ++ { 15, 11 }, ++ { 64, 33 }, ++ { 160, 99 }, ++ { 4, 3 }, ++ { 3, 2 }, ++ { 2, 1 }, ++}; ++ ++ ++// pps_cb_qp_offset: -12,+12 ++// slice_cb_qp_offset: -12,+12 also ++// "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive." ++// cr_qp_offset_list[n]: -12,+12 ++// So worst case total offset: -24,+24 ++ ++#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6) ++#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 
51 : (n)) ++#define M(B,n) C(B,(-n)) ++ ++// Sizeof the QP_START_BLOCK ++#define QP_OFFSET_0 (8*6 + 12*2) ++#define QP_START(B) \ ++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ ++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ ++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ ++ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\ ++\ ++ M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\ ++ M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\ ++ M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\ ++ M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\ ++ M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\ ++ M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\ ++ M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\ ++ M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1) ++#define QP_END(B) \ ++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ ++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ ++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51) ++ ++#define T1(B)\ ++{\ ++ QP_START(B),\ ++ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ ++ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\ ++ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ ++ C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\ ++ C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\ ++ C(B,44), C(B,45),\ ++ C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\ ++ QP_END(B)\ ++} ++#define T0(B)\ ++{\ ++ QP_START(B),\ ++ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\ ++ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\ ++ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\ ++ C(B,30), C(B,31), C(B,32), C(B,33), 
C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), C(B,39),\ ++ C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\ ++ C(B,50), C(B,51),\ ++ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\ ++ QP_END(B)\ ++} ++ ++#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2) ++ ++static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)}; ++static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)}; ++ ++#undef T ++#undef C ++#undef QP_END ++ ++#define C(B,n) ((n)<0?0:(n)>51?51:(n)) ++// We do need a lot of -ve padding to cope with high bit depths that give -ve qps ++#define QP_DBLK_OFFSET_0 QP_OFFSET_0 ++#define QP_END(B)\ ++ 51, 51, 51, 51, 51, 51 ++ ++// These don't need all the padding we have here (12 top/bottom would be enough) ++static const uint8_t qp_c_dblk_0[] = T0(0); ++static const uint8_t qp_c_dblk_1[] = T1(0); ++ ++#undef T ++#undef M ++#undef C ++#undef QP_END ++#undef QP_START ++ ++ ++static void remove_pps(HEVCRpiParamSets * const s, const int id) ++{ ++ if (s->pps_list[id] && s->pps == (const HEVCRpiPPS*)s->pps_list[id]->data) ++ s->pps = NULL; ++ av_buffer_unref(&s->pps_list[id]); ++} ++ ++static void remove_sps(HEVCRpiParamSets * const s, const int id) ++{ ++ int i; ++ if (s->sps_list[id]) { ++ if (s->sps == (const HEVCRpiSPS*)s->sps_list[id]->data) ++ s->sps = NULL; ++ ++ /* drop all PPS that depend on this SPS */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++) ++ if (s->pps_list[i] && ((HEVCRpiPPS*)s->pps_list[i]->data)->sps_id == id) ++ remove_pps(s, i); ++ ++ av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data)); ++ } ++ av_buffer_unref(&s->sps_list[id]); ++} ++ ++static void remove_vps(HEVCRpiParamSets * const s, const int id) ++{ ++ int i; ++ if (s->vps_list[id]) { ++ if (s->vps == (const HEVCRpiVPS*)s->vps_list[id]->data) ++ s->vps = NULL; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->sps_list); i++) ++ if 
(s->sps_list[i] && ((HEVCRpiSPS*)s->sps_list[i]->data)->vps_id == id) ++ remove_sps(s, i); ++ } ++ av_buffer_unref(&s->vps_list[id]); ++} ++ ++int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx, ++ ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header) ++{ ++ uint8_t rps_predict = 0; ++ int delta_poc; ++ int k0 = 0; ++ int k1 = 0; ++ int k = 0; ++ int i; ++ ++ if (rps != sps->st_rps && sps->nb_st_rps) ++ rps_predict = get_bits1(gb); ++ ++ if (rps_predict) { ++ const ShortTermRPS *rps_ridx; ++ int delta_rps; ++ unsigned abs_delta_rps; ++ uint8_t use_delta_flag = 0; ++ uint8_t delta_rps_sign; ++ ++ if (is_slice_header) { ++ unsigned int delta_idx = get_ue_golomb_long(gb) + 1; ++ if (delta_idx > sps->nb_st_rps) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of delta_idx in slice header RPS: %d > %d.\n", ++ delta_idx, sps->nb_st_rps); ++ return AVERROR_INVALIDDATA; ++ } ++ rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx]; ++ rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs; ++ } else ++ rps_ridx = &sps->st_rps[rps - sps->st_rps - 1]; ++ ++ delta_rps_sign = get_bits1(gb); ++ abs_delta_rps = get_ue_golomb_long(gb) + 1; ++ if (abs_delta_rps < 1 || abs_delta_rps > 32768) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of abs_delta_rps: %d\n", ++ abs_delta_rps); ++ return AVERROR_INVALIDDATA; ++ } ++ delta_rps = (1 - (delta_rps_sign << 1)) * abs_delta_rps; ++ for (i = 0; i <= rps_ridx->num_delta_pocs; i++) { ++ int used = rps->used[k] = get_bits1(gb); ++ ++ if (!used) ++ use_delta_flag = get_bits1(gb); ++ ++ if (used || use_delta_flag) { ++ if (i < rps_ridx->num_delta_pocs) ++ delta_poc = delta_rps + rps_ridx->delta_poc[i]; ++ else ++ delta_poc = delta_rps; ++ rps->delta_poc[k] = delta_poc; ++ if (delta_poc < 0) ++ k0++; ++ else ++ k1++; ++ k++; ++ } ++ } ++ ++ if (k >= FF_ARRAY_ELEMS(rps->used)) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid num_delta_pocs: %d\n", k); ++ return 
AVERROR_INVALIDDATA; ++ } ++ ++ rps->num_delta_pocs = k; ++ rps->num_negative_pics = k0; ++ // sort in increasing order (smallest first) ++ if (rps->num_delta_pocs != 0) { ++ int used, tmp; ++ for (i = 1; i < rps->num_delta_pocs; i++) { ++ delta_poc = rps->delta_poc[i]; ++ used = rps->used[i]; ++ for (k = i - 1; k >= 0; k--) { ++ tmp = rps->delta_poc[k]; ++ if (delta_poc < tmp) { ++ rps->delta_poc[k + 1] = tmp; ++ rps->used[k + 1] = rps->used[k]; ++ rps->delta_poc[k] = delta_poc; ++ rps->used[k] = used; ++ } ++ } ++ } ++ } ++ if ((rps->num_negative_pics >> 1) != 0) { ++ int used; ++ k = rps->num_negative_pics - 1; ++ // flip the negative values to largest first ++ for (i = 0; i < rps->num_negative_pics >> 1; i++) { ++ delta_poc = rps->delta_poc[i]; ++ used = rps->used[i]; ++ rps->delta_poc[i] = rps->delta_poc[k]; ++ rps->used[i] = rps->used[k]; ++ rps->delta_poc[k] = delta_poc; ++ rps->used[k] = used; ++ k--; ++ } ++ } ++ } else { ++ unsigned int prev, nb_positive_pics; ++ rps->num_negative_pics = get_ue_golomb_long(gb); ++ nb_positive_pics = get_ue_golomb_long(gb); ++ ++ if (rps->num_negative_pics >= HEVC_MAX_REFS || ++ nb_positive_pics >= HEVC_MAX_REFS) { ++ av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics; ++ if (rps->num_delta_pocs) { ++ prev = 0; ++ for (i = 0; i < rps->num_negative_pics; i++) { ++ delta_poc = get_ue_golomb_long(gb) + 1; ++ if (delta_poc < 1 || delta_poc > 32768) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of delta_poc: %d\n", ++ delta_poc); ++ return AVERROR_INVALIDDATA; ++ } ++ prev -= delta_poc; ++ rps->delta_poc[i] = prev; ++ rps->used[i] = get_bits1(gb); ++ } ++ prev = 0; ++ for (i = 0; i < nb_positive_pics; i++) { ++ delta_poc = get_ue_golomb_long(gb) + 1; ++ if (delta_poc < 1 || delta_poc > 32768) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid value of delta_poc: %d\n", ++ delta_poc); ++ return 
AVERROR_INVALIDDATA; ++ } ++ prev += delta_poc; ++ rps->delta_poc[rps->num_negative_pics + i] = prev; ++ rps->used[rps->num_negative_pics + i] = get_bits1(gb); ++ } ++ } ++ } ++ return 0; ++} ++ ++ ++static int decode_profile_tier_level(GetBitContext * const gb, AVCodecContext * const avctx, ++ PTLCommon * const ptl) ++{ ++ int i; ++ ++ if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12) ++ return -1; ++ ++ ptl->profile_space = get_bits(gb, 2); ++ ptl->tier_flag = get_bits1(gb); ++ ptl->profile_idc = get_bits(gb, 5); ++ if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN) ++ av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n"); ++ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_10) ++ av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n"); ++ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE) ++ av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n"); ++ else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT) ++ av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n"); ++ else ++ av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc); ++ ++ for (i = 0; i < 32; i++) { ++ ptl->profile_compatibility_flag[i] = get_bits1(gb); ++ ++ if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i]) ++ ptl->profile_idc = i; ++ } ++ ptl->progressive_source_flag = get_bits1(gb); ++ ptl->interlaced_source_flag = get_bits1(gb); ++ ptl->non_packed_constraint_flag = get_bits1(gb); ++ ptl->frame_only_constraint_flag = get_bits1(gb); ++ ++ skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15] ++ skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31] ++ skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43] ++ ++ return 0; ++} ++ ++static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx, ++ PTL * const ptl, const int max_num_sub_layers) ++{ ++ int i; ++ if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 || ++ get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) { ++ 
av_log(avctx, AV_LOG_ERROR, "PTL information too short\n"); ++ return -1; ++ } ++ ++ ptl->general_ptl.level_idc = get_bits(gb, 8); ++ ++ for (i = 0; i < max_num_sub_layers - 1; i++) { ++ ptl->sub_layer_profile_present_flag[i] = get_bits1(gb); ++ ptl->sub_layer_level_present_flag[i] = get_bits1(gb); ++ } ++ ++ if (max_num_sub_layers - 1> 0) ++ for (i = max_num_sub_layers - 1; i < 8; i++) ++ skip_bits(gb, 2); // reserved_zero_2bits[i] ++ for (i = 0; i < max_num_sub_layers - 1; i++) { ++ if (ptl->sub_layer_profile_present_flag[i] && ++ decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "PTL information for sublayer %i too short\n", i); ++ return -1; ++ } ++ if (ptl->sub_layer_level_present_flag[i]) { ++ if (get_bits_left(gb) < 8) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Not enough data for sublayer %i level_idc\n", i); ++ return -1; ++ } else ++ ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8); ++ } ++ } ++ ++ return 0; ++} ++ ++static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb, ++ const int subpic_params_present) ++{ ++ int i; ++ ++ for (i = 0; i < nb_cpb; i++) { ++ get_ue_golomb_long(gb); // bit_rate_value_minus1 ++ get_ue_golomb_long(gb); // cpb_size_value_minus1 ++ ++ if (subpic_params_present) { ++ get_ue_golomb_long(gb); // cpb_size_du_value_minus1 ++ get_ue_golomb_long(gb); // bit_rate_du_value_minus1 ++ } ++ skip_bits1(gb); // cbr_flag ++ } ++} ++ ++static int decode_hrd(GetBitContext * const gb, const int common_inf_present, ++ const int max_sublayers) ++{ ++ int nal_params_present = 0, vcl_params_present = 0; ++ int subpic_params_present = 0; ++ int i; ++ ++ if (common_inf_present) { ++ nal_params_present = get_bits1(gb); ++ vcl_params_present = get_bits1(gb); ++ ++ if (nal_params_present || vcl_params_present) { ++ subpic_params_present = get_bits1(gb); ++ ++ if (subpic_params_present) { ++ skip_bits(gb, 8); // tick_divisor_minus2 ++ skip_bits(gb, 5); // 
du_cpb_removal_delay_increment_length_minus1 ++ skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag ++ skip_bits(gb, 5); // dpb_output_delay_du_length_minus1 ++ } ++ ++ skip_bits(gb, 4); // bit_rate_scale ++ skip_bits(gb, 4); // cpb_size_scale ++ ++ if (subpic_params_present) ++ skip_bits(gb, 4); // cpb_size_du_scale ++ ++ skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1 ++ skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1 ++ skip_bits(gb, 5); // dpb_output_delay_length_minus1 ++ } ++ } ++ ++ for (i = 0; i < max_sublayers; i++) { ++ int low_delay = 0; ++ unsigned int nb_cpb = 1; ++ int fixed_rate = get_bits1(gb); ++ ++ if (!fixed_rate) ++ fixed_rate = get_bits1(gb); ++ ++ if (fixed_rate) ++ get_ue_golomb_long(gb); // elemental_duration_in_tc_minus1 ++ else ++ low_delay = get_bits1(gb); ++ ++ if (!low_delay) { ++ nb_cpb = get_ue_golomb_long(gb) + 1; ++ if (nb_cpb < 1 || nb_cpb > 32) { ++ av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ if (nal_params_present) ++ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); ++ if (vcl_params_present) ++ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present); ++ } ++ return 0; ++} ++ ++int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx, ++ HEVCRpiParamSets * const ps) ++{ ++ int i,j; ++ int vps_id = 0; ++ ptrdiff_t nal_size; ++ HEVCRpiVPS *vps; ++ AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps)); ++ ++ if (!vps_buf) ++ return AVERROR(ENOMEM); ++ vps = (HEVCRpiVPS*)vps_buf->data; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n"); ++ ++ nal_size = gb->buffer_end - gb->buffer; ++ if (nal_size > sizeof(vps->data)) { ++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS " ++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", ++ nal_size, sizeof(vps->data)); ++ vps->data_size = sizeof(vps->data); ++ } else { ++ vps->data_size = nal_size; ++ } ++ memcpy(vps->data, gb->buffer, vps->data_size); ++ 
++ vps_id = get_bits(gb, 4); ++ if (vps_id >= HEVC_MAX_VPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id); ++ goto err; ++ } ++ ++ if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits ++ av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n"); ++ goto err; ++ } ++ ++ vps->vps_max_layers = get_bits(gb, 6) + 1; ++ vps->vps_max_sub_layers = get_bits(gb, 3) + 1; ++ vps->vps_temporal_id_nesting_flag = get_bits1(gb); ++ ++ if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits ++ av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n"); ++ goto err; ++ } ++ ++ if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) { ++ av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n", ++ vps->vps_max_sub_layers); ++ goto err; ++ } ++ ++ if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0) ++ goto err; ++ ++ vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb); ++ ++ i = vps->vps_sub_layer_ordering_info_present_flag ? 
0 : vps->vps_max_sub_layers - 1; ++ for (; i < vps->vps_max_sub_layers; i++) { ++ vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1; ++ vps->vps_num_reorder_pics[i] = get_ue_golomb_long(gb); ++ vps->vps_max_latency_increase[i] = get_ue_golomb_long(gb) - 1; ++ ++ if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) { ++ av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n", ++ vps->vps_max_dec_pic_buffering[i] - 1); ++ goto err; ++ } ++ if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) { ++ av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n", ++ vps->vps_num_reorder_pics[i]); ++ if (avctx->err_recognition & AV_EF_EXPLODE) ++ goto err; ++ } ++ } ++ ++ vps->vps_max_layer_id = get_bits(gb, 6); ++ vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1; ++ if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 || ++ (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) { ++ av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n"); ++ goto err; ++ } ++ ++ for (i = 1; i < vps->vps_num_layer_sets; i++) ++ for (j = 0; j <= vps->vps_max_layer_id; j++) ++ skip_bits(gb, 1); // layer_id_included_flag[i][j] ++ ++ vps->vps_timing_info_present_flag = get_bits1(gb); ++ if (vps->vps_timing_info_present_flag) { ++ vps->vps_num_units_in_tick = get_bits_long(gb, 32); ++ vps->vps_time_scale = get_bits_long(gb, 32); ++ vps->vps_poc_proportional_to_timing_flag = get_bits1(gb); ++ if (vps->vps_poc_proportional_to_timing_flag) ++ vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1; ++ vps->vps_num_hrd_parameters = get_ue_golomb_long(gb); ++ if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) { ++ av_log(avctx, AV_LOG_ERROR, ++ "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters); ++ goto err; ++ } ++ for (i = 0; i < vps->vps_num_hrd_parameters; i++) { ++ int 
common_inf_present = 1; ++ ++ get_ue_golomb_long(gb); // hrd_layer_set_idx ++ if (i) ++ common_inf_present = get_bits1(gb); ++ decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers); ++ } ++ } ++ get_bits1(gb); /* vps_extension_flag */ ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Overread VPS by %d bits\n", -get_bits_left(gb)); ++ if (ps->vps_list[vps_id]) ++ goto err; ++ } ++ ++ if (ps->vps_list[vps_id] && ++ !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) { ++ av_buffer_unref(&vps_buf); ++ } else { ++ remove_vps(ps, vps_id); ++ ps->vps_list[vps_id] = vps_buf; ++ } ++ ++ return 0; ++ ++err: ++ av_buffer_unref(&vps_buf); ++ return AVERROR_INVALIDDATA; ++} ++ ++static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx, ++ const int apply_defdispwin, HEVCRpiSPS * const sps) ++{ ++ VUI backup_vui, * const vui = &sps->vui; ++ GetBitContext backup; ++ int sar_present, alt = 0; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n"); ++ ++ sar_present = get_bits1(gb); ++ if (sar_present) { ++ uint8_t sar_idx = get_bits(gb, 8); ++ if (sar_idx < FF_ARRAY_ELEMS(vui_sar)) ++ vui->sar = vui_sar[sar_idx]; ++ else if (sar_idx == 255) { ++ vui->sar.num = get_bits(gb, 16); ++ vui->sar.den = get_bits(gb, 16); ++ } else ++ av_log(avctx, AV_LOG_WARNING, ++ "Unknown SAR index: %u.\n", sar_idx); ++ } ++ ++ vui->overscan_info_present_flag = get_bits1(gb); ++ if (vui->overscan_info_present_flag) ++ vui->overscan_appropriate_flag = get_bits1(gb); ++ ++ vui->video_signal_type_present_flag = get_bits1(gb); ++ if (vui->video_signal_type_present_flag) { ++ vui->video_format = get_bits(gb, 3); ++ vui->video_full_range_flag = get_bits1(gb); ++ vui->colour_description_present_flag = get_bits1(gb); ++ if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P) ++ sps->pix_fmt = AV_PIX_FMT_YUVJ420P; ++ if (vui->colour_description_present_flag) { ++ vui->colour_primaries = get_bits(gb, 8); ++ vui->transfer_characteristic = 
get_bits(gb, 8); ++ vui->matrix_coeffs = get_bits(gb, 8); ++ ++ // Set invalid values to "unspecified" ++ if (!av_color_primaries_name(vui->colour_primaries)) ++ vui->colour_primaries = AVCOL_PRI_UNSPECIFIED; ++ if (!av_color_transfer_name(vui->transfer_characteristic)) ++ vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED; ++ if (!av_color_space_name(vui->matrix_coeffs)) ++ vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED; ++ if (vui->matrix_coeffs == AVCOL_SPC_RGB) { ++ switch (sps->pix_fmt) { ++ case AV_PIX_FMT_YUV444P: ++ sps->pix_fmt = AV_PIX_FMT_GBRP; ++ break; ++ case AV_PIX_FMT_YUV444P10: ++ sps->pix_fmt = AV_PIX_FMT_GBRP10; ++ break; ++ case AV_PIX_FMT_YUV444P12: ++ sps->pix_fmt = AV_PIX_FMT_GBRP12; ++ break; ++ } ++ } ++ } ++ } ++ ++ vui->chroma_loc_info_present_flag = get_bits1(gb); ++ if (vui->chroma_loc_info_present_flag) { ++ vui->chroma_sample_loc_type_top_field = get_ue_golomb_long(gb); ++ vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb); ++ } ++ ++ vui->neutra_chroma_indication_flag = get_bits1(gb); ++ vui->field_seq_flag = get_bits1(gb); ++ vui->frame_field_info_present_flag = get_bits1(gb); ++ ++ // Backup context in case an alternate header is detected ++ memcpy(&backup, gb, sizeof(backup)); ++ memcpy(&backup_vui, vui, sizeof(backup_vui)); ++ if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) { ++ vui->default_display_window_flag = 0; ++ av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n"); ++ } else ++ vui->default_display_window_flag = get_bits1(gb); ++ ++ if (vui->default_display_window_flag) { ++ int vert_mult = 1 + (sps->chroma_format_idc < 2); ++ int horiz_mult = 1 + (sps->chroma_format_idc < 3); ++ vui->def_disp_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; ++ vui->def_disp_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; ++ vui->def_disp_win.top_offset = get_ue_golomb_long(gb) * vert_mult; ++ vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; ++ ++ if 
(apply_defdispwin && ++ avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { ++ av_log(avctx, AV_LOG_DEBUG, ++ "discarding vui default display window, " ++ "original values are l:%u r:%u t:%u b:%u\n", ++ vui->def_disp_win.left_offset, ++ vui->def_disp_win.right_offset, ++ vui->def_disp_win.top_offset, ++ vui->def_disp_win.bottom_offset); ++ ++ vui->def_disp_win.left_offset = ++ vui->def_disp_win.right_offset = ++ vui->def_disp_win.top_offset = ++ vui->def_disp_win.bottom_offset = 0; ++ } ++ } ++ ++timing_info: ++ vui->vui_timing_info_present_flag = get_bits1(gb); ++ ++ if (vui->vui_timing_info_present_flag) { ++ if( get_bits_left(gb) < 66 && !alt) { ++ // The alternate syntax seem to have timing info located ++ // at where def_disp_win is normally located ++ av_log(avctx, AV_LOG_WARNING, ++ "Strange VUI timing information, retrying...\n"); ++ memcpy(vui, &backup_vui, sizeof(backup_vui)); ++ memcpy(gb, &backup, sizeof(backup)); ++ alt = 1; ++ goto timing_info; ++ } ++ vui->vui_num_units_in_tick = get_bits_long(gb, 32); ++ vui->vui_time_scale = get_bits_long(gb, 32); ++ if (alt) { ++ av_log(avctx, AV_LOG_INFO, "Retry got %"PRIu32"/%"PRIu32" fps\n", ++ vui->vui_time_scale, vui->vui_num_units_in_tick); ++ } ++ vui->vui_poc_proportional_to_timing_flag = get_bits1(gb); ++ if (vui->vui_poc_proportional_to_timing_flag) ++ vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb); ++ vui->vui_hrd_parameters_present_flag = get_bits1(gb); ++ if (vui->vui_hrd_parameters_present_flag) ++ decode_hrd(gb, 1, sps->max_sub_layers); ++ } ++ ++ vui->bitstream_restriction_flag = get_bits1(gb); ++ if (vui->bitstream_restriction_flag) { ++ if (get_bits_left(gb) < 8 && !alt) { ++ av_log(avctx, AV_LOG_WARNING, ++ "Strange VUI bitstream restriction information, retrying" ++ " from timing information...\n"); ++ memcpy(vui, &backup_vui, sizeof(backup_vui)); ++ memcpy(gb, &backup, sizeof(backup)); ++ alt = 1; ++ goto timing_info; ++ } ++ vui->tiles_fixed_structure_flag = get_bits1(gb); ++ 
vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb); ++ vui->restricted_ref_pic_lists_flag = get_bits1(gb); ++ vui->min_spatial_segmentation_idc = get_ue_golomb_long(gb); ++ vui->max_bytes_per_pic_denom = get_ue_golomb_long(gb); ++ vui->max_bits_per_min_cu_denom = get_ue_golomb_long(gb); ++ vui->log2_max_mv_length_horizontal = get_ue_golomb_long(gb); ++ vui->log2_max_mv_length_vertical = get_ue_golomb_long(gb); ++ } ++ ++ if (get_bits_left(gb) < 1 && !alt) { ++ // XXX: Alternate syntax when sps_range_extension_flag != 0? ++ av_log(avctx, AV_LOG_WARNING, ++ "Overread in VUI, retrying from timing information...\n"); ++ memcpy(vui, &backup_vui, sizeof(backup_vui)); ++ memcpy(gb, &backup, sizeof(backup)); ++ alt = 1; ++ goto timing_info; ++ } ++} ++ ++static void set_default_scaling_list_data(ScalingList * const sl) ++{ ++ int matrixId; ++ ++ for (matrixId = 0; matrixId < 6; matrixId++) { ++ // 4x4 default is 16 ++ memset(sl->sl[0][matrixId], 16, 16); ++ sl->sl_dc[0][matrixId] = 16; // default for 16x16 ++ sl->sl_dc[1][matrixId] = 16; // default for 32x32 ++ } ++ ++ memcpy(sl->sl[1][0], default_scaling_list_intra, 64); ++ memcpy(sl->sl[1][1], default_scaling_list_intra, 64); ++ memcpy(sl->sl[1][2], default_scaling_list_intra, 64); ++ ++ memcpy(sl->sl[1][3], default_scaling_list_inter, 64); ++ memcpy(sl->sl[1][4], default_scaling_list_inter, 64); ++ memcpy(sl->sl[1][5], default_scaling_list_inter, 64); ++ ++ memcpy(sl->sl[2][0], default_scaling_list_intra, 64); ++ memcpy(sl->sl[2][1], default_scaling_list_intra, 64); ++ memcpy(sl->sl[2][2], default_scaling_list_intra, 64); ++ ++ memcpy(sl->sl[2][3], default_scaling_list_inter, 64); ++ memcpy(sl->sl[2][4], default_scaling_list_inter, 64); ++ memcpy(sl->sl[2][5], default_scaling_list_inter, 64); ++ ++ memcpy(sl->sl[3][0], default_scaling_list_intra, 64); ++ memcpy(sl->sl[3][1], default_scaling_list_intra, 64); ++ memcpy(sl->sl[3][2], default_scaling_list_intra, 64); ++ ++ memcpy(sl->sl[3][3], 
default_scaling_list_inter, 64); ++ memcpy(sl->sl[3][4], default_scaling_list_inter, 64); ++ memcpy(sl->sl[3][5], default_scaling_list_inter, 64); ++} ++ ++static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl, ++ const HEVCRpiSPS * const sps) ++{ ++ uint8_t scaling_list_pred_mode_flag; ++ int32_t scaling_list_dc_coef[2][6]; ++ int size_id, matrix_id, pos; ++ int i; ++ ++ for (size_id = 0; size_id < 4; size_id++) ++ for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) { ++ scaling_list_pred_mode_flag = get_bits1(gb); ++ if (!scaling_list_pred_mode_flag) { ++ unsigned int delta = get_ue_golomb_long(gb); ++ /* Only need to handle non-zero delta. Zero means default, ++ * which should already be in the arrays. */ ++ if (delta) { ++ // Copy from previous array. ++ delta *= (size_id == 3) ? 3 : 1; ++ if (matrix_id < delta) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Invalid delta in scaling list data: %d.\n", delta); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ memcpy(sl->sl[size_id][matrix_id], ++ sl->sl[size_id][matrix_id - delta], ++ size_id > 0 ? 
64 : 16); ++ if (size_id > 1) ++ sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta]; ++ } ++ } else { ++ int next_coef, coef_num; ++ int32_t scaling_list_delta_coef; ++ ++ next_coef = 8; ++ coef_num = FFMIN(64, 1 << (4 + (size_id << 1))); ++ if (size_id > 1) { ++ scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8; ++ next_coef = scaling_list_dc_coef[size_id - 2][matrix_id]; ++ sl->sl_dc[size_id - 2][matrix_id] = next_coef; ++ } ++ for (i = 0; i < coef_num; i++) { ++ if (size_id == 0) ++ pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] + ++ ff_hevc_rpi_diag_scan4x4_x[i]; ++ else ++ pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] + ++ ff_hevc_rpi_diag_scan8x8_x[i]; ++ ++ scaling_list_delta_coef = get_se_golomb(gb); ++ next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256; ++ sl->sl[size_id][matrix_id][pos] = next_coef; ++ } ++ } ++ } ++ ++ if (sps->chroma_format_idc == 3) { ++ for (i = 0; i < 64; i++) { ++ sl->sl[3][1][i] = sl->sl[2][1][i]; ++ sl->sl[3][2][i] = sl->sl[2][2][i]; ++ sl->sl[3][4][i] = sl->sl[2][4][i]; ++ sl->sl[3][5][i] = sl->sl[2][5][i]; ++ } ++ sl->sl_dc[1][1] = sl->sl_dc[0][1]; ++ sl->sl_dc[1][2] = sl->sl_dc[0][2]; ++ sl->sl_dc[1][4] = sl->sl_dc[0][4]; ++ sl->sl_dc[1][5] = sl->sl_dc[0][5]; ++ } ++ ++ ++ return 0; ++} ++ ++static int map_pixel_format(HEVCRpiSPS * const sps) ++{ ++ const int cfmt = sps->chroma_format_idc; ++ ++ sps->pix_fmt = AV_PIX_FMT_NONE; ++ switch (sps->bit_depth) { ++ case 8: ++ if (cfmt == 1) ++ sps->pix_fmt = AV_PIX_FMT_SAND128; ++ break; ++ case 10: ++ if (cfmt == 1) ++ sps->pix_fmt = AV_PIX_FMT_SAND64_10; ++ break; ++ default: ++ break; ++ } ++ ++ sps->hshift[0] = sps->vshift[0] = 0; ++ sps->hshift[2] = sps->hshift[1] = cfmt > 2 ? 0 : 1; // 1 unless 4:4:4 ++ sps->vshift[2] = sps->vshift[1] = cfmt > 1 ? 0 : 1; // 1 unless 4:4:4 or 4:2:2 ++ ++ sps->pixel_shift = sps->bit_depth > 8 ? 
1 : 0; ++ ++ return 0; ++} ++ ++static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id, ++ const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx) ++{ ++ HEVCRpiWindow *ow; ++ int ret = 0; ++ int log2_diff_max_min_transform_block_size; ++ int bit_depth_chroma, start, vui_present, sublayer_ordering_info; ++ int i; ++ ++ // Coded parameters ++ ++ sps->vps_id = get_bits(gb, 4); ++ if (sps->vps_id >= HEVC_MAX_VPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (vps_list && !vps_list[sps->vps_id]) { ++ av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n", ++ sps->vps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->max_sub_layers = get_bits(gb, 3) + 1; ++ if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) { ++ av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n", ++ sps->max_sub_layers); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->temporal_id_nesting_flag = get_bits(gb, 1); ++ ++ if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0) ++ return ret; ++ ++ *sps_id = get_ue_golomb_long(gb); ++ if (*sps_id >= HEVC_MAX_SPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->chroma_format_idc = get_ue_golomb_long(gb); ++ if (sps->chroma_format_idc > 3U) { ++ av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->chroma_format_idc == 3) ++ sps->separate_colour_plane_flag = get_bits1(gb); ++ ++ if (sps->separate_colour_plane_flag) ++ sps->chroma_format_idc = 0; ++ ++ sps->width = get_ue_golomb_long(gb); ++ sps->height = get_ue_golomb_long(gb); ++ if ((ret = av_image_check_size(sps->width, ++ sps->height, 0, avctx)) < 0) ++ return ret; ++ ++ if (get_bits1(gb)) { // pic_conformance_flag ++ int vert_mult = 1 + 
(sps->chroma_format_idc < 2); ++ int horiz_mult = 1 + (sps->chroma_format_idc < 3); ++ sps->pic_conf_win.left_offset = get_ue_golomb_long(gb) * horiz_mult; ++ sps->pic_conf_win.right_offset = get_ue_golomb_long(gb) * horiz_mult; ++ sps->pic_conf_win.top_offset = get_ue_golomb_long(gb) * vert_mult; ++ sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult; ++ ++ if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) { ++ av_log(avctx, AV_LOG_DEBUG, ++ "discarding sps conformance window, " ++ "original values are l:%u r:%u t:%u b:%u\n", ++ sps->pic_conf_win.left_offset, ++ sps->pic_conf_win.right_offset, ++ sps->pic_conf_win.top_offset, ++ sps->pic_conf_win.bottom_offset); ++ ++ sps->pic_conf_win.left_offset = ++ sps->pic_conf_win.right_offset = ++ sps->pic_conf_win.top_offset = ++ sps->pic_conf_win.bottom_offset = 0; ++ } ++ sps->output_window = sps->pic_conf_win; ++ } ++ ++ sps->bit_depth = get_ue_golomb_long(gb) + 8; ++ bit_depth_chroma = get_ue_golomb_long(gb) + 8; ++ if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Luma bit depth (%d) is different from chroma bit depth (%d), " ++ "this is unsupported.\n", ++ sps->bit_depth, bit_depth_chroma); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ ret = map_pixel_format(sps); ++ if (ret < 0) ++ return ret; ++ ++ sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4; ++ if (sps->log2_max_poc_lsb > 16) { ++ av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n", ++ sps->log2_max_poc_lsb - 4); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sublayer_ordering_info = get_bits1(gb); ++ start = sublayer_ordering_info ? 
0 : sps->max_sub_layers - 1; ++ for (i = start; i < sps->max_sub_layers; i++) { ++ sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1; ++ sps->temporal_layer[i].num_reorder_pics = get_ue_golomb_long(gb); ++ sps->temporal_layer[i].max_latency_increase = get_ue_golomb_long(gb) - 1; ++ if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) { ++ av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n", ++ sps->temporal_layer[i].max_dec_pic_buffering - 1U); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) { ++ av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n", ++ sps->temporal_layer[i].num_reorder_pics); ++ if (avctx->err_recognition & AV_EF_EXPLODE || ++ sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) { ++ return AVERROR_INVALIDDATA; ++ } ++ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1; ++ } ++ } ++ ++ if (!sublayer_ordering_info) { ++ for (i = 0; i < start; i++) { ++ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering; ++ sps->temporal_layer[i].num_reorder_pics = sps->temporal_layer[start].num_reorder_pics; ++ sps->temporal_layer[i].max_latency_increase = sps->temporal_layer[start].max_latency_increase; ++ } ++ } ++ ++ sps->log2_min_cb_size = get_ue_golomb_long(gb) + 3; ++ sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb); ++ sps->log2_min_tb_size = get_ue_golomb_long(gb) + 2; ++ log2_diff_max_min_transform_block_size = get_ue_golomb_long(gb); ++ sps->log2_max_trafo_size = log2_diff_max_min_transform_block_size + ++ sps->log2_min_tb_size; ++ ++ if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size", sps->log2_min_cb_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if 
(sps->log2_diff_max_min_coding_block_size > 30) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size", sps->log2_diff_max_min_coding_block_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size", log2_diff_max_min_transform_block_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ { ++ const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size; ++ // Not a bitstream limitation, but all profiles ++ if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for CtbLog2SizeY", CtbLog2SizeY); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY", sps->log2_max_trafo_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ // Inferred parameters ++ sps->log2_ctb_size = CtbLog2SizeY; ++// sps->log2_min_pu_size = sps->log2_min_cb_size - 1; ++ } ++ ++ sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb); ++ sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb); ++ ++ sps->scaling_list_enable_flag = get_bits1(gb); ++ if (sps->scaling_list_enable_flag) { ++ set_default_scaling_list_data(&sps->scaling_list); ++ ++ if (get_bits1(gb)) { ++ ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps); ++ if (ret < 0) ++ return ret; ++ } ++ } ++ ++ sps->amp_enabled_flag = get_bits1(gb); ++ sps->sao_enabled = get_bits1(gb); ++ ++ // Set pcm defaults (0) so we don't have to test _enabled when we ++ // want to use them ++ memset(&sps->pcm, 0, 
sizeof(sps->pcm)); ++ ++ if (get_bits1(gb)) // pcm_enabled_flag ++ { ++ const unsigned int limit_max_pcm = FFMIN(5, ++ sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size); ++ sps->pcm.bit_depth = get_bits(gb, 4) + 1; ++ sps->pcm.bit_depth_chroma = get_bits(gb, 4) + 1; ++ sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3; ++ sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size + ++ get_ue_golomb_long(gb); ++ if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) { ++ av_log(avctx, AV_LOG_ERROR, ++ "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n", ++ sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size || ++ sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) { ++ av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)", ++ sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sps->pcm.loop_filter_disable_flag = get_bits1(gb); ++ } ++ ++ // Could be based on min_pcm_cb_size but much easier logic if we just stick ++ // with 8 (and costs us little) ++ sps->pcm_width = (sps->width + 63) >> 6; // 8 for min size, 8 bits per byte - round up ++ sps->pcm_height = (sps->height + 7) >> 3; ++ ++ sps->nb_st_rps = get_ue_golomb_long(gb); ++ if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) { ++ av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n", ++ sps->nb_st_rps); ++ return AVERROR_INVALIDDATA; ++ } ++ for (i = 0; i < sps->nb_st_rps; i++) { ++ if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i], ++ sps, 0)) < 0) ++ return ret; ++ } ++ ++ sps->long_term_ref_pics_present_flag = get_bits1(gb); ++ if (sps->long_term_ref_pics_present_flag) { ++ sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb); ++ if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) { ++ av_log(avctx, AV_LOG_ERROR, 
"num_long_term_ref_pics_sps %d is out of range.\n", ++ sps->num_long_term_ref_pics_sps); ++ return AVERROR_INVALIDDATA; ++ } ++ for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) { ++ sps->lt_ref_pic_poc_lsb_sps[i] = get_bits(gb, sps->log2_max_poc_lsb); ++ sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb); ++ } ++ } ++ ++ sps->sps_temporal_mvp_enabled_flag = get_bits1(gb); ++ sps->intra_filters_disable = get_bits1(gb) ? 0 : FILTER_STRONG; // sps->sps_strong_intra_smoothing_enable_flag ++ sps->vui.sar = (AVRational){0, 1}; ++ vui_present = get_bits1(gb); ++ if (vui_present) ++ decode_vui(gb, avctx, apply_defdispwin, sps); ++ ++ if (get_bits1(gb)) { // sps_extension_flag ++ int sps_extension_flag[1]; ++ for (i = 0; i < 1; i++) ++ sps_extension_flag[i] = get_bits1(gb); ++ skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7); ++ if (sps_extension_flag[0]) { ++ int extended_precision_processing_flag; ++ int cabac_bypass_alignment_enabled_flag; ++ ++ sps->transform_skip_rotation_enabled_flag = get_bits1(gb); ++ sps->transform_skip_context_enabled_flag = get_bits1(gb); ++ sps->implicit_rdpcm_enabled_flag = get_bits1(gb); ++ ++ sps->explicit_rdpcm_enabled_flag = get_bits1(gb); ++ ++ extended_precision_processing_flag = get_bits1(gb); ++ if (extended_precision_processing_flag) ++ av_log(avctx, AV_LOG_WARNING, ++ "extended_precision_processing_flag not yet implemented\n"); ++ ++ if (get_bits1(gb)) // sps->intra_smoothing_disabled_flag ++ sps->intra_filters_disable |= FILTER_EITHER; ++ sps->high_precision_offsets_enabled_flag = get_bits1(gb); ++ sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb); ++ ++ cabac_bypass_alignment_enabled_flag = get_bits1(gb); ++ if (cabac_bypass_alignment_enabled_flag) ++ av_log(avctx, AV_LOG_WARNING, ++ "cabac_bypass_alignment_enabled_flag not yet implemented\n"); ++ } ++ } ++ if (apply_defdispwin) { ++ sps->output_window.left_offset += sps->vui.def_disp_win.left_offset; ++ sps->output_window.right_offset += 
sps->vui.def_disp_win.right_offset; ++ sps->output_window.top_offset += sps->vui.def_disp_win.top_offset; ++ sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset; ++ } ++ ++ ow = &sps->output_window; ++ if (ow->left_offset >= INT_MAX - ow->right_offset || ++ ow->top_offset >= INT_MAX - ow->bottom_offset || ++ ow->left_offset + ow->right_offset >= sps->width || ++ ow->top_offset + ow->bottom_offset >= sps->height) { ++ av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n", ++ ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset); ++ if (avctx->err_recognition & AV_EF_EXPLODE) { ++ return AVERROR_INVALIDDATA; ++ } ++ av_log(avctx, AV_LOG_WARNING, ++ "Displaying the whole video surface.\n"); ++ memset(ow, 0, sizeof(*ow)); ++ memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win)); ++ } ++ ++ // Inferred parameters ++ ++ sps->ctb_width = (sps->width + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; ++ sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size; ++ sps->ctb_size = sps->ctb_width * sps->ctb_height; ++ ++ sps->min_cb_width = sps->width >> sps->log2_min_cb_size; ++ sps->min_cb_height = sps->height >> sps->log2_min_cb_size; ++ sps->min_tb_width = sps->width >> sps->log2_min_tb_size; ++ sps->min_tb_height = sps->height >> sps->log2_min_tb_size; ++ sps->min_pu_width = sps->width >> LOG2_MIN_PU_SIZE; ++ sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE; ++ sps->tb_mask = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1; ++ ++ sps->qp_bd_offset = 6 * (sps->bit_depth - 8); ++ sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? 
sps->bit_depth - 1 : 7)); ++ ++ if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) || ++ av_mod_uintp2(sps->height, sps->log2_min_cb_size)) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) { ++ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n", ++ sps->max_transform_hierarchy_depth_inter); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) { ++ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n", ++ sps->max_transform_hierarchy_depth_intra); ++ return AVERROR_INVALIDDATA; ++ } ++ if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) { ++ av_log(avctx, AV_LOG_ERROR, ++ "max transform block size out of range: %d\n", ++ sps->log2_max_trafo_size); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Overread SPS by %d bits\n", -get_bits_left(gb)); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ return 0; ++} ++ ++int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps, int apply_defdispwin) ++{ ++ HEVCRpiSPS *sps; ++ AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps)); ++ unsigned int sps_id; ++ int ret; ++ ptrdiff_t nal_size; ++ ++ if (!sps_buf) ++ return AVERROR(ENOMEM); ++ sps = (HEVCRpiSPS*)sps_buf->data; ++ ++ av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n"); ++ ++ nal_size = gb->buffer_end - gb->buffer; ++ if (nal_size > sizeof(sps->data)) { ++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS " ++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", ++ nal_size, sizeof(sps->data)); ++ sps->data_size = sizeof(sps->data); ++ } else { ++ sps->data_size = nal_size; ++ } ++ memcpy(sps->data, gb->buffer, sps->data_size); ++ ++ ret = ff_hevc_rpi_parse_sps(sps, gb, 
&sps_id, ++ apply_defdispwin, ++ ps->vps_list, avctx); ++ if (ret < 0) { ++ av_buffer_unref(&sps_buf); ++ return ret; ++ } ++ ++ if (avctx->debug & FF_DEBUG_BITSTREAM) { ++ av_log(avctx, AV_LOG_DEBUG, ++ "Parsed SPS: id %d; coded wxh: %dx%d; " ++ "cropped wxh: %dx%d; pix_fmt: %s.\n", ++ sps_id, sps->width, sps->height, ++ sps->width - (sps->output_window.left_offset + sps->output_window.right_offset), ++ sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset), ++ av_get_pix_fmt_name(sps->pix_fmt)); ++ } ++ ++ /* check if this is a repeat of an already parsed SPS, then keep the ++ * original one. ++ * otherwise drop all PPSes that depend on it */ ++ if (ps->sps_list[sps_id] && ++ !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size)) { ++ av_buffer_unref(&sps_buf); ++ } else { ++ remove_sps(ps, sps_id); ++ ps->sps_list[sps_id] = sps_buf; ++ } ++ ++ return 0; ++} ++ ++static void hevc_pps_free(void *opaque, uint8_t *data) ++{ ++ HEVCRpiPPS *pps = (HEVCRpiPPS*)data; ++ ++ av_freep(&pps->column_width); ++ av_freep(&pps->row_height); ++ av_freep(&pps->col_bd); ++ av_freep(&pps->row_bd); ++ av_freep(&pps->col_idxX); ++ av_freep(&pps->ctb_addr_rs_to_ts); ++ av_freep(&pps->ctb_addr_ts_to_rs); ++ av_freep(&pps->tile_pos_ts); ++ av_freep(&pps->tile_size); ++ av_freep(&pps->tile_id); ++ av_freep(&pps->ctb_ts_flags); ++ ++ av_freep(&pps); ++} ++ ++static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets) ++{ ++ do ++ { ++ const int offset = get_se_golomb_long(gb); ++ if (offset < -12 || offset > 12) { ++ av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", offset); ++ return AVERROR_INVALIDDATA; ++ } ++ *offsets++ = offset; ++ } while (n_minus_1-- != 0); ++ return 0; ++} ++ ++static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx, ++ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) ++{ ++ if 
(pps->transform_skip_enabled_flag) { ++ pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2; ++ } ++ pps->cross_component_prediction_enabled_flag = get_bits1(gb); ++ if (pps->cross_component_prediction_enabled_flag && ++ (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag)) ++ { ++ av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb); ++ if (pps->chroma_qp_offset_list_enabled_flag) { ++ int err; ++ ++ pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb); ++ pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb); ++ if (pps->chroma_qp_offset_list_len_minus1 > 5) { ++ av_log(avctx, AV_LOG_ERROR, ++ "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n"); ++ ++ if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 || ++ (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0) ++ return err; ++ } ++ ++ { ++ const unsigned int max_offset = sps->bit_depth > 10 ? 
sps->bit_depth - 10 : 0; ++ ++ pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb); ++ if (pps->log2_sao_offset_scale_luma > max_offset) { ++ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid"); ++ return AVERROR_INVALIDDATA; ++ } ++ pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb); ++ if (pps->log2_sao_offset_scale_chroma > max_offset) { ++ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid"); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ return(0); ++} ++ ++static inline int setup_pps(AVCodecContext * const avctx, ++ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps) ++{ ++ int pic_area_in_ctbs; ++ int i, j, x, y, ctb_addr_rs, tile_id; ++ ++ // Inferred parameters ++ ++ // qp_y -> qp_u/qp_v tables ++ // The tables have at least -24,+24 overrun after adding offset here ++ // which should allow for clipless offseting ++ ++ pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0; // No offset for luma, but may be useful for general code ++ pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0; ++ ++ if (sps->chroma_format_idc == 1) { ++ pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; ++ pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; ++ pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; ++ pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; ++ } ++ else ++ { ++ pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0; ++ pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0; ++ pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0; ++ pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0; ++ } ++ ++ pps->col_bd = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd)); ++ pps->row_bd = av_malloc_array(pps->num_tile_rows + 1, sizeof(*pps->row_bd)); ++ pps->col_idxX = av_malloc_array(sps->ctb_width, 
sizeof(*pps->col_idxX)); ++ if (!pps->col_bd || !pps->row_bd || !pps->col_idxX) ++ return AVERROR(ENOMEM); ++ ++ if (pps->uniform_spacing_flag) { ++ if (!pps->column_width) { ++ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); ++ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); ++ } ++ if (!pps->column_width || !pps->row_height) ++ return AVERROR(ENOMEM); ++ ++ for (i = 0; i < pps->num_tile_columns; i++) { ++ pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns - ++ (i * sps->ctb_width) / pps->num_tile_columns; ++ } ++ ++ for (i = 0; i < pps->num_tile_rows; i++) { ++ pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows - ++ (i * sps->ctb_height) / pps->num_tile_rows; ++ } ++ } ++ ++ { ++ const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift); ++ pps->col_bd[0] = 0; ++ pps->tile_wpp_inter_disable = 0; ++ for (i = 0; i < pps->num_tile_columns; i++) ++ { ++ pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i]; ++ ++ // Avoid trying tile parallel if the columns don't fall on cache boundries ++ // (this causes too much pain syncing flushes with the QPU) ++ // Ignore the final (RHS of pic) tile boundry ++ if ((pps->col_bd[i] & td_mask) != 0) { ++ pps->tile_wpp_inter_disable = 1; ++ } ++ } ++ ++ // If we can start the next row before finishing the first line of ++ // this one then we must wait at the end of the tile ++ // * if this happens a lot then there are better but more complicated ++ // conditions that we could apply ++ if (pps->tile_wpp_inter_disable) { ++ for (i = 0; i < pps->num_tile_rows; i++) ++ { ++ if (pps->row_height[i] <= RPI_MAX_JOBS) { ++ pps->tile_wpp_inter_disable = 2; ++ break; ++ } ++ } ++ } ++ } ++ ++ pps->row_bd[0] = 0; ++ for (i = 0; i < pps->num_tile_rows; i++) ++ pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i]; ++ ++ for (i = 0, j = 0; i < sps->ctb_width; i++) { ++ if (i >= pps->col_bd[j + 1]) ++ j++; ++ 
pps->col_idxX[i] = j; ++ } ++ ++ /** ++ * 6.5 ++ */ ++ pic_area_in_ctbs = sps->ctb_size; ++ ++ pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts)); ++ pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs)); ++ pps->tile_id = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_id)); ++ pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size)); ++ pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts)); ++ pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags)); ++ if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs || ++ !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) { ++ return AVERROR(ENOMEM); ++ } ++ ++ memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags)); ++ ++ for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) { ++ int tb_x = ctb_addr_rs % sps->ctb_width; ++ int tb_y = ctb_addr_rs / sps->ctb_width; ++ int tile_x = 0; ++ int tile_y = 0; ++ int val = 0; ++ ++ for (i = 0; i < pps->num_tile_columns; i++) { ++ if (tb_x < pps->col_bd[i + 1]) { ++ tile_x = i; ++ break; ++ } ++ } ++ ++ for (i = 0; i < pps->num_tile_rows; i++) { ++ if (tb_y < pps->row_bd[i + 1]) { ++ tile_y = i; ++ break; ++ } ++ } ++ ++ for (i = 0; i < tile_x; i++) ++ val += pps->row_height[tile_y] * pps->column_width[i]; ++ for (i = 0; i < tile_y; i++) ++ val += sps->ctb_width * pps->row_height[i]; ++ ++ val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] + ++ tb_x - pps->col_bd[tile_x]; ++ ++ pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val; ++ pps->ctb_addr_ts_to_rs[val] = ctb_addr_rs; ++ } ++ ++ { ++ uint8_t * pflags = pps->ctb_ts_flags; ++ uint16_t * ptid = pps->tile_id; ++ ++ for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++) ++ { ++ for (i = 0; i < pps->num_tile_columns; i++, tile_id++) ++ { ++ const unsigned int tile_w = 
pps->column_width[i]; ++ ++ pflags[0] |= CTB_TS_FLAGS_CIREQ; ++ ++ for (x = 0; x != tile_w; ++x) { ++ pflags[x] |= CTB_TS_FLAGS_TOT; ++ } ++ ++ for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++) ++ { ++ pflags[0] |= CTB_TS_FLAGS_SOTL; ++ ++ if (pps->entropy_coding_sync_enabled_flag) ++ { ++ if (pps->column_width[i] != 1) ++ pflags[1] |= CTB_TS_FLAGS_CSAVE; ++ else ++ pflags[0] |= CTB_TS_FLAGS_CIREQ; ++ ++ if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0) ++ pflags[0] |= CTB_TS_FLAGS_CLOAD; ++ } ++ ++ for (x = 0; x != tile_w; ++x) ++ *ptid++ = tile_id; ++ ++ pflags += tile_w; ++ pflags[-1] |= CTB_TS_FLAGS_EOTL; ++ if (i + 1 == pps->num_tile_columns) ++ pflags[-1] |= CTB_TS_FLAGS_EOL; ++ } ++ ++ pflags[-1] |= CTB_TS_FLAGS_EOT; ++ } ++ } ++ } ++ ++ { ++ unsigned int ts = 0; ++ for (j = 0; j < pps->num_tile_rows; j++) ++ for (i = 0; i < pps->num_tile_columns; i++) ++ { ++ const unsigned int size = pps->column_width[i] * pps->row_height[j]; ++ pps->tile_size[j * pps->num_tile_columns + i] = size; ++ pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts; ++ ts += size; ++ } ++ } ++ ++ return 0; ++} ++ ++int ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx, ++ HEVCRpiParamSets * const ps) ++{ ++ const HEVCRpiSPS *sps = NULL; ++ int i, ret = 0; ++ unsigned int pps_id = 0; ++ ptrdiff_t nal_size; ++ unsigned log2_parallel_merge_level_minus2; ++ ++ AVBufferRef *pps_buf; ++ HEVCRpiPPS *pps = av_mallocz(sizeof(*pps)); ++ ++ if (!pps) ++ return AVERROR(ENOMEM); ++ ++ pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps), ++ hevc_pps_free, NULL, 0); ++ if (!pps_buf) { ++ av_freep(&pps); ++ return AVERROR(ENOMEM); ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n"); ++ ++ nal_size = gb->buffer_end - gb->buffer; ++ if (nal_size > sizeof(pps->data)) { ++ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS " ++ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n", ++ nal_size, sizeof(pps->data)); ++ pps->data_size = sizeof(pps->data); ++ } else 
{ ++ pps->data_size = nal_size; ++ } ++ memcpy(pps->data, gb->buffer, pps->data_size); ++ ++ // Default values ++ pps->loop_filter_across_tiles_enabled_flag = 1; ++ pps->num_tile_columns = 1; ++ pps->num_tile_rows = 1; ++ pps->uniform_spacing_flag = 1; ++ pps->disable_dbf = 0; ++ pps->beta_offset = 0; ++ pps->tc_offset = 0; ++ pps->log2_max_transform_skip_block_size = 2; ++ ++ // Coded parameters ++ pps_id = get_ue_golomb_long(gb); ++ if (pps_id >= HEVC_MAX_PPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", pps_id); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->sps_id = get_ue_golomb_long(gb); ++ if (pps->sps_id >= HEVC_MAX_SPS_COUNT) { ++ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", pps->sps_id); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ if (!ps->sps_list[pps->sps_id]) { ++ av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data; ++ ++ pps->dependent_slice_segments_enabled_flag = get_bits1(gb); ++ pps->output_flag_present_flag = get_bits1(gb); ++ pps->num_extra_slice_header_bits = get_bits(gb, 3); ++ ++ pps->sign_data_hiding_flag = get_bits1(gb); ++ ++ pps->cabac_init_present_flag = get_bits1(gb); ++ ++ pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1; ++ if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) { ++ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1; ++ if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) { ++ av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ pps->pic_init_qp_minus26 = get_se_golomb(gb); ++ if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + 
sps->qp_bd_offset)) { ++ av_log(avctx, AV_LOG_ERROR, ++ "init_qp_minus26 %d is outside the valid range " ++ "[%d, %d].\n", ++ pps->pic_init_qp_minus26, ++ -(26 + sps->qp_bd_offset), 25); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ pps->constrained_intra_pred_flag = get_bits1(gb); ++ pps->transform_skip_enabled_flag = get_bits1(gb); ++ ++ pps->cu_qp_delta_enabled_flag = get_bits1(gb); ++ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size; ++ if (pps->cu_qp_delta_enabled_flag) ++ { ++ const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb); ++ ++ if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) { ++ av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %d is invalid\n", ++ diff_cu_qp_delta_depth); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth; ++ } ++ ++ pps->cb_qp_offset = get_se_golomb(gb); ++ if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) { ++ av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n", ++ pps->cb_qp_offset); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->cr_qp_offset = get_se_golomb(gb); ++ if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) { ++ av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n", ++ pps->cr_qp_offset); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb); ++ ++ pps->weighted_pred_flag = get_bits1(gb); ++ pps->weighted_bipred_flag = get_bits1(gb); ++ ++ pps->transquant_bypass_enable_flag = get_bits1(gb); ++ pps->tiles_enabled_flag = get_bits1(gb); ++ pps->entropy_coding_sync_enabled_flag = get_bits1(gb); ++ ++ if (pps->tiles_enabled_flag) { ++ pps->num_tile_columns = get_ue_golomb_long(gb) + 1; ++ pps->num_tile_rows = get_ue_golomb_long(gb) + 1; ++ if (pps->num_tile_columns <= 0 || ++ pps->num_tile_columns >= sps->width) { ++ av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n", 
++ pps->num_tile_columns - 1); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ if (pps->num_tile_rows <= 0 || ++ pps->num_tile_rows >= sps->height) { ++ av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n", ++ pps->num_tile_rows - 1); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width)); ++ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height)); ++ if (!pps->column_width || !pps->row_height) { ++ ret = AVERROR(ENOMEM); ++ goto err; ++ } ++ ++ pps->uniform_spacing_flag = get_bits1(gb); ++ if (!pps->uniform_spacing_flag) { ++ uint64_t sum = 0; ++ for (i = 0; i < pps->num_tile_columns - 1; i++) { ++ pps->column_width[i] = get_ue_golomb_long(gb) + 1; ++ sum += pps->column_width[i]; ++ } ++ if (sum >= sps->ctb_width) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum; ++ ++ sum = 0; ++ for (i = 0; i < pps->num_tile_rows - 1; i++) { ++ pps->row_height[i] = get_ue_golomb_long(gb) + 1; ++ sum += pps->row_height[i]; ++ } ++ if (sum >= sps->ctb_height) { ++ av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n"); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum; ++ } ++ pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb); ++ } ++ ++ pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb); ++ ++ pps->deblocking_filter_control_present_flag = get_bits1(gb); ++ if (pps->deblocking_filter_control_present_flag) { ++ pps->deblocking_filter_override_enabled_flag = get_bits1(gb); ++ pps->disable_dbf = get_bits1(gb); ++ if (!pps->disable_dbf) { ++ int beta_offset_div2 = get_se_golomb(gb); ++ int tc_offset_div2 = get_se_golomb(gb) ; ++ if (beta_offset_div2 < -6 || beta_offset_div2 > 6) { ++ av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of 
range: %d\n", ++ beta_offset_div2); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ if (tc_offset_div2 < -6 || tc_offset_div2 > 6) { ++ av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n", ++ tc_offset_div2); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->beta_offset = 2 * beta_offset_div2; ++ pps->tc_offset = 2 * tc_offset_div2; ++ } ++ } ++ ++ pps->scaling_list_data_present_flag = get_bits1(gb); ++ if (pps->scaling_list_data_present_flag) { ++ set_default_scaling_list_data(&pps->scaling_list); ++ ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps); ++ if (ret < 0) ++ goto err; ++ } ++ pps->lists_modification_present_flag = get_bits1(gb); ++ log2_parallel_merge_level_minus2 = get_ue_golomb_long(gb); ++ if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) { ++ av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %d\n", ++ log2_parallel_merge_level_minus2); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ pps->log2_parallel_merge_level = log2_parallel_merge_level_minus2 + 2; ++ ++ pps->slice_header_extension_present_flag = get_bits1(gb); ++ ++ if (get_bits1(gb)) { // pps_extension_present_flag ++ int pps_range_extensions_flag = get_bits1(gb); ++ skip_bits(gb, 7); // pps_extension_7bits ++ if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) { ++ if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0) ++ goto err; ++ } ++ } ++ ++ ret = setup_pps(avctx, pps, sps); ++ if (ret < 0) ++ goto err; ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(avctx, AV_LOG_ERROR, ++ "Overread PPS by %d bits\n", -get_bits_left(gb)); ++ ret = AVERROR_INVALIDDATA; ++ goto err; ++ } ++ ++ remove_pps(ps, pps_id); ++ ps->pps_list[pps_id] = pps_buf; ++ ++ return 0; ++ ++err: ++ av_buffer_unref(&pps_buf); ++ return ret; ++} ++ ++int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type) ++{ ++ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; ++ int 
prev_poc_lsb = pocTid0 % max_poc_lsb; ++ int prev_poc_msb = pocTid0 - prev_poc_lsb; ++ int poc_msb; ++ ++ if (poc_lsb < prev_poc_lsb && prev_poc_lsb - poc_lsb >= max_poc_lsb / 2) ++ poc_msb = prev_poc_msb + max_poc_lsb; ++ else if (poc_lsb > prev_poc_lsb && poc_lsb - prev_poc_lsb > max_poc_lsb / 2) ++ poc_msb = prev_poc_msb - max_poc_lsb; ++ else ++ poc_msb = prev_poc_msb; ++ ++ // For BLA picture types, POCmsb is set to 0. ++ if (nal_unit_type == HEVC_NAL_BLA_W_LP || ++ nal_unit_type == HEVC_NAL_BLA_W_RADL || ++ nal_unit_type == HEVC_NAL_BLA_N_LP) ++ poc_msb = 0; ++ ++ return poc_msb + poc_lsb; ++} +diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h +new file mode 100644 +index 0000000000..c725ebb9ca +--- /dev/null ++++ b/libavcodec/rpi_hevc_ps.h +@@ -0,0 +1,449 @@ ++/* ++ * HEVC parameter set parsing ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVC_PS_H ++#define AVCODEC_RPI_HEVC_PS_H ++ ++#include ++ ++#include "libavutil/buffer.h" ++#include "libavutil/pixfmt.h" ++#include "libavutil/rational.h" ++ ++#include "avcodec.h" ++#include "get_bits.h" ++#include "hevc.h" ++ ++typedef struct ShortTermRPS { ++ unsigned int num_negative_pics; ++ int num_delta_pocs; ++ int rps_idx_num_delta_pocs; ++ int32_t delta_poc[32]; ++ uint8_t used[32]; ++} ShortTermRPS; ++ ++typedef struct LongTermRPS { ++ int poc[32]; ++ uint8_t used[32]; ++ uint8_t nb_refs; ++} LongTermRPS; ++ ++typedef struct RpiSliceHeader { ++ unsigned int pps_id; ++ ++ ///< address (in raster order) of the first block in the current slice segment ++ unsigned int slice_segment_addr; ++ ///< address (in raster order) of the first block in the current slice ++ unsigned int slice_addr; ++ ++ enum HEVCSliceType slice_type; ++ ++ int pic_order_cnt_lsb; ++ ++ uint8_t first_slice_in_pic_flag; ++ uint8_t dependent_slice_segment_flag; ++ uint8_t pic_output_flag; ++ uint8_t colour_plane_id; ++ ++ ///< RPS coded in the slice header itself is stored here ++ int short_term_ref_pic_set_sps_flag; ++ int short_term_ref_pic_set_size; ++ ShortTermRPS slice_rps; ++ const ShortTermRPS *short_term_rps; ++ int long_term_ref_pic_set_size; ++ LongTermRPS long_term_rps; ++ unsigned int list_entry_lx[2][32]; ++ ++ uint8_t rpl_modification_flag[2]; ++ uint8_t no_output_of_prior_pics_flag; ++ uint8_t slice_temporal_mvp_enabled_flag; ++ ++ unsigned int nb_refs[2]; ++ ++ uint8_t slice_sample_adaptive_offset_flag[3]; ++ uint8_t mvd_l1_zero_flag; ++ ++ uint8_t cabac_init_flag; ++ uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag ++ uint8_t slice_loop_filter_across_slices_enabled_flag; ++ 
uint8_t collocated_list; ++ ++ uint8_t no_dblk_boundary_flags; ++ ++ unsigned int collocated_ref_idx; ++ ++ int slice_qp_delta; ++ int slice_cb_qp_offset; // -12, +12 ++ int slice_cr_qp_offset; // -12, +12 ++ ++ uint8_t cu_chroma_qp_offset_enabled_flag; ++ ++ int beta_offset; ///< beta_offset_div2 * 2 ++ int tc_offset; ///< tc_offset_div2 * 2 ++ ++ unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand ++ ++ unsigned *entry_point_offset; ++ int * offset; ++ int * size; ++ int num_entry_point_offsets; ++ int offsets_allocated; ++ ++ uint8_t offload_wpp; ++ uint8_t offload_tiles; ++ ++ int8_t slice_qp; ++ ++ uint8_t luma_log2_weight_denom; ++ uint8_t chroma_log2_weight_denom; ++ ++ int16_t luma_weight_l0[16]; // -128, +255 ++ int16_t luma_offset_l0[16]; ++ int16_t chroma_weight_l0[16][2]; ++ int16_t chroma_offset_l0[16][2]; ++ ++ int16_t luma_weight_l1[16]; ++ int16_t luma_offset_l1[16]; ++ int16_t chroma_weight_l1[16][2]; ++ int16_t chroma_offset_l1[16][2]; ++ ++} RpiSliceHeader; ++ ++typedef struct HEVCRpiWindow { ++ uint16_t left_offset; ++ uint16_t right_offset; ++ uint16_t top_offset; ++ uint16_t bottom_offset; ++} HEVCRpiWindow; ++ ++typedef struct VUI { ++ AVRational sar; ++ ++ int overscan_info_present_flag; ++ int overscan_appropriate_flag; ++ ++ int video_signal_type_present_flag; ++ int video_format; ++ int video_full_range_flag; ++ int colour_description_present_flag; ++ uint8_t colour_primaries; ++ uint8_t transfer_characteristic; ++ uint8_t matrix_coeffs; ++ ++ int chroma_loc_info_present_flag; ++ int chroma_sample_loc_type_top_field; ++ int chroma_sample_loc_type_bottom_field; ++ int neutra_chroma_indication_flag; ++ ++ int field_seq_flag; ++ int frame_field_info_present_flag; ++ ++ int default_display_window_flag; ++ HEVCRpiWindow def_disp_win; ++ ++ int vui_timing_info_present_flag; ++ uint32_t vui_num_units_in_tick; ++ uint32_t vui_time_scale; ++ int vui_poc_proportional_to_timing_flag; ++ int vui_num_ticks_poc_diff_one_minus1; ++ int 
vui_hrd_parameters_present_flag; ++ ++ int bitstream_restriction_flag; ++ int tiles_fixed_structure_flag; ++ int motion_vectors_over_pic_boundaries_flag; ++ int restricted_ref_pic_lists_flag; ++ int min_spatial_segmentation_idc; ++ int max_bytes_per_pic_denom; ++ int max_bits_per_min_cu_denom; ++ int log2_max_mv_length_horizontal; ++ int log2_max_mv_length_vertical; ++} VUI; ++ ++typedef struct PTLCommon { ++ uint8_t profile_space; ++ uint8_t tier_flag; ++ uint8_t profile_idc; ++ uint8_t profile_compatibility_flag[32]; ++ uint8_t level_idc; ++ uint8_t progressive_source_flag; ++ uint8_t interlaced_source_flag; ++ uint8_t non_packed_constraint_flag; ++ uint8_t frame_only_constraint_flag; ++} PTLCommon; ++ ++typedef struct PTL { ++ PTLCommon general_ptl; ++ PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS]; ++ ++ uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS]; ++ uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS]; ++} PTL; ++ ++typedef struct HEVCRpiVPS { ++ uint8_t vps_temporal_id_nesting_flag; ++ int vps_max_layers; ++ int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1 ++ ++ PTL ptl; ++ int vps_sub_layer_ordering_info_present_flag; ++ unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS]; ++ unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS]; ++ unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS]; ++ int vps_max_layer_id; ++ int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1 ++ uint8_t vps_timing_info_present_flag; ++ uint32_t vps_num_units_in_tick; ++ uint32_t vps_time_scale; ++ uint8_t vps_poc_proportional_to_timing_flag; ++ int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1 ++ int vps_num_hrd_parameters; ++ ++ uint8_t data[4096]; ++ int data_size; ++} HEVCRpiVPS; ++ ++typedef struct ScalingList { ++ /* This is a little wasteful, since sizeID 0 only needs 8 coeffs, ++ * and size ID 3 only has 2 arrays, not 6. 
*/ ++ uint8_t sl[4][6][64]; ++ uint8_t sl_dc[2][6]; ++} ScalingList; ++ ++typedef struct HEVCRpiSPS { ++ unsigned vps_id; ++ uint8_t chroma_format_idc; ++ uint8_t separate_colour_plane_flag; ++ ++ HEVCRpiWindow output_window; ++ ++ HEVCRpiWindow pic_conf_win; ++ ++ uint16_t wp_offset_half_range; // WpOffsetHalfRange ++ ++ uint8_t bit_depth; ++ ++// int bit_depth_chroma; // We only support lum_bit_depth = chroma_bit_depth ++ uint8_t pixel_shift; ++ enum AVPixelFormat pix_fmt; ++ ++ unsigned int log2_max_poc_lsb; ++ ++ int max_sub_layers; ++ struct { ++ int max_dec_pic_buffering; ++ int num_reorder_pics; ++ int max_latency_increase; ++ } temporal_layer[HEVC_MAX_SUB_LAYERS]; ++ uint8_t temporal_id_nesting_flag; ++ ++ uint8_t scaling_list_enable_flag; ++ ScalingList scaling_list; ++ ++ unsigned int nb_st_rps; ++ ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_REF_PIC_SETS]; ++ ++ uint8_t amp_enabled_flag; ++ uint8_t sao_enabled; ++ ++ uint8_t long_term_ref_pics_present_flag; ++ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS]; ++ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS]; ++ uint8_t num_long_term_ref_pics_sps; ++ ++ struct { ++ uint8_t bit_depth; ++ uint8_t bit_depth_chroma; ++ uint8_t log2_min_pcm_cb_size; ++ uint8_t log2_max_pcm_cb_size; ++ uint8_t loop_filter_disable_flag; ++ } pcm; ++ char sps_temporal_mvp_enabled_flag; ++// char sps_strong_intra_smoothing_enable_flag; -> intra_filtes_disable ++ ++ uint8_t log2_min_cb_size; // 3..6 ++ uint8_t log2_diff_max_min_coding_block_size; ++ uint8_t log2_min_tb_size; // 2..5 ++ uint8_t log2_max_trafo_size; ++ uint8_t log2_ctb_size; // 4..6 ++// unsigned int log2_min_pu_size; // 2..5 (min_cb_size - 1) ++#define LOG2_MIN_PU_SIZE 2 ++#define LOG2_MIN_CU_SIZE 3 ++ ++ uint8_t max_transform_hierarchy_depth_inter; ++ uint8_t max_transform_hierarchy_depth_intra; ++ ++ char transform_skip_rotation_enabled_flag; ++ char transform_skip_context_enabled_flag; ++ char implicit_rdpcm_enabled_flag; ++ char 
explicit_rdpcm_enabled_flag; ++// char intra_smoothing_disabled_flag; -> intra_filtes_disable ++ char high_precision_offsets_enabled_flag; ++ char persistent_rice_adaptation_enabled_flag; ++ ++ uint8_t intra_filters_disable; ++ ++ ///< coded frame dimension in various units ++ int width; ++ int height; ++ int ctb_width; ++ int ctb_height; ++ int ctb_size; // Pic size in CTBs not size of a CTB ++ int min_cb_width; ++ int min_cb_height; ++ int min_tb_width; ++ int min_tb_height; ++ int min_pu_width; ++ int min_pu_height; ++ int pcm_width; ++ int pcm_height; ++ int tb_mask; ++ ++ int hshift[3]; ++ int vshift[3]; ++ ++ int qp_bd_offset; ++ ++ uint8_t data[4096]; ++ int data_size; ++ ++ VUI vui; ++ PTL ptl; ++} HEVCRpiSPS; ++ ++#define CTB_TS_FLAGS_SOTL (1U << 0) // X start of tile line ++#define CTB_TS_FLAGS_EOTL (1U << 1) // Last CTB of a tile line ++#define CTB_TS_FLAGS_EOL (1U << 2) // Last CTB of a complete line ++#define CTB_TS_FLAGS_EOT (1U << 3) // Last CTB of a tile ++#define CTB_TS_FLAGS_CSAVE (1U << 4) ++#define CTB_TS_FLAGS_CIREQ (1U << 5) // Cabac init request ++#define CTB_TS_FLAGS_TOT (1U << 6) // CTB on top row of a tile ++#define CTB_TS_FLAGS_CLOAD (1U << 7) ++ ++typedef struct HEVCRpiPPS { ++ unsigned int sps_id; ///< seq_parameter_set_id ++ ++ uint8_t sign_data_hiding_flag; ++ ++ uint8_t cabac_init_present_flag; ++ ++ int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1 ++ int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1 ++ int pic_init_qp_minus26; ++ ++ uint8_t constrained_intra_pred_flag; ++ uint8_t transform_skip_enabled_flag; ++ ++ uint8_t cu_qp_delta_enabled_flag; ++ uint8_t log2_min_cu_qp_delta_size; ++ int cb_qp_offset; // -12..12 ++ int cr_qp_offset; // -12..12 ++ const uint8_t * qp_dblk_x[3]; ++ const int8_t * qp_bd_x[3]; ++ ++ uint8_t pic_slice_level_chroma_qp_offsets_present_flag; ++ uint8_t weighted_pred_flag; ++ uint8_t weighted_bipred_flag; ++ uint8_t output_flag_present_flag; 
++ uint8_t transquant_bypass_enable_flag; ++ ++ uint8_t dependent_slice_segments_enabled_flag; ++ uint8_t tiles_enabled_flag; ++ uint8_t entropy_coding_sync_enabled_flag; ++ ++ uint8_t tile_wpp_inter_disable; ++ int num_tile_columns; ///< num_tile_columns_minus1 + 1 ++ int num_tile_rows; ///< num_tile_rows_minus1 + 1 ++ uint8_t uniform_spacing_flag; ++ uint8_t loop_filter_across_tiles_enabled_flag; ++ ++ uint8_t seq_loop_filter_across_slices_enabled_flag; ++ ++ uint8_t deblocking_filter_control_present_flag; ++ uint8_t deblocking_filter_override_enabled_flag; ++ uint8_t disable_dbf; ++ int beta_offset; ///< beta_offset_div2 * 2 ++ int tc_offset; ///< tc_offset_div2 * 2 ++ ++ uint8_t scaling_list_data_present_flag; ++ ScalingList scaling_list; ++ ++ uint8_t lists_modification_present_flag; ++ int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2 ++ int num_extra_slice_header_bits; ++ uint8_t slice_header_extension_present_flag; ++ uint8_t log2_max_transform_skip_block_size; ++ uint8_t cross_component_prediction_enabled_flag; ++ uint8_t chroma_qp_offset_list_enabled_flag; ++ uint8_t diff_cu_chroma_qp_offset_depth; ++ uint8_t chroma_qp_offset_list_len_minus1; ++ int8_t cb_qp_offset_list[6]; ++ int8_t cr_qp_offset_list[6]; ++ uint8_t log2_sao_offset_scale_luma; ++ uint8_t log2_sao_offset_scale_chroma; ++ ++ // Inferred parameters ++ uint16_t *column_width; ///< ColumnWidth ++ uint16_t *row_height; ///< RowHeight ++ uint16_t *col_bd; ///< ColBd ++ uint16_t *row_bd; ///< RowBd ++ uint16_t *col_idxX; ++ ++ // We can limit these to uint16_t given our other size limits ++ uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS ++ uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS ++ uint16_t *tile_id; ///< TileId ++ uint16_t *tile_pos_ts; ///< TilePosRS ++ uint16_t *tile_size; ///< TileSize ++ uint8_t * ctb_ts_flags; ++ ++ uint8_t data[4096]; ++ int data_size; ++} HEVCRpiPPS; ++ ++typedef struct HEVCRpiParamSets { ++ /* currently active parameter sets */ ++ const 
HEVCRpiVPS *vps; ++ const HEVCRpiSPS *sps; ++ const HEVCRpiPPS *pps; ++ ++ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT]; ++ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT]; ++ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT]; ++} HEVCRpiParamSets; ++ ++int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps); ++int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps, int apply_defdispwin); ++int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx, ++ HEVCRpiParamSets *ps); ++ ++int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx, ++ ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header); ++ ++int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id, ++ uint8_t *buf, int buf_size); ++ ++/** ++ * Compute POC of the current frame and return it. ++ */ ++int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type); ++ ++#endif /* AVCODEC_RPI_HEVC_PS_H */ +diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c +new file mode 100644 +index 0000000000..8cc5796cf0 +--- /dev/null ++++ b/libavcodec/rpi_hevc_refs.c +@@ -0,0 +1,485 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/avassert.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "internal.h" ++#include "thread.h" ++#include "hevc.h" ++#include "rpi_hevcdec.h" ++ ++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags) ++{ ++ /* frame->frame can be NULL if context init failed */ ++ if (!frame->frame || !frame->frame->buf[0]) ++ return; ++ ++ frame->flags &= ~flags; ++ if (!frame->flags) { ++ ff_thread_release_buffer(s->avctx, &frame->tf); ++ ++ av_buffer_unref(&frame->col_mvf_buf); // OK if already NULL ++ frame->col_mvf = NULL; ++ ++ frame->collocated_ref = NULL; ++ } ++} ++ ++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s) ++{ ++ int i; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ++ HEVC_FRAME_FLAG_SHORT_REF | ++ HEVC_FRAME_FLAG_LONG_REF); ++} ++ ++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s) ++{ ++ int i; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); ++} ++ ++static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s) ++{ ++ int i, ret; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame * const frame = &s->DPB[i]; ++ if (frame->frame->buf[0]) ++ continue; ++ ++ ret = ff_thread_get_buffer(s->avctx, &frame->tf, ++ AV_GET_BUFFER_FLAG_REF); ++ if (ret < 0) ++ return NULL; ++ ++ frame->col_mvf = NULL; ++ frame->col_mvf_buf = NULL; ++ if (s->used_for_ref && !s->is_irap) ++ { ++ frame->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool); ++ if (!frame->col_mvf_buf) ++ goto fail; ++ frame->col_mvf = (ColMvField *)frame->col_mvf_buf->data; ++ } ++ ++ frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD; ++ 
frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD); ++ ++ return frame; ++ ++fail: ++ ff_hevc_rpi_unref_frame(s, frame, ~0); ++ return NULL; ++ } ++ av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n"); ++ return NULL; ++} ++ ++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc) ++{ ++ HEVCRpiFrame *ref; ++ int i; ++ ++ /* check that this POC doesn't already exist */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ ++ if (frame->frame->buf[0] && frame->sequence == s->seq_decode && ++ frame->poc == poc) { ++ av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n", ++ poc); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ ref = alloc_frame(s); ++ if (!ref) ++ return AVERROR(ENOMEM); ++ ++ *frame = ref->frame; ++ s->ref = ref; ++ ++ if (s->sh.pic_output_flag) ++ ref->flags = HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF; ++ else ++ ref->flags = HEVC_FRAME_FLAG_SHORT_REF; ++ ++ ref->poc = poc; ++ ref->sequence = s->seq_decode; ++ ref->frame->crop_left = s->ps.sps->output_window.left_offset; ++ ref->frame->crop_right = s->ps.sps->output_window.right_offset; ++ ref->frame->crop_top = s->ps.sps->output_window.top_offset; ++ ref->frame->crop_bottom = s->ps.sps->output_window.bottom_offset; ++ ++ return 0; ++} ++ ++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush) ++{ ++ do { ++ int nb_output = 0; ++ int min_poc = INT_MAX; ++ int i, min_idx, ret; ++ ++ if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) { ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc && ++ frame->sequence == s->seq_output) { ++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); ++ } ++ } ++ } ++ ++ for (i = 0; i < 
FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) && ++ frame->sequence == s->seq_output) { ++ nb_output++; ++ if (frame->poc < min_poc || nb_output == 1) { ++ min_poc = frame->poc; ++ min_idx = i; ++ } ++ } ++ } ++ ++ /* wait for more frames before output */ ++ if (!flush && s->seq_output == s->seq_decode && s->ps.sps && ++ nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics) ++ return 0; ++ ++ if (nb_output) { ++ HEVCRpiFrame *frame = &s->DPB[min_idx]; ++ if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1) ++ return 0; ++ ++ ret = av_frame_ref(out, frame->frame); ++ if (frame->flags & HEVC_FRAME_FLAG_BUMPING) ++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING); ++ else ++ ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT); ++ if (ret < 0) ++ return ret; ++ av_log(s->avctx, AV_LOG_DEBUG, ++ "Output frame with POC %d.\n", frame->poc); ++ return 1; ++ } ++ ++ if (s->seq_output != s->seq_decode) ++ s->seq_output = (s->seq_output + 1) & 0xff; ++ else ++ break; ++ } while (1); ++ ++ return 0; ++} ++ ++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s) ++{ ++ int dpb = 0; ++ int min_poc = INT_MAX; ++ int i; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if ((frame->flags) && ++ frame->sequence == s->seq_output && ++ frame->poc != s->poc) { ++ dpb++; ++ } ++ } ++ ++ if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) { ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if ((frame->flags) && ++ frame->sequence == s->seq_output && ++ frame->poc != s->poc) { ++ if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) { ++ min_poc = frame->poc; ++ } ++ } ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ if 
(frame->flags & HEVC_FRAME_FLAG_OUTPUT && ++ frame->sequence == s->seq_output && ++ frame->poc <= min_poc) { ++ frame->flags |= HEVC_FRAME_FLAG_BUMPING; ++ } ++ } ++ ++ dpb--; ++ } ++} ++ ++static int init_slice_rpl(HEVCRpiContext *s) ++{ ++ if (s->slice_idx >= s->rpl_tab_size) ++ return AVERROR_INVALIDDATA; ++ ++ s->refPicList = s->rpl_tab[s->slice_idx].refPicList + 0; ++ return 0; ++} ++ ++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s) ++{ ++ RpiSliceHeader *sh = &s->sh; ++ ++ uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1; ++ uint8_t list_idx; ++ int i, j, ret; ++ ++ ret = init_slice_rpl(s); ++ if (ret < 0) ++ return ret; ++ ++ if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs + ++ s->rps[LT_CURR].nb_refs)) { ++ av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ for (list_idx = 0; list_idx < nb_list; list_idx++) { ++ RefPicList rpl_tmp = { { 0 } }; ++ RefPicList *rpl = &s->refPicList[list_idx]; ++ ++ /* The order of the elements is ++ * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and ++ * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */ ++ int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF, ++ list_idx ? 
ST_CURR_BEF : ST_CURR_AFT, ++ LT_CURR }; ++ ++ /* concatenate the candidate lists for the current frame */ ++ while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) { ++ for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) { ++ RefPicList *rps = &s->rps[cand_lists[i]]; ++ for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) { ++ rpl_tmp.list[rpl_tmp.nb_refs] = rps->list[j]; ++ rpl_tmp.ref[rpl_tmp.nb_refs] = rps->ref[j]; ++ rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2; ++ rpl_tmp.nb_refs++; ++ } ++ } ++ } ++ ++ /* reorder the references if necessary */ ++ if (sh->rpl_modification_flag[list_idx]) { ++ for (i = 0; i < sh->nb_refs[list_idx]; i++) { ++ int idx = sh->list_entry_lx[list_idx][i]; ++ ++ if (idx >= rpl_tmp.nb_refs) { ++ av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ rpl->list[i] = rpl_tmp.list[idx]; ++ rpl->ref[i] = rpl_tmp.ref[idx]; ++ rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx]; ++ rpl->nb_refs++; ++ } ++ } else { ++ memcpy(rpl, &rpl_tmp, sizeof(*rpl)); ++ rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]); ++ } ++ ++ if (sh->collocated_list == list_idx && ++ sh->collocated_ref_idx < rpl->nb_refs) ++ s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx]; ++ } ++ ++ return 0; ++} ++ ++static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc) ++{ ++ int i; ++ int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *ref = &s->DPB[i]; ++ if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) { ++ if ((ref->poc & LtMask) == poc) ++ return ref; ++ } ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *ref = &s->DPB[i]; ++ if (ref->frame->buf[0] && ref->sequence == s->seq_decode) { ++ if (ref->poc == poc || (ref->poc & LtMask) == poc) ++ return ref; ++ } ++ } ++ ++ if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s)) ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Could not find ref with POC %d\n", 
poc); ++ return NULL; ++} ++ ++static void mark_ref(HEVCRpiFrame *frame, int flag) ++{ ++ frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF); ++ frame->flags |= flag; ++} ++ ++static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc) ++{ ++ HEVCRpiFrame *frame; ++ int i, x, y; ++ ++ frame = alloc_frame(s); ++ if (!frame) ++ return NULL; ++ ++ if (!s->ps.sps->pixel_shift) { ++ for (i = 0; frame->frame->buf[i]; i++) ++ memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1), ++ frame->frame->buf[i]->size); ++ } else { ++ for (i = 0; frame->frame->data[i]; i++) ++ for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++) ++ for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) { ++ AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x, ++ 1 << (s->ps.sps->bit_depth - 1)); ++ } ++ } ++ ++ frame->poc = poc; ++ frame->sequence = s->seq_decode; ++ frame->flags = 0; ++ ++ ff_hevc_rpi_progress_set_all_done(frame); ++ ++ return frame; ++} ++ ++/* add a reference with the given poc to the list and mark it as used in DPB */ ++static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list, ++ int poc, int ref_flag) ++{ ++ HEVCRpiFrame *ref = find_ref_idx(s, poc); ++ ++ if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS) ++ return AVERROR_INVALIDDATA; ++ ++ if (!ref) { ++ ref = generate_missing_ref(s, poc); ++ if (!ref) ++ return AVERROR(ENOMEM); ++ } ++ ++ list->list[list->nb_refs] = ref->poc; ++ list->ref[list->nb_refs] = ref; ++ list->nb_refs++; ++ ++ mark_ref(ref, ref_flag); ++ return 0; ++} ++ ++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s) ++{ ++ const ShortTermRPS *short_rps = s->sh.short_term_rps; ++ const LongTermRPS *long_rps = &s->sh.long_term_rps; ++ RefPicList *rps = s->rps; ++ int i, ret = 0; ++ ++ if (!short_rps) { ++ rps[0].nb_refs = rps[1].nb_refs = 0; ++ return 0; ++ } ++ ++ /* clear the reference flags on all frames except the current one */ ++ for (i = 0; i < 
FF_ARRAY_ELEMS(s->DPB); i++) { ++ HEVCRpiFrame *frame = &s->DPB[i]; ++ ++ if (frame == s->ref) ++ continue; ++ ++ mark_ref(frame, 0); ++ } ++ ++ for (i = 0; i < NB_RPS_TYPE; i++) ++ rps[i].nb_refs = 0; ++ ++ /* add the short refs */ ++ for (i = 0; i < short_rps->num_delta_pocs; i++) { ++ int poc = s->poc + short_rps->delta_poc[i]; ++ int list; ++ ++ if (!short_rps->used[i]) ++ list = ST_FOLL; ++ else if (i < short_rps->num_negative_pics) ++ list = ST_CURR_BEF; ++ else ++ list = ST_CURR_AFT; ++ ++ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF); ++ if (ret < 0) ++ goto fail; ++ } ++ ++ /* add the long refs */ ++ for (i = 0; i < long_rps->nb_refs; i++) { ++ int poc = long_rps->poc[i]; ++ int list = long_rps->used[i] ? LT_CURR : LT_FOLL; ++ ++ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF); ++ if (ret < 0) ++ goto fail; ++ } ++ ++fail: ++ /* release any frames that are now unused */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0); ++ ++ return ret; ++} ++ ++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s) ++{ ++ int ret = 0; ++ int i; ++ const ShortTermRPS *rps = s->sh.short_term_rps; ++ LongTermRPS *long_rps = &s->sh.long_term_rps; ++ ++ if (rps) { ++ for (i = 0; i < rps->num_negative_pics; i++) ++ ret += !!rps->used[i]; ++ for (; i < rps->num_delta_pocs; i++) ++ ret += !!rps->used[i]; ++ } ++ ++ if (long_rps) { ++ for (i = 0; i < long_rps->nb_refs; i++) ++ ret += !!long_rps->used[i]; ++ } ++ return ret; ++} +diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c +new file mode 100644 +index 0000000000..cd8149d58e +--- /dev/null ++++ b/libavcodec/rpi_hevc_sei.c +@@ -0,0 +1,368 @@ ++/* ++ * HEVC Supplementary Enhancement Information messages ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2013 Vittorio Giovara ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "golomb.h" ++#include "rpi_hevc_ps.h" ++#include "rpi_hevc_sei.h" ++ ++static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb) ++{ ++ int cIdx, i; ++ uint8_t hash_type; ++ //uint16_t picture_crc; ++ //uint32_t picture_checksum; ++ hash_type = get_bits(gb, 8); ++ ++ for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 
1 : 3)*/; cIdx++) { ++ if (hash_type == 0) { ++ s->is_md5 = 1; ++ for (i = 0; i < 16; i++) ++ s->md5[cIdx][i] = get_bits(gb, 8); ++ } else if (hash_type == 1) { ++ // picture_crc = get_bits(gb, 16); ++ skip_bits(gb, 16); ++ } else if (hash_type == 2) { ++ // picture_checksum = get_bits_long(gb, 32); ++ skip_bits(gb, 32); ++ } ++ } ++ return 0; ++} ++ ++static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb) ++{ ++ int i; ++ // Mastering primaries ++ for (i = 0; i < 3; i++) { ++ s->display_primaries[i][0] = get_bits(gb, 16); ++ s->display_primaries[i][1] = get_bits(gb, 16); ++ } ++ // White point (x, y) ++ s->white_point[0] = get_bits(gb, 16); ++ s->white_point[1] = get_bits(gb, 16); ++ ++ // Max and min luminance of mastering display ++ s->max_luminance = get_bits_long(gb, 32); ++ s->min_luminance = get_bits_long(gb, 32); ++ ++ // As this SEI message comes before the first frame that references it, ++ // initialize the flag to 2 and decrement on IRAP access unit so it ++ // persists for the coded video sequence (e.g., between two IRAPs) ++ s->present = 2; ++ return 0; ++} ++ ++static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb) ++{ ++ // Max and average light levels ++ s->max_content_light_level = get_bits_long(gb, 16); ++ s->max_pic_average_light_level = get_bits_long(gb, 16); ++ // As this SEI message comes before the first frame that references it, ++ // initialize the flag to 2 and decrement on IRAP access unit so it ++ // persists for the coded video sequence (e.g., between two IRAPs) ++ s->present = 2; ++ return 0; ++} ++ ++static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb) ++{ ++ get_ue_golomb_long(gb); // frame_packing_arrangement_id ++ s->present = !get_bits1(gb); ++ ++ if (s->present) { ++ s->arrangement_type = get_bits(gb, 7); ++ s->quincunx_subsampling = get_bits1(gb); ++ s->content_interpretation_type = get_bits(gb, 6); ++ ++ // 
spatial_flipping_flag, frame0_flipped_flag, field_views_flag ++ skip_bits(gb, 3); ++ s->current_frame_is_frame0_flag = get_bits1(gb); ++ // frame0_self_contained_flag, frame1_self_contained_flag ++ skip_bits(gb, 2); ++ ++ if (!s->quincunx_subsampling && s->arrangement_type != 5) ++ skip_bits(gb, 16); // frame[01]_grid_position_[xy] ++ skip_bits(gb, 8); // frame_packing_arrangement_reserved_byte ++ skip_bits1(gb); // frame_packing_arrangement_persistence_flag ++ } ++ skip_bits1(gb); // upsampled_aspect_ratio_flag ++ return 0; ++} ++ ++static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb) ++{ ++ s->present = !get_bits1(gb); ++ ++ if (s->present) { ++ s->hflip = get_bits1(gb); // hor_flip ++ s->vflip = get_bits1(gb); // ver_flip ++ ++ s->anticlockwise_rotation = get_bits(gb, 16); ++ skip_bits1(gb); // display_orientation_persistence_flag ++ } ++ ++ return 0; ++} ++ ++static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps, ++ void *logctx, int size) ++{ ++ HEVCSEIPictureTiming *h = &s->picture_timing; ++ HEVCRpiSPS *sps; ++ ++ if (!ps->sps_list[s->active_seq_parameter_set_id]) ++ return(AVERROR(ENOMEM)); ++ sps = (HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data; ++ ++ if (sps->vui.frame_field_info_present_flag) { ++ int pic_struct = get_bits(gb, 4); ++ h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN; ++ if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) { ++ av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n"); ++ h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD; ++ } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) { ++ av_log(logctx, AV_LOG_DEBUG, "TOP Field\n"); ++ h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD; ++ } ++ get_bits(gb, 2); // source_scan_type ++ get_bits(gb, 1); // duplicate_flag ++ skip_bits1(gb); ++ size--; ++ } ++ skip_bits_long(gb, 8 * size); ++ ++ return 0; ++} ++ ++static int 
decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb, ++ int size) ++{ ++ int flag; ++ int user_data_type_code; ++ int cc_count; ++ ++ if (size < 3) ++ return AVERROR(EINVAL); ++ ++ user_data_type_code = get_bits(gb, 8); ++ if (user_data_type_code == 0x3) { ++ skip_bits(gb, 1); // reserved ++ ++ flag = get_bits(gb, 1); // process_cc_data_flag ++ if (flag) { ++ skip_bits(gb, 1); ++ cc_count = get_bits(gb, 5); ++ skip_bits(gb, 8); // reserved ++ size -= 2; ++ ++ if (cc_count && size >= cc_count * 3) { ++ const uint64_t new_size = (s->a53_caption_size + cc_count ++ * UINT64_C(3)); ++ int i, ret; ++ ++ if (new_size > INT_MAX) ++ return AVERROR(EINVAL); ++ ++ /* Allow merging of the cc data from two fields. */ ++ ret = av_reallocp(&s->a53_caption, new_size); ++ if (ret < 0) ++ return ret; ++ ++ for (i = 0; i < cc_count; i++) { ++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); ++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); ++ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8); ++ } ++ skip_bits(gb, 8); // marker_bits ++ } ++ } ++ } else { ++ int i; ++ for (i = 0; i < size - 1; i++) ++ skip_bits(gb, 8); ++ } ++ ++ return 0; ++} ++ ++static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb, ++ int size) ++{ ++ uint32_t country_code; ++ uint32_t user_identifier; ++ ++ if (size < 7) ++ return AVERROR(EINVAL); ++ size -= 7; ++ ++ country_code = get_bits(gb, 8); ++ if (country_code == 0xFF) { ++ skip_bits(gb, 8); ++ size--; ++ } ++ ++ skip_bits(gb, 8); ++ skip_bits(gb, 8); ++ ++ user_identifier = get_bits_long(gb, 32); ++ ++ switch (user_identifier) { ++ case MKBETAG('G', 'A', '9', '4'): ++ return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size); ++ default: ++ skip_bits_long(gb, size * 8); ++ break; ++ } ++ return 0; ++} ++ ++static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx) ++{ ++ int num_sps_ids_minus1; ++ int 
i; ++ unsigned active_seq_parameter_set_id; ++ ++ get_bits(gb, 4); // active_video_parameter_set_id ++ get_bits(gb, 1); // self_contained_cvs_flag ++ get_bits(gb, 1); // num_sps_ids_minus1 ++ num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1 ++ ++ if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) { ++ av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ active_seq_parameter_set_id = get_ue_golomb_long(gb); ++ if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) { ++ av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id); ++ return AVERROR_INVALIDDATA; ++ } ++ s->active_seq_parameter_set_id = active_seq_parameter_set_id; ++ ++ for (i = 1; i <= num_sps_ids_minus1; i++) ++ get_ue_golomb_long(gb); // active_seq_parameter_set_id[i] ++ ++ return 0; ++} ++ ++static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb) ++{ ++ s->present = 1; ++ s->preferred_transfer_characteristics = get_bits(gb, 8); ++ return 0; ++} ++ ++static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps, ++ int type, int size) ++{ ++ switch (type) { ++ case 256: // Mismatched value from HM 8.1 ++ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); ++ case HEVC_SEI_TYPE_FRAME_PACKING: ++ return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb); ++ case HEVC_SEI_TYPE_DISPLAY_ORIENTATION: ++ return decode_nal_sei_display_orientation(&s->display_orientation, gb); ++ case HEVC_SEI_TYPE_PICTURE_TIMING: ++ return decode_nal_sei_pic_timing(s, gb, ps, logctx, size); ++ case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO: ++ return decode_nal_sei_mastering_display_info(&s->mastering_display, gb); ++ case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO: ++ return decode_nal_sei_content_light_info(&s->content_light, gb); ++ case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS: ++ 
return decode_nal_sei_active_parameter_sets(s, gb, logctx); ++ case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35: ++ return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size); ++ case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS: ++ return decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb); ++ default: ++ av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type); ++ skip_bits_long(gb, 8 * size); ++ return 0; ++ } ++} ++ ++static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, ++ int type, int size) ++{ ++ switch (type) { ++ case HEVC_SEI_TYPE_DECODED_PICTURE_HASH: ++ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb); ++ default: ++ av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type); ++ skip_bits_long(gb, 8 * size); ++ return 0; ++ } ++} ++ ++static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s, ++ const HEVCRpiParamSets * const ps, const int nal_unit_type) ++{ ++ int payload_type = 0; ++ int payload_size = 0; ++ int byte = 0xFF; ++ av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n"); ++ ++ while (byte == 0xFF) { ++ if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255) ++ return AVERROR_INVALIDDATA; ++ byte = get_bits(gb, 8); ++ payload_type += byte; ++ } ++ byte = 0xFF; ++ while (byte == 0xFF) { ++ if (get_bits_left(gb) < 8 + 8LL*payload_size) ++ return AVERROR_INVALIDDATA; ++ byte = get_bits(gb, 8); ++ payload_size += byte; ++ } ++ if (nal_unit_type == HEVC_NAL_SEI_PREFIX) { ++ return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size); ++ } else { /* nal_unit_type == NAL_SEI_SUFFIX */ ++ return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size); ++ } ++} ++ ++static int more_rbsp_data(GetBitContext *gb) ++{ ++ return get_bits_left(gb) > 0 && show_bits(gb, 8) != 0x80; ++} ++ ++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, ++ const HEVCRpiParamSets *ps, int 
type) ++{ ++ int ret; ++ ++ do { ++ ret = decode_nal_sei_message(gb, logctx, s, ps, type); ++ if (ret < 0) ++ return ret; ++ } while (more_rbsp_data(gb)); ++ return 1; ++} ++ ++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s) ++{ ++ s->a53_caption.a53_caption_size = 0; ++ av_freep(&s->a53_caption.a53_caption); ++} +diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h +new file mode 100644 +index 0000000000..d4ac348df9 +--- /dev/null ++++ b/libavcodec/rpi_hevc_sei.h +@@ -0,0 +1,135 @@ ++/* ++ * HEVC Supplementary Enhancement Information messages ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVC_SEI_H ++#define AVCODEC_RPI_HEVC_SEI_H ++ ++#include ++ ++#include "libavutil/md5.h" ++ ++#include "get_bits.h" ++ ++/** ++ * SEI message types ++ */ ++typedef enum { ++ HEVC_SEI_TYPE_BUFFERING_PERIOD = 0, ++ HEVC_SEI_TYPE_PICTURE_TIMING = 1, ++ HEVC_SEI_TYPE_PAN_SCAN_RECT = 2, ++ HEVC_SEI_TYPE_FILLER_PAYLOAD = 3, ++ HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35 = 4, ++ HEVC_SEI_TYPE_USER_DATA_UNREGISTERED = 5, ++ HEVC_SEI_TYPE_RECOVERY_POINT = 6, ++ HEVC_SEI_TYPE_SCENE_INFO = 9, ++ HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT = 15, ++ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16, ++ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END = 17, ++ HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS = 19, ++ HEVC_SEI_TYPE_POST_FILTER_HINT = 22, ++ HEVC_SEI_TYPE_TONE_MAPPING_INFO = 23, ++ HEVC_SEI_TYPE_FRAME_PACKING = 45, ++ HEVC_SEI_TYPE_DISPLAY_ORIENTATION = 47, ++ HEVC_SEI_TYPE_SOP_DESCRIPTION = 128, ++ HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS = 129, ++ HEVC_SEI_TYPE_DECODING_UNIT_INFO = 130, ++ HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX = 131, ++ HEVC_SEI_TYPE_DECODED_PICTURE_HASH = 132, ++ HEVC_SEI_TYPE_SCALABLE_NESTING = 133, ++ HEVC_SEI_TYPE_REGION_REFRESH_INFO = 134, ++ HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO = 137, ++ HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO = 144, ++ HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147, ++} HEVC_SEI_Type; ++ ++typedef struct HEVCSEIPictureHash { ++ uint8_t md5[3][16]; ++ uint8_t is_md5; ++} HEVCSEIPictureHash; ++ ++typedef struct HEVCSEIFramePacking { ++ int present; ++ int arrangement_type; ++ int content_interpretation_type; ++ int quincunx_subsampling; ++ int current_frame_is_frame0_flag; ++} HEVCSEIFramePacking; ++ ++typedef struct HEVCSEIDisplayOrientation { ++ int present; ++ int 
anticlockwise_rotation; ++ int hflip, vflip; ++} HEVCSEIDisplayOrientation; ++ ++typedef struct HEVCSEIPictureTiming { ++ int picture_struct; ++} HEVCSEIPictureTiming; ++ ++typedef struct HEVCSEIA53Caption { ++ int a53_caption_size; ++ uint8_t *a53_caption; ++} HEVCSEIA53Caption; ++ ++typedef struct HEVCSEIMasteringDisplay { ++ int present; ++ uint16_t display_primaries[3][2]; ++ uint16_t white_point[2]; ++ uint32_t max_luminance; ++ uint32_t min_luminance; ++} HEVCSEIMasteringDisplay; ++ ++typedef struct HEVCSEIContentLight { ++ int present; ++ uint16_t max_content_light_level; ++ uint16_t max_pic_average_light_level; ++} HEVCSEIContentLight; ++ ++typedef struct HEVCSEIAlternativeTransfer { ++ int present; ++ int preferred_transfer_characteristics; ++} HEVCSEIAlternativeTransfer; ++ ++typedef struct HEVCSEIContext { ++ HEVCSEIPictureHash picture_hash; ++ HEVCSEIFramePacking frame_packing; ++ HEVCSEIDisplayOrientation display_orientation; ++ HEVCSEIPictureTiming picture_timing; ++ HEVCSEIA53Caption a53_caption; ++ HEVCSEIMasteringDisplay mastering_display; ++ HEVCSEIContentLight content_light; ++ int active_seq_parameter_set_id; ++ HEVCSEIAlternativeTransfer alternative_transfer; ++} HEVCSEIContext; ++ ++struct HEVCRpiParamSets; ++ ++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s, ++ const struct HEVCRpiParamSets *ps, int type); ++ ++/** ++ * Reset SEI values that are stored on the Context. ++ * e.g. Caption data that was extracted during NAL ++ * parsing. ++ * ++ * @param s HEVCRpiContext. 
++ */ ++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s); ++ ++#endif /* AVCODEC_RPI_HEVC_SEI_H */ +diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c +new file mode 100644 +index 0000000000..23b49a99ae +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader.c +@@ -0,0 +1,1537 @@ ++#include "rpi_hevc_shader.h" ++ ++#ifdef _MSC_VER ++ #include ++ /* cast through uintptr_t to avoid warnings */ ++ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X)) ++#else ++ #define POINTER_TO_UINT(X) ((unsigned int)(X)) ++#endif ++ ++#ifdef __cplusplus ++extern "C" { /* the types are probably wrong... */ ++#endif ++#ifdef __cplusplus ++} ++#endif ++ ++#ifdef _MSC_VER ++__declspec(align(8)) ++#elif defined(__GNUC__) ++__attribute__((aligned(8))) ++#endif ++unsigned int ff_hevc_rpi_shader[] = { ++// ::mc_setup_c_q0 ++// ::mc_start ++/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_c_qn ++/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif ++/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift ++/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask ++/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif ++/* [0x00000068] */ 
0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch ++/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num ++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num ++/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif ++/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 ++/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000110] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000118] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00000148] */ 
0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif ++/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD ++/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y ++// :1 ++/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 ++/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++// ::mc_filter_c_p ++/* [0x00000218] */ 0x9581cff6, 
0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3 ++/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif ++/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val ++/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, 
ra_kmul_add ++/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 ++/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif ++// :1 ++/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 ++/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 ++/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b ++/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 
; mov rb6, ra7 ++/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a ++/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height ++/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b ++/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_c_p_l1 ++/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00000428] */ 0x8c8021f6, 
0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3 ++/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif ++/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val ++/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add ++/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 ++/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif ++// :1 ++/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; 
mov rb4, ra5 ; ldtmu1 ++/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x000004e8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax ++/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 ++/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b ++/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a ++/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00000580] */ 
0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height ++/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b ++/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_c_b ++/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 ++/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00000628] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif ++/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00000638] */ 
0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif ++/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif ++/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height ++/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 ++/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif ++/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif ++/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a ++/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b ++/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif ++/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif ++/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif ++/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y ++/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add ++/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif ++/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4 ++/* [0x000006e8] */ 
0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val ++/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1 ++/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1 ++/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d ++/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif ++/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d ++// :1 ++/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0 ++/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next ++/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0 ++/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1 ++/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000790] */ 
0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1 ++/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6 ++/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 ++/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax ++/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0 ++/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1 ++/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a ++/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b ++/* 
[0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b ++/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4 ++/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7 ++/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11 ++/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0 ++/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6 ++/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 ++/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add ++/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height ++/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7 ++/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x000008f8] */ 
0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_sync_q0 ++/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q1 ++/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q2 ++/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q3 ++/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000009e0] 
*/ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q4 ++/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q5 ++/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q6 ++/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q7 ++/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync_q8 ++/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* 
[0x00000ad0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q9 ++/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q10 ++/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync_q11 ++/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c_qn ++// ::mc_exit_y_qn ++/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00000bb8] */ 
0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c_q0 ++// ::mc_exit_y_q0 ++/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_setup_y_q0 ++/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_y_qn ++/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif ++/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask ++/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x00000c88] */ 0x050b0a00, 
0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 ++/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 ++/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 ++/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif ++/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1 ++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif ++/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch ++/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* 
[0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0 ++/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++// :1 ++/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 ++/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2 ++/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6 ++/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3 ++/* [0x00000df0] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0)) ++/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) ++/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5 ++/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 
++/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++// :per_block_setup_8 ++/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif ++/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) ++/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add ++/* 
[0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val ++/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 ++/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d ++/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c ++/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 ++/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ++/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif ++/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 ++/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d ++/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c ++/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d ++/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c ++/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d ++/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 ++/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00000fc0] */ 0x3a281100, 
0xe0020867, // mov r1,0x3a281100 ++/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif ++/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 ++// ::mc_filter_y_pxx ++/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 ++/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++// :1 ++/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 ++/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* 
[0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00001108] */ 
0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height ++/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next ++/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next ++/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b ++/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add 
ra_dma0, ra_dma0, r1 ++/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y_bxx ++/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 ++/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 ++/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++// :1 ++/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 ++/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* 
[0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00001308] */ 
0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 ++/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 ++/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add ++/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next ++/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 ++/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch ++/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 ++/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, 
r:1b ++/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y_p00 ++/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num ++/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 ++/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif ++/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a ++/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif ++/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif ++/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 ++/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif ++/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, 
r0, rb_dma0_base ++// :1 ++/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 ++/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b ++/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y_b00 ++/* [0x00001538] 
*/ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8 ++/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1 ++/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 ++/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 ++/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :1 ++/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 ++/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next ++/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax ++/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 ++/* [0x000015e8] */ 0x915c83f6, 
0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 ++/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b ++/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_setup_c10_q0 ++/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_c10_qn ++/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif ++/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1 ++/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift ++/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1 ++/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask ++/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << 
v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif ++/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif ++/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch ++/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num ++/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5 ++/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num ++/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0 ++/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x ++/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a ++/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1 ++/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif ++/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0 ++/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 ++/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x00001770] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, 
h16p(0, 0)) ++/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift ++/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a ++/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif ++/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch ++/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD ++/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2 ++/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y ++// :1 ++/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 ++/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0 ++/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001850] 
*/ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0 ++/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0 ++/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0 ++// ::mc_filter_c10_p ++/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif ++/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val ++/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, 
i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add ++/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 ++/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif ++// :1 ++/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 ++/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 ++/* [0x000019a0] */ 0x550c6ffe, 
0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b ++/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a ++/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height ++/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b ++/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_c10_p_l1 ++/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* 
[0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 ++/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif ++/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif ++/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif ++/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height ++/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif ++/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif ++/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val ++/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c ++/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 ++/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add ++/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 ++/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif ++// :1 ++/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; 
ldtmu1 ++/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next ++/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax ++/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10 ++/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b ++/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a ++/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0 ++/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, 
// asr r1, r1, 6 ; mov r3, ra_blk_height ++/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b ++/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_c10_b ++/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif ++/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif ++/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 ++/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif ++/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // 
min r0, r0, rb_max_x ; mov ra0, unif ++/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif ++/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul ++/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height ++/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif ++/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 ++/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif ++/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif ++/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a ++/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b ++/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif ++/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif ++/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif ++/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y ++/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add ++/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif ++/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4 ++/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x00001d18] */ 
0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val ++/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1 ++/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1 ++/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d ++/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif ++/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d ++// :1 ++/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0 ++/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next ++/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next ++/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y ++/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15 ++/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1 ++/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask ++/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0 ++/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1 ++/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 
4 @ "mul_used", 0 ++/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1 ++/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6 ++/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2 ++/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 ++/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15 ++/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch ++/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax ++/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0 ++/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1 ++/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a ++/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b ++/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b ++/* 
[0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4 ++/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7 ++/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11 ++/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0 ++/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6 ++/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 ++/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add ++/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height ++/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7 ++/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b ++/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add 
ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_sync10_q0 ++/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q1 ++/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q2 ++/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q3 ++/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i) ++/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync10_q4 ++/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, 
vw_wait ++/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q5 ++/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q6 ++/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002088] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q7 ++/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i) ++/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_sync10_q8 ++/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov 
dst, sacq(i) ++/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q9 ++/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q10 ++/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i) ++/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i) ++// ::mc_sync10_q11 ++/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov dst, srel(i) ++/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i) ++/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c10_q0 ++// ::mc_exit_y10_q0 ++/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov dst, 
sacq(i) ++/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1 ++/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_exit_c10_qn ++// ::mc_exit_y10_qn ++/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1 ++// :1 ++/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0 ++/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1 ++/* [0x00002228] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait ++/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend ++/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop ++/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop ++// ::mc_setup_y10_q0 ++/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i) ++// ::mc_setup_y10_qn ++/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif ++/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif ++/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif ++/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif ++/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100 ++/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask ++/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00 ++/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40 ++/* [0x000022b8] */ 0x0a0b0500, 
0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500 ++/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif ++/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif ++/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1 ++/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift ++/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1 ++/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif ++/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0) ++/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch ++/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3 ++/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2 ++/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch ++/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0 ++/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0 ++/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* 
[0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0 ++/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a ++/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a ++// :1 ++/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1 ++/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0 ++/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0 ++/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0 ++/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b ++/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y ++/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2 ++/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num ++/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1 ++/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4 ++/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1 ++/* [0x00002428] */ 0x159e7040, 0x10020827, // or r0, r0, r1 ++/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0)) ++/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1 ++/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) ++/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6 ++/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1 ++/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif ++/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0 ++/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0 ++/* 
[0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0 ++/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0 ++// :per_block_setup_10 ++/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x ++/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif ++/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a ++/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif ++/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0 ++/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3 ++/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif ++/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3 ++/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif ++/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init ++/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002518] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul ++/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0 ++/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; 
v8min r0, r0, ra_blk_height ++/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8) ++/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add ++/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val ++/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif ++/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif ++/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255 ++/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400 ++/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d ++/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c ++/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d ++/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c ++/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100 ++/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif ++/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 ++/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 ++/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif ++/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 ++/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d ++/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c ++/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d ++/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c ++/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100 ++/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror 
ra3.8c, r1, ra8.8d ++/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 ++/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link ++/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100 ++/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif ++/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 ++// ::mc_filter_y10_pxx ++/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5 ++/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++// :1 ++/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x00002660] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 ++/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* 
[0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* 
[0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height ++/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next ++/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next ++/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3 ++/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6 ++/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, 
i_shift23 ++/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b ++/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_p00 ++/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num ++/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 ++/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif ++/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift ++/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a ++/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif ++/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3 ++/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4 ++/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif ++/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2 ++/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif ++/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init ++/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift ++/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1 ++/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7 ++/* [0x000028b0] */ 
0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif ++/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base ++// :1 ++/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask ++/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8 ++/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b ++/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* 
[0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_bxx ++/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6 ++/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 ++/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++// :1 ++/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0 ++/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1 ++/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6 ++/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7 ++/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 ++/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9 ++/* [0x00002a10] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++/* [0x00002a18] */ 0x5501fff0, 
0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, 
r1, r0 ; mul24 r0, rb6, ra2.8c ++/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8 ++/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11 ++/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4 ++/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off ++/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6 ++/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 ++/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add ++/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next ++/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next ++/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8 ++/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height ++/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch ++/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3 ++/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b ++/* [0x00002b70] */ 
0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_filter_y10_b00 ++/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10 ++/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num ++/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 ++/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1 ++/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 ++/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 ++/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++// :1 ++/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 ++/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch ++/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0 ++/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next ++/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0 ++/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y ++/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax ++/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, 
r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 ++/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height ++/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b ++/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32 ++/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait ++/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 ++/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link ++/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1 ++/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest ++/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23 ++/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b ++/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0 ++/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1 ++/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init ++// ::mc_end ++}; ++#ifdef __HIGHC__ ++#pragma Align_to(8, ff_hevc_rpi_shader) ++#endif +diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h +new file mode 100644 +index 0000000000..79651c9b6c +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader.h +@@ -0,0 +1,63 @@ ++#ifndef rpi_hevc_shader_H ++#define rpi_hevc_shader_H ++ ++extern unsigned int ff_hevc_rpi_shader[]; ++ ++#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0) ++#define mc_start (ff_hevc_rpi_shader + 0) ++#define mc_setup_c_qn (ff_hevc_rpi_shader + 2) ++#define 
mc_filter_c_p (ff_hevc_rpi_shader + 134) ++#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260) ++#define mc_filter_c_b (ff_hevc_rpi_shader + 386) ++#define mc_sync_q0 (ff_hevc_rpi_shader + 580) ++#define mc_sync_q1 (ff_hevc_rpi_shader + 598) ++#define mc_sync_q2 (ff_hevc_rpi_shader + 610) ++#define mc_sync_q3 (ff_hevc_rpi_shader + 622) ++#define mc_sync_q4 (ff_hevc_rpi_shader + 634) ++#define mc_sync_q5 (ff_hevc_rpi_shader + 652) ++#define mc_sync_q6 (ff_hevc_rpi_shader + 664) ++#define mc_sync_q7 (ff_hevc_rpi_shader + 676) ++#define mc_sync_q8 (ff_hevc_rpi_shader + 688) ++#define mc_sync_q9 (ff_hevc_rpi_shader + 706) ++#define mc_sync_q10 (ff_hevc_rpi_shader + 718) ++#define mc_sync_q11 (ff_hevc_rpi_shader + 730) ++#define mc_exit_c_qn (ff_hevc_rpi_shader + 742) ++#define mc_exit_y_qn (ff_hevc_rpi_shader + 742) ++#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760) ++#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760) ++#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780) ++#define mc_setup_y_qn (ff_hevc_rpi_shader + 782) ++#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014) ++#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140) ++#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272) ++#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358) ++#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432) ++#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434) ++#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562) ++#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684) ++#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806) ++#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996) ++#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014) ++#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026) ++#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038) ++#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050) ++#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068) ++#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080) ++#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092) ++#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104) ++#define mc_sync10_q9 
(ff_hevc_rpi_shader + 2122) ++#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134) ++#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146) ++#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158) ++#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158) ++#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178) ++#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178) ++#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196) ++#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198) ++#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440) ++#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2566) ++#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654) ++#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786) ++#define mc_end (ff_hevc_rpi_shader + 2860) ++ ++#endif +diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm +new file mode 100644 +index 0000000000..af5b59e181 +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader.qasm +@@ -0,0 +1,1850 @@ ++# Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++# All rights reserved. ++# ++# Redistribution and use in source and binary forms, with or without ++# modification, are permitted provided that the following conditions are met: ++# * Redistributions of source code must retain the above copyright ++# notice, this list of conditions and the following disclaimer. ++# * Redistributions in binary form must reproduce the above copyright ++# notice, this list of conditions and the following disclaimer in the ++# documentation and/or other materials provided with the distribution. ++# * Neither the name of the copyright holder nor the ++# names of its contributors may be used to endorse or promote products ++# derived from this software without specific prior written permission. ++# ++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++# ++# Written by Peter de Rivaz, John Cox ++ ++ ++ ++# Inter pred asm ++# ++# Logic here should be good to 14 bits without modification ++# but only 8 & 10 are currently instantiated & tested ++# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow ++# in _p00 & _b00 ++ ++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress ++# the warning that we are using rotation & ra/rb registers. r0..3 can be ++# rotated through all 16 elems ra regs can only be rotated through their ++# local 4. As it happens this is what is wanted here as we do not want the ++# constants from the other half of the calc. ++ ++# Number limits in P/B calculation ++# ++# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier ++# we offset our intermediates s.t. they always end up +ve before the next ++# multiply (may be -ve whilst summing but that doesn't matter). 
++# ++# Range calc for up to 14 bits (Y-B pred): ++# ++# denom: [0, 7] ++# bmax = (1 << bits) - 1 ++# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1] ++# ++# wt_mul: [-128, 255] ++# wt_off = off * 2 + 1: [-bmax, bmax] ++# ++# pel: [0, bmax] ++# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff] ++# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e] ++# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6] ++# mul_t - (V_l0 + V_l1)* 128: [-0xc28e00, 0x18396e4] ++# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2): ++# [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000] ++# ++# This all looks good and is mostly bit depth independent - and as we manage ++# to do unsigned multiplies everywhere (now) this should be good for any bit ++# depth up to 14 (we could probably do 16 - but that requires a few tweaks ++# to the shifts we don't currently have logic for) ++ ++# PREREAD is the number of requests that we have sitting in the TMU request ++# queue. ++# ++# There are 8 slots available in the TMU request Q for tm0s requests, but ++# only 4 output FIFO entries and overflow is bad (corruption or crash) ++# (If threaded then only 2 out FIFO entries, but we aren't.) ++# In s/w we are effectively limited to the min vertical read which is >= 4 ++# so output FIFO is the limit. 
++# ++# As the test for read-next is in the main part of the Luma loop (rather than ++# the preload FIFO part) we are limited to min_luma_height - 1 ++# Min_luma_height is 4 so we can only have a preload of 3 ++# Beware that min_chroma_height (and_width) is 2 so we can't do the same trick ++# in chroma without abandoning preload pretty much entirely (which would be bad) ++# ++# Timing tests vs preload of 4 suggests this doesn't hurt us much ++# Could have preread 4 for Chroma but when tested it didn't help ++ ++.set PREREAD, 3 ++ ++# Offset added (effectively) at the exit of the H FIR filter ++# This is enough to force the result +ve ++# Is good if it is a power of 2 as that allows for >> without loss ++# ++# Worst case for a single Y FIR is *-22 so we need an offset of 256*22 ++# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00 ++# Round up to next power of 2 ++ ++.set FIR_OFFSET, 0x4000 ++ ++# Block heights - 8 & 16 are the only numbers we currently support ++ ++.set C_BLK_HEIGHT_8, 16 ++.set C_BLK_HEIGHT_16, 8 ++.set Y_BLK_HEIGHT_8, 16 ++.set Y_BLK_HEIGHT_16, 8 ++ ++# QPU counts - depend on block size ++# If we have a 2-byte format & block_size > 8 then can only afford ++# 8 QPUs ++# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h ++ ++.set N_QPU_8, 12 ++.set N_QPU_16, 12 ++ ++# Value to add to the weight multiplier to convert it into an unsigned value ++# Should be power of two for convenience ++ ++.set LOG2_MUL_ADD, 14 ++.set MUL_ADD, (1 << LOG2_MUL_ADD) ++ ++# Fixed denom (max that it can be set to) ++.set DENOM, 7 ++ ++# register allocation ++# ++ ++# ra0-3 ++# Used as temp and may be loop filter coeffs (split into .8s) ++# or temp in loop. Check usage on an individual basis. 
++ ++# ra4-11 ++# V FIFO / temp / free ++ ++# -- free -- ra12 ++ ++# -- free -- ra13 ++ ++# -- free -- ra14 ++ ++# -- free -- ra15 ++ ++# uniform: width:height ++.set ra_width_height, ra16 ++.set ra_width, ra16.16b ++.set ra_height, ra16.16a ++ ++# y:y2 same layout as y_y2_next so we can update both together ++.set ra_y_y2, ra17 ++.set ra_y2, ra17.16a ++.set ra_y, ra17.16b ++ ++# uniform: L1 weight (U on left, V on right) ++# Only used in Y B ++.set ra_wt_off_mul_l1, ra18 ++.set ra_wt_off_l1, ra18.16b ++.set ra_wt_mul_l1, ra18.16a ++ ++# y_next:y2_next same layout as y_y2 so we can update both together ++.set ra_y_y2_next, ra19 ++.set ra_y_next, ra19.16b ++.set ra_y2_next, ra19.16a ++ ++# Setup: consts - subdivide a single register ++.set ra_kff800100, ra20 ++.set ra_k256, ra20.16a ++.set ra_k0, ra20.8a ++.set ra_k1, ra20.8b ++.set ra_k128, ra20.8c ++.set ra_k255, ra20.8d ++ ++# Loop: xshifts ++.set ra_xshift, ra21.16a ++.set ra_xshift_next, ra21.16b ++ ++# Loop var: L0 weight (U on left, V on right) ++# _off_ is not used in loop as we want to modify it before use ++.set ra_wt_off_mul_l0, ra22 ++.set ra_wt_mul_l0, ra22.16a ++.set ra_wt_off_l0, ra22.16b ++ ++# Max pel value (for 8 bit we can get away with sat ops but not 9+) ++# * Could merge with rb_pmask. 
For 10 bit Logically pmask needs 0xff in the ++# 2nd byte but as the source should never be > 3 there 0x3ff should do ++.set ra_blk_height_pmax, ra23 ++.set ra_pmax, ra23.16a ++.set ra_blk_height, ra23.8c ++# --free -- ra23.8d ++ ++# Loop: src frame base (L0) ++.set ra_base, ra24 ++ ++# Misc offsets ++.set ra_fir_off_val_wt_den_p7, ra25 ++.set ra_wt_den_p7, ra25.8a ++# -- free -- ra25.8b ++.set ra_fir_off_val, ra25.16b ++ ++# As it happens these constants are the same ++.if FIR_OFFSET == MUL_ADD ++# Weight multiplier unsigned add ++.set ra_kmul_add, ra_fir_off_val ++.else ++.error "FIR_OFFSET != MUL_ADD: Need new register & init" ++.endif ++ ++# Loop: next src frame base (L0) ++.set ra_base_next, ra26 ++ ++# Loop: height<<23 + width<<16 + vdw_setup_0 ++.set ra_dma0, ra27 ++ ++# Loop: destination address ++.set ra_dest, ra28 ++ ++# Setup: Dup of rb_ef ++# Lo bits are used as Y coeff 0 as that lets us combine test & coeff mul ++# (top bits are ignored by mul24) ++.set ra_ef, ra29 ++ ++# Use an even numbered register as a link register to avoid corrupting flags ++.set ra_link, ra30 ++ ++# -- free -- ra31 ++ ++.set rb_xshift2, rb0 ++.set rb_xshift2_next, rb1 ++ ++# C: (elem & 1) == 0 ? 
elem * 2 : (elem + 4) * 2 ++.set rb_elem_x, rb2 ++ ++# El Flags ++# After adding to self we have el even/odd on nc/c and lo/hi on nn/n ++# Duped into ra_ef as sometimes that is easier to use ++.set rb_ef, rb3 ++ ++# rb4-11 ++# Loop: V filter FIFO or V filter coeff ++ ++# Loop var: offset to add before shift (round + weighting offsets) ++# Exact value varies by loop ++.set rb_wt_off, rb12 ++ ++# -- free -- rb13 ++ ++# -- free -- rb14 ++ ++# Loop: src frame base (L1) ++.set rb_base2, rb15 ++ ++# Line pitch (128 for sand128) ++.set rb_pitch, rb16 ++ ++# Loop count - 2 (set up TMU for next xfer) ++.set rb_i_tmu, rb17 ++ ++# Loop count for min(height, 16) ++# Y will reset & loop again if height > 16 ++.set rb_lcount, rb18 ++ ++# frame_base2_next ++.set rb_base2_next, rb19 ++ ++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give ++# offset to the slice ++.set rb_xpitch, rb20 ++ ++# These 3 consts each save 1 instruction in Y loop setup ++# so whilst they are worthwhile they should be the 1st to die if we need ++# another b reg ++.set rb_y_coeffs_2, rb21 # 0x050b0a00 ++.set rb_y_coeffs_3, rb22 # 0x11283a40 ++.set rb_y_coeffs_5, rb23 # 0x0a0b0500 ++ ++# Setup: 0xff (8-bit) / 0xffff (9+ bit) ++.set rb_pmask, rb24 ++ ++# vdw_setup_1(dst_pitch) ++.set rb_dma1_base, rb25 ++ ++# Setup: pic width - 1 ++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc. ++.set rb_max_x, rb26 ++ ++# vdw_setup_0 (depends on QPU number) ++.set rb_dma0_base, rb27 ++ ++# Setup: vw_setup value to reset VPM write pointer ++.set rb_vpm_init, rb28 ++ ++# Loop: vdw_setup_1(dst_pitch-width) = stride ++.set rb_dma1, rb29 ++ ++# Setup: pic_height - 1 ++.set rb_max_y, rb30 ++ ++# Setup: FIR H offset ++.set rb_fir_off_h, rb31 ++ ++ ++# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc. 
++.set i_shift16, -16 ++.set i_shift21, -11 ++.set i_shift23, -9 ++.set i_shift30, -2 ++ ++# Much of the setup code is common between Y & C ++# Macros that express this - obviously these can't be overlapped ++# so are probably unsuitable for loop code ++ ++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma ++ mov r2, qpu_num ++.if v_bit_depth <= 8 ++ # 8 bit version ++ asr r1, r2, 2 ++ shl r1, r1, 6 ++ and r0, r2, 3 ++ or r0, r0, r1 ++ ++ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit ++ add r_vpm, r0, r1 # VPM 8bit storage ++ ++ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later ++ shl r0, r0, 5 ++ ++.else ++ # 16 bit version ++ # Limited to 8 QPUs if blk height > 8 ++ asr r1, r2, 1 ++.if v_blk_height <= 8 ++ shl r1, r1, 4 ++.else ++ shl r1, r1, 5 ++.endif ++ and r0, r2, 1 ++ or r0, r0, r1 ++ ++ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR ++ add r_vpm, r0, r1 ++ ++ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into ++ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg) ++ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later ++ shl r0, r0, 6 ++.endif ++ add r_dma, r0, r1 # DMA out ++.endm ++ ++ ++.macro m_setup_q0 ++ srel -, 12 ++.endm ++ ++# Code start label ++::mc_start ++ ++################################################################################ ++# mc_setup_c ++# ++# typedef struct qpu_mc_pred_c_s_s { ++# int16_t y; ++# int16_t x; ++# uint32_t base; ++# uint32_t pic_cw; // C Width (== Y width / 2) ++# uint32_t pic_ch; // C Height (== Y Height / 2) ++# uint32_t stride2; ++# uint32_t stride1; ++# uint32_t wdenom; ++# int16_t y2; ++# int16_t x2; ++# uint32_t base2; ++# uint32_t next_fn; ++# } qpu_mc_pred_c_s_t; ++ ++.macro m_setup_c, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_pmask, 0xff ++.set 
v_blk_height, C_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 2 ++.set v_pmask, 0xffff ++.set v_blk_height, C_BLK_HEIGHT_16 ++.endif ++ ++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y ++ ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 ; mov ra_base, unif # ; ref_c_base ++ ++# Read image dimensions ++ sub r0, unif, 1 # pic c width ++ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes ++ sub rb_max_y, unif, 1 # pic c height ++ ++# load constants ++ mov ra_kff800100, 0xff800100 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++ ++# get source pitch ++ mov ra_ef, rb_ef ; mov rb_xpitch, unif # ; stride2 ++ mov rb_pitch, unif # stride1 ++ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly ++ add rb_dma1_base, r1, rb_pitch # vdw_setup_1 ++ ++ and r0, 1, elem_num ++ nop ; mul24 r0, r0, 5 ++.if v_bit_depth <= 8 ++ add rb_elem_x, r0, elem_num ++.else ++ add r0, r0, elem_num ++ add rb_elem_x, r0, r0 ++.endif ++ ++# Compute base address for first and second access ++# ra_base ends up with t0s base ++# rb_base2 ends up with t1s base ++ ++ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay] ++ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice ++ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y ++ min r0, r0, rb_max_x ++ ++# Get shift ++# Shift will always calculate as 0 for 9+ bit ++# Ideally we can optimize the shift out of the code in these cases but for now ++# it is tidier to leave it in ++.if v_bit_depth <= 8 ++ shl ra_xshift_next, r0, 3 ++.else ++ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0 ++.endif ++ ++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to ++ ++.if v_bit_depth <= 8 ++ and r0, r0, -4 ++.endif ++ sub r1, ra_k0, rb_pitch ++ and r1, r0, r1 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add 
r0, r0, r1 ; mov ra0, unif # ; next_x2_y2 ++ add ra_base, ra_base, r0 ++ ++# Compute part of VPM to use for DMA output ++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop? ++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base ++ ++# And again for L1, but only worrying about frame2 stuff ++ ++# Compute base address for first and second access ++# ra_base ends up with t0s base ++# rb_base2 ends up with t1s base ++ ++ shl r0, ra0.16b, v_x_shift ++ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset ++ max r0, r0, 0 ; mov rb_base2, unif # ref_c_base2 ++ min r0, r0, rb_max_x ++ ++# Get shift (already zero if 9+ bit so ignore) ++.if v_bit_depth <= 8 ++ shl rb_xshift2_next, r0, 3 ++.endif ++ ++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs ++ ++.if v_bit_depth <= 8 ++ and r0, r0, -4 ++.endif ++ sub r1, ra_k0, rb_pitch ++ and r1, r0, r1 ; mov r3, PREREAD ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r2, ra_y2 ++ add rb_base2, rb_base2, r0 ; mov r0, ra_y ++ ++# Do preloads ++# r0 = ra_y, r2 = ra_y2, r3 = PREREAD ++ ++:1 ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 ++ min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 ++ ++ max r1, r2, 0 ++ brr.anynz -, r:1b ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, rb_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz 1b ++ ++ mov ra_link, unif # link ++# touch registers to keep simulator happy (and fills in delay slots) ++ mov ra4, 0 ; mov rb4, 0 ++ bra -, ra_link ++ mov ra5, 0 ; mov rb5, 0 ++ mov ra6, 0 ; mov rb6, 0 ++ mov ra7, 0 ; mov rb7, 0 ++# >>> ra_link ++.endm ++ ++::mc_setup_c_q0 ++ m_setup_q0 ++::mc_setup_c_qn ++ m_setup_c 8 ++ ++################################################################################ ++# ++# mc_filter_c_p ++# ++# typedef struct qpu_mc_pred_c_p_s { ++# int16_t y; ++# int16_t x; ++# uint32_t base; ++# uint16_t h; ++# uint16_t w; ++# 
uint32_t coeffs_x; ++# uint32_t coeffs_y; ++# uint32_t wo_u; ++# uint32_t wo_v; ++# uint32_t dst_addr_c; ++# uint32_t next_fn; ++# } qpu_mc_pred_c_p_t; ++ ++.macro m_filter_c_p, v_tmu, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_x_mul, 4 ++.set v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_tmu == 0 ++.set vrx_xshift, rb_xshift2 # b side more convenient ++.set vrx_xshift_next, ra_xshift_next ++.set vra_y_next, ra_y_next ++.set vrx_base_next, ra_base_next ++.set vra_y, ra_y ++.set vra_base, ra_base ++.set vr_txs, t0s ++.else ++.set vrx_xshift, ra_xshift # a side more convenient ++.set vrx_xshift_next, rb_xshift2_next ++.set vra_y_next, ra_y2_next ++.set vrx_base_next, rb_base2_next ++.set vra_y, ra_y2 ++.set vra_base, rb_base2 ++.set vr_txs, t1s ++.endif ++ ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ ++# per-channel shifts were calculated on the *previous* invocation ++# get base addresses and per-channel shifts for *next* invocation ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y ++ ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base ++ ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0 ++ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs ++ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next ++ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a ++ ++.if v_bit_depth <= 8 ++ shl vrx_xshift_next, r0, 3 ++ and r0, r0, -4 ++.endif ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced! 
++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs ++ add vrx_base_next, r3, r0 ; mov r1, ra_height ++ ++# set up VPM write ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight ++ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight ++ ++# Misc final setup... ++ ++ shl r0, r1, v_dma_h_shift ; mov ra_dest, unif # ; dst_addr ++ add r0, r0, r2 ; mov r2, ra_fir_off_val # Combine width and height of destination area (r0=h<<8, r2=w*2) ++ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register ++ add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight ++ shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0 ++ sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add ++ add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 # ; loop counter (V FIFO fill = 4) ++ mov rb11, ra3.8d ; mov ra_link, unif # ; Link ++ ++# r5 = -4 (loop counter) ++# ra_wt_mul_l0 = weight L0 + MUL_ADD (now unsigned) ++# rb_wt_off = (offset * 2 + 1) << (wt_den + 5) ++# rb31 = FIR value offset ++ ++# FIFO: rb4, ra5, rb6, ra7 ++# Coeffs in ra3.8a, ra3.8b, rb10, rb11 ++ ++# We want (r0r1) ++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ... ++# We fetch (after shift) ++# C0 : C3 : C1 : C4 : C2 : C5 : ... 
++ ++:1 ++# retrieve texture results and pick out bytes ++# then submit one more texture request ++ ++.if v_tmu == 0 ++ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0 ++ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next ++.else ++ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1 ++ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next ++ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y ++ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next # [r1 << delay] ++.endif ++ ++ add vra_y, r3, ra_k1 ; mov r0, r1 << 15 ++ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1 ++ min r3, r3, rb_max_y ; mov.ifnc r0, r2 ++ ++ and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch ++.if v_tmu == 0 ++ add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask # ; mask bytes ++.else ++ add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax # ; mask bytes ++.endif ++ ++# apply horizontal filter ++# The filter coeffs for the two halves of this are the same (unlike in the ++# Y case) so it doesn't matter which ra0 we get them from ++# Also as the two halves are locked together we don't need to separate the 1st ++# r0 mul or the last r1 mul as they are valid for all QPUs ++ ++ add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0 ++ sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1 ++ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0 ++ add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0 ++ ++# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift) ++# We would like to save the r5->r4 shift but we need a delay slot ++# for both r7 & r6 which we can't find anything to put in if we have ++# already multiplied r4 & r5! 
++ brr.anyn -, r:1b ++ add r2, r2, r3 ; mul24 r0, ra7, rb10 # r6 post ++ mov ra5, rb6 ; mul24 r1, rb6, ra3.8b # r5 post ++ asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7 ++# >>> .anyn 1b ++ ++ add r1, r1, r0 ; mul24 r0, rb4, ra3.8a # [ra7 delay] ++ sub r1, r1, r0 ; mul24 r0, ra7, rb11 ++ sub r1, r1, r0 ++ ++ asr r1, r1, 6 ; mov r3, ra_blk_height # ; NxtLoop ++ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop ++ brr.anyn -, r:1b ++ asr r1, r1, i_wt_den_p6 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++# >>> .anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_c_p ++ m_filter_c_p 0, 8 ++ ++::mc_filter_c_p_l1 ++ m_filter_c_p 1, 8 ++ ++################################################################################ ++# ++# mc_filter_c_b ++# ++# typedef struct qpu_mc_pred_c_b_s { ++# int16_t y; ++# int16_t x; ++# uint32_t base; ++# uint16_t h; ++# uint16_t w; ++# uint32_t coeffs_x1; ++# uint32_t coeffs_y1; ++# int16_t weight_u1; ++# int16_t weight_v1; ++# int16_t y2; ++# int16_t x2; 
++# uint32_t base2; ++# uint32_t coeffs_x2; ++# uint32_t coeffs_y2; ++# uint32_t wo_u2; ++# uint32_t wo_v2; ++# uint32_t dst_addr_c; ++# uint32_t next_fn; ++# } qpu_mc_pred_c_b_t; ++ ++.macro m_filter_c_b, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 1 ++.set v_v_shift, 8 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 2 ++.set v_v_shift, i_shift16 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++.set v_x_mul, (1 << v_x_shift) ++ ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ ++# per-channel shifts were calculated on the *previous* invocation ++ ++# get base addresses and per-channel shifts for *next* invocation ++ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y ++ ++ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base ++ ++ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0 ++ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a ++ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ; mov ra0, unif # ; L0 H filter coeffs ++ ++.if v_bit_depth <= 8 ++ shl ra_xshift_next, r0, 3 ++.endif ++ ++ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs ++ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=x*2 (we are working in pel pairs) ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height ++ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B ++ ++# set up VPM write ++ ++ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight ++ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height ++ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 
# ; V weight ++ ++ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2 ++ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base ++ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register ++ add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b # r0=x ++ ++# L1 - uniform layout could possibly be optimized ++ ++ shl r0, r0, v_x_shift ; mov ra1, unif # r0=x<>> .anyn 1b ++ ++ sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b # L1 ; L0 ++ sub.setf -, r5, rb_lcount ; mov r0, ra4 ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ add r1, r1, r0 ; mul24 r0, ra7, rb7 ++ ++ sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c # L1 ++ add r2, r2, r0 ; mul24 r0, ra11, rb11 # L1 ++ sub r2, r2, r0 ++ ++ shr r1, r1, 6 ++ shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0 ++ add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1 ++ add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add ++ sub r1, r1, r2 ; mov r3, ra_blk_height # ; NxtLoop ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 # ; NxtLoop ++ ++ brr.anyn -, r:1b ++ asr r1, r1, ra_wt_den_p7 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++# >>> .anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write 
pointer ++# >>> 1b ++.endm ++ ++::mc_filter_c_b ++ m_filter_c_b 8 ++ ++################################################################################ ++# Exit code used by both Luma & Chroma so place between them to avoid I-cache ++# conflicts ++ ++.macro m_exit_drain ++.if PREREAD == 2 ++# Special case 2 as loop is wasteful ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ nop ; nop ; ldtmu0 ++ mov -, vw_wait ; nop ; ldtmu1 ++.else ++ mov.setf r3, PREREAD - 1 ++:1 ++ brr.anynz -, r:1b ++ nop ; nop ; ldtmu0 ++ nop ; nop ; ldtmu1 ++ sub.setf r3, r3, 1 ++ # >>> ++ mov -, vw_wait ++.endif ++.endm ++ ++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair) ++# All qpus start at the beginning and after that (group - 1) must have finished ++# before (group) can start ++# ++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain ++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important - ++# lockup otherwise) ++# ++# There is some, currently ill defined, potential lockup if we have the VDM active ++# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ?? ++# ++# The code stalled when I had many waiters on a single sem so we have a ++# "ripple" of srels to restart. Unsure why, may have been bug, but this works ++# and we currently have both the memory & sems to support it. 
++.macro m_sync_q, n_qpu, n_quads ++# Do not generate code for qpu >= quads * 4 - fns should never be called ++.if n_qpu < n_quads * 4 ++ mov ra_link, unif # Can only branch to an a reg (not r0) ++ mov -, vw_wait # [ra_link delay] ++ ++.set n_sem_sync, n_qpu - (n_qpu % 4) ++.set n_sem_in, n_qpu ++.set n_sem_out, n_qpu + 1 ++ ++.if n_qpu % 4 == 0 ++ ++.set n_sem_quad_in, 12 + n_qpu / 4 ++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads) ++ ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ sacq -, n_sem_sync ++ bra -, ra_link ++ sacq -, n_sem_quad_in ++ srel -, n_sem_out ++ srel -, n_sem_quad_out ++ ++.else ++ bra -, ra_link ++ srel -, n_sem_sync ++ sacq -, n_sem_in ++.if n_sem_out % 4 != 0 ++ srel -, n_sem_out ++.else ++ nop ++.endif ++.endif ++.endif ++.endm ++ ++.set v_quads8, N_QPU_8 / 4 ++ ++::mc_sync_q0 ++ m_sync_q 0, v_quads8 ++::mc_sync_q1 ++ m_sync_q 1, v_quads8 ++::mc_sync_q2 ++ m_sync_q 2, v_quads8 ++::mc_sync_q3 ++ m_sync_q 3, v_quads8 ++::mc_sync_q4 ++ m_sync_q 4, v_quads8 ++::mc_sync_q5 ++ m_sync_q 5, v_quads8 ++::mc_sync_q6 ++ m_sync_q 6, v_quads8 ++::mc_sync_q7 ++ m_sync_q 7, v_quads8 ++::mc_sync_q8 ++ m_sync_q 8, v_quads8 ++::mc_sync_q9 ++ m_sync_q 9, v_quads8 ++::mc_sync_q10 ++ m_sync_q 10, v_quads8 ++::mc_sync_q11 ++ m_sync_q 11, v_quads8 ++ ++# mc_exit() ++# Chroma & Luma the same now ++ ++.macro m_exit_qn ++ m_exit_drain ++ nop ; nop ; thrend ++ nop ++ nop ++# >>> thrend <<< ++.endm ++ ++::mc_exit_c_qn ++::mc_exit_y_qn ++ m_exit_qn ++ ++ ++ ++# mc_interrupt_exit12() ++ ++.macro m_exit_q0 ++ m_exit_drain ++ sacq -, 12 ++ nop ; nop ; thrend ++ mov interrupt, 1 ++ nop ++# >>> thrend <<< ++.endm ++ ++::mc_exit_c_q0 ++::mc_exit_y_q0 ++ m_exit_q0 ++ ++# LUMA CODE ++ ++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1. 
++# For P frames we make the second x,y coordinates offset by +8 ++ ++ ++################################################################################ ++# mc_setup ++# ++# typedef struct qpu_mc_pred_y_s_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t pic_h; ++# uint16_t pic_w; ++# uint32_t stride2; ++# uint32_t stride1; ++# // NOTE(review): "uint32_t wdenom;" was listed here but the C struct qpu_mc_pred_y_s_t in rpi_hevc_shader_cmd.h has no such field - presumably stale, confirm ++# uint32_t next_fn; ++# } qpu_mc_pred_y_s_t; ++ ++.macro m_setup_y, v_bit_depth ++ ++# Cannot use mul24 on x as x might be -ve, so must use shift ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_pmask, 0xff ++.set v_blk_height, Y_BLK_HEIGHT_8 ++.else ++.set v_x_shift, 1 ++.set v_pmask, 0xffff ++.set v_blk_height, Y_BLK_HEIGHT_16 ++.endif ++ ++ ++ # Need to save these because we need to know the frame dimensions before computing texture coordinates ++ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y ++ mov ra9, unif # ref_y_base ++ mov ra1, unif # x2_y2 ++ ++ ++# load constants ++ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3] ++ shl rb_ef, r0, i_shift30 ; mov ra11, unif # ; ref_y2_base ++ ++ mov ra_kff800100, 0xff800100 ++ mov rb_pmask, v_pmask ++ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16) ++ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8)) ++ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth) ++ mov rb_y_coeffs_2, 0x050b0a00 ++ mov rb_y_coeffs_3, 0x11283a40 ++ mov rb_y_coeffs_5, 0x0a0b0500 ++ ++# Compute part of VPM to use ++ ++# Read image dimensions ++ mov ra3, unif # width_height ++ mov ra_ef, rb_ef ; mov rb_xpitch, unif # [ra3 delay] ; stride2 ++.if v_x_shift == 0 ++ sub rb_max_x, ra3.16b, 1 ++.else ++ sub r0, ra3.16b, 1 ++ shl rb_max_x, r0, v_x_shift ++.endif ++ sub rb_max_y, ra3.16a, 1 ++ mov r3, elem_num ; mov rb_pitch, unif # stride1 ++ ++# get destination pitch ++ mov r1, vdw_setup_1(0) # [rb_pitch delay] ++ or rb_dma1_base, r1, rb_pitch ++ ++# Compute base address for first and second access ++ add r0, ra0.16b, r3 # Load x +
elem_num ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ ++# X is byte offset - we can only load words - mask ++ ++ and r0, r0, -4 ; v8subs r2, r2, r2 ++ sub r2, r2, rb_pitch ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add ra_base, ra9, r0 ++ ++ # r3 still contains elem_num ++ add r0, ra1.16b, r3 # Load x ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, 0 ++ min r0, r0, rb_max_x ++ shl rb_xshift2_next, r0, 3 # Compute shifts ++ ++ # r2 still contains mask ++ and r0, r0, -4 ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 # Add stripe offsets ++ add rb_base2, ra11, r0 ++ ++# Do preloads ++ nop ; mov r0, ra0.16a # ; r0 = y ++ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2 ++ ++:1 ++ sub.setf r3, r3, 1 ++ max r1, r0, 0 ++ min r1, r1, rb_max_y ++ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t0s, ra_base, r1 ; mov ra_y, r0 ++ ++ max r1, r2, 0 ++ brr.anynz -, r:1b ++ min r1, r1, rb_max_y ++ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch ++ add t1s, rb_base2, r1 ; mov ra_y2, r2 ++# >>> .anynz 1b ++ ++ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base ++ ++ mov ra_link, unif # Next fn ++ ++# touch vertical context to keep simulator happy ++ mov ra8, 0 ; mov rb8, 0 # [ra_link delay] ++ bra -, ra_link ++ mov ra9, 0 ; mov rb9, 0 ++ mov ra10, 0 ; mov rb10, 0 ++ mov ra11, 0 ; mov rb11, 0 ++# >>> ra_link ++.endm ++ ++::mc_setup_y_q0 ++ m_setup_q0 ++::mc_setup_y_qn ++ m_setup_y 8 ++ ++################################################################################ ++# ++# Start of per-block setup code ++# P and B blocks share the same setup code to save on Icache space ++ ++# get base addresses and per-channel shifts for *next* invocation ++# per-channel shifts were calculated on the *previous* invocation ++ ++# 1st 3 instructions of per_block-setup in 
branch delay ++# ++# typedef struct qpu_mc_pred_y_p_s { ++# qpu_mc_src_t next_src1; ++# qpu_mc_src_t next_src2; ++# uint16_t h; ++# uint16_t w; ++# uint32_t mymx21; ++# uint32_t wo1; ++# uint32_t wo2; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } qpu_mc_pred_y_p_t; ++# ++ ++.macro m_luma_setup, v_bit_depth ++# Hack - QASM may well have label pasting but I have no idea how, so pick the per_block_setup_N branch target with .if/.elif ++.if v_bit_depth == 8 ++ brr ra_link, r:per_block_setup_8 ++.elif v_bit_depth == 10 ++ brr ra_link, r:per_block_setup_10 ++.endif ++ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack?? ++ add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0 ++ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next ++.endm ++ ++.macro m_per_block_setup, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next ++ min r0, r0, rb_max_x ++ ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ++ sub r2, r5, rb_pitch ; mov ra_base_next, unif # ; src1.base ++ and r1, r0, r2 ; mov ra_y_next, ra0.16a ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y ++ add ra_base_next, ra_base_next, r0 # [ra1 delay] ++ ++ add r0, ra1.16b, r3 # Load x2 ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ max r0, r0, r5 ; mov ra_y2_next, ra1.16a ++ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base ++ shl rb_xshift2_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height ++ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write ++ xor r0, r0, r1 ; mul24
r1, r1, rb_xpitch ++ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = x in bytes ++ add rb_base2_next, rb_base2_next, r0 ++ ++# get width,height of block (unif load above), r1 = width * pel_size ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width) ++ add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height ++ add rb_lcount, r0, (7-8) ++ shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add # ; r3 return val ++ add r0, r0, r1 # Combine width and height of destination area ++ shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val ++ add ra_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets ++ ++# get filter coefficients and discard unused B frame values ++ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight ++ shl ra8, r0, 3 ; mov rb5, ra_k255 ++ ++# Coeffs are all abs values here as that means mul24 works (no sign extend from .8) ++ ++# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val ++# but I can't see a way of doing that that is cheap enough to be worth it ++ ++# Picked out in a slightly random order to space out uniform loads ++ ++ # 1 ++ mov r1, 0x01040400 # [ra8 delay] ++ ror ra2.8b, r1, ra8.8d ++ ror ra0.8b, r1, ra8.8c ++ # 2 ++ ror ra2.8c, rb_y_coeffs_2, ra8.8d ++ ror ra0.8c, rb_y_coeffs_2, ra8.8c ++ # 0 ++ mov r1,0x00010100 # -ve [ra8 delay] ++ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif # ; L1 Wt/Offset ++ ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5 ++ # 7 ++ shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000 ++ ror r0, r1, ra8.8d ; mov ra_dest, unif # ; Destination address ++ ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5 ++ # 3 ++ ror ra2.8d, rb_y_coeffs_3, ra8.8d ++ ror ra0.8d, rb_y_coeffs_3, ra8.8c ++ # 5 ++ ror ra3.8b, rb_y_coeffs_5, ra8.8d ++ ror ra1.8b, rb_y_coeffs_5, ra8.8c ++ # 6 ++ mov r1,0x04040100 ++ ror ra3.8c, 
r1, ra8.8d ++ ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 # ; r5 return val ++ ++ bra -, ra_link ++ # 4 ++ mov r1,0x3a281100 ++ ror r0, r1, ra8.8d ; mov ra_link, unif # ; link - load after we've used its previous val ++ ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5 ++# >>> branch ra_link ++ ++# r5 = -8 ++# r2 = fir_off_val ++# r3 = 128 ++.endm ++ ++:per_block_setup_8 ++ m_per_block_setup 8 ++ ++ ++ ++################################################################################ ++# ++# mc_filter_y_pxx ++# ++# Setup (& therefore uniform struct) shared with _bxx ++# Struct in m_luma_setup ++# ++# We can have 2 separate P reqs here as long as they mate to generate a ++# rectangular output block (i.e. h0 = h1, w0 = 8) ++# ++# At this point we have already issued PREREAD pairs of texture requests for the current block ++ ++.macro m_filter_y_pxx, v_bit_depth ++ ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ ++ m_luma_setup v_bit_depth ++ ++ shl r1, ra_wt_off_l0, i_wt_den_p5 ++ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul ++ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++ ++# retrieve texture results and pick out bytes ++# then submit two more texture requests ++ ++# This loop is identical to the B loop from here ---> ++:1 ++ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++ ++ max r2, ra_y, 0 ; mov r1, 0 ++ min r2, r2, rb_max_y ; mov r3, ra_k1 ++ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++ add t0s, ra_base, r2 ; mov rb5, rb6 ++ shr r0, r4, ra_xshift ; mov rb6, rb7 ++ ++ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes ++ shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++ add t1s, rb_base2, r2 ; mov ra8, ra9 ++ ++# apply horizontal filter ++ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 
0 ++ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++ ++ brr.anyn -, r:1b ++ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++ # >>> .anyn 1b (r5 + r5) ++ ++ # apply vertical filter and write to VPM ++ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 ++ ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb8 ++ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++ add r1, r1, r0 ; mul24 r0, ra11, rb11 ++# <--- to here ++ sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height ++ sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next ++ sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next ++ ++ asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next ++ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0 ++ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add ++ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) ++ ++ brr.anyn -, r:1b ++ asr r1, r1, i_wt_den_p6 ++ min r1, r1, ra_pmax ; mov 
-, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++# >>> branch.anyn 1b (r5 - rb_lcount) ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_pxx ++ m_filter_y_pxx 8 ++ ++ ++################################################################################ ++ ++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel) ++# ++# Setup (& therefore uniform struct) shared with _pxx ++# Struct in m_luma_setup ++# ++# l0 calc in els 0-7, L1 in 8-15 ++# Only els 0-7 write data that is stored back to ram (els 8-15 may write tosh) ++# ++# At this point we have already issued PREREAD pairs of texture requests for the current block ++ ++.macro m_filter_y_bxx, v_bit_depth ++ ++# denom shift values ++.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) ++.set i_wt_den_p6, (DENOM + 14 - v_bit_depth) ++ ++ m_luma_setup v_bit_depth ++ ++ shl r1, ra_wt_off_l0, i_wt_den_p6 ++ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 ++ sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1 ++ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4 ++ ++# This loop is identical to the P loop from here ---> ++:1 ++ add.setf -, 
ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef ++ ++ max r2, ra_y, 0 ; mov r1, 0 ++ min r2, r2, rb_max_y ; mov r3, ra_k1 ++ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0 ++ add t0s, ra_base, r2 ; mov rb5, rb6 ++ shr r0, r4, ra_xshift ; mov rb6, rb7 ++ ++ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes ++ shr r1, r4, rb_xshift2 ; mov rb7, ra8 ++ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax ++ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch ++ add t1s, rb_base2, r2 ; mov ra8, ra9 ++ ++# apply horizontal filter ++ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0 ++ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0 ++ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0 ++ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0 ++ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0 ++ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0 ++ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0 ++ ++ brr.anyn -, r:1b ++ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b ++ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b ++ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11 ++ # >>> .anyn 1b (r5 + r5) ++ ++ # apply vertical filter and write to VPM ++ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11 ++ ++ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c ++ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d ++ add r1, r1, r0 ; mul24 r0, ra8, rb8 ++ add 
r1, r1, r0 ; mul24 r0, rb10, ra3.8c ++ add r1, r1, r0 ; mul24 r0, ra11, rb11 ++# <--- to here ++ sub r1, r1, ra4 ++ sub r1, r1, r0 ; mov r2, rb_wt_off ++ ++ asr r1, r1, 6 ++ sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0 ++ mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add ++ sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next ++ sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next ++ add r1, r1, r2 ; mov r0, r1 << 8 ++ add r1, r1, r0 ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height ++ ++ brr.anyn -, r:1b ++ asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch # ; NxtLoop ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, 0 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate) ++# >>> branch.anyn 1b (r5 - rb_lcount) ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed block_height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link (ra_height - remaining height) ++ ++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_bxx ++ m_filter_y_bxx 8 ++ ++################################################################################ ++# ++# typedef struct qpu_mc_pred_y_p00_s { ++# qpu_mc_src_t next_src1; ++# uint16_t h; ++# uint16_t w; ++# uint32_t wo1; ++# uint32_t dst_addr; ++# uint32_t next_fn; ++# } 
qpu_mc_pred_y_p00_t; ++ ++.macro m_filter_y_p00, v_bit_depth ++ ++.if v_bit_depth <= 8 ++.set v_x_shift, 0 ++.set v_x_mul, 1 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 7 ++.set v_dma_wh_shift, i_shift16 ++.else ++.set v_x_shift, 1 ++.set v_x_mul, 2 ++# Shifts to get width & height in the right place in ra_dma0 ++.set v_dma_h_shift, 8 ++.set v_dma_wh_shift, 15 ++.endif ++ ++ mov ra0, unif ; mov r0, elem_num # y_x ++ mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 # [ra0 delay] ; r5 = 0 ++ add r0, ra0.16b, r0 ; mov ra_base_next, unif # ; src1.base ++.if v_x_shift != 0 ++ shl r0, r0, v_x_shift ++.endif ++ ++ max r0, r0, r5 ; mov ra_y_next, ra0.16a # ; width_height ++ min r0, r0, rb_max_x ; mov ra_width_height, unif ++ ++ shl ra_xshift_next, r0, 3 # Compute shifts ++ and r0, r0, -4 ++ sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif # ; weight_offset ++ and r1, r0, r2 ++ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch ++ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dest addr ++ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write ++ ++# get width,height of block (unif load above) ++# Compute vdw_setup1(dst_pitch-width) ++ shl r1, ra_width, v_x_shift ++ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height ++ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height ++ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0 ++ add r0, r0, r1 # Combine width and height of destination area ++ shl rb_wt_off, ra_wt_off_l0, DENOM + 7 ++ shl r0, r0, v_dma_wh_shift ; mov ra_link, unif # Shift into bits 16 upwards of the vdw_setup0 register ; link ++ add ra_dma0, r0, rb_dma0_base ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1 ++ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask 
++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0 ++ shl r1, r1, 8 ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, DENOM + 8 ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed 16 height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve ++# We add to dma0 to reduce the number of output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_p00 ++ m_filter_y_p00 8 ++ ++################################################################################ ++ ++.macro m_filter_y_b00, v_bit_depth ++# luma setup does a fair bit more than we need calculating filter coeffs ++# that we will never use but it saves I-cache to use it (also simple!) 
++ m_luma_setup v_bit_depth ++ ++# Fix up vals that were expecting a filter (somewhat icky) ++ mov r2, 1 ++ add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 # Need in rX rather than raX for <<8 to do what we want ++ shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero ++ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8 ++ ++:1 ++ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1 ++ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0 ++ shr r0, r4, ra_xshift ; mov r3, rb_pitch ++ ++ max r2, ra_y, 0 # y ++ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next ++ add ra_y, ra_y, 1 ; mul24 r2, r2, r3 ++ add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next ++ ++ max r2, ra_y2, 0 ++ min r2, r2, rb_max_y ++ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3 ++ add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax # ; v8min masks out all but wanted bytes ++ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0 ++ ++ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1 ++ add r1, r0, r1 ; v8adds r5rep, r5, ra_k1 ++ ++ shl r1, r1, 8 ; mov r3, ra_blk_height ++ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 ++ ++ brr.anyn -, r:1b ++ asr r1, r1, (DENOM + 9) - 32 # -32 to get valid shift immediate ++ min r1, r1, ra_pmax ; mov -, vw_wait ++ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch ++# >>> branch.anyn 1b ++ ++# r0 = remaining height (min 0) ++# r2 = r3 * rb_pitch ++# r3 = block_height ++ ++# If looping again then we consumed block_height last loop ++# rb_dma1 (stride) remains constant ++# rb_i_tmu remains const (based on total height) ++# recalc ra_dma0, rb_lcount based on new segment height ++ ++ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0 ++ ++# DMA out ++ bra.anyz -, ra_link ++ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride ++ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW ++ shl r1, r1, i_shift23 ++# >>> .anyz ra_link ++ ++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve ++# We add to dma0 to reduce the number of
output lines in the final block ++ brr -, r:1b ++ add rb_lcount, rb_lcount, r0 ++ add ra_dma0, ra_dma0, r1 ++ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer ++# >>> 1b ++.endm ++ ++::mc_filter_y_b00 ++ m_filter_y_b00 8 ++ ++################################################################################ ++################################################################################ ++# 10 BIT ++ ++::mc_setup_c10_q0 ++ m_setup_q0 ++::mc_setup_c10_qn ++ m_setup_c 10 ++ ++::mc_filter_c10_p ++ m_filter_c_p 0, 10 ++ ++::mc_filter_c10_p_l1 ++ m_filter_c_p 1, 10 ++ ++ ++::mc_filter_c10_b ++ m_filter_c_b 10 ++ ++# Even if these fns are the same as for other bit depths we want our own copy ++# to keep the code we are using in a single lump to avoid (direct map) cache ++# thrashing ++.set v_quads10, N_QPU_16 / 4 ++ ++::mc_sync10_q0 ++ m_sync_q 0, v_quads10 ++::mc_sync10_q1 ++ m_sync_q 1, v_quads10 ++::mc_sync10_q2 ++ m_sync_q 2, v_quads10 ++::mc_sync10_q3 ++ m_sync_q 3, v_quads10 ++::mc_sync10_q4 ++ m_sync_q 4, v_quads10 ++::mc_sync10_q5 ++ m_sync_q 5, v_quads10 ++::mc_sync10_q6 ++ m_sync_q 6, v_quads10 ++::mc_sync10_q7 ++ m_sync_q 7, v_quads10 ++::mc_sync10_q8 ++ m_sync_q 8, v_quads10 ++::mc_sync10_q9 ++ m_sync_q 9, v_quads10 ++::mc_sync10_q10 ++ m_sync_q 10, v_quads10 ++::mc_sync10_q11 ++ m_sync_q 11, v_quads10 ++ ++::mc_exit_y10_q0 ++::mc_exit_c10_q0 ++ m_exit_q0 ++ ++::mc_exit_y10_qn ++::mc_exit_c10_qn ++ m_exit_qn ++ ++::mc_setup_y10_q0 ++ m_setup_q0 ++::mc_setup_y10_qn ++ m_setup_y 10 ++ ++:per_block_setup_10 ++ m_per_block_setup 10 ++ ++::mc_filter_y10_pxx ++ m_filter_y_pxx 10 ++ ++::mc_filter_y10_p00 ++ m_filter_y_p00 10 ++ ++::mc_filter_y10_bxx ++ m_filter_y_bxx 10 ++ ++::mc_filter_y10_b00 ++ m_filter_y_b00 10 ++ ++ ++ ++::mc_end ++# Do not add code here because mc_end must appear after all other code. 
+diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h +new file mode 100644 +index 0000000000..89711d776b +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader_cmd.h +@@ -0,0 +1,165 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++#ifndef RPI_SHADER_CMD_H ++#define RPI_SHADER_CMD_H ++ ++#pragma pack(push, 4) ++ ++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y ++// If mixed then we are just confused and get a lot of warnings.... 
++typedef const uint8_t * qpu_mc_src_addr_t; ++typedef uint8_t * qpu_mc_dst_addr_t; ++#else ++typedef uint32_t qpu_mc_src_addr_t; ++typedef uint32_t qpu_mc_dst_addr_t; ++#endif ++ ++typedef struct qpu_mc_src_s ++{ ++ int16_t y; ++ int16_t x; ++ qpu_mc_src_addr_t base; ++} qpu_mc_src_t; ++ ++ ++typedef struct qpu_mc_pred_c_p_s { ++ qpu_mc_src_t next_src; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x; ++ uint32_t coeffs_y; ++ uint32_t wo_u; ++ uint32_t wo_v; ++ qpu_mc_dst_addr_t dst_addr_c; ++ uint32_t next_fn; ++} qpu_mc_pred_c_p_t; ++ ++typedef struct qpu_mc_pred_c_b_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t coeffs_x1; ++ uint32_t coeffs_y1; ++ int16_t weight_u1; ++ int16_t weight_v1; ++ qpu_mc_src_t next_src2; ++ uint32_t coeffs_x2; ++ uint32_t coeffs_y2; ++ uint32_t wo_u2; ++ uint32_t wo_v2; ++ qpu_mc_dst_addr_t dst_addr_c; ++ uint32_t next_fn; ++} qpu_mc_pred_c_b_t; ++ ++typedef struct qpu_mc_pred_c_s_s { ++ qpu_mc_src_t next_src1; ++ uint32_t pic_cw; // C Width (== Y width / 2) ++ uint32_t pic_ch; // C Height (== Y Height / 2) ++ uint32_t stride2; ++ uint32_t stride1; ++ qpu_mc_src_t next_src2; ++ uint32_t next_fn; ++} qpu_mc_pred_c_s_t; ++ ++typedef struct qpu_mc_pred_c_s { ++ union { ++ qpu_mc_pred_c_p_t p; ++ qpu_mc_pred_c_b_t b; ++ qpu_mc_pred_c_s_t s; ++ }; ++} qpu_mc_pred_c_t; ++ ++ ++typedef struct qpu_mc_pred_y_p_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t h; ++ uint16_t w; ++ uint32_t mymx21; ++ uint32_t wo1; ++ uint32_t wo2; ++ qpu_mc_dst_addr_t dst_addr; ++ uint32_t next_fn; ++} qpu_mc_pred_y_p_t; ++ ++typedef struct qpu_mc_pred_y_p00_s { ++ qpu_mc_src_t next_src1; ++ uint16_t h; ++ uint16_t w; ++ uint32_t wo1; ++ qpu_mc_dst_addr_t dst_addr; ++ uint32_t next_fn; ++} qpu_mc_pred_y_p00_t; ++ ++typedef struct qpu_mc_pred_y_s_s { ++ qpu_mc_src_t next_src1; ++ qpu_mc_src_t next_src2; ++ uint16_t pic_h; ++ uint16_t pic_w; ++ uint32_t stride2; ++ uint32_t stride1; ++ uint32_t next_fn; ++} 
qpu_mc_pred_y_s_t; ++ ++typedef struct qpu_mc_pred_sync_s { ++ uint32_t next_fn; ++} qpu_mc_pred_sync_t; ++ ++// Only a useful structure in that it allows us to return something other than a void * ++typedef struct qpu_mc_pred_y_s { ++ union { ++ qpu_mc_pred_y_p_t p; ++ qpu_mc_pred_y_p00_t p00; ++ qpu_mc_pred_y_s_t s; ++ }; ++} qpu_mc_pred_y_t; ++ ++typedef union qpu_mc_pred_cmd_u { ++ qpu_mc_pred_y_t y; ++ qpu_mc_pred_c_t c; ++ qpu_mc_pred_sync_t sync; ++} qpu_mc_pred_cmd_t; ++ ++static void inline qpu_mc_link_set(qpu_mc_pred_cmd_t * const cmd, const uint32_t fn) ++{ ++ // Link is last el of previous cmd ++ ((uint32_t *)cmd)[-1] = fn; ++} ++ ++#define QPU_MC_PRED_N_Y8 12 ++#define QPU_MC_PRED_N_C8 12 ++ ++#define QPU_MC_PRED_N_Y10 12 ++#define QPU_MC_PRED_N_C10 12 ++ ++#define QPU_MC_DENOM 7 ++ ++#pragma pack(pop) ++ ++#endif ++ +diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c +new file mode 100644 +index 0000000000..77d8366eb8 +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader_template.c +@@ -0,0 +1,88 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. 
++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++#include "hevc.h" ++#include "rpi_hevcdec.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "rpi_hevc_shader_cmd.h" ++#include "rpi_hevc_shader_template.h" ++ ++typedef struct shader_track_s ++{ ++ const union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ const struct qpu_mc_src_s *last_l0; ++ const struct qpu_mc_src_s *last_l1; ++ uint32_t width; // pic_width * PW ++ uint32_t height; ++ uint32_t stride2; ++ uint32_t stride1; ++} shader_track_t; ++ ++static int wtoidx(const unsigned int w) ++{ ++ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++ return pel_weight[w]; ++} ++ ++static const int fctom(uint32_t x) ++{ ++ int rv; ++ // As it happens we can take the 2nd filter term & divide it by 8 ++ // (dropping fractions) to get the fractional move ++ rv = 8 - ((x >> 11) & 0xf); ++ av_assert2(rv >= 0 && rv <= 7); ++ return rv; ++} ++ ++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) ++{ ++ return (x << shl) >> shr; ++} ++ ++static inline int woff_p(HEVCRpiContext *const s, int32_t x) ++{ ++ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int woff_b(HEVCRpiContext 
*const s, int32_t x) ++{ ++ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); ++} ++ ++static inline int wweight(int32_t x) ++{ ++ return ext(x, 16, 16); ++} ++ ++ ++#define PW 1 ++#include "rpi_hevc_shader_template_fn.h" ++ ++#undef PW ++#define PW 2 ++#include "rpi_hevc_shader_template_fn.h" ++ +diff --git a/libavcodec/rpi_hevc_shader_template.h b/libavcodec/rpi_hevc_shader_template.h +new file mode 100644 +index 0000000000..0fc5a45e9f +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader_template.h +@@ -0,0 +1,49 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*/ ++ ++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H ++ ++struct HEVCRpiContext; ++struct HEVCRpiInterPredEnv; ++ ++void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s, ++ const struct HEVCRpiInterPredEnv *const ipe_y, ++ const struct HEVCRpiInterPredEnv *const ipe_c); ++ ++void rpi_sand_dump8(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++void rpi_sand_dump16(const char * const name, ++ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c); ++ ++#endif ++ +diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h +new file mode 100644 +index 0000000000..10c163a4b9 +--- /dev/null ++++ b/libavcodec/rpi_hevc_shader_template_fn.h +@@ -0,0 +1,502 @@ ++/* ++Copyright (c) 2017 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. 
++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*/ ++ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++#define PATCH_STRIDE (16 * PW) ++ ++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) { ++ const pixel s = *(const pixel *)src; ++ pixel * d = (pixel *)dst; ++ for (unsigned int j = 0; j < w; j += PW) { ++ *d++ = s; ++ } ++ } ++} ++ ++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) ++{ ++ for (unsigned int i = 0; i != h; ++i, dst += stride) { ++ memcpy(dst, src, w); ++ } ++} ++ ++static void FUNC(get_patch_y)(const shader_track_t * const st, ++ uint8_t * dst, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > st->width) { ++ if (x >= st->width) ++ x = st->width - PW; ++ dr = (x + w) - st->width; ++ w = st->width - x; ++ } ++ ++ // Y ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > st->height) { ++ if (y >= st->height) ++ y = st->height - 1; ++ db = (y + h) - st->height; ++ h = st->height - y; ++ } ++ ++ dst += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup ++ if (dl != 0) ++ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride); ++ if (dr != 0) ++ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride); ++ w += dl + dr; ++ dst -= dl; ++ ++ if (dt != 0) ++ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, 
dst_stride); ++ if (db != 0) ++ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride); ++} ++ ++ ++ ++static void FUNC(get_patch_c)(const shader_track_t * const st, ++ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride, ++ const qpu_mc_src_t *src, ++ unsigned int _w, unsigned int _h) /* _w is scaled by PW below to match x; _h is a row count */ ++{ ++ int x = src->x * PW; ++ int y = src->y; ++ int w = _w * PW; ++ int h = _h; ++ int dl = 0; ++ int dr = 0; ++ int dt = 0; ++ int db = 0; ++ const int width = st->width; ++ const int height = st->height; ++ ++ if (x < 0) { ++ if (-x >= w) ++ x = PW - w; ++ dl = -x; ++ w += x; ++ x = 0; ++ } ++ if (x + w > width) { ++ if (x >= width) ++ x = width - PW; ++ dr = (x + w) - width; ++ w = width - x; ++ } ++ ++ // Y: clip vertically, counting the rows to replicate above (dt) and below (db) ++ if (y < 0) { ++ if (-y >= h) ++ y = 1 - h; ++ dt = -y; ++ h += y; ++ y = 0; ++ } ++ if (y + h > height) { ++ if (y >= height) ++ y = height - 1; ++ db = (y + h) - height; ++ h = height - y; ++ } ++ ++ dst_u += dl + dt * dst_stride; ++ dst_v += dl + dt * dst_stride; ++ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h); ++ ++ // Edge dup: fill the dl/dr columns first, then the dt/db rows across the widened patch ++ if (dl != 0) ++ { ++ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride); ++ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride); ++ } ++ if (dr != 0) ++ { ++ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride); ++ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride); ++ } ++ w += dl + dr; ++ dst_u -= dl; ++ dst_v -= dl; ++ ++ if (dt != 0) ++ { ++ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride); ++ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride); ++ } ++ if (db != 0) ++ { ++ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride); ++ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride); ++ } ++} ++ ++// x, y, w, h in pixels ++// stride1, stride2 in bytes ++void FUNC(rpi_sand_dump)(const char * const name, ++ 
const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c) ++{ ++ const int mask = stride2 == 0 ? ~0 : stride1 - 1; ++ ++ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h); ++ ++ if (is_c) { ++ x *= 2; ++ w *= 2; ++ } ++ ++ for (int i = y; i != y + h; ++i) { ++ for (int j = x; j != x + w; ++j) { ++ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; ++ char sep = is_c && (j & 1) == 0 ? ':' : ' '; ++#if PW == 1 ++ if (j < 0 || i < 0) ++ printf("..%c", sep); ++ else ++ printf("%02x%c", *(const pixel*)p, sep); ++#else ++ if (j < 0 || i < 0) ++ printf("...%c", sep); ++ else ++ printf("%03x%c", *(const pixel*)p, sep); ++#endif ++ } ++ printf("\n"); ++ } ++} ++ ++ ++void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s, ++ const HEVCRpiInterPredEnv *const ipe_y, ++ const HEVCRpiInterPredEnv *const ipe_c) ++{ ++ for (int c_idx = 0; c_idx < 2; ++c_idx) ++ { ++ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c; ++ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; ++ unsigned int exit_n = 0; ++ ++ if (ipe == NULL || !ipe->used) { ++ continue; ++ } ++ ++ do { ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ const HEVCRpiInterPredQ * const q = ipe->q + i; ++ shader_track_t * const st = tracka + i; ++ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; ++ ++ for (;;) { ++ const uint32_t link = (cmd == q->qpu_mc_base) ? 
q->code_setup : ((uint32_t *)cmd)[-1]; ++ ++ if (link == q->code_setup) { ++ if (c_idx == 0) { ++ // Luma ++ const qpu_mc_pred_y_s_t *const c = &cmd->y.s; ++ ++ st->height = c->pic_h; ++ st->width = c->pic_w * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else { ++ // Chroma ++ const qpu_mc_pred_c_s_t *const c = &cmd->c.s; ++ ++ st->height = c->pic_ch; ++ st->width = c->pic_cw * PW; ++ st->stride1 = c->stride1; ++ st->stride2 = c->stride2; ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ } ++ else if (link == s->qpu.y_pxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ const int w1 = FFMIN(c->w, 8); ++ const int w2 = c->w - w1; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ if (w2 > 0) { ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ } ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1); ++ if (w2 > 0) { ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2); ++ } ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ 
else if (link == s->qpu.y_bxx) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h + 7); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0]( ++ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, ++ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3, ++ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_p00) { ++ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h + 7); ++ ++ // wo[offset] = offset*2+1 ++ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w); ++ ++ st->last_l0 = &c->next_src1; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.y_b00) { ++ const qpu_mc_pred_y_p_t *const c = &cmd->y.p; ++ ++ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ int16_t 
patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; ++ ++ av_assert0(c->w <= 16 && c->h <= 64); ++ ++ FUNC(get_patch_y)(st, ++ patch_y1, PATCH_STRIDE, ++ st->last_l0, ++ 16, c->h); ++ FUNC(get_patch_y)(st, ++ patch_y2, PATCH_STRIDE, ++ st->last_l1, ++ 16, c->h); ++ ++ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0]( ++ patch_y3, patch_y1, PATCH_STRIDE, ++ c->h, 0, 0, c->w); ++ ++ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0]( ++ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3, ++ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2), ++ 0, woff_b(s, c->wo2), 0, 0, c->w); ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_pxx_l1) { ++ const qpu_mc_pred_c_p_t *const c = &cmd->c.p; ++ const int mx = fctom(c->coeffs_x); ++ const int my = fctom(c->coeffs_y); ++ ++ 
uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8) ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w); ++ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0]( ++ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l1 = &c->next_src; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == s->qpu.c_bxx) { ++ const qpu_mc_pred_c_b_t *const c = &cmd->c.b; ++ const int mx1 = fctom(c->coeffs_x1); ++ const int my1 = fctom(c->coeffs_y1); ++ const int mx2 = fctom(c->coeffs_x2); ++ const int my2 = fctom(c->coeffs_y2); ++ ++ uint8_t patch_u1[PATCH_STRIDE * 72]; ++ uint8_t patch_v1[PATCH_STRIDE * 72]; ++ uint8_t patch_u2[PATCH_STRIDE * 72]; ++ uint8_t patch_v2[PATCH_STRIDE * 72]; ++ uint8_t patch_u3[8 * 16 * PW]; ++ uint8_t patch_v3[8 * 16 * PW]; ++ int16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; /* signed intermediate, same type as patch_y3 in the luma bi-pred branches */ ++ int16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE]; /* signed intermediate, same type as patch_y3 in the luma bi-pred branches */ ++ ++ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); ++ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3); ++ ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, my1, c->w); ++ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0]( ++ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE, ++ c->h, mx1, 
my1, c->w); ++ ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4, ++ c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2), ++ 0, woff_b(s, c->wo_u2), mx2, my2, c->w); ++ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0]( ++ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4, ++ c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2), ++ 0, woff_b(s, c->wo_v2), mx2, my2, c->w); ++ ++ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h); ++ ++ st->last_l0 = &c->next_src1; ++ st->last_l1 = &c->next_src2; ++ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); ++ } ++ else if (link == q->code_sync) { ++ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); ++ break; ++ } ++ else if (link == q->code_exit) { ++ // We expect exit to occur without other sync ++ av_assert0(i == exit_n); ++ ++exit_n; ++ break; ++ } ++ else { ++ av_assert0(0); ++ } ++ } ++ ++ st->qpu_mc_curr = cmd; ++ } ++ } while (exit_n == 0); ++ } ++} ++ ++#undef FUNC ++#undef pixel ++ +diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s +new file mode 100644 +index 0000000000..3caef20137 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform.s +@@ -0,0 +1,444 @@ ++# ****************************************************************************** ++# Argon Design Ltd. ++# (c) Copyright 2015 Argon Design Ltd. All rights reserved. 
++# ++# Module : HEVC ++# Author : Peter de Rivaz ++# ****************************************************************************** ++ ++# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack) ++# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions) ++.set USE_STACK, 0 ++ ++# Lines that fail to assemble start with #: ++# The script insert_magic_opcodes.sh inserts the machine code directly for these. ++# HEVC VPU Transform ++# ++# Transform matrix can be thought of as ++# output row vector = input row vector * transMatrix2 ++# ++# The even rows of the matrix are symmetric ++# The odd rows of the matrix are antisymmetric ++# ++# So only need to compute the first half of the results, then can compute the remainder with a butterfly ++# ++# EXAMPLE ++# (a b c d) (1 2 2 1) ++# (3 4 -4 -3) ++# (5 6 6 5) ++# (7 8 -8 -7) ++# ++# x=(a c)(1 2) = 1a+5c 2a+6c ++# (5 6) ++# ++# y=(b d)(3 4) = 3b+7d 4b+8d ++# (7 8) ++# ++# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d ++# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d ++# ++# Final results are (u , v[::-1]) ++# ++# ++# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0) ++# Apply the even matrix first and stop before rounding ++# Then apply the odd matrix in a full manner: ++# ++# First step is to compute partial products with the first input (16 cycles) ++# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output ++# 2a 4b 6c 8d ++# 2a -4b 6c -8d ++# 1a -3b 5c -7d ++# ++# Second step is to sum partial products into final position (8 cycles) ++# 1a+3b+5c+7d ++# 2a+4b+6c+8d ++# 2a-4b+6c-8d ++# 1a-3b+5c-7d ++# ++# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format) ++# ++# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use 
of the trick - saves on the adds) ++# ++# For 8x8 we could compute two in parallel. ++# ++# ++ ++# Columns are transformed first ++# ++# Store top left half of transMatrix2 in ++# Store bottom left half of transMatrix2 in HX(32,32) ++# ++# For 16x16 ++# HX(0:15,0) contains input data before transform ++# HY(0:15,0) contains 32bit output data after transform ++# HX(32,0) contains even rows of left half of transMatrix2 ++# HX(32,32) contains odd rows of left half of transMatrix2 ++# HY(48,0) contains partial products ready for summing ++# ++ ++ ++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!) ++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) ++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) ++# num: number of 16x16 transforms to be done ++# coeffs32 ++# num32: number of 32x32 transforms ++# command 0 for transform, 1 for memclear16(int16_t *dst,num16) ++# ++ ++.equ TRANS_SHIFT, 20 - BIT_DEPTH ++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) ++.equ TRANS_ASL2, 16 - TRANS_SHIFT ++ ++ ++hevc_trans_16x16: ++ push r6-r15, lr # TODO cut down number of used registers ++ mov r14,r3 # coeffs32 ++ mov r15,r4 # num32 ++ mov r3, 16*2 # Stride of transMatrix2 in bytes ++ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix ++ ++ add r0, 16*16*2 # For 32x32 transforms we also need this matrix ++ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ # Now use r0 to describe which matrix we are working on. ++ # Allows us to prefetch the next block of coefficients for efficiency. 
++ mov r0,0 # This describes the location where we read our coefficients from ++ mov r3,16*2 # Stride of coefficients in bytes (TODO remove) ++ mov r7,16*16*2 # Total block size ++ mov r8,64*16 # Value used to swap from current to next VRF location ++ mov r4,64 # Constant used for rounding first pass ++ mov r5,TRANS_RND2 # Constant used for rounding second pass ++ ++ sub sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack ++ ++ add r11,sp,64 # Space for 32 bytes before, and rounding ++ lsr r11,5 ++ lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32 ++ ++ lsr r10, r2, 16 # Number of compressed blocks stored in top short ++ extu r2,16 ++ # At start of block r0,r1 point to the current block (that has already been loaded) ++ # r0 VRF location of current block ++ # r1 address of current block ++ # r2 number of 16*16 transforms to do ++ # r3 Stride of coefficients (==32) ++ # r4 TRANS_RND1 (64) ++ # r5 TRANS_RND2 ++ # r6 temporary used inside col_trans16 ++ # r7 16*16*2 total bytes in block ++ # r8 64*16 VRF switch locations ++ # r9 temporary in unpack_coeff for index ++ # r10 number of 16x16 transforms using compression ++ # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer) ++ # r12 temporary counter in unpack_coeff ++ # r13 ++ # r14 Save information for 32 bit transform (coeffs location) ++ # r15 Save information for 32 bit transform (number of transforms) ++ cmp r2,0 ++ beq done16x16s ++block_loop: ++ # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests ++ cmp r10,0 ++ mov r6, r1 ++ beq not_compressed ++ sub r10, 1 ++ bl unpack16x16 ++not_compressed: ++ #mov r6,r1 # DEBUG without compress ++ vldh HX(0++,0)+r0,(r6 += r3) REP 16 ++ #eor r0,r8 ++ #add r1,r7 ++ # Prefetch the next block ++ #bl unpack16x16 ++ #vldh HX(0++,0)+r0,(r6 += r3) REP 16 ++ #vmov HX(0++,0)+r0,0 REP 16 # DEBUG ++ #eor r0,r8 ++ #sub r1,r7 ++ ++ # Transform the current 
block ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble? ++ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position ++ ++ bl col_trans_16 ++ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on the second pass rounding (r5 = TRANS_RND2) ++ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word. ++ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag) ++ ++ # Save results - note there has been a transposition during the processing so we save columns ++ vsth VX(0,32++)+r0, (r1 += r3) REP 16 ++ ++ # Move onto next block ++ eor r0,r8 ++ add r1,r7 ++ ++ addcmpbgt r2,-1,0,block_loop ++done16x16s: ++ ++ add sp,sp,64+16*16*2 # Release the stack space reserved by the matching sub sp before block_loop ++ # Now go and do any 32x32 transforms ++ b hevc_trans_32x32 ++ ++ pop r6-r15, pc # NOTE(review): appears unreachable - the unconditional branch above never falls through; confirm hevc_trans_32x32 does the pop ++# This returns a value in r6 that says where to load the data from. ++# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it. ++unpack16x16: ++# Clear out destination ++ vmov HX(0,0)+r0,0 ++ mov r6, r11 ++ vsth HX(0,0)+r0,(r6 += r3) REP 16 ++ mov r5, r1 # Moving pointer to input coefficients ++unpack_outer_loop: ++ # Loop until we find the end ++ vldh HX(0,0)+r0,(r5) # TODO would prefetch help here while unpacking previous? 
++ sub r6,r11,32 ++ #add r6,pc,packed_data-$ # Packed data ++ vsth HX(0,0)+r0,(r6) # Store into packed data ++ mov r12,0 ++unpack_loop: ++ ld r4,(r6) ++ add r6,r6,4 ++ lsr r9,r4,16 # r9 is destination value ++ cmp r4,0 # {value,index} ++ extu r4,8 ++ beq done_unpack ++ sth r9,(r11, r4) ++ addcmpblt r12,1,8,unpack_loop ++# # Read next 16 ++ add r5,32 ++ b unpack_outer_loop ++done_unpack: ++# # Set new load location ++ mov r6, r11 ++ #add r6,pc,unpacked_data-$ ++# # Restore constants ++ mov r4,64 ++ mov r5,TRANS_RND2 ++# pop r6-r15, pc ++ b lr ++ ++# r1,r2,r3 r7,r8 should be preserved ++# HX(0++,0)+r0 is the block to be transformed ++# HX(32++,0)+r6 is the 16x16 matrix of transform coefficients ++# Use HY(48,0) for intermediate results ++# r0 can be used, but should be returned to its original value at the end ++col_trans_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++col_trans_odd_16: ++ add r6,r0,16 # Final value for this loop ++col_trans_odd_16_loop: ++ # First compute partial products for a single column ++ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16 ++ # Then sum up the results and place back ++ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC ++ addcmpblt r0,1,r6,col_trans_odd_16_loop ++ sub r0,16 # put r0 back to its original value ++ b lr ++ ++# r1/r10 input pointer ++# r0,r4,r5,r6 free ++# r8/r9 output storage ++# ++# Store packed coefficients at r9-32 ++# Store unpacked at r9+32*32 (because transform works on even/odd rows on input, but writes all rows) ++unpack32x32: ++# Clear out destination ++ vmov HX(0,0),0 ++ add r0, r9, 32*32*2 # Unpacked buffer ++ mov r4, 32 ++ vsth HX(0,0),(r0 += r4) REP 64 
++unpack_outer_loop32: ++ # Loop until we find the end ++ vldh HX(0,0),(r1) # TODO would prefetch help here while unpacking previous? ++ sub r6,r9,32 ++ #add r6,pc,packed_data-$ # Packed data ++ vsth HX(0,0),(r6) # Store into packed data ++ mov r8,0 ++unpack_loop32: ++ ld r4,(r6) ++ add r6,r6,4 ++ lsr r5,r4,16 # r5 is destination value ++ cmp r4,0 # {value,index} ++ extu r4,10 ++ beq done_unpack ++ sth r5,(r0, r4) ++ addcmpblt r8,1,8,unpack_loop32 ++# # Read next 16 ++ add r1,32 ++ b unpack_outer_loop32 ++done_unpack32: ++ b lr ++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num) ++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd ++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory) ++# num: number of 16x16 transforms to be done in low 16, number of packed in high 16 ++# ++# Note that the 32x32 transforms are stored in reverse order, this means that the unpacked ones appear first! 
++hevc_trans_32x32: ++ mov r1,r14 # coeffs ++ mov r2,r15 # num ++ lsr r15,r15,16 # Number that are packed ++ extu r2,16 # Total number ++ ++ # Fetch odd transform matrix ++ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients) ++ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix ++ #add r0, 16*16*2 ++ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix ++ ++ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer ++ mov r7, 16*16*2 # Total block size ++ ++.if USE_STACK ++ # Stack base allocation ++ sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking ++ # set r8 to 32byte aligned stack pointer with 32 bytes of space before it ++ add r8,sp,63 ++ lsr r8,5 ++ lsl r8,5 ++.else ++#:version r8 ++ .half 0x00e8 #AUTOINSERTED ++ btst r8,16 ++#:add r8,pc,intermediate_results-$ ++ .half 0xbfe8 ++ .half intermediate_results-($-2) ++ beq on_vpu1 ++ add r8,r8,32*32*2*2+16*2 # Move to secondary storage ++on_vpu1: ++.endif ++ mov r9,r8 # Backup of the temporary storage ++ mov r10,r1 # Backup of the coefficient buffer ++ ++ cmp r2,0 ++ beq done32x32s ++block_loop32: ++ ++ # Transform the first 16 columns ++ mov r1,r10 # Input Coefficient buffer ++ mov r8,r9 # Output temporary storage ++ # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed) ++ cmp r2,r15 ++ bgt not_compressed_32 ++ bl unpack32x32 ++ add r1,r9,32*32*2 # Uncompressed into temporary storage ++ mov r8,r9 # Transform into here ++not_compressed_32: ++ # COLUMN TRANSFORM ++ mov r4, 64 # Constant used for rounding first pass ++ mov r5, 9 # left shift used for rounding first pass ++ ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ # ROW TRANSFORM ++ mov r4, TRANS_RND2 # Constant used for rounding second pass ++ mov r5, TRANS_ASL2 # left shift used for 
rounding second pass ++ ++ mov r1,r9 # Input temporary storage ++ mov r8,r10 # Output Coefficient buffer ++ bl trans32 ++ # Transform the second 16 columns ++ add r8,32*16*2 ++ add r1,32 ++ bl trans32 ++ ++ add r10, 32*32*2 # move onto next block of coefficients ++ addcmpbgt r2,-1,0,block_loop32 ++done32x32s: ++ ++.if USE_STACK ++ add sp,sp,32*32*4+64# Restore stack ++.endif ++ ++ pop r6-r15, pc ++ ++trans32: ++ push lr ++ # We can no longer afford the VRF space to do prefetching when doing 32x32 ++ # Fetch the even rows ++ vldh HX(0++,0),(r1 += r3) REP 16 ++ # Fetch the odd rows ++ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1 ++ ++ # Transform the even rows using even matrix ++ mov r0, 0 # Even rows ++ bl col_trans_16 ++ ++ # Now transform the odd rows using odd matrix ++ mov r0, 64*16 # Odd rows ++ bl col_trans_odd_16 ++ ++ # Now apply butterfly to compute the first 16 results ++ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16 ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ # 16bit results now in HX(48,32) ++ mov r0,r8 ++ mov r6,32*2 ++ vsth VX(48,32++),(r0+=r6) REP 16 ++ ++ # Now apply butterfly to compute the second 16 results (in reverse order) ++ vsub HY(63,0),HY(0 ,0),HY(16,0) ++ vsub HY(62,0),HY(1 ,0),HY(17,0) ++ vsub HY(61,0),HY(2 ,0),HY(18,0) ++ vsub HY(60,0),HY(3 ,0),HY(19,0) ++ vsub HY(59,0),HY(4 ,0),HY(20,0) ++ vsub HY(58,0),HY(5 ,0),HY(21,0) ++ vsub HY(57,0),HY(6 ,0),HY(22,0) ++ vsub HY(56,0),HY(7 ,0),HY(23,0) ++ vsub HY(55,0),HY(8 ,0),HY(24,0) ++ vsub HY(54,0),HY(9 ,0),HY(25,0) ++ vsub HY(53,0),HY(10,0),HY(26,0) ++ vsub HY(52,0),HY(11,0),HY(27,0) ++ vsub HY(51,0),HY(12,0),HY(28,0) ++ vsub HY(50,0),HY(13,0),HY(29,0) ++ vsub HY(49,0),HY(14,0),HY(30,0) ++ vsub HY(48,0),HY(15,0),HY(31,0) ++ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding, ++ vasl HY(48++,0),HY(48++,0),r5 REP 16 # shift down by 7, and saturate ++ add r0,r8,32 ++ vsth 
VX(48,32++),(r0+=r6) REP 16 ++ pop pc ++ ++.if USE_STACK == 0 ++ .balign 32 ++ ++# .space directives generate 0's in the bin so avoid unnecessary padding by ++# just setting to appropriate value ++.equ intermediate_results, $+16*2 ++ ++# Layout goes: ++# ++#packed_buffer: ++# .space 16*2 ++#intermediate_results: ++# .space 32*32*2 ++#unpacked_buffer: ++# .space 32*32*2 ++# ++#packed_buffer2: ++# .space 16*2 ++#intermediate_results2: ++# .space 32*32*2 ++#unpacked_buffer2: ++# .space 32*32*2 ++.endif ++ ++ +diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h +new file mode 100644 +index 0000000000..1c364492d0 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform10.h +@@ -0,0 +1,94 @@ ++static const unsigned char rpi_hevc_transform10 [] = { ++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 ++0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 ++0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 ++0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 ++0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 ++0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 ++0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030 ++0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 ++0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 ++0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 ++0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 ++0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 ++0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 ++0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 ++0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 ++0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 ++0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 ++0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 ++0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090 ++0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 ++0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 
++0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 ++0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 ++0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 ++0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 ++0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 ++0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 ++0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8 ++0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 ++0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 ++0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 ++0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 ++0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 ++0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 ++0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 ++0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 ++0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 ++0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 ++0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 ++0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 ++0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 ++0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 ++0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 ++0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 ++0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 ++0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 ++0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 ++0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 ++0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 ++0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 ++0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 ++0x06, 0x1e, 0xff, 0x9f, 
0xc5, 0xff, 0x21, 0xb5, // 01b8 ++0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 ++0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 ++0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8 ++0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 ++0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 ++0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 ++0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 ++0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 ++0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 ++0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 ++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 ++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 ++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 ++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 ++0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 ++0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 ++0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 ++0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 ++0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 ++0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 ++0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 ++0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 ++0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 ++0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 ++0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 ++0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 ++0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 ++0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 ++0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 ++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 ++0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 ++0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 
02c8 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 ++}; +diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h +new file mode 100644 +index 0000000000..1128a2c054 +--- /dev/null ++++ b/libavcodec/rpi_hevc_transform8.h +@@ -0,0 +1,94 @@ ++static const unsigned char rpi_hevc_transform8 [] = { ++0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000 ++0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008 ++0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010 ++0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018 ++0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020 ++0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028 ++0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030 ++0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038 ++0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040 ++0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048 ++0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050 ++0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058 ++0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060 ++0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068 ++0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070 ++0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078 ++0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080 ++0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088 ++0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090 ++0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098 ++0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0 ++0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8 ++0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0 ++0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8 ++0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0 ++0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8 ++0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0 ++0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 
0x08, // 00d8 ++0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0 ++0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8 ++0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0 ++0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8 ++0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100 ++0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118 ++0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120 ++0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128 ++0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130 ++0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138 ++0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140 ++0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148 ++0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150 ++0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158 ++0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160 ++0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168 ++0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170 ++0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178 ++0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180 ++0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188 ++0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190 ++0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198 ++0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0 ++0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8 ++0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0 ++0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8 ++0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0 ++0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0 ++0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8 ++0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0 ++0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8 ++0x4a, 0xb0, 
0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0 ++0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8 ++0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200 ++0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208 ++0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210 ++0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218 ++0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220 ++0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228 ++0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230 ++0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238 ++0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240 ++0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248 ++0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250 ++0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258 ++0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260 ++0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268 ++0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270 ++0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278 ++0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280 ++0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288 ++0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290 ++0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298 ++0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0 ++0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8 ++0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0 ++0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8 ++0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0 ++0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0 ++0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8 ++}; +diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c +new file mode 100644 +index 0000000000..e651e5c565 +--- /dev/null ++++ b/libavcodec/rpi_hevcdec.c +@@ -0,0 +1,6134 @@ ++/* ++ * HEVC video Decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres 
++ * Copyright (C) 2012 - 2013 Mickael Raulet ++ * Copyright (C) 2012 - 2013 Gildas Cocherel ++ * Copyright (C) 2012 - 2013 Wassim Hamidouche ++ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "libavutil/attributes.h" ++#include "libavutil/common.h" ++#include "libavutil/display.h" ++#include "libavutil/internal.h" ++#include "libavutil/mastering_display_metadata.h" ++#include "libavutil/md5.h" ++#include "libavutil/opt.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/stereo3d.h" ++ ++#include "decode.h" ++#include "bswapdsp.h" ++#include "bytestream.h" ++#include "golomb.h" ++#include "hevc.h" ++#include "rpi_hevc_data.h" ++#include "rpi_hevc_parse.h" ++#include "rpi_hevcdec.h" ++#include "rpi_hevc_cabac_fns.h" ++#include "profiles.h" ++#include "hwconfig.h" ++ ++#include "rpi_zc_frames.h" ++#include "rpi_qpu.h" ++#include "rpi_hevc_shader.h" ++#include "rpi_hevc_shader_cmd.h" ++#include "rpi_hevc_shader_template.h" ++#include "rpi_zc.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#include "pthread.h" ++#include ++ ++#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards ++ ++#define PACK2(hi,lo) (((hi) << 16) | ((lo) 
& 0xffff)) ++ ++#ifndef av_mod_uintp2 ++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) ++{ ++ return a & ((1 << p) - 1); ++} ++# define av_mod_uintp2 av_mod_uintp2_c ++#endif ++ ++const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; ++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first); ++ ++#define MC_DUMMY_X (-32) ++#define MC_DUMMY_Y (-32) ++ ++// UV & Y both have min 4x4 pred (no 2x2 chroma) ++// Allow for even spread +1 for setup, +1 for rounding ++// As we have load sharing this can (in theory) be exceeded so we have to ++// check after each CTU, but it is a good base size ++ ++// Worst case (all 4x4) commands per CTU ++#define QPU_Y_CMD_PER_CTU_MAX (16 * 16) ++#define QPU_C_CMD_PER_CTU_MAX (8 * 8) ++ ++#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64) ++ ++#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP) ++#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS) ++ ++#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2) ++#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2) ++ ++// Total cmds to allocate - allow for slack & setup ++#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX) ++#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX) ++ ++#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2)) ++#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2)) ++ ++// The QPU code for UV blocks only works up to a block width of 8 ++#define RPI_CHROMA_BLOCK_WIDTH 8 ++ ++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) ++ ++ ++// Actual filter goes -ve, +ve, +ve, -ve using these values ++static const uint32_t rpi_filter_coefs[8] = { ++ ENCODE_COEFFS( 0, 64, 0, 0), ++ ENCODE_COEFFS( 2, 
58, 10, 2), ++ ENCODE_COEFFS( 4, 54, 16, 2), ++ ENCODE_COEFFS( 6, 46, 28, 4), ++ ENCODE_COEFFS( 4, 36, 36, 4), ++ ENCODE_COEFFS( 4, 28, 46, 6), ++ ENCODE_COEFFS( 2, 16, 54, 4), ++ ENCODE_COEFFS( 2, 10, 58, 2) ++}; ++ ++// Function arrays by QPU ++ ++static const int * const inter_pred_setup_c_qpu[12] = { ++ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, ++ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn ++}; ++ ++static const int * const inter_pred_setup_c10_qpu[12] = { ++ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, ++ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn ++}; ++ ++static const int * const inter_pred_setup_y_qpu[12] = { ++ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, ++ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn ++}; ++ ++static const int * const inter_pred_setup_y10_qpu[12] = { ++ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, ++ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn ++}; ++ ++static const int * const inter_pred_sync_qpu[12] = { ++ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3, ++ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7, ++ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11 ++}; ++ ++static const int * const inter_pred_sync10_qpu[12] = { ++ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3, ++ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7, ++ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11 ++}; ++ ++static const int * const inter_pred_exit_c_qpu[12] = { ++ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, ++ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, 
mc_exit_c_qn ++}; ++ ++static const int * const inter_pred_exit_c10_qpu[12] = { ++ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, ++ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn ++}; ++ ++static const int * const inter_pred_exit_y_qpu[12] = { ++ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, ++ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn ++}; ++ ++static const int * const inter_pred_exit_y10_qpu[12] = { ++ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, ++ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn ++}; ++ ++typedef struct ipe_chan_info_s ++{ ++ const uint8_t bit_depth; ++ const uint8_t n; ++ const int * const * setup_fns; ++ const int * const * sync_fns; ++ const int * const * exit_fns; ++} ipe_chan_info_t; ++ ++typedef struct ipe_init_info_s ++{ ++ ipe_chan_info_t luma; ++ ipe_chan_info_t chroma; ++} ipe_init_info_t; ++ ++static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a) ++{ ++ switch (ln) ++ { ++ default: // normally 0 ++ *b = a; ++ break; ++ case 1: ++ a |= a << 8; ++ *(uint16_t *)b = a; ++ b += stride; ++ *(uint16_t *)b = a; ++ break; ++ case 2: ++ a |= a << 8; ++ a |= a << 16; ++ *(uint32_t *)b = a; ++ b += stride; ++ *(uint32_t *)b = a; ++ b += stride; ++ *(uint32_t *)b = a; ++ b += stride; ++ *(uint32_t *)b = a; ++ break; ++ case 3: ++ { ++ unsigned int i; ++ uint64_t d; ++ a |= a << 8; ++ a |= a << 16; ++ d = ((uint64_t)a << 32) | a; ++ for (i = 0; i != 8; ++i, b += stride) ++ *(uint64_t *)b = d; ++ break; ++ } ++ case 4: ++ { ++ unsigned int i; ++ uint64_t d; ++ a |= a << 8; ++ a |= a << 16; ++ d = ((uint64_t)a << 32) | a; ++ for (i = 0; i != 16; ++i, b += stride) ++ { ++ *(uint64_t *)b = d; ++ *(uint64_t *)(b + 8) = d; ++ } ++ break; 
++ } ++ } ++} ++ ++// We expect this to be called with ln = (log2_cb_size - 3) so range = -1..3 ++// (4 not required) ++static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a) ++{ ++ switch (ln) ++ { ++ default: // 0 or -1 ++ *b_u = a; ++ *b_l = a; ++ break; ++ case 1: ++ a |= a << 8; ++ *(uint16_t *)b_u = a; ++ *(uint16_t *)b_l = a; ++ break; ++ case 2: ++ a |= a << 8; ++ a |= a << 16; ++ *(uint32_t *)b_u = a; ++ *(uint32_t *)b_l = a; ++ break; ++ case 3: ++ a |= a << 8; ++ a |= a << 16; ++ *(uint32_t *)b_u = a; ++ *(uint32_t *)(b_u + 4) = a; ++ *(uint32_t *)b_l = a; ++ *(uint32_t *)(b_l + 4) = a; ++ break; ++ case 4: ++ a |= a << 8; ++ a |= a << 16; ++ *(uint32_t *)b_u = a; ++ *(uint32_t *)(b_u + 4) = a; ++ *(uint32_t *)(b_u + 8) = a; ++ *(uint32_t *)(b_u + 12) = a; ++ *(uint32_t *)b_l = a; ++ *(uint32_t *)(b_l + 4) = a; ++ *(uint32_t *)(b_l + 8) = a; ++ *(uint32_t *)(b_l + 12) = a; ++ break; ++ } ++} ++ ++static void zap_cabac_stash(uint8_t * b, const int ln) ++{ ++ switch (ln) ++ { ++ default: // 0 ++ *b = 0; ++ break; ++ case 1: ++ *(uint16_t *)b = 0; ++ break; ++ case 2: ++ *(uint32_t *)b = 0; ++ break; ++ case 3: ++ *(uint32_t *)b = 0; ++ *(uint32_t *)(b + 4) = 0; ++ break; ++ } ++} ++ ++ ++ ++// Set a small square block of bits in a bitmap ++// Bits must be aligned on their size boundry (which will be true of all split CBs) ++static void set_bits(uint8_t * f, const unsigned int x, const unsigned int stride, const unsigned int ln) ++{ ++ unsigned int n; ++ const unsigned int sh = (x & 7); ++ ++ f += (x >> 3); ++ ++ av_assert2(ln <= 3); ++ av_assert2((x & ((1 << ln) - 1)) == 0); ++ ++ switch (ln) ++ { ++ default: // 1 ++ f[0] |= 1 << sh; ++ break; ++ case 1: // 3 * 2 ++ n = 3 << sh; ++ f[0] |= n; ++ f[stride] |= n; ++ break; ++ case 2: // 0xf * 4 ++ n = 0xf << sh; ++ f[0] |= n; ++ f[stride] |= n; ++ f[stride * 2] |= n; ++ f[stride * 3] |= n; ++ break; ++ case 3: // 0xff * 8 ++ for (n = 0; n != 8; ++n, f += stride) ++ *f = 0xff; ++ 
break; ++ } ++} ++ ++static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16 ++ { // 8 ++ .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu}, ++ .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu} ++ }, ++ { // 9 ++ .luma = {0}, ++ .chroma = {0} ++ }, ++ { // 10 ++ .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu}, ++ .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu} ++ } ++ ++}; ++ ++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici) ++{ ++ const unsigned int n = ici->n; ++ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word ++ ++ ipe->n = n; ++ ipe->max_fill = q1_size - ipe->min_gap; ++ for(unsigned int i = 0; i < n; i++) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base = ++ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size); ++ q->code_setup = qpu_fn(ici->setup_fns[i]); ++ q->code_sync = qpu_fn(ici->sync_fns[i]); ++ q->code_exit = qpu_fn(ici->exit_fns[i]); ++ } ++} ++ ++static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth) ++{ ++ av_assert0(bit_depth >= 8 && bit_depth <= 16); ++ ++ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth); ++} ++ ++// Unsigned Trivial MOD ++static inline unsigned int utmod(const unsigned int x, const unsigned int n) ++{ ++ return x >= n ? 
x - n : x; ++} ++ ++// returns pq->job_n++ ++static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq) ++{ ++ unsigned int const x2 = pq->job_n; ++ pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS); ++ return x2; ++} ++ ++static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n) ++{ ++ pq->terminate = 0; ++ pq->job_n = 0; ++ pq->context = s; ++ pq->worker = worker; ++ pq->psem_out = psem_out; ++ pq->pass_n = n; ++ pq->started = 0; ++ sem_init(&pq->sem_in, 0, 0); ++} ++ ++static void pass_queue_kill(HEVCRpiPassQueue * const pq) ++{ ++ sem_destroy(&pq->sem_in); ++} ++ ++static inline void rpi_sem_wait(sem_t * const sem) ++{ ++ while (sem_wait(sem) != 0) { ++ av_assert0(errno == EINTR); ++ } ++} ++ ++static void pass_queue_submit_job(HEVCRpiPassQueue * const pq) ++{ ++ sem_post(&pq->sem_in); ++} ++ ++static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ // Do the various passes - common with the worker code ++ for (unsigned int i = 0; i != RPI_PASSES; ++i) { ++ s->passq[i].worker(s, jb); ++ } ++} ++ ++ ++#if 0 ++static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func) ++{ ++ int x; ++ sem_getvalue((sem_t *)&jbc->sem_out, &x); ++ printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x); ++} ++#endif ++ ++ ++static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJob * jb; ++ HEVCRpiJobGlobal * const jbg = jbc->jbg; ++ ++ pthread_mutex_lock(&jbg->lock); ++ // Check local 1st ++ if ((jb = jbc->jb1) != NULL) ++ { ++ // Only 1 - very easy :-) ++ jbc->jb1 = NULL; ++ } ++ else ++ { ++ // Now look for global free chain ++ if ((jb = jbg->free1) != NULL) ++ { ++ // Found one - unlink it ++ jbg->free1 = jb->next; ++ jb->next = NULL; ++ } ++ else ++ { ++ // Out of places to look - wait for one to become free - add to Qs ++ ++ // Global ++ // If 
"good" lc then add after the last "good" el in the chain ++ // otherwise add to the tail ++ if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good) ++ { ++ // Add to end as we had to wait last time or wait Q empty ++ if ((lc->jw_prev = jbg->wait_tail) == NULL) ++ jbg->wait_head = lc; ++ else ++ lc->jw_prev->jw_next = lc; ++ lc->jw_next = NULL; ++ jbg->wait_tail = lc; ++ } ++ else ++ { ++ // This is a "good" lc that we need to poke into the middle ++ // of the Q ++ // We know that the Q isn't empty and there is at least one ++ // !last_progess_good el in it from the previous test ++ ++ HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after ++ ++ if (p == NULL) ++ { ++ // No current good els - add to head ++ lc->jw_next = jbg->wait_head; ++ jbg->wait_head = lc; ++ } ++ else ++ { ++ lc->jw_next = p->jw_next; ++ p->jw_next = lc; ++ } ++ ++ lc->jw_next->jw_prev = lc; ++ lc->jw_prev = p; ++ } ++ ++ // If "good" then we are now the last good waiting el ++ if (lc->last_progress_good) ++ jbg->wait_good = lc; ++ ++ // Local ++ if ((lc->ljw_prev = jbc->lcw_tail) == NULL) ++ jbc->lcw_head = lc; ++ else ++ lc->ljw_prev->ljw_next = lc; ++ lc->ljw_next = NULL; ++ jbc->lcw_tail = lc; ++ } ++ } ++ ++ pthread_mutex_unlock(&jbg->lock); ++ ++ if (jb == NULL) // Need to wait ++ { ++ rpi_sem_wait(&lc->jw_sem); ++ jb = lc->jw_job; // Set by free code ++ } ++ ++ return jb; ++} ++ ++ ++static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb) ++{ ++ HEVCRpiJobGlobal * const jbg = jbc0->jbg; // This jbc only used to find jbg so we can get the lock ++ HEVCRpiJobCtl * jbc = jb->jbc_local; ++ HEVCRpiLocalContext * lc = NULL; ++ ++ pthread_mutex_lock(&jbg->lock); ++ ++ if (jbc != NULL) ++ { ++ av_assert1(jbc->jb1 == NULL); ++ ++ // Release to Local if nothing waiting there ++ if ((lc = jbc->lcw_head) == NULL) ++ jbc->jb1 = jb; ++ } ++ else ++ { ++ // Release to global if nothing waiting there ++ if ((lc = jbg->wait_head) == NULL) ++ { 
++ jb->next = jbg->free1; ++ jbg->free1 = jb; ++ } ++ else ++ { ++ // ? seems somehow mildy ugly... ++ jbc = lc->context->jbc; ++ } ++ } ++ ++ if (lc != NULL) ++ { ++ // Something was waiting ++ ++ // Unlink ++ // Global ++ if (lc->jw_next == NULL) ++ jbg->wait_tail = lc->jw_prev; ++ else ++ lc->jw_next->jw_prev = lc->jw_prev; ++ ++ if (lc->jw_prev == NULL) ++ jbg->wait_head = lc->jw_next; ++ else ++ lc->jw_prev->jw_next = lc->jw_next; ++ ++ // Local ++ if (lc->ljw_next == NULL) ++ jbc->lcw_tail = lc->ljw_prev; ++ else ++ lc->ljw_next->ljw_prev = lc->ljw_prev; ++ ++ if (lc->ljw_prev == NULL) ++ jbc->lcw_head = lc->ljw_next; ++ else ++ lc->ljw_prev->ljw_next = lc->ljw_next; ++ ++ // Update good if required ++ if (jbg->wait_good == lc) ++ jbg->wait_good = lc->jw_prev; ++ ++ // Prod ++ lc->jw_job = jb; ++ sem_post(&lc->jw_sem); ++ } ++ ++ pthread_mutex_unlock(&jbg->lock); ++} ++ ++static void job_lc_kill(HEVCRpiLocalContext * const lc) ++{ ++ sem_destroy(&lc->jw_sem); ++} ++ ++static void job_lc_init(HEVCRpiLocalContext * const lc) ++{ ++ lc->jw_next = NULL; ++ lc->jw_prev = NULL; ++ lc->ljw_next = NULL; ++ lc->ljw_prev = NULL; ++ lc->jw_job = NULL; ++ sem_init(&lc->jw_sem, 0, 0); ++} ++ ++// Returns: ++// 0 if we have waited for MV or expect to wait for recon ++// 1 if we haven't waited for MV & do not need to wait for recon ++static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb) ++{ ++ if (jb->waited) // reset by rpi_begin ++ return 0; ++ for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) ++ { ++ if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL && ++ ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i]) ++ return 0; ++ } ++ return 1; ++} ++ ++// Submit job if it is full (indicated by having ctu_ts_last set >= 0) ++static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl *const jbc = s->jbc; ++ HEVCRpiJob * const jb = lc->jb0; ++ 
++ av_assert1(jb != NULL); ++ ++ if (jb->ctu_ts_last < 0) { ++ return; ++ } ++ ++ lc->last_progress_good = progress_good(s, jb); ++ jb->waited = !lc->last_progress_good; ++ lc->jb0 = NULL; ++ ++ if (s->offload_recon) ++ { ++ pthread_mutex_lock(&jbc->in_lock); ++ jbc->offloadq[jbc->offload_in] = jb; ++ jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS); ++ pthread_mutex_unlock(&jbc->in_lock); ++ ++ pass_queue_submit_job(s->passq + 0); // Consumes job eventually ++ } ++ else ++ { ++ pass_queue_do_all(s, jb); // Consumes job before return ++ } ++} ++ ++ ++// Call worker_pass0_ready to wait until the s->pass0_job slot becomes ++// available to receive the next job. ++// ++// Now safe against multiple callers - needed for tiles ++// "normal" and WPP will only call here one at a time ++static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl * const jbc = s->jbc; ++ ++ // It is legit for us to already have a job allocated - do nothing in this case ++ if (lc->jb0 != NULL) ++ return; ++ ++ if (s->offload_recon) ++ rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much ++ ++ lc->jb0 = job_alloc(jbc, lc); ++ ++ rpi_begin(s, lc->jb0, lc->ts); ++} ++ ++// Free up a job without submission ++static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl * const jbc = s->jbc; ++ HEVCRpiJob * const jb = lc->jb0; ++ ++ if (jb == NULL) { ++ return; ++ } ++ ++ lc->jb0 = NULL; ++ ++ job_free(jbc, jb); ++ ++ // If offload then poke sem_out too ++ if (s->offload_recon) { ++ sem_post(&jbc->sem_out); ++ } ++} ++ ++ ++// Call this to wait for all jobs to have completed at the end of a frame ++// Slightly icky as there is no clean way to wait for a sem to count up ++// Not reentrant - call on main thread only ++static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc) ++{ ++ HEVCRpiJobCtl * const jbc = s->jbc; ++ int i 
= 0; ++ ++ // We shouldn't reach here with an unsubmitted job ++ av_assert1(lc->jb0 == NULL); ++ ++ // If no offload then there can't be anything to wait for ++ if (!s->offload_recon) { ++ return; ++ } ++ ++ if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS) ++ { ++ for (i = 0; i != RPI_MAX_JOBS; ++i) { ++ rpi_sem_wait(&jbc->sem_out); ++ } ++ for (i = 0; i != RPI_MAX_JOBS; ++i) { ++ sem_post(&jbc->sem_out); ++ } ++ } ++} ++ ++static void * pass_worker(void *arg) ++{ ++ HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg; ++ HEVCRpiContext *const s = pq->context; ++ ++ for (;;) ++ { ++ rpi_sem_wait(&pq->sem_in); ++ ++ if (pq->terminate) ++ break; ++ ++ pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]); ++ // * should really set jb->passes_done here ++ ++ sem_post(pq->psem_out); ++ } ++ return NULL; ++} ++ ++static void pass_queues_start_all(HEVCRpiContext *const s) ++{ ++ unsigned int i; ++ HEVCRpiPassQueue * const pqs = s->passq; ++ ++ for (i = 0; i != RPI_PASSES; ++i) ++ { ++ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0); ++ pqs[i].started = 1; ++ } ++} ++ ++static void pass_queues_term_all(HEVCRpiContext *const s) ++{ ++ unsigned int i; ++ HEVCRpiPassQueue * const pqs = s->passq; ++ ++ for (i = 0; i != RPI_PASSES; ++i) ++ pqs[i].terminate = 1; ++ for (i = 0; i != RPI_PASSES; ++i) ++ { ++ if (pqs[i].started) ++ sem_post(&pqs[i].sem_in); ++ } ++ for (i = 0; i != RPI_PASSES; ++i) ++ { ++ if (pqs[i].started) { ++ pthread_join(pqs[i].thread, NULL); ++ pqs[i].started = 0; ++ } ++ } ++} ++ ++static void pass_queues_kill_all(HEVCRpiContext *const s) ++{ ++ unsigned int i; ++ HEVCRpiPassQueue * const pqs = s->passq; ++ ++ for (i = 0; i != RPI_PASSES; ++i) ++ pass_queue_kill(pqs + i); ++} ++ ++ ++static void worker_pic_free_one(HEVCRpiJob * const jb) ++{ ++ // Free coeff stuff - allocation not the same for all buffers ++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ ++ if (cf->s[0].buf != NULL) ++ av_freep(&cf->mptr); ++ if 
(cf->s[2].buf != NULL) ++ gpu_free(&cf->gptr); ++ memset(cf, 0, sizeof(*cf)); ++} ++ ++static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count) ++{ ++ HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ ++ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0) ++ goto fail; ++ cf->s[2].buf = (int16_t *)cf->gptr.arm; ++ cf->s[3].buf = cf->s[2].buf + coeff_count; ++ ++ // Must be 64 byte aligned for our zero zapping code so over-allocate & ++ // round ++ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL) ++ goto fail; ++ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63); ++ return 0; ++ ++fail: ++ av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__); ++ worker_pic_free_one(jb); ++ return -1; ++} ++ ++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf) ++{ ++ unsigned int i; ++ for (i = 0; i != 4; ++i) { ++ cf->s[i].n = 0; ++#if RPI_COMPRESS_COEFFS ++ cf->s[i].packed = 1; ++ cf->s[i].packed_n = 0; ++#endif ++ } ++} ++ ++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n) ++{ ++ HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no; ++ int16_t * const coeffs = (buf_no != 3) ? 
cfe->buf + cfe->n : cfe->buf - (cfe->n + n); ++ cfe->n += n; ++ return coeffs; ++} ++ ++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const HEVCRpiFrame * const ref, const int val, const int field) ++{ ++ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) { ++ HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data; ++ HEVCRpiFrameProgressState * const pstate = fs->progress_states + field; ++ sem_t * sem = NULL; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ if (((volatile int *)ref->tf.progress->data)[field] < val) { ++ HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait; ++ ++ av_assert1(pwait->req == -1 && pwait->next == NULL); ++ jb->waited = 1; // Remember that we had to wait for later scheduling ++ ++ pwait->req = val; ++ pwait->next = NULL; ++ if (pstate->first == NULL) ++ pstate->first = pwait; ++ else ++ pstate->last->next = pwait; ++ pstate->last = pwait; ++ sem = &pwait->sem; ++ } ++ pthread_mutex_unlock(&pstate->lock); ++ ++ if (sem != NULL) { ++ rpi_sem_wait(sem); ++ } ++ } ++} ++ ++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field) ++{ ++ HEVCRpiFrameProgressState *const pstate = s->progress_states + field; ++ ++ ((int *)s->ref->tf.progress->data)[field] = val; ++ ++ av_assert0(pthread_mutex_lock(&pstate->lock) == 0); ++ { ++ HEVCRpiFrameProgressWait ** ppwait = &pstate->first; ++ HEVCRpiFrameProgressWait * pwait; ++ ++ while ((pwait = *ppwait) != NULL) { ++ if (pwait->req > val) ++ { ++ ppwait = &pwait->next; ++ pstate->last = pwait; ++ } ++ else ++ { ++ *ppwait = pwait->next; ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_post(&pwait->sem); ++ } ++ } ++ } ++ pthread_mutex_unlock(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate) ++{ ++ pstate->first = NULL; ++ pstate->last = NULL; ++ pthread_mutex_init(&pstate->lock, NULL); ++} ++ ++static 
void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait) ++{ ++ pwait->req = -1; ++ pwait->next = NULL; ++ sem_init(&pwait->sem, 0, 0); ++} ++ ++static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate) ++{ ++ av_assert1(pstate->first == NULL); ++ pthread_mutex_destroy(&pstate->lock); ++} ++ ++static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait) ++{ ++ sem_destroy(&pwait->sem); ++} ++ ++ ++/** ++ * NOTE: Each function hls_foo correspond to the function foo in the ++ * specification (HLS stands for High Level Syntax). ++ */ ++ ++/** ++ * Section 5.7 ++ */ ++ ++// Realloc the entry point arrays ++static int alloc_entry_points(RpiSliceHeader * const sh, const int n) ++{ ++ if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0) ++ { ++ // Round up alloc to multiple of 32 ++ int a = (n + 31) & ~31; ++ ++ // We don't care about the previous contents so probably fastest to simply discard ++ av_freep(&sh->entry_point_offset); ++ av_freep(&sh->offset); ++ av_freep(&sh->size); ++ ++ if (a != 0) ++ { ++ sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned)); ++ sh->offset = av_malloc_array(a, sizeof(int)); ++ sh->size = av_malloc_array(a, sizeof(int)); ++ ++ if (!sh->entry_point_offset || !sh->offset || !sh->size) { ++ sh->num_entry_point_offsets = 0; ++ sh->offsets_allocated = 0; ++ return AVERROR(ENOMEM); ++ } ++ } ++ ++ sh->offsets_allocated = a; ++ } ++ ++ return 0; ++} ++ ++/* free everything allocated by pic_arrays_init() */ ++static void pic_arrays_free(HEVCRpiContext *s) ++{ ++ av_freep(&s->sao); ++ av_freep(&s->deblock); ++ ++ av_freep(&s->cabac_stash_up); ++ s->cabac_stash_left = NULL; // freed with _up ++ ++ av_freep(&s->mvf_up); ++ av_freep(&s->mvf_left); ++ ++ av_freep(&s->is_pcm); ++ av_freep(&s->is_intra_store); ++ s->is_intra = NULL; ++ av_freep(&s->rpl_tab); ++ s->rpl_tab_size = 0; ++ ++ av_freep(&s->qp_y_tab); ++ av_freep(&s->tab_slice_address); ++ 
av_freep(&s->filter_slice_edges); ++ ++ av_freep(&s->bs_horizontal); ++ s->bs_vertical = NULL; // freed with H ++ av_freep(&s->bsf_stash_left); ++ av_freep(&s->bsf_stash_up); ++ ++ av_freep(&s->rpl_up); ++ av_freep(&s->rpl_left); ++ ++ alloc_entry_points(&s->sh, 0); ++ ++ av_buffer_pool_uninit(&s->col_mvf_pool); ++} ++ ++/* allocate arrays that depend on frame dimensions */ ++static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps) ++{ ++ const unsigned int log2_min_cb_size = sps->log2_min_cb_size; ++ const unsigned int width = sps->width; ++ const unsigned int height = sps->height; ++ const unsigned int pic_size_in_cb = ((width >> log2_min_cb_size) + 1) * ++ ((height >> log2_min_cb_size) + 1); ++ const unsigned int ctb_count = sps->ctb_size; ++ ++ { ++ unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK); ++ unsigned int h = ((height + 15) & ~15); ++ ++ s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size ++ s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols ++ } ++ ++ s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly ++ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock)); ++ if (!s->sao || !s->deblock) ++ goto fail; ++ ++ s->cabac_stash_up = av_malloc((((width + 63) & ~63) >> 3) + (((height + 63) & ~63) >> 3)); ++ s->cabac_stash_left = s->cabac_stash_up + (((width + 63) & ~63) >> 3); ++ if (s->cabac_stash_up == NULL) ++ goto fail; ++ ++ // Round width up to max ctb size ++ s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); ++ // * Only needed if we have H tiles ++ s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up)); ++ ++ // We can overread by 1 line & one byte in deblock so alloc & zero ++ // We don't need to zero the extra @ start of frame as it will never be ++ // written ++ s->is_pcm = av_mallocz(sps->pcm_width * 
(sps->pcm_height + 1) + 1); ++ s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1); ++ if (s->is_pcm == NULL || s->is_intra_store == NULL) ++ goto fail; ++ ++ s->filter_slice_edges = av_mallocz(ctb_count); ++ s->tab_slice_address = av_malloc_array(ctb_count, ++ sizeof(*s->tab_slice_address)); ++ s->qp_y_tab = av_malloc_array(pic_size_in_cb, ++ sizeof(*s->qp_y_tab)); ++ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address) ++ goto fail; ++ ++ s->bs_horizontal = av_mallocz(s->bs_size * 2); ++ s->bs_vertical = s->bs_horizontal + s->bs_size; ++ if (s->bs_horizontal == NULL) ++ goto fail; ++ ++ s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up)); ++ s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left)); ++ if (s->rpl_left == NULL || s->rpl_up == NULL) ++ goto fail; ++ ++ if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL || ++ (s->bsf_stash_up = av_mallocz(((width + 63) & ~63) >> 4)) == NULL) ++ goto fail; ++ ++ s->col_mvf_stride = (width + 15) >> 4; ++ s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField), ++ av_buffer_allocz); ++ if (s->col_mvf_pool == NULL) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ pic_arrays_free(s); ++ return AVERROR(ENOMEM); ++} ++ ++static void default_pred_weight_table(HEVCRpiContext * const s) ++{ ++ unsigned int i; ++ const unsigned int wt = 1 << QPU_MC_DENOM; ++ s->sh.luma_log2_weight_denom = 0; ++ s->sh.chroma_log2_weight_denom = 0; ++ for (i = 0; i < s->sh.nb_refs[L0]; i++) { ++ s->sh.luma_weight_l0[i] = wt; ++ s->sh.luma_offset_l0[i] = 0; ++ s->sh.chroma_weight_l0[i][0] = wt; ++ s->sh.chroma_weight_l0[i][1] = wt; ++ s->sh.chroma_offset_l0[i][0] = 0; ++ s->sh.chroma_offset_l0[i][1] = 0; ++ } ++ for (i = 0; i < s->sh.nb_refs[L1]; i++) { ++ s->sh.luma_weight_l1[i] = wt; ++ s->sh.luma_offset_l1[i] = 0; ++ s->sh.chroma_weight_l1[i][0] = wt; ++ s->sh.chroma_weight_l1[i][1] = wt; ++ s->sh.chroma_offset_l1[i][0] = 0; ++ 
s->sh.chroma_offset_l1[i][1] = 0; ++ } ++} ++ ++static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb, ++ const unsigned int refs, ++ int16_t * luma_weight, int16_t * luma_offset, ++ int16_t * chroma_weight, int16_t * chroma_offset) ++{ ++ unsigned int luma_flags; ++ unsigned int chroma_flags; ++ unsigned int i; ++ const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8); ++ const int wp_offset_half_range = s->ps.sps->wp_offset_half_range; ++ const unsigned int luma_weight_base = 1 << QPU_MC_DENOM; ++ const unsigned int chroma_weight_base = 1 << QPU_MC_DENOM; ++ const unsigned int luma_weight_shift = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom); ++ const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom); ++ ++ if (refs == 0) ++ return 0; ++ ++ luma_flags = get_bits(gb, refs); ++ chroma_flags = ctx_cfmt(s) == 0 ? 0 : get_bits(gb, refs); ++ i = 1 << (refs - 1); ++ ++ do ++ { ++ if ((luma_flags & i) != 0) ++ { ++ const int delta_weight = get_se_golomb(gb); ++ const int offset = get_se_golomb(gb); ++ if (delta_weight < -128 || delta_weight > 127 || ++ offset < -wp_offset_half_range || offset >= wp_offset_half_range) ++ { ++ return AVERROR_INVALIDDATA; ++ } ++ *luma_weight++ = luma_weight_base + (delta_weight << luma_weight_shift); ++ *luma_offset++ = offset << wp_offset_bd_shift; ++ } ++ else ++ { ++ *luma_weight++ = luma_weight_base; ++ *luma_offset++ = 0; ++ } ++ ++ if ((chroma_flags & i) != 0) ++ { ++ unsigned int j; ++ for (j = 0; j != 2; ++j) ++ { ++ const int delta_weight = get_se_golomb(gb); ++ const int delta_offset = get_se_golomb(gb); ++ ++ if (delta_weight < -128 || delta_weight > 127 || ++ delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range) ++ { ++ return AVERROR_INVALIDDATA; ++ } ++ ++ *chroma_weight++ = chroma_weight_base + (delta_weight << chroma_weight_shift); ++ *chroma_offset++ = av_clip( ++ 
wp_offset_half_range + delta_offset - ++ ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom), ++ -wp_offset_half_range, wp_offset_half_range - 1) << wp_offset_bd_shift; ++ } ++ } ++ else ++ { ++ *chroma_weight++ = chroma_weight_base; ++ *chroma_weight++ = chroma_weight_base; ++ *chroma_offset++ = 0; ++ *chroma_offset++ = 0; ++ } ++ } while ((i >>= 1) != 0); ++ ++ return 0; ++} ++ ++static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb) ++{ ++ int err; ++ const unsigned int luma_log2_weight_denom = get_ue_golomb_long(gb); ++ const unsigned int chroma_log2_weight_denom = (ctx_cfmt(s) == 0) ? 0 : luma_log2_weight_denom + get_se_golomb(gb); ++ ++ if (luma_log2_weight_denom > 7 || ++ chroma_log2_weight_denom > 7) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n", ++ luma_log2_weight_denom, chroma_log2_weight_denom); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ s->sh.luma_log2_weight_denom = luma_log2_weight_denom; ++ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom; ++ ++ if ((err = get_weights(s, gb, s->sh.nb_refs[L0], ++ s->sh.luma_weight_l0, s->sh.luma_offset_l0, ++ s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0])) != 0 || ++ (err = get_weights(s, gb, s->sh.nb_refs[L1], ++ s->sh.luma_weight_l1, s->sh.luma_offset_l1, ++ s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0])) != 0) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n"); ++ return err; ++ } ++ ++ return 0; ++} ++ ++static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb) ++{ ++ const HEVCRpiSPS *sps = s->ps.sps; ++ int max_poc_lsb = 1 << sps->log2_max_poc_lsb; ++ int prev_delta_msb = 0; ++ unsigned int nb_sps = 0, nb_sh; ++ int i; ++ ++ rps->nb_refs = 0; ++ if (!sps->long_term_ref_pics_present_flag) ++ return 0; ++ ++ if (sps->num_long_term_ref_pics_sps > 0) ++ nb_sps = get_ue_golomb_long(gb); ++ nb_sh = 
get_ue_golomb_long(gb); ++ ++ if (nb_sps > sps->num_long_term_ref_pics_sps) ++ return AVERROR_INVALIDDATA; ++ if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc)) ++ return AVERROR_INVALIDDATA; ++ ++ rps->nb_refs = nb_sh + nb_sps; ++ ++ for (i = 0; i < rps->nb_refs; i++) { ++ uint8_t delta_poc_msb_present; ++ ++ if (i < nb_sps) { ++ uint8_t lt_idx_sps = 0; ++ ++ if (sps->num_long_term_ref_pics_sps > 1) ++ lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps)); ++ ++ rps->poc[i] = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps]; ++ rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps]; ++ } else { ++ rps->poc[i] = get_bits(gb, sps->log2_max_poc_lsb); ++ rps->used[i] = get_bits1(gb); ++ } ++ ++ delta_poc_msb_present = get_bits1(gb); ++ if (delta_poc_msb_present) { ++ int64_t delta = get_ue_golomb_long(gb); ++ int64_t poc; ++ ++ if (i && i != nb_sps) ++ delta += prev_delta_msb; ++ ++ poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb; ++ if (poc != (int32_t)poc) ++ return AVERROR_INVALIDDATA; ++ rps->poc[i] = poc; ++ prev_delta_msb = delta; ++ } ++ } ++ ++ return 0; ++} ++ ++static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps, ++ const HEVCRpiSPS *sps) ++{ ++ const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data; ++ const HEVCRpiWindow *ow = &sps->output_window; ++ unsigned int num = 0, den = 0; ++ ++ avctx->pix_fmt = sps->pix_fmt; ++ avctx->coded_width = sps->width; ++ avctx->coded_height = sps->height; ++ avctx->width = sps->width - ow->left_offset - ow->right_offset; ++ avctx->height = sps->height - ow->top_offset - ow->bottom_offset; ++ avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics; ++ avctx->profile = sps->ptl.general_ptl.profile_idc; ++ avctx->level = sps->ptl.general_ptl.level_idc; ++ ++ ff_set_sar(avctx, sps->vui.sar); ++ ++ if (sps->vui.video_signal_type_present_flag) ++ avctx->color_range = sps->vui.video_full_range_flag ? 
AVCOL_RANGE_JPEG ++ : AVCOL_RANGE_MPEG; ++ else ++ avctx->color_range = AVCOL_RANGE_MPEG; ++ ++ if (sps->vui.colour_description_present_flag) { ++ avctx->color_primaries = sps->vui.colour_primaries; ++ avctx->color_trc = sps->vui.transfer_characteristic; ++ avctx->colorspace = sps->vui.matrix_coeffs; ++ } else { ++ avctx->color_primaries = AVCOL_PRI_UNSPECIFIED; ++ avctx->color_trc = AVCOL_TRC_UNSPECIFIED; ++ avctx->colorspace = AVCOL_SPC_UNSPECIFIED; ++ } ++ ++ if (vps->vps_timing_info_present_flag) { ++ num = vps->vps_num_units_in_tick; ++ den = vps->vps_time_scale; ++ } else if (sps->vui.vui_timing_info_present_flag) { ++ num = sps->vui.vui_num_units_in_tick; ++ den = sps->vui.vui_time_scale; ++ } ++ ++ if (num != 0 && den != 0) ++ av_reduce(&avctx->framerate.den, &avctx->framerate.num, ++ num, den, 1 << 30); ++} ++ ++static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps) ++{ ++ enum AVPixelFormat pix_fmts[4], *fmt = pix_fmts; ++ ++ // Admit to no h/w formats ++ ++ *fmt++ = sps->pix_fmt; ++ *fmt = AV_PIX_FMT_NONE; ++ ++ return pix_fmts[0] == AV_PIX_FMT_NONE ? 
AV_PIX_FMT_NONE: ff_thread_get_format(s->avctx, pix_fmts); ++} ++ ++static int is_sps_supported(const HEVCRpiSPS * const sps) ++{ ++ return av_rpi_is_sand_format(sps->pix_fmt) && ++ sps->width <= HEVC_RPI_MAX_WIDTH && ++ sps->height <= HEVC_RPI_MAX_HEIGHT; ++} ++ ++static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps, ++ const enum AVPixelFormat pix_fmt) ++{ ++ int ret; ++ ++ pic_arrays_free(s); ++ s->ps.sps = NULL; ++ s->ps.vps = NULL; ++ ++ if (sps == NULL) ++ return 0; ++ ++ if (!is_sps_supported(sps)) ++ return AVERROR_DECODER_NOT_FOUND; ++ ++ ret = pic_arrays_init(s, sps); ++ if (ret < 0) ++ goto fail; ++ ++ export_stream_params(s->avctx, &s->ps, sps); ++ ++ s->avctx->pix_fmt = pix_fmt; ++ ++ ff_hevc_rpi_pred_init(&s->hpc, sps->bit_depth); ++ ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth); ++ ++ // * We don't support cross_component_prediction_enabled_flag but as that ++ // must be 0 unless we have 4:4:4 there is no point testing for it as we ++ // only deal with sand which is never 4:4:4 ++ // [support wouldn't be hard] ++ ++ rpi_hevc_qpu_set_fns(s, sps->bit_depth); ++ ++ av_freep(&s->sao_pixel_buffer_h[0]); ++ av_freep(&s->sao_pixel_buffer_v[0]); ++ ++ if (sps->sao_enabled) ++ { ++ const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1; ++ unsigned int c_idx; ++ size_t vsize[3] = {0}; ++ size_t hsize[3] = {0}; ++ ++ for(c_idx = 0; c_idx < c_count; c_idx++) { ++ int w = sps->width >> ctx_hshift(s, c_idx); ++ int h = sps->height >> ctx_vshift(s, c_idx); ++ // ctb height & width are a min of 8 so this must a multiple of 16 ++ // so no point rounding up! 
++ hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift; ++ vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift; ++ } ++ ++ // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2] ++ // when we have plaited chroma ++ s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]); ++ s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]); ++ s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0]; ++ s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1]; ++ s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0]; ++ s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1]; ++ } ++ ++ s->ps.sps = sps; ++ s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data; ++ ++ return 0; ++ ++fail: ++ pic_arrays_free(s); ++ s->ps.sps = NULL; ++ return ret; ++} ++ ++static inline int qp_offset_valid(const int qp_offset) ++{ ++ return qp_offset >= -12 && qp_offset <= 12; ++} ++ ++static int hls_slice_header(HEVCRpiContext * const s) ++{ ++ GetBitContext * const gb = &s->HEVClc->gb; ++ RpiSliceHeader * const sh = &s->sh; ++ int i, ret; ++ ++ // Coded parameters ++ sh->first_slice_in_pic_flag = get_bits1(gb); ++ if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) { ++ s->seq_decode = (s->seq_decode + 1) & 0xff; ++ s->max_ra = INT_MAX; ++ if (IS_IDR(s)) ++ ff_hevc_rpi_clear_refs(s); ++ } ++ sh->no_output_of_prior_pics_flag = 0; ++ if (IS_IRAP(s)) ++ sh->no_output_of_prior_pics_flag = get_bits1(gb); ++ ++ sh->pps_id = get_ue_golomb_long(gb); ++ if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) { ++ av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id); ++ return AVERROR_INVALIDDATA; ++ } ++ if (!sh->first_slice_in_pic_flag && ++ s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) { ++ av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ s->ps.pps = 
(HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data; ++ if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1) ++ sh->no_output_of_prior_pics_flag = 1; ++ ++ if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) { ++ const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data; ++ const HEVCRpiSPS *last_sps = s->ps.sps; ++ enum AVPixelFormat pix_fmt; ++ ++ if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) { ++ if (sps->width != last_sps->width || sps->height != last_sps->height || ++ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering != ++ last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering) ++ sh->no_output_of_prior_pics_flag = 0; ++ } ++ ff_hevc_rpi_clear_refs(s); ++ ++ ret = set_sps(s, sps, sps->pix_fmt); ++ if (ret < 0) ++ return ret; ++ ++ pix_fmt = get_format(s, sps); ++ if (pix_fmt < 0) ++ return pix_fmt; ++ ++// ret = set_sps(s, sps, pix_fmt); ++// if (ret < 0) ++// return ret; ++ ++ s->avctx->pix_fmt = pix_fmt; ++ ++ s->seq_decode = (s->seq_decode + 1) & 0xff; ++ s->max_ra = INT_MAX; ++ } ++ ++ sh->dependent_slice_segment_flag = 0; ++ if (!sh->first_slice_in_pic_flag) { ++ int slice_address_length; ++ ++ if (s->ps.pps->dependent_slice_segments_enabled_flag) ++ sh->dependent_slice_segment_flag = get_bits1(gb); ++ ++ slice_address_length = av_ceil_log2(s->ps.sps->ctb_size); ++ sh->slice_segment_addr = get_bitsz(gb, slice_address_length); ++ if (sh->slice_segment_addr >= s->ps.sps->ctb_size) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Invalid slice segment address: %u.\n", ++ sh->slice_segment_addr); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (!sh->dependent_slice_segment_flag) { ++ sh->slice_addr = sh->slice_segment_addr; ++ s->slice_idx++; ++ } ++ } else { ++ sh->slice_segment_addr = sh->slice_addr = 0; ++ s->slice_idx = 0; ++ s->slice_initialized = 0; ++ } ++ ++ if (!sh->dependent_slice_segment_flag) { ++ s->slice_initialized = 0; ++ ++ for (i = 0; i < 
s->ps.pps->num_extra_slice_header_bits; i++) ++ skip_bits(gb, 1); // slice_reserved_undetermined_flag[] ++ ++ sh->slice_type = get_ue_golomb_long(gb); ++ if (!(sh->slice_type == HEVC_SLICE_I || ++ sh->slice_type == HEVC_SLICE_P || ++ sh->slice_type == HEVC_SLICE_B)) { ++ av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n", ++ sh->slice_type); ++ return AVERROR_INVALIDDATA; ++ } ++ if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) { ++ av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ // when flag is not present, picture is inferred to be output ++ sh->pic_output_flag = 1; ++ if (s->ps.pps->output_flag_present_flag) ++ sh->pic_output_flag = get_bits1(gb); ++ ++ if (s->ps.sps->separate_colour_plane_flag) ++ sh->colour_plane_id = get_bits(gb, 2); ++ ++ if (!IS_IDR(s)) { ++ int poc, pos; ++ ++ sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb); ++ poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type); ++ if (!sh->first_slice_in_pic_flag && poc != s->poc) { ++ av_log(s->avctx, AV_LOG_WARNING, ++ "Ignoring POC change between slices: %d -> %d\n", s->poc, poc); ++ if (s->avctx->err_recognition & AV_EF_EXPLODE) ++ return AVERROR_INVALIDDATA; ++ poc = s->poc; ++ } ++ s->poc = poc; ++ ++ sh->short_term_ref_pic_set_sps_flag = get_bits1(gb); ++ pos = get_bits_left(gb); ++ if (!sh->short_term_ref_pic_set_sps_flag) { ++ ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1); ++ if (ret < 0) ++ return ret; ++ ++ sh->short_term_rps = &sh->slice_rps; ++ } else { ++ int numbits, rps_idx; ++ ++ if (!s->ps.sps->nb_st_rps) { ++ av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ numbits = av_ceil_log2(s->ps.sps->nb_st_rps); ++ rps_idx = numbits > 0 ? 
get_bits(gb, numbits) : 0; ++ sh->short_term_rps = &s->ps.sps->st_rps[rps_idx]; ++ } ++ sh->short_term_ref_pic_set_size = pos - get_bits_left(gb); ++ ++ pos = get_bits_left(gb); ++ ret = decode_lt_rps(s, &sh->long_term_rps, gb); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n"); ++ if (s->avctx->err_recognition & AV_EF_EXPLODE) ++ return AVERROR_INVALIDDATA; ++ } ++ sh->long_term_ref_pic_set_size = pos - get_bits_left(gb); ++ ++ if (s->ps.sps->sps_temporal_mvp_enabled_flag) ++ sh->slice_temporal_mvp_enabled_flag = get_bits1(gb); ++ else ++ sh->slice_temporal_mvp_enabled_flag = 0; ++ } else { ++ s->sh.short_term_rps = NULL; ++ s->poc = 0; ++ } ++ ++ /* 8.3.1 */ ++ if (sh->first_slice_in_pic_flag && s->temporal_id == 0 && ++ s->nal_unit_type != HEVC_NAL_TRAIL_N && ++ s->nal_unit_type != HEVC_NAL_TSA_N && ++ s->nal_unit_type != HEVC_NAL_STSA_N && ++ s->nal_unit_type != HEVC_NAL_RADL_N && ++ s->nal_unit_type != HEVC_NAL_RADL_R && ++ s->nal_unit_type != HEVC_NAL_RASL_N && ++ s->nal_unit_type != HEVC_NAL_RASL_R) ++ s->pocTid0 = s->poc; ++ ++ if (s->ps.sps->sao_enabled) { ++ sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb); ++ if (ctx_cfmt(s) != 0) { ++ sh->slice_sample_adaptive_offset_flag[1] = ++ sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb); ++ } ++ } else { ++ sh->slice_sample_adaptive_offset_flag[0] = 0; ++ sh->slice_sample_adaptive_offset_flag[1] = 0; ++ sh->slice_sample_adaptive_offset_flag[2] = 0; ++ } ++ ++ sh->nb_refs[L0] = sh->nb_refs[L1] = 0; ++ if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) { ++ int nb_refs; ++ ++ sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active; ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active; ++ ++ if (get_bits1(gb)) { // num_ref_idx_active_override_flag ++ sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1; ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1; ++ } ++ if 
(sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) { ++ av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n", ++ sh->nb_refs[L0], sh->nb_refs[L1]); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sh->rpl_modification_flag[0] = 0; ++ sh->rpl_modification_flag[1] = 0; ++ nb_refs = ff_hevc_rpi_frame_nb_refs(s); ++ if (!nb_refs) { ++ av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) { ++ sh->rpl_modification_flag[0] = get_bits1(gb); ++ if (sh->rpl_modification_flag[0]) { ++ for (i = 0; i < sh->nb_refs[L0]; i++) ++ sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs)); ++ } ++ ++ if (sh->slice_type == HEVC_SLICE_B) { ++ sh->rpl_modification_flag[1] = get_bits1(gb); ++ if (sh->rpl_modification_flag[1] == 1) ++ for (i = 0; i < sh->nb_refs[L1]; i++) ++ sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs)); ++ } ++ } ++ ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->mvd_l1_zero_flag = get_bits1(gb); ++ ++ if (s->ps.pps->cabac_init_present_flag) ++ sh->cabac_init_flag = get_bits1(gb); ++ else ++ sh->cabac_init_flag = 0; ++ ++ sh->collocated_ref_idx = 0; ++ if (sh->slice_temporal_mvp_enabled_flag) { ++ sh->collocated_list = L0; ++ if (sh->slice_type == HEVC_SLICE_B) ++ sh->collocated_list = !get_bits1(gb); ++ ++ if (sh->nb_refs[sh->collocated_list] > 1) { ++ sh->collocated_ref_idx = get_ue_golomb_long(gb); ++ if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Invalid collocated_ref_idx: %d.\n", ++ sh->collocated_ref_idx); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ } ++ ++ if ((s->ps.pps->weighted_pred_flag && sh->slice_type == HEVC_SLICE_P) || ++ (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B)) ++ { ++ if ((ret = pred_weight_table(s, gb)) != 0) ++ return ret; ++ } ++ else ++ { ++ // Give us unit weights ++ default_pred_weight_table(s); ++ } 
++ ++ sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb); ++ if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Invalid number of merging MVP candidates: %d.\n", ++ sh->max_num_merge_cand); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ sh->slice_qp_delta = get_se_golomb(gb); ++ ++ if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) { ++ sh->slice_cb_qp_offset = get_se_golomb(gb); ++ sh->slice_cr_qp_offset = get_se_golomb(gb); ++ if (!qp_offset_valid(sh->slice_cb_qp_offset) || ++ !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) || ++ !qp_offset_valid(sh->slice_cr_qp_offset) || ++ !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset)) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps:%d/%d; slice=%d/%d\n", ++ sh->slice_cr_qp_offset, sh->slice_cr_qp_offset, ++ s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset); ++ return AVERROR_INVALIDDATA; ++ } ++ } else ++ { ++ sh->slice_cb_qp_offset = 0; ++ sh->slice_cr_qp_offset = 0; ++ } ++ ++ if (s->ps.pps->chroma_qp_offset_list_enabled_flag) ++ sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb); ++ else ++ sh->cu_chroma_qp_offset_enabled_flag = 0; ++ ++ if (s->ps.pps->deblocking_filter_control_present_flag) { ++ int deblocking_filter_override_flag = 0; ++ ++ if (s->ps.pps->deblocking_filter_override_enabled_flag) ++ deblocking_filter_override_flag = get_bits1(gb); ++ ++ if (deblocking_filter_override_flag) { ++ sh->disable_deblocking_filter_flag = get_bits1(gb); ++ if (!sh->disable_deblocking_filter_flag) { ++ int beta_offset_div2 = get_se_golomb(gb); ++ int tc_offset_div2 = get_se_golomb(gb) ; ++ if (beta_offset_div2 < -6 || beta_offset_div2 > 6 || ++ tc_offset_div2 < -6 || tc_offset_div2 > 6) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Invalid deblock filter offsets: %d, %d\n", ++ beta_offset_div2, tc_offset_div2); ++ return AVERROR_INVALIDDATA; ++ } ++ sh->beta_offset = beta_offset_div2 * 2; ++ sh->tc_offset = 
tc_offset_div2 * 2; ++ } ++ } else { ++ sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf; ++ sh->beta_offset = s->ps.pps->beta_offset; ++ sh->tc_offset = s->ps.pps->tc_offset; ++ } ++ } else { ++ sh->disable_deblocking_filter_flag = 0; ++ sh->beta_offset = 0; ++ sh->tc_offset = 0; ++ } ++ ++ if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag && ++ (sh->slice_sample_adaptive_offset_flag[0] || ++ sh->slice_sample_adaptive_offset_flag[1] || ++ !sh->disable_deblocking_filter_flag)) { ++ sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb); ++ } else { ++ sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag; ++ } ++ sh->no_dblk_boundary_flags = ++ (sh->slice_loop_filter_across_slices_enabled_flag ? 0 : ++ BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) | ++ (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 : ++ BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE); ++ ++ ++ } else if (!s->slice_initialized) { ++ av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sh->num_entry_point_offsets = 0; ++ sh->offload_wpp = 0; ++ sh->offload_tiles = 0; ++ ++ if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) { ++ unsigned num_entry_point_offsets = get_ue_golomb_long(gb); ++ // It would be possible to bound this tighter but this here is simpler ++ if (num_entry_point_offsets > get_bits_left(gb)) { ++ av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ sh->num_entry_point_offsets = num_entry_point_offsets; ++ if (sh->num_entry_point_offsets > 0) { ++ int offset_len = get_ue_golomb_long(gb) + 1; ++ ++ if (offset_len < 1 || offset_len > 32) { ++ sh->num_entry_point_offsets = 0; ++ av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if ((ret = alloc_entry_points(sh, 
sh->num_entry_point_offsets)) < 0) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n"); ++ return ret; ++ } ++ ++ for (i = 0; i < sh->num_entry_point_offsets; i++) { ++ uint32_t val_minus1 = get_bits_long(gb, offset_len); ++ if (val_minus1 > (1 << 28)) ++ { ++ // We can declare offsets of > 2^28 bad without loss of generality ++ // Will check actual bounds wrt NAL later, but this keeps ++ // the values within bounds we can deal with easily ++ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1); ++ return AVERROR_INVALIDDATA; ++ } ++ sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size ++ } ++ ++ // Do we want to offload this ++ if (s->threads_type != 0) ++ { ++ sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) && ++ s->ps.pps->num_tile_columns > 1; ++ // * We only cope with WPP in a single column ++ // Probably want to deal with that case as tiles rather than WPP anyway ++ // ?? Not actually sure that the main code deals with WPP + multi-col correctly ++ sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag && ++ s->ps.pps->num_tile_columns == 1; ++ } ++ } ++ } ++ ++ if (s->ps.pps->slice_header_extension_present_flag) { ++ unsigned int length = get_ue_golomb_long(gb); ++ if (length*8LL > get_bits_left(gb)) { ++ av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ for (i = 0; i < length; i++) ++ skip_bits(gb, 8); // slice_header_extension_data_byte ++ } ++ ++ // Inferred parameters ++ sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta; ++ if (sh->slice_qp > 51 || ++ sh->slice_qp < -s->ps.sps->qp_bd_offset) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "The slice_qp %d is outside the valid range " ++ "[%d, 51].\n", ++ sh->slice_qp, ++ -s->ps.sps->qp_bd_offset); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (get_bits_left(gb) < 0) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Overread 
slice header by %d bits\n", -get_bits_left(gb)); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ s->slice_initialized = 1; ++ return 0; ++} ++ ++static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry) ++{ ++ RpiSAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width; ++ int c_idx, i; ++ ++ if (s->sh.slice_sample_adaptive_offset_flag[0] || ++ s->sh.slice_sample_adaptive_offset_flag[1]) { ++ if ((lc->ctb_avail & AVAIL_L) != 0) ++ { ++ const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); ++ if (sao_merge_left_flag) { ++ *sao = sao[-1]; ++ return; ++ } ++ } ++ if ((lc->ctb_avail & AVAIL_U) != 0) ++ { ++ const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc); ++ if (sao_merge_up_flag) { ++ *sao = sao[-(int)s->ps.sps->ctb_width]; ++ return; ++ } ++ } ++ } ++ ++ for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) { ++ const unsigned int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma : ++ s->ps.pps->log2_sao_offset_scale_chroma; ++ int offset_abs[4]; ++ char offset_sign[4] = {0}; ++ ++ if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) { ++ sao->type_idx[c_idx] = SAO_NOT_APPLIED; ++ continue; ++ } ++ ++ if (c_idx == 2) { ++ sao->type_idx[2] = sao->type_idx[1]; ++ sao->eo_class[2] = sao->eo_class[1]; ++ } else { ++ sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc); ++ } ++ ++ // ** Could use BY22 here quite plausibly - this is all bypass stuff ++ // though only per CTB so not very timing critical ++ ++ if (sao->type_idx[c_idx] == SAO_NOT_APPLIED) ++ continue; ++ ++ for (i = 0; i < 4; i++) ++ offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc); ++ ++ if (sao->type_idx[c_idx] == SAO_BAND) { ++ for (i = 0; i < 4; i++) { ++ if (offset_abs[i] != 0) ++ offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc); ++ } ++ sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc); ++ } else if (c_idx != 2) { ++ sao->eo_class[c_idx] = 
ff_hevc_rpi_sao_eo_class_decode(lc); ++ } ++ ++ // Inferred parameters ++ sao->offset_val[c_idx][0] = 0; ++ for (i = 0; i < 4; i++) { ++ sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale; ++ if (sao->type_idx[c_idx] == SAO_EDGE) { ++ if (i > 1) ++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; ++ } else if (offset_sign[i]) { ++ sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1]; ++ } ++ } ++ } ++} ++ ++#if 0 ++static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) { ++ int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx); // 0..4 ++ ++ if (log2_res_scale_abs_plus1 != 0) { ++ int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx); ++ lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) * ++ (1 - 2 * res_scale_sign_flag); ++ } else { ++ lc->tu.res_scale_val = 0; ++ } ++ ++ ++ return 0; ++} ++#endif ++ ++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb) ++{ ++ return jb->intra.cmds + jb->intra.n++; ++} ++ ++#define A0(x, y, U, L, UL, UR, DL) \ ++ [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? 
AVAIL_DL : 0)) ++ ++#define A1(x, y, U, L, UL, UR, DL) \ ++ A0((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A0((x) + 1, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A0((x) + 0, (y) + 1, 1, (L), (L), 1, (DL)), A0((x) + 1, (y) + 1, 1, 1, 1, 0, 0 ) ++ ++#define A2(x, y, U, L, UL, UR, DL) \ ++ A1((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A1((x) + 2, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A1((x) + 0, (y) + 2, 1, (L), (L), 1, (DL)), A1((x) + 2, (y) + 2, 1, 1, 1, 0, 0 ) ++ ++#define A3(x, y, U, L, UL, UR, DL) \ ++ A2((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A2((x) + 4, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A2((x) + 0, (y) + 4, 1, (L), (L), 1, (DL)), A2((x) + 4, (y) + 4, 1, 1, 1, 0, 0 ) ++ ++#define A4(x, y, U, L, UL, UR, DL) \ ++ A3((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A3((x) + 8, (y) + 0, (U), 1, (U), (UR), 0 ),\ ++ A3((x) + 0, (y) + 8, 1, (L), (L), 1, (DL)), A3((x) + 8, (y) + 8, 1, 1, 1, 0, 0 ) ++ ++static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)}; ++ ++unsigned int ff_hevc_rpi_tb_avail_flags( ++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h) ++{ ++ const unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size; ++ const unsigned int tb_x = x & ~ctb_mask; ++ const unsigned int tb_y = y & ~ctb_mask; ++ const unsigned int ctb_avail = lc->ctb_avail; ++ ++ const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16; ++ ++ unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL); ++ ++ // This deals with both the U & L edges ++ if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0) ++ f |= AVAIL_UL; ++ ++ if (x + w < lc->end_of_ctb_x) ++ f |= (tb_y == 0 ? 
ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR; ++ else if (tb_y == 0) ++ f |= (ctb_avail & AVAIL_UR); ++#if AVAIL_S_U - AVAIL_S_UR < 0 ++#error Shift problem ++#endif ++ ++ // Never any D if Y beyond eoctb ++ if (y + h < lc->end_of_ctb_y) ++ f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL; ++#if AVAIL_S_DL - AVAIL_S_L < 0 ++#error Shift problem ++#endif ++ ++// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h, ++// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16], ++// lc->end_of_ctb_x, lc->end_of_ctb_y); ++ ++ return f; ++} ++ ++#undef A0 ++#undef A1 ++#undef A2 ++#undef A3 ++#undef A4 ++ ++static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx, ++ unsigned int avail) ++{ ++ // If rpi_enabled then sand - U & V done on U call ++ if (c_idx <= 1) ++ { ++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); ++ cmd->type = RPI_PRED_INTRA + c_idx; ++ cmd->size = log2_trafo_size; ++ cmd->avail = avail; ++ cmd->i_pred.x = x0; ++ cmd->i_pred.y = y0; ++ cmd->i_pred.mode = c_idx ? 
lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode; ++ ++// printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail); ++ } ++} ++ ++#define CBF_CB0_S 0 ++#define CBF_CB1_S 1 // CB1 must be CB0 + 1 ++#define CBF_CR0_S 2 ++#define CBF_CR1_S 3 ++ ++#define CBF_CB0 (1 << CBF_CB0_S) ++#define CBF_CR0 (1 << CBF_CR0_S) ++#define CBF_CB1 (1 << CBF_CB1_S) ++#define CBF_CR1 (1 << CBF_CR1_S) ++ ++// * Only good for chroma_idx == 1 ++static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_cb_size, const unsigned int log2_trafo_size, ++ const unsigned int blk_idx, const int cbf_luma, ++ const unsigned int cbf_chroma) ++{ ++ const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1); ++ const unsigned int x0_c = x0 & ~7; ++ const unsigned int y0_c = y0 & ~7; ++ ++ enum ScanType scan_idx = SCAN_DIAG; ++ enum ScanType scan_idx_c = SCAN_DIAG; ++ ++ if (lc->cu.pred_mode == MODE_INTRA) ++ { ++ const unsigned int trafo_size = 1 << log2_trafo_size; ++ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size); ++ ++ do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail); ++ ++ if (log2_trafo_size > 2) ++ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail); ++ else if (blk_idx == 3) ++ do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, ++ ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8)); ++ ++ if (log2_trafo_size < 4) { ++ if (lc->tu.intra_pred_mode >= 6 && ++ lc->tu.intra_pred_mode <= 14) { ++ scan_idx = SCAN_VERT; ++ } else if (lc->tu.intra_pred_mode >= 22 && ++ lc->tu.intra_pred_mode <= 30) { ++ scan_idx = SCAN_HORIZ; ++ } ++ ++ if (lc->tu.intra_pred_mode_c >= 6 && ++ lc->tu.intra_pred_mode_c <= 14) { ++ scan_idx_c = SCAN_VERT; ++ } else if (lc->tu.intra_pred_mode_c >= 22 && ++ lc->tu.intra_pred_mode_c <= 30) { ++ scan_idx_c = SCAN_HORIZ; ++ } ++ } ++ } ++ ++ if (!cbf_luma && 
cbf_chroma == 0) ++ return 0; ++ ++ if (lc->tu.is_cu_qp_delta_wanted) ++ { ++ const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc); ++ const unsigned int cb_mask = ~0U << log2_cb_size; ++ ++ if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) || ++ qp_delta > (25 + (s->ps.sps->qp_bd_offset >> 1))) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "The cu_qp_delta %d is outside the valid range " ++ "[%d, %d].\n", ++ qp_delta, ++ -(26 + (s->ps.sps->qp_bd_offset >> 1)), ++ (25 + (s->ps.sps->qp_bd_offset >> 1))); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ lc->tu.is_cu_qp_delta_wanted = 0; ++ lc->tu.cu_qp_delta = qp_delta; ++ ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask); ++ } ++ ++ // * Not main profile & untested due to no conform streams ++ if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma && ++ !lc->cu.cu_transquant_bypass_flag) { ++ int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc); ++ if (cu_chroma_qp_offset_flag) { ++ int cu_chroma_qp_offset_idx = 0; ++ if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) { ++ cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc); ++ } ++ lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx]; ++ lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx]; ++ } ++ lc->tu.cu_chroma_qp_offset_wanted = 0; ++ } ++ ++ if (cbf_luma) ++ ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0); ++ ++ if (log2_trafo_size > 2 || blk_idx == 3) ++ { ++ if ((cbf_chroma & CBF_CB0) != 0) ++ ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, ++ log2_trafo_size_c, scan_idx_c, 1); ++ if ((cbf_chroma & CBF_CR0) != 0) ++ ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c, ++ log2_trafo_size_c, scan_idx_c, 2); ++ } ++ ++ return 0; ++} ++ ++static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size) ++{ ++ set_bits(s->is_pcm + (y0 >> 3) * s->ps.sps->pcm_width, x0 >> 3, s->ps.sps->pcm_width, 
log2_cb_size - 3); ++} ++ ++ ++static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_trafo_size, ++ const unsigned int trafo_depth, const unsigned int blk_idx, ++ const unsigned int cbf_c0) ++{ ++ // When trafo_size == 2 hls_transform_unit uses c0 so put in c1 ++ unsigned int cbf_c1 = cbf_c0; ++ int split_transform_flag; ++ int ret; ++ ++ if (lc->cu.intra_split_flag) { ++ if (trafo_depth == 1) { ++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx]; ++ if (ctx_cfmt(s) == 3) { ++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx]; ++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx]; ++ } else { ++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; ++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; ++ } ++ } ++ } else { ++ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0]; ++ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0]; ++ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0]; ++ } ++ ++ if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size && ++ log2_trafo_size > s->ps.sps->log2_min_tb_size && ++ trafo_depth < lc->cu.max_trafo_depth && ++ !(lc->cu.intra_split_flag && trafo_depth == 0)) ++ { ++ split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size); ++ } else { ++ int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 && ++ lc->cu.pred_mode == MODE_INTER && ++ lc->cu.part_mode != PART_2Nx2N && ++ trafo_depth == 0; ++ ++ split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size || ++ (lc->cu.intra_split_flag && trafo_depth == 0) || ++ inter_split; ++ } ++ ++ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3) ++ { ++ const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3); ++ cbf_c1 = 0; ++ ++ if ((cbf_c0 & CBF_CB0) != 0) ++ { ++ cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S; ++ if (wants_c1) ++ cbf_c1 |= 
ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S; ++ } ++ ++ if ((cbf_c0 & CBF_CR0) != 0) ++ { ++ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR0_S; ++ if (wants_c1) ++ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S; ++ } ++ } ++ ++ if (split_transform_flag) { ++ const int trafo_size_split = 1 << (log2_trafo_size - 1); ++ const int x1 = x0 + trafo_size_split; ++ const int y1 = y0 + trafo_size_split; ++ ++#define SUBDIVIDE(x, y, idx) \ ++do { \ ++ ret = hls_transform_tree(s, lc, x, y, \ ++ log2_trafo_size - 1, trafo_depth + 1, idx, \ ++ cbf_c1); \ ++ if (ret < 0) \ ++ return ret; \ ++} while (0) ++ ++ SUBDIVIDE(x0, y0, 0); ++ SUBDIVIDE(x1, y0, 1); ++ SUBDIVIDE(x0, y1, 2); ++ SUBDIVIDE(x1, y1, 3); ++ ++#undef SUBDIVIDE ++ } else { ++ // If trafo_size == 2 then we should have cbf_c == 0 here but as we can't have ++ // trafo_size == 2 with depth == 0 the issue is moot ++ const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) || ++ ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth)); ++ ++ ret = hls_transform_unit(s, lc, x0, y0, ++ log2_trafo_size + trafo_depth, log2_trafo_size, ++ blk_idx, cbf_luma, cbf_c1); ++ if (ret < 0) ++ return ret; ++ ++ if (!s->sh.disable_deblocking_filter_flag) { ++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma); ++ } ++ } ++ return 0; ++} ++ ++ ++static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size) ++{ ++ GetBitContext gb; ++ int ret; ++ ++ ret = init_get_bits(&gb, pcm, length); ++ if (ret < 0) ++ return ret; ++ ++ s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0), ++ frame_stride1(s->frame, 0), ++ cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth); ++ ++ s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)), ++ s->frame->linesize[1], ++ cb_size >> ctx_hshift(s, 1), ++ cb_size >> 
ctx_vshift(s, 1), ++ &gb, s->ps.sps->pcm.bit_depth_chroma); ++ ++ return 0; ++} ++ ++ ++// x * 2^(y*2) ++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y) ++{ ++ return x << (y * 2); ++} ++ ++static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size) ++{ ++ // Length in bits ++ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) + ++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) + ++ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2)); ++ ++ const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, (length + 7) >> 3); ++ ++ if (!s->sh.disable_deblocking_filter_flag) ++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); ++ ++ // Copy coeffs ++ { ++ const int blen = (length + 7) >> 3; ++ // Round allocated bytes up to nearest 32 to avoid alignment confusion ++ // Allocation is in int16_t s ++ // As we are only using 1 byte per sample and the coeff buffer allows 2 per ++ // sample this rounding doesn't affect the total size we need to allocate for ++ // the coeff buffer ++ int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1); ++ memcpy(coeffs, pcm, blen); ++ ++ // Our coeff stash assumes that any partially allocated 64byte lump ++ // is zeroed so make that true. 
++ { ++ uint8_t * const eopcm = (uint8_t *)coeffs + blen; ++ if ((-(intptr_t)eopcm & 63) != 0) ++ memset(eopcm, 0, -(intptr_t)eopcm & 63); ++ } ++ ++ // Add command ++ { ++ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0); ++ cmd->type = RPI_PRED_I_PCM; ++ cmd->size = log2_cb_size; ++ cmd->i_pcm.src = coeffs; ++ cmd->i_pcm.x = x0; ++ cmd->i_pcm.y = y0; ++ cmd->i_pcm.src_len = length; ++ } ++ return 0; ++ } ++} ++ ++ ++static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref, ++ const MvXY xy, const int y0, const int height) ++{ ++ if (s->threads_type != 0) { ++ const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9); ++ ++ // Progress has to be attached to current job as the actual wait ++ // is in worker_core which can't use lc ++ int16_t *const pr = lc->jb0->progress_req + ref->dpb_no; ++ if (*pr < y) { ++ *pr = y; ++ } ++ } ++} ++ ++static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int x0, const int y0, const int nPbW, ++ const int nPbH, ++ HEVCRpiMvField * const mv) ++{ ++ enum InterPredIdc inter_pred_idc = PRED_L0; ++ int mvp_flag; ++ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH); ++ ++ mv->pred_flag = 0; ++ if (s->sh.slice_type == HEVC_SLICE_B) ++ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH); ++ ++ if (inter_pred_idc != PRED_L1) { ++ MvXY mvd; ++ ++ if (s->sh.nb_refs[L0]) ++ mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]); ++ ++ mv->pred_flag = PF_L0; ++ mvd = ff_hevc_rpi_hls_mvd_coding(lc); ++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, ++ mv, mvp_flag, 0); ++ mv->xy[0] = mvxy_add(mv->xy[0], mvd); ++ } ++ ++ if (inter_pred_idc != PRED_L0) { ++ MvXY mvd = 0; ++ ++ if (s->sh.nb_refs[L1]) ++ mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]); ++ ++ if 
(s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI) ++ mvd = ff_hevc_rpi_hls_mvd_coding(lc); ++ ++ mv->pred_flag += PF_L1; ++ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc); ++ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail, ++ mv, mvp_flag, 1); ++ mv->xy[1] = mvxy_add(mv->xy[1], mvd); ++ } ++} ++ ++ ++static HEVCRpiInterPredQ * ++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn) ++{ ++ HEVCRpiInterPredQ * yp = NULL; ++ HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr; ++ const unsigned int max_fill = ipe->max_fill; ++ unsigned int load = UINT_MAX; ++ ++ for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) { ++ // We will always have enough room between the Qs but if we are ++ // running critically low due to poor scheduling then use fill size ++ // rather than load to determine QPU. This has obvious dire ++ // performance implications but (a) it is better than crashing ++ // and (b) it should (almost) never happen ++ const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base; ++ const unsigned int tload = tfill > max_fill ? 
tfill + 0x1000000 : ypt->load; ++ ++ if (tload < load) ++ { ++ yp = ypt; ++ load = tload; ++ } ++ } ++ ++ yp->load += load_val; ++ ipe->used_grp = 1; ++ qpu_mc_link_set(yp->qpu_mc_curr, fn); ++ ++ return yp; ++} ++ ++ ++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe) ++{ ++ for (unsigned int i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base; ++ ++ qpu_mc_link_set(q->qpu_mc_curr, q->code_sync); ++ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(&q->qpu_mc_curr->sync + 1); ++ q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage ++ } ++} ++ ++// Returns 0 on success ++// We no longer check for Q fullness as wew have emergncy code in ctu alloc ++// * However it might be an idea to have some means of spotting that we've used it ++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe) ++{ ++ if (!ipe->used_grp) ++ return 0; ++ ++ if ((ipe->curr += ipe->n_grp) >= ipe->n) ++ { ++ ipe->curr = 0; ++ rpi_inter_pred_sync(ipe); ++ } ++ ipe->used = 1; ++ ipe->used_grp = 0; ++ ++ return 0; ++} ++ ++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ ++ ipe->curr = 0; ++ ipe->used = 0; ++ ipe->used_grp = 0; ++ for (i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const q = ipe->q + i; ++ q->qpu_mc_curr = q->qpu_mc_base; ++ q->load = 0; ++ q->last_l0 = NULL; ++ q->last_l1 = NULL; ++ } ++} ++ ++static int rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe, ++ const unsigned int n_max, const unsigned int n_grp, ++ const unsigned int total_size, const unsigned int min_gap) ++{ ++ int rv; ++ ++ memset(ipe, 0, sizeof(*ipe)); ++ if ((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) == NULL) ++ return AVERROR(ENOMEM); ++ ++ ipe->n_grp = n_grp; ++ ipe->min_gap = min_gap; ++ ++ if ((rv = gpu_malloc_cached(total_size, &ipe->gptr)) != 0) ++ av_freep(&ipe->q); ++ return rv; ++} ++ ++ ++#if 
RPI_QPU_EMU_Y ++#define get_mc_address_y(f) ((f)->data[0]) ++#else ++#define get_mc_address_y(f) get_vc_address_y(f) ++#endif ++#if RPI_QPU_EMU_C ++#define get_mc_address_u(f) ((f)->data[1]) ++#else ++#define get_mc_address_u(f) get_vc_address_u(f) ++#endif ++ ++static inline uint32_t pack_wo_p(const int off, const int mul) ++{ ++ return PACK2(off * 2 + 1, mul); ++} ++ ++static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul) ++{ ++ return PACK2(off0 + off1 + 1, mul); ++} ++ ++ ++static void ++rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb, ++ const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const MvXY mv_xy, ++ const int weight_mul, ++ const int weight_offset, ++ AVFrame *const src_frame) ++{ ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); ++ const unsigned int mx = MV_X(mv_xy) & 3; ++ const unsigned int my = MV_Y(mv_xy) & 3; ++ const unsigned int my_mx = (my << 8) | mx; ++ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx; ++ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame); ++ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off; ++ const uint32_t wo = pack_wo_p(weight_offset, weight_mul); ++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); ++ ++ if (my_mx == 0) ++ { ++ const int x1 = x0 + (MV_X(mv_xy) >> 2); ++ const int y1 = y0 + (MV_Y(mv_xy) >> 2); ++ const int bh = nPbH; ++ ++ for (int start_x = 0; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00; ++ ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; ++ ++ts->y_pred1_x0y0; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ ++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ 
++ts->y_pred1_hle16; ++ } ++#endif ++ ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src_vc_address_y; ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->wo1 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++ else ++ { ++ const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3; ++ const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3; ++ const unsigned int bh = nPbH; ++ int start_x = 0; ++ ++#if 1 ++ // As Y-pred operates on two independant 8-wide src blocks we can merge ++ // this pred with the previous one if it the previous one is 8 pel wide, ++ // the same height as the current block, immediately to the left of our ++ // current dest block and mono-pred. ++ ++ qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p; ++ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr) ++ { ++ const int bw = FFMIN(nPbW, 8); ++ qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1; ++ ++ last_y8_src2->x = x1_m3; ++ last_y8_src2->y = y1_m3; ++ last_y8_src2->base = src_vc_address_y; ++ last_y8_p->w += bw; ++ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21); ++ last_y8_p->wo2 = wo; ++ ++ jb->last_y8_p = NULL; ++ jb->last_y8_l1 = NULL; ++ start_x = bw; ++#if RPI_TSTATS ++ ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge; ++#endif ++ } ++#endif ++ ++ for (; start_x < nPbW; start_x += 16) ++ { ++ const int bw = FFMIN(nPbW - start_x, 16); ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; ++ if (mx == 0 && my == 0) ++ ++ts->y_pred1_x0y0; ++ else if (mx == 0) ++ ++ts->y_pred1_x0; ++ else if (my == 0) ++ ++ts->y_pred1_y0; ++ else ++ ++ts->y_pred1_xy; ++ ++ if (nPbW > 8) ++ ++ts->y_pred1_wgt8; ++ else ++ 
++ts->y_pred1_wle8; ++ ++ if (nPbH > 16) ++ ++ts->y_pred1_hgt16; ++ else ++ ++ts->y_pred1_hle16; ++ } ++#endif ++ src1->x = x1_m3 + start_x; ++ src1->y = y1_m3; ++ src1->base = src_vc_address_y; ++ if (bw <= 8) ++ { ++ src2->x = MC_DUMMY_X; ++ src2->y = MC_DUMMY_Y; ++#if RPI_QPU_EMU_Y ++ src2->base = s->qpu_dummy_frame_emu; ++#else ++ src2->base = s->qpu_dummy_frame_qpu; ++#endif ++ } ++ else ++ { ++ src2->x = x1_m3 + start_x + 8; ++ src2->y = y1_m3; ++ src2->base = src_vc_address_y; ++ } ++ cmd_y->w = bw; ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo; ++ cmd_y->wo2 = wo; ++ cmd_y->dst_addr = dst_addr + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ ++ if (bw == 8) { ++ jb->last_y8_l1 = src2; ++ jb->last_y8_p = cmd_y; ++ } ++ } ++ } ++} ++ ++static void ++rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const int x0, const int y0, ++ const int nPbW, const int nPbH, ++ const struct HEVCRpiMvField *const mv_field, ++ const AVFrame *const src_frame, ++ const AVFrame *const src_frame2) ++{ ++ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0); ++ const MvXY mv = mv_field->xy[0]; ++ const MvXY mv2 = mv_field->xy[1]; ++ ++ const unsigned int mx = MV_X(mv) & 3; ++ const unsigned int my = MV_Y(mv) & 3; ++ const unsigned int my_mx = (my<<8) | mx; ++ const unsigned int mx2 = MV_X(mv2) & 3; ++ const unsigned int my2 = MV_Y(mv2) & 3; ++ const unsigned int my2_mx2 = (my2<<8) | mx2; ++ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx; ++ const unsigned int ref_idx0 = mv_field->ref_idx[0]; ++ const unsigned int ref_idx1 = mv_field->ref_idx[1]; ++ const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]); ++ const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]); ++ ++ 
const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame); ++ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip; ++ ++ if (my2_mx2_my_mx == 0) ++ { ++ const int x1 = x0 + (MV_X(mv) >> 2); ++ const int y1 = y0 + (MV_Y(mv) >> 2); ++ const int x2 = x0 + (MV_X(mv2) >> 2); ++ const int y2 = y0 + (MV_Y(mv2) >> 2); ++ const int bh = nPbH; ++ ++ // Can do chunks a full 16 wide if we don't want the H filter ++ for (int start_x=0; start_x < nPbW; start_x += 16) ++ { ++ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; ++ ++ts->y_pred2_x0y0; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 16); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = 0; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++ else ++ { ++ // Filter requires a run-up of 3 ++ const int x1 = x0 + (MV_X(mv) >> 2) - 3; ++ const int y1 = y0 + (MV_Y(mv) >> 2) - 3; ++ const int x2 = x0 + (MV_X(mv2) >> 2) - 3; ++ const int y2 = y0 + (MV_Y(mv2) >> 2) - 3; ++ const int bh = nPbH; ++ ++ for (int start_x=0; start_x < nPbW; start_x += 8) ++ { // B blocks work 8 at a time ++ // B weights aren't doubled as the QPU code does the same ++ // amount of work as it does for P ++ HEVCRpiInterPredQ *const yp = 
rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx); ++ qpu_mc_src_t *const src1 = yp->last_l0; ++ qpu_mc_src_t *const src2 = yp->last_l1; ++ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p; ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats; ++ const unsigned int mmx = mx | mx2; ++ const unsigned int mmy = my | my2; ++ if (mmx == 0 && mmy == 0) ++ ++ts->y_pred2_x0y0; ++ else if (mmx == 0) ++ ++ts->y_pred2_x0; ++ else if (mmy == 0) ++ ++ts->y_pred2_y0; ++ else ++ ++ts->y_pred2_xy; ++ ++ if (nPbH > 16) ++ ++ts->y_pred2_hgt16; ++ else ++ ++ts->y_pred2_hle16; ++ } ++#endif ++ src1->x = x1 + start_x; ++ src1->y = y1; ++ src1->base = src1_base; ++ src2->x = x2 + start_x; ++ src2->y = y2; ++ src2->base = src2_base; ++ cmd_y->w = FFMIN(nPbW - start_x, 8); ++ cmd_y->h = bh; ++ cmd_y->mymx21 = my2_mx2_my_mx; ++ cmd_y->wo1 = wo1; ++ cmd_y->wo2 = wo2; ++ cmd_y->dst_addr = dst + (start_x << xshl); ++ yp->last_l0 = &cmd_y->next_src1; ++ yp->last_l1 = &cmd_y->next_src2; ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1); ++ } ++ } ++} ++ ++// h/v shifts fixed at one as that is all the qasm copes with ++static void ++rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const unsigned int lx, const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const MvXY mv, ++ const int16_t * const c_weights, ++ const int16_t * const c_offsets, ++ AVFrame * const src_frame) ++{ ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // = s->ps.sps->hshift[1]; ++ const int vshift = 1; // = s->ps.sps->vshift[1]; ++ ++ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; ++ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame); ++ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)]; ++ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - 
vshift)]; ++ const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]); ++ const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]); ++ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1; ++ ++ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH) ++ { ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn); ++ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p; ++ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1; ++ qpu_mc_src_t * const last_lx = *plast_lx; ++ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ ++ last_lx->x = x1_c + start_x; ++ last_lx->y = y1_c; ++ last_lx->base = src_base_u; ++ cmd_c->h = bh; ++ cmd_c->w = bw; ++ cmd_c->coeffs_x = x_coeffs; ++ cmd_c->coeffs_y = y_coeffs; ++ cmd_c->wo_u = wo_u; ++ cmd_c->wo_v = wo_v; ++ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl); ++ *plast_lx = &cmd_c->next_src; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1); ++ } ++ return; ++} ++ ++// h/v shifts fixed at one as that is all the qasm copes with ++static void ++rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const int x0_c, const int y0_c, ++ const int nPbW_c, const int nPbH_c, ++ const struct HEVCRpiMvField * const mv_field, ++ const int16_t * const c_weights, ++ const int16_t * const c_offsets, ++ const int16_t * const c_weights2, ++ const int16_t * const c_offsets2, ++ AVFrame * const src_frame, ++ AVFrame * const src_frame2) ++{ ++ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c); ++ const int hshift = 1; // s->ps.sps->hshift[1]; ++ const int vshift = 1; // s->ps.sps->vshift[1]; ++ const MvXY mv = mv_field->xy[0]; ++ const MvXY mv2 = mv_field->xy[1]; ++ ++ const unsigned int mx = 
av_mod_uintp2(MV_X(mv), 2 + hshift); ++ const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift); ++ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)]; ++ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector ++ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1; ++ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1; ++ ++ const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift); ++ const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift); ++ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)]; ++ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector ++ ++ const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1; ++ const int y2_c = y0_c + (MV_Y(mv2) >> (2 + hshift)) - 1; ++ ++ const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]); ++ const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]); ++ ++ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off; ++ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame); ++ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2); ++ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip; ++ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1; ++ const unsigned int bh = nPbH_c; ++ ++ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH) ++ { ++ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH); ++ ++ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx); ++ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b; ++ qpu_mc_src_t * const src_l0 = cp->last_l0; ++ qpu_mc_src_t * const src_l1 = cp->last_l1; ++ ++ src_l0->x = x1_c + start_x; ++ src_l0->y = y1_c; ++ src_l0->base = src1_base; ++ src_l1->x = x2_c + start_x; ++ src_l1->y = y2_c; ++ src_l1->base = src2_base; ++ ++ u[0].h = bh; ++ u[0].w = bw; ++ u[0].coeffs_x1 = coefs0_x; ++ u[0].coeffs_y1 = 
coefs0_y; ++ u[0].weight_u1 = c_weights[0]; // Weight L0 U ++ u[0].weight_v1 = c_weights[1]; // Weight L0 V ++ u[0].coeffs_x2 = coefs1_x; ++ u[0].coeffs_y2 = coefs1_y; ++ u[0].wo_u2 = wo_u2; ++ u[0].wo_v2 = wo_v2; ++ u[0].dst_addr_c = dst_base_u + (start_x << xshl); ++ ++ cp->last_l0 = &u[0].next_src1; ++ cp->last_l1 = &u[0].next_src2; ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); ++ } ++} ++ ++ ++static inline void ++col_stash(const HEVCRpiContext * const s, ++ const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0, ++ const HEVCRpiMvField * const mvf) ++{ ++ ColMvField * const col_mvf = s->ref->col_mvf; ++ const unsigned int x = (x0 + 15) >> 4; ++ const unsigned int y = (y0 + 15) >> 4; ++ const unsigned int w = ((x0 + 15 + w0) >> 4) - x; ++ const unsigned int h = ((y0 + 15 + h0) >> 4) - y; ++ ++ if (col_mvf != NULL && w != 0 && h != 0) ++ { ++ // Only record MV from the top left of the 16x16 block ++ ++ const RefPicList * const rpl = s->refPicList; ++ const ColMvField cmv = { ++ .L = { ++ { ++ .poc = (mvf->pred_flag & PF_L0) == 0 ? ++ COL_POC_INTRA : ++ COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]), ++ .xy = mvf->xy[0] ++ }, ++ { ++ .poc = (mvf->pred_flag & PF_L1) == 0 ? 
++ COL_POC_INTRA : ++ COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]), ++ .xy = mvf->xy[1] ++ } ++ } ++ }; ++ ++ ColMvField * p = col_mvf + y * s->col_mvf_stride + x; ++ const unsigned int stride = s->col_mvf_stride - w; ++ unsigned int j = h; ++ ++ do ++ { ++ unsigned int k = w; ++ do ++ { ++ *p++ = cmv; ++ } while (--k != 0); ++ p += stride; ++ } while (--j != 0); ++ } ++} ++ ++static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx) ++{ ++ HEVCRpiJob * const jb = lc->jb0; ++ ++ struct HEVCRpiMvField current_mv = {{0}}; ++ const RefPicList *const refPicList = s->refPicList; ++ const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL; ++ ++ if (lc->cu.pred_mode != MODE_SKIP) ++ lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc); ++ ++ if (lc->cu.pred_mode == MODE_SKIP || lc->pu.merge_flag) { ++ const unsigned int merge_idx = s->sh.max_num_merge_cand <= 1 ? 
0 : ++ ff_hevc_rpi_merge_idx_decode(s, lc); ++ ++ ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size, ++ partIdx, merge_idx, ¤t_mv); ++ } else { ++ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, ¤t_mv); ++ } ++ ++ { ++ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); ++ unsigned int i, j; ++ ++ for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++) ++ { ++ for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++) ++ p[i] = current_mv; ++ p += MVF_STASH_WIDTH_PU; ++ } ++ } ++ ++ col_stash(s, x0, y0, nPbW, nPbH, ¤t_mv); ++ ++ if (current_mv.pred_flag & PF_L0) { ++ ref0 = refPicList[0].ref[current_mv.ref_idx[0]]; ++ if (!ref0) ++ return; ++ hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH); ++ } ++ if (current_mv.pred_flag & PF_L1) { ++ ref1 = refPicList[1].ref[current_mv.ref_idx[1]]; ++ if (!ref1) ++ return; ++ hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH); ++ } ++ ++ if (current_mv.pred_flag == PF_L0) { ++ const int x0_c = x0 >> ctx_hshift(s, 1); ++ const int y0_c = y0 >> ctx_vshift(s, 1); ++ const int nPbW_c = nPbW >> ctx_hshift(s, 1); ++ const int nPbH_c = nPbH >> ctx_vshift(s, 1); ++ ++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0], ++ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); ++ ++ if (ctx_cfmt(s) != 0) { ++ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0], ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ ref0->frame); ++ return; ++ } ++ } else if (current_mv.pred_flag == PF_L1) { ++ const int x0_c = x0 >> ctx_hshift(s, 1); ++ const int y0_c = y0 >> ctx_vshift(s, 1); ++ const int nPbW_c = nPbW >> ctx_hshift(s, 1); ++ const int nPbH_c = nPbH >> ctx_vshift(s, 1); ++ ++ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1], ++ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); ++ ++ if (ctx_cfmt(s) != 0) { ++ rpi_pred_c(s, jb, 1, x0_c, 
y0_c, nPbW_c, nPbH_c, current_mv.xy[1], ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref1->frame); ++ return; ++ } ++ } else if (current_mv.pred_flag == PF_BI) { ++ const int x0_c = x0 >> ctx_hshift(s, 1); ++ const int y0_c = y0 >> ctx_vshift(s, 1); ++ const int nPbW_c = nPbW >> ctx_hshift(s, 1); ++ const int nPbH_c = nPbH >> ctx_vshift(s, 1); ++ ++ rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, ¤t_mv, ref0->frame, ref1->frame); ++ ++ if (ctx_cfmt(s) != 0) { ++ rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c, ++ ¤t_mv, ++ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_offset_l0[current_mv.ref_idx[0]], ++ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], ++ s->sh.chroma_offset_l1[current_mv.ref_idx[1]], ++ ref0->frame, ++ ref1->frame); ++ return; ++ } ++ } ++} ++ ++static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_cb_size, ++ const unsigned int ipm) ++{ ++ const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE; ++ const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE; ++ ++ { ++ const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE)); ++ set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm); ++ } ++ ++ // If IRAP then everything is Intra & we avoid ever looking at these ++ // stashes so don't bother setting them ++ if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA) ++ { ++ if (s->is_intra != NULL) ++ { ++ set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE); ++ } ++ ++ { ++ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0); ++ const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1 ++ unsigned int n = size_in_pus; ++ ++ do ++ { ++ memset(p, 0, size_in_pus * sizeof(*p)); ++ p += 
MVF_STASH_WIDTH_PU; ++ } while (--n != 0); ++ } ++ ++ ++ if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0) ++ { ++ // Only record top left stuff ++ // Blocks should always be alinged on size boundries ++ // so cannot have overflow from a small block ++ ++ ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4); ++ const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4)); ++ const unsigned int stride = s->col_mvf_stride - size_in_col; ++ unsigned int j = size_in_col; ++ ++ do ++ { ++ unsigned int k = size_in_col; ++ do ++ { ++ p->L[0].poc = COL_POC_INTRA; ++ p->L[0].xy = 0; ++ p->L[1].poc = COL_POC_INTRA; ++ p->L[1].xy = 0; ++ ++p; ++ } while (--k != 0); ++ p += stride; ++ } while (--j != 0); ++ } ++ } ++} ++ ++static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_cb_size) ++{ ++ set_ipm(s, lc, x0, y0, log2_cb_size, INTRA_DC); ++} ++ ++ ++/** ++ * 8.4.1 ++ */ ++static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ int x0, int y0, int log2_pu_size, ++ int prev_intra_luma_pred_flag, ++ const unsigned int idx) ++{ ++ const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size); ++ const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE; ++ const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE; ++ ++ // Up does not cross boundries so as we always scan 1 slice-tile-line in an ++ // lc we can just keep 1 CTB lR stashes ++ // Left is reset to DC @ Start of Line/Tile/Slice in fill_job ++ const unsigned int cand_up = yb_pu == 0 ? 
INTRA_DC : lc->ipm_up[xb_pu]; ++ const unsigned int cand_left = lc->ipm_left[yb_pu]; ++ ++ unsigned int intra_pred_mode; ++ unsigned int a, b, c; ++ ++ if (cand_left == cand_up) { ++ if (cand_left < 2) { ++ a = INTRA_PLANAR; ++ b = INTRA_DC; ++ c = INTRA_ANGULAR_26; ++ } else { ++ a = cand_left; ++ b = 2 + ((cand_left - 2 - 1 + 32) & 31); ++ c = 2 + ((cand_left - 2 + 1) & 31); ++ } ++ } else { ++ a = cand_left; ++ b = cand_up; ++ c = (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR) ? ++ INTRA_PLANAR : ++ (cand_left != INTRA_DC && cand_up != INTRA_DC) ? ++ INTRA_DC : ++ INTRA_ANGULAR_26; ++ } ++ ++ if (prev_intra_luma_pred_flag) { ++ intra_pred_mode = idx == 0 ? a : idx == 1 ? b : c; ++ } else { ++ // Sort lowest 1st ++ if (a > b) ++ FFSWAP(int, a, b); ++ if (a > c) ++ FFSWAP(int, a, c); ++ if (b > c) ++ FFSWAP(int, b, c); ++ ++ intra_pred_mode = idx; ++ if (intra_pred_mode >= a) ++ intra_pred_mode++; ++ if (intra_pred_mode >= b) ++ intra_pred_mode++; ++ if (intra_pred_mode >= c) ++ intra_pred_mode++; ++ } ++ ++ /* write the intra prediction units into the mv array */ ++ set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode); ++ return intra_pred_mode; ++} ++ ++static const uint8_t tab_mode_idx[] = { ++ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20, ++ 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31}; ++ ++static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_cb_size) ++{ ++ static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 }; ++ uint8_t prev_intra_luma_pred_flag[4]; ++ int split = lc->cu.part_mode == PART_NxN; ++ const unsigned int split_size = (1 << (log2_cb_size - 1)); ++ int chroma_mode; ++ const unsigned int n = split ? 
4 : 1; ++ unsigned int i; ++ ++ for (i = 0; i != n; i++) ++ prev_intra_luma_pred_flag[i] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc); ++ ++ for (i = 0; i < n; i++) { ++ // depending on mode idx is mpm or luma_pred_mode ++ const unsigned int idx = prev_intra_luma_pred_flag[i] ? ++ ff_hevc_rpi_mpm_idx_decode(lc) : ++ ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc); ++ ++ lc->pu.intra_pred_mode[i] = ++ luma_intra_pred_mode(s, lc, ++ x0 + ((i & 1) == 0 ? 0 : split_size), ++ y0 + ((i & 2) == 0 ? 0 : split_size), ++ log2_cb_size - split, ++ prev_intra_luma_pred_flag[i], idx); ++ } ++ ++ if (ctx_cfmt(s) == 3) { ++ for (i = 0; i < n; i++) { ++ lc->pu.chroma_mode_c[i] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); ++ if (chroma_mode != 4) { ++ if (lc->pu.intra_pred_mode[i] == intra_chroma_table[chroma_mode]) ++ lc->pu.intra_pred_mode_c[i] = 34; ++ else ++ lc->pu.intra_pred_mode_c[i] = intra_chroma_table[chroma_mode]; ++ } else { ++ lc->pu.intra_pred_mode_c[i] = lc->pu.intra_pred_mode[i]; ++ } ++ } ++ } else if (ctx_cfmt(s) == 2) { ++ int mode_idx; ++ lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); ++ if (chroma_mode != 4) { ++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) ++ mode_idx = 34; ++ else ++ mode_idx = intra_chroma_table[chroma_mode]; ++ } else { ++ mode_idx = lc->pu.intra_pred_mode[0]; ++ } ++ lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx]; ++ } else if (ctx_cfmt(s) != 0) { ++ chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc); ++ if (chroma_mode != 4) { ++ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode]) ++ lc->pu.intra_pred_mode_c[0] = 34; ++ else ++ lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode]; ++ } else { ++ lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0]; ++ } ++ } ++} ++ ++static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, const 
unsigned int log2_cb_size) ++{ ++ const unsigned int cb_size = 1 << log2_cb_size; ++ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size; ++ const unsigned int min_cb_width = s->ps.sps->min_cb_width; ++ const unsigned int x_cb = x0 >> log2_min_cb_size; ++ const unsigned int y_cb = y0 >> log2_min_cb_size; ++ const unsigned int idx = log2_cb_size - 2; ++ const unsigned int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1; ++ int skip_flag = 0; ++ ++ lc->cu.x = x0; ++ lc->cu.y = y0; ++ lc->cu.x_split = x0; ++ lc->cu.y_split = y0; ++ ++ lc->cu.pred_mode = MODE_INTRA; ++ lc->cu.part_mode = PART_2Nx2N; ++ lc->cu.intra_split_flag = 0; ++ lc->cu.cu_transquant_bypass_flag = 0; ++ lc->pu.intra_pred_mode[0] = 1; ++ lc->pu.intra_pred_mode[1] = 1; ++ lc->pu.intra_pred_mode[2] = 1; ++ lc->pu.intra_pred_mode[3] = 1; ++ ++ if (s->ps.pps->transquant_bypass_enable_flag) { ++ lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc); ++ if (lc->cu.cu_transquant_bypass_flag) ++ set_deblocking_bypass(s, x0, y0, log2_cb_size); ++ } ++ ++ if (s->sh.slice_type != HEVC_SLICE_I) { ++ lc->cu.pred_mode = MODE_INTER; ++ skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb); ++ } ++ ++ if (skip_flag) { ++ lc->cu.pred_mode = MODE_SKIP; ++ ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); ++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); ++ ++ if (!s->sh.disable_deblocking_filter_flag) ++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); ++ } else { ++ int pcm_flag = 0; ++ ++ if (s->sh.slice_type != HEVC_SLICE_I) ++ lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc); ++ if (lc->cu.pred_mode != MODE_INTRA || ++ log2_cb_size == s->ps.sps->log2_min_cb_size) { ++ lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size); ++ lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN && ++ lc->cu.pred_mode == MODE_INTRA; ++ } ++ ++ if (lc->cu.pred_mode == 
MODE_INTRA) { ++ if (lc->cu.part_mode == PART_2Nx2N && ++ log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size && // 0 if not enabled ++ log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size && ++ ff_hevc_rpi_pcm_flag_decode(lc) != 0) ++ { ++ int ret; ++ pcm_flag = 1; ++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); ++ if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0) ++ return ret; ++ ++ if (s->ps.sps->pcm.loop_filter_disable_flag) ++ set_deblocking_bypass(s, x0, y0, log2_cb_size); ++ } else { ++ intra_prediction_unit(s, lc, x0, y0, log2_cb_size); ++ } ++ } else { ++ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size); ++ switch (lc->cu.part_mode) { ++ case PART_2Nx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx); ++ break; ++ case PART_2NxN: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx); ++ lc->cu.y_split = y0 + cb_size / 2; ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx); ++ break; ++ case PART_Nx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1); ++ lc->cu.x_split = x0 + cb_size / 2; ++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1); ++ break; ++ case PART_2NxnU: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx); ++ lc->cu.y_split = y0 + cb_size / 4; ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx); ++ break; ++ case PART_2NxnD: ++ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx); ++ lc->cu.y_split = y0 + cb_size / 4 * 3; ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size / 4, log2_cb_size, 1, idx); ++ break; ++ case PART_nLx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2); ++ lc->cu.x_split = x0 + cb_size / 4; ++ hls_prediction_unit(s, lc, 
x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2); ++ break; ++ case PART_nRx2N: ++ hls_prediction_unit(s, lc, x0, y0, cb_size / 4 * 3, cb_size, log2_cb_size, 0, idx - 2); ++ lc->cu.x_split = x0 + cb_size / 4 * 3; ++ hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2); ++ break; ++ case PART_NxN: ++ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1); ++ lc->cu.x_split = x0 + cb_size / 2; ++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1); ++ lc->cu.y_split = y0 + cb_size / 2; ++ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1); ++ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1); ++ break; ++ } ++ } ++ ++ if (!pcm_flag) { ++ int rqt_root_cbf = 1; ++ ++ if (lc->cu.pred_mode != MODE_INTRA && ++ !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) { ++ rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc); ++ } ++ if (rqt_root_cbf) { ++ const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0); ++ int ret; ++ ++ lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ? 
++ s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag : ++ s->ps.sps->max_transform_hierarchy_depth_inter; ++ // transform_tree does deblock_boundary_strengths ++ ret = hls_transform_tree(s, lc, x0, y0, ++ log2_cb_size, 0, 0, cbf_c); ++ if (ret < 0) ++ return ret; ++ } else { ++ if (!s->sh.disable_deblocking_filter_flag) ++ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0); ++ } ++ } ++ } ++ ++ // If the delta is still wanted then we haven't read the delta & therefore need to set qp here ++ if (lc->tu.is_cu_qp_delta_wanted) ++ ff_hevc_rpi_set_qPy(s, lc, x0, y0); ++ ++ if(((x0 + (1<qPy_pred = lc->qp_y; ++ } ++ ++ set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff); ++ ++ set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag); ++ ++ return 0; ++} ++ ++// Returns: ++// < 0 Error ++// 0 More data wanted ++// 1 EoSlice / EoPicture ++static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, ++ const int log2_cb_size, const unsigned int cb_depth) ++{ ++ const int cb_size = 1 << log2_cb_size; ++ int ret; ++ int split_cu; ++ ++ lc->ct_depth = cb_depth; ++ split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size); ++ if (x0 + cb_size <= s->ps.sps->width && ++ y0 + cb_size <= s->ps.sps->height && ++ split_cu) ++ { ++ split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0); ++ } ++ ++ // Qp delta (and offset) need to remain wanted if cb_size < min until ++ // a coded block is found so we still initial state at depth 0 (outside ++ // this fn) and only reset here ++ if (s->ps.pps->cu_qp_delta_enabled_flag && ++ log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size) ++ { ++ lc->tu.is_cu_qp_delta_wanted = 1; ++ lc->tu.cu_qp_delta = 0; ++ } ++ if (s->sh.cu_chroma_qp_offset_enabled_flag && ++ log2_cb_size >= 
s->ps.pps->log2_min_cu_qp_delta_size) ++ { ++ lc->tu.cu_chroma_qp_offset_wanted = 1; ++ } ++ ++ lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0]; ++ lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset; ++ lc->tu.qp_divmod6[2] = s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset; ++ ++ if (split_cu) { ++ int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1; ++ const int cb_size_split = cb_size >> 1; ++ const int x1 = x0 + cb_size_split; ++ const int y1 = y0 + cb_size_split; ++ ++ int more_data = 0; ++ ++ more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1); ++ if (more_data < 0) ++ return more_data; ++ ++ if (more_data && x1 < s->ps.sps->width) { ++ more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1); ++ if (more_data < 0) ++ return more_data; ++ } ++ if (more_data && y1 < s->ps.sps->height) { ++ more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1); ++ if (more_data < 0) ++ return more_data; ++ } ++ if (more_data && x1 < s->ps.sps->width && ++ y1 < s->ps.sps->height) { ++ more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1); ++ if (more_data < 0) ++ return more_data; ++ } ++ ++ if(((x0 + (1<qPy_pred = lc->qp_y; ++ ++ if (more_data) ++ return ((x1 + cb_size_split) < s->ps.sps->width || ++ (y1 + cb_size_split) < s->ps.sps->height); ++ else ++ return 0; ++ } else { ++ ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size); ++ if (ret < 0) ++ return ret; ++ if ((!((x0 + cb_size) % ++ (1 << (s->ps.sps->log2_ctb_size))) || ++ (x0 + cb_size >= s->ps.sps->width)) && ++ (!((y0 + cb_size) % ++ (1 << (s->ps.sps->log2_ctb_size))) || ++ (y0 + cb_size >= s->ps.sps->height))) { ++ int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc); ++ return !end_of_slice_flag; ++ } else { ++ return 1; ++ } ++ } ++ ++ return 0; // NEVER ++} ++ ++static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const int 
x_ctb, const int y_ctb, const int ctb_addr_ts) ++{ ++ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size; ++ const unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice ++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; ++ const unsigned int line_w = s->ps.sps->ctb_width; ++ ++ s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr; ++ ++ lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width); ++ lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height); ++ ++ lc->boundary_flags = 0; ++ ++ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0) ++ lc->boundary_flags |= BOUNDARY_LEFT_TILE; ++ if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1]) ++ lc->boundary_flags |= BOUNDARY_LEFT_SLICE; ++ if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0) ++ lc->boundary_flags |= BOUNDARY_UPPER_TILE; ++ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w]) ++ lc->boundary_flags |= BOUNDARY_UPPER_SLICE; ++ ++ // Use line width rather than tile width for addr_in_slice test as ++ // addr_in_slice is in raster units ++ ++ lc->ctb_avail = ++ ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) | ++ ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) | ++ ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 && ++ (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) | ++ ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 && ++ (ctb_addr_rs_in_slice + 1 >= line_w) ? 
AVAIL_UR : 0); ++ // Down-left never avail at CTB level ++} ++ ++ ++static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ int y = ff_hevc_rpi_hls_filter_blk(s, jb->bounds, ++ (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0); ++ ++ // Signal ++ if (y > 0) { ++ // Cast away const as progress is held in s, but this really shouldn't confuse anything ++ ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1); ++ } ++ ++ // Job done now ++ // ? Move outside this fn ++ job_free(s->jbc, jb); ++} ++ ++// I-pred, transform_and_add for all blocks types done here ++// All ARM ++static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ unsigned int i; ++ HEVCRpiIntraPredEnv * const iap = &jb->intra; ++ const HEVCPredCmd *cmd = iap->cmds; ++ ++#if !RPI_WORKER_WAIT_PASS_0 ++ rpi_sem_wait(&jb->sem); ++ rpi_cache_flush_execute(jb->rfe); // Invalidate data set up in pass1 ++#endif ++ ++ for (i = iap->n; i > 0; i--, cmd++) ++ { ++ switch (cmd->type) ++ { ++ case RPI_PRED_INTRA: ++ s->hpc.intra_pred(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size); ++ break; ++ case RPI_PRED_INTRA_C: ++ s->hpc.intra_pred_c(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size); ++ break; ++ case RPI_PRED_ADD_RESIDUAL: ++ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_DC: ++ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_U: ++ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_V: ++ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc); ++ break; ++ case RPI_PRED_ADD_RESIDUAL_C: ++ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, 
cmd->ta.stride); ++ break; ++ case RPI_PRED_ADD_DC_U: ++ case RPI_PRED_ADD_DC_V: ++ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc); ++ break; ++ ++ case RPI_PRED_I_PCM: ++ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size); ++ break; ++ ++ default: ++ av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type); ++ abort(); ++ } ++ } ++ ++ // Mark done ++ iap->n = 0; ++} ++ ++ ++// Set initial uniform job values & zero ctu_count ++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first) ++{ ++ unsigned int i; ++ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip; ++ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip; ++ const HEVCRpiSPS * const sps = s->ps.sps; ++ ++ const uint16_t pic_width_y = sps->width; ++ const uint16_t pic_height_y = sps->height; ++ ++ const uint16_t pic_width_c = sps->width >> ctx_hshift(s, 1); ++ const uint16_t pic_height_c = sps->height >> ctx_vshift(s, 1); ++ ++ // We expect the pointer to change if we use another sps ++ if (sps != jb->sps) ++ { ++ worker_pic_free_one(jb); ++ ++ set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma); ++ set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma); ++ ++ { ++ const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH; ++ const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1)); ++ worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma); ++ } ++ ++ jb->sps = sps; ++ } ++ ++ jb->waited = 0; ++ jb->ctu_ts_first = ctu_ts_first; ++ jb->ctu_ts_last = -1; ++ ++ rpi_inter_pred_reset(cipe); ++ for (i = 0; i < cipe->n; i++) { ++ HEVCRpiInterPredQ * const cp = cipe->q + i; ++ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s; ++ ++ u->next_src1.x = 0; ++ u->next_src1.y = 0; ++ u->next_src1.base = 0; ++ u->pic_cw = pic_width_c; ++ u->pic_ch = pic_height_c; ++ u->stride2 = 
av_rpi_sand_frame_stride2(s->frame); ++ u->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ cp->last_l0 = &u->next_src1; ++ ++ u->next_fn = 0; ++ u->next_src2.x = 0; ++ u->next_src2.y = 0; ++ u->next_src2.base = 0; ++ cp->last_l1 = &u->next_src2; ++ ++ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1); ++ } ++ ++ rpi_inter_pred_reset(yipe); ++ for (i = 0; i < yipe->n; i++) { ++ HEVCRpiInterPredQ * const yp = yipe->q + i; ++ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s; ++ ++ y->next_src1.x = 0; ++ y->next_src1.y = 0; ++ y->next_src1.base = 0; ++ y->next_src2.x = 0; ++ y->next_src2.y = 0; ++ y->next_src2.base = 0; ++ y->pic_h = pic_height_y; ++ y->pic_w = pic_width_y; ++ y->stride2 = av_rpi_sand_frame_stride2(s->frame); ++ y->stride1 = av_rpi_sand_frame_stride1(s->frame); ++ y->next_fn = 0; ++ yp->last_l0 = &y->next_src1; ++ yp->last_l1 = &y->next_src2; ++ ++ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1); ++ } ++ ++ jb->last_y8_p = NULL; ++ jb->last_y8_l1 = NULL; ++ ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { ++ jb->progress_req[i] = -1; ++ } ++ ++ worker_pic_reset(&jb->coeffs); ++} ++ ++ ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS]; ++ unsigned int max_block = 0; ++ ++ if (!ipe->used) { ++ return 0; ++ } ++ ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base; ++ ++ if (block_size > max_block) ++ max_block = block_size; ++ ++ qpu_mc_link_set(yp->qpu_mc_curr, yp->code_exit); ++ ++ // Need to set the srcs for L0 & L1 to 
something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_qpu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_qpu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ ++ // Add to mailbox list ++ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm); ++ mail[i][1] = yp->code_setup; ++ } ++ ++ // We don't need invalidate here as the uniforms aren't changed by the QPU ++ // and leaving them in ARM cache avoids (pointless) pre-reads when writing ++ // new values which seems to give us a small performance advantage ++ // ++ // In most cases we will not have a completely packed set of uniforms and as ++ // we have a 2d invalidate we writeback all uniform Qs to the depth of the ++ // fullest ++ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK, ++ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block, ++ ipe->n, ipe->max_fill + ipe->min_gap); ++ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail); ++ ++ return 1; ++} ++#endif ++ ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s, ++ const vpu_qpu_job_h vqj, ++ rpi_cache_flush_env_t * const rfe, ++ HEVCRpiInterPredEnv * const ipe) ++{ ++ unsigned int i; ++ if (!ipe->used) { ++ return 0; ++ } ++ ++ if (ipe->curr != 0) { ++ rpi_inter_pred_sync(ipe); ++ } ++ ++ // Add final commands to Q ++ for(i = 0; i != ipe->n; ++i) { ++ HEVCRpiInterPredQ * const yp = ipe->q + i; ++ qpu_mc_src_t *const p0 = yp->last_l0; ++ qpu_mc_src_t *const p1 = yp->last_l1; ++ ++ yp->qpu_mc_curr->data[-1] = yp->code_exit; ++ ++ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched ++ p0->x = MC_DUMMY_X; ++ p0->y = MC_DUMMY_Y; ++ p0->base = s->qpu_dummy_frame_emu; ++ p1->x = MC_DUMMY_X; ++ p1->y = MC_DUMMY_Y; ++ p1->base = s->qpu_dummy_frame_emu; ++ ++ yp->last_l0 = NULL; ++ yp->last_l1 = NULL; ++ } ++ ++ return 
1; ++} ++#endif ++ ++ ++#if RPI_QPU_EMU_Y ++#define mc_terminate_add_y mc_terminate_add_emu ++#else ++#define mc_terminate_add_y mc_terminate_add_qpu ++#endif ++#if RPI_QPU_EMU_C ++#define mc_terminate_add_c mc_terminate_add_emu ++#else ++#define mc_terminate_add_c mc_terminate_add_qpu ++#endif ++ ++ ++static void flush_frame(HEVCRpiContext *s,AVFrame *frame) ++{ ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); ++ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE); ++ rpi_cache_flush_finish(rfe); ++} ++ ++static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first]; ++ const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last]; ++ const unsigned int ctb_width = s->ps.sps->ctb_width; ++ RpiBlk *const bounds = &jb->bounds; ++ av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last); ++ bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size; ++ bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size; ++ bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size; ++ bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size; ++ ++ bounds->w = FFMIN(bounds->w, s->ps.sps->width - bounds->x); ++ bounds->h = FFMIN(bounds->h, s->ps.sps->height - bounds->y); ++} ++ ++#if RPI_PASSES == 2 ++static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ // Perform intra prediction and residual reconstruction ++ rpi_execute_pred_cmds(s, jb); ++ ++ // Perform deblocking for CTBs in this row ++ rpi_execute_dblk_cmds(s, jb); ++} ++#endif ++ ++// Core execution tasks ++static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb) ++{ ++ int pred_y, pred_c; ++ vpu_qpu_job_env_t qvbuf; ++ const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf); ++#if RPI_WORKER_WAIT_PASS_0 ++ int do_wait; ++#endif ++ ++ { ++ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs; ++ if 
(cf->s[3].n + cf->s[2].n != 0) ++ { ++ const unsigned int csize = sizeof(cf->s[3].buf[0]); ++ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize; ++ unsigned int n16 = (cf->s[2].n >> 8); ++ unsigned int n32 = (cf->s[3].n >> 10); ++#if RPI_COMPRESS_COEFFS ++ if (cf->s[2].packed) { ++ n16 = n16 | (n16<<16); ++ } else { ++ const unsigned int npack16 = (cf->s[2].packed_n>>8); ++ n16 = n16 | (npack16<<16); ++ } ++ if (cf->s[3].packed) { ++ n32 = n32 | (n32<<16); ++ } else { ++ const unsigned int npack32 = (cf->s[3].packed_n>>10); ++ n32 = n32 | (npack32<<16); ++ } ++#endif ++ vpu_qpu_job_add_vpu(vqj, ++ vpu_get_fn(s->ps.sps->bit_depth), ++ vpu_get_constants(), ++ cf->gptr.vc, ++ n16, ++ cf->gptr.vc + offset32, ++ n32, ++ 0); ++ ++ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize); ++ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize); ++ } ++ } ++ ++ pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip); ++ ++// We could take a sync here and try to locally overlap QPU processing with ARM ++// but testing showed a slightly negative benefit with noticable extra complexity ++ ++ pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip); ++ ++ // Returns 0 if nothing to do, 1 if sync added ++#if RPI_WORKER_WAIT_PASS_0 ++ do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem); ++#else ++ if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0) ++ sem_post(&jb->sem); ++#endif ++ ++ rpi_cache_flush_execute(jb->rfe); ++ ++ // Await progress as required ++ // jb->waited will only be clear if we have already tested the progress values ++ // (in worker_submit_job) and found we don't have to wait ++ if (jb->waited) ++ { ++ unsigned int i; ++ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) { ++ if (jb->progress_req[i] >= 0) { ++ ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]); ++ } ++ } ++ } ++ ++ 
vpu_qpu_job_finish(vqj); ++ ++ // We always work on a rectangular block ++ if (pred_y || pred_c) ++ { ++ rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE, ++ jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h, ++ ctx_vshift(s, 1), pred_y, pred_c); ++ } ++ ++ // If we have emulated VPU ops - do it here ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ if (av_rpi_is_sand8_frame(s->frame)) ++ { ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL); ++#else ++ ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip); ++#endif ++ } ++ else ++ { ++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C ++ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip); ++#elif RPI_QPU_EMU_Y ++ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL); ++#else ++ ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip); ++#endif ++ } ++#endif ++ ++#if RPI_WORKER_WAIT_PASS_0 ++ if (do_wait) ++ rpi_sem_wait(&jb->sem); ++ rpi_cache_flush_execute(jb->rfe); ++#endif ++} ++ ++ ++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe) ++{ ++ av_freep(&ipe->q); ++ gpu_free(&ipe->gptr); ++} ++ ++static HEVCRpiJob * job_new(void) ++{ ++ HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob)); ++ ++ if (jb == NULL) ++ return NULL; ++ ++ sem_init(&jb->sem, 0, 0); ++ jb->rfe = rpi_cache_flush_init(&jb->flush_buf); ++ ff_hevc_rpi_progress_init_wait(&jb->progress_wait); ++ ++ jb->intra.n = 0; ++ if ((jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS)) == NULL) ++ goto fail1; ++ ++ // * Sizeof the union structure might be overkill but at the moment it ++ // is correct (it certainly isn't going to be too small) ++ // Set max fill to slack/2 from the end of the Q ++ // If we exceed this in any Q then we will schedule by size (which should ++ // mean that we never use that Q again part from syncs) ++ // * Given how agressive the overflow resonse is we could maybe put the ++ // threshold 
even nearer the end, but I don't expect us to ever hit ++ // it on any real stream anyway. ++ ++ if (rpi_inter_pred_alloc(&jb->chroma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t), ++ QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2) != 0) ++ goto fail2; ++ if (rpi_inter_pred_alloc(&jb->luma_ip, ++ QPU_N_MAX, QPU_N_GRP, ++ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t), ++ QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2) != 0) ++ goto fail3; ++ ++ return jb; ++ ++fail3: ++ rpi_free_inter_pred(&jb->luma_ip); ++fail2: ++ av_freep(&jb->intra.cmds); ++fail1: ++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); ++ rpi_cache_flush_finish(jb->rfe); ++ sem_destroy(&jb->sem); ++ return NULL; ++} ++ ++static void job_delete(HEVCRpiJob * const jb) ++{ ++ worker_pic_free_one(jb); ++ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait); ++ rpi_free_inter_pred(&jb->chroma_ip); ++ rpi_free_inter_pred(&jb->luma_ip); ++ av_freep(&jb->intra.cmds); ++ rpi_cache_flush_finish(jb->rfe); // Not really needed - should do nothing ++ sem_destroy(&jb->sem); ++ av_free(jb); ++} ++ ++static void jbg_delete(HEVCRpiJobGlobal * const jbg) ++{ ++ HEVCRpiJob * jb; ++ ++ if (jbg == NULL) ++ return; ++ ++ jb = jbg->free1; ++ while (jb != NULL) ++ { ++ HEVCRpiJob * const jb2 = jb; ++ jb = jb2->next; ++ job_delete(jb2); ++ } ++ ++ pthread_mutex_destroy(&jbg->lock); ++ av_free(jbg); ++} ++ ++static HEVCRpiJobGlobal * jbg_new(unsigned int job_count) ++{ ++ HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal)); ++ if (jbg == NULL) ++ return NULL; ++ ++ pthread_mutex_init(&jbg->lock, NULL); ++ ++ while (job_count-- != 0) ++ { ++ HEVCRpiJob * const jb = job_new(); ++ if (jb == NULL) ++ goto fail; ++ ++ jb->next = jbg->free1; ++ jbg->free1 = jb; ++ } ++ ++ return jbg; ++ ++fail: ++ jbg_delete(jbg); ++ return NULL; ++} ++ ++static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc) ++{ ++ 
HEVCRpiJobGlobal * jbg; ++ ++ if (jbc == NULL) ++ return; ++ ++ jbg = jbc->jbg; ++ ++ if (jbc->jb1 != NULL) ++ job_delete(jbc->jb1); ++ ++ pthread_mutex_destroy(&jbc->in_lock); ++ sem_destroy(&jbc->sem_out); ++ av_free(jbc); ++ ++ // Deref the global job context ++ if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1) ++ jbg_delete(jbg); ++} ++ ++static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg) ++{ ++ HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl)); ++ ++ if (jbc == NULL) ++ return NULL; ++ ++ jbc->jbg = jbg; ++ atomic_fetch_add(&jbg->ref_count, 1); ++ ++ sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS); ++ pthread_mutex_init(&jbc->in_lock, NULL); ++ ++ if ((jbc->jb1 = job_new()) == NULL) ++ goto fail; ++ jbc->jb1->jbc_local = jbc; ++ ++ return jbc; ++ ++fail: ++ rpi_job_ctl_delete(jbc); ++ return NULL; ++} ++ ++ ++ ++static av_cold void hevc_init_worker(HEVCRpiContext * const s) ++{ ++#if RPI_PASSES == 2 ++ pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1); ++#elif RPI_PASSES == 3 ++ pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2); ++ pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1); ++#else ++#error Passes confused ++#endif ++ pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0); ++ ++ pass_queues_start_all(s); ++} ++ ++static av_cold void hevc_exit_worker(HEVCRpiContext *s) ++{ ++ pass_queues_term_all(s); ++ ++ pass_queues_kill_all(s); ++ ++ rpi_job_ctl_delete(s->jbc); ++ s->jbc = NULL; ++} ++ ++ ++static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc) ++{ ++ const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; ++ const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns; ++ const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts]; ++ ++ // Check for obvious disasters ++ if (ctb_addr_ts == 0 && s->sh.dependent_slice_segment_flag) { ++ av_log(s->avctx, 
AV_LOG_ERROR, "Impossible initial tile.\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ // If dependant then ctb_addr_ts != 0 from previous check ++ if (s->sh.dependent_slice_segment_flag) { ++ int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1]; ++ if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) { ++ av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ if (!s->ps.pps->entropy_coding_sync_enabled_flag && ++ tile_id + s->sh.num_entry_point_offsets >= tiles) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ // Tiled stuff must start at start of tile if it has multiple entry points ++ if (!s->ps.pps->entropy_coding_sync_enabled_flag && ++ s->sh.num_entry_point_offsets != 0 && ++ ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id]) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ ff_hevc_rpi_cabac_init_decoder(lc); ++ ++ // Setup any required decode vars ++ lc->cabac_init_req = !s->sh.dependent_slice_segment_flag; ++ ++// printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot); ++ lc->qp_y = s->sh.slice_qp; ++ ++ // General setup ++ lc->bt_line_no = 0; ++ lc->ts = ctb_addr_ts; ++ return 0; ++} ++ ++static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal) ++{ ++ const GetBitContext * const gb = &s->HEVClc->gb; ++ RpiSliceHeader * const sh = &s->sh; ++ int i, j; ++ ++ const unsigned int length = nal->size; ++ unsigned int offset = ((gb->index) >> 3) + 1; // We have a bit & align still to come = +1 byte ++ unsigned int cmpt; ++ unsigned int startheader; ++ ++ if (sh->num_entry_point_offsets == 0) { ++ s->data = NULL; ++ return 0; ++ } ++ ++ // offset in slice header includes emulation prevention bytes. ++ // Unfortunately those have been removed by the time we get here so we ++ // have to compensate. 
The nal layer keeps a track of where they were. ++ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) { ++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { ++ startheader--; ++ cmpt++; ++ } ++ } ++ ++ for (i = 1; i < sh->num_entry_point_offsets; i++) { ++ offset += (sh->entry_point_offset[i - 1] - cmpt); ++ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) { ++ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) { ++ startheader--; ++ cmpt++; ++ } ++ } ++ if (sh->entry_point_offset[i] <= cmpt) { ++ av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ sh->size[i - 1] = sh->entry_point_offset[i] - cmpt; ++ sh->offset[i - 1] = offset; ++ } ++ ++ offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt; ++ if (length < offset) { ++ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ sh->size[sh->num_entry_point_offsets - 1] = length - offset; ++ sh->offset[sh->num_entry_point_offsets - 1] = offset; ++ ++ // Remember data start pointer as we won't have nal later ++ s->data = nal->data; ++ return 0; ++} ++ ++ ++// Return ++// < 0 Error ++// 0 OK ++// ++// jb->ctu_ts_last < 0 Job still filling ++// jb->ctu_ts_last >= 0 Job ready ++ ++static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks) ++{ ++ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size; ++ const unsigned int ctb_size = (1 << log2_ctb_size); ++ HEVCRpiJob * const jb = lc->jb0; ++ int more_data = 1; ++ unsigned int ctb_addr_ts = lc->ts; ++ unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size; ++ const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << 
log2_ctb_size; ++ ++ lc->unit_done = 0; ++ ++ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size) ++ { ++ int q_full; ++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts]; ++ ++ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts); ++ ++ ff_hevc_rpi_cabac_init(s, lc, ctb_flags); ++ ++ hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size); ++ ++ s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset; ++ s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset; ++ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag; ++ ++ // Zap stashes if navail ++ if ((lc->ctb_avail & AVAIL_U) == 0) ++ zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3); ++ if ((lc->ctb_avail & AVAIL_L) == 0) ++ { ++ memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE); ++ zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3); ++ } ++#if MVF_STASH_WIDTH > 64 ++ // Restore left mvf stash at start of tile if not at start of line ++ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap) ++ { ++ unsigned int i; ++ HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0); ++ const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); ++ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) ++ { ++ *dst = *src++; ++ dst += MVF_STASH_WIDTH_PU; ++ } ++ } ++#endif ++ ++ // Set initial tu states ++ lc->tu.cu_qp_delta = 0; ++ lc->tu.is_cu_qp_delta_wanted = 0; ++ lc->tu.cu_chroma_qp_offset_wanted = 0; ++ ++ // Decode ++ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0); ++ ++ if (ff_hevc_rpi_cabac_overflow(lc)) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n "); ++ more_data = AVERROR_INVALIDDATA; ++ } ++ ++ if (more_data < 0) { ++ s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN; // Mark slice as broken ++ return more_data; ++ } ++ ++ if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 || ++ (s->ps.pps->entropy_coding_sync_enabled_flag 
&& (ctb_flags & CTB_TS_FLAGS_EOTL) != 0))) ++ { ++ if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 || ++ ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL) ++ { ++ av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n "); ++ return -1; ++ } ++ } ++ ++ // --- Post CTB processing ++ ++ // Stash rpl top/left for deblock that needs to remember such things cross-slice ++ s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList; ++ s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList; ++ ++ if (!s->is_irap) ++ { ++ // Copy MVF up to up-left & stash to up ++ { ++ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1); ++ HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE); ++ ++ // printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst); ++ ++ lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE]; ++ memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE); ++ } ++ // Stash sideways if end of tile line but not end of line (no point) ++ // ** Could/should do this @ end of fn ++#if MVF_STASH_WIDTH > 64 ++ if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL) ++#endif ++ { ++ unsigned int i; ++ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0); ++ HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE); ++ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i) ++ { ++ *dst++ = *src; ++ src += MVF_STASH_WIDTH_PU; ++ } ++ } ++ } ++ ++ if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0) ++ ff_hevc_rpi_save_states(s, lc); ++ ++ // Report progress so we can use our MVs in other frames ++ if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0) ++ ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1); ++ ++ // End of line || End of tile line || End of tile ++ // (EoL covers end of frame for our purposes here) ++ q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0); ++ ++ // Allocate QPU chunks on fixed size 64 pel boundries rather than ++ // whatever ctb_size is today. 
++ // * We might quite like to continue to 64 pel vertical too but that ++ // currently confuses WPP ++ if (((x_ctb + ctb_size) & 63) == 0 || q_full) ++ { ++ int overflow = 0; ++ if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0) ++ overflow = 1; ++ if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0) ++ overflow = 1; ++ if (overflow) ++ { ++ // * This is very annoying (and slow) to cope with in WPP so ++ // we treat it as an error there (no known stream triggers this ++ // with the current buffer sizes). Non-wpp should cope fine. ++ av_log(s->avctx, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__); ++ q_full = 1; ++ } ++ } ++ ++ // Inc TS to next. ++ ctb_addr_ts++; ++ ctb_addr_rs++; ++ x_ctb += ctb_size; ++ ++ if (q_full) ++ { ++ // Do job ++ // Prep for submission ++ jb->ctu_ts_last = ctb_addr_ts - 1; // Was pre-inced ++ job_gen_bounds(s, jb); ++ break; ++ } ++ ++ // If max_blocks started as 0 then this will never be true ++ if (--max_blocks == 0) ++ break; ++ } ++ ++ lc->unit_done = (more_data <= 0); ++ lc->ts = ctb_addr_ts; ++ return 0; ++} ++ ++static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n) ++{ ++ lc->context = s; ++ lc->jb0 = NULL; ++ lc->lc_n = n; ++ lc->bt_terminate = 0; ++ lc->bt_psem_out = NULL; ++ sem_init(&lc->bt_sem_in, 0, 0); ++} ++ ++#define TRACE_WPP 0 ++#if RPI_EXTRA_BIT_THREADS > 0 ++static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts) ++{ ++ unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts]; ++ return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]]; ++} ++ ++// Move local context parameters from an aux bit thread back to the main ++// thread at the end of a slice as processing is going to continue there. 
++static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep) ++{ ++ if (src_lc == dst_lc) { ++ return; ++ } ++ ++ // Move the job ++ // We will still have an active job if the final line terminates early ++ // Dest should always be null by now ++ av_assert1(dst_lc->jb0 == NULL); ++ dst_lc->jb0 = src_lc->jb0; ++ src_lc->jb0 = NULL; ++ ++ // Always need to store where we are in the bitstream ++ dst_lc->ts = src_lc->ts; ++ dst_lc->gb = src_lc->gb; ++ // Cabac init request will be built at start of next slice ++ ++ // Need to store context if we might have a dependent seg ++ if (is_dep) ++ { ++ dst_lc->qPy_pred = src_lc->qPy_pred; ++ memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left)); ++ memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state)); ++ memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff)); ++ } ++} ++ ++static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc) ++{ ++ rpi_sem_wait(&lc->bt_sem_in); ++ return lc->bt_terminate; ++} ++ ++// Do one WPP line ++// Will not work correctly over horizontal tile boundries - vertical should be OK ++static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first) ++{ ++ const int is_tile = lc->bt_is_tile; ++ const unsigned int tile_id = s->ps.pps->tile_id[lc->ts]; ++ const unsigned int line = lc->bt_line_no; ++ const unsigned int line_inc = lc->bt_line_inc; ++ const int is_last = (line >= lc->bt_last_line); ++ ++ const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width); ++ const unsigned int ts_next = ++ line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ? ++ INT_MAX : ++ is_tile ? ++ s->ps.pps->tile_pos_ts[tile_id + line_inc] : ++ lc->ts + lc->bt_line_width * line_inc; ++ // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work) ++ const unsigned int partial_size = is_tile ? 
line_ts_width(s, lc->ts) : 2; ++ unsigned int ts_prev; ++ int loop_n = 0; ++ int err = 0; ++ ++ av_assert1(line <= s->sh.num_entry_point_offsets); ++ ++#if TRACE_WPP ++ printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__, ++ lc->lc_n, is_tile ? "Tile" : "WPP", tile_id, ++ line, lc->bt_last_line, s->sh.num_entry_point_offsets, ++ lc->ts, ts_eol, ts_next, partial_size, lc->jb0); ++#endif ++ if (line != 0) ++ { ++ const uint8_t * const data = s->data + s->sh.offset[line - 1]; ++ const unsigned int len = s->sh.size[line - 1]; ++ if ((err = init_get_bits8(&lc->gb, data, len)) < 0) ++ return err; ++ ++ ff_init_cabac_decoder(&lc->cc, data, len); ++ } ++ ++ // We should never be processing a dependent slice here so reset is good ++ // ?? These probably shouldn't be needed (as they should be set by later ++ // logic) but do seem to be required ++ lc->qp_y = s->sh.slice_qp; ++ ++ do ++ { ++ if (!is_last && loop_n > 1) { ++#if TRACE_WPP ++ printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out); ++#endif ++ sem_post(lc->bt_psem_out); ++ } ++ // The wait for loop_n == 0 has been done in bit_thread ++ if (!is_first && loop_n != 0) ++ { ++#if TRACE_WPP ++ printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? 
"" : "ERR: ", &lc->bt_sem_in); ++#endif ++ if (wait_bt_sem_in(lc) != 0) ++ return AVERROR_EXIT; ++ } ++ ++#if TRACE_WPP ++ { ++ int n; ++ sem_getvalue(&lc->bt_sem_in, &n); ++ printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in); ++ } ++#endif ++ ++ ts_prev = lc->ts; ++ ++ // If we have had an error - do no further decode but do continue ++ // moving signals around so the other threads continue to operate ++ // correctly (or at least as correctly as they can with this line missing) ++ // ++ // Errors in WPP/Tile are less fatal than normal as we have a good idea ++ // of how to restart on the next line so there is no need to give up totally ++ if (err != 0) ++ { ++ lc->unit_done = 0; ++ lc->ts += partial_size; ++ } ++ else ++ { ++ worker_pass0_ready(s, lc); ++ ++ if ((err = fill_job(s, lc, partial_size)) < 0 || ++ (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done))) ++ { ++ if (err == 0) { ++ av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n"); ++ err = AVERROR_INVALIDDATA; ++ } ++ worker_free(s, lc); ++ lc->ts = ts_prev + partial_size; // Pretend we did all that ++ lc->unit_done = 0; ++ } ++ else if (is_tile) ++ { ++ worker_submit_job(s, lc); ++ } ++ } ++ ++ ++loop_n; ++ } while (lc->ts < ts_eol && !lc->unit_done); ++ ++ // If we are on the last line & we didn't get a whole line we must wait for ++ // and sink the sem_posts from the line above / tile to the left. 
++ while ((ts_prev += partial_size) < ts_eol) ++ { ++#if TRACE_WPP ++ printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in); ++#endif ++ if (wait_bt_sem_in(lc) != 0) ++ return AVERROR_EXIT; ++ } ++ ++ lc->bt_line_no += line_inc; ++ ++ if (!is_tile && err == 0) ++ worker_submit_job(s, lc); ++ ++ if (!is_last) { ++ lc->ts = ts_next; ++ ++#if TRACE_WPP ++ printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out); ++#endif ++ sem_post(lc->bt_psem_out); ++ if (loop_n > 1) { ++#if TRACE_WPP ++ printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out); ++#endif ++ sem_post(lc->bt_psem_out); ++ } ++ } ++ else ++ { ++ movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag); // * & not EoT ++#if MVF_STASH_WIDTH > 64 ++ // Horrid calculations to work out what we want but luckily this should almost never execute ++ // **** Move to movlc ++ if (!s->is_irap) ++ { ++ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts]; ++ if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf ++ { ++ const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1; ++ unsigned int i; ++ const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)); ++ HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)); ++ ++ for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i) ++ { ++ *d_mvf = *s_mvf; ++ d_mvf += MVF_STASH_WIDTH_PU; ++ s_mvf += MVF_STASH_WIDTH_PU; ++ } ++ ++ } ++ } ++#endif ++ // When all done poke the thread 0 sem_in one final time ++#if TRACE_WPP ++ printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in); ++#endif ++ sem_post(&s->HEVClcList[0]->bt_sem_in); ++ } ++ ++#if TRACE_WPP ++ printf("%s[%d]: End. 
dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag); ++#endif ++ return err; ++} ++ ++static void wpp_setup_lcs(HEVCRpiContext * const s) ++{ ++ unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; ++ const unsigned int line_width = line_ts_width(s, ts); ++ ++ for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i) ++ { ++ HEVCRpiLocalContext * const lc = s->HEVClcList[i]; ++ lc->ts = ts; ++ lc->bt_is_tile = 0; ++ lc->bt_line_no = i; ++ lc->bt_line_width = line_width; ++ lc->bt_last_line = s->sh.num_entry_point_offsets; ++ lc->bt_line_inc = RPI_BIT_THREADS; ++ ts += line_width; ++ } ++} ++ ++ ++// Can only process tile single row at once ++static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row) ++{ ++ const HEVCRpiPPS * const pps = s->ps.pps; ++ const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr]; ++ const unsigned int tile0 = pps->tile_id[ts0]; ++ const unsigned int col0 = tile0 % pps->num_tile_columns; ++ ++ const unsigned int col = (slice_row == 0) ? 
col0 : 0; ++ unsigned int line = slice_row * pps->num_tile_columns - col0 + col; ++ const unsigned int last_line = FFMIN( ++ line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets); ++ ++ const unsigned int par = ++ FFMIN(RPI_BIT_THREADS, last_line + 1 - line); ++#if TRACE_WPP ++ printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row, ++ pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line); ++#endif ++ for (unsigned int i = 0; i != par; ++i, ++line) ++ { ++ HEVCRpiLocalContext * const lc = s->HEVClcList[i]; ++ const unsigned int tile = tile0 + line; ++ ++ lc->ts = pps->tile_pos_ts[tile]; ++ lc->bt_line_no = line; ++ lc->bt_is_tile = 1; ++ lc->bt_line_width = line_ts_width(s, lc->ts); ++ lc->bt_last_line = last_line; ++ lc->bt_line_inc = par; ++ } ++} ++ ++ ++static void * bit_thread(void * v) ++{ ++ HEVCRpiLocalContext * const lc = v; ++ HEVCRpiContext *const s = lc->context; ++ ++ while (wait_bt_sem_in(lc) == 0) ++ { ++ int err; ++ ++ if ((err = rpi_run_one_line(s, lc, 0)) < 0) { // Never first tile/wpp ++ if (lc->bt_terminate) { ++ av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__); ++ break; ++ } ++ av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err); ++ } ++ } ++ ++ return NULL; ++} ++ ++static int bit_threads_start(HEVCRpiContext * const s) ++{ ++ if (s->bt_started) ++ return 0; ++ ++ for (int i = 1; i < RPI_BIT_THREADS; ++i) ++ { ++ // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS] ++ if (s->HEVClcList[i] == NULL) { ++ if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL) ++ return -1; ++ } ++ ++ bt_lc_init(s, s->HEVClcList[i], i); ++ job_lc_init(s->HEVClcList[i]); ++ } ++ ++ // Link the sems in a circle ++ for (int i = 0; i < RPI_BIT_THREADS - 1; ++i) ++ s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in; ++ s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = 
&s->HEVClcList[0]->bt_sem_in; ++ ++ // Init all lc before starting any threads ++ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) ++ { ++ if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0) ++ return -1; ++ } ++ ++ s->bt_started = 1; ++ return 0; ++} ++ ++static int bit_threads_kill(HEVCRpiContext * const s) ++{ ++ if (!s->bt_started) ++ return 0; ++ s->bt_started = 0; ++ ++ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i) ++ { ++ HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1]; ++ if (lc == NULL) ++ break; ++ ++ lc->bt_terminate = 1; ++ sem_post(&lc->bt_sem_in); ++ pthread_join(s->bit_threads[i], NULL); ++ ++ sem_destroy(&lc->bt_sem_in); ++ job_lc_kill(lc); ++ } ++ return 0; ++} ++#endif ++ ++ ++// If we are at EoT and the row is shorter than the number of jobs ++// we can Q we have to wait for it finish otherwise we risk cache/QPU ++// disasters ++static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n) ++{ ++ return ++ s->ps.pps->tile_wpp_inter_disable >= 2 && ++ s->sh.slice_type != HEVC_SLICE_I && ++ n >= 0 && ++ (s->ps.pps->ctb_ts_flags[n] & (CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOT; ++} ++ ++static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread) ++{ ++ HEVCRpiContext * const s = avctxt->priv_data; ++ HEVCRpiLocalContext * const lc = s->HEVClc; ++ int err; ++ ++ // Start of slice ++ if ((err = slice_start(s, lc)) != 0) ++ return err; ++ ++#if RPI_EXTRA_BIT_THREADS > 0 ++ ++ if (s->sh.offload_tiles) ++ { ++ unsigned int slice_row = 0; ++ ++#if TRACE_WPP ++ printf("%s: Do Tiles\n", __func__); ++#endif ++ // Generate & start extra bit threads if they aren't already running ++ bit_threads_start(s); ++ ++ do ++ { ++ // Reset lc lines etc. 
++ tile_one_row_setup_lcs(s, slice_row); ++ ++#if TRACE_WPP ++ printf("%s: Row %d: Do 1st: line=%d/%d/%d\n", ++ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets); ++#endif ++ ++ rpi_run_one_line(s, lc, 1); // Kicks off the other threads ++#if TRACE_WPP ++ printf("%s: Row %d: Done 1st: line=%d/%d/%d\n", ++ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets); ++#endif ++ ++ while (lc->bt_line_no <= lc->bt_last_line) { ++ rpi_sem_wait(&lc->bt_sem_in); ++ rpi_run_one_line(s, lc, 0); ++ } ++#if TRACE_WPP ++ printf("%s: Done body\n", __func__); ++#endif ++ ++ // Wait for everything else to finish ++ rpi_sem_wait(&lc->bt_sem_in); ++ ++ ++slice_row; ++ } while (lc->bt_last_line < s->sh.num_entry_point_offsets); ++ ++ ++#if TRACE_WPP ++ printf("%s: Done wait: ts=%d\n", __func__, lc->ts); ++#endif ++ } ++ else if (s->sh.offload_wpp) ++ { ++#if TRACE_WPP ++ printf("%s: Do WPP\n", __func__); ++#endif ++ // Generate & start extra bit threads if they aren't already running ++ bit_threads_start(s); ++ ++ // Reset lc lines etc. 
++ wpp_setup_lcs(s); ++ ++ rpi_run_one_line(s, lc, 1); // Kicks off the other threads ++#if TRACE_WPP ++ printf("%s: Done 1st\n", __func__); ++#endif ++ ++ while (lc->bt_line_no <= s->sh.num_entry_point_offsets) { ++ rpi_sem_wait(&lc->bt_sem_in); ++ rpi_run_one_line(s, lc, 0); ++ } ++#if TRACE_WPP ++ printf("%s: Done body\n", __func__); ++#endif ++ ++ // Wait for everything else to finish ++ rpi_sem_wait(&lc->bt_sem_in); ++ ++#if TRACE_WPP ++ printf("%s: Done wait: ts=%d\n", __func__, lc->ts); ++#endif ++ } ++ else ++#endif ++ { ++#if TRACE_WPP ++ printf("%s: Single start: ts=%d\n", __func__, lc->ts); ++#endif ++ // Single bit thread ++ do { ++ // Make sure we have space to prepare the next job ++ worker_pass0_ready(s, lc); ++ ++ if ((err = fill_job(s, lc, 0)) < 0) ++ goto fail; ++ ++ worker_submit_job(s, lc); ++ ++ if (tile_needs_wait(s, lc->ts - 1)) ++ worker_wait(s, lc); ++ ++ } while (!lc->unit_done); ++ ++#if TRACE_WPP ++ printf("%s: Single end: ts=%d\n", __func__, lc->ts); ++#endif ++ } ++ ++ // If we have reached the end of the frame or ++ // then wait for the worker to finish all its jobs ++ if (lc->ts >= s->ps.sps->ctb_size) ++ worker_wait(s, lc); ++ ++#if RPI_TSTATS ++ { ++ HEVCRpiStats *const ts = &s->tstats; ++ ++ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n", ++ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0, ++ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge, ++ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0, ++ ts->y_pred2_hgt16, ts->y_pred2_hle16); ++ memset(ts, 0, sizeof(*ts)); ++ } ++#endif ++ ++ return lc->ts; ++ ++fail: ++ // Cleanup ++ av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err); ++ // Free our job & wait for temination ++ worker_free(s, lc); ++ worker_wait(s, lc); ++ return err; ++} ++ ++ ++static void set_no_backward_pred(HEVCRpiContext * const s) ++{ ++ int i, j; ++ 
const RefPicList *const refPicList = s->refPicList; ++ ++ s->no_backward_pred_flag = 0; ++ if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag) ++ return; ++ ++ for (j = 0; j < 2; j++) { ++ for (i = 0; i < refPicList[j].nb_refs; i++) { ++ if (refPicList[j].list[i] > s->poc) { ++ s->no_backward_pred_flag = 1; ++ return; ++ } ++ } ++ } ++} ++ ++static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal) ++{ ++ int err; ++ if ((err = gen_entry_points(s, nal)) < 0) ++ return err; ++ ++ set_no_backward_pred(s); ++ ++ return rpi_decode_entry(s->avctx, NULL); ++} ++ ++static int set_side_data(HEVCRpiContext *s) ++{ ++ AVFrame *out = s->ref->frame; ++ ++ if (s->sei.frame_packing.present && ++ s->sei.frame_packing.arrangement_type >= 3 && ++ s->sei.frame_packing.arrangement_type <= 5 && ++ s->sei.frame_packing.content_interpretation_type > 0 && ++ s->sei.frame_packing.content_interpretation_type < 3) { ++ AVStereo3D *stereo = av_stereo3d_create_side_data(out); ++ if (!stereo) ++ return AVERROR(ENOMEM); ++ ++ switch (s->sei.frame_packing.arrangement_type) { ++ case 3: ++ if (s->sei.frame_packing.quincunx_subsampling) ++ stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX; ++ else ++ stereo->type = AV_STEREO3D_SIDEBYSIDE; ++ break; ++ case 4: ++ stereo->type = AV_STEREO3D_TOPBOTTOM; ++ break; ++ case 5: ++ stereo->type = AV_STEREO3D_FRAMESEQUENCE; ++ break; ++ } ++ ++ if (s->sei.frame_packing.content_interpretation_type == 2) ++ stereo->flags = AV_STEREO3D_FLAG_INVERT; ++ ++ if (s->sei.frame_packing.arrangement_type == 5) { ++ if (s->sei.frame_packing.current_frame_is_frame0_flag) ++ stereo->view = AV_STEREO3D_VIEW_LEFT; ++ else ++ stereo->view = AV_STEREO3D_VIEW_RIGHT; ++ } ++ } ++ ++ if (s->sei.display_orientation.present && ++ (s->sei.display_orientation.anticlockwise_rotation || ++ s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) { ++ double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / 
(double) (1 << 16); ++ AVFrameSideData *rotation = av_frame_new_side_data(out, ++ AV_FRAME_DATA_DISPLAYMATRIX, ++ sizeof(int32_t) * 9); ++ if (!rotation) ++ return AVERROR(ENOMEM); ++ ++ av_display_rotation_set((int32_t *)rotation->data, angle); ++ av_display_matrix_flip((int32_t *)rotation->data, ++ s->sei.display_orientation.hflip, ++ s->sei.display_orientation.vflip); ++ } ++ ++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1 ++ // so the side data persists for the entire coded video sequence. ++ if (s->sei.mastering_display.present > 0 && ++ IS_IRAP(s) && s->no_rasl_output_flag) { ++ s->sei.mastering_display.present--; ++ } ++ if (s->sei.mastering_display.present) { ++ // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b ++ const int mapping[3] = {2, 0, 1}; ++ const int chroma_den = 50000; ++ const int luma_den = 10000; ++ int i; ++ AVMasteringDisplayMetadata *metadata = ++ av_mastering_display_metadata_create_side_data(out); ++ if (!metadata) ++ return AVERROR(ENOMEM); ++ ++ for (i = 0; i < 3; i++) { ++ const int j = mapping[i]; ++ metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0]; ++ metadata->display_primaries[i][0].den = chroma_den; ++ metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1]; ++ metadata->display_primaries[i][1].den = chroma_den; ++ } ++ metadata->white_point[0].num = s->sei.mastering_display.white_point[0]; ++ metadata->white_point[0].den = chroma_den; ++ metadata->white_point[1].num = s->sei.mastering_display.white_point[1]; ++ metadata->white_point[1].den = chroma_den; ++ ++ metadata->max_luminance.num = s->sei.mastering_display.max_luminance; ++ metadata->max_luminance.den = luma_den; ++ metadata->min_luminance.num = s->sei.mastering_display.min_luminance; ++ metadata->min_luminance.den = luma_den; ++ metadata->has_luminance = 1; ++ metadata->has_primaries = 1; ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "Mastering 
Display Metadata:\n"); ++ av_log(s->avctx, AV_LOG_DEBUG, ++ "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n", ++ av_q2d(metadata->display_primaries[0][0]), ++ av_q2d(metadata->display_primaries[0][1]), ++ av_q2d(metadata->display_primaries[1][0]), ++ av_q2d(metadata->display_primaries[1][1]), ++ av_q2d(metadata->display_primaries[2][0]), ++ av_q2d(metadata->display_primaries[2][1]), ++ av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1])); ++ av_log(s->avctx, AV_LOG_DEBUG, ++ "min_luminance=%f, max_luminance=%f\n", ++ av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance)); ++ } ++ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1 ++ // so the side data persists for the entire coded video sequence. ++ if (s->sei.content_light.present > 0 && ++ IS_IRAP(s) && s->no_rasl_output_flag) { ++ s->sei.content_light.present--; ++ } ++ if (s->sei.content_light.present) { ++ AVContentLightMetadata *metadata = ++ av_content_light_metadata_create_side_data(out); ++ if (!metadata) ++ return AVERROR(ENOMEM); ++ metadata->MaxCLL = s->sei.content_light.max_content_light_level; ++ metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level; ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n"); ++ av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n", ++ metadata->MaxCLL, metadata->MaxFALL); ++ } ++ ++ if (s->sei.a53_caption.a53_caption) { ++ AVFrameSideData* sd = av_frame_new_side_data(out, ++ AV_FRAME_DATA_A53_CC, ++ s->sei.a53_caption.a53_caption_size); ++ if (sd) ++ memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size); ++ av_freep(&s->sei.a53_caption.a53_caption); ++ s->sei.a53_caption.a53_caption_size = 0; ++ s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS; ++ } ++ ++ if (s->sei.alternative_transfer.present && ++ av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) && ++ 
s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) { ++ s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics; ++ } ++ ++ return 0; ++} ++ ++static int hevc_frame_start(HEVCRpiContext * const s) ++{ ++ int ret; ++ ++ memset(s->bs_horizontal, 0, s->bs_size * 2); // Does V too ++ memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); ++ memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address)); ++ ++ // Only need to remember intra for CIP ++ if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap) ++ s->is_intra = NULL; ++ else ++ { ++ s->is_intra = s->is_intra_store; ++ memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height); ++ } ++ ++ s->is_decoded = 0; ++ s->first_nal_type = s->nal_unit_type; ++ ++ s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos); ++ ++ if (s->pkt.nb_nals > s->rpl_tab_size) ++ { ++ // In most cases it will be faster to free & realloc as that doesn't ++ // require (an unwanted) copy ++ av_freep(&s->rpl_tab); ++ s->rpl_tab_size = 0; ++ if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL) ++ goto fail; ++ s->rpl_tab_size = s->pkt.nb_nals; ++ } ++ memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab)); ++ ++ ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc); ++ if (ret < 0) ++ goto fail; ++ ++ // Resize rpl_tab to max that we might want ++ ret = ff_hevc_rpi_frame_rps(s); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n"); ++ goto fail; ++ } ++ ++ s->ref->frame->key_frame = IS_IRAP(s); ++ ++ ret = set_side_data(s); ++ if (ret < 0) ++ goto fail; ++ ++ s->frame->pict_type = 3 - s->sh.slice_type; ++ ++ if (!IS_IRAP(s)) ++ ff_hevc_rpi_bump_frame(s); ++ ++ av_frame_unref(s->output_frame); ++ ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0); ++ if (ret < 0) ++ goto fail; ++ ++ 
ff_thread_finish_setup(s->avctx); ++ ++ return 0; ++ ++fail: ++ if (s->ref) ++ ff_hevc_rpi_unref_frame(s, s->ref, ~0); ++ s->ref = NULL; ++ return ret; ++} ++ ++static inline int is_non_ref_unit_type(const unsigned int nal_unit_type) ++{ ++ // From Table 7-1 ++ return (nal_unit_type & ~0xe) == 0; // True for 0, 2, 4, 6, 8, 10, 12, 14 ++} ++ ++static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal) ++{ ++ GetBitContext * const gb = &s->HEVClc->gb; ++ int ctb_addr_ts, ret; ++ ++ *gb = nal->gb; ++ s->nal_unit_type = nal->type; ++ s->temporal_id = nal->temporal_id; ++ ++ switch (s->nal_unit_type) { ++ case HEVC_NAL_VPS: ++ ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_SPS: ++ ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps, ++ s->apply_defdispwin); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_PPS: ++ ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_SEI_PREFIX: ++ case HEVC_NAL_SEI_SUFFIX: ++ ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type); ++ if (ret < 0) ++ goto fail; ++ break; ++ case HEVC_NAL_TRAIL_R: ++ case HEVC_NAL_TRAIL_N: ++ case HEVC_NAL_TSA_N: ++ case HEVC_NAL_TSA_R: ++ case HEVC_NAL_STSA_N: ++ case HEVC_NAL_STSA_R: ++ case HEVC_NAL_BLA_W_LP: ++ case HEVC_NAL_BLA_W_RADL: ++ case HEVC_NAL_BLA_N_LP: ++ case HEVC_NAL_IDR_W_RADL: ++ case HEVC_NAL_IDR_N_LP: ++ case HEVC_NAL_CRA_NUT: ++ case HEVC_NAL_RADL_N: ++ case HEVC_NAL_RADL_R: ++ case HEVC_NAL_RASL_N: ++ case HEVC_NAL_RASL_R: ++ ret = hls_slice_header(s); ++ if (ret < 0) ++ return ret; ++ ++ // The definition of _N unit types is "non-reference for other frames ++ // with the same temporal_id" so they may/will be ref frames for pics ++ // with a higher temporal_id. 
++ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 || ++ !is_non_ref_unit_type(s->nal_unit_type); ++ s->offload_recon = s->threads_type != 0 && s->used_for_ref; ++ s->is_irap = IS_IRAP(s); ++ ++#if DEBUG_DECODE_N ++ { ++ static int z = 0; ++ if (IS_IDR(s)) { ++ z = 1; ++ } ++ if (z != 0 && z++ > DEBUG_DECODE_N) { ++ s->is_decoded = 0; ++ break; ++ } ++ } ++#endif ++ if ( ++ (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) || ++ (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) || ++ (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) || ++ (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IRAP(s))) ++ { ++ s->is_decoded = 0; ++ break; ++ } ++ ++ if (s->sh.first_slice_in_pic_flag) { ++ if (s->max_ra == INT_MAX) { ++ if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) { ++ s->max_ra = s->poc; ++ } else { ++ if (IS_IDR(s)) ++ s->max_ra = INT_MIN; ++ } ++ } ++ ++ if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) && ++ s->poc <= s->max_ra) { ++ s->is_decoded = 0; ++ break; ++ } else { ++ if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra) ++ s->max_ra = INT_MIN; ++ } ++ ++ ret = hevc_frame_start(s); ++ if (ret < 0) ++ return ret; ++ } else if (!s->ref) { ++ av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n"); ++ goto fail; ++ } ++ ++ if (s->nal_unit_type != s->first_nal_type) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Non-matching NAL types of the VCL NALUs: %d %d\n", ++ s->first_nal_type, s->nal_unit_type); ++ return AVERROR_INVALIDDATA; ++ } ++ ++ if (!s->sh.dependent_slice_segment_flag && ++ s->sh.slice_type != HEVC_SLICE_I) { ++ ret = ff_hevc_rpi_slice_rpl(s); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_WARNING, ++ "Error constructing the reference lists for the current slice.\n"); ++ goto fail; ++ } ++ } ++ ++ ctb_addr_ts = hls_slice_data(s, nal); ++ if (ctb_addr_ts >= s->ps.sps->ctb_size) { ++ s->is_decoded = 1; ++ } ++ 
++ if (ctb_addr_ts < 0) { ++ ret = ctb_addr_ts; ++ goto fail; ++ } ++ break; ++ case HEVC_NAL_EOS_NUT: ++ case HEVC_NAL_EOB_NUT: ++ s->seq_decode = (s->seq_decode + 1) & 0xff; ++ s->max_ra = INT_MAX; ++ break; ++ case HEVC_NAL_AUD: ++ case HEVC_NAL_FD_NUT: ++ break; ++ default: ++ av_log(s->avctx, AV_LOG_INFO, ++ "Skipping NAL unit %d\n", s->nal_unit_type); ++ } ++ ++ return 0; ++fail: ++ if (s->avctx->err_recognition & AV_EF_EXPLODE) ++ return ret; ++ return 0; ++} ++ ++static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length) ++{ ++ int i, ret = 0; ++ int eos_at_start = 1; ++ ++ s->ref = NULL; ++ s->last_eos = s->eos; ++ s->eos = 0; ++ ++ /* split the input packet into NAL units, so we know the upper bound on the ++ * number of slices in the frame */ ++ ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff, ++ s->nal_length_size, s->avctx->codec_id, 0, 0); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_ERROR, ++ "Error splitting the input into NAL units.\n"); ++ return ret; ++ } ++ ++ for (i = 0; i < s->pkt.nb_nals; i++) { ++ if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT || ++ s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) { ++ if (eos_at_start) { ++ s->last_eos = 1; ++ } else { ++ s->eos = 1; ++ } ++ } else { ++ eos_at_start = 0; ++ } ++ } ++ ++ /* decode the NAL units */ ++ for (i = 0; i < s->pkt.nb_nals; i++) { ++ ret = decode_nal_unit(s, &s->pkt.nals[i]); ++ if (ret < 0) { ++ av_log(s->avctx, AV_LOG_WARNING, ++ "Error parsing NAL unit #%d.\n", i); ++ goto fail; ++ } ++ } ++ ++fail: // Also success path ++ if (s->ref != NULL) { ++ if (s->used_for_ref && s->threads_type != 0) { ++ ff_hevc_rpi_progress_signal_all_done(s); ++ } ++ else { ++ // Flush frame to real memory as we expect to be able to pass ++ // it straight on to mmal ++ flush_frame(s, s->frame); ++ } ++ } ++ return ret; ++} ++ ++static void print_md5(void *log_ctx, int level, uint8_t md5[16]) ++{ ++ int i; ++ for (i = 0; i < 16; i++) ++ av_log(log_ctx, level, "%02"PRIx8, 
md5[i]); ++} ++ ++static int verify_md5(HEVCRpiContext *s, AVFrame *frame) ++{ ++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); ++ int pixel_shift; ++ int i, j; ++ ++ if (!desc) ++ return AVERROR(EINVAL); ++ ++ pixel_shift = desc->comp[0].depth > 8; ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ", ++ s->poc); ++ ++ /* the checksums are LE, so we have to byteswap for >8bpp formats ++ * on BE arches */ ++#if HAVE_BIGENDIAN ++ if (pixel_shift && !s->checksum_buf) { ++ av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size, ++ FFMAX3(frame->linesize[0], frame->linesize[1], ++ frame->linesize[2])); ++ if (!s->checksum_buf) ++ return AVERROR(ENOMEM); ++ } ++#endif ++ ++ for (i = 0; frame->data[i]; i++) { ++ int width = s->avctx->coded_width; ++ int height = s->avctx->coded_height; ++ int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width; ++ int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height; ++ uint8_t md5[16]; ++ ++ av_md5_init(s->md5_ctx); ++ for (j = 0; j < h; j++) { ++ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1); ++#if HAVE_BIGENDIAN ++ if (pixel_shift) { ++ s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf, ++ (const uint16_t *) src, w); ++ src = s->checksum_buf; ++ } ++#endif ++ av_md5_update(s->md5_ctx, src, w << pixel_shift); ++ } ++ av_md5_final(s->md5_ctx, md5); ++ ++ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) { ++ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i); ++ print_md5(s->avctx, AV_LOG_DEBUG, md5); ++ av_log (s->avctx, AV_LOG_DEBUG, "; "); ++ } else { ++ av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i); ++ print_md5(s->avctx, AV_LOG_ERROR, md5); ++ av_log (s->avctx, AV_LOG_ERROR, " != "); ++ print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]); ++ av_log (s->avctx, AV_LOG_ERROR, "\n"); ++ return AVERROR_INVALIDDATA; ++ } ++ } ++ ++ av_log(s->avctx, AV_LOG_DEBUG, "\n"); ++ ++ return 0; ++} ++ 
++static int all_sps_supported(const HEVCRpiContext * const s) ++{ ++ for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { ++ if (s->ps.sps_list[i] != NULL) ++ { ++ const HEVCRpiSPS * const sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; ++ if (!is_sps_supported(sps)) ++ return 0; ++ } ++ } ++ return 1; ++} ++ ++static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first) ++{ ++ int ret, i; ++ ++ ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff, ++ &s->nal_length_size, s->avctx->err_recognition, ++ s->apply_defdispwin, s->avctx); ++ if (ret < 0) ++ return ret; ++ ++ /* export stream parameters from the first SPS */ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { ++ if (first && s->ps.sps_list[i]) { ++ const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data; ++ export_stream_params(s->avctx, &s->ps, sps); ++ break; ++ } ++ } ++ ++ return 0; ++} ++ ++static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output, ++ AVPacket *avpkt) ++{ ++ int ret; ++ int new_extradata_size; ++ uint8_t *new_extradata; ++ HEVCRpiContext *s = avctx->priv_data; ++ ++ if (!avpkt->size) { ++ ret = ff_hevc_rpi_output_frame(s, data, 1); ++ if (ret < 0) ++ return ret; ++ ++ *got_output = ret; ++ return 0; ++ } ++ ++ new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA, ++ &new_extradata_size); ++ if (new_extradata && new_extradata_size > 0) { ++ ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0); ++ if (ret < 0) ++ return ret; ++ } ++ ++ s->ref = NULL; ++ ret = decode_nal_units(s, avpkt->data, avpkt->size); ++ if (ret < 0) ++ return ret; ++ ++ /* verify the SEI checksum */ ++ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded && ++ s->sei.picture_hash.is_md5) { ++ ret = verify_md5(s, s->ref->frame); ++ if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) { ++ ff_hevc_rpi_unref_frame(s, s->ref, ~0); ++ return ret; ++ } ++ 
} ++ s->sei.picture_hash.is_md5 = 0; ++ ++ if (s->is_decoded) { ++ av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc); ++ s->is_decoded = 0; ++ } ++ ++ if (s->output_frame->buf[0]) { ++ av_frame_move_ref(data, s->output_frame); ++ *got_output = 1; ++ } ++ ++ return avpkt->size; ++} ++ ++static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src) ++{ ++ int ret; ++ ++ ret = ff_thread_ref_frame(&dst->tf, &src->tf); ++ if (ret < 0) ++ return ret; ++ ++ if (src->col_mvf_buf != NULL) ++ { ++ dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf); ++ if (!dst->col_mvf_buf) ++ goto fail; ++ } ++ dst->col_mvf = src->col_mvf; ++ ++ dst->poc = src->poc; ++ dst->flags = src->flags; ++ dst->sequence = src->sequence; ++ return 0; ++ ++fail: ++ ff_hevc_rpi_unref_frame(s, dst, ~0); ++ return AVERROR(ENOMEM); ++} ++ ++ ++static av_cold int hevc_decode_free(AVCodecContext *avctx) ++{ ++ HEVCRpiContext * const s = avctx->priv_data; ++ int i; ++ ++ pic_arrays_free(s); ++ ++ av_freep(&s->md5_ctx); ++ ++ av_freep(&s->cabac_save); ++ ++#if RPI_EXTRA_BIT_THREADS ++ bit_threads_kill(s); ++#endif ++ ++ hevc_exit_worker(s); ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_kill_state(s->progress_states + i); ++ } ++ job_lc_kill(s->HEVClc); ++ ++ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0] ++ av_freep(&s->sao_pixel_buffer_v[0]); ++ av_frame_free(&s->output_frame); ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); ++ av_frame_free(&s->DPB[i].frame); ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) ++ av_buffer_unref(&s->ps.vps_list[i]); ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) ++ av_buffer_unref(&s->ps.sps_list[i]); ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) ++ av_buffer_unref(&s->ps.pps_list[i]); ++ s->ps.sps = NULL; ++ s->ps.pps = NULL; ++ s->ps.vps = NULL; ++ ++ // Free separately from sLists as used that way by RPI WPP ++ for (i = 0; 
i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) { ++ av_freep(s->HEVClcList + i); ++ } ++ s->HEVClc = NULL; // Allocated as part of HEVClcList ++ ++ ff_h2645_packet_uninit(&s->pkt); ++ ++ if (s->qpu_init_ok) ++ vpu_qpu_term(); ++ s->qpu_init_ok = 0; ++ ++ return 0; ++} ++ ++ ++static av_cold int hevc_init_context(AVCodecContext *avctx) ++{ ++ HEVCRpiContext *s = avctx->priv_data; ++ int i; ++ ++ s->avctx = avctx; ++ ++ s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext)); ++ if (!s->HEVClc) ++ goto fail; ++ s->HEVClcList[0] = s->HEVClc; ++ ++ if (vpu_qpu_init() != 0) ++ goto fail; ++ s->qpu_init_ok = 1; ++ ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ { ++ static const uint32_t dframe[1] = {0x80808080}; ++ s->qpu_dummy_frame_emu = (const uint8_t *)dframe; ++ } ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ s->qpu_dummy_frame_qpu = qpu_dummy(); ++#endif ++ ++ bt_lc_init(s, s->HEVClc, 0); ++ job_lc_init(s->HEVClc); ++ ++ for (i = 0; i != 2; ++i) { ++ ff_hevc_rpi_progress_init_state(s->progress_states + i); ++ } ++ ++ if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL) ++ goto fail; ++ ++ if ((s->output_frame = av_frame_alloc()) == NULL) ++ goto fail; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ s->DPB[i].frame = av_frame_alloc(); ++ if (!s->DPB[i].frame) ++ goto fail; ++ s->DPB[i].tf.f = s->DPB[i].frame; ++ s->DPB[i].dpb_no = i; ++ } ++ ++ s->max_ra = INT_MAX; ++ ++ if ((s->md5_ctx = av_md5_alloc()) == NULL) ++ goto fail; ++ ++ s->context_initialized = 1; ++ s->eos = 0; ++ ++ ff_hevc_rpi_reset_sei(&s->sei); ++ ++ return 0; ++ ++fail: ++ av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__); ++ hevc_decode_free(avctx); ++ return AVERROR(ENOMEM); ++} ++ ++#if HAVE_THREADS ++static int hevc_update_thread_context(AVCodecContext *dst, ++ const AVCodecContext *src) ++{ ++ HEVCRpiContext *s = dst->priv_data; ++ HEVCRpiContext *s0 = src->priv_data; ++ int i, ret; ++ ++ av_assert0(s->context_initialized); ++ ++ // dst == src can happen according 
to the comments and in that case ++ // there is nothing to do here ++ if (dst == src) ++ return 0; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) { ++ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0); ++ if (s0->DPB[i].frame->buf[0]) { ++ ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]); ++ if (ret < 0) ++ return ret; ++ } ++ } ++ ++ if (s->ps.sps != s0->ps.sps) ++ s->ps.sps = NULL; ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) { ++ av_buffer_unref(&s->ps.vps_list[i]); ++ if (s0->ps.vps_list[i]) { ++ s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]); ++ if (!s->ps.vps_list[i]) ++ return AVERROR(ENOMEM); ++ } ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) { ++ av_buffer_unref(&s->ps.sps_list[i]); ++ if (s0->ps.sps_list[i]) { ++ s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]); ++ if (!s->ps.sps_list[i]) ++ return AVERROR(ENOMEM); ++ } ++ } ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) { ++ av_buffer_unref(&s->ps.pps_list[i]); ++ if (s0->ps.pps_list[i]) { ++ s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]); ++ if (!s->ps.pps_list[i]) ++ return AVERROR(ENOMEM); ++ } ++ } ++ ++ if (s->ps.sps != s0->ps.sps) ++ if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0) ++ return ret; ++ ++ s->seq_decode = s0->seq_decode; ++ s->seq_output = s0->seq_output; ++ s->pocTid0 = s0->pocTid0; ++ s->max_ra = s0->max_ra; ++ s->eos = s0->eos; ++ s->no_rasl_output_flag = s0->no_rasl_output_flag; ++ ++ s->is_nalff = s0->is_nalff; ++ s->nal_length_size = s0->nal_length_size; ++ ++ s->threads_type = s0->threads_type; ++ ++ if (s0->eos) { ++ s->seq_decode = (s->seq_decode + 1) & 0xff; ++ s->max_ra = INT_MAX; ++ } ++ ++ s->sei.frame_packing = s0->sei.frame_packing; ++ s->sei.display_orientation = s0->sei.display_orientation; ++ s->sei.mastering_display = s0->sei.mastering_display; ++ s->sei.content_light = s0->sei.content_light; ++ s->sei.alternative_transfer = s0->sei.alternative_transfer; ++ ++ // * We do this here as it allows us to easily 
locate our parents ++ // global job pool, but there really should be a less nasty way ++ if (s->jbc == NULL) ++ { ++ av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL); ++ hevc_init_worker(s); ++ } ++ ++ return 0; ++} ++#endif ++ ++#include ++static int qpu_ok(void) ++{ ++ static int is_pi3 = -1; ++ if (is_pi3 == -1) ++ { ++ struct stat sb; ++ is_pi3 = (stat("/dev/rpivid-intcmem", &sb) != 0); ++ } ++ return is_pi3; ++} ++ ++static av_cold int hevc_decode_init(AVCodecContext *avctx) ++{ ++ HEVCRpiContext *s = avctx->priv_data; ++ int ret; ++ ++ if (!qpu_ok()) ++ return AVERROR_DECODER_NOT_FOUND; ++ ++ if ((ret = hevc_init_context(avctx)) < 0) ++ return ret; ++ ++ // If we are a child context then stop now ++ // Everything after this point is either 1st decode setup or global alloc ++ // that must not be repeated ++ // Global info will be copied into children in update_thread_context (we ++ // can't do it here as we have no way of finding the parent context) ++ if (avctx->internal->is_copy) ++ return 0; ++ ++ // Job allocation requires VCSM alloc to work so ensure that we have it ++ // initialised by this point ++ { ++ HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5)); ++ if (jbg == NULL) { ++ av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__); ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } ++ ++ if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL) { ++ av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__); ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } ++ } ++ ++ hevc_init_worker(s); ++ ++ s->eos = 1; ++ ++ if (avctx->extradata_size > 0 && avctx->extradata) { ++ if ((ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1)) < 0) ++ goto fail; ++ ++ if (!all_sps_supported(s)) { ++ ret = AVERROR_DECODER_NOT_FOUND; ++ goto fail; ++ } ++ } ++ ++ if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1) ++ s->threads_type = FF_THREAD_FRAME; ++ else ++ s->threads_type = 0; 
++ ++ return 0; ++ ++fail: ++ hevc_decode_free(avctx); ++ return ret; ++} ++ ++static void hevc_decode_flush(AVCodecContext *avctx) ++{ ++ HEVCRpiContext *s = avctx->priv_data; ++ ff_hevc_rpi_flush_dpb(s); ++ s->max_ra = INT_MAX; ++ s->eos = 1; ++} ++ ++typedef struct hwaccel_rpi3_qpu_env_s { ++ const AVClass *av_class; ++ AVZcEnvPtr zc; ++} hwaccel_rpi3_qpu_env_t; ++ ++static int hwaccel_alloc_frame(AVCodecContext *s, AVFrame *frame) ++{ ++ hwaccel_rpi3_qpu_env_t * const r3 = s->internal->hwaccel_priv_data; ++ int rv; ++ ++ if (av_rpi_zc_in_use(s)) ++ { ++ rv = s->get_buffer2(s, frame, 0); ++ } ++ else ++ { ++ rv = av_rpi_zc_get_buffer(r3->zc, frame); ++ if (rv == 0) ++ rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID); // actually do the alloc ++ } ++ ++ if (rv == 0 && ++ (rv = ff_attach_decode_data(frame)) < 0) ++ { ++ av_frame_unref(frame); ++ } ++ ++ return rv; ++} ++ ++static int hwaccel_rpi3_qpu_free(AVCodecContext *avctx) ++{ ++ hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data; ++ av_rpi_zc_int_env_freep(&r3->zc); ++ return 0; ++} ++ ++static int hwaccel_rpi3_qpu_init(AVCodecContext *avctx) ++{ ++ hwaccel_rpi3_qpu_env_t * const r3 = avctx->internal->hwaccel_priv_data; ++ ++ if ((r3->zc = av_rpi_zc_int_env_alloc(avctx)) == NULL) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ av_log(avctx, AV_LOG_ERROR, "Rpi3 QPU init failed\n"); ++ hwaccel_rpi3_qpu_free(avctx); ++ return AVERROR(ENOMEM); ++} ++ ++ ++#define OFFSET(x) offsetof(HEVCRpiContext, x) ++#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM) ++ ++ ++static const AVOption options[] = { ++ { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin), ++ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR }, ++ { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin), ++ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR }, ++ { NULL }, ++}; ++ ++static const AVClass hevc_rpi_decoder_class = { ++ .class_name = "HEVC RPI 
decoder", ++ .item_name = av_default_item_name, ++ .option = options, ++ .version = LIBAVUTIL_VERSION_INT, ++}; ++ ++static const enum AVPixelFormat hevc_rpi_pix_fmts[] = { ++ AV_PIX_FMT_SAND128, ++ AV_PIX_FMT_SAND64_10, ++ AV_PIX_FMT_NONE ++}; ++ ++ ++static const AVHWAccel hwaccel_rpi3_qpu = { ++ .name = "Pi3 QPU Hwaccel", ++ .alloc_frame = hwaccel_alloc_frame, ++ .init = hwaccel_rpi3_qpu_init, ++ .uninit = hwaccel_rpi3_qpu_free, ++ .priv_data_size = sizeof(hwaccel_rpi3_qpu_env_t), ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, ++}; ++ ++static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand128 = ++{ ++ .public = { ++ .pix_fmt = AV_PIX_FMT_SAND128, ++ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC, ++ .device_type = AV_HWDEVICE_TYPE_NONE, ++ }, ++ .hwaccel = &hwaccel_rpi3_qpu ++}; ++static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand64_10 = ++{ ++ .public = { ++ .pix_fmt = AV_PIX_FMT_SAND64_10, ++ .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC, ++ .device_type = AV_HWDEVICE_TYPE_NONE, ++ }, ++ .hwaccel = &hwaccel_rpi3_qpu ++}; ++ ++ ++static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = { ++ &hevc_rpi_hw_config_sand128, ++ &hevc_rpi_hw_config_sand64_10, ++ NULL ++}; ++ ++ ++AVCodec ff_hevc_rpi_decoder = { ++ .name = "hevc_rpi", ++ .long_name = NULL_IF_CONFIG_SMALL("HEVC (rpi)"), ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_HEVC, ++ .priv_data_size = sizeof(HEVCRpiContext), ++ .priv_class = &hevc_rpi_decoder_class, ++ .init = hevc_decode_init, ++ .close = hevc_decode_free, ++ .decode = hevc_rpi_decode_frame, ++ .flush = hevc_decode_flush, ++ .update_thread_context = ONLY_IF_THREADS_ENABLED(hevc_update_thread_context), ++ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY | ++ AV_CODEC_CAP_HARDWARE | ++ AV_CODEC_CAP_AVOID_PROBING | ++#if 0 ++ // Debugging is often easier without threads getting in the way ++ 0, ++#warning H265 threading turned off ++#else ++ // We only have decent optimisation for frame - so only admit 
to that ++ AV_CODEC_CAP_FRAME_THREADS, ++#endif ++ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | ++ FF_CODEC_CAP_EXPORTS_CROPPING | ++ FF_CODEC_CAP_ALLOCATE_PROGRESS, ++ .pix_fmts = hevc_rpi_pix_fmts, ++ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles), ++ .hw_configs = hevc_rpi_hw_configs, ++// .wrapper_name = "hevc_rpi", ++}; ++ +diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h +new file mode 100644 +index 0000000000..1f94d18673 +--- /dev/null ++++ b/libavcodec/rpi_hevcdec.h +@@ -0,0 +1,1091 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVCDEC_H ++#define AVCODEC_RPI_HEVCDEC_H ++ ++#include "config.h" ++ ++/* NOTE(review): the header name on the next line was lost in extraction; <stdatomic.h> is assumed because upstream hevcdec.h includes it at this position - confirm. */ ++#include <stdatomic.h> ++ ++#include "libavutil/buffer.h" ++ ++#include "avcodec.h" ++#include "bswapdsp.h" ++#include "cabac.h" ++#include "get_bits.h" ++#include "rpi_hevcpred.h" ++#include "h2645_parse.h" ++#include "hevc.h" ++#include "rpi_hevc_mv.h" ++#include "rpi_hevc_ps.h" ++#include "rpi_hevc_sei.h" ++#include "rpi_hevcdsp.h" ++#include "internal.h" ++#include "thread.h" ++#include "videodsp.h" ++ ++#if ARCH_ARM ++#include "arm/rpi_hevc_misc_neon.h" ++#endif ++ ++#define MAX_NB_THREADS 16 ++#define SHIFT_CTB_WPP 2 ++ ++//TODO: check if this is really the maximum ++#define MAX_TRANSFORM_DEPTH 5 ++ ++#define MAX_TB_SIZE 32 ++#define MAX_QP 51 ++#define DEFAULT_INTRA_TC_OFFSET 2 ++ ++#define HEVC_CONTEXTS 199 ++ ++#define MRG_MAX_NUM_CANDS 5 ++ ++#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64 ++ ++// Size of DPB array ++#define HEVC_DPB_ELS 32 ++ ++#define L0 0 ++#define L1 1 ++ ++#define EPEL_EXTRA_BEFORE 1 ++#define EPEL_EXTRA_AFTER 2 ++#define EPEL_EXTRA 3 ++#define QPEL_EXTRA_BEFORE 3 ++#define QPEL_EXTRA_AFTER 4 ++#define QPEL_EXTRA 7 ++ ++#define EDGE_EMU_BUFFER_STRIDE 80 ++ ++/* sem_t is used by the job and pass-queue structures declared in this header */ ++#include <semaphore.h> ++#include "rpi_qpu.h" ++ ++// Max jobs per frame thread. Actual usage will be limited by the size ++// of the global job pool ++// ?? 
Limits ++#define RPI_MAX_JOBS 8 ++ ++// This is the number of _extra_ bit threads - we will have ++// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing ++// ++// 0 is legitimate and will disable our WPP processing ++//#define RPI_EXTRA_BIT_THREADS 0 ++#define RPI_EXTRA_BIT_THREADS 2 ++ ++// Number of separate threads/passes in worker ++// 2 and 3 are the currently valid numbers ++// At the moment 3 seems fractionally faster ++//#define RPI_PASSES 2 ++#define RPI_PASSES 3 ++ ++// Print out various usage stats ++#define RPI_TSTATS 0 ++ ++// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form ++#define RPI_COMPRESS_COEFFS 1 ++ ++// Wait for VPU/QPU to finish in worker pass 0 ++// If 0 then the wait is in pass 1 ++// ++// One might expect the better place to wait would be in pass 1 however ++// testing shows that pass 0 produces overall faster decode. ++// Interestingly it is QPU/VPU limited streams that seem to suffer ++// from pass 1 waits, CPU limited ones tend to show a very mild gain. ++// This define exists so it is easy to test this. ++#define RPI_WORKER_WAIT_PASS_0 1 ++ ++// Use ARM emulation of QPU pred ++// These are for debug only as the emulation makes only limited ++// effort to be fast ++#define RPI_QPU_EMU_Y 0 ++#define RPI_QPU_EMU_C 0 ++ ++// Max width & height we are prepared to consider ++// Sand frame shape calc becomes confused with large frames ++// Some buffer alloc also depends on this ++#define HEVC_RPI_MAX_WIDTH 2048 ++#define HEVC_RPI_MAX_HEIGHT 1088 ++ ++ ++// Min CTB size is 16 ++// Max number of CTBs in a frame: (width in 16-pel units) * (height in 16-pel units) ++// The whole product is parenthesised so it stays intact inside larger expressions ++#define HEVC_RPI_MAX_CTBS (((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16)) ++ ++/** ++ * Value of the luma sample at position (x, y) in the 2D array tab. 
++ */ ++#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)]) ++#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)]) ++ ++#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP) ++#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \ ++ (s)->nal_unit_type == HEVC_NAL_BLA_N_LP) ++#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23) ++ ++enum RPSType { ++ ST_CURR_BEF = 0, ++ ST_CURR_AFT, ++ ST_FOLL, ++ LT_CURR, ++ LT_FOLL, ++ NB_RPS_TYPE, ++}; ++ ++enum SyntaxElement { ++ SAO_MERGE_FLAG = 0, ++ SAO_TYPE_IDX, ++ SAO_EO_CLASS, ++ SAO_BAND_POSITION, ++ SAO_OFFSET_ABS, ++ SAO_OFFSET_SIGN, ++ END_OF_SLICE_FLAG, ++ SPLIT_CODING_UNIT_FLAG, ++ CU_TRANSQUANT_BYPASS_FLAG, ++ SKIP_FLAG, ++ CU_QP_DELTA, ++ PRED_MODE_FLAG, ++ PART_MODE, ++ PCM_FLAG, ++ PREV_INTRA_LUMA_PRED_FLAG, ++ MPM_IDX, ++ REM_INTRA_LUMA_PRED_MODE, ++ INTRA_CHROMA_PRED_MODE, ++ MERGE_FLAG, ++ MERGE_IDX, ++ INTER_PRED_IDC, ++ REF_IDX_L0, ++ REF_IDX_L1, ++ ABS_MVD_GREATER0_FLAG, ++ ABS_MVD_GREATER1_FLAG, ++ ABS_MVD_MINUS2, ++ MVD_SIGN_FLAG, ++ MVP_LX_FLAG, ++ NO_RESIDUAL_DATA_FLAG, ++ SPLIT_TRANSFORM_FLAG, ++ CBF_LUMA, ++ CBF_CB_CR, ++ TRANSFORM_SKIP_FLAG, ++ EXPLICIT_RDPCM_FLAG, ++ EXPLICIT_RDPCM_DIR_FLAG, ++ LAST_SIGNIFICANT_COEFF_X_PREFIX, ++ LAST_SIGNIFICANT_COEFF_Y_PREFIX, ++ LAST_SIGNIFICANT_COEFF_X_SUFFIX, ++ LAST_SIGNIFICANT_COEFF_Y_SUFFIX, ++ SIGNIFICANT_COEFF_GROUP_FLAG, ++ SIGNIFICANT_COEFF_FLAG, ++ COEFF_ABS_LEVEL_GREATER1_FLAG, ++ COEFF_ABS_LEVEL_GREATER2_FLAG, ++ COEFF_ABS_LEVEL_REMAINING, ++ COEFF_SIGN_FLAG, ++ LOG2_RES_SCALE_ABS, ++ RES_SCALE_SIGN_FLAG, ++ CU_CHROMA_QP_OFFSET_FLAG, ++ CU_CHROMA_QP_OFFSET_IDX, ++}; ++ ++enum PartMode { ++ PART_2Nx2N = 0, ++ PART_2NxN = 1, ++ PART_Nx2N = 2, ++ PART_NxN = 3, ++ PART_2NxnU = 4, ++ PART_2NxnD = 5, ++ PART_nLx2N = 6, ++ PART_nRx2N = 7, ++}; ++ ++enum PredMode { ++ MODE_INTER = 0, ++ MODE_INTRA, ++ MODE_SKIP, 
++}; ++ ++enum InterPredIdc { ++ PRED_L0 = 0, ++ PRED_L1, ++ PRED_BI, ++}; ++ ++enum PredFlag { ++ PF_INTRA = 0, ++ PF_L0, ++ PF_L1, ++ PF_BI, ++}; ++ ++enum SAOType { ++ SAO_NOT_APPLIED = 0, ++ SAO_BAND, ++ SAO_EDGE, ++ SAO_APPLIED ++}; ++ ++enum SAOEOClass { ++ SAO_EO_HORIZ = 0, ++ SAO_EO_VERT, ++ SAO_EO_135D, ++ SAO_EO_45D, ++}; ++ ++enum ScanType { ++ SCAN_DIAG = 0, ++ SCAN_HORIZ, ++ SCAN_VERT, ++}; ++ ++typedef struct RefPicList { ++ struct HEVCRpiFrame *ref[HEVC_MAX_REFS]; ++ int list[HEVC_MAX_REFS]; ++ uint8_t isLongTerm[HEVC_MAX_REFS]; ++ int nb_refs; ++} RefPicList; ++ ++typedef struct RefPicListTab { ++ RefPicList refPicList[2]; ++} RefPicListTab; ++ ++typedef struct RpiCodingUnit { ++ unsigned int x; // Passed to deblock ++ unsigned int y; ++ unsigned int x_split; ++ unsigned int y_split; ++ ++ enum PredMode pred_mode; ///< PredMode ++ enum PartMode part_mode; ///< PartMode ++ ++ // Inferred parameters ++ uint8_t intra_split_flag; ///< IntraSplitFlag ++ uint8_t max_trafo_depth; ///< MaxTrafoDepth ++ uint8_t cu_transquant_bypass_flag; ++} RpiCodingUnit; ++ ++typedef struct RpiPredictionUnit { ++ uint8_t intra_pred_mode[4]; ++ uint8_t intra_pred_mode_c[4]; ++ uint8_t chroma_mode_c[4]; ++ uint8_t merge_flag; ++} RpiPredictionUnit; ++ ++typedef struct HEVCRpiTransformUnit { ++ int8_t cu_qp_delta; ++ ++ // Inferred parameters; ++ uint8_t intra_pred_mode; ++ uint8_t intra_pred_mode_c; ++ uint8_t chroma_mode_c; ++ uint8_t is_cu_qp_delta_wanted; ++ uint8_t cu_chroma_qp_offset_wanted; ++ const int8_t * qp_divmod6[3]; ++} HEVCRpiTransformUnit; ++ ++typedef struct DBParams { ++ int8_t beta_offset; // -12 to +12 ++ int8_t tc_offset; // -12 to +12 ++} DBParams; ++ ++#define HEVC_FRAME_FLAG_OUTPUT (1 << 0) ++#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1) ++#define HEVC_FRAME_FLAG_LONG_REF (1 << 2) ++#define HEVC_FRAME_FLAG_BUMPING (1 << 3) ++ ++struct HEVCRpiJob; ++ ++typedef struct HEVCRpiFrame { ++ AVFrame *frame; ++ ThreadFrame tf; ++ ColMvField *col_mvf; ++ int poc; 
++ struct HEVCRpiFrame *collocated_ref; ++ ++ AVBufferRef *col_mvf_buf; ++ ++ /** ++ * A sequence counter, so that old frames are output first ++ * after a POC reset ++ */ ++ uint16_t sequence; ++ ++ /** ++ * A combination of HEVC_FRAME_FLAG_* ++ */ ++ uint8_t flags; ++ ++ // Entry no in DPB - can be used as a small unique ++ // frame identifier (within the current thread) ++ uint8_t dpb_no; ++} HEVCRpiFrame; ++ ++typedef struct HEVCRpiLocalContext { ++ HEVCRpiTransformUnit tu; ++ ++ CABACContext cc; ++ ++ // Vars that allow us to locate everything from just an lc ++ struct HEVCRpiContext * context; // ??? make const ??? ++ unsigned int lc_n; // lc list el no ++ ++ // Job wait links ++ struct HEVCRpiLocalContext * jw_next; ++ struct HEVCRpiLocalContext * jw_prev; ++ struct HEVCRpiLocalContext * ljw_next; ++ struct HEVCRpiLocalContext * ljw_prev; ++ struct HEVCRpiJob * volatile jw_job; ++ sem_t jw_sem; ++ ++ // ?? Wrap in structure ?? ++ sem_t bt_sem_in; ++ sem_t * bt_psem_out; ++ volatile int bt_terminate; ++ unsigned int ts; ++ unsigned int bt_last_line; // Last line in this bit_thread chunk ++ unsigned int bt_line_no; ++ unsigned int bt_line_width; ++ unsigned int bt_line_inc; ++ ++ struct HEVCRpiJob * jb0; ++ char unit_done; // Set once we have dealt with this slice ++ char bt_is_tile; ++ char last_progress_good; ++ char cabac_init_req; ++ ++ uint8_t cabac_state[HEVC_CONTEXTS]; ++ uint8_t stat_coeff[4]; ++ GetBitContext gb; ++ ++ uint8_t ct_depth; ++ int8_t qp_y; ++ int8_t curr_qp_y; ++ int8_t qPy_pred; ++ ++// N.B. 
Used by asm (neon) - do not change ++#define AVAIL_S_UR 0 ++#define AVAIL_S_U 1 ++#define AVAIL_S_UL 2 ++#define AVAIL_S_L 3 ++#define AVAIL_S_DL 4 ++ ++#define AVAIL_U (1 << AVAIL_S_U) ++#define AVAIL_L (1 << AVAIL_S_L) ++#define AVAIL_UL (1 << AVAIL_S_UL) ++#define AVAIL_UR (1 << AVAIL_S_UR) ++#define AVAIL_DL (1 << AVAIL_S_DL) ++ ++// Intra filters - same number space as avail ++#define FILTER_LIGHT 0x40 ++#define FILTER_STRONG 0x80 ++#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG) ++ ++ uint8_t ctb_avail; ++ int end_of_ctb_x; ++ int end_of_ctb_y; ++ ++ RpiCodingUnit cu; ++ RpiPredictionUnit pu; ++ ++#define BOUNDARY_LEFT_SLICE (1 << 0) ++#define BOUNDARY_LEFT_TILE (1 << 1) ++#define BOUNDARY_UPPER_SLICE (1 << 2) ++#define BOUNDARY_UPPER_TILE (1 << 3) ++ /* properties of the boundary of the current CTB for the purposes ++ * of the deblocking filter */ ++ unsigned int boundary_flags; ++ ++#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE) ++ uint8_t ipm_left[IPM_TAB_SIZE]; ++ uint8_t ipm_up[IPM_TAB_SIZE]; ++ ++//#define MVF_STASH_WIDTH 128 ++#define MVF_STASH_WIDTH 64 ++#define MVF_STASH_HEIGHT 64 ++#define MVF_STASH_WIDTH_PU (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE) ++#define MVF_STASH_HEIGHT_PU (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE) ++ HEVCRpiMvField mvf_ul[1]; ++ HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU]; ++ ++ /* +7 is for subpixel interpolation, *2 for high bit depths */ ++// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; ++ /* The extended size between the new edge emu buffer is abused by SAO */ ++// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2]; ++// DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]); ++ ++} HEVCRpiLocalContext; ++ ++// Each block can have an intra prediction and an add_residual command ++// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH ++ ++// Sand only has 2 planes 
(Y/C) ++#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4)) ++ ++// Command for intra prediction and transform_add of predictions to coefficients ++enum rpi_pred_cmd_e ++{ ++ RPI_PRED_ADD_RESIDUAL, ++ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_ADD_RESIDUAL + c_idx ++ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_ADD_RESIDUAL + c_idx ++ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V ++ RPI_PRED_ADD_DC, ++ RPI_PRED_ADD_DC_U, // Both U & V are effectively C ++ RPI_PRED_ADD_DC_V, ++ RPI_PRED_INTRA, ++ RPI_PRED_INTRA_C, ++ RPI_PRED_I_PCM, ++ RPI_PRED_CMD_MAX ++}; ++ ++typedef struct HEVCPredCmd { ++ uint8_t type; ++ uint8_t size; // log2 "size" used by all variants ++ uint8_t avail; // i_pred - but left here as they pack well ++ uint8_t dummy; ++ union { ++ struct { // TRANSFORM_ADD ++ uint8_t * dst; ++ const int16_t * buf; ++ uint16_t stride; // Should be good enough for all pic fmts we use ++ int16_t dc; ++ } ta; ++ struct { ++ uint8_t * dst; ++ uint32_t stride; ++ int dc; ++ } dc; ++ struct { // INTRA ++ uint16_t x; ++ uint16_t y; ++ enum IntraPredMode mode; ++ } i_pred; ++ struct { // I_PCM ++ uint16_t x; ++ uint16_t y; ++ const void * src; ++ uint32_t src_len; ++ } i_pcm; ++ }; ++} HEVCPredCmd; ++ ++// Opaque forward declarations for the QPU command types pointed to below ++union qpu_mc_pred_cmd_u; ++struct qpu_mc_pred_y_p_s; ++struct qpu_mc_src_s; ++ ++typedef struct HEVCRpiInterPredQ ++{ ++ union qpu_mc_pred_cmd_u *qpu_mc_base; ++ union qpu_mc_pred_cmd_u *qpu_mc_curr; ++ struct qpu_mc_src_s *last_l0; ++ struct qpu_mc_src_s *last_l1; ++ unsigned int load; ++ uint32_t code_setup; ++ uint32_t code_sync; ++ uint32_t code_exit; ++} HEVCRpiInterPredQ; ++ ++typedef struct HEVCRpiInterPredEnv ++{ ++ HEVCRpiInterPredQ * q; ++ uint8_t n; // Number of Qs ++ uint8_t n_grp; // Number of Q in a group ++ uint8_t curr; // Current Q number (0..n-1) ++ uint8_t used; // 0 if nothing in any Q, 1 otherwise ++ uint8_t used_grp; // 0 if nothing in any Q in the current group ++ unsigned int max_fill; ++ unsigned int min_gap; ++ GPU_MEM_PTR_T gptr; ++} 
HEVCRpiInterPredEnv; ++ ++typedef struct HEVCRpiIntraPredEnv { ++ unsigned int n; // Number of commands ++ HEVCPredCmd * cmds; ++} HEVCRpiIntraPredEnv; ++ ++typedef struct HEVCRpiCoeffEnv { ++ unsigned int n; ++#if RPI_COMPRESS_COEFFS ++ unsigned int packed; // Equal to 1 if coefficients should be being packed ++ unsigned int packed_n; // Value of n when packed was set equal to 0 (i.e. the amount that is sent compressed). Only valid if packed==0 ++#endif ++ int16_t * buf; ++} HEVCRpiCoeffEnv; ++ ++typedef struct HEVCRpiCoeffsEnv { ++ HEVCRpiCoeffEnv s[4]; ++ GPU_MEM_PTR_T gptr; ++ void * mptr; ++} HEVCRpiCoeffsEnv; ++ ++typedef struct HEVCRpiFrameProgressWait { ++ int req; ++ struct HEVCRpiFrameProgressWait * next; ++ sem_t sem; ++} HEVCRpiFrameProgressWait; ++ ++typedef struct HEVCRpiFrameProgressState { ++ struct HEVCRpiFrameProgressWait * first; ++ struct HEVCRpiFrameProgressWait * last; ++ pthread_mutex_t lock; ++} HEVCRpiFrameProgressState; ++ ++typedef struct RpiBlk ++{ ++ unsigned int x; ++ unsigned int y; ++ unsigned int w; ++ unsigned int h; ++} RpiBlk; ++ ++typedef struct HEVCRpiJob { ++ struct HEVCRpiJob * next; // Free chain ++ struct HEVCRpiJobCtl * jbc_local; ++ const HEVCRpiSPS * sps; // sps used to set up this job ++ ++ int waited; ++ int ctu_ts_first; ++ int ctu_ts_last; ++ RpiBlk bounds; // Bounding box of job ++ ++ struct qpu_mc_pred_y_p_s * last_y8_p; ++ struct qpu_mc_src_s * last_y8_l1; ++ rpi_cache_flush_env_t * rfe; ++ ++ HEVCRpiInterPredEnv chroma_ip; ++ HEVCRpiInterPredEnv luma_ip; ++ int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no ++ HEVCRpiIntraPredEnv intra; ++ HEVCRpiCoeffsEnv coeffs; ++ HEVCRpiFrameProgressWait progress_wait; ++ sem_t sem; ++ rpi_cache_buf_t flush_buf; ++} HEVCRpiJob; ++ ++struct HEVCRpiContext; ++ ++typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb); ++ ++typedef struct HEVCRpiPassQueue ++{ ++// int pending; ++ volatile int terminate; ++ sem_t sem_in; ++ sem_t * psem_out; 
++ unsigned int job_n; ++ struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread ++ HEVCRpiWorkerFn * worker; ++ pthread_t thread; ++ uint8_t pass_n; // Pass number - debug ++ uint8_t started; ++} HEVCRpiPassQueue; ++ ++ ++struct HEVCRpiJobGlobal; ++ ++typedef struct HEVCRpiJobCtl ++{ ++ sem_t sem_out; ++ ++ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated ++ struct HEVCRpiJobGlobal * jbg; ++ ++ HEVCRpiLocalContext * lcw_head; ++ HEVCRpiLocalContext * lcw_tail; ++ ++ pthread_mutex_t in_lock; ++ int offload_in; ++ ++ HEVCRpiJob *offloadq[RPI_MAX_JOBS]; ++} HEVCRpiJobCtl; ++ ++ ++typedef struct HEVCRpiJobGlobal ++{ ++ intptr_t ref_count; ++ pthread_mutex_t lock; ++ HEVCRpiJob * free1; // Singly linked list of free jobs ++ HEVCRpiLocalContext * wait_head; // Double linked list of lcs waiting for a job ++ HEVCRpiLocalContext * wait_good; // Last good tail ++ HEVCRpiLocalContext * wait_tail; ++ ++} HEVCRpiJobGlobal; ++ ++#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1) ++ ++#if RPI_TSTATS ++typedef struct HEVCRpiStats { ++ int y_pred1_y8_merge; ++ int y_pred1_xy; ++ int y_pred1_x0; ++ int y_pred1_y0; ++ int y_pred1_x0y0; ++ int y_pred1_wle8; ++ int y_pred1_wgt8; ++ int y_pred1_hle16; ++ int y_pred1_hgt16; ++ int y_pred2_xy; ++ int y_pred2_x0; ++ int y_pred2_y0; ++ int y_pred2_x0y0; ++ int y_pred2_hle16; ++ int y_pred2_hgt16; ++} HEVCRpiStats; ++#endif ++ ++typedef struct HEVCRpiCabacState ++{ ++ uint8_t rice[4]; ++ uint8_t state[HEVC_CONTEXTS]; ++} HEVCRpiCabacState; ++ ++#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels ++#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1) ++#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte ++#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el ++#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + 
HEVC_RPI_BS_ELS_PER_BYTE_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) ++#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row ++#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) ++ ++typedef struct HEVCRpiContext { ++ const AVClass *c; // needed by private avoptions ++ AVCodecContext *avctx; ++ ++ uint8_t threads_type; ++ char qpu_init_ok; ++ ++ /** 1 if the independent slice segment header was successfully parsed */ ++ uint8_t slice_initialized; ++ char used_for_ref; // rpi ++ char is_irap; ++ char offload_recon; ++ uint8_t eos; ///< current packet contains an EOS/EOB NAL ++ uint8_t last_eos; ///< last packet contains an EOS/EOB NAL ++ uint8_t no_backward_pred_flag; ++ uint8_t is_decoded; ++ uint8_t no_rasl_output_flag; ++ ++ ++ /** ++ * Sequence counters for decoded and output frames, so that old ++ * frames are output first after a POC reset ++ */ ++ uint16_t seq_decode; ++ uint16_t seq_output; ++ ++ int width; ++ int height; ++ ++ HEVCRpiJobCtl * jbc; ++ // cabac stash ++ // b0 skip flag ++ // b1+ ct_depth ++ uint8_t * cabac_stash_left; ++ uint8_t * cabac_stash_up; ++ ++ // Function pointers ++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C ++ const uint8_t * qpu_dummy_frame_emu; ++#endif ++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C ++ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory ++#endif ++ HEVCRpiQpu qpu; ++ ++ HEVCRpiFrameProgressState progress_states[2]; ++ ++ HEVCRpiCabacState *cabac_save; ++ ++ AVFrame *frame; ++ AVFrame *output_frame; ++ uint8_t *sao_pixel_buffer_h[3]; ++ uint8_t *sao_pixel_buffer_v[3]; ++ ++ unsigned int col_mvf_stride; ++ AVBufferPool *col_mvf_pool; ++ ++ RpiSAOParams *sao; ++ DBParams *deblock; ++ enum HEVCNALUnitType nal_unit_type; ++ int temporal_id; ///< temporal_id_plus1 - 1 ++ HEVCRpiFrame *ref; ++ int poc; ++ int pocTid0; ++ int slice_idx; ///< 
number of the slice being currently decoded ++ int max_ra; ++ ++ int8_t *qp_y_tab; ++ ++ // Deblocking block strength bitmaps ++ unsigned int bs_stride2; ++ unsigned int bs_size; ++ uint8_t *bs_horizontal; ++ uint8_t *bs_vertical; ++ uint8_t *bsf_stash_up; ++ uint8_t *bsf_stash_left; ++ ++ // TAB_SLICE_ADDR_BROKEN is an all-ones value with the same type as a ++ // tab_slice_address element. For the 16-bit case the cast is applied ++ // after the complement: ~(uint16_t)0 would be promoted to int and ++ // evaluate to -1, which never compares equal to a stored uint16_t. ++#if HEVC_RPI_MAX_CTBS >= 0xffff ++#define TAB_SLICE_ADDR_BROKEN (~(uint32_t)0) ++ uint32_t *tab_slice_address; ++#else ++#define TAB_SLICE_ADDR_BROKEN ((uint16_t)~0U) ++ uint16_t *tab_slice_address; ++#endif ++ ++ // Bitfield 1 bit per 8 pels (min pcm size) ++ uint8_t *is_pcm; ++ // Bitfield 1 bit per 8 pels (min cb size) ++ // Only needed for CIP as CIP processing is async to the main thread ++ uint8_t *is_intra; ++ ++ // PU ++ HEVCRpiMvField *mvf_up; ++ HEVCRpiMvField *mvf_left; ++ ++ const RefPicList **rpl_up; ++ const RefPicList **rpl_left; ++ RefPicList * refPicList; ++ ++ // CTB-level flags affecting loop filter operation ++ uint8_t *filter_slice_edges; ++ ++ /** used on BE to byteswap the lines for checksumming */ ++ uint8_t *checksum_buf; ++ int checksum_buf_size; ++ ++ const uint8_t *data; ++ ++ H2645Packet pkt; ++ // type of the first VCL NAL of the current frame ++ enum HEVCNALUnitType first_nal_type; ++ ++ uint8_t context_initialized; ++ int is_nalff; ///< this flag is != 0 if bitstream is encapsulated ++ ///< as a format defined in 14496-15 ++ int apply_defdispwin; ++ ++ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4) ++ int nuh_layer_id; ++ ++ struct AVMD5 *md5_ctx; ++ ++ RefPicListTab * rpl_tab; ++ unsigned int rpl_tab_size; ++ ++ uint8_t *is_intra_store; ++ ++ RpiSliceHeader sh; ++ ++ HEVCRpiParamSets ps; ++ ++ HEVCRpiLocalContext *HEVClc; ++ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS]; ++ ++ HEVCRpiFrame DPB[HEVC_DPB_ELS]; ++ ++ ///< candidate references for the current frame ++ RefPicList rps[5]; ++ ++ HEVCRpiPredContext hpc; ++ HEVCDSPContext hevcdsp; ++ ++ HEVCSEIContext sei; ++ ++ // Put structures that allocate non-trivial storage at 
the end ++ // These are mostly used indirectly so position in the structure doesn't matter ++ HEVCRpiPassQueue passq[RPI_PASSES]; ++#if RPI_EXTRA_BIT_THREADS > 0 ++ int bt_started; ++ // This simply contains thread descriptors - task setup is held elsewhere ++ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS]; ++#endif ++#if RPI_TSTATS ++ HEVCRpiStats tstats; ++#endif ++} HEVCRpiContext; ++ ++/** ++ * Mark all frames in DPB as unused for reference. ++ */ ++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s); ++ ++/** ++ * Drop all frames currently in DPB. ++ */ ++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s); ++ ++/** ++ * Construct the reference picture sets for the current frame. ++ */ ++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s); ++ ++/** ++ * Construct the reference picture list(s) for the current slice. ++ */ ++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s); ++ ++ ++/** ++ * Get the number of candidate references for the current frame. ++ */ ++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s); ++ ++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc); ++ ++/** ++ * Find next frame in output order and put a reference to it in frame. 
++ * @return 1 if a frame was output, 0 otherwise ++ */ ++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush); ++ ++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s); ++ ++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags); ++ ++unsigned int ff_hevc_rpi_tb_avail_flags( ++ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h); ++ ++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW, ++ int nPbH, int log2_cb_size, int part_idx, ++ int merge_idx, HEVCRpiMvField * const mv); ++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int nPbW, const unsigned int nPbH, ++ const unsigned int avail, ++ HEVCRpiMvField * const mv, ++ const unsigned int mvp_lx_flag, const unsigned int LX); ++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase); ++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int log2_trafo_size, const int is_coded_block); ++int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot); ++ ++extern const uint8_t ff_hevc_rpi_qpel_extra_before[4]; ++extern const uint8_t ff_hevc_rpi_qpel_extra_after[4]; ++extern const uint8_t ff_hevc_rpi_qpel_extra[4]; ++ ++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n); ++ ++// arm/hevc_misc_neon.S ++// Neon coeff zap fn ++#if HAVE_NEON ++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2); ++#endif ++ ++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const 
HEVCRpiFrame * const ref, const int val, const int field); ++ ++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field); ++ ++// All of these expect that s->threads_type == FF_THREAD_FRAME ++ ++static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const HEVCRpiFrame * const ref, const int y) ++{ ++ if (s->threads_type != 0) ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1); ++} ++ ++static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y) ++{ ++ if (s->used_for_ref && s->threads_type != 0) ++ ff_hevc_rpi_progress_signal_field(s, y, 1); ++} ++ ++static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb, ++ const HEVCRpiFrame * const ref, const int y) ++{ ++ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0); ++} ++ ++static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y) ++{ ++ if (s->used_for_ref && s->threads_type != 0) ++ { ++ ff_hevc_rpi_progress_signal_field(s, y, 0); ++ } ++} ++ ++static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s) ++{ ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0); ++ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1); ++} ++ ++ ++// Set all done - signal nothing (used in missing refs) ++// Works for both rpi & non-rpi ++static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref) ++{ ++ if (ref->tf.progress != NULL) ++ { ++ int * const p = (int *)ref->tf.progress->data; ++ p[0] = INT_MAX; ++ p[1] = INT_MAX; ++ } ++} ++ ++#define HEVC_RPI_420_ONLY 1 ++#define HEVC_RPI_SAND128_ONLY 1 ++ ++static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx) ++{ ++#if HEVC_RPI_420_ONLY ++ return cidx == 0 ? 
0 : 1; ++#else ++ return s->ps.sps->hshift[cidx]; ++#endif ++} ++ ++static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx) ++{ ++#if HEVC_RPI_420_ONLY ++ return cidx == 0 ? 0 : 1; ++#else ++ return s->ps.sps->vshift[cidx]; ++#endif ++} ++ ++static inline int ctx_cfmt(const HEVCRpiContext * const s) ++{ ++#if HEVC_RPI_420_ONLY ++ return 1; ++#else ++ return s->ps.sps->chroma_format_idc; ++#endif ++} ++ ++static inline int frame_stride1(const AVFrame * const frame, const int c_idx) ++{ ++#if HEVC_RPI_SAND128_ONLY ++ return 128; ++#else ++ return frame->linesize[c_idx]; ++#endif ++} ++ ++#if HEVC_RPI_SAND128_ONLY ++// Propagate this decision to later zc includes ++#define RPI_ZC_SAND128_ONLY 1 ++#endif ++ ++#ifndef ff_hevc_rpi_copy_vert ++static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src, ++ int pixel_shift, int height, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src) ++{ ++ int i; ++ switch (pixel_shift) ++ { ++ case 2: ++ for (i = 0; i < height; i++) { ++ *(uint32_t *)dst = *(uint32_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ case 1: ++ for (i = 0; i < height; i++) { ++ *(uint16_t *)dst = *(uint16_t *)src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ default: ++ for (i = 0; i < height; i++) { ++ *dst = *src; ++ dst += stride_dst; ++ src += stride_src; ++ } ++ break; ++ } ++} ++#endif ++ ++ ++#if MVF_STASH_WIDTH == 64 ++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)); ++} ++ ++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const 
unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ const unsigned int x0_ctb = x0 & mask_cs_hi; ++ const unsigned int y0_ctb = y0 & mask_cs_hi; ++ ++ return (HEVCRpiMvField *)((y < y0_ctb) ? ++ (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) : ++ (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) : ++ lc->mvf_stash + ++ ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ++ ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE))); ++} ++ ++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, ++ const unsigned int x0, ++ const unsigned int x) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ const unsigned int x0_ctb = x0 & mask_cs_hi; ++ return x < x0_ctb ? 1 : MVF_STASH_WIDTH_PU; ++} ++ ++#else ++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1))); ++} ++ ++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc, ++ const unsigned int x0, const unsigned int y0, ++ const unsigned int x, const unsigned int y) ++{ ++ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size); ++ ++ const unsigned int x0_ctb = x0 & mask_cs_hi; ++ const unsigned int y0_ctb = y0 & mask_cs_hi; ++ ++ // If not in the same CTB for Y assume up ++ if (y < y0_ctb) { ++ // If not in the same CTB for X too assume up-left ++ return (HEVCRpiMvField *)(x < x0_ctb ? 
lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)); ++ } ++ return mvf_stash_ptr(s, lc, x, y); ++} ++ ++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s, ++ const unsigned int x0, ++ const unsigned int x) ++{ ++ return MVF_STASH_WIDTH_PU; ++} ++#endif ++ ++#endif /* AVCODEC_RPI_HEVCDEC_H */ +diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c +new file mode 100644 +index 0000000000..87f3cc9d14 +--- /dev/null ++++ b/libavcodec/rpi_hevcdsp.c +@@ -0,0 +1,450 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere ++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "rpi_hevcdsp.h" ++#include "rpi_hevc_mv.h" ++ ++static const int8_t transform[32][32] = { ++ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, ++ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, ++ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4, ++ -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 }, ++ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90, ++ -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 }, ++ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13, ++ 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 }, ++ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89, ++ 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 }, ++ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22, ++ -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 }, ++ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87, ++ -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 }, ++ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31, ++ 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 }, ++ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83, ++ 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 }, ++ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38, ++ -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 }, ++ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80, ++ -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 }, ++ { 78, 
-4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46, ++ 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 }, ++ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75, ++ 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 }, ++ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54, ++ -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 }, ++ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70, ++ -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 }, ++ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61, ++ 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 }, ++ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, ++ 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 }, ++ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67, ++ -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 }, ++ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57, ++ -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 }, ++ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73, ++ 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 }, ++ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50, ++ 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 }, ++ { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78, ++ -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 }, ++ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43, ++ -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 }, ++ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82, ++ 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 }, ++ { 36, -83, 83, -36, -36, 83, 
-83, 36, 36, -83, 83, -36, -36, 83, -83, 36, ++ 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 }, ++ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85, ++ -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 }, ++ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25, ++ -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 }, ++ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88, ++ 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 }, ++ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18, ++ 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 }, ++ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90, ++ -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 }, ++ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9, ++ -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 }, ++ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90, ++ 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 }, ++}; ++ ++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = { ++ { -2, 58, 10, -2}, ++ { -4, 54, 16, -2}, ++ { -6, 46, 28, -4}, ++ { -4, 36, 36, -4}, ++ { -4, 28, 46, -6}, ++ { -2, 16, 54, -4}, ++ { -2, 10, 58, -2}, ++}; ++ ++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = { ++ { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0}, ++ { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1}, ++ { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1} ++}; ++ ++#define BIT_DEPTH 8 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "rpi_hevcdsp_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "rpi_hevcdsp_template.c" 
++#undef BIT_DEPTH ++ ++static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh, ++ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1, ++ int in_inc0, int in_inc1) ++{ ++ int shift = 32; ++ uint32_t bs = 0; ++ for (; pus > 0; pus--) { ++ int strength, out; ++ int curr_refL0 = curr_rpl0[curr->ref_idx[0]]; ++ int curr_refL1 = curr_rpl1[curr->ref_idx[1]]; ++ int nr_idx0 = neigh->ref_idx[0]; ++ int nr_idx1 = neigh->ref_idx[1]; ++ int neigh_refL0 = neigh_rpl0[nr_idx0]; ++ int neigh_refL1 = neigh_rpl1[nr_idx1]; ++ ++ av_assert0(nr_idx0 >= 0 && nr_idx0 <=31); ++ av_assert0(nr_idx1 >= 0 && nr_idx1 <=31); ++ ++#if 1 // This more directly matches the original implementation ++ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) { ++ // same L0 and L1 ++ if (curr_refL0 == neigh_refL0 && ++ curr_refL0 == curr_refL1 && ++ neigh_refL0 == neigh_refL1) { ++ if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || ++ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) && ++ (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || ++ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)) ++ strength = 1; ++ else ++ strength = 0; ++ } else if (neigh_refL0 == curr_refL0 && ++ neigh_refL1 == curr_refL1) { ++ if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 || ++ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) ++ strength = 1; ++ else ++ strength = 0; ++ } else if (neigh_refL1 == curr_refL0 && ++ neigh_refL0 == curr_refL1) { ++ if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 || ++ 
FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4) ++ strength = 1; ++ else ++ strength = 0; ++ } else { ++ strength = 1; ++ } ++ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV ++ MvXY curr_mv0, neigh_mv0; ++ ++ if (curr->pred_flag & 1) { ++ curr_mv0 = curr->xy[0]; ++ } else { ++ curr_mv0 = curr->xy[1]; ++ curr_refL0 = curr_refL1; ++ } ++ ++ if (neigh->pred_flag & 1) { ++ neigh_mv0 = neigh->xy[0]; ++ } else { ++ neigh_mv0 = neigh->xy[1]; ++ neigh_refL0 = neigh_refL1; ++ } ++ ++ if (curr_refL0 == neigh_refL0) { ++ if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4) ++ strength = 1; ++ else ++ strength = 0; ++ } else ++ strength = 1; ++ } else ++ strength = 1; ++#else // This has exactly the same effect, but is more suitable for vectorisation ++ MvXY curr_mv[2]; ++ MvXY neigh_mv[2]; ++ memcpy(curr_mv, curr->xy, sizeof curr_mv); ++ memcpy(neigh_mv, neigh->xy, sizeof neigh_mv); ++ ++ if (!(curr->pred_flag & 2)) { ++ curr_mv[1] = curr_mv[0]; ++ curr_refL1 = curr_refL0; ++ } ++ if (!(neigh->pred_flag & 2)) { ++ neigh_mv[1] = neigh_mv[0]; ++ neigh_refL1 = neigh_refL0; ++ } ++ if (!(curr->pred_flag & 1)) { ++ curr_mv[0] = curr_mv[1]; ++ curr_refL0 = curr_refL1; ++ } ++ if (!(neigh->pred_flag & 1)) { ++ neigh_mv[0] = neigh_mv[1]; ++ neigh_refL0 = neigh_refL1; ++ } ++ ++ strength = 1; ++ ++ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) | ++ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) | ++ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4); ++ ++ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) | ++ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) | ++ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - 
MV_Y(curr_mv[1])) >= 4); ++ ++ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2); ++#endif ++ ++ curr += in_inc0 / sizeof (HEVCRpiMvField); ++ neigh += in_inc1 / sizeof (HEVCRpiMvField); ++ ++ for (out = dup; out > 0; out--) ++ { ++ bs = (bs >> 2) | (strength << 30); ++ shift -= 2; ++ } ++ } ++ return bs >> shift; ++} ++ ++ ++static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height) ++{ ++ unsigned int i, j; ++ ++ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) { ++ for (i = 0; i < height; i++) { ++ for (j = 0; j < width; j+=8) ++ AV_COPY64U(dst+j, src+j); ++ dst += stride_dst; ++ src += stride_src; ++ } ++ } else { ++ for (i = 0; i < height; i++) { ++ for (j = 0; j < width; j+=16) ++ AV_COPY128(dst+j, src+j); ++ dst += stride_dst; ++ src += stride_src; ++ } ++ } ++} ++ ++ ++ ++void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth) ++{ ++#undef FUNC ++#define FUNC(a, depth) a ## _ ## depth ++ ++#undef PEL_FUNC ++#define PEL_FUNC(dst1, idx1, idx2, a, depth) \ ++ for(i = 0 ; i < 10 ; i++) \ ++{ \ ++ hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \ ++} ++ ++#undef EPEL_FUNCS ++#define EPEL_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \ ++ PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \ ++ PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth) ++ ++#undef EPEL_UNI_FUNCS ++#define EPEL_UNI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \ ++ PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth); \ ++ PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \ ++ 
PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth); \ ++ PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth) ++ ++#undef EPEL_BI_FUNCS ++#define EPEL_BI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \ ++ PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \ ++ PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \ ++ PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth) ++ ++#undef QPEL_FUNCS ++#define QPEL_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \ ++ PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \ ++ PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth) ++ ++#undef QPEL_UNI_FUNCS ++#define QPEL_UNI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth) ++ ++#undef QPEL_BI_FUNCS ++#define QPEL_BI_FUNCS(depth) \ ++ PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \ 
++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \ ++ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth) ++ ++#define SLICED_ADD_RESIDUAL(depth)\ ++ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \ ++ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \ ++ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \ ++ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \ ++ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \ ++ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \ ++ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \ ++ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \ ++ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \ ++ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \ ++ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \ ++ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \ ++ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \ ++ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \ ++ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth) ++#define SLICED_LOOP_FILTERS(depth)\ ++ hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \ ++ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \ ++ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \ ++ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth) ++#define SLICED_SAO(depth)\ ++ for (i = 0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \ ++ hevcdsp->sao_edge_filter_c[i] = 
FUNC(sao_edge_filter_c, depth); \ ++ } \ ++ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \ ++ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth) ++ ++#define HEVC_DSP(depth) \ ++ hevcdsp->put_pcm = FUNC(put_pcm, depth); \ ++ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \ ++ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \ ++ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \ ++ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \ ++ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \ ++ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \ ++ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \ ++ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \ ++ SLICED_ADD_RESIDUAL(depth); \ ++ hevcdsp->dequant = FUNC(dequant, depth); \ ++ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \ ++ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \ ++ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \ ++ hevcdsp->idct[1] = FUNC(idct_8x8, depth); \ ++ hevcdsp->idct[2] = FUNC(idct_16x16, depth); \ ++ hevcdsp->idct[3] = FUNC(idct_32x32, depth); \ ++ \ ++ hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \ ++ hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \ ++ hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \ ++ hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \ ++ \ ++ for (i = 0; i != SAO_FILTER_N; ++i) { \ ++ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \ ++ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \ ++ } \ ++ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \ ++ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \ ++ SLICED_SAO(depth); \ ++ \ ++ QPEL_FUNCS(depth); \ ++ QPEL_UNI_FUNCS(depth); \ ++ QPEL_BI_FUNCS(depth); \ ++ EPEL_FUNCS(depth); \ ++ EPEL_UNI_FUNCS(depth); \ ++ EPEL_BI_FUNCS(depth); \ ++ \ ++ SLICED_LOOP_FILTERS(depth); \ ++ hevcdsp->hevc_h_loop_filter_luma = 
FUNC(hevc_h_loop_filter_luma, depth); \ ++ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \ ++ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \ ++ hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth); \ ++ hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth); \ ++ hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth); \ ++ hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \ ++ hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth) ++int i = 0; ++ ++ switch (bit_depth) { ++ case 9: ++ HEVC_DSP(9); ++ break; ++ case 10: ++ HEVC_DSP(10); ++ break; ++ case 12: ++ HEVC_DSP(12); ++ break; ++ default: ++ HEVC_DSP(8); ++ break; ++ } ++ ++ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths; ++ hevcdsp->cpy_blk = cpy_blk; ++ ++ if (ARCH_PPC) ++ ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth); ++ if (ARCH_X86) ++ ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth); ++ if (ARCH_ARM) ++ ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth); ++ if (ARCH_MIPS) ++ ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth); ++} +diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h +new file mode 100644 +index 0000000000..5a7cdeeb66 +--- /dev/null ++++ b/libavcodec/rpi_hevcdsp.h +@@ -0,0 +1,177 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere ++ * ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. 
++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVCDSP_H ++#define AVCODEC_RPI_HEVCDSP_H ++ ++#include "hevc.h" ++#include "get_bits.h" ++ ++struct HEVCRpiMvField; ++ ++#define MAX_PB_SIZE 64 ++ ++#define RPI_HEVC_SAO_BUF_STRIDE 160 ++ ++ ++typedef struct RpiSAOParams { ++ uint8_t band_position[3]; ///< sao_band_position (Y,U,V) ++ uint8_t eo_class[3]; ///< sao_eo_class (Y,U=V) ++ uint8_t type_idx[3]; ///< sao_type_idx (Y,U=V) ++ ++ int16_t offset_val[3][5]; ///> 16; ++ const int dc_u = (dc << 16) >> 16; ++ ++ stride /= sizeof(pixel); ++ ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size * 2; x += 2) { ++ dst[x] = av_clip_pixel(dst[x] + dc_u); ++ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v); ++ } ++ dst += stride; ++ } ++} ++ ++ ++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual)(_dst, res, stride, 4); ++} ++ ++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual)(_dst, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual)(_dst, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual)(_dst, res, stride, 32); ++} ++ ++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 4); ++} ++ ++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int 
dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 8); ++} ++ ++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc) ++{ ++ FUNC(add_residual_dc)(_dst, stride, dc, 32); ++} ++ ++// -- U -- (plaited) ++ ++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) ++{ ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4); ++} ++ ++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) ++{ ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8); ++} ++ ++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) ++{ ++ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16); ++} ++ ++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_u) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++// -- V -- (plaited) ++ ++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4); ++} ++ ++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8); ++} ++ ++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16); ++} ++ ++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride, int dc_v) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++// -- C -- (plaited - both U & V) ++ ++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, 
stride, 4); ++} ++ ++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 8); ++} ++ ++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ FUNC(add_residual_c)(_dst, res, stride, 16); ++} ++ ++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res, ++ ptrdiff_t stride) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 4); ++} ++ ++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 8); ++} ++ ++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ FUNC(add_residual_dc_c)(_dst, stride, dc, 16); ++} ++ ++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc) ++{ ++ // Should never occur for 420, which is all that sand supports ++ av_assert0(0); ++} ++ ++ ++static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode) ++{ ++ int16_t *coeffs = (int16_t *) _coeffs; ++ int x, y; ++ int size = 1 << log2_size; ++ ++ if (mode) { ++ coeffs += size; ++ for (y = 0; y < size - 1; y++) { ++ for (x = 0; x < size; x++) ++ coeffs[x] += coeffs[x - size]; ++ coeffs += size; ++ } ++ } else { ++ for (y = 0; y < size; y++) { ++ for (x = 1; x < size; x++) ++ coeffs[x] += coeffs[x - 1]; ++ coeffs += size; ++ } ++ } ++} ++ ++static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size) ++{ ++ int shift = 15 - BIT_DEPTH - log2_size; ++ int x, y; ++ int size = 1 << log2_size; ++ ++ if (shift > 0) { ++ int offset = 1 << (shift - 1); ++ for (y = 0; y < size; y++) { ++ for (x = 0; x < size; x++) { ++ *coeffs = (*coeffs + offset) >> shift; ++ coeffs++; ++ } ++ } ++ } else { ++ for (y = 0; y < size; 
y++) { ++ for (x = 0; x < size; x++) { ++ *coeffs = *coeffs << -shift; ++ coeffs++; ++ } ++ } ++ } ++} ++ ++#define SET(dst, x) (dst) = (x) ++#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift) ++ ++#define TR_4x4_LUMA(dst, src, step, assign) \ ++ do { \ ++ int c0 = src[0 * step] + src[2 * step]; \ ++ int c1 = src[2 * step] + src[3 * step]; \ ++ int c2 = src[0 * step] - src[3 * step]; \ ++ int c3 = 74 * src[1 * step]; \ ++ \ ++ assign(dst[2 * step], 74 * (src[0 * step] - \ ++ src[2 * step] + \ ++ src[3 * step])); \ ++ assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \ ++ assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \ ++ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \ ++ } while (0) ++ ++static void FUNC(transform_4x4_luma)(int16_t *coeffs) ++{ ++ int i; ++ int shift = 7; ++ int add = 1 << (shift - 1); ++ int16_t *src = coeffs; ++ ++ for (i = 0; i < 4; i++) { ++ TR_4x4_LUMA(src, src, 4, SCALE); ++ src++; ++ } ++ ++ shift = 20 - BIT_DEPTH; ++ add = 1 << (shift - 1); ++ for (i = 0; i < 4; i++) { ++ TR_4x4_LUMA(coeffs, coeffs, 1, SCALE); ++ coeffs += 4; ++ } ++} ++ ++#undef TR_4x4_LUMA ++ ++#define TR_4(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \ ++ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \ ++ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \ ++ const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \ ++ \ ++ assign(dst[0 * dstep], e0 + o0); \ ++ assign(dst[1 * dstep], e1 + o1); \ ++ assign(dst[2 * dstep], e1 - o1); \ ++ assign(dst[3 * dstep], e0 - o0); \ ++ } while (0) ++ ++#define TR_8(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ int i, j; \ ++ int e_8[4]; \ ++ int o_8[4] = { 0 }; \ ++ for (i = 0; i < 4; i++) \ ++ for (j = 1; j < end; j += 2) \ ++ o_8[i] += transform[4 * j][i] * src[j * sstep]; \ ++ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \ ++ \ ++ for (i = 0; i < 4; i++) { \ ++ assign(dst[i * dstep], e_8[i] + o_8[i]); \ ++ assign(dst[(7 - i) * 
dstep], e_8[i] - o_8[i]); \ ++ } \ ++ } while (0) ++ ++#define TR_16(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ int i, j; \ ++ int e_16[8]; \ ++ int o_16[8] = { 0 }; \ ++ for (i = 0; i < 8; i++) \ ++ for (j = 1; j < end; j += 2) \ ++ o_16[i] += transform[2 * j][i] * src[j * sstep]; \ ++ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \ ++ \ ++ for (i = 0; i < 8; i++) { \ ++ assign(dst[i * dstep], e_16[i] + o_16[i]); \ ++ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \ ++ } \ ++ } while (0) ++ ++#define TR_32(dst, src, dstep, sstep, assign, end) \ ++ do { \ ++ int i, j; \ ++ int e_32[16]; \ ++ int o_32[16] = { 0 }; \ ++ for (i = 0; i < 16; i++) \ ++ for (j = 1; j < end; j += 2) \ ++ o_32[i] += transform[j][i] * src[j * sstep]; \ ++ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \ ++ \ ++ for (i = 0; i < 16; i++) { \ ++ assign(dst[i * dstep], e_32[i] + o_32[i]); \ ++ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \ ++ } \ ++ } while (0) ++ ++#define IDCT_VAR4(H) \ ++ int limit2 = FFMIN(col_limit + 4, H) ++#define IDCT_VAR8(H) \ ++ int limit = FFMIN(col_limit, H); \ ++ int limit2 = FFMIN(col_limit + 4, H) ++#define IDCT_VAR16(H) IDCT_VAR8(H) ++#define IDCT_VAR32(H) IDCT_VAR8(H) ++ ++#define IDCT(H) \ ++static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \ ++ int col_limit) \ ++{ \ ++ int i; \ ++ int shift = 7; \ ++ int add = 1 << (shift - 1); \ ++ int16_t *src = coeffs; \ ++ IDCT_VAR ## H(H); \ ++ \ ++ for (i = 0; i < H; i++) { \ ++ TR_ ## H(src, src, H, H, SCALE, limit2); \ ++ if (limit2 < H && i%4 == 0 && !!i) \ ++ limit2 -= 4; \ ++ src++; \ ++ } \ ++ \ ++ shift = 20 - BIT_DEPTH; \ ++ add = 1 << (shift - 1); \ ++ for (i = 0; i < H; i++) { \ ++ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \ ++ coeffs += H; \ ++ } \ ++} ++ ++#define IDCT_DC(H) \ ++static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs) \ ++{ \ ++ int i, j; \ ++ int shift = 14 - BIT_DEPTH; \ ++ int add = 1 << (shift - 1); \ ++ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \ ++ 
\ ++ for (j = 0; j < H; j++) { \ ++ for (i = 0; i < H; i++) { \ ++ coeffs[i + j * H] = coeff; \ ++ } \ ++ } \ ++} ++ ++IDCT( 4) ++IDCT( 8) ++IDCT(16) ++IDCT(32) ++ ++IDCT_DC( 4) ++IDCT_DC( 8) ++IDCT_DC(16) ++IDCT_DC(32) ++ ++#undef TR_4 ++#undef TR_8 ++#undef TR_16 ++#undef TR_32 ++ ++#undef SET ++#undef SCALE ++ ++static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ int16_t *sao_offset_val, int sao_left_class, ++ int width, int height) ++{ ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int offset_table[32] = { 0 }; ++ int k, y, x; ++ int shift = BIT_DEPTH - 5; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ ++ for (k = 0; k < 4; k++) ++ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]); ++ dst += stride_dst; ++ src += stride_src; ++ } ++} ++ ++#define CMP(a, b) (((a) > (b)) - ((a) < (b))) ++ ++static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val, ++ int eo, int width, int height) { ++ ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); ++ stride_dst /= sizeof(pixel); ++ ++ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ int diff0 = CMP(src[x], src[x + a_stride]); ++ int diff1 = CMP(src[x], src[x + b_stride]); ++ int offset_val = edge_idx[2 + diff0 + 
diff1]; ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } ++} ++ ++ ++#if BIT_DEPTH == 10 ++// We need a 32 bit variation for the _c restores so hijack bit depth 10 ++#undef pixel ++#undef BIT_DEPTH ++#define pixel uint32_t ++#define BIT_DEPTH 32 ++// All 16 bit variations are the same ++#define sao_edge_restore_0_10 sao_edge_restore_0_9 ++#define sao_edge_restore_1_10 sao_edge_restore_1_9 ++#define sao_edge_restore_0_11 sao_edge_restore_0_9 ++#define sao_edge_restore_1_11 sao_edge_restore_1_9 ++#define sao_edge_restore_0_12 sao_edge_restore_0_9 ++#define sao_edge_restore_1_12 sao_edge_restore_1_9 ++#define sao_edge_restore_0_13 sao_edge_restore_0_9 ++#define sao_edge_restore_1_13 sao_edge_restore_1_9 ++#define sao_edge_restore_0_14 sao_edge_restore_0_9 ++#define sao_edge_restore_1_14 sao_edge_restore_1_9 ++#define sao_edge_restore_0_15 sao_edge_restore_0_9 ++#define sao_edge_restore_1_15 sao_edge_restore_1_9 ++#define sao_edge_restore_0_16 sao_edge_restore_0_9 ++#define sao_edge_restore_1_16 sao_edge_restore_1_9 ++#endif ++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32 ++static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, ++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int sao_eo_class = sao->eo_class[c_idx]; ++ int init_x = 0, width = _width, height = _height; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ ++ if (sao_eo_class != SAO_EO_VERT) { ++ if (borders[0]) { ++ for (y = 0; y < height; y++) { ++ dst[y * stride_dst] = src[y * stride_src]; ++ } ++ init_x = 1; ++ } ++ if (borders[2]) { ++ int offset = width - 1; ++ for (x = 0; x < height; x++) { ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; ++ } ++ width--; ++ } ++ } ++ if 
(sao_eo_class != SAO_EO_HORIZ) { ++ if (borders[1]) { ++ for (x = init_x; x < width; x++) ++ dst[x] = src[x]; ++ } ++ if (borders[3]) { ++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); ++ ptrdiff_t y_stride_src = stride_src * (height - 1); ++ for (x = init_x; x < width; x++) ++ dst[x + y_stride_dst] = src[x + y_stride_src]; ++ height--; ++ } ++ } ++} ++ ++static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao, ++ int *borders, int _width, int _height, ++ int c_idx, uint8_t *vert_edge, ++ uint8_t *horiz_edge, uint8_t *diag_edge) ++{ ++ int x, y; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int sao_eo_class = sao->eo_class[c_idx]; ++ int init_x = 0, init_y = 0, width = _width, height = _height; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ ++ if (sao_eo_class != SAO_EO_VERT) { ++ if (borders[0]) { ++ for (y = 0; y < height; y++) { ++ dst[y * stride_dst] = src[y * stride_src]; ++ } ++ init_x = 1; ++ } ++ if (borders[2]) { ++ int offset = width - 1; ++ for (x = 0; x < height; x++) { ++ dst[x * stride_dst + offset] = src[x * stride_src + offset]; ++ } ++ width--; ++ } ++ } ++ if (sao_eo_class != SAO_EO_HORIZ) { ++ if (borders[1]) { ++ for (x = init_x; x < width; x++) ++ dst[x] = src[x]; ++ init_y = 1; ++ } ++ if (borders[3]) { ++ ptrdiff_t y_stride_dst = stride_dst * (height - 1); ++ ptrdiff_t y_stride_src = stride_src * (height - 1); ++ for (x = init_x; x < width; x++) ++ dst[x + y_stride_dst] = src[x + y_stride_src]; ++ height--; ++ } ++ } ++ ++ { ++ int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1]; ++ int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2]; ++ int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3]; ++ int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3]; 
++ ++ // Restore pixels that can't be modified ++ if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) { ++ for(y = init_y+save_upper_left; y< height-save_lower_left; y++) ++ dst[y*stride_dst] = src[y*stride_src]; ++ } ++ if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) { ++ for(y = init_y+save_upper_right; y< height-save_lower_right; y++) ++ dst[y*stride_dst+width-1] = src[y*stride_src+width-1]; ++ } ++ ++ if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) { ++ for(x = init_x+save_upper_left; x < width-save_upper_right; x++) ++ dst[x] = src[x]; ++ } ++ if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) { ++ for(x = init_x+save_lower_left; x < width-save_lower_right; x++) ++ dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x]; ++ } ++ if(diag_edge[0] && sao_eo_class == SAO_EO_135D) ++ dst[0] = src[0]; ++ if(diag_edge[1] && sao_eo_class == SAO_EO_45D) ++ dst[width-1] = src[width-1]; ++ if(diag_edge[2] && sao_eo_class == SAO_EO_135D) ++ dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1]; ++ if(diag_edge[3] && sao_eo_class == SAO_EO_45D) ++ dst[stride_dst*(height-1)] = src[stride_src*(height-1)]; ++ ++ } ++} ++#endif ++#if BIT_DEPTH == 32 ++#undef BIT_DEPTH ++#undef pixel ++#define BIT_DEPTH 10 ++#define pixel uint16_t ++#endif ++ ++// --- Plaited chroma versions ++ ++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src, ++ ptrdiff_t stride_dst, ptrdiff_t stride_src, ++ const int16_t *sao_offset_val_u, int sao_left_class_u, ++ const int16_t *sao_offset_val_v, int sao_left_class_v, ++ int width, int height) ++{ ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int offset_table_u[32] = { 0 }; ++ int offset_table_v[32] = { 0 }; ++ int k, y, x; ++ int shift = BIT_DEPTH - 5; ++ ++ stride_dst /= sizeof(pixel); ++ stride_src /= sizeof(pixel); ++ width *= 2; ++ ++ for (k = 0; k < 4; k++) ++ { ++ offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1]; ++ offset_table_v[(k + sao_left_class_v) & 31] = 
sao_offset_val_v[k + 1]; ++ } ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) ++ { ++// printf("dst=%p, src=%p, x=%d, shift=%d\n", dst, src, x, shift); ++// printf("offsets=%x,%x\n", src[x + 0], src[x + 1]); ++ // *** & 31 shouldn't be wanted but just now we generate broken input that ++ // crashes us in 10-bit world ++ dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]); ++ dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]); ++ } ++ dst += stride_dst; ++ src += stride_src; ++ } ++} ++ ++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ++ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, ++ int eo, int width, int height) { ++ ++ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 }; ++ static const int8_t pos[4][2][2] = { ++ { { -1, 0 }, { 1, 0 } }, // horizontal ++ { { 0, -1 }, { 0, 1 } }, // vertical ++ { { -1, -1 }, { 1, 1 } }, // 45 degree ++ { { 1, -1 }, { -1, 1 } }, // 135 degree ++ }; ++ pixel *dst = (pixel *)_dst; ++ pixel *src = (pixel *)_src; ++ int a_stride, b_stride; ++ int x, y; ++ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel); ++ ++ stride_dst /= sizeof(pixel); ++ width *= 2; ++ ++ av_assert0(width <= 64); ++ ++ a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src; ++ b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x += 2) { ++ int diff0u = CMP(src[x], src[x + a_stride]); ++ int diff1u = CMP(src[x], src[x + b_stride]); ++ int offset_valu = edge_idx[2 + diff0u + diff1u]; ++ int diff0v = CMP(src[x+1], src[x+1 + a_stride]); ++ int diff1v = CMP(src[x+1], src[x+1 + b_stride]); ++ int offset_valv = edge_idx[2 + diff0v + diff1v]; ++ dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]); ++ dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]); ++ } ++ src += stride_src; ++ dst += stride_dst; ++ } 
++} ++ ++// Do once ++#if BIT_DEPTH == 8 ++// Any old 2 byte 'normal' restore will work for these ++#define sao_edge_restore_c_0_8 sao_edge_restore_0_16 ++#define sao_edge_restore_c_1_8 sao_edge_restore_1_16 ++// We need 32 bit for 9 bit+ ++#define sao_edge_restore_c_0_9 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_9 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32 ++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32 ++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32 ++#endif ++ ++#undef CMP ++ ++//////////////////////////////////////////////////////////////////////////////// ++// ++//////////////////////////////////////////////////////////////////////////////// ++static void FUNC(put_hevc_pel_pixels)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = src[x] << (14 - BIT_DEPTH); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / 
sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ for (y = 0; y < height; y++) { ++ memcpy(dst, src, width * sizeof(pixel)); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src 
= (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1)); ++ } ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++//////////////////////////////////////////////////////////////////////////////// ++// ++//////////////////////////////////////////////////////////////////////////////// ++#define QPEL_FILTER(src, stride) \ ++ (filter[0] * src[x - 3 * stride] + \ ++ filter[1] * src[x - 2 * stride] + \ ++ filter[2] * src[x - stride] + \ ++ filter[3] * src[x ] + \ ++ filter[4] * src[x + stride] + \ ++ filter[5] * src[x + 2 * stride] + \ ++ filter[6] * src[x + 3 * stride] + \ ++ filter[7] * src[x + 4 * stride]) ++ ++static void FUNC(put_hevc_qpel_h)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_v)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); ++ src 
+= srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_hv)(int16_t *dst, ++ uint8_t *_src, ++ ptrdiff_t _srcstride, ++ int height, intptr_t mx, ++ intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; ++ tmp += MAX_PB_SIZE; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ int shift = 14 - BIT_DEPTH; ++ ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel 
*dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ int shift = 14 - BIT_DEPTH; ++ ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++ ++static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> 
shift); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 - BIT_DEPTH; ++ ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + 
QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, ++ intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) 
{ ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, ++ intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ ++ const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += 
MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, ++ intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ const int8_t *filter; ++ pixel *src = (pixel*)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ src -= QPEL_EXTRA_BEFORE * srcstride; ++ filter = ff_hevc_rpi_qpel_filters[mx - 1]; ++ for (y = 
0; y < height + QPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_qpel_filters[my - 1]; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++//////////////////////////////////////////////////////////////////////////////// ++// ++//////////////////////////////////////////////////////////////////////////////// ++#define EPEL_FILTER(src, stride) \ ++ (filter[0] * src[x - stride] + \ ++ filter[1] * src[x] + \ ++ filter[2] * src[x + stride] + \ ++ filter[3] * src[x + 2 * stride]) ++ ++static void FUNC(put_hevc_epel_h)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_v)(int16_t *dst, ++ uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_hv)(int16_t *dst, ++ 
uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6; ++ tmp += MAX_PB_SIZE; ++ dst += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = 
ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ } ++ dst += dststride; ++ src += srcstride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ int shift = 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift); ++ src += srcstride; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift); ++ dst += dststride; ++ src += srcstride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void 
FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ 
tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ } ++ dst += dststride; ++ src += srcstride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ 
((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) { ++ dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); ++ } ++ dst += dststride; ++ src += srcstride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1]; ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) << log2Wd)) >> (log2Wd + 1)); ++ src += srcstride; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t 
_srcstride, ++ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = denom + 14 - BIT_DEPTH; ++#if BIT_DEPTH < 14 ++ int offset = 1 << (shift - 1); ++#else ++ int offset = 0; ++#endif ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ ox = ox * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ } ++} ++ ++static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, ++ int16_t *src2, ++ int height, int denom, int wx0, int wx1, ++ int ox0, int ox1, intptr_t mx, intptr_t my, int width) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ ptrdiff_t srcstride = _srcstride / sizeof(pixel); ++ pixel *dst = (pixel *)_dst; ++ ptrdiff_t dststride = _dststride / sizeof(pixel); ++ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; ++ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; ++ int16_t *tmp = tmp_array; ++ int shift = 14 + 1 - BIT_DEPTH; ++ int log2Wd = denom + shift - 1; ++ ++ src -= EPEL_EXTRA_BEFORE * srcstride; ++ ++ for (y = 0; y < height + EPEL_EXTRA; y++) { ++ for (x = 0; x < width; x++) ++ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8); ++ src += srcstride; ++ tmp += 
MAX_PB_SIZE; ++ } ++ ++ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; ++ filter = ff_hevc_rpi_epel_filters[my - 1]; ++ ++ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); ++ ox1 = ox1 * (1 << (BIT_DEPTH - 8)); ++ for (y = 0; y < height; y++) { ++ for (x = 0; x < width; x++) ++ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 + ++ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); ++ tmp += MAX_PB_SIZE; ++ dst += dststride; ++ src2 += MAX_PB_SIZE; ++ } ++} ++ ++// line zero ++#define P3 pix[-4 * xstride] ++#define P2 pix[-3 * xstride] ++#define P1 pix[-2 * xstride] ++#define P0 pix[-1 * xstride] ++#define Q0 pix[0 * xstride] ++#define Q1 pix[1 * xstride] ++#define Q2 pix[2 * xstride] ++#define Q3 pix[3 * xstride] ++ ++// line three. used only for deblocking decision ++#define TP3 pix[-4 * xstride + 3 * ystride] ++#define TP2 pix[-3 * xstride + 3 * ystride] ++#define TP1 pix[-2 * xstride + 3 * ystride] ++#define TP0 pix[-1 * xstride + 3 * ystride] ++#define TQ0 pix[0 * xstride + 3 * ystride] ++#define TQ1 pix[1 * xstride + 3 * ystride] ++#define TQ2 pix[2 * xstride + 3 * ystride] ++#define TQ3 pix[3 * xstride + 3 * ystride] ++ ++static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix, ++ ptrdiff_t _xstride, ptrdiff_t _ystride, ++ int beta, int *_tc, ++ uint8_t *_no_p, uint8_t *_no_q) ++{ ++ int d, j; ++ pixel *pix = (pixel *)_pix; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); ++ ++ beta <<= BIT_DEPTH - 8; ++ ++ for (j = 0; j < 2; j++) { ++ const int dp0 = abs(P2 - 2 * P1 + P0); ++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); ++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); ++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); ++ const int d0 = dp0 + dq0; ++ const int d3 = dp3 + dq3; ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ const int no_p = _no_p[j]; ++ const int no_q = _no_q[j]; ++ ++ if (d0 + d3 >= beta) { ++ pix += 4 * ystride; ++ continue; ++ } else { ++ const int beta_3 = beta >> 3; ++ const int 
beta_2 = beta >> 2; ++ const int tc25 = ((tc * 5 + 1) >> 1); ++ ++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && ++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && ++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { ++ // strong filtering ++ const int tc2 = tc << 1; ++ for (d = 0; d < 4; d++) { ++ const int p3 = P3; ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ const int q3 = Q3; ++ if (!no_p) { ++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); ++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); ++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); ++ } ++ if (!no_q) { ++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); ++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); ++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); ++ } ++ pix += ystride; ++ } ++ } else { // normal filtering ++ int nd_p = 1; ++ int nd_q = 1; ++ const int tc_2 = tc >> 1; ++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) ++ nd_p = 2; ++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) ++ nd_q = 2; ++ ++ for (d = 0; d < 4; d++) { ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; ++ if (abs(delta0) < 10 * tc) { ++ delta0 = av_clip(delta0, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ if (!no_p && nd_p > 1) { ++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); ++ P1 = av_clip_pixel(p1 + deltap1); ++ } ++ if (!no_q && nd_q > 1) { ++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); ++ Q1 = av_clip_pixel(q1 + deltaq1); ++ } ++ } ++ pix += 
ystride; ++ } ++ } ++ } ++ } ++} ++ ++static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride, ++ ptrdiff_t _ystride, int *_tc, ++ uint8_t *_no_p, uint8_t *_no_q) ++{ ++ int d, j, no_p, no_q; ++ pixel *pix = (pixel *)_pix; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); ++ ++ for (j = 0; j < 2; j++) { ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ if (tc <= 0) { ++ pix += 4 * ystride; ++ continue; ++ } ++ no_p = _no_p[j]; ++ no_q = _no_q[j]; ++ ++ for (d = 0; d < 4; d++) { ++ int delta0; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ pix += ystride; ++ } ++ } ++} ++ ++static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, ++ int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q); ++} ++ ++static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride, ++ int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q); ++} ++ ++static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, ++ int beta, int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel), ++ beta, tc, no_p, no_q); ++} ++ ++static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride, ++ int beta, int32_t *tc, uint8_t *no_p, ++ uint8_t *no_q) ++{ ++ FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride, ++ beta, tc, no_p, no_q); ++} ++ ++#undef P3 ++#undef P2 ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++#undef Q2 ++#undef Q3 ++ ++#undef TP3 ++#undef TP2 ++#undef TP1 ++#undef TP0 ++#undef TQ0 ++#undef TQ1 ++#undef TQ2 ++#undef TQ3 ++ ++// line zero ++#define P3 pix_l[0 * 
xstride] ++#define P2 pix_l[1 * xstride] ++#define P1 pix_l[2 * xstride] ++#define P0 pix_l[3 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++#define Q2 pix_r[2 * xstride] ++#define Q3 pix_r[3 * xstride] ++ ++// line three. used only for deblocking decision ++#define TP3 pix_l[0 * xstride + 3 * ystride] ++#define TP2 pix_l[1 * xstride + 3 * ystride] ++#define TP1 pix_l[2 * xstride + 3 * ystride] ++#define TP0 pix_l[3 * xstride + 3 * ystride] ++#define TQ0 pix_r[0 * xstride + 3 * ystride] ++#define TQ1 pix_r[1 * xstride + 3 * ystride] ++#define TQ2 pix_r[2 * xstride + 3 * ystride] ++#define TQ3 pix_r[3 * xstride + 3 * ystride] ++ ++// This is identical to hevc_loop_filter_luma except that the P/Q ++// components are on separate pointers ++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f, ++ uint8_t * _pix_l) ++{ ++ int d, j; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ const ptrdiff_t xstride = 1; ++ const ptrdiff_t ystride = _stride / sizeof(pixel); ++ ++ beta <<= BIT_DEPTH - 8; ++ ++ for (j = 0; j < 2; j++) { ++ const int dp0 = abs(P2 - 2 * P1 + P0); ++ const int dq0 = abs(Q2 - 2 * Q1 + Q0); ++ const int dp3 = abs(TP2 - 2 * TP1 + TP0); ++ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); ++ const int d0 = dp0 + dq0; ++ const int d3 = dp3 + dq3; ++ const int tc = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8); ++ const int no_p = no_f & 1; ++ const int no_q = no_f & 2; ++ ++ if (d0 + d3 >= beta) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } else { ++ const int beta_3 = beta >> 3; ++ const int beta_2 = beta >> 2; ++ const int tc25 = ((tc * 5 + 1) >> 1); ++ ++ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 && ++ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 && ++ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) { ++ // strong filtering ++ const int tc2 = tc << 1; ++ for (d = 
0; d < 4; d++) { ++ const int p3 = P3; ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ const int q3 = Q3; ++ if (!no_p) { ++ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2); ++ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2); ++ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2); ++ } ++ if (!no_q) { ++ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2); ++ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2); ++ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2); ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } else { // normal filtering ++ int nd_p = 1; ++ int nd_q = 1; ++ const int tc_2 = tc >> 1; ++ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3)) ++ nd_p = 2; ++ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3)) ++ nd_q = 2; ++ ++ for (d = 0; d < 4; d++) { ++ const int p2 = P2; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ const int q2 = Q2; ++ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4; ++ if (abs(delta0) < 10 * tc) { ++ delta0 = av_clip(delta0, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ if (!no_p && nd_p > 1) { ++ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2); ++ P1 = av_clip_pixel(p1 + deltap1); ++ } ++ if (!no_q && nd_q > 1) { ++ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2); ++ Q1 = av_clip_pixel(q1 + deltaq1); ++ } ++ } ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++ } ++ } ++} ++ ++static void FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r, ++ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f) ++{ ++ // Just call the non-2 function having massaged the parameters ++ int32_t tc[2] = {tc2 
& 0xffff, tc2 >> 16}; ++ uint8_t no_p[2] = {no_f & 1, no_f & 1}; ++ uint8_t no_q[2] = {no_f & 2, no_f & 2}; ++ FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q); ++} ++ ++#undef TP3 ++#undef TP2 ++#undef TP1 ++#undef TP0 ++#undef TQ0 ++#undef TQ1 ++#undef TQ2 ++#undef TQ3 ++ ++#undef P3 ++#undef P2 ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++#undef Q2 ++#undef Q3 ++ ++#define P1 pix_l[0 * xstride] ++#define P0 pix_l[1 * xstride] ++#define Q0 pix_r[0 * xstride] ++#define Q1 pix_r[1 * xstride] ++ ++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride, ++ ptrdiff_t _ystride, const int32_t *_tc, ++ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r) ++{ ++ int d, j, no_p, no_q; ++ pixel *pix_l = (pixel *)_pix_l; ++ pixel *pix_r = (pixel *)_pix_r; ++ ptrdiff_t xstride = _xstride / sizeof(pixel); ++ ptrdiff_t ystride = _ystride / sizeof(pixel); ++ ++ for (j = 0; j < 2; j++) { ++ const int tc = _tc[j] << (BIT_DEPTH - 8); ++ if (tc <= 0) { ++ pix_l += 4 * ystride; ++ pix_r += 4 * ystride; ++ continue; ++ } ++ no_p = _no_p[j]; ++ no_q = _no_q[j]; ++ ++ for (d = 0; d < 4; d++) { ++ int delta0; ++ const int p1 = P1; ++ const int p0 = P0; ++ const int q0 = Q0; ++ const int q1 = Q1; ++ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc); ++ if (!no_p) ++ P0 = av_clip_pixel(p0 + delta0); ++ if (!no_q) ++ Q0 = av_clip_pixel(q0 - delta0); ++ pix_l += ystride; ++ pix_r += ystride; ++ } ++ } ++} ++ ++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); ++ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); ++} ++ ++static void 
FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4, ++ uint8_t * src_l, ++ unsigned int no_f) ++{ ++ uint8_t no_p[2] = {no_f & 1, no_f & 2}; ++ uint8_t no_q[2] = {no_f & 4, no_f & 8}; ++ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; ++ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); ++ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); ++} ++ ++#undef P1 ++#undef P0 ++#undef Q0 ++#undef Q1 ++ +diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c +new file mode 100644 +index 0000000000..0aa8809a4b +--- /dev/null ++++ b/libavcodec/rpi_hevcpred.c +@@ -0,0 +1,161 @@ ++/* ++ * HEVC video Decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading) ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "rpi_hevcdec.h" ++ ++#include "rpi_hevcpred.h" ++#if (ARCH_ARM) ++#include "arm/rpi_hevcpred_arm.h" ++#endif ++ ++#define PRED_C 0 ++#define BIT_DEPTH 8 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++#undef PRED_C ++ ++#define PRED_C 1 ++#define BIT_DEPTH 8 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 9 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 10 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++ ++#define BIT_DEPTH 12 ++#include "rpi_hevcpred_template.c" ++#undef BIT_DEPTH ++#undef PRED_C ++ ++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth) ++{ ++#undef FUNC ++#define FUNC(a, depth) a ## _ ## depth ++ ++#undef FUNCC ++#define FUNCC(a, depth) a ## _ ## depth ## _c ++ ++#define HEVC_PRED_Y(depth) \ ++ hpc->intra_pred = FUNC(intra_pred, depth); \ ++ hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \ ++ hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \ ++ hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \ ++ hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \ ++ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \ ++ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \ ++ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \ ++ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \ ++ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \ ++ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \ ++ hpc->pred_dc[2] = FUNC(pred_dc_2, depth); \ ++ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \ ++ 
hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \ ++ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \ ++ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \ ++ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \ ++ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \ ++ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \ ++ hpc->pred_dc0[0] = FUNC(pred_dc0_0, depth); \ ++ hpc->pred_dc0[1] = FUNC(pred_dc0_1, depth); \ ++ hpc->pred_dc0[2] = FUNC(pred_dc0_2, depth); \ ++ hpc->pred_dc0[3] = FUNC(pred_dc0_3, depth); ++ ++#define HEVC_PRED_C(depth) \ ++ hpc->intra_pred_c = FUNCC(intra_pred, depth); \ ++ hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \ ++ hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \ ++ hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \ ++ hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \ ++ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \ ++ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \ ++ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \ ++ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \ ++ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \ ++ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \ ++ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \ ++ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \ ++ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \ ++ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_horizontal_c[1] = 
FUNCC(pred_angular_1, depth); \ ++ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \ ++ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \ ++ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \ ++ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \ ++ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \ ++ hpc->pred_dc0_c[0] = FUNCC(pred_dc0_0, depth); \ ++ hpc->pred_dc0_c[1] = FUNCC(pred_dc0_1, depth); \ ++ hpc->pred_dc0_c[2] = FUNCC(pred_dc0_2, depth); \ ++ hpc->pred_dc0_c[3] = FUNCC(pred_dc0_3, depth); ++ ++#define HEVC_PRED(depth) \ ++ HEVC_PRED_Y(depth); \ ++ HEVC_PRED_C(depth); ++ ++ switch (bit_depth) { ++ case 9: ++ HEVC_PRED(9); ++ break; ++ case 10: ++ HEVC_PRED(10); ++ break; ++ case 12: ++ HEVC_PRED(12); ++ break; ++ default: ++ HEVC_PRED(8); ++ break; ++ } ++ ++#if (ARCH_ARM) ++ ff_hevc_rpi_pred_init_arm(hpc, bit_depth); ++#elif (ARCH_MIPS) ++ ff_hevc_rpi_pred_init_mips(hpc, bit_depth); ++#endif ++} +diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h +new file mode 100644 +index 0000000000..9f0edb8798 +--- /dev/null ++++ b/libavcodec/rpi_hevcpred.h +@@ -0,0 +1,123 @@ ++/* ++ * HEVC video Decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_RPI_HEVCPRED_H ++#define AVCODEC_RPI_HEVCPRED_H ++ ++#include <stddef.h> /* ptrdiff_t, used by the prediction function pointer types below */ ++#include <stdint.h> /* uint8_t, used by the prediction function pointer types below */ ++#include "config.h" ++ ++struct HEVCRpiContext; ++struct HEVCRpiLocalContext; ++ ++enum IntraPredMode { ++ INTRA_PLANAR = 0, ++ INTRA_DC, ++ INTRA_ANGULAR_2, ++ INTRA_ANGULAR_3, ++ INTRA_ANGULAR_4, ++ INTRA_ANGULAR_5, ++ INTRA_ANGULAR_6, ++ INTRA_ANGULAR_7, ++ INTRA_ANGULAR_8, ++ INTRA_ANGULAR_9, ++ INTRA_ANGULAR_10, ++ INTRA_ANGULAR_11, ++ INTRA_ANGULAR_12, ++ INTRA_ANGULAR_13, ++ INTRA_ANGULAR_14, ++ INTRA_ANGULAR_15, ++ INTRA_ANGULAR_16, ++ INTRA_ANGULAR_17, ++ INTRA_ANGULAR_18, ++ INTRA_ANGULAR_19, ++ INTRA_ANGULAR_20, ++ INTRA_ANGULAR_21, ++ INTRA_ANGULAR_22, ++ INTRA_ANGULAR_23, ++ INTRA_ANGULAR_24, ++ INTRA_ANGULAR_25, ++ INTRA_ANGULAR_26, ++ INTRA_ANGULAR_27, ++ INTRA_ANGULAR_28, ++ INTRA_ANGULAR_29, ++ INTRA_ANGULAR_30, ++ INTRA_ANGULAR_31, ++ INTRA_ANGULAR_32, ++ INTRA_ANGULAR_33, ++ INTRA_ANGULAR_34, ++}; ++#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10 ++#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26 ++ ++typedef void intra_filter_fn_t( ++ uint8_t * const left, uint8_t * const top, ++ const unsigned int req, const unsigned int avail, ++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, ++ const unsigned int stride, ++ const unsigned int top_right_size, const unsigned int down_left_size); ++ ++typedef struct HEVCRpiPredContext { ++ void (*intra_pred)(const struct HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, ++ const unsigned int avail, const unsigned int log2_size); ++ ++ intra_filter_fn_t *intra_filter[4]; ++ void (*pred_planar[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride); ++ void 
(*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride); ++ void (*pred_angular[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_vertical[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride); ++ ++ void (*intra_pred_c)(const struct HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, ++ const unsigned int avail, const unsigned int log2_size); ++ intra_filter_fn_t *intra_filter_c[4]; ++ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride); ++ void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ++ ptrdiff_t stride); ++ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ptrdiff_t stride, ++ int mode); ++ void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride); ++} HEVCRpiPredContext; ++ ++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth); ++ ++#endif /* AVCODEC_RPI_HEVCPRED_H */ +diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c +new file mode 100644 +index 0000000000..f2ebcad332 +--- /dev/null ++++ b/libavcodec/rpi_hevcpred_template.c +@@ -0,0 +1,1407 @@ ++/* ++ * HEVC video decoder ++ * ++ * Copyright (C) 2012 - 2013 Guillaume Martres ++ * ++ * This file is part of FFmpeg. 
++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "config.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/rpi_sand_fns.h" ++#include "bit_depth_template.c" ++ ++#include "rpi_hevcdec.h" ++#include "rpi_hevcpred.h" ++ ++#define DUMP_PRED 0 ++ ++#define POS(x, y) src[(x) + stride * (y)] ++ ++// INCLUDED_ONCE defined at EOF ++#ifndef INCLUDED_ONCE ++typedef uint8_t (* c8_dst_ptr_t)[2]; ++typedef const uint8_t (* c8_src_ptr_t)[2]; ++typedef uint16_t (* c16_dst_ptr_t)[2]; ++typedef const uint16_t (* c16_src_ptr_t)[2]; ++ ++// *** On ARM make these NEON registers ++typedef struct pixel4_16 { ++ uint16_t x[4]; ++} pixel4_16; ++typedef struct pixel4_32 { ++ uint32_t x[4]; ++} pixel4_32; ++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x) ++{ ++ pixel4_16 t = {{x, x, x, x}}; ++ return t; ++} ++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x) ++{ ++ pixel4_32 t = {{x, x, x, x}}; ++ return t; ++} ++#endif ++ ++#if PRED_C ++// For chroma we double pixel size so we copy pairs ++#undef pixel ++#undef pixel2 ++#undef pixel4 ++#undef dctcoef ++#undef INIT_CLIP ++#undef no_rnd_avg_pixel4 ++#undef rnd_avg_pixel4 ++#undef AV_RN2P ++#undef AV_RN4P ++#undef AV_RN4PA ++#undef AV_WN2P ++#undef AV_WN4P ++#undef AV_WN4PA ++#undef CLIP ++#undef FUNC 
++#undef FUNCC ++#undef av_clip_pixel ++#undef PIXEL_SPLAT_X4 ++ ++#if BIT_DEPTH == 8 ++#define pixel uint16_t ++#define pixel4 pixel4_16 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16 ++#define cpel uint8_t ++#define c_src_ptr_t c8_src_ptr_t ++#define c_dst_ptr_t c8_dst_ptr_t ++#else ++#define pixel uint32_t ++#define pixel4 pixel4_32 ++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32 ++#define cpel uint16_t ++#define c_src_ptr_t c16_src_ptr_t /* read-only pair pointer, mirroring c8_src_ptr_t in the 8-bit branch */ ++#define c_dst_ptr_t c16_dst_ptr_t ++#endif ++#define AV_RN4P(p) (*(pixel4*)(p)) ++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x)) ++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c) ++#endif ++ ++ ++// Get PW prior to horrid PRED_C trickery ++#if BIT_DEPTH == 8 ++#define PW 1 ++#else ++#define PW 2 ++#endif ++ ++ ++#if DUMP_PRED && !defined(INCLUDED_ONCE) ++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size) ++{ ++ for (unsigned int y = 0; y != size; y++, data += stride * 2) { ++ for (unsigned int x = 0; x != size; x++) { ++ printf("%4d", data[x * 2]); ++ } ++ printf("\n"); ++ } ++ printf("\n"); ++} ++#endif ++ ++#ifndef INCLUDED_ONCE ++static inline void extend_8(void * ptr, const unsigned int v, unsigned int n) /* Fill n bytes at ptr with v using n/4 32-bit stores; n & 3 trailing bytes are not written, and v must fit in 8 bits for the replication to be exact */ ++{ ++ if ((n >>= 2) != 0) { ++ uint32_t v4 = v | (v << 8); ++ uint32_t * p = (uint32_t *)ptr; ++ v4 = v4 | (v4 << 16); ++ do { ++ *p++ = v4; ++ } while (--n != 0); ++ } ++} ++ ++static inline void extend_16(void * ptr, const unsigned int v, unsigned int n) /* Fill n 16-bit elements at ptr with v, four elements (two 32-bit stores) per iteration; n & 3 trailing elements are not written */ ++{ ++ if ((n >>= 2) != 0) { ++ uint32_t v2 = v | (v << 16); ++ uint32_t * p = (uint32_t *)ptr; ++ do { ++ *p++ = v2; ++ *p++ = v2; ++ } while (--n != 0); ++ } ++} ++ ++static inline void extend_32(void * ptr, const unsigned int v, unsigned int n) /* Fill n 32-bit elements at ptr with v, four stores per iteration; n & 3 trailing elements are not written */ ++{ ++ if ((n >>= 2) != 0) { ++ uint32_t * p = (uint32_t *)ptr; ++ do { ++ *p++ = v; ++ *p++ = v; ++ *p++ = v; ++ *p++ = v; ++ } while (--n != 0); ++ } ++} ++ ++// Beware that this inverts the avail ordering ++// For CIP it seems easier this way round ++static unsigned int cip_avail_l(const uint8_t
* is_intra, const int i_stride, const unsigned int i_mask, ++ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, ++ unsigned int s0, unsigned int odd_s) ++{ ++ const unsigned int n = 1 << log2_intra_bits; ++ unsigned int fa = 0; ++ unsigned int i; ++ ++ size >>= 2; // Now in 4-pel units ++ s0 >>= 2; ++ ++ if ((avail & AVAIL_DL) != 0) ++ fa |= ((1 << s0) - 1) << (size - s0); ++ if ((avail & AVAIL_L) != 0) ++ fa |= ((1 << size) - 1) << size; ++ if ((avail & AVAIL_UL) != 0) ++ fa |= 1 << (size << 1); ++ ++ if (odd_s) { ++ if ((fa & 1) != 0 && (*is_intra & i_mask) == 0) ++ fa &= ~1; ++ is_intra += i_stride; ++ } ++ ++ for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) { ++ const unsigned int m = ((1 << n) - 1) << i; ++ if ((fa & m) != 0 && (*is_intra & i_mask) == 0) ++ fa &= ~m; ++ } ++ ++ return fa; ++} ++ ++static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift, ++ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size, ++ unsigned int s1, unsigned int odd_s) ++{ ++ if ((avail & (AVAIL_U | AVAIL_UR)) == 0) ++ { ++ return 0; ++ } ++ else ++ { ++ const unsigned int n = 1 << log2_intra_bits; ++ unsigned int fa = 0; ++ unsigned int i; ++ unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift; ++ ++ size >>= 2; // Now in 4-pel units ++ s1 >>= 2; ++ ++ if ((avail & AVAIL_U) != 0) ++ fa |= ((1 << size) - 1); ++ if ((avail & AVAIL_UR) != 0) ++ fa |= ((1 << s1) - 1) << size; ++ ++ if (odd_s) { ++ fa &= im | ~1; ++ im >>= 1; ++ } ++ ++ for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) { ++ const unsigned int m = ((1 << n) - 1) << i; ++ if ((im & 1) == 0) ++ fa &= ~m; ++ } ++ return fa; ++ } ++} ++ ++ ++ ++static inline unsigned int rmbd(unsigned int x) ++{ ++#if 1 ++ return __builtin_ctz(x); ++#else ++ unsigned int n = 0; ++ if ((x & 0xffff) == 0) { ++ x >>= 16; ++ n += 16; ++ } ++ if ((x & 0xff) == 0) { ++ x >>= 8; ++ n += 8; ++ } ++ if ((x & 0xf) == 0) { ++ x >>= 4; ++ n += 
4; ++ } ++ if ((x & 0x3) == 0) { ++ x >>= 2; ++ n += 2; ++ } ++ ++ return (x & 1) == 0 ? n + 1 : n; ++#endif ++} ++#endif ++ ++ ++static void FUNC(cip_fill)(pixel * const left, pixel * const top, ++ const unsigned int avail_l, const unsigned int avail_u, ++ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur, ++ const unsigned int stride, ++ const unsigned int size) ++{ ++ pixel a; ++ unsigned int i; ++ ++ // 1st find DL value ++ if ((avail_l & 1) == 0) { ++ if (avail_l != 0) ++ a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride]; ++ else ++ { ++ // (avail_l | avail_u) != 0 so this must be good ++ const unsigned int n = rmbd(avail_u)*4; ++ a = (n >= size) ? src_ur[n - size] : src_u[n]; ++ } ++ } ++ ++ // L ++ { ++ pixel * d = left + size * 2 - 1; ++ const pixel * s = src_l + (size * 2 - 1) * stride; ++ unsigned int x = avail_l; ++ for (i = 0; i < size * 2; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d-- = *s; ++ s -= stride; ++ *d-- = *s; ++ s -= stride; ++ *d-- = *s; ++ s -= stride; ++ *d-- = a = *s; ++ s -= stride; ++ } ++ else ++ { ++ *d-- = a; ++ *d-- = a; ++ *d-- = a; ++ *d-- = a; ++ s -= stride * 4; ++ } ++ } ++ // UL ++ *d = a = (x & 1) != 0 ? 
*s : a; ++ } ++ ++ // U ++ { ++ pixel * d = top; ++ const pixel * s = src_u; ++ unsigned int x = avail_u; ++ ++ for (i = 0; i < size; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = a = *s++; ++ } ++ else ++ { ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ s += 4; ++ } ++ } ++ ++ // UR ++ s = src_ur; ++ for (i = 0; i < size; i += 4, x >>= 1) ++ { ++ if ((x & 1) != 0) { ++ // Avail ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = *s++; ++ *d++ = a = *s++; ++ } ++ else ++ { ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ *d++ = a; ++ s += 4; ++ } ++ } ++ } ++} ++ ++ ++#if !PRED_C && PW == 1 ++#define EXTEND(ptr, val, len) extend_8(ptr, val, len) ++#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1) ++#define EXTEND(ptr, val, len) extend_16(ptr, val, len) ++#else ++#define EXTEND(ptr, val, len) extend_32(ptr, val, len) ++#endif ++ ++// Reqs: ++// ++// Planar: DL[0], L, ul, U, UR[0] ++// DC: dl, L, ul, U, ur ++// A2-9: DL, L, ul, u, ur ++// A10: dl, L, ul, u, ur ++// A11-17 dl, L, UL, U, ur ++// A18-25 dl, L, Ul, U, ur ++// A26 dl, l, ul, U, ur ++// A27-34 dl, l, ul, U, UR ++ ++#ifndef INCLUDED_ONCE ++ ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16; ++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16; ++ ++static const uint8_t req_avail_c[35] = ++{ ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L, // 2 ++ AVAIL_DL | AVAIL_L, // 3 ++ AVAIL_DL | AVAIL_L, // 4 ++ AVAIL_DL | AVAIL_L, // 5 ++ AVAIL_DL | AVAIL_L, // 6 ++ AVAIL_DL | AVAIL_L, // 7 ++ AVAIL_DL | AVAIL_L, // 8 ++ AVAIL_DL | AVAIL_L, // 9 ++ AVAIL_L, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 ++ AVAIL_L 
| AVAIL_UL | AVAIL_U, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 ++ AVAIL_U, // 26 (V) ++ AVAIL_U | AVAIL_UR, // 27 ++ AVAIL_U | AVAIL_UR, // 28 ++ AVAIL_U | AVAIL_UR, // 29 ++ AVAIL_U | AVAIL_UR, // 30 ++ AVAIL_U | AVAIL_UR, // 31 ++ AVAIL_U | AVAIL_UR, // 32 ++ AVAIL_U | AVAIL_UR, // 33 ++ AVAIL_U | AVAIL_UR // 34 ++}; ++ ++static const uint8_t req_avail[4][35] = { ++{ ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L, // 2 ++ AVAIL_DL | AVAIL_L, // 3 ++ AVAIL_DL | AVAIL_L, // 4 ++ AVAIL_DL | AVAIL_L, // 5 ++ AVAIL_DL | AVAIL_L, // 6 ++ AVAIL_DL | AVAIL_L, // 7 ++ AVAIL_DL | AVAIL_L, // 8 ++ AVAIL_DL | AVAIL_L, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V) ++ AVAIL_U | AVAIL_UR, // 27 ++ AVAIL_U | AVAIL_UR, // 28 ++ AVAIL_U | AVAIL_UR, // 29 ++ AVAIL_U | AVAIL_UR, // 30 ++ AVAIL_U | AVAIL_UR, // 31 ++ AVAIL_U | AVAIL_UR, // 32 ++ AVAIL_U | AVAIL_UR, // 33 ++ AVAIL_U | AVAIL_UR // 34 ++}, ++{ // 3 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only 
needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 ++ AVAIL_DL | AVAIL_L | 0, // 3 ++ AVAIL_DL | AVAIL_L | 0, // 4 ++ AVAIL_DL | AVAIL_L | 0, // 5 ++ AVAIL_DL | AVAIL_L | 0, // 6 ++ AVAIL_DL | AVAIL_L | 0, // 7 ++ AVAIL_DL | AVAIL_L | 0, // 8 ++ AVAIL_DL | AVAIL_L | 0, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | 0, // 27 ++ AVAIL_U | AVAIL_UR | 0, // 28 ++ AVAIL_U | AVAIL_UR | 0, // 29 ++ AVAIL_U | AVAIL_UR | 0, // 30 ++ AVAIL_U | AVAIL_UR | 0, // 31 ++ AVAIL_U | AVAIL_UR | 0, // 32 ++ AVAIL_U | AVAIL_UR | 0, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 ++}, ++{ // 4 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7 ++ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8 ++ AVAIL_DL | AVAIL_L | 0, // 9 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 
FILTER_LIGHT, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | 0, // 27 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34 ++}, ++{ // 5 ++ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed) ++ AVAIL_L | 0 | AVAIL_U, // DC ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8 ++ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9 ++ AVAIL_L | 0, // 10 (H) ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17 ++ AVAIL_L | AVAIL_UL | AVAIL_U | 
FILTER_EITHER, // 18 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24 ++ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25 ++ AVAIL_U | 0, // 26 (V) ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33 ++ AVAIL_U | AVAIL_UR | FILTER_EITHER // 34 ++} ++}; ++ ++ ++#endif ++ ++#define filter_light1 FUNC(filter_light1) ++static inline pixel filter_light1(pixel a, pixel b, pixel c) /* Rounded (a + 2*b + c) / 4: smooths b against its two neighbours */ ++{ ++ return (a + b*2 + c + 2) >> 2; ++} ++ ++#define filter_light FUNC(filter_light) ++static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n) /* Writes n smoothed pels to dst from src[0], src[sstride], ...; p1 is the pel before src[0], pn the pel after the last one read. Each source pel is read before the output one position behind it is written. n must be >= 2: with n == 1 the counter below wraps */ ++{ ++ pixel p0; ++ pixel p2 = *src; ++ // Allow for final pel - it is just clearer to have the call take the actual number of output pels ++ unsigned int n_minus_1 = n - 1; ++ ++ do ++ { ++ src += sstride; ++ p0 = p1; ++ p1 = p2; ++ p2 = *src; ++ *dst++ = filter_light1(p0, p1, p2); ++ } while (--n_minus_1 != 0); ++ *dst = filter_light1(p1, p2, pn); ++} ++ ++#define filter_strong FUNC(filter_strong) ++static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n) /* Writes n pels: dst[i] = (64*p0 + 32 + (i+1)*(p1 - p0)) >> 6, a rounded linear ramp from p0 towards p1 in 1/64 steps; n must be >= 1 */ ++{ ++ unsigned int a = 64 * p0 + 32; ++ const int v = p1 - p0; ++ ++ do ++ { ++ *dst++ = (a += v) >> 6; ++ } while (--n != 0); ++} ++ ++#define intra_filter FUNC(intra_filter) ++static av_always_inline void intra_filter( ++ pixel * const left, pixel * const top, ++ const unsigned int req, const unsigned int avail, ++ const pixel *
const src_l, const pixel * const src_u, const pixel * const src_ur, ++ const unsigned int stride, ++ const unsigned int top_right_size, const unsigned int down_left_size, ++ const unsigned int log2_size) ++{ ++ const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5); ++ const unsigned int size = 1 << log2_size; ++ ++ // a_ is the first pel in a section working round dl -> ur ++ // b_ is the last ++ // Beware that top & left work out from UL so usage of a_ & b_ may ++ // swap between them. It is a bad naming scheme but I have found no ++ // better ++ const pixel * a_dl = src_l + (down_left_size + size - 1) * stride; ++ const pixel * b_dl = src_l + size * stride; ++ const pixel * a_l = src_l + (size - 1) * stride; ++ const pixel * b_l = src_l; ++ const pixel * ab_ul = src_l - stride; ++ const pixel * a_u = src_u; ++ const pixel * b_u = src_u + size - 1; ++ const pixel * a_ur = src_ur; ++ const pixel * b_ur = src_ur + top_right_size - 1; ++ ++ const unsigned int want = req & ~avail; ++ const unsigned int have = req & avail; ++ unsigned int i; ++ ++ if ((avail & AVAIL_DL) == 0) ++ { ++ a_dl = a_ur; ++ if ((avail & AVAIL_U) != 0) ++ a_dl = a_u; ++ if ((avail & AVAIL_UL) != 0) ++ a_dl = ab_ul; ++ if ((avail & AVAIL_L) != 0) ++ a_dl = a_l; ++ b_dl = a_dl; ++ } ++ ++ if ((avail & AVAIL_L) == 0) ++ { ++ a_l = b_dl; ++ b_l = b_dl; ++ } ++ if ((avail & AVAIL_UL) == 0) ++ { ++ ab_ul = b_l; ++ } ++ if ((avail & AVAIL_U) == 0) ++ { ++ a_u = ab_ul; ++ b_u = ab_ul; ++ } ++ if ((avail & AVAIL_UR) == 0) ++ { ++ a_ur = b_u; ++ b_ur = b_u; ++ } ++ ++ if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints ++ { ++ if ((req & AVAIL_UL) != 0) ++ left[-1] = *ab_ul; ++ ++ if ((want & AVAIL_L) != 0) ++ EXTEND(left, *a_l, size); ++ if ((want & AVAIL_DL) != 0) ++ EXTEND(left + size, *a_dl, size); ++ if ((want & AVAIL_U) != 0) ++ EXTEND(top, *a_u, size); ++ if ((want & AVAIL_UR) != 0) ++ EXTEND(top + size, *a_ur, size); ++ ++ if ((have & AVAIL_U) 
!= 0) ++ // Always good - even with sand ++ memcpy(top, a_u, size * sizeof(pixel)); ++ if ((have & AVAIL_UR) != 0) ++ { ++ memcpy(top + size, a_ur, top_right_size * sizeof(pixel)); ++ EXTEND(top + size + top_right_size, *b_ur, ++ size - top_right_size); ++ } ++ if ((have & AVAIL_L) != 0) ++ { ++ for (i = 0; i < size; i++) ++ left[i] = b_l[stride * i]; ++ } ++ if ((have & AVAIL_DL) != 0) ++ { ++ for (i = 0; i < down_left_size; i++) ++ left[i + size] = b_dl[stride * i]; ++ EXTEND(left + size + down_left_size, *a_dl, ++ size - down_left_size); ++ } ++ } ++ else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint ++ FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold && ++ FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold) ++ { ++ if ((req & (AVAIL_U | AVAIL_UR)) != 0) ++ filter_strong(top, *ab_ul, *b_ur, size * 2); ++ left[-1] = *ab_ul; ++ if ((req & (AVAIL_L | AVAIL_DL)) != 0) ++ filter_strong(left, *ab_ul, *a_dl, size*2); ++ } ++ else ++ { ++ // Same code for both have & want for UL ++ if ((req & AVAIL_UL) != 0) ++ { ++ left[-1] = filter_light1(*b_l, *ab_ul, *a_u); ++ } ++ ++ if ((want & AVAIL_L) != 0) ++ { ++ EXTEND(left, *a_l, size); ++ left[0] = (*a_l * 3 + *ab_ul + 2) >> 2; ++ } ++ if ((want & AVAIL_DL) != 0) ++ { ++ // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding ++ EXTEND(left + size, *a_l, size); ++ } ++ if ((want & AVAIL_U) != 0) ++ { ++ EXTEND(top, *a_u, size); ++ top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2; ++ } ++ if ((want & AVAIL_UR) != 0) ++ { ++ // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding ++ EXTEND(top + size, *a_ur, size); ++ } ++ ++ if ((have & AVAIL_U) != 0) ++ { ++ filter_light(top, *ab_ul, a_u, *a_ur, 1, size); ++ } ++ if ((have & AVAIL_UR) != 0) { ++ filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size); ++ top[size*2 - 1] = *b_ur; ++ EXTEND(top + size + top_right_size, *b_ur, size - top_right_size); ++ } ++ if ((have & AVAIL_L) != 
0) ++ { ++ filter_light(left, *ab_ul, b_l, *b_dl, stride, size); ++ } ++ if ((have & AVAIL_DL) != 0) ++ { ++ filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size); ++ left[size*2 - 1] = *a_dl; ++ EXTEND(left + size + down_left_size, *a_dl, size - down_left_size); ++ } ++ } ++} ++ ++#define INTRA_FILTER(log2_size) \ ++static void FUNC(intra_filter_ ## log2_size)( \ ++ uint8_t * const left, uint8_t * const top, \ ++ const unsigned int req, const unsigned int avail, \ ++ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \ ++ const unsigned int stride, \ ++ const unsigned int top_right_size, const unsigned int down_left_size) \ ++{ \ ++ intra_filter((pixel *)left, (pixel *)top, req, avail, \ ++ (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \ ++} ++ ++INTRA_FILTER(2) ++INTRA_FILTER(3) ++INTRA_FILTER(4) ++INTRA_FILTER(5) ++ ++#undef intra_filter ++#undef INTRA_FILTER ++ ++static void FUNC(intra_pred)(const HEVCRpiContext * const s, ++ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail, ++ const unsigned int log2_size) ++{ ++ // c_idx will always be 1 for _c versions and 0 for y ++ const unsigned int c_idx = PRED_C; ++ const unsigned int hshift = ctx_hshift(s, c_idx); ++ const unsigned int vshift = ctx_vshift(s, c_idx); ++ const unsigned int size = (1 << log2_size); ++ const unsigned int x = x0 >> hshift; ++ const unsigned int y = y0 >> vshift; ++ ++ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel); /* in units of pixel, not bytes */ ++ pixel *const src = c_idx == 0 ?
++ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) : ++ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y); ++ ++ // Align so we can do multiple loads in the asm ++ // Padded to 16 byte boundary so as not to confuse anything ++ DECLARE_ALIGNED(16, pixel, top[2 * MAX_TB_SIZE]); ++ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]); ++ ++ pixel * const left = left_array + 16 / sizeof(pixel); ++ const pixel * top_pred = top; ++ ++ const pixel * src_l = src - 1; ++ const pixel * src_u = src - stride; ++ const pixel * src_ur = src_u + size; ++#if !PRED_C ++ const unsigned int req = req_avail[log2_size - 2][mode] & ~s->ps.sps->intra_filters_disable; ++#else ++ const unsigned int req = req_avail_c[mode]; ++#endif ++ ++ // If we have nothing to pred from then fill with grey ++ // This isn't a common case but dealing with it here means we don't have to ++ // test for it later ++ if (avail == 0) ++ { ++dc_only: ++#if !PRED_C ++ s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride); ++#else ++ s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride); ++#endif ++ return; ++ } ++ ++ { ++ // N.B. 
stride is in pixels (not bytes) or in the case of chroma pixel-pairs ++ const AVFrame * const frame = s->frame; ++ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2 ++ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride; ++ if ((x & mask) == 0) ++ src_l -= stripe_adj; ++ if (((x + size) & mask) == 0) ++ src_ur += stripe_adj; ++ } ++ ++ // Can deal with I-slices in 'normal' code even if CIP ++ // This also means that we don't need to generate (elsewhere) is_intra ++ // for IRAP frames ++ if (s->ps.pps->constrained_intra_pred_flag == 1 && ++ s->sh.slice_type != HEVC_SLICE_I) ++ { ++ // * If we ever actually care about CIP performance then we should ++ // special case out size 4 stuff (can be done by 'normal') and ++ // have 8-pel avail masks ++ unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)), ++ -(int)(s->ps.sps->pcm_width), ++ 1 << (((x - 1) >> (3 - hshift)) & 7), ++ 1 - hshift, ++ avail, ++ size, ++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size), ++ vshift != 0 ? 0 : (y >> 2) & 1); ++ ++ unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)), ++ (x >> (3 - hshift)) & 7, ++ 1 - hshift, ++ avail, ++ size, ++ FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size), ++ hshift != 0 ? 0 : (x >> 2) & 1); ++ ++ // Anything left? 
++ if ((avail_l | avail_u) == 0) ++ goto dc_only; ++ ++ FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size); ++ ++#if !PRED_C ++ if ((req & FILTER_LIGHT) != 0) ++ { ++ const unsigned threshold = 1 << (BIT_DEPTH - 5); ++ if ((req & FILTER_STRONG) != 0 && ++ (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold && ++ (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold) ++ { ++ filter_strong(top, left[-1], top[63], 64); ++ filter_strong(left, left[-1], left[63], 64); ++ } else ++ { ++ // LHS writes UL too so copy for top ++ const pixel p_ul = left[-1]; ++ filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size); ++ filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1); ++ } ++ } ++#endif ++ } ++ else ++ { ++ const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size); ++ if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 && ++ ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size)) ++ { ++ top_pred = src_u; ++ } ++ else ++ { ++#if !PRED_C ++ s->hpc.intra_filter[log2_size - 2] ++#else ++ s->hpc.intra_filter_c[log2_size - 2] ++#endif ++ ((uint8_t *)left, (uint8_t *)top, req, avail, ++ (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel), ++ ur_size, ++ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size)); ++ } ++ } ++ ++ ++#if !PRED_C ++ switch (mode) { ++ case INTRA_PLANAR: ++ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_DC: ++ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_ANGULAR_HORIZONTAL: ++ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ case INTRA_ANGULAR_VERTICAL: ++ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ default: 
++ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ } ++#else ++ switch (mode) { ++ case INTRA_PLANAR: ++ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_DC: ++ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride); ++ break; ++ case INTRA_ANGULAR_HORIZONTAL: ++ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ case INTRA_ANGULAR_VERTICAL: ++ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ default: ++ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred, ++ (uint8_t *)left, stride, ++ mode); ++ break; ++ } ++ ++#if DUMP_PRED ++ printf("U pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size); ++ printf("V pred @ %d, %d: mode=%d\n", x, y, mode); ++ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size); ++#endif ++#endif ++} ++ ++#if !PRED_C ++static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ptrdiff_t stride, ++ int trafo_size) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; ++ int size = 1 << trafo_size; ++ for (y = 0; y < size; y++) ++ for (x = 0; x < size; x++) ++ POS(x, y) = ((size - 1 - x) * left[y] + (x + 1) * top[size] + ++ (size - 1 - y) * top[x] + (y + 1) * left[size] + size) >> (trafo_size + 1); ++} ++#else ++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top, ++ const uint8_t * _left, ptrdiff_t stride, ++ int trafo_size) ++{ ++ int x, y; ++ int size = 1 << trafo_size; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t 
left = (c_src_ptr_t)_left; ++ ++ for (y = 0; y < size; y++, src += stride) ++ { ++ for (x = 0; x < size; x++) ++ { ++ src[x][0] = ((size - 1 - x) * left[y][0] + (x + 1) * top[size][0] + ++ (size - 1 - y) * top[x][0] + (y + 1) * left[size][0] + size) >> (trafo_size + 1); ++ src[x][1] = ((size - 1 - x) * left[y][1] + (x + 1) * top[size][1] + ++ (size - 1 - y) * top[x][1] + (y + 1) * left[size][1] + size) >> (trafo_size + 1); ++ } ++ } ++} ++#endif ++ ++#define PRED_PLANAR(size)\ ++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \ ++ const uint8_t *left, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_planar)(src, top, left, stride, size + 2); \ ++} ++ ++PRED_PLANAR(0) ++PRED_PLANAR(1) ++PRED_PLANAR(2) ++PRED_PLANAR(3) ++ ++#undef PRED_PLANAR ++ ++#if !PRED_C ++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size) ++{ ++ int i, j, x, y; ++ int size = (1 << log2_size); ++ pixel *src = (pixel *)_src; ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; ++ int dc = size; ++ pixel4 a; ++ for (i = 0; i < size; i++) ++ dc += left[i] + top[i]; ++ ++ dc >>= log2_size + 1; ++ ++ a = PIXEL_SPLAT_X4(dc); ++ ++ for (i = 0; i < size; i++) ++ for (j = 0; j < size; j+=4) ++ AV_WN4P(&POS(j, i), a); ++ ++// if (c_idx == 0 && size < 32) ++// As we now have separate fns for y & c - no need to test that ++ if (size < 32) ++ { ++ POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2; ++ for (x = 1; x < size; x++) ++ POS(x, 0) = (top[x] + 3 * dc + 2) >> 2; ++ for (y = 1; y < size; y++) ++ POS(0, y) = (left[y] + 3 * dc + 2) >> 2; ++ } ++} ++#else ++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, int log2_size) ++{ ++ unsigned int i, j; ++ const unsigned int size = (1 << log2_size); ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const c_src_ptr_t top = (c_src_ptr_t)_top; ++ const c_src_ptr_t left = (c_src_ptr_t)_left; ++ 
unsigned int dc0 = size; ++ unsigned int dc1 = size; ++ ++ for (i = 0; i < size; i++) ++ { ++ dc0 += left[i][0] + top[i][0]; ++ dc1 += left[i][1] + top[i][1]; ++ } ++ ++ dc0 >>= log2_size + 1; ++ dc1 >>= log2_size + 1; ++ ++ for (i = 0; i < size; i++, src += stride) ++ { ++ for (j = 0; j < size; ++j) ++ { ++ src[j][0] = dc0; ++ src[j][1] = dc1; ++ ++ } ++ } ++} ++#endif ++ ++#define PRED_DC(size)\ ++static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \ ++ const uint8_t *left, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_dc)(src, top, left, stride, size + 2); \ ++} ++ ++PRED_DC(0) ++PRED_DC(1) ++PRED_DC(2) ++PRED_DC(3) ++ ++#undef PRED_DC ++ ++ ++ ++ ++#if !PRED_C ++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) ++{ ++ int i, j; ++ int size = (1 << log2_size); ++ pixel *src = (pixel *)_src; ++ pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1)); ++ ++ for (i = 0; i < size; i++) ++ for (j = 0; j < size; j+=4) ++ AV_WN4P(&POS(j, i), a); ++} ++#else ++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size) ++{ ++ unsigned int i, j; ++ const unsigned int size = (1 << log2_size); ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ const pixel a = (1 << (BIT_DEPTH - 1)); ++ ++ for (i = 0; i < size; i++, src += stride) ++ { ++ for (j = 0; j < size; ++j) ++ { ++ src[j][0] = a; ++ src[j][1] = a; ++ } ++ } ++} ++#endif ++ ++#define PRED_DC0(size)\ ++static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride) \ ++{ \ ++ FUNC(pred_dc0)(src, stride, size + 2); \ ++} ++ ++PRED_DC0(0) ++PRED_DC0(1) ++PRED_DC0(2) ++PRED_DC0(3) ++ ++#undef PRED_DC0 ++ ++ ++ ++ ++#ifndef ANGLE_CONSTS ++#define ANGLE_CONSTS ++static const int intra_pred_angle[] = { ++ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32, ++ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32 ++}; ++static const int inv_angle[] = { ++ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482, ++ -630, -910, -1638, -4096 ++}; 
++#endif ++ ++#if !PRED_C ++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, ++ int mode, int size) ++{ ++ int x, y; ++ pixel *src = (pixel *)_src; ++ const pixel *top = (const pixel *)_top; ++ const pixel *left = (const pixel *)_left; ++ ++ int angle = intra_pred_angle[mode - 2]; ++ pixel ref_array[3 * MAX_TB_SIZE + 4]; ++ pixel *ref_tmp = ref_array + size; ++ const pixel *ref; ++ int last = (size * angle) >> 5; ++ ++ if (mode >= 18) { ++ ref = top - 1; ++ ++ if (angle < 0) ++ { ++ memcpy(ref_tmp + 1, top, size * PW); ++ ref_tmp[0] = left[-1]; ++ ++ for (x = last; x <= -1; x++) ++ ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; ++ ref = ref_tmp; ++ } ++ ++ for (y = 0; y < size; y++) { ++ int idx = ((y + 1) * angle) >> 5; ++ int fact = ((y + 1) * angle) & 31; ++ if (fact) { ++ for (x = 0; x < size; x += 4) { ++ POS(x , y) = ((32 - fact) * ref[x + idx + 1] + ++ fact * ref[x + idx + 2] + 16) >> 5; ++ POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] + ++ fact * ref[x + 1 + idx + 2] + 16) >> 5; ++ POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] + ++ fact * ref[x + 2 + idx + 2] + 16) >> 5; ++ POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] + ++ fact * ref[x + 3 + idx + 2] + 16) >> 5; ++ } ++ } else { ++ for (x = 0; x < size; x += 4) ++ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1])); ++ } ++ } ++ if (mode == 26 && size < 32) { ++ for (y = 0; y < size; y++) ++ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1)); ++ } ++ ++ } else { ++ ref = left - 1; ++ if (angle < 0 && last < -1) { ++ for (x = 0; x <= size; x += 4) ++ AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1])); ++ // Inv angle <= -256 so top offset >= 0 ++ for (x = last; x <= -1; x++) ++ ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)]; ++ ref = ref_tmp; ++ } ++ ++ for (x = 0; x < size; x++) { ++ int idx = ((x + 1) * angle) >> 5; ++ int fact = ((x + 1) * angle) & 31; ++ if (fact) { 
++ for (y = 0; y < size; y++) { ++ POS(x, y) = ((32 - fact) * ref[y + idx + 1] + ++ fact * ref[y + idx + 2] + 16) >> 5; ++ } ++ } else { ++ for (y = 0; y < size; y++) ++ POS(x, y) = ref[y + idx + 1]; ++ } ++ } ++ if (mode == 10 && size < 32) { ++ for (x = 0; x < size; x += 4) { ++ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - left[-1]) >> 1)); ++ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - left[-1]) >> 1)); ++ POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - left[-1]) >> 1)); ++ POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - left[-1]) >> 1)); ++ } ++ } ++ } ++} ++#else ++static av_always_inline void FUNC(pred_angular)(uint8_t *_src, ++ const uint8_t *_top, ++ const uint8_t *_left, ++ ptrdiff_t stride, ++ int mode, int size) ++{ ++ int x, y; ++ c_dst_ptr_t src = (c_dst_ptr_t)_src; ++ c_src_ptr_t top = (c_src_ptr_t)_top; ++ c_src_ptr_t left = (c_src_ptr_t)_left; ++ ++ const int angle = intra_pred_angle[mode - 2]; ++ cpel ref_array[3 * MAX_TB_SIZE + 4][2]; ++ c_dst_ptr_t ref_tmp = ref_array + size; ++ c_src_ptr_t ref; ++ const int last = (size * angle) >> 5; ++ ++ if (mode >= 18) { ++ ref = top - 1; ++ if (angle < 0) { ++ memcpy(ref_tmp + 1, top, size * 2 * PW); ++ ref_tmp[0][0] = left[-1][0]; ++ ref_tmp[0][1] = left[-1][1]; ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c_src_ptr_t)ref_tmp; ++ } ++ ++ for (y = 0; y < size; y++, src += stride) { ++ const int idx = ((y + 1) * angle) >> 5; ++ const int fact = ((y + 1) * angle) & 31; ++ if (fact) { ++ for (x = 0; x < size; ++x) { ++ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] + ++ fact * ref[x + idx + 2][0] + 16) >> 5; ++ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] + ++ fact * ref[x + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ memcpy(src, ref + idx + 1, size * 2 * PW); ++ } ++ } ++ } else { ++ ref = left - 1; ++ if (angle < 0 && last < 
-1) { ++ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW); ++ for (x = last; x <= -1; x++) ++ { ++ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0]; ++ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1]; ++ } ++ ref = (c_src_ptr_t)ref_tmp; ++ } ++ ++ for (x = 0; x < size; x++, src++) { ++ const int idx = ((x + 1) * angle) >> 5; ++ const int fact = ((x + 1) * angle) & 31; ++ if (fact) { ++ for (y = 0; y < size; y++) { ++ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] + ++ fact * ref[y + idx + 2][0] + 16) >> 5; ++ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] + ++ fact * ref[y + idx + 2][1] + 16) >> 5; ++ } ++ } else { ++ for (y = 0; y < size; y++) ++ { ++ src[y * stride][0] = ref[y + idx + 1][0]; ++ src[y * stride][1] = ref[y + idx + 1][1]; ++ } ++ } ++ } ++ } ++} ++#endif ++ ++static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2); ++} ++ ++static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3); ++} ++ ++static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4); ++} ++ ++static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top, ++ const uint8_t *left, ++ ptrdiff_t stride, int mode) ++{ ++ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5); ++} ++ ++#undef cpel ++#undef c_src_ptr_t ++#undef c_dst_ptr_t ++ ++#undef EXTEND ++#undef POS ++#undef PW ++ ++#undef filter_light1 ++#undef filter_light ++#undef filter_strong ++#undef ref_gen ++ ++#ifndef INCLUDED_ONCE ++#define INCLUDED_ONCE ++#endif ++ +diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c +new file mode 100644 +index 0000000000..98a0b104b7 +--- 
/dev/null ++++ b/libavcodec/rpi_mailbox.c +@@ -0,0 +1,155 @@ ++/* ++Copyright (c) 2012, Broadcom Europe Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++*/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define MAJOR_NUM 100 ++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *) ++#define DEVICE_FILE_NAME "/dev/vcio" ++ ++#include "rpi_mailbox.h" ++//#include ++ ++/* ++ * use ioctl to send mbox property message ++ */ ++ ++static int mbox_property(int file_desc, void *buf) ++{ ++ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf); ++ ++ if (ret_val < 0) { ++ printf("ioctl_set_msg failed:%d\n", ret_val); ++ } ++ ++#ifdef DEBUG ++ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf; ++ for (i=0; i ++#include ++#include ++#include ++ ++#include "config.h" ++ ++#include "libavutil/avassert.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" ++#include ++#include ++#include ++#pragma GCC diagnostic pop ++ ++#include "rpi_mem.h" ++#include "rpi_zc_frames.h" ++ ++ ++#define OPT_PREFER_CMA 0 ++ ++struct rpi_cache_flush_env_s { ++ struct vcsm_user_clean_invalid2_s v; ++}; ++ ++ ++// GPU memory alloc fns (internal) ++ ++static void gpu_free_internal(GPU_MEM_PTR_T * const p) ++{ ++ if (p->arm != NULL) ++ vcsm_unlock_ptr(p->arm); ++ if (p->vcsm_handle != 0) ++ vcsm_free(p->vcsm_handle); ++ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again ++} ++ ++ ++static int gpu_malloc_internal(GPU_MEM_PTR_T * const p, ++ const int numbytes, const unsigned int cache_type, const char * const name) ++{ ++ memset(p, 0, sizeof(*p)); ++ p->numbytes = (numbytes + 255) & ~255; // Round up ++ ++ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Unable to alloc %d bytes from VCSM for %s\n", p->numbytes, name); ++ goto fail; ++ } ++ if ((p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Unable to 
VC handle from VCSM for %s\n", name); ++ goto fail; ++ } ++ if ((p->arm = vcsm_lock(p->vcsm_handle)) == NULL) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Unable to lock handle from VCSM for %s\n", name); ++ goto fail; ++ } ++ if ((p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "Unable to get VC addr from VCSM for %s\n", name); ++ goto fail; ++ } ++ ++ return 0; ++ ++fail: ++ gpu_free_internal(p); ++ return AVERROR(ENOMEM); ++} ++ ++// Public gpu fns ++ ++// Allocate memory on GPU ++// Fills in structure

containing ARM pointer, videocore handle, videocore memory address, numbytes ++// Returns 0 on success. ++// This allocates memory that will not be cached in ARM's data cache. ++// Therefore safe to use without data cache flushing. ++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p) ++{ ++ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, "ffmpeg uncached"); ++} ++ ++// This allocates data that will be ++// Cached in ARM L2 ++// Uncached in VPU L2 ++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p) ++{ ++ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, "ffmpeg cached"); ++} ++ ++void gpu_free(GPU_MEM_PTR_T * const p) { ++ gpu_free_internal(p); ++} ++ ++void rpi_mem_gpu_uninit(void) ++{ ++ vcsm_exit(); ++ bcm_host_deinit(); ++} ++ ++int rpi_mem_gpu_init(const unsigned int flags) ++{ ++ const int wants_cma = bcm_host_is_fkms_active(); ++ int use_cma; ++ ++ (void)flags; ++ ++ if (vcsm_init_ex(wants_cma ? 1 : 0, -1) == 0) ++ use_cma = 1; ++ else if (vcsm_init_ex(wants_cma ? 
0 : 1, -1) == 0) ++ use_cma = 0; ++ else ++ return AVERROR(EINVAL); ++ ++ bcm_host_init(); ++ ++ return use_cma + 1; ++} ++ ++// ---------------------------------------------------------------------------- ++// ++// Cache flush functions ++ ++#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s)) ++ ++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf) ++{ ++ rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf; ++ *rfe = (rpi_cache_flush_env_t){.v={.op_count = 0}}; ++ return rfe; ++} ++ ++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe) ++{ ++ // Nothing needed ++} ++ ++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe) ++{ ++ int rc = 0; ++ if (rfe->v.op_count != 0) { ++ if (vcsm_clean_invalid2(&rfe->v) != 0) ++ { ++ const int err = errno; ++ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", err); ++ rc = AVERROR(err); ++ } ++ rfe->v.op_count = 0; ++ } ++ return rc; ++} ++ ++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe) ++{ ++ int rc = rpi_cache_flush_execute(rfe);; ++ ++ return rc; ++} ++ ++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride) ++{ ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ ++ av_assert1(rfe->v.op_count <= CACHE_EL_MAX); ++ ++ b->invalidate_mode = mode; ++ b->block_count = blocks; ++ b->start_address = gm->arm + offset0; ++ b->block_size = block_size; ++ b->inter_block_stride = block_stride; ++} ++ ++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset, const unsigned int size) ++{ ++ // Deal with empty pointer trivially ++ 
if (gm == NULL || size == 0) ++ return; ++ ++ av_assert1(offset <= gm->numbytes); ++ av_assert1(size <= gm->numbytes); ++ av_assert1(offset + size <= gm->numbytes); ++ ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0); ++} ++ ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode) ++{ ++ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0); ++} ++ ++ ++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode) ++{ ++#if !RPI_ONE_BUF ++#error Fixme! (NIF) ++#endif ++ if (gpu_is_buf1(frame)) { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode); ++ } ++ else ++ { ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode); ++ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode); ++ } ++} ++ ++// Flush an area of a frame ++// Width, height, x0, y0 in luma pels ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma) ++{ ++ const unsigned int y_offset = frame->linesize[0] * y0; ++ const unsigned int y_size = frame->linesize[0] * height; ++ // Round UV up/down to get everything ++ const unsigned int uv_rnd = (1U << uv_shift) >> 1; ++ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift); ++ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset; ++ ++#if 0 ++ // *** frame->height is cropped height so not good ++ // As all unsigned they will also reject -ve ++ // Test individually as well as added to reject overflow ++ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped ++ av_assert0(n 
<= (unsigned int)frame->height); ++ av_assert0(start_line + n <= (unsigned int)frame->height); ++#endif ++ ++ if (!gpu_is_buf1(frame)) ++ { ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size); ++ } ++ } ++ else if (!av_rpi_is_sand_frame(frame)) ++ { ++ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame); ++ if (do_luma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size); ++ } ++ if (do_chroma) { ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size); ++ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size); ++ } ++ } ++ else ++ { ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int xshl = av_rpi_sand_frame_xshl(frame); ++ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1); ++ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C ++ av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX); ++ ++ if (do_chroma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1); ++ b->block_size = uv_size; ++ b->inter_block_stride = stride1 * stride2; ++ } ++ if (do_luma) ++ { ++ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++; ++ b->invalidate_mode = mode; ++ b->block_count = block_count; ++ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0); ++ b->block_size = y_size; ++ b->inter_block_stride = stride1 * stride2; ++ } ++ } 
++} ++ ++// Call this to clean and invalidate a region of memory ++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode) ++{ ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf); ++ rpi_cache_flush_add_gm_ptr(rfe, p, mode); ++ rpi_cache_flush_finish(rfe); ++} ++ +diff --git a/libavcodec/rpi_mem.h b/libavcodec/rpi_mem.h +new file mode 100644 +index 0000000000..a451079806 +--- /dev/null ++++ b/libavcodec/rpi_mem.h +@@ -0,0 +1,88 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#ifndef RPI_MEM_H ++#define RPI_MEM_H ++ ++typedef struct gpu_mem_ptr_s { ++ unsigned char *arm; // Pointer to memory mapped on ARM side ++ int vc_handle; // Videocore handle of relocatable memory ++ int vcsm_handle; // Handle for use by VCSM ++ int vc; // Address for use in GPU code ++ int numbytes; // Size of memory block ++} GPU_MEM_PTR_T; ++ ++// General GPU functions ++ ++#define GPU_INIT_GPU 1 ++#define GPU_INIT_CMA 2 ++ ++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p); ++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p); ++extern void gpu_free(GPU_MEM_PTR_T * const p); ++int rpi_mem_gpu_init(const unsigned int flags); ++void rpi_mem_gpu_uninit(void); ++ ++// Cache flush stuff ++ ++struct rpi_cache_flush_env_s; ++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t; ++ ++typedef struct {uint32_t t[33];} rpi_cache_buf_t; ++ ++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf); ++// Free env without flushing ++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe); ++// Do the accumulated flush & clear but do not free the env ++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe); ++// Do the accumulated flush & free the env ++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe); ++ ++typedef enum ++{ ++ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1, ++ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2, ++ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3 
++} rpi_cache_flush_mode_t; ++ ++struct AVFrame; ++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode); ++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode, ++ const unsigned int offset, const unsigned int size); ++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode, ++ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride); ++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode); ++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode, ++ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height, ++ const unsigned int uv_shift, const int do_luma, const int do_chroma); ++ ++// init, add, finish for one gm ptr ++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode); ++ ++#endif +diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c +new file mode 100644 +index 0000000000..cb7b96119e +--- /dev/null ++++ b/libavcodec/rpi_qpu.c +@@ -0,0 +1,776 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. 
++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox ++*/ ++ ++ ++#include ++#include ++#include ++#include ++#include ++#include "libavutil/avassert.h" ++ ++#include "config.h" ++ ++#include ++#include ++ ++#include ++ ++#include "rpi_mailbox.h" ++#include "rpi_mem.h" ++#include "rpi_qpu.h" ++#include "rpi_hevc_shader.h" ++#include "rpi_hevc_transform8.h" ++#include "rpi_hevc_transform10.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No) ++#define RPI_TRACE_TIME_VPU_QPU_WAIT 0 ++ ++// Add profile flags to all QPU requests - generates output in "vcdbg log msg" ++// Beware this is expensive and will probably throw off all other timing by >10% ++#define RPI_TRACE_QPU_PROFILE_ALL 0 ++ ++// QPU "noflush" flags ++// a mixture of flushing & profiling ++ ++#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed ++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers ++#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results 
++#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling ++#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed) ++ ++#define vcos_verify_ge0(x) ((x)>=0) ++ ++// Size in 32bit words ++#define QPU_CODE_SIZE 4098 ++#define VPU_CODE_SIZE 16384 ++ ++static const short rpi_transMatrix2even[32][16] = { // Even rows first ++{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64}, ++{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90}, ++{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89}, ++{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87}, ++{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83}, ++{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80}, ++{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75}, ++{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70}, ++{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64}, ++{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57}, ++{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50}, ++{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43}, ++{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36}, ++{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25}, ++{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18}, ++{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9}, ++// Odd rows ++{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4}, ++{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13}, ++{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22}, ++{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31}, ++{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38}, ++{78, -4, -82, -73, 13, 85, 67, 
-22, -88, -61, 31, 90, 54, -38, -90, -46}, ++{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54}, ++{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61}, ++{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67}, ++{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73}, ++{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78}, ++{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82}, ++{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85}, ++{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88}, ++{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90}, ++{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90} ++}; ++ ++// Code/constants on GPU ++struct GPU ++{ ++// unsigned int qpu_code[QPU_CODE_SIZE]; ++ unsigned int vpu_code8[VPU_CODE_SIZE]; ++ unsigned int vpu_code10[VPU_CODE_SIZE]; ++ short transMatrix2even[16*16*2]; ++}; ++ ++#define WAIT_COUNT_MAX 16 ++ ++typedef struct trace_time_one_s ++{ ++ int count; ++ int64_t start[WAIT_COUNT_MAX]; ++ int64_t total[WAIT_COUNT_MAX]; ++} trace_time_one_t; ++ ++typedef struct trace_time_wait_s ++{ ++ unsigned int jcount; ++ int64_t start0; ++ int64_t last_update; ++ trace_time_one_t active; ++ trace_time_one_t wait; ++} trace_time_wait_t; ++ ++typedef struct vq_wait_s ++{ ++ sem_t sem; ++ struct vq_wait_s * next; ++} vq_wait_t; ++ ++#define VQ_WAIT_POOL_SIZE 16 ++typedef struct vq_wait_pool_s ++{ ++ vq_wait_t * head; ++ vq_wait_t pool[VQ_WAIT_POOL_SIZE]; ++} vq_wait_pool_t; ++ ++static void vq_wait_pool_init(vq_wait_pool_t * const pool); ++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool); ++ ++typedef struct gpu_env_s ++{ ++ int open_count; ++ int init_count; ++ int vpu_i_cache_flushed; ++ GPU_MEM_PTR_T qpu_code_gm_ptr; ++ GPU_MEM_PTR_T code_gm_ptr; ++ GPU_MEM_PTR_T dummy_gm_ptr; ++ vq_wait_pool_t wait_pool; ++#if 
RPI_TRACE_TIME_VPU_QPU_WAIT ++ trace_time_wait_t ttw; ++#endif ++} gpu_env_t; ++ ++// Stop more than one thread trying to allocate memory or use the processing resources at once ++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER; ++static gpu_env_t * gpu = NULL; ++ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ ++static int64_t ns_time(void) ++{ ++ struct timespec ts; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec; ++} ++ ++ ++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000 ++ ++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U) ++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000) ++#define T_ARG(t) T_SEC(t), T_MS(t) ++#define T_FMT "%u.%03u" ++ ++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix) ++{ ++ // Update totals for levels that are still pending ++ for (int i = 0; i < tto->count; ++i) { ++ tto->total[i] += now - tto->start[i]; ++ tto->start[i] = now; ++ } ++ ++ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n", ++ prefix, ++ T_ARG(now - start0 - tto->total[0]), ++ T_ARG(tto->total[0]), ++ T_ARG(tto->total[1]), ++ T_ARG(tto->total[2]), ++ T_ARG(tto->total[3])); ++} ++ ++ ++static void tto_start(trace_time_one_t * const tto, const int64_t now) ++{ ++ av_assert0(tto->count < WAIT_COUNT_MAX); ++ tto->start[tto->count++] = now; ++} ++ ++static void tto_end(trace_time_one_t * const tto, const int64_t now) ++{ ++ const int n = --tto->count; ++ av_assert0(n >= 0); ++ tto->total[n] += now - tto->start[n]; ++} ++ ++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now) ++{ ++ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0)); ++ tto_print(&ttw->active, now, ttw->start0, "Active"); ++ tto_print(&ttw->wait, now, ttw->start0, " Wait"); ++} ++ ++#endif ++ ++// GPU memory alloc fns (internal) ++ ++static void gpu_free_internal(GPU_MEM_PTR_T * const p) 
++{ ++ if (p->arm != NULL) ++ vcsm_unlock_ptr(p->arm); ++ if (p->vcsm_handle != 0) ++ vcsm_free(p->vcsm_handle); ++ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again ++} ++ ++ ++static int gpu_malloc_internal(GPU_MEM_PTR_T * const p, ++ const int numbytes, const unsigned int cache_type, const char * const name) ++{ ++ memset(p, 0, sizeof(*p)); ++ p->numbytes = (numbytes + 255) & ~255; // Round up ++ ++ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 || ++ (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 || ++ (p->arm = vcsm_lock(p->vcsm_handle)) == NULL || ++ (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0) ++ { ++ gpu_free_internal(p); ++ return AVERROR(ENOMEM); ++ } ++ return 0; ++} ++ ++ ++// GPU init, free, lock, unlock ++ ++static void gpu_term(void) ++{ ++ gpu_env_t * const ge = gpu; ++ ++ // We have to hope that eveything has terminated... ++ gpu = NULL; ++ ++ vc_gpuserv_deinit(); ++ ++ gpu_free_internal(&ge->code_gm_ptr); ++ gpu_free_internal(&ge->qpu_code_gm_ptr); ++ gpu_free_internal(&ge->dummy_gm_ptr); ++ ++ vcsm_exit(); ++ ++ vq_wait_pool_deinit(&ge->wait_pool); ++ ++ free(ge); ++} ++ ++ ++// Connect to QPU, returns 0 on success. 
++// Allocate and populate a GPU environment: wait pool, QPU shader code,
++// VPU transform code + coefficients, and a dummy frame filled with 0x80.
++//
++// gpu: receives the new environment on success, NULL on failure.
++//      (This parameter shadows the file-scope 'gpu' variable inside
++//      this function.)
++//
++// Returns 0 on success, -1 if the environment itself cannot be allocated,
++// or the error returned by gpu_malloc_internal().  On any failure everything
++// acquired so far is released again, mirroring gpu_term(), so nothing leaks.
++static int gpu_init(gpu_env_t ** const gpu) {
++  volatile struct GPU* ptr;
++  gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
++  int rv;
++  *gpu = NULL;
++
++  if (ge == NULL)
++    return -1;
++
++  vq_wait_pool_init(&ge->wait_pool);
++
++  // NOTE(review): the result of vcsm_init() is not checked here; a failure
++  // is presumably caught by the first allocation below - confirm
++  vcsm_init();
++
++  // Now copy over the QPU code into GPU memory
++  if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0)
++    goto fail;
++
++  {
++    int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader;
++    av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
++    memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes);
++    // Zero the unused tail of the code buffer
++    memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes);
++  }
++
++  // And the VPU code
++  if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0)
++    goto fail;
++  ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
++
++  // Zero everything so we have zeros between the code bits
++  memset((void *)ptr, 0, sizeof(*ptr));
++  {
++    int num_bytes = sizeof(rpi_hevc_transform8);
++    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
++    memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
++  }
++  {
++    int num_bytes = sizeof(rpi_hevc_transform10);
++    av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
++    memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
++  }
++  // And the transform coefficients
++  memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
++
++  // Generate a dummy "frame" & fill with 0x80
++  // NOTE(review): the fill value could perhaps depend on bit depth - confirm
++  if ((rv = gpu_malloc_internal(&ge->dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0)
++    goto fail;
++  memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000);
++
++  *gpu = ge;
++  return 0;
++
++fail:
++  // ge came from calloc() and gpu_malloc_internal() clears its target on
++  // failure, so gpu_free_internal() finds nothing to release in entries
++  // that were never (successfully) allocated
++  gpu_free_internal(&ge->code_gm_ptr);
++  gpu_free_internal(&ge->qpu_code_gm_ptr);
++  gpu_free_internal(&ge->dummy_gm_ptr);
++
++  vcsm_exit();
++
++  vq_wait_pool_deinit(&ge->wait_pool);
++
++  free(ge);
++  return rv;
++}
++
++
++
++// Release gpu_mutex; pairs with gpu_lock() / a successful gpu_lock_ref().
++static void gpu_unlock(void) {
++  pthread_mutex_unlock(&gpu_mutex);
++}
++
++// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
++static gpu_env_t * gpu_lock(void) { ++ pthread_mutex_lock(&gpu_mutex); ++ ++ av_assert1(gpu != NULL); ++ return gpu; ++} ++ ++static gpu_env_t * gpu_lock_ref(void) ++{ ++ pthread_mutex_lock(&gpu_mutex); ++ ++ if (gpu == NULL) { ++ int rv = gpu_init(&gpu); ++ if (rv != 0) { ++ gpu_unlock(); ++ return NULL; ++ } ++ } ++ ++ ++gpu->open_count; ++ return gpu; ++} ++ ++static void gpu_unlock_unref(gpu_env_t * const ge) ++{ ++ if (--ge->open_count == 0) ++ gpu_term(); ++ ++ gpu_unlock(); ++} ++ ++static inline gpu_env_t * gpu_ptr(void) ++{ ++ av_assert1(gpu != NULL); ++ return gpu; ++} ++ ++unsigned int vpu_get_fn(const unsigned int bit_depth) { ++ uint32_t a = 0; ++ ++ // Make sure that the gpu is initialized ++ av_assert1(gpu != NULL); ++ switch (bit_depth){ ++ case 8: ++ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8); ++ break; ++ case 10: ++ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10); ++ break; ++ default: ++ av_assert0(0); ++ } ++ return a; ++} ++ ++unsigned int vpu_get_constants(void) { ++ av_assert1(gpu != NULL); ++ return (gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even)); ++} ++ ++void gpu_ref(void) ++{ ++ gpu_lock_ref(); ++ gpu_unlock(); ++} ++ ++void gpu_unref(void) ++{ ++ gpu_env_t * const ge = gpu_lock(); ++ gpu_unlock_unref(ge); ++} ++ ++// ---------------------------------------------------------------------------- ++ ++ ++// Wait abstractions - mostly so we can easily add profile code ++static void vq_wait_pool_init(vq_wait_pool_t * const wp) ++{ ++ unsigned int i; ++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { ++ sem_init(&wp->pool[i].sem, 0, 0); ++ wp->pool[i].next = wp->pool + i + 1; ++ } ++ wp->head = wp->pool + 0; ++ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL; ++} ++ ++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp) ++{ ++ unsigned int i; ++ wp->head = NULL; ++ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) { ++ sem_destroy(&wp->pool[i].sem); ++ wp->pool[i].next = NULL; ++ } ++} ++ ++ ++// If sem_init 
actually takes time then maybe we want a pool... ++static vq_wait_t * vq_wait_new(void) ++{ ++ gpu_env_t * const ge = gpu_lock_ref(); ++ vq_wait_t * const wait = ge->wait_pool.head; ++ ge->wait_pool.head = wait->next; ++ wait->next = NULL; ++ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ tto_start(&ge->ttw.active, ns_time()); ++#endif ++ ++ gpu_unlock(); ++ return wait; ++} ++ ++static void vq_wait_delete(vq_wait_t * const wait) ++{ ++ gpu_env_t * const ge = gpu_lock(); ++ wait->next = ge->wait_pool.head; ++ ge->wait_pool.head = wait; ++ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ { ++ trace_time_wait_t * const ttw = &ge->ttw; ++ const int64_t now = ns_time(); ++ ++ttw->jcount; ++ tto_end(&ttw->wait, now); ++ ++ if (ttw->start0 == 0) ++ { ++ ttw->start0 = ttw->active.start[0]; ++ ttw->last_update = ttw->start0; ++ } ++ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD) ++ { ++ ttw->last_update += WAIT_TIME_PRINT_PERIOD; ++ ttw_print(ttw, now); ++ } ++ } ++#endif ++ gpu_unlock_unref(ge); ++} ++ ++static void vq_wait_wait(vq_wait_t * const wait) ++{ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ { ++ const int64_t now = ns_time(); ++ gpu_env_t * const ge = gpu_lock(); ++ tto_start(&ge->ttw.wait, now); ++ gpu_unlock(); ++ } ++#endif ++ ++ while (sem_wait(&wait->sem) == -1 && errno == EINTR) ++ /* loop */; ++} ++ ++static void vq_wait_post(vq_wait_t * const wait) ++{ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ { ++ gpu_env_t *const ge = gpu_lock(); ++ tto_end(&ge->ttw.active, ns_time()); ++ gpu_unlock(); ++ } ++#endif ++ ++ sem_post(&wait->sem); ++} ++ ++ ++ ++// Header comments were wrong for these two ++#define VPU_QPU_MASK_QPU 1 ++#define VPU_QPU_MASK_VPU 2 ++ ++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; ++ ++vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf) ++{ ++// vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t)); ++ vpu_qpu_job_env_t * vqj = buf; ++// memset(vqj, 0, sizeof(*vqj)); ++ vqj->n = 0; ++ vqj->mask = 0; ++ return vqj; ++} ++ ++void 
vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj) ++{ ++// memset(vqj, 0, sizeof(*vqj)); ++// free(vqj); ++} ++ ++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj) ++{ ++ struct gpu_job_s * const j = vqj->j + vqj->n++; ++ av_assert1(vqj->n <= VPU_QPU_JOB_MAX); ++ return j; ++} ++ ++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code, ++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5) ++{ ++ if (vpu_code != 0) { ++ struct gpu_job_s *const j = new_job(vqj); ++ vqj->mask |= VPU_QPU_MASK_VPU; ++ ++ j->command = EXECUTE_VPU; ++ j->callback.func = 0; ++ j->callback.cookie = NULL; ++ // The bottom two bits of the execute address contain no-flush flags ++ // b0 will flush the VPU I-cache if unset so we nearly always want that set ++ // as we never reload code ++ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed; ++ j->u.v.q[1] = r0; ++ j->u.v.q[2] = r1; ++ j->u.v.q[3] = r2; ++ j->u.v.q[4] = r3; ++ j->u.v.q[5] = r4; ++ j->u.v.q[6] = r5; ++ gpu->vpu_i_cache_flushed = 1; ++ } ++} ++ ++// flags are QPU_FLAGS_xxx ++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail) ++{ ++ if (n != 0) { ++ struct gpu_job_s *const j = new_job(vqj); ++ vqj->mask |= VPU_QPU_MASK_QPU; ++ ++ j->command = EXECUTE_QPU; ++ j->callback.func = 0; ++ j->callback.cookie = NULL; ++ ++ j->u.q.jobs = n; ++#if RPI_TRACE_QPU_PROFILE_ALL ++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS; ++#else ++ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU; ++#endif ++ j->u.q.timeout = 5000; ++ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t)); ++ } ++} ++ ++// Convert callback to sem post ++static void vpu_qpu_job_callback_wait(void * v) ++{ ++ vq_wait_post(v); ++} ++ ++// Poke a user-supplied sem ++static void vpu_qpu_job_callback_sem(void * v) ++{ ++ sem_post((sem_t *)v); ++} ++ 
++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h) ++{ ++ vq_wait_t * wait; ++ ++ if (vqj->mask == 0) { ++ *wait_h = NULL; ++ return; ++ } ++ ++ // We are going to want a sync object ++ wait = vq_wait_new(); ++ ++ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync ++ // If we only posted one thing or only QPU jobs ++ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) ++ { ++ struct gpu_job_s * const j = vqj->j + (vqj->n - 1); ++ av_assert1(j->callback.func == 0); ++ ++ j->callback.func = vpu_qpu_job_callback_wait; ++ j->callback.cookie = wait; ++ } ++ else ++ { ++ struct gpu_job_s *const j = new_job(vqj); ++ ++ j->command = EXECUTE_SYNC; ++ j->u.s.mask = vqj->mask; ++ j->callback.func = vpu_qpu_job_callback_wait; ++ j->callback.cookie = wait; ++ } ++ ++ vqj->mask = 0; ++ *wait_h = wait; ++} ++ ++// Returns 0 if no sync added ('cos Q empty), 1 if sync added ++int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem) ++{ ++ // If nothing on q then just return ++ if (vqj->mask == 0) ++ return 0; ++ ++ // There are 2 VPU Qs & 1 QPU Q so we can collapse sync ++ // If we only posted one thing or only QPU jobs ++ if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU) ++ { ++ struct gpu_job_s * const j = vqj->j + (vqj->n - 1); ++ av_assert1(j->callback.func == 0); ++ ++ j->callback.func = vpu_qpu_job_callback_sem; ++ j->callback.cookie = sem; ++ } ++ else ++ { ++ struct gpu_job_s *const j = new_job(vqj); ++ ++ j->command = EXECUTE_SYNC; ++ j->u.s.mask = vqj->mask; ++ j->callback.func = vpu_qpu_job_callback_sem; ++ j->callback.cookie = sem; ++ } ++ ++ vqj->mask = 0; ++ return 1; ++} ++ ++ ++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj) ++{ ++ if (vqj->n == 0) ++ return 0; ++ ++ return vc_gpuserv_execute_code(vqj->n, vqj->j); ++} ++ ++// Simple wrapper of start + delete ++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj) ++{ ++ int rv; ++ rv = vpu_qpu_job_start(vqj); ++ vpu_qpu_job_delete(vqj); ++ 
return rv; ++} ++ ++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h) ++{ ++ if (wait_h != NULL) ++ { ++ vq_wait_t * const wait = *wait_h; ++ if (wait != NULL) { ++ *wait_h = NULL; ++ vq_wait_wait(wait); ++ vq_wait_delete(wait); ++ } ++ } ++} ++ ++int vpu_qpu_init() ++{ ++ gpu_env_t * const ge = gpu_lock_ref(); ++ if (ge == NULL) ++ return -1; ++ ++ if (ge->init_count++ == 0) ++ { ++ vc_gpuserv_init(); ++ } ++ ++ gpu_unlock(); ++ return 0; ++} ++ ++void vpu_qpu_term() ++{ ++ gpu_env_t * const ge = gpu_lock(); ++ ++ if (--ge->init_count == 0) { ++ vc_gpuserv_deinit(); ++ ++#if RPI_TRACE_TIME_VPU_QPU_WAIT ++ ttw_print(&ge->ttw, ns_time()); ++#endif ++ } ++ ++ gpu_unlock_unref(ge); ++} ++ ++uint32_t qpu_fn(const int * const mc_fn) ++{ ++ return gpu->qpu_code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader); ++} ++ ++uint32_t qpu_dummy(void) ++{ ++ return gpu->dummy_gm_ptr.vc; ++} ++ ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth) ++{ ++ // Dummy values we can catch with emulation ++ qf->y_pxx = ~1U; ++ qf->y_bxx = ~2U; ++ qf->y_p00 = ~3U; ++ qf->y_b00 = ~4U; ++ qf->c_pxx = ~5U; ++ qf->c_bxx = ~6U; ++ ++ switch (bit_depth) { ++ case 8: ++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); ++ qf->y_pxx = qpu_fn(mc_filter_y_pxx); ++ qf->y_bxx = qpu_fn(mc_filter_y_bxx); ++ qf->y_p00 = qpu_fn(mc_filter_y_p00); ++ qf->y_b00 = qpu_fn(mc_filter_y_b00); ++ qf->c_pxx = qpu_fn(mc_filter_c_p); ++ qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1); ++ qf->c_bxx = qpu_fn(mc_filter_c_b); ++ break; ++ case 10: ++ qf->c_pxx = qpu_fn(mc_filter_c10_p); ++ qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1); ++ qf->c_bxx = qpu_fn(mc_filter_c10_b); ++ qf->y_pxx = qpu_fn(mc_filter_y10_pxx); ++ qf->y_bxx = qpu_fn(mc_filter_y10_bxx); ++ qf->y_p00 = qpu_fn(mc_filter_y10_p00); ++ qf->y_b00 = qpu_fn(mc_filter_y10_b00); ++ break; ++ default: ++ return -1; ++ } ++ return 0; ++} ++ +diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h +new file mode 100644 +index 
0000000000..8777687021 +--- /dev/null ++++ b/libavcodec/rpi_qpu.h +@@ -0,0 +1,103 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#ifndef RPI_QPU_H ++#define RPI_QPU_H ++ ++#include "rpi_mem.h" ++#include "rpi_zc_frames.h" ++ ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" ++#pragma GCC diagnostic ignored "-Wstrict-prototypes" ++#include "interface/vmcs_host/vc_vchi_gpuserv.h" // for gpu_job_s ++#pragma GCC diagnostic pop ++ ++// QPU specific functions ++ ++typedef struct HEVCRpiQpu { ++ uint32_t c_pxx; ++ uint32_t c_pxx_l1; ++ uint32_t c_bxx; ++ uint32_t y_pxx; ++ uint32_t y_bxx; ++ uint32_t y_p00; ++ uint32_t y_b00; ++} HEVCRpiQpu; ++ ++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth); ++ ++uint32_t qpu_fn(const int * const mc_fn); ++uint32_t qpu_dummy(void); ++ ++#define QPU_N_GRP 4 ++#define QPU_N_MAX 12 ++ ++#define QPU_MAIL_EL_VALS 2 ++ ++struct vpu_qpu_wait_s; ++typedef struct vq_wait_s * vpu_qpu_wait_h; ++ ++// VPU specific functions ++ ++struct vpu_qpu_job_env_s; ++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h; ++ ++#define VPU_QPU_JOB_MAX 4 ++struct vpu_qpu_job_env_s ++{ ++ unsigned int n; ++ unsigned int mask; ++ struct gpu_job_s j[VPU_QPU_JOB_MAX]; ++}; ++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t; ++ ++vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf); ++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj); ++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code, ++ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5); ++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail); ++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h); ++int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem); ++int vpu_qpu_job_start(const vpu_qpu_job_h vqj); ++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj); ++ ++extern unsigned int 
vpu_get_fn(const unsigned int bit_depth); ++extern unsigned int vpu_get_constants(void); ++ ++// Waits for previous post_codee to complete and Will null out *wait_h after use ++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h); ++int vpu_qpu_init(void); ++void vpu_qpu_term(void); ++ ++void gpu_ref(void); ++void gpu_unref(void); ++ ++#endif +diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c +new file mode 100644 +index 0000000000..37be9a0f49 +--- /dev/null ++++ b/libavcodec/rpi_zc.c +@@ -0,0 +1,1227 @@ ++#include "config.h" ++ ++#include "libavcodec/avcodec.h" ++#include "rpi_mem.h" ++#include "rpi_mailbox.h" ++#include "rpi_zc.h" ++#include "libavutil/avassert.h" ++#include ++ ++#include "libavutil/buffer_internal.h" ++ ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" ++#include ++#include ++#pragma GCC diagnostic pop ++ ++#define TRACE_ALLOC 0 ++#define DEBUG_ALWAYS_KEEP_LOCKED 0 ++ ++struct ZcPoolEnt; ++ ++typedef struct ZcPool ++{ ++ size_t numbytes; ++ struct ZcPoolEnt * head; ++ pthread_mutex_t lock; ++} ZcPool; ++ ++typedef struct ZcPoolEnt ++{ ++ size_t numbytes; ++ ++ unsigned int vcsm_handle; ++ unsigned int vc_handle; ++ void * map_arm; ++ unsigned int map_vc; ++ ++ struct ZcPoolEnt * next; ++ struct ZcPool * pool; ++} ZcPoolEnt; ++ ++typedef struct ZcOldCtxVals ++{ ++ int thread_safe_callbacks; ++ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags); ++ void * opaque; ++} ZcOldCtxVals; ++ ++typedef struct AVZcEnv ++{ ++ unsigned int refcount; ++ ZcOldCtxVals old; ++ ++ void * pool_env; ++ av_rpi_zc_alloc_buf_fn_t * alloc_buf; ++ av_rpi_zc_free_pool_fn_t * free_pool; ++ ++ unsigned int pool_size; ++} ZcEnv; ++ ++typedef struct ZcUserBufEnv { ++ void * v; ++ const av_rpi_zc_buf_fn_tab_t * fn; ++ size_t numbytes; ++ int offset; ++} ZcUserBufEnv; ++ ++#define ZC_BUF_INVALID 0 ++#define ZC_BUF_VALID 1 ++#define ZC_BUF_NEVER 2 ++ ++typedef struct 
ZcBufEnv { ++ GPU_MEM_PTR_T gmem; ++ AVZcEnvPtr zc; ++ int is_valid; ++ AVBufferRef * user; ++ AVRpiZcFrameGeometry geo; ++ size_t size_y; ++ size_t size_c; ++ size_t size_pic; ++ ssize_t offset; ++ pthread_mutex_t lock; ++ pthread_cond_t cond; ++} ZcBufEnv; ++ ++ ++ ++ ++ ++ ++#define ALLOC_PAD 0 ++#define ALLOC_ROUND 0x1000 ++#define STRIDE_ROUND 64 ++#define STRIDE_OR 0 ++ ++#define DEBUG_ZAP0_BUFFERS 0 ++ ++static inline int av_rpi_is_sand_format(const int format) ++{ ++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16) || ++ (format == AV_PIX_FMT_RPI4_8 || format == AV_PIX_FMT_RPI4_10); ++} ++ ++static inline int av_rpi_is_sand_frame(const AVFrame * const frame) ++{ ++ return av_rpi_is_sand_format(frame->format); ++} ++ ++//---------------------------------------------------------------------------- ++// ++// Internal pool stuff ++ ++// Pool entry functions ++ ++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const size_t req_size) ++{ ++ ZcPoolEnt * const zp = av_mallocz(sizeof(ZcPoolEnt)); ++ ++ // Round up to 4k & add 4k ++ const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1); ++ ++ if (zp == NULL) { ++ av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n"); ++ goto fail0; ++ } ++ ++ // The 0x80 here maps all pages here rather than waiting for lazy mapping ++ // BEWARE that in GPU land a later unlock/lock pair will put us back into ++ // lazy mode - which will also break cache invalidate calls. 
++ if ((zp->vcsm_handle = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST | 0x80, "ffmpeg_rpi_zc")) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%d) failed\n", alloc_size); ++ goto fail1; ++ } ++ ++#if TRACE_ALLOC ++ printf("%s: Alloc %#x bytes @ h=%d\n", __func__, alloc_size, zp->vcsm_handle); ++#endif ++ ++ zp->numbytes = alloc_size; ++ zp->pool = pool; ++ return zp; ++ ++fail1: ++ av_free(zp); ++fail0: ++ return NULL; ++} ++ ++static void zc_pool_ent_free(ZcPoolEnt * const zp) ++{ ++#if TRACE_ALLOC ++ printf("%s: Free %#x bytes @ h=%d\n", __func__, zp->numbytes, zp->vcsm_handle); ++#endif ++ ++ if (zp->vcsm_handle != 0) ++ { ++ // VC addr & handle need no dealloc ++ if (zp->map_arm != NULL) ++ vcsm_unlock_hdl(zp->vcsm_handle); ++ vcsm_free(zp->vcsm_handle); ++ } ++ av_free(zp); ++} ++ ++//---------------------------------------------------------------------------- ++// ++// Pool functions ++ ++static void zc_pool_free_ent_list(ZcPoolEnt * p) ++{ ++ while (p != NULL) ++ { ++ ZcPoolEnt * const zp = p; ++ p = p->next; ++ zc_pool_ent_free(zp); ++ } ++} ++ ++static void zc_pool_flush(ZcPool * const pool) ++{ ++ ZcPoolEnt * p = pool->head; ++ pool->head = NULL; ++ pool->numbytes = ~0U; ++ zc_pool_free_ent_list(p); ++} ++ ++static ZcPoolEnt * zc_pool_get_ent(ZcPool * const pool, const size_t req_bytes) ++{ ++ ZcPoolEnt * zp = NULL; ++ ZcPoolEnt * flush_list = NULL; ++ size_t numbytes; ++ ++ pthread_mutex_lock(&pool->lock); ++ ++ numbytes = pool->numbytes; ++ ++ // If size isn't close then dump the pool ++ // Close in this context means within 128k ++ if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes) ++ { ++ flush_list = pool->head; ++ pool->head = NULL; ++ pool->numbytes = numbytes = req_bytes; ++ } ++ else if (pool->head != NULL) ++ { ++ zp = pool->head; ++ pool->head = zp->next; ++ } ++ ++ pthread_mutex_unlock(&pool->lock); ++ ++ zc_pool_free_ent_list(flush_list); ++ ++ if (zp == NULL) ++ zp = zc_pool_ent_alloc(pool, numbytes); ++ ++ 
return zp; ++} ++ ++static void zc_pool_put_ent(ZcPoolEnt * const zp) ++{ ++ ZcPool * const pool = zp == NULL ? NULL : zp->pool; ++ if (zp != NULL) ++ { ++ pthread_mutex_lock(&pool->lock); ++#if TRACE_ALLOC ++ printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->numbytes); ++#endif ++ ++ if (pool->numbytes == zp->numbytes) ++ { ++ zp->next = pool->head; ++ pool->head = zp; ++ pthread_mutex_unlock(&pool->lock); ++ } ++ else ++ { ++ pthread_mutex_unlock(&pool->lock); ++ zc_pool_ent_free(zp); ++ } ++ } ++} ++ ++static ZcPool * ++zc_pool_new(void) ++{ ++ ZcPool * const pool = av_mallocz(sizeof(*pool)); ++ if (pool == NULL) ++ return NULL; ++ ++ pool->numbytes = -1; ++ pool->head = NULL; ++ pthread_mutex_init(&pool->lock, NULL); ++ return pool; ++} ++ ++static void ++zc_pool_delete(ZcPool * const pool) ++{ ++ if (pool != NULL) ++ { ++ pool->numbytes = -1; ++ zc_pool_flush(pool); ++ pthread_mutex_destroy(&pool->lock); ++ av_free(pool); ++ } ++} ++ ++//============================================================================ ++// ++// ZC implementation using above pool implementation ++// ++// Fn table fns... 
++ ++static void zc_pool_free_v(void * v) ++{ ++ zc_pool_put_ent(v); ++} ++ ++static unsigned int zc_pool_ent_vcsm_handle_v(void * v) ++{ ++ ZcPoolEnt * zp = v; ++ return zp->vcsm_handle; ++} ++ ++static unsigned int zc_pool_ent_vc_handle_v(void * v) ++{ ++ ZcPoolEnt * zp = v; ++ if (zp->vc_handle == 0) ++ { ++ if ((zp->vc_handle = vcsm_vc_hdl_from_hdl(zp->vcsm_handle)) == 0) ++ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC handle\n", ++ __func__, zp->vcsm_handle); ++ } ++ return zp->vc_handle; ++} ++ ++static void * zc_pool_ent_map_arm_v(void * v) ++{ ++ ZcPoolEnt * zp = v; ++ if (zp->map_arm == NULL) ++ { ++ if ((zp->map_arm = vcsm_lock(zp->vcsm_handle)) == NULL) ++ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to ARM address\n", ++ __func__, zp->vcsm_handle); ++ } ++ return zp->map_arm; ++} ++ ++static unsigned int zc_pool_ent_map_vc_v(void * v) ++{ ++ ZcPoolEnt * zp = v; ++ if (zp->map_vc == 0) ++ { ++ if ((zp->map_vc = vcsm_vc_addr_from_hdl(zp->vcsm_handle)) == 0) ++ av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC address\n", ++ __func__, zp->vcsm_handle); ++ } ++ return zp->map_vc; ++} ++ ++static const av_rpi_zc_buf_fn_tab_t zc_pool_buf_fns = { ++ .free = zc_pool_free_v, ++ .vcsm_handle = zc_pool_ent_vcsm_handle_v, ++ .vc_handle = zc_pool_ent_vc_handle_v, ++ .map_arm = zc_pool_ent_map_arm_v, ++ .map_vc = zc_pool_ent_map_vc_v, ++}; ++ ++// ZC Env fns ++ ++// Delete pool ++// All buffers guaranteed freed by now ++static void ++zc_pool_delete_v(void * v) ++{ ++ zc_pool_delete((ZcPool *)v); ++ rpi_mem_gpu_uninit(); ++} ++ ++// Allocate a new ZC buffer ++static AVBufferRef * ++zc_pool_buf_alloc(void * v, size_t size, const AVRpiZcFrameGeometry * geo) ++{ ++ ZcPool * const pool = v; ++ ZcPoolEnt *const zp = zc_pool_get_ent(pool, size); ++ AVBufferRef * buf; ++ ++ (void)geo; // geo ignored here ++ ++ if (zp == NULL) { ++ av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size); ++ goto fail0; ++ } ++ ++ 
if ((buf = av_rpi_zc_buf(size, 0, zp, &zc_pool_buf_fns)) == NULL) ++ { ++ av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_buf() failed\n"); ++ goto fail2; ++ } ++ ++ return buf; ++ ++fail2: ++ zc_pool_put_ent(zp); ++fail0: ++ return NULL; ++} ++ ++// Init wrappers - the public fns ++ ++AVZcEnvPtr ++av_rpi_zc_int_env_alloc(void * logctx) ++{ ++ ZcEnv * zc; ++ ZcPool * pool_env; ++ ++ if (rpi_mem_gpu_init(0) < 0) ++ return NULL; ++ ++ if ((pool_env = zc_pool_new()) == NULL) ++ goto fail1; ++ ++ if ((zc = av_rpi_zc_env_alloc(logctx, pool_env, zc_pool_buf_alloc, zc_pool_delete_v)) == NULL) ++ goto fail2; ++ ++ return zc; ++ ++fail2: ++ zc_pool_delete(pool_env); ++fail1: ++ rpi_mem_gpu_uninit(); ++ return NULL; ++} ++ ++void ++av_rpi_zc_int_env_freep(AVZcEnvPtr * zcp) ++{ ++ const AVZcEnvPtr zc = *zcp; ++ *zcp = NULL; ++ if (zc != NULL) ++ av_rpi_zc_env_release(zc); ++} ++ ++//============================================================================ ++// ++// Geometry ++// ++// This is a separate chunck to the rest ++ ++// Get mailbox fd - should be in a lock when called ++// Rely on process close to close it ++static int mbox_fd(void) ++{ ++ static int fd = -1; ++ if (fd != -1) ++ return fd; ++ return (fd = mbox_open()); ++} ++ ++AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( ++ const int format, const unsigned int video_width, const unsigned int video_height) ++{ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ ++ AVRpiZcFrameGeometry geo = { ++ .format = format, ++ .video_width = video_width, ++ .video_height = video_height ++ }; ++ ++ switch (format) ++ { ++ case AV_PIX_FMT_YUV420P: ++ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++ geo.stride_c = geo.stride_y / 2; ++ geo.height_y = (video_height + 32 + 31) & ~31; ++ geo.height_c = geo.height_y / 2; ++ geo.planes_c = 2; ++ geo.stripes = 1; ++ geo.bytes_per_pel = 1; ++ geo.stripe_is_yc = 1; ++ break; ++ ++ case AV_PIX_FMT_YUV420P10: ++ geo.stride_y = 
((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR; ++ geo.stride_c = geo.stride_y / 2; ++ geo.height_y = (video_height + 32 + 31) & ~31; ++ geo.height_c = geo.height_y / 2; ++ geo.planes_c = 2; ++ geo.stripes = 1; ++ geo.bytes_per_pel = 2; ++ geo.stripe_is_yc = 1; ++ break; ++ ++ case AV_PIX_FMT_SAND128: ++ case AV_PIX_FMT_RPI4_8: ++ { ++ const unsigned int stripe_w = 128; ++ ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV_UV, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ mbox_get_image_params(mbox_fd(), &new_img); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.stripe_is_yc = 1; ++ if (geo.height_y * stripe_w > img.pitch) ++ { ++ // "tall" sand - all C blocks now follow Y ++ geo.height_y = img.pitch / stripe_w; ++ geo.height_c = geo.height_y; ++ geo.stripe_is_yc = 0; ++ } ++ geo.planes_c = 1; ++ geo.stripes = (video_width + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 1; ++ ++ pthread_mutex_unlock(&sand_lock); ++#if 0 ++ printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n", ++ video_width, video_height, ++ geo.stride_y, geo.stride_c, ++ geo.height_y, geo.height_c, ++ geo.stripes, img.pitch); ++#endif ++ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); ++ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); ++ break; ++ } ++ ++ case AV_PIX_FMT_RPI4_10: ++ { ++ const unsigned int stripe_w = 128; // bytes ++ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ 
static VC_IMAGE_T img = {0}; ++ ++ // Given the overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV10COL, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ mbox_get_image_params(mbox_fd(), &new_img); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.planes_c = 1; ++ geo.stripes = ((video_width * 4 + 2) / 3 + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 1; ++ geo.stripe_is_yc = 1; ++ ++ pthread_mutex_unlock(&sand_lock); ++ ++#if 0 ++ printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n", ++ video_width, video_height, ++ geo.stride_y, geo.stride_c, ++ geo.height_y, geo.height_c, ++ geo.stripes, img.pitch); ++#endif ++ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0); ++ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2); ++ break; ++ } ++ ++ case AV_PIX_FMT_SAND64_16: ++ case AV_PIX_FMT_SAND64_10: ++ { ++ const unsigned int stripe_w = 128; // bytes ++ ++ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER; ++ static VC_IMAGE_T img = {0}; ++ ++ // Given the overhead of calling the mailbox keep a stashed ++ // copy as we will almost certainly just want the same numbers again ++ // but that means we need a lock ++ pthread_mutex_lock(&sand_lock); ++ ++ if (img.width != video_width || img.height != video_height) ++ { ++ VC_IMAGE_T new_img = { ++ .type = VC_IMAGE_YUV_UV_16, ++ .width = video_width, ++ .height = video_height ++ }; ++ ++ mbox_get_image_params(mbox_fd(), &new_img); ++ img = new_img; ++ } ++ ++ geo.stride_y = stripe_w; ++ geo.stride_c = stripe_w; ++ 
geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w; ++ geo.height_c = img.pitch / stripe_w - geo.height_y; ++ geo.planes_c = 1; ++ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w; ++ geo.bytes_per_pel = 2; ++ geo.stripe_is_yc = 1; ++ ++ pthread_mutex_unlock(&sand_lock); ++ break; ++ } ++ ++ default: ++ break; ++ } ++ return geo; ++} ++ ++//============================================================================ ++// ++// ZC Env fns ++// ++// Frame copy fns ++ ++static AVBufferRef * zc_copy(const AVZcEnvPtr zc, ++ const AVFrame * const src) ++{ ++ AVFrame dest_frame; ++ AVFrame * const dest = &dest_frame; ++ unsigned int i; ++ uint8_t * psrc, * pdest; ++ ++ dest->format = src->format; ++ dest->width = src->width; ++ dest->height = src->height; ++ ++ if (av_rpi_zc_get_buffer(zc, dest) != 0 || ++ av_rpi_zc_resolve_frame(dest, ZC_RESOLVE_ALLOC_VALID) != 0) ++ { ++ return NULL; ++ } ++ ++ for (i = 0, psrc = src->data[0], pdest = dest->data[0]; ++ i != dest->height; ++ ++i, psrc += src->linesize[0], pdest += dest->linesize[0]) ++ { ++ memcpy(pdest, psrc, dest->width); ++ } ++ for (i = 0, psrc = src->data[1], pdest = dest->data[1]; ++ i != dest->height / 2; ++ ++i, psrc += src->linesize[1], pdest += dest->linesize[1]) ++ { ++ memcpy(pdest, psrc, dest->width / 2); ++ } ++ for (i = 0, psrc = src->data[2], pdest = dest->data[2]; ++ i != dest->height / 2; ++ ++i, psrc += src->linesize[2], pdest += dest->linesize[2]) ++ { ++ memcpy(pdest, psrc, dest->width / 2); ++ } ++ ++ return dest->buf[0]; ++} ++ ++ ++static AVBufferRef * zc_420p10_to_sand128(const AVZcEnvPtr zc, ++ const AVFrame * const src) ++{ ++ assert(0); ++ return NULL; ++} ++ ++ ++static AVBufferRef * zc_sand64_16_to_sand128(const AVZcEnvPtr zc, ++ const AVFrame * const src, const unsigned int src_bits) ++{ ++ assert(0); ++ return NULL; ++} ++ ++//---------------------------------------------------------------------------- ++// ++// Public info extraction calls ++ ++static 
void zc_buf_env_free_cb(void * opaque, uint8_t * data); ++ ++static inline ZcBufEnv * pic_zbe_ptr(AVBufferRef *const buf) ++{ ++ // Kludge where we check the free fn to check this is really ++ // one of our buffers - can't think of a better way ++ return buf == NULL || buf->buffer->free != zc_buf_env_free_cb ? NULL : ++ av_buffer_get_opaque(buf); ++} ++ ++static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf) ++{ ++ // As gmem is the first el NULL should be preserved ++ return &pic_zbe_ptr(buf)->gmem; ++} ++ ++unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref) ++{ ++ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); ++ return p == NULL ? 0 : p->vcsm_handle; ++} ++ ++int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref) ++{ ++ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); ++ return p == NULL ? -1 : p->vc_handle; ++} ++ ++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref) ++{ ++ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); ++ return zbe == NULL ? 0 : zbe->offset; ++} ++ ++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref) ++{ ++ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); ++ return zbe == NULL ? 0 : zbe->size_pic; ++} ++ ++int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref) ++{ ++ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref); ++ return p == NULL ? 0 : p->numbytes; ++} ++ ++const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref) ++{ ++ const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref); ++ return zbe == NULL ? 
NULL : &zbe->geo; ++} ++ ++AVRpiZcRefPtr av_rpi_zc_ref(void * const logctx, const AVZcEnvPtr zc, ++ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy) ++{ ++ av_assert0(!maycopy || zc != NULL); ++ ++ if (frame->format != AV_PIX_FMT_YUV420P && ++ frame->format != AV_PIX_FMT_YUV420P10 && ++ !av_rpi_is_sand_frame(frame)) ++ { ++ av_log(logctx, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format); ++ return NULL; ++ } ++ ++ if (frame->buf[1] != NULL || frame->format != expected_format) ++ { ++#if RPI_ZC_SAND_8_IN_10_BUF ++ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL) ++ { ++// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__); ++ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]); ++ } ++#endif ++ ++ if (maycopy) ++ { ++ if (frame->buf[1] != NULL) ++ av_log(logctx, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__); ++ else ++ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format); ++ ++ switch (frame->format) ++ { ++ case AV_PIX_FMT_YUV420P10: ++ return zc_420p10_to_sand128(zc, frame); ++ ++ case AV_PIX_FMT_SAND64_10: ++ return zc_sand64_16_to_sand128(zc, frame, 10); ++ ++ default: ++ return zc_copy(zc, frame); ++ } ++ } ++ else ++ { ++ if (frame->buf[1] != NULL) ++ av_log(logctx, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__); ++ else ++ av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format); ++ return NULL; ++ } ++ } ++ ++ if (pic_gm_ptr(frame->buf[0]) == NULL) ++ { ++ if (maycopy) ++ { ++ av_log(logctx, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__); ++ return zc_copy(zc, frame); ++ } ++ else ++ { ++ av_log(logctx, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__); ++ return NULL; ++ 
} ++ } ++ ++ return av_buffer_ref(frame->buf[0]); ++} ++ ++void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref) ++{ ++ if (fr_ref != NULL) ++ { ++ av_buffer_unref(&fr_ref); ++ } ++} ++ ++//---------------------------------------------------------------------------- ++ ++// Extract user environment from an AVBufferRef ++void * av_rpi_zc_buf_v(AVBufferRef * const buf) ++{ ++ ZcBufEnv * const zbe = pic_zbe_ptr(buf); ++ if (zbe != NULL && zbe->user != NULL) ++ { ++ const ZcUserBufEnv * const zub = (const ZcUserBufEnv *)zbe->user->data; ++ return zub == NULL ? NULL : zub->v; ++ } ++ return NULL; ++} ++ ++// AV buffer pre-free callback ++static void zc_user_buf_free_cb(void * opaque, uint8_t * data) ++{ ++ if (opaque != NULL) ++ { ++ ZcUserBufEnv * const zub = opaque; ++ ++ if (zub->fn->free) ++ zub->fn->free(zub->v); ++ ++ av_free(zub); ++ } ++} ++ ++static void zc_buf_env_free_cb(void * opaque, uint8_t * data) ++{ ++ if (opaque != NULL) ++ { ++ ZcBufEnv * const zbe = opaque; ++ ++ av_buffer_unref(&zbe->user); ++ ++ if (zbe->zc != NULL) ++ av_rpi_zc_env_release(zbe->zc); ++ ++ pthread_cond_destroy(&zbe->cond); ++ pthread_mutex_destroy(&zbe->lock); ++ av_free(zbe); ++ } ++} ++ ++ ++// Wrap the various ZC bits in an AV Buffer and resolve those things we want ++// resolved now. 
++// Currently we resolve everything, but in future we might not ++AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab) ++{ ++ AVBufferRef *buf; ++ ZcUserBufEnv * zub; ++ ++ if ((zub = av_malloc(sizeof(ZcUserBufEnv))) == NULL) ++ return NULL; ++ ++ zub->fn = fn_tab; ++ zub->v = v; ++ zub->numbytes = numbytes; ++ zub->offset = addr_offset; ++ ++ if ((buf = av_buffer_create((uint8_t*)zub, sizeof(*zub), zc_user_buf_free_cb, zub, 0)) == NULL) ++ { ++ av_log(NULL, AV_LOG_ERROR, "ZC: Failed av_buffer_create\n"); ++ av_free(zub); ++ return NULL; ++ } ++ ++ return buf; ++} ++ ++int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int alloc_mode) // alloc_mode is one of ZC_RESOLVE_*; returns 0 or AVERROR(EINVAL / EAGAIN / ENOMEM) ++{ ++ ZcBufEnv * const zbe = pic_zbe_ptr(buf); ++ ++ if (zbe == NULL) ++ return AVERROR(EINVAL); ++ ++ if (alloc_mode == ZC_RESOLVE_FAIL && !zbe->is_valid) ++ return AVERROR(EAGAIN); ++ ++ if (alloc_mode == ZC_RESOLVE_WAIT_VALID && !zbe->is_valid) ++ { ++ pthread_mutex_lock(&zbe->lock); ++ while (!zbe->is_valid) ++ pthread_cond_wait(&zbe->cond, &zbe->lock); ++ pthread_mutex_unlock(&zbe->lock); ++ } ++ ++ if (zbe->is_valid == ZC_BUF_NEVER) ++ return AVERROR(EINVAL); ++ ++ // Do alloc if we need it ++ if (zbe->user == NULL) ++ { ++ ZcEnv * const zc = zbe->zc; ++ const ZcUserBufEnv * zub; ++ ++ av_assert0(alloc_mode == ZC_RESOLVE_ALLOC || alloc_mode == ZC_RESOLVE_ALLOC_VALID); ++ ++ if ((zbe->user = zc->alloc_buf(zc->pool_env, zbe->size_pic, &zbe->geo)) == NULL) ++ { ++ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); ++ goto fail; ++ } ++ zub = (const ZcUserBufEnv *)zbe->user->data; ++ ++ // Track ++ ++ zbe->offset = zub->offset; ++ zbe->gmem.numbytes = zub->numbytes; ++ if ((zbe->gmem.arm = zub->fn->map_arm(zub->v)) == NULL) ++ { ++ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to lock vcsm_handle %u\n", zbe->gmem.vcsm_handle); // NOTE(review): gmem.vcsm_handle is only assigned by the next statement, so the value logged here is stale ++ goto fail; ++ } ++ ++ if ((zbe->gmem.vcsm_handle = zub->fn->vcsm_handle(zub->v)) == 0) ++ { ++ av_log(NULL, 
AV_LOG_ERROR, "ZC: Failed to get vcsm_handle\n"); ++ goto fail; ++ } ++ ++ if ((zbe->gmem.vc_handle = zub->fn->vc_handle(zub->v)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc handle from vcsm_handle %u\n", zbe->gmem.vcsm_handle); ++ goto fail; ++ } ++ if ((zbe->gmem.vc = zub->fn->map_vc(zub->v)) == 0) ++ { ++ av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc addr from vcsm_handle %u\n", zbe->gmem.vcsm_handle); ++ goto fail; ++ } ++ ++ buf->buffer->data = zbe->gmem.arm + zbe->offset; ++ buf->buffer->size = zbe->size_pic; ++ ++ // In this mode we shouldn't have anyone waiting for us ++ // so no need to signal ++ if (alloc_mode == ZC_RESOLVE_ALLOC_VALID) ++ zbe->is_valid = 1; ++ } ++ ++ // Just overwrite - no point in testing ++ buf->data = zbe->gmem.arm + zbe->offset; ++ buf->size = zbe->size_pic; ++ return 0; ++ ++fail: ++ av_buffer_unref(&zbe->user); ++ return AVERROR(ENOMEM); ++} ++ ++int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc) // may_alloc is a ZC_RESOLVE_* mode passed on to av_rpi_zc_resolve_buffer ++{ ++ int rv; ++ ++ // Do alloc if we need it ++ if ((rv = av_rpi_zc_resolve_buffer(frame->buf[0], may_alloc)) != 0) ++ return rv; ++ ++ // If we are a framebuf copy then the alloc can be done but we haven't ++ // imported its results yet ++ if (frame->data[0] == NULL) ++ { ++ const ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); ++ ++ frame->linesize[0] = zbe->geo.stride_y; ++ frame->linesize[1] = zbe->geo.stride_c; ++ frame->linesize[2] = zbe->geo.stride_c; ++ // abuse: linesize[3] = "stripe stride" ++ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). ++ // In a general case this makes the calculation an xor and multiply rather ++ // than a divide and multiply ++ if (zbe->geo.stripes > 1) ++ frame->linesize[3] = zbe->geo.stripe_is_yc ? zbe->geo.height_y + zbe->geo.height_c : zbe->geo.height_y; ++ ++ frame->data[0] = frame->buf[0]->data; ++ frame->data[1] = frame->data[0] + (zbe->geo.stripe_is_yc ? 
zbe->size_y : zbe->size_y * zbe->geo.stripes); ++ if (zbe->geo.planes_c > 1) ++ frame->data[2] = frame->data[1] + zbe->size_c; ++ ++ frame->extended_data = frame->data; ++ // Leave extended buf alone ++ } ++ ++ return 0; ++} ++ ++int av_rpi_zc_set_valid_frame(AVFrame * const frame) // marks frame->buf[0] valid and wakes waiters; AVERROR(EINVAL) if buf[0] is not a ZC buffer ++{ ++ ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); ++ ++ if (zbe == NULL) ++ return AVERROR(EINVAL); ++ ++ zbe->is_valid = ZC_BUF_VALID; // NOTE(review): written without holding zbe->lock, but av_rpi_zc_resolve_buffer() tests is_valid under that lock before pthread_cond_wait(); the broadcast below may be missed - confirm ++ pthread_cond_broadcast(&zbe->cond); ++ ++ return 0; ++} ++ ++int av_rpi_zc_set_broken_frame(AVFrame * const frame) // sets is_valid to ZC_BUF_NEVER and wakes waiters; AVERROR(EINVAL) if buf[0] is not a ZC buffer ++{ ++ ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]); ++ ++ if (zbe == NULL) ++ return AVERROR(EINVAL); ++ ++ zbe->is_valid = ZC_BUF_NEVER; // NOTE(review): same unlocked write as in av_rpi_zc_set_valid_frame() - confirm ++ pthread_cond_broadcast(&zbe->cond); ++ ++ return 0; ++} ++ ++void av_rpi_zc_set_decoder_pool_size(ZcEnv *const zc, const unsigned int pool_size) ++{ ++ zc->pool_size = pool_size; ++} ++ ++unsigned int av_rpi_zc_get_decoder_pool_size(ZcEnv *const zc) ++{ ++ return zc->pool_size; ++} ++ ++int av_rpi_zc_get_buffer(ZcEnv *const zc, AVFrame * const frame) // puts a placeholder ZcBufEnv in frame->buf[0]; memory is attached later by av_rpi_zc_resolve_buffer ++{ ++#if 1 ++ ZcBufEnv * zbe = av_mallocz(sizeof(*zbe)); ++ ++ for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; i++) { ++ frame->buf[i] = NULL; ++ frame->data[i] = NULL; ++ frame->linesize[i] = 0; ++ } ++ ++ if (zbe == NULL) ++ return AVERROR(ENOMEM); ++ ++ if ((frame->buf[0] = av_buffer_create((uint8_t *)zbe, sizeof(*zbe), zc_buf_env_free_cb, zbe, 0)) == NULL) ++ { ++ av_free(zbe); ++ return AVERROR(ENOMEM); ++ } ++ ++ pthread_mutex_init(&zbe->lock, NULL); ++ pthread_cond_init(&zbe->cond, NULL); ++ zbe->zc = zc; ++ atomic_fetch_add(&zc->refcount, 1); // dropped again by zc_buf_env_free_cb via av_rpi_zc_env_release ++ ++ zbe->geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); // Note geometry for later use ++ zbe->size_y = zbe->geo.stride_y * zbe->geo.height_y; ++ zbe->size_c = zbe->geo.stride_c * zbe->geo.height_c; ++ zbe->size_pic = (zbe->size_y + zbe->size_c * zbe->geo.planes_c) * zbe->geo.stripes; ++ ++#else ++ const AVRpiZcFrameGeometry geo = 
av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height); ++ const unsigned int size_y = geo.stride_y * geo.height_y; ++ const unsigned int size_c = geo.stride_c * geo.height_c; ++ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes; ++ AVBufferRef * buf; ++ unsigned int i; ++ ++// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic); ++ ++ if ((buf = zc->alloc_buf(zc->pool_env, size_pic, &geo)) == NULL) ++ { ++ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n"); ++ return AVERROR(ENOMEM); ++ } ++ ++ // Track ++ atomic_fetch_add(&zc->refcount, 1); ++ pic_zbe_ptr(buf)->zc = zc; ++ ++ for (i = 0; i < AV_NUM_DATA_POINTERS; i++) { ++ frame->buf[i] = NULL; ++ frame->data[i] = NULL; ++ frame->linesize[i] = 0; ++ } ++ ++ frame->buf[0] = buf; ++ ++ frame->linesize[0] = geo.stride_y; ++ frame->linesize[1] = geo.stride_c; ++ frame->linesize[2] = geo.stride_c; ++ // abuse: linesize[3] = "stripe stride" ++ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y). ++ // In a general case this makes the calculation an xor and multiply rather ++ // than a divide and multiply ++ if (geo.stripes > 1) ++ frame->linesize[3] = geo.stripe_is_yc ? geo.height_y + geo.height_c : geo.height_y; ++ ++ frame->data[0] = buf->data; ++ frame->data[1] = frame->data[0] + (geo.stripe_is_yc ? 
size_y : size_y * geo.stripes); ++ if (geo.planes_c > 1) ++ frame->data[2] = frame->data[1] + size_c; ++ ++ frame->extended_data = frame->data; ++ // Leave extended buf alone ++ ++#if RPI_ZC_SAND_8_IN_10_BUF != 0 ++ // *** If we intend to use this for real we will want a 2nd buffer pool ++ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = zc_pool_buf_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge ++#endif ++#endif ++ ++ return 0; ++} ++ ++void av_rpi_zc_env_release(const AVZcEnvPtr zc) // drops one reference; the last release calls free_pool() and frees zc ++{ ++ const int n = atomic_fetch_add(&zc->refcount, -1); ++ if (n == 1) // was 1, now 0 ++ { ++ zc->free_pool(zc->pool_env); ++ av_free(zc); ++ } ++} ++ ++AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx, // returns a new env with refcount 1, or NULL if allocation fails ++ void * pool_env, ++ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, ++ av_rpi_zc_free_pool_fn_t * free_pool_fn) ++{ ++ ZcEnv * zc; ++ ++ if ((zc = av_mallocz(sizeof(ZcEnv))) == NULL) ++ { ++ av_log(logctx, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n"); ++ return NULL; ++ } ++ ++ *zc = (ZcEnv){ ++ .refcount = ATOMIC_VAR_INIT(1), ++ .pool_env = pool_env, ++ .alloc_buf = alloc_buf_fn, ++ .free_pool = free_pool_fn, ++ .pool_size = 0 ++ }; ++ ++ return zc; ++} ++ ++//============================================================================ ++// ++// External ZC initialisation ++ ++#define RPI_GET_BUFFER2 1 ++ ++ ++static int zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags) // get_buffer2 hook: ZC buffers for DR1 codecs with YUV420P or sand frames, avcodec_default_get_buffer2 otherwise ++{ ++#if !RPI_GET_BUFFER2 ++ return avcodec_default_get_buffer2(s, frame, flags); ++#else ++ int rv; ++ ++ if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0) ++ { ++// printf("Do default alloc: format=%#x\n", frame->format); ++ rv = avcodec_default_get_buffer2(s, frame, flags); ++ } ++ else if (frame->format == AV_PIX_FMT_YUV420P || ++ av_rpi_is_sand_frame(frame)) ++ { ++ if ((rv = av_rpi_zc_get_buffer(s->opaque, frame)) == 0) ++ rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID); ++ } ++ else ++ { ++ rv = avcodec_default_get_buffer2(s, frame, flags); ++ } ++ ++#if 0 ++ 
printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__, ++ frame->format, frame->width, frame->height, ++ frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3], ++ frame->data[0], frame->data[1], frame->data[2], ++ frame->buf[0], frame->buf[1], frame->buf[2], ++ av_buffer_get_opaque(frame->buf[0])); ++#endif ++ return rv; ++#endif ++} ++ ++int av_rpi_zc_in_use(const struct AVCodecContext * const s) ++{ ++ return s->get_buffer2 == zc_get_buffer2; ++} ++ ++int av_rpi_zc_init2(struct AVCodecContext * const s, ++ void * pool_env, ++ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, ++ av_rpi_zc_free_pool_fn_t * free_pool_fn) ++{ ++ ZcEnv * zc; ++ ++ av_assert0(!av_rpi_zc_in_use(s)); ++ ++ if ((zc = av_rpi_zc_env_alloc(s, pool_env, alloc_buf_fn, free_pool_fn)) == NULL) ++ return AVERROR(ENOMEM); ++ ++ zc->old = (ZcOldCtxVals){ ++ .opaque = s->opaque, ++ .get_buffer2 = s->get_buffer2, ++ .thread_safe_callbacks = s->thread_safe_callbacks ++ }; ++ ++ s->opaque = zc; ++ s->get_buffer2 = zc_get_buffer2; ++ s->thread_safe_callbacks = 1; ++ return 0; ++} ++ ++void av_rpi_zc_uninit2(struct AVCodecContext * const s) ++{ ++ ZcEnv * const zc = s->opaque; ++ ++ av_assert0(av_rpi_zc_in_use(s)); ++ ++ s->get_buffer2 = zc->old.get_buffer2; ++ s->opaque = zc->old.opaque; ++ s->thread_safe_callbacks = zc->old.thread_safe_callbacks; ++ ++ av_rpi_zc_env_release(zc); ++} ++ +diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h +new file mode 100644 +index 0000000000..f00a7c962c +--- /dev/null ++++ b/libavcodec/rpi_zc.h +@@ -0,0 +1,228 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. 
++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox ++*/ ++ ++#ifndef LIBAVCODEC_RPI_ZC_H ++#define LIBAVCODEC_RPI_ZC_H ++ ++// Zero-Copy frame code for RPi ++// RPi needs Y/U/V planes to be contiguous for display. By default ++// ffmpeg will allocate separated planes so a memcpy is needed before ++// display. This code provides a method of making ffmpeg allocate a single ++// bit of memory for the frame which can then be reference counted until ++// display has finished with it. 
++ ++// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame ++// 0 disables ++// *** This option still in development ++// Only works if SAO active ++// Allocates buffers that are twice the required size ++#define RPI_ZC_SAND_8_IN_10_BUF 0 ++ ++struct AVBufferRef; ++struct AVFrame; ++struct AVCodecContext; ++enum AVPixelFormat; ++ ++// "Opaque" pointer to whatever we are using as a buffer reference ++typedef struct AVBufferRef * AVRpiZcRefPtr; ++ ++struct AVZcEnv; ++typedef struct AVZcEnv * AVZcEnvPtr; ++ ++typedef struct AVRpiZcFrameGeometry ++{ ++ unsigned int stride_y; // Luma stride (bytes) ++ unsigned int height_y; // Luma height (lines) ++ unsigned int stride_c; // Chroma stride (bytes) ++ unsigned int height_c; // Chroma height (lines) ++ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1) ++ unsigned int stripes; // Number of stripes (sand) ++ unsigned int bytes_per_pel; ++ int stripe_is_yc; // A single stripe is Y then C (false for tall sand) ++ ++ int format; // Requested format ++ unsigned int video_width; // Requested width ++ unsigned int video_height; // Requested height ++} AVRpiZcFrameGeometry; ++ ++// Get expected MMAL geometry for a given format, width & height ++AVRpiZcFrameGeometry av_rpi_zc_frame_geometry( ++ const int format, ++ const unsigned int video_width, const unsigned int video_height); ++ ++//---------------------------------------------------------------------------- ++// ++// Calls that extract info from a ZC frame whether internally or externally ++// allocated ++ ++// Generate a ZC reference to the buffer(s) in this frame ++// If the buffer doesn't appear to be one allocated by ZC ++// then the behaviour depends on maycopy: ++// If maycopy=0 then return NULL ++// If maycopy=1 && the src frame is in a form where we can easily copy ++// the data, then allocate a new buffer and copy the data into it ++// Otherwise return NULL ++// If maycopy == 0 then ZC may be NULL ++AVRpiZcRefPtr 
av_rpi_zc_ref(void * const logging_context, const AVZcEnvPtr zc, ++ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy); ++ ++// Unreference the buffer refed/allocated by _zc_ref ++// If fr_ref is NULL then this will NOP ++void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref); ++ ++// Get the vc_handle from the frame ref ++// Returns -1 if ref doesn't look valid ++int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref); ++// Get the vcsm_handle from the frame ref ++// Returns 0 if ref doesn't look valid ++unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref); ++// Get offset from the start of the memory referenced ++// by the vc_handle to valid data; returns 0 if ref doesn't look valid ++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref); ++// Length of buffer data; returns 0 if ref doesn't look valid ++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref); ++// Get the number of bytes allocated from the frame ref ++// Returns 0 if ref doesn't look valid ++int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref); ++// Geometry this frame was allocated with; returns NULL if ref doesn't look valid ++const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref); ++ ++//---------------------------------------------------------------------------- ++// ++// Calls for external frame allocation ++ ++// Callbacks registered in av_rpi_zc_init2 ++ ++// Callback to allocate a buf for a frame ++// The frame itself is generated in the calling code ++// ++// Parameters: ++// pool_env value passed to av_rpi_zc_init2 ++// size size wanted ++// geo geometry of the frame to be allocated ++// Returns: ++// NULL Alloc failed ++// ptr AVBufferRef* of allocated buffer ++// In most cases av_rpi_zc_buf will be called by this function ++// and this will be the buf returned by that. 
++typedef AVBufferRef * av_rpi_zc_alloc_buf_fn_t(void * pool_env, size_t size, ++ const AVRpiZcFrameGeometry * geo); ++ ++// Callback once ffmpeg is completely done with this pool ++// Called once all allocated buffers have been dereferenced and ffmpeg's ref to this ++// pool has been dropped ++typedef void av_rpi_zc_free_pool_fn_t(void * pool_env); ++ ++// Init ZC into a context ++// Sets opaque, get_buffer2, thread_safe_callbacks (previous values are restored by av_rpi_zc_uninit2); returns 0 or AVERROR(ENOMEM) ++// Use if you want to allocate your own pools and/or create ZC buffers for ++// all decoders ++// RPI HEVC decoders will allocate appropriate VCSM buffers which can be taken ++// apart by av_rpi_zc_xxx calls without this ++int av_rpi_zc_init2(struct AVCodecContext * const s, ++ void * pool_env, av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, ++ av_rpi_zc_free_pool_fn_t * free_pool_fn); ++ ++// Free ZC from a context ++void av_rpi_zc_uninit2(struct AVCodecContext * const s); ++ ++// Get minimum pool size in frames - valid by the time the first alloc request ++// occurs. Takes into account thread requests and DPB sizes derived from SPS ++// rather than just adding a worst case DPB size. ++unsigned int av_rpi_zc_get_decoder_pool_size(const AVZcEnvPtr zc); ++ ++typedef struct av_rpi_zc_buf_fn_tab_s { ++ // This AVBuffer is being freed by ffmpeg - return memory ++ // to external pool. Memory may be, but need not be, unmapped. ++ // v is the ptr passed in av_rpi_zc_buf ++ void (* free)(void * v); // may be NULL (checked before it is called) ++ ++ // Return appropriate handles / mappings ++ // v is the ptr passed in av_rpi_zc_buf ++ unsigned int (* vcsm_handle)(void * v); // a 0 return is treated as failure by av_rpi_zc_resolve_buffer ++ unsigned int (* vc_handle)(void * v); // a 0 return is treated as failure by av_rpi_zc_resolve_buffer ++ void * (* map_arm)(void * v); // a NULL return is treated as failure by av_rpi_zc_resolve_buffer ++ unsigned int (* map_vc)(void * v); // a 0 return is treated as failure by av_rpi_zc_resolve_buffer ++} av_rpi_zc_buf_fn_tab_t; ++ ++// Allocate a ZC AVBufferRef and set its callback table ++// Doesn't take a buffer address directly - relies on callbacks to return ++// addresses as they are required. 
Mappings need not be generated until ++// the map callbacks are called but they should persist from then until ++// the buffer is freed. ++// ++// Parameters: ++// numbytes Size of the buffer ++// addr_offset Offset to first usable byte of buffer (for alignment) ++// normally 0 ++// v Pointer passed to callbacks ++// fn_tab Function table ++AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab); ++ ++// Get v ptr set in av_rpi_zc_buf ++void * av_rpi_zc_buf_v(AVBufferRef * const buf); ++ ++//---------------------------------------------------------------------------- ++// ++// Mostly internal calls but might possibly be wanted by outside code ++ ++void av_rpi_zc_int_env_freep(AVZcEnvPtr * zc); ++AVZcEnvPtr av_rpi_zc_int_env_alloc(void * const logctx); ++void av_rpi_zc_set_decoder_pool_size(const AVZcEnvPtr zc, const unsigned int pool_size); ++ ++// Test to see if the context is using zc (checks get_buffer2) ++int av_rpi_zc_in_use(const struct AVCodecContext * const s); ++ ++// Get buffer generates placeholders for later alloc ++int av_rpi_zc_get_buffer(const AVZcEnvPtr zc, AVFrame * const frame); ++// Resolve actually does the alloc (noop if already alloced) ++// Set data pointers on a buffer/frame that was copied before the alloc ++// occurred ++#define ZC_RESOLVE_FAIL 0 // return error on invalid ++#define ZC_RESOLVE_ALLOC 1 // alloc as invalid ++#define ZC_RESOLVE_WAIT_VALID 2 // wait for valid ++#define ZC_RESOLVE_ALLOC_VALID 3 // alloc as valid ++int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int may_alloc); ++int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc); ++ ++int av_rpi_zc_set_valid_frame(AVFrame * const frame); ++int av_rpi_zc_set_broken_frame(AVFrame * const frame); ++ ++ ++ ++ ++AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx, ++ void * pool_env, ++ av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn, ++ av_rpi_zc_free_pool_fn_t * free_pool_fn); ++void 
av_rpi_zc_env_release(const AVZcEnvPtr zc); ++ ++ ++#endif ++ +diff --git a/libavcodec/rpi_zc_frames.h b/libavcodec/rpi_zc_frames.h +new file mode 100644 +index 0000000000..9b7b6536a4 +--- /dev/null ++++ b/libavcodec/rpi_zc_frames.h +@@ -0,0 +1,142 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox, Ben Avison ++*/ ++ ++#ifndef RPI_ZC_FRAMES_H ++#define RPI_ZC_FRAMES_H ++ ++#define RPI_ONE_BUF 1 ++ ++#include "rpi_mem.h" // for GPU_MEM_PTR_T ++#include "libavutil/frame.h" ++ ++#if !RPI_ONE_BUF ++static inline uint32_t get_vc_address_y(const AVFrame * const frame) { ++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[0]); ++ return p->vc; ++} ++ ++static inline uint32_t get_vc_address_u(const AVFrame * const frame) { ++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[1]); ++ return p->vc; ++} ++ ++static inline uint32_t get_vc_address_v(const AVFrame * const frame) { ++ GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[2]); ++ return p->vc; ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { ++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[0]); ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) { ++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[1]); ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { ++ return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[2]); ++} ++ ++#else ++ ++static inline int gpu_is_buf1(const AVFrame * const frame) ++{ ++ return frame->buf[1] == NULL; ++} ++ ++static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame) ++{ ++ return av_buffer_get_opaque(frame->buf[0]); ++} ++ ++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n) ++{ ++ return av_buffer_pool_buffer_get_opaque(frame->buf[n]); ++} ++ ++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n) ++{ ++ const GPU_MEM_PTR_T * const gm = gpu_is_buf1(frame) ? 
gpu_buf1_gmem(frame) : gpu_buf3_gmem(frame, n); ++ return gm->vc + (frame->data[n] - gm->arm); ++} ++ ++ ++static inline uint32_t get_vc_address_y(const AVFrame * const frame) { ++ return get_vc_address3(frame, 0); ++} ++ ++static inline uint32_t get_vc_address_u(const AVFrame * const frame) { ++ return get_vc_address3(frame, 1); ++} ++ ++static inline uint32_t get_vc_address_v(const AVFrame * const frame) { ++ return get_vc_address3(frame, 2); ++} ++ ++#if 0 ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) { ++ if (gpu_is_buf1(frame)) ++ { ++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); ++ g.numbytes = frame->data[1] - frame->data[0]; ++ return g; ++ } ++ else ++ return *gpu_buf3_gmem(frame, 0); ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) { ++ if (gpu_is_buf1(frame)) ++ { ++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); ++ g.arm += frame->data[1] - frame->data[0]; ++ g.vc += frame->data[1] - frame->data[0]; ++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size ++ return g; ++ } ++ else ++ return *gpu_buf3_gmem(frame, 1); ++} ++ ++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) { ++ if (gpu_is_buf1(frame)) ++ { ++ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame); ++ g.arm += frame->data[2] - frame->data[0]; ++ g.vc += frame->data[2] - frame->data[0]; ++ g.numbytes = frame->data[2] - frame->data[1]; // chroma size ++ return g; ++ } ++ else ++ return *gpu_buf3_gmem(frame, 2); ++} ++#endif ++#endif ++ ++#endif +diff --git a/libavcodec/rpivid_hevc.c b/libavcodec/rpivid_hevc.c +new file mode 100644 +index 0000000000..a6b5e8a189 +--- /dev/null ++++ b/libavcodec/rpivid_hevc.c +@@ -0,0 +1,2033 @@ ++// FFMPEG HEVC decoder hardware accelerator ++// Andrew Holme, Argon Design Ltd ++// Copyright (c) June 2017 Raspberry Pi Ltd ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "fftools/ffmpeg.h" ++#include "libavutil/avassert.h" ++#include 
"libavutil/imgutils.h" ++#include "avcodec.h" ++#include "hwconfig.h" ++#include "decode.h" ++ ++#include "hevc.h" ++#include "hevcdec.h" ++#include "rpi_zc.h" ++#include "rpi_mem.h" ++#include "rpi_zc_frames.h" ++#include "rpi_mailbox.h" ++ ++ ++#define OPT_PHASE_TIMING 0 // Generate stats for phase usage ++ ++#define NUM_SCALING_FACTORS 4064 ++ ++#define AXI_BASE64 0 ++ ++#define PROB_BACKUP ((20<<12) + (20<<6) + (0<<0)) ++#define PROB_RELOAD ((20<<12) + (20<<0) + (0<<6)) ++ ++#define RPIVID_COL_PICS 17 // 16 ref & current ++ ++#define RPIVID_BITBUFS 2 // Bit + Cmd bufs (phase 0 & 1) ++#define RPIVID_BITBUF_SIZE (4 << 20) // Bit + Cmd buf size ++ ++#define RPIVID_COEFFBUFS 3 // PU + Coeff bufs (phase 1 & 2) ++#define RPIVID_COEFFBUF_SIZE (16 << 20) // PU + Coeff buf size ++ ++////////////////////////////////////////////////////////////////////////////// ++// ++// Register offsets ++ ++#define RPI_SPS0 0 ++#define RPI_SPS1 4 ++#define RPI_PPS 8 ++#define RPI_SLICE 12 ++#define RPI_TILESTART 16 ++#define RPI_TILEEND 20 ++#define RPI_SLICESTART 24 ++#define RPI_MODE 28 ++#define RPI_LEFT0 32 ++#define RPI_LEFT1 36 ++#define RPI_LEFT2 40 ++#define RPI_LEFT3 44 ++#define RPI_QP 48 ++#define RPI_CONTROL 52 ++#define RPI_STATUS 56 ++#define RPI_VERSION 60 ++#define RPI_BFBASE 64 ++#define RPI_BFNUM 68 ++#define RPI_BFCONTROL 72 ++#define RPI_BFSTATUS 76 ++#define RPI_PUWBASE 80 ++#define RPI_PUWSTRIDE 84 ++#define RPI_COEFFWBASE 88 ++#define RPI_COEFFWSTRIDE 92 ++#define RPI_SLICECMDS 96 ++#define RPI_BEGINTILEEND 100 ++#define RPI_TRANSFER 104 ++#define RPI_CFBASE 108 ++#define RPI_CFNUM 112 ++#define RPI_CFSTATUS 116 ++ ++#define RPI_PURBASE 0x8000 ++#define RPI_PURSTRIDE 0x8004 ++#define RPI_COEFFRBASE 0x8008 ++#define RPI_COEFFRSTRIDE 0x800C ++#define RPI_NUMROWS 0x8010 ++#define RPI_CONFIG2 0x8014 ++#define RPI_OUTYBASE 0x8018 ++#define RPI_OUTYSTRIDE 0x801C ++#define RPI_OUTCBASE 0x8020 ++#define RPI_OUTCSTRIDE 0x8024 ++#define RPI_STATUS2 0x8028 ++#define 
RPI_FRAMESIZE 0x802C ++#define RPI_MVBASE 0x8030 ++#define RPI_MVSTRIDE 0x8034 ++#define RPI_COLBASE 0x8038 ++#define RPI_COLSTRIDE 0x803C ++#define RPI_CURRPOC 0x8040 ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++// Unused but left here to illustrate the diffrences between FFmpegs prob ++// structure and the rpivid one ++ ++struct FFM_PROB { ++ uint8_t sao_merge_flag [ 1]; ++ uint8_t sao_type_idx [ 1]; ++ uint8_t split_coding_unit_flag [ 3]; ++ uint8_t cu_transquant_bypass_flag [ 1]; ++ uint8_t skip_flag [ 3]; ++ uint8_t cu_qp_delta [ 3]; ++ uint8_t pred_mode_flag [ 1]; ++ uint8_t part_mode [ 4]; ++ uint8_t prev_intra_luma_pred_flag [ 1]; ++ uint8_t intra_chroma_pred_mode [ 2]; ++ uint8_t merge_flag [ 1]; ++ uint8_t merge_idx [ 1]; ++ uint8_t inter_pred_idc [ 5]; ++ uint8_t ref_idx_l0 [ 2]; ++ uint8_t ref_idx_l1 [ 2]; ++ uint8_t abs_mvd_greater0_flag [ 2]; ++ uint8_t abs_mvd_greater1_flag [ 2]; ++ uint8_t mvp_lx_flag [ 1]; ++ uint8_t no_residual_data_flag [ 1]; ++ uint8_t split_transform_flag [ 3]; ++ uint8_t cbf_luma [ 2]; ++ uint8_t cbf_cb_cr [ 4]; ++ uint8_t transform_skip_flag/*[][]*/ [ 2]; ++ uint8_t explicit_rdpcm_flag/*[][]*/ [ 2]; ++ uint8_t explicit_rdpcm_dir_flag/*[][]*/ [ 2]; ++ uint8_t last_significant_coeff_x_prefix [18]; ++ uint8_t last_significant_coeff_y_prefix [18]; ++ uint8_t significant_coeff_group_flag [ 4]; ++ uint8_t significant_coeff_flag [44]; ++ uint8_t coeff_abs_level_greater1_flag [24]; ++ uint8_t coeff_abs_level_greater2_flag [ 6]; ++ uint8_t log2_res_scale_abs [ 8]; ++ uint8_t res_scale_sign_flag [ 2]; ++ uint8_t cu_chroma_qp_offset_flag [ 1]; ++ uint8_t cu_chroma_qp_offset_idx [ 1]; ++} __attribute__((packed)); ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++struct RPI_PROB { ++ uint8_t SAO_MERGE_FLAG [ 1]; ++ uint8_t SAO_TYPE_IDX [ 1]; ++ uint8_t SPLIT_FLAG [ 3]; ++ uint8_t CU_SKIP_FLAG [ 3]; ++ uint8_t CU_TRANSQUANT_BYPASS_FLAG [ 1]; ++ uint8_t 
PRED_MODE [ 1]; ++ uint8_t PART_SIZE [ 4]; ++ uint8_t INTRA_PRED_MODE [ 1]; ++ uint8_t CHROMA_PRED_MODE [ 1]; ++ uint8_t MERGE_FLAG_EXT [ 1]; ++ uint8_t MERGE_IDX_EXT [ 1]; ++ uint8_t INTER_DIR [ 5]; ++ uint8_t REF_PIC [ 2]; ++ uint8_t MVP_IDX [ 1]; ++ uint8_t MVD [ 2]; ++ uint8_t QT_ROOT_CBF [ 1]; ++ uint8_t TRANS_SUBDIV_FLAG [ 3]; ++ uint8_t QT_CBF [ 6]; ++ uint8_t DQP [ 2]; ++ uint8_t ONE_FLAG [24]; ++ uint8_t LASTX [18]; ++ uint8_t LASTY [18]; ++ uint8_t SIG_CG_FLAG [ 4]; ++ uint8_t ABS_FLAG [ 6]; ++ uint8_t TRANSFORMSKIP_FLAG [ 2]; ++ uint8_t SIG_FLAG [42]; ++ uint8_t SIG_FLAG_unused [ 2]; ++} __attribute__((packed)); ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++struct RPI_CMD { ++ uint32_t addr; ++ uint32_t data; ++} __attribute__((packed)); ++ ++struct RPI_BIT { ++ int cmd; ++ const void *ptr; ++ int len; ++}; ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++struct RPI_T; ++ ++// Actual addressability is 38bits but we can only alloc in the bottom 32 ++// currently - when passed to rpivid h/w the address is always >> 6 so will ++// fit in 32 bit there ++// At some point we may weant to make this uint64_t ++typedef uint32_t vid_vc_addr_t; ++ ++typedef enum rpivid_decode_state_e { ++ RPIVID_DECODE_NEW = 0, ++ RPIVID_DECODE_START, ++ RPIVID_DECODE_SLICE, ++ RPIVID_DECODE_END, ++} rpivid_decode_state_t; ++ ++#define RPI_PROB_VALS 154U ++#define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3) ++ ++typedef struct dec_env_s { ++ const AVCodecContext * avctx; ++ ++ rpivid_decode_state_t state; ++ unsigned int decode_order; ++ ++ int phase_no; // Current phase (i.e. 
the last one we waited for) ++ struct dec_env_s * phase_wait_q_next; ++ sem_t phase_wait; ++ ++ struct RPI_BIT *bit_fifo; ++ struct RPI_CMD *cmd_fifo; ++ unsigned int bit_len, bit_max; ++ unsigned int cmd_len, cmd_max; ++ unsigned int num_slice_msgs; ++ unsigned int PicWidthInCtbsY; ++ unsigned int PicHeightInCtbsY; ++ unsigned int dpbno_col; ++ uint32_t reg_slicestart; ++ unsigned int wpp_entry_x; ++ unsigned int wpp_entry_y; ++ uint16_t slice_msgs[2*HEVC_MAX_REFS*8+3]; ++ uint8_t scaling_factors[NUM_SCALING_FACTORS]; ++// unsigned int RefPicList[2][HEVC_MAX_REFS]; ++} dec_env_t; ++ ++#define RPIVID_PHASES 3 ++#define RPIVID_PHASE_NEW (RPIVID_PHASES) // Phase before we have inced decode order ++#define RPIVID_PHASE_START (-1) // Phase after we have inced decode_order ++ ++#if OPT_PHASE_TIMING ++static const unsigned int time_thresholds[8] = { ++ 10, 15, 20, 30, 45, 60, 75, 90 ++}; ++#endif ++ ++typedef struct phase_wait_env_s { ++ unsigned int last_order; ++ dec_env_t * q; ++#if OPT_PHASE_TIMING ++ uint64_t phase_time; ++ uint64_t max_phase_time; ++ uint64_t time_in_phase; ++ uint64_t time_out_phase; ++ unsigned int max_time_decode_order; ++ unsigned int time_bins[9]; ++ unsigned int time_bins3[9]; ++ unsigned int time_bins5[9]; ++ uint64_t time_stash[16]; ++ unsigned int i3; ++#endif ++} phase_wait_env_t; // Single linked list of threads waiting for this phase ++ ++typedef struct RPI_T { ++ atomic_int ref_count; ++ sem_t ref_zero; ++ ++ dec_env_t ** dec_envs; ++ AVZcEnvPtr zc; ++ ++ pthread_mutex_t phase_lock; ++ phase_wait_env_t phase_reqs[RPIVID_PHASES]; ++ ++ volatile uint32_t * regs; ++ volatile uint32_t * ints; ++ ++ GPU_MEM_PTR_T gcolbuf; ++ unsigned int col_stride; ++ size_t col_picsize; ++ ++ unsigned int bitbuf_no; ++ sem_t bitbuf_sem; ++ GPU_MEM_PTR_T gbitbufs[RPIVID_BITBUFS]; ++ ++ unsigned int max_pu_msgs; ++ unsigned int coeffbuf_no; ++ sem_t coeffbuf_sem; ++ GPU_MEM_PTR_T gcoeffbufs[RPIVID_COEFFBUFS]; ++ ++ unsigned int decode_order; ++ int mbox_fd; 
++ int gpu_init_type; ++} RPI_T; ++ ++#if OPT_PHASE_TIMING ++static uint64_t tus64(void) ++{ ++ struct timespec ts; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return (uint64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000; ++} ++#endif ++ ++static inline unsigned int rnd64(unsigned int x) ++{ ++ return (x + 63) & ~63; ++} ++ ++static inline int rpi_sem_wait(sem_t * const sem) ++{ ++ int rv; ++ while ((rv = sem_wait(sem)) != 0 && errno == EINTR) ++ /* Loop */; ++ return rv; ++} ++ ++//============================================================================ ++ ++#define TRACE_DEV 0 ++#define TRACE_ENTRY 0 ++ ++#define REGS_NAME "/dev/rpivid-hevcmem" ++#define REGS_SIZE 0x10000 ++#define INTS_NAME "/dev/rpivid-intcmem" ++#define INTS_SIZE 0x10000 // 4 is probably enough but we are going to alloc a page anyway ++ ++static volatile uint32_t * map_dev(AVCodecContext * const avctx, const char * const dev_name, size_t size) ++{ ++ void *gpio_map; ++ int mem_fd; ++ ++ /* open /dev/mem */ ++ if ((mem_fd = open(dev_name, O_RDWR|O_SYNC) ) < 0) { ++ av_log(avctx, AV_LOG_WARNING, "can't open %s\n", dev_name); ++ return NULL; ++ } ++ ++ // Now map it ++ gpio_map = mmap( ++ NULL, ++ size, ++ PROT_READ|PROT_WRITE, ++ MAP_SHARED, ++ mem_fd, ++ 0 ++ ); ++ ++ close(mem_fd); // No longer need the FD ++ ++ if (gpio_map == MAP_FAILED) { ++ av_log(avctx, AV_LOG_WARNING, "GPIO mapping failed"); ++ return NULL; ++ } ++ ++ return (volatile uint32_t *)gpio_map; ++} ++ ++static void unmap_devp(volatile uint32_t ** const p_gpio_map, size_t size) ++{ ++ volatile uint32_t * const gpio_map = *p_gpio_map; ++ if (gpio_map != NULL) { ++ *p_gpio_map = NULL; ++ munmap((void *)gpio_map, size); ++ } ++} ++ ++#define MANGLE(x) ((x) &~0xc0000000) // ** If x is ever a 64 bit thing this will need fixing! 
++#define MANGLE64(x) (uint32_t)(MANGLE(x)>>6) ++ ++static inline void apb_write_vc_addr(const RPI_T *const rpi, const uint32_t addr, const vid_vc_addr_t data) ++{ ++ rpi->regs[addr >> 2] = MANGLE64(data); ++} ++ ++static inline void apb_write_vc_len(const RPI_T *const rpi, const uint32_t addr, const unsigned int data) ++{ ++ rpi->regs[addr >> 2] = data >> 6; // ?? rnd64 - but not currently needed ++} ++ ++static inline void apb_write(const RPI_T * const rpi, const uint32_t addr, const uint32_t data) ++{ ++#if TRACE_DEV ++ printf("W %x %08x\n", addr, data); ++#endif ++ ++ rpi->regs[addr >> 2] = data; ++} ++ ++static inline uint32_t apb_read(const RPI_T * const rpi, const uint32_t addr) ++{ ++ const uint32_t v = rpi->regs[addr >> 2]; ++#if TRACE_DEV ++ printf("R %x (=%x)\n", addr, v); ++#endif ++ return v; ++} ++ ++#define ARG_IC_ICTRL_ACTIVE1_INT_SET 0x00000001 ++#define ARG_IC_ICTRL_ACTIVE1_EDGE_SET 0x00000002 ++#define ARG_IC_ICTRL_ACTIVE1_EN_SET 0x00000004 ++#define ARG_IC_ICTRL_ACTIVE1_STATUS_SET 0x00000008 ++#define ARG_IC_ICTRL_ACTIVE2_INT_SET 0x00000010 ++#define ARG_IC_ICTRL_ACTIVE2_EDGE_SET 0x00000020 ++#define ARG_IC_ICTRL_ACTIVE2_EN_SET 0x00000040 ++#define ARG_IC_ICTRL_ACTIVE2_STATUS_SET 0x00000080 ++ ++static inline void int_wait(const RPI_T * const rpi, const unsigned int phase) ++{ ++ const uint32_t mask_reset = phase == 1 ? ~ARG_IC_ICTRL_ACTIVE2_INT_SET : ~ARG_IC_ICTRL_ACTIVE1_INT_SET; ++ const uint32_t mask_done = phase == 1 ? 
ARG_IC_ICTRL_ACTIVE1_INT_SET : ARG_IC_ICTRL_ACTIVE2_INT_SET; ++ uint32_t ival; ++ while (((ival = rpi->ints[0]) & mask_done) == 0) { ++ usleep(1000); ++ } ++ rpi->ints[0] = ival & mask_reset; ++} ++ ++#if TRACE_DEV ++static void apb_dump_regs(const RPI_T * const rpi, uint16_t addr, int num) { ++ int i; ++ ++ for (i=0; iregs[(addr>>2)+i]); ++ ++ if ((i%4)==3 || i+1 == num) ++ printf("\n"); ++ else ++ printf(" "); ++ } ++} ++ ++static void axi_dump(const dec_env_t * const de, uint64_t addr, uint32_t size) { ++ int i; ++ ++ for (i=0; i>2; i++) ++ { ++ if ((i%4)==0) ++ printf("%08x: ", MANGLE(de->gbuf.vc) + (uint32_t)addr + 4*i); ++ ++ printf("%08x", ((uint32_t*)de->gbuf.arm)[(addr>>2)+i]); ++ ++ if ((i%4)==3 || i+1 == size>>2) ++ printf("\n"); ++ else ++ printf(" "); ++ } ++} ++#endif ++ ++ ++////////////////////////////////////////////////////////////////////////////// ++// Scaling factors ++ ++static void expand_scaling_list( ++ const unsigned int sizeID, ++ const unsigned int matrixID, ++ uint8_t * const dst0, ++ const uint8_t * const src0, ++ uint8_t dc) ++{ ++ switch (sizeID) { ++ case 0: ++ memcpy(dst0, src0, 16); ++ break; ++ case 1: ++ memcpy(dst0, src0, 64); ++ break; ++ case 2: ++ { ++ uint8_t * d = dst0; ++ for (unsigned int y=0; y != 16; y++) { ++ const uint8_t * s = src0 + (y >> 1) * 8; ++ for (unsigned int x = 0; x != 8; ++x) { ++ *d++ = *s; ++ *d++ = *s++; ++ } ++ } ++ dst0[0] = dc; ++ break; ++ } ++ default: ++ { ++ uint8_t * d = dst0; ++ for (unsigned int y=0; y != 32; y++) { ++ const uint8_t * s = src0 + (y >> 2) * 8; ++ for (unsigned int x = 0; x != 8; ++x) { ++ *d++ = *s; ++ *d++ = *s; ++ *d++ = *s; ++ *d++ = *s++; ++ } ++ } ++ dst0[0] = dc; ++ break; ++ } ++ } ++} ++ ++static void populate_scaling_factors(dec_env_t * const de, const HEVCContext * const s) { ++ // Array of constants for scaling factors ++ static const uint32_t scaling_factor_offsets[4][6] = { ++ // MID0 MID1 MID2 MID3 MID4 MID5 ++ {0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050}, 
// SID0 (4x4) ++ {0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0}, // SID1 (8x8) ++ {0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0}, // SID2 (16x16) ++ {0x07E0, 0, 0, 0x0BE0, 0, 0}}; // SID3 (32x32) ++ ++ // ffmpeg places SID3,MID1 where matrixID 3 normally is ++ const ScalingList * const sl = ++ s->ps.pps->scaling_list_data_present_flag ? &s->ps.pps->scaling_list ++ : &s->ps.sps->scaling_list; ++ unsigned int mid; ++ ++ for (mid=0; mid<6; mid++) ++ expand_scaling_list(0, mid, ++ de->scaling_factors + scaling_factor_offsets[0][mid], ++ sl->sl[0][mid], 0); ++ for (mid=0; mid<6; mid++) ++ expand_scaling_list(1, mid, ++ de->scaling_factors + scaling_factor_offsets[1][mid], ++ sl->sl[1][mid], 0); ++ for (mid=0; mid<6; mid++) ++ expand_scaling_list(2, mid, ++ de->scaling_factors + scaling_factor_offsets[2][mid], ++ sl->sl[2][mid], ++ sl->sl_dc[0][mid]); ++ // second scaling matrix for 32x32 is at matrixID 3 not 1 in ffmpeg ++ for (mid=0; mid<6; mid += 3) ++ expand_scaling_list(3, mid, ++ de->scaling_factors + scaling_factor_offsets[3][mid], ++ sl->sl[3][mid], ++ sl->sl_dc[1][mid]); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Probabilities ++ ++static const uint8_t prob_init[3][156] = { ++ { ++ 153, 200, 139, 141, 157, 154, 154, 154, ++ 154, 154, 184, 154, 154, 154, 184, 63, ++ 154, 154, 154, 154, 154, 154, 154, 154, ++ 154, 154, 154, 154, 154, 153, 138, 138, ++ 111, 141, 94, 138, 182, 154, 154, 154, ++ 140, 92, 137, 138, 140, 152, 138, 139, ++ 153, 74, 149, 92, 139, 107, 122, 152, ++ 140, 179, 166, 182, 140, 227, 122, 197, ++ 110, 110, 124, 125, 140, 153, 125, 127, ++ 140, 109, 111, 143, 127, 111, 79, 108, ++ 123, 63, 110, 110, 124, 125, 140, 153, ++ 125, 127, 140, 109, 111, 143, 127, 111, ++ 79, 108, 123, 63, 91, 171, 134, 141, ++ 138, 153, 136, 167, 152, 152, 139, 139, ++ 111, 111, 125, 110, 110, 94, 124, 108, ++ 124, 107, 125, 141, 179, 153, 125, 107, ++ 125, 141, 179, 153, 125, 107, 125, 141, ++ 179, 153, 125, 140, 
139, 182, 182, 152, ++ 136, 152, 136, 153, 136, 139, 111, 136, ++ 139, 111, 0, 0, }, ++ { ++ 153, 185, 107, 139, 126, 197, 185, 201, ++ 154, 149, 154, 139, 154, 154, 154, 152, ++ 110, 122, 95, 79, 63, 31, 31, 153, ++ 153, 168, 140, 198, 79, 124, 138, 94, ++ 153, 111, 149, 107, 167, 154, 154, 154, ++ 154, 196, 196, 167, 154, 152, 167, 182, ++ 182, 134, 149, 136, 153, 121, 136, 137, ++ 169, 194, 166, 167, 154, 167, 137, 182, ++ 125, 110, 94, 110, 95, 79, 125, 111, ++ 110, 78, 110, 111, 111, 95, 94, 108, ++ 123, 108, 125, 110, 94, 110, 95, 79, ++ 125, 111, 110, 78, 110, 111, 111, 95, ++ 94, 108, 123, 108, 121, 140, 61, 154, ++ 107, 167, 91, 122, 107, 167, 139, 139, ++ 155, 154, 139, 153, 139, 123, 123, 63, ++ 153, 166, 183, 140, 136, 153, 154, 166, ++ 183, 140, 136, 153, 154, 166, 183, 140, ++ 136, 153, 154, 170, 153, 123, 123, 107, ++ 121, 107, 121, 167, 151, 183, 140, 151, ++ 183, 140, 0, 0, }, ++ { ++ 153, 160, 107, 139, 126, 197, 185, 201, ++ 154, 134, 154, 139, 154, 154, 183, 152, ++ 154, 137, 95, 79, 63, 31, 31, 153, ++ 153, 168, 169, 198, 79, 224, 167, 122, ++ 153, 111, 149, 92, 167, 154, 154, 154, ++ 154, 196, 167, 167, 154, 152, 167, 182, ++ 182, 134, 149, 136, 153, 121, 136, 122, ++ 169, 208, 166, 167, 154, 152, 167, 182, ++ 125, 110, 124, 110, 95, 94, 125, 111, ++ 111, 79, 125, 126, 111, 111, 79, 108, ++ 123, 93, 125, 110, 124, 110, 95, 94, ++ 125, 111, 111, 79, 125, 126, 111, 111, ++ 79, 108, 123, 93, 121, 140, 61, 154, ++ 107, 167, 91, 107, 107, 167, 139, 139, ++ 170, 154, 139, 153, 139, 123, 123, 63, ++ 124, 166, 183, 140, 136, 153, 154, 166, ++ 183, 140, 136, 153, 154, 166, 183, 140, ++ 136, 153, 154, 170, 153, 138, 138, 122, ++ 121, 122, 121, 167, 151, 183, 140, 151, ++ 183, 140, 0, 0, }, ++}; ++ ++ ++////////////////////////////////////////////////////////////////////////////// ++// Phase 1 command and bit FIFOs ++ ++// ???? 
uint16_t addr - put in uint32_t ++static int p1_apb_write(dec_env_t * const de, const uint16_t addr, const uint32_t data) { ++ if (de->cmd_len==de->cmd_max) ++ av_assert0(de->cmd_fifo = realloc(de->cmd_fifo, (de->cmd_max*=2)*sizeof(struct RPI_CMD))); ++ de->cmd_fifo[de->cmd_len].addr = addr; ++ de->cmd_fifo[de->cmd_len].data = data; ++ return de->cmd_len++; ++} ++ ++static void p1_axi_write(dec_env_t * const de, const uint32_t len, const void * const ptr, const int cmd_idx) { ++ if (de->bit_len==de->bit_max) ++ av_assert0(de->bit_fifo = realloc(de->bit_fifo, (de->bit_max*=2)*sizeof(struct RPI_BIT))); ++ de->bit_fifo[de->bit_len].cmd = cmd_idx; ++ de->bit_fifo[de->bit_len].ptr = ptr; ++ de->bit_fifo[de->bit_len].len = len; ++ de->bit_len++; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Write probability and scaling factor memories ++ ++#if 0 ++static void WriteProb(dec_env_t * const de) { ++ int i; ++ const uint8_t *p = (uint8_t *) &de->probabilities; ++ for (i=0; ish.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ? 
++ s->sh.slice_type + 1 : 2 - s->sh.slice_type; ++ const uint8_t * p = prob_init[init_type]; ++ const int q = av_clip(s->sh.slice_qp, 0, 51); ++ unsigned int i; ++ ++ for (i = 0; i < RPI_PROB_VALS; i++) { ++ int init_value = p[i]; ++ int m = (init_value >> 4) * 5 - 45; ++ int n = ((init_value & 15) << 3) - 16; ++ int pre = 2 * (((m * q) >> 4) + n) - 127; ++ ++ pre ^= pre >> 31; ++ if (pre > 124) ++ pre = 124 + (pre & 1); ++ dst[i] = pre; ++ } ++ for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i) { ++ dst[i] = 0; ++ } ++ ++ for (i=0; i < RPI_PROB_ARRAY_SIZE; i+=4) ++ p1_apb_write(de, 0x1000+i, dst[i] + (dst[i+1]<<8) + (dst[i+2]<<16) + (dst[i+3]<<24)); ++ ++} ++ ++ ++static void WriteScalingFactors(dec_env_t * const de) { ++ int i; ++ const uint8_t *p = (uint8_t *) de->scaling_factors; ++ for (i=0; i= bd[i]; i++); // bd[] has num+1 elements; bd[0]=0; see hevc_ps.c ++ return i-1; ++} ++ ++static int ctb_to_slice_w_h (unsigned int ctb, int ctb_size, int width, unsigned int *bd, int num) { ++ if (ctb < bd[num-1]) return ctb_size; ++ else if (width % ctb_size) return width % ctb_size; ++ else return ctb_size; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Handle PU and COEFF stream overflow ++ ++ ++// Returns: ++// -2 Other error ++// -1 Out of coeff space ++// 0 OK ++// 1 Out of PU space ++ ++static int check_status(const RPI_T * const rpi, dec_env_t * const de) { ++ uint32_t status; ++ ++ // this is the definition of successful completion of phase 1 ++ // it assures that status register is zero and all blocks in each tile have completed ++ if (apb_read(rpi, RPI_CFSTATUS) == apb_read(rpi, RPI_CFNUM)) ++ return 0; ++ ++ status = apb_read(rpi, RPI_STATUS); ++ ++ if ((status & 8) != 0) ++ return -1; ++ ++ if ((status & 0x10) != 0) ++ return 1; ++ ++ return -2; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Write STATUS register with expected end CTU address of previous slice ++ 
++static void end_previous_slice(dec_env_t * const de, const HEVCContext * const s, const int ctb_addr_ts) { ++ const HEVCPPS * const pps = s->ps.pps; ++ int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY; ++ int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY; ++ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18)); ++} ++ ++static void wpp_pause(dec_env_t * const de, int ctb_row) { ++ p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + 0x25); ++ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); ++ p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1 ? 0x70000 : 0x30000); ++ p1_apb_write(de, RPI_CONTROL, (ctb_row<<16) + 2); ++} ++ ++static void wpp_end_previous_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) { ++ const HEVCPPS *pps = s->ps.pps; ++ int new_x = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY; ++ int new_y = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY; ++ int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY; ++ int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY; ++ if (de->wpp_entry_x<2 && (de->wpp_entry_y2) && de->PicWidthInCtbsY>2) ++ wpp_pause(de, last_y); ++ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18)); ++ if (new_x==2 || de->PicWidthInCtbsY==2 && de->wpp_entry_yps.sps; ++ const HEVCPPS *pps = s->ps.pps; ++ ++ p1_apb_write(de, RPI_SPS0, ++ (sps->log2_min_cb_size << 0) + ++ (sps->log2_ctb_size << 4) + ++ (sps->log2_min_tb_size << 8) + ++ (sps->log2_max_trafo_size << 12) + ++ (sps->bit_depth << 16) + ++ (sps->bit_depth << 20) + ++ (sps->max_transform_hierarchy_depth_intra << 24) + ++ (sps->max_transform_hierarchy_depth_inter << 28)); ++ ++ p1_apb_write(de, RPI_SPS1, ++ (sps->pcm.bit_depth << 0) + ++ (sps->pcm.bit_depth_chroma << 4) + ++ (sps->pcm.log2_min_pcm_cb_size << 8) + ++ (sps->pcm.log2_max_pcm_cb_size << 12) + ++ (sps->separate_colour_plane_flag? 
0:sps->chroma_format_idc << 16) + ++ (sps->amp_enabled_flag << 18) + ++ (sps->pcm_enabled_flag << 19) + ++ (sps->scaling_list_enable_flag << 20) + ++ (sps->sps_strong_intra_smoothing_enable_flag << 21)); ++ ++ p1_apb_write(de, RPI_PPS, ++ (sps->log2_ctb_size - pps->diff_cu_qp_delta_depth << 0) + ++ (pps->cu_qp_delta_enabled_flag << 4) + ++ (pps->transquant_bypass_enable_flag << 5) + ++ (pps->transform_skip_enabled_flag << 6) + ++ (pps->sign_data_hiding_flag << 7) + ++ (((pps->cb_qp_offset + s->sh.slice_cb_qp_offset)&255) << 8) + ++ (((pps->cr_qp_offset + s->sh.slice_cr_qp_offset)&255) << 16) + ++ (pps->constrained_intra_pred_flag << 24)); ++ ++ if (s->ps.sps->scaling_list_enable_flag) WriteScalingFactors(de); ++ ++ if (!s->sh.dependent_slice_segment_flag) { ++ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY; ++ int ctb_row = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY; ++ de->reg_slicestart = (ctb_col<<0) + (ctb_row<<16); ++ } ++ ++ p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static void write_slice(dec_env_t * const de, const HEVCContext * const s, ++ const unsigned int slice_w, const unsigned int slice_h) { ++ uint32_t u32 = ++ (s->sh.slice_type << 12) ++ + (s->sh.slice_sample_adaptive_offset_flag[0] << 14) ++ + (s->sh.slice_sample_adaptive_offset_flag[1] << 15) ++ + (slice_w << 17) ++ + (slice_h << 24); ++ ++ if (s->sh.slice_type==HEVC_SLICE_B || s->sh.slice_type==HEVC_SLICE_P) u32 |= ++ (s->sh.max_num_merge_cand << 0) ++ + (s->sh.nb_refs[L0] << 4) ++ + (s->sh.nb_refs[L1] << 8); ++ ++ if (s->sh.slice_type==HEVC_SLICE_B) ++ u32 |= s->sh.mvd_l1_zero_flag<<16; ++ p1_apb_write(de, RPI_SLICE, u32); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Wavefront mode ++ ++static void wpp_entry_point(dec_env_t * const de, const HEVCContext * const s, ++ const int do_bte, const int resetQPY, const int ctb_addr_ts) { 
++ const HEVCSPS * const sps = s->ps.sps; ++ const HEVCPPS * const pps = s->ps.pps; ++ ++ int ctb_size = 1<log2_ctb_size; ++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ ++ int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->PicWidthInCtbsY; ++ int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->PicWidthInCtbsY; ++ ++ int endx = de->PicWidthInCtbsY-1; ++ int endy = ctb_row; ++ ++ uint8_t slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, sps->width, pps->col_bd, pps->num_tile_columns); ++ uint8_t slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, sps->height, pps->row_bd, pps->num_tile_rows); ++ ++ p1_apb_write(de, RPI_TILESTART, 0); ++ p1_apb_write(de, RPI_TILEEND, endx + (endy<<16)); ++ ++ if (do_bte) ++ p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16)); ++ ++ write_slice(de, s, slice_w, ctb_row==de->PicHeightInCtbsY-1? slice_h : ctb_size); ++ ++ if (resetQPY) p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp); ++ ++ p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1? 
0x60001 : 0x20001); ++ p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16)); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Tiles mode ++ ++static void new_entry_point(dec_env_t * const de, const HEVCContext * const s, ++ const int do_bte, const int resetQPY, const int ctb_addr_ts) { ++ const HEVCSPS * const sps = s->ps.sps; ++ const HEVCPPS * const pps = s->ps.pps; ++ ++ int ctb_col = pps->ctb_addr_ts_to_rs[ctb_addr_ts] % de->PicWidthInCtbsY; ++ int ctb_row = pps->ctb_addr_ts_to_rs[ctb_addr_ts] / de->PicWidthInCtbsY; ++ ++ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns); ++ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows); ++ ++ int endx = pps->col_bd[tile_x+1] - 1; ++ int endy = pps->row_bd[tile_y+1] - 1; ++ ++ uint8_t slice_w = ctb_to_slice_w_h(ctb_col, 1<log2_ctb_size, sps->width, pps->col_bd, pps->num_tile_columns); ++ uint8_t slice_h = ctb_to_slice_w_h(ctb_row, 1<log2_ctb_size, sps->height, pps->row_bd, pps->num_tile_rows); ++ ++ p1_apb_write(de, RPI_TILESTART, pps->col_bd[tile_x] + (pps->row_bd[tile_y]<<16)); ++ p1_apb_write(de, RPI_TILEEND, endx + (endy<<16)); ++ ++ if (do_bte) ++ p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16)); ++ ++ write_slice(de, s, slice_w, slice_h); ++ ++ if (resetQPY) ++ p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp); ++ ++ p1_apb_write(de, RPI_MODE, (0xFFFF << 0) ++ + (0x0 << 16) ++ + ((tile_x==pps->num_tile_columns-1) << 17) ++ + ((tile_y==pps->num_tile_rows-1) << 18)); ++ ++ p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16)); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++// Doesn't attempt to remove from context as we should only do this at the end ++// of time or on create error ++static void ++dec_env_delete(dec_env_t * const de) ++{ ++// gpu_free(&de->gbuf); ++ ++ av_freep(&de->cmd_fifo); ++ av_freep(&de->bit_fifo); ++ ++ sem_destroy(&de->phase_wait); ++ 
av_free(de); ++} ++ ++static dec_env_t * ++dec_env_new(AVCodecContext * const avctx, RPI_T * const rpi) ++{ ++ dec_env_t * const de = av_mallocz(sizeof(*de)); ++ int i; ++ ++ if (de == NULL) ++ return NULL; ++ ++ de->avctx = avctx; ++ de->phase_no = RPIVID_PHASE_NEW; ++ ++ sem_init(&de->phase_wait, 0, 0); ++ ++ if ((de->cmd_fifo = malloc((de->cmd_max=1024)*sizeof(struct RPI_CMD))) == NULL) ++ goto fail; ++ ++ if ((de->bit_fifo = malloc((de->bit_max=1024)*sizeof(struct RPI_BIT))) == NULL) ++ goto fail; ++ ++ pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this ++ for (i = 0; i != avctx->thread_count; ++i) { ++ if (rpi->dec_envs[i] == NULL) ++ { ++ rpi->dec_envs[i] = de; ++ break; ++ } ++ } ++ pthread_mutex_unlock(&rpi->phase_lock); ++ ++ if (i == avctx->thread_count) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to find a slot for hw thread context\n"); ++ goto fail; ++ } ++ ++ return de; ++ ++fail: ++ dec_env_delete(de); ++ return NULL; ++} ++ ++ ++static dec_env_t * ++dec_env_get(AVCodecContext * const avctx, RPI_T * const rpi) ++{ ++ dec_env_t * de = NULL; ++ const int ref_count = atomic_fetch_add(&rpi->ref_count, 1); ++ ++ if (ref_count <= 0) { ++ // Already dead ++ av_log(avctx, AV_LOG_ERROR, "RPIVID called whilst dead\n");; ++ return NULL; ++ } ++ ++ for (int i = 0; i != avctx->thread_count; ++i) { ++ if (rpi->dec_envs[i] == NULL) ++ { ++ de = dec_env_new(avctx, rpi); ++ break; ++ } ++ if (rpi->dec_envs[i]->avctx == avctx) ++ { ++ de = rpi->dec_envs[i]; ++ break; ++ } ++ } ++ return de; ++} ++ ++// Call at end of fn ++// Used to ensure we aren't in a worker thead when killed ++static void ++dec_env_release(RPI_T * const rpi, dec_env_t * const de) ++{ ++ const int n = atomic_fetch_sub(&rpi->ref_count, 1); ++ if (n == 1) { ++ sem_post(&rpi->ref_zero); ++ } ++} ++ ++//---------------------------------------------------------------------------- ++ ++// Wait for a slot in the given phase ++// Any error return is probably fatal 
++static int ++wait_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no) ++{ ++ int needs_wait = 0; ++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no; ++ ++ pthread_mutex_lock(&rpi->phase_lock); ++ if (p->last_order + 1 != de->decode_order) { ++ de->phase_wait_q_next = p->q; ++ p->q = de; ++ needs_wait = 1; ++ } ++ pthread_mutex_unlock(&rpi->phase_lock); ++ ++ if (needs_wait) { ++ while (sem_wait(&de->phase_wait) == -1) ++ { ++ int err; ++ if ((err = errno) != EINTR) ++ return AVERROR(err); ++ } ++ } ++ ++ de->phase_no = phase_no; ++ return 0; ++} ++ ++static void ++post_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no) ++{ ++ dec_env_t * next_de = NULL; ++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no; ++ dec_env_t ** q = &p->q; ++ ++ pthread_mutex_lock(&rpi->phase_lock); ++ ++ p->last_order = de->decode_order; ++ while (*q != NULL) { ++ dec_env_t * const t_de = *q; ++ ++ if (t_de->decode_order == p->last_order + 1) { ++ // This is us - remove from Q ++ *q = t_de->phase_wait_q_next; ++ t_de->phase_wait_q_next = NULL; // Tidy ++ next_de = t_de; ++ break; ++ } ++ q = &t_de->phase_wait_q_next; ++ } ++ ++ pthread_mutex_unlock(&rpi->phase_lock); ++ ++ if (next_de != NULL) ++ sem_post(&next_de->phase_wait); ++} ++ ++// Wait & signal stuff s.t. 
threads in other phases can continue ++static void ++abort_phases(RPI_T * const rpi, dec_env_t * const de) ++{ ++ for (int i = de->phase_no + 1; i < RPIVID_PHASE_NEW; ++i) { ++ wait_phase(rpi, de, i); ++ post_phase(rpi, de, i); ++ } ++ de->phase_no = RPIVID_PHASE_NEW; ++} ++ ++// Start timing for phase ++// Stats only - no actual effect ++static inline void tstart_phase(RPI_T * const rpi, const int phase_no) ++{ ++#if OPT_PHASE_TIMING ++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no; ++ const int64_t now = tus64(); ++ if (p->phase_time != 0) ++ p->time_out_phase += now - p->phase_time; ++ p->phase_time = now; ++#endif ++} ++ ++#if OPT_PHASE_TIMING ++static unsigned int tavg_bin_phase(phase_wait_env_t *const p, const unsigned int avg_n) ++{ ++ uint64_t tsum = 0; ++ unsigned int i; ++ for (i = 0; i != avg_n; ++i) ++ tsum += p->time_stash[(p->i3 - i) & 15]; ++ for (i = 0; i != 9; ++i) { ++ if (time_thresholds[i] * 1000 * avg_n > tsum) ++ break; ++ } ++ return i; ++} ++#endif ++ ++// End timing for phase ++// Stats only - no actual effect ++static inline void tend_phase(RPI_T * const rpi, const int phase_no) ++{ ++#if OPT_PHASE_TIMING ++ phase_wait_env_t *const p = rpi->phase_reqs + phase_no; ++ const uint64_t now = tus64(); ++ const uint64_t in_time = now - p->phase_time; ++ ++ p->time_in_phase += in_time; ++ p->phase_time = now; ++ p->time_stash[p->i3] = in_time; ++ if (in_time > p->max_phase_time) { ++ p->max_phase_time = in_time; ++ p->max_time_decode_order = p->last_order; ++ } ++ ++p->time_bins[tavg_bin_phase(p, 1)]; ++ ++p->time_bins3[tavg_bin_phase(p, 3)]; ++ ++p->time_bins5[tavg_bin_phase(p, 5)]; ++ ++ p->i3 = (p->i3 + 1) & 15; ++#endif ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Start frame ++ ++static int rpi_hevc_start_frame( ++ AVCodecContext * avctx, ++ const uint8_t *buffer, ++ uint32_t size) { ++ ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ dec_env_t * const de = 
dec_env_get(avctx, rpi); ++ const HEVCContext * const s = avctx->priv_data; ++ const HEVCSPS * const sps = s->ps.sps; ++ const unsigned int CtbSizeY = 1U << sps->log2_ctb_size; ++ ++#if TRACE_ENTRY ++ printf("<<< %s[%p]\n", __func__, de); ++#endif ++ ++ if (de == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); ++ return -1; ++ } ++ ++ de->phase_no = RPIVID_PHASE_START; ++ de->decode_order = ++rpi->decode_order; // *** atomic? ++ ++ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame ++ ++ if (de->state != RPIVID_DECODE_NEW && de->state != RPIVID_DECODE_END) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d", __func__, de->state); ++ return -1; ++ } ++ de->state = RPIVID_DECODE_START; ++ ++ de->PicWidthInCtbsY = (sps->width + CtbSizeY - 1) / CtbSizeY; //7-15 ++ de->PicHeightInCtbsY = (sps->height + CtbSizeY - 1) / CtbSizeY; //7-17 ++ de->bit_len = 0; ++ de->cmd_len = 0; ++ ++#if TRACE_ENTRY ++ printf(">>> %s[%p]\n", __func__, de); ++#endif ++ ++ dec_env_release(rpi, de); ++ return 0; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Slice messages ++ ++static void msg_slice(dec_env_t * const de, const uint16_t msg) { ++ de->slice_msgs[de->num_slice_msgs++] = msg; ++} ++ ++static void program_slicecmds(dec_env_t * const de, const int sliceid) { ++ int i; ++ p1_apb_write(de, RPI_SLICECMDS, de->num_slice_msgs+(sliceid<<8)); ++ for(i=0; i < de->num_slice_msgs; i++) { ++ p1_apb_write(de, 0x4000+4*i, de->slice_msgs[i] & 0xffff); ++ } ++} ++ ++static void pre_slice_decode(dec_env_t * const de, const HEVCContext * const s) { ++ const HEVCSPS * const sps = s->ps.sps; ++ const HEVCPPS * const pps = s->ps.pps; ++ const SliceHeader *sh = &s->sh; ++ ++ int weightedPredFlag, i, rIdx; ++ uint16_t cmd_slice; ++ unsigned int collocated_from_l0_flag; ++ ++ de->num_slice_msgs=0; ++ de->dpbno_col = 0; ++ cmd_slice = 0; ++ if 
(sh->slice_type==HEVC_SLICE_I) cmd_slice = 1; ++ if (sh->slice_type==HEVC_SLICE_P) cmd_slice = 2; ++ if (sh->slice_type==HEVC_SLICE_B) cmd_slice = 3; ++ ++ if (sh->slice_type!=HEVC_SLICE_I) { ++ cmd_slice += sh->nb_refs[L0]<<2; ++ cmd_slice += sh->nb_refs[L1]<<6; ++ } ++ ++ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) ++ cmd_slice |= sh->max_num_merge_cand<<11; ++ ++ collocated_from_l0_flag = ++ !sh->slice_temporal_mvp_enabled_flag ? ++ 0 : ++ sh->slice_type == HEVC_SLICE_B ? ++ (sh->collocated_list == L0) : ++ (sh->slice_type==HEVC_SLICE_P); ++ cmd_slice |= collocated_from_l0_flag<<14; ++ ++ if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) { ++ ++ int NoBackwardPredFlag = 1; // Flag to say all reference pictures are from the past ++ for(i=L0; i<=L1; i++) { ++ for(rIdx=0; rIdx < sh->nb_refs[i]; rIdx++) { ++ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx]; ++ HEVCFrame *c = s->ref; // CurrentPicture ++ if (c->poc < f->poc) NoBackwardPredFlag = 0; ++ } ++ } ++ ++ if (sps->sps_temporal_mvp_enabled_flag) ++ { ++ const RefPicList *rpl = (sh->slice_type != HEVC_SLICE_B || collocated_from_l0_flag) ? ++ s->ref->refPicList + 0 : ++ s->ref->refPicList + 1; ++ de->dpbno_col = rpl->ref[sh->collocated_ref_idx] - s->DPB; ++ } ++ ++ cmd_slice += NoBackwardPredFlag<<10; ++ msg_slice(de, cmd_slice); ++ ++ // Write reference picture descriptions ++ weightedPredFlag = sh->slice_type==HEVC_SLICE_P? 
pps->weighted_pred_flag : pps->weighted_bipred_flag; ++ ++ for(i=L0; i<=L1; i++) ++ for(rIdx=0; rIdx < sh->nb_refs[i]; rIdx++) { ++ HEVCFrame *f = s->ref->refPicList[i].ref[rIdx]; ++ HEVCFrame *c = s->ref; // CurrentPicture ++ int pic = f - s->DPB; ++ // Make sure pictures are in range 0 to 15 ++ int adjusted_pic = f < c ? pic : pic-1; ++ int lt = s->ref->refPicList[i].isLongTerm[rIdx]; ++ msg_slice(de, adjusted_pic+(lt<<4)+(weightedPredFlag<<5)+(weightedPredFlag<<6)); ++ msg_slice(de, f->poc); ++ if (weightedPredFlag) { ++ msg_slice(de, s->sh.luma_log2_weight_denom+(((i?s-> sh.luma_weight_l1: s->sh.luma_weight_l0)[rIdx] &0x1ff)<<3)); ++ msg_slice(de, (i?s-> sh.luma_offset_l1: s->sh.luma_offset_l0)[rIdx] & 0xff); ++ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][0]&0x1ff)<<3)); ++ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][0]& 0xff); ++ msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][1]&0x1ff)<<3)); ++ msg_slice(de, (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][1]& 0xff); ++ } ++ } ++ } ++ else ++ msg_slice(de, cmd_slice); ++ ++ msg_slice(de, ((sh->beta_offset/2)&15) ++ + (((sh->tc_offset/2)&15) << 4) ++ + (sh->disable_deblocking_filter_flag << 8) ++ + (sh->slice_loop_filter_across_slices_enabled_flag << 9) ++ + (pps->loop_filter_across_tiles_enabled_flag << 10)); // CMD_DEBLOCK ++ ++ msg_slice(de, ((sh->slice_cr_qp_offset&31)<<5) + (sh->slice_cb_qp_offset&31)); // CMD_QPOFF ++} ++ ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static void rpi_hevc_abort_frame(AVCodecContext * const avctx) { ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ dec_env_t * const de = dec_env_get(avctx, rpi); ++ ++#if TRACE_ENTRY ++ printf("<<< %s[%p]\n", __func__, de); ++#endif ++ ++ if (de == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); ++ return; ++ } ++ ++ switch (de->state) 
{ ++ case RPIVID_DECODE_NEW: ++ case RPIVID_DECODE_END: ++ // Expected transition ++ break; ++ ++ case RPIVID_DECODE_SLICE: ++ // Error transition ++ av_log(avctx, AV_LOG_INFO, "Error in decode - aborting\n"); ++ break; ++ ++ case RPIVID_DECODE_START: ++ default: ++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d\n", __func__, de->state); ++ break; ++ } ++ ++ abort_phases(rpi, de); ++ de->state = RPIVID_DECODE_NEW; ++ ++ dec_env_release(rpi, de); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// End frame ++ ++static int rpi_hevc_end_frame(AVCodecContext * const avctx) { ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ const HEVCContext * const s = avctx->priv_data; ++ const HEVCPPS * const pps = s->ps.pps; ++ const HEVCSPS * const sps = s->ps.sps; ++ dec_env_t * const de = dec_env_get(avctx, rpi); ++ AVFrame * const f = s->ref->frame; ++ const unsigned int dpbno_cur = s->ref - s->DPB; ++ vid_vc_addr_t cmds_vc; ++ vid_vc_addr_t pu_base_vc; ++ unsigned int pu_stride; ++ vid_vc_addr_t coeff_base_vc; ++ unsigned int coeff_stride; ++ unsigned int i; ++ int rv = 0; ++ int status = 0; ++ int coeffbuf_sem_claimed = 0; ++ ++#if TRACE_ENTRY ++ printf("<<< %s[%p]\n", __func__, de); ++#endif ++ ++ if (de == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); ++ return AVERROR_BUG; // Should never happen ++ } ++ ++ if (de->state != RPIVID_DECODE_SLICE) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state); ++ rv = AVERROR_UNKNOWN; ++ goto fail; ++ } ++ de->state = RPIVID_DECODE_END; ++ ++ // End of command compilation ++ { ++ const unsigned int last_x = pps->col_bd[pps->num_tile_columns]-1; ++ const unsigned int last_y = pps->row_bd[pps->num_tile_rows]-1; ++ if (pps->entropy_coding_sync_enabled_flag) { ++ if (de->wpp_entry_x<2 && de->PicWidthInCtbsY>2) ++ wpp_pause(de, last_y); ++ } ++ p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + 
(last_y<<18)); ++ } ++ ++ // Phase 0 --------------------------------------------------------------- ++ ++ wait_phase(rpi, de, 0); ++ rpi_sem_wait(&rpi->bitbuf_sem); ++ tstart_phase(rpi, 0); ++ ++ // Copy cmds & bits into gpu side buffer ++ // Layout: CMDS, BITS ++ { ++ uint8_t * const armbase = rpi->gbitbufs[rpi->bitbuf_no].arm; ++ vid_vc_addr_t vcbase = rpi->gbitbufs[rpi->bitbuf_no].vc; ++ unsigned int cmd_bytes = de->cmd_len * sizeof(struct RPI_CMD); ++ ++ uint8_t * p = armbase + rnd64(cmd_bytes); ++ uint8_t * const eobits = armbase + rpi->gbitbufs[rpi->bitbuf_no].numbytes; ++ ++ cmds_vc = vcbase; ++ ++ // Copy all the bits & update bitstream cmds to point at the right bits ++ for (i = 0; i < de->bit_len; ++i) ++ { ++ const unsigned int seg_len = de->bit_fifo[i].len; ++ ++ if (p + seg_len > eobits) { ++ status = -1; ++ break; ++ } ++ ++ memcpy(p, de->bit_fifo[i].ptr, seg_len); ++ de->cmd_fifo[de->bit_fifo[i].cmd].data = MANGLE64((p - armbase) + vcbase); ++ ++ p += rnd64(seg_len); ++ } ++ ++ memcpy(armbase, de->cmd_fifo, cmd_bytes); ++ } ++ ++ if (status == 0) ++ { ++ if (++rpi->bitbuf_no >= RPIVID_BITBUFS) ++ rpi->bitbuf_no = 0; ++ } ++ else ++ { ++ sem_post(&rpi->bitbuf_sem); ++ av_log(avctx, AV_LOG_ERROR, "Out of HEVC bit/cmd memory\n"); ++ rv = AVERROR_BUFFER_TOO_SMALL; ++ } ++ ++ tend_phase(rpi, 0); ++ post_phase(rpi, de, 0); ++ ++ if (status < 0) ++ goto fail; ++ ++ // Phase 1 --------------------------------------------------------------- ++ ++ wait_phase(rpi, de, 1); ++ rpi_sem_wait(&rpi->coeffbuf_sem); ++ coeffbuf_sem_claimed = 1; ++ tstart_phase(rpi, 1); ++ ++ for (;;) ++ { ++ // (Re-)allocate PU/COEFF stream space ++ const unsigned int total_size = rpi->gcoeffbufs[rpi->coeffbuf_no].numbytes; ++ unsigned int pu_size; ++ ++ pu_base_vc = rpi->gcoeffbufs[rpi->coeffbuf_no].vc; ++ pu_stride = rnd64(rpi->max_pu_msgs * 2 * de->PicWidthInCtbsY); ++ pu_size = pu_stride * de->PicHeightInCtbsY; ++ ++ if (pu_size > total_size) { ++ status = -1; ++ break; ++ } ++ ++ 
// Allocate all remaining space to coeff ++ coeff_base_vc = pu_base_vc + pu_size; ++ coeff_stride = ((total_size - pu_size) / de->PicHeightInCtbsY) & ~63; // Round down to multiple of 64 ++ ++ apb_write_vc_addr(rpi, RPI_PUWBASE, pu_base_vc); ++ apb_write_vc_len(rpi, RPI_PUWSTRIDE, pu_stride); ++ apb_write_vc_addr(rpi, RPI_COEFFWBASE, coeff_base_vc); ++ apb_write_vc_len(rpi, RPI_COEFFWSTRIDE, coeff_stride); ++ ++ // Trigger command FIFO ++ apb_write(rpi, RPI_CFNUM, de->cmd_len); ++#if TRACE_DEV ++ apb_dump_regs(rpi, 0x0, 32); ++ apb_dump_regs(rpi, 0x8000, 24); ++ axi_dump(de, ((uint64_t)a64)<<6, de->cmd_len * sizeof(struct RPI_CMD)); ++#endif ++ apb_write_vc_addr(rpi, RPI_CFBASE, cmds_vc); ++ ++ int_wait(rpi, 1); ++ ++ status = check_status(rpi, de); ++ ++ if (status != 1) ++ break; ++ ++ // Status 1 means out of PU space so try again with more ++ // If we ran out of Coeff space then we are out of memory - we could possibly realloc? ++ rpi->max_pu_msgs += rpi->max_pu_msgs / 2; ++ } ++ ++ // Inc inside the phase 1 lock, but only inc if we succeeded otherwise we ++ // may reuse a live buffer when we kick the coeff sem ++ if (status == 0) ++ { ++ if (++rpi->coeffbuf_no >= RPIVID_COEFFBUFS) ++ rpi->coeffbuf_no = 0; ++ } ++ else ++ { ++ if (status == -1) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Out of pu + coeff intermediate memory: pus=%d\n", rpi->max_pu_msgs); ++ rv = AVERROR_BUFFER_TOO_SMALL; ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_WARNING, "Phase 1 decode error\n"); ++ rv = AVERROR_INVALIDDATA; ++ } ++ } ++ ++ tend_phase(rpi, 1); ++ sem_post(&rpi->bitbuf_sem); ++ post_phase(rpi, de, 1); ++ ++ if (status != 0) ++ goto fail; ++ ++ // Phase 2 --------------------------------------------------------------- ++ ++ wait_phase(rpi, de, 2); ++ ++ if ((rv = av_rpi_zc_resolve_frame(f, ZC_RESOLVE_ALLOC)) != 0) ++ { ++ // As we are in phase 2 already here we don't need to worry about ++ // ceoffbuf_no despite the early exit ++ post_phase(rpi, de, 2); ++ av_log(avctx, AV_LOG_ERROR, 
"Failed to allocate output frame\n"); ++ goto fail; ++ } ++ ++ tstart_phase(rpi, 2); ++ ++ apb_write_vc_addr(rpi, RPI_PURBASE, pu_base_vc); ++ apb_write_vc_len(rpi, RPI_PURSTRIDE, pu_stride); ++ apb_write_vc_addr(rpi, RPI_COEFFRBASE, coeff_base_vc); ++ apb_write_vc_len(rpi, RPI_COEFFRSTRIDE, coeff_stride); ++ ++ apb_write_vc_addr(rpi, RPI_OUTYBASE, get_vc_address_y(f)); ++ apb_write_vc_addr(rpi, RPI_OUTCBASE, get_vc_address_u(f)); ++ apb_write_vc_len(rpi, RPI_OUTYSTRIDE, f->linesize[3] * 128); ++ apb_write_vc_len(rpi, RPI_OUTCSTRIDE, f->linesize[3] * 128); ++ ++ // Keep the last thing we resolved as fallback for any ref we fail to ++ // resolve. As a final fallback use our current frame. The pels might ++ // not be there yet but at least the memory is valid. ++ // ++ // Attempt to resolve the entire DPB - we could note what we have used ++ // in ref lists but probably simpler and more reliable to set the whole thing ++ { ++ AVFrame * fallback_frame = f; ++ for (i = 0; i != 16; ++i) { ++ // Avoid current frame ++ const HEVCFrame * hevc_fr = (s->DPB + i >= s->ref) ? 
s->DPB + i + 1 : s->DPB + i; ++ AVFrame * fr = hevc_fr->frame; ++ ++ if (fr != NULL && ++ av_rpi_zc_resolve_frame(fr, ZC_RESOLVE_FAIL) == 0) ++ { ++ fallback_frame = fr; ++ } ++ else ++ { ++ fr = fallback_frame; ++ } ++ ++ apb_write_vc_addr(rpi, 0x9000+16*i, get_vc_address_y(fr)); ++ apb_write(rpi, 0x9004+16*i, 0); ++ apb_write_vc_addr(rpi, 0x9008+16*i, get_vc_address_u(fr)); ++ apb_write(rpi, 0x900C+16*i, 0); ++ } ++ } ++ ++ apb_write(rpi, RPI_CONFIG2, ++ (sps->bit_depth << 0) // BitDepthY ++ + (sps->bit_depth << 4) // BitDepthC ++ + ((sps->bit_depth>8) << 8) // BitDepthY ++ + ((sps->bit_depth>8) << 9) // BitDepthC ++ + (sps->log2_ctb_size <<10) ++ + (pps->constrained_intra_pred_flag <<13) ++ + (sps->sps_strong_intra_smoothing_enable_flag<<14) ++ + (sps->sps_temporal_mvp_enabled_flag <<15) ++ + (pps->log2_parallel_merge_level <<16) ++ + (s->sh.slice_temporal_mvp_enabled_flag <<19) ++ + (sps->pcm.loop_filter_disable_flag <<20) ++ + ((pps->cb_qp_offset&31) <<21) ++ + ((pps->cr_qp_offset&31) <<26)); ++ ++ apb_write(rpi, RPI_FRAMESIZE, (sps->height<<16) + sps->width); ++ apb_write(rpi, RPI_CURRPOC, s->poc); ++ ++ // collocated reads/writes ++ if (sps->sps_temporal_mvp_enabled_flag) { ++ av_assert0(de->dpbno_col < RPIVID_COL_PICS); ++ av_assert0(dpbno_cur < RPIVID_COL_PICS); ++ ++ apb_write_vc_len(rpi, RPI_COLSTRIDE, rpi->col_stride); ++ apb_write_vc_len(rpi, RPI_MVSTRIDE, rpi->col_stride); ++ apb_write_vc_addr(rpi, RPI_MVBASE, rpi->gcolbuf.vc + dpbno_cur * rpi->col_picsize); ++ apb_write_vc_addr(rpi, RPI_COLBASE, rpi->gcolbuf.vc + de->dpbno_col * rpi->col_picsize); ++ } ++ ++#if TRACE_DEV ++ apb_dump_regs(rpi, 0x0, 32); ++ apb_dump_regs(rpi, 0x8000, 24); ++#endif ++ ++ apb_write(rpi, RPI_NUMROWS, de->PicHeightInCtbsY); ++ apb_read(rpi, RPI_NUMROWS); // Read back to confirm write has reached block ++ ++ int_wait(rpi, 2); ++ ++ tend_phase(rpi, 2); ++ coeffbuf_sem_claimed = 0; ++ sem_post(&rpi->coeffbuf_sem); ++ // Set valid here to avoid race in resolving in any pending 
phase 2 ++ av_rpi_zc_set_valid_frame(f); ++ ++ post_phase(rpi, de, 2); ++ ++ // Flush frame for CPU access ++ // Arguably the best place would be at the start of phase 2 but here ++ // will overlap with the wait ++ // ++ // * Even better would be to have better lock/unlock control in ZC for external access ++ if (rpi->gpu_init_type == GPU_INIT_GPU) // * CMA is currently always uncached ++ { ++ rpi_cache_buf_t cbuf; ++ rpi_cache_flush_env_t * const fe = rpi_cache_flush_init(&cbuf); ++ rpi_cache_flush_add_frame(fe, f, RPI_CACHE_FLUSH_MODE_INVALIDATE); ++ rpi_cache_flush_finish(fe); ++ } ++ ++#if TRACE_ENTRY ++ printf(">>> %s[%p] OK\n", __func__, de); ++#endif ++ ++ dec_env_release(rpi, de); ++ return 0; ++ ++fail: ++ av_rpi_zc_set_broken_frame(f); ++ if (coeffbuf_sem_claimed) ++ sem_post(&rpi->coeffbuf_sem); ++ abort_phases(rpi, de); // Dummy any unresolved phases ++ ++#if TRACE_ENTRY ++ printf(">>> %s[%p] FAIL\n", __func__, de); ++#endif ++ ++ dec_env_release(rpi, de); ++ return rv; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static void WriteBitstream(dec_env_t * const de, const HEVCContext * const s) { ++ const int rpi_use_emu = 0; // FFmpeg removes emulation prevention bytes ++ const int offset = 0; // Always 64-byte aligned in sim, need not be on real hardware ++ const GetBitContext *gb = &s->HEVClc->gb; ++ const int len = 1 + gb->size_in_bits/8 - gb->index/8; ++ const void *ptr = &gb->buffer[gb->index/8]; ++ ++ p1_axi_write(de, len, ptr, p1_apb_write(de, RPI_BFBASE, 0)); // BFBASE set later ++ p1_apb_write(de, RPI_BFNUM, len); ++ p1_apb_write(de, RPI_BFCONTROL, offset + (1<<7)); // Stop ++ p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu<<6)); ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Wavefront mode ++ ++static void wpp_decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) ++{ ++ const HEVCPPS * const pps = s->ps.pps; ++ ++ int i, 
resetQPY=1; ++ int indep = !s->sh.dependent_slice_segment_flag; ++ int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY; ++ ++ if (ctb_addr_ts) ++ wpp_end_previous_slice(de, s, ctb_addr_ts); ++ pre_slice_decode(de, s); ++ WriteBitstream(de, s); ++ if (ctb_addr_ts==0 || indep || de->PicWidthInCtbsY==1) ++ WriteProb(de, s); ++ else if (ctb_col==0) ++ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); ++ else ++ resetQPY=0; ++ program_slicecmds(de, s->slice_idx); ++ new_slice_segment(de, s); ++ wpp_entry_point(de, s, indep, resetQPY, ctb_addr_ts); ++ for (i=0; i < s->sh.num_entry_point_offsets; i++) { ++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY; ++ int last_x = de->PicWidthInCtbsY-1; ++ if (de->PicWidthInCtbsY>2) ++ wpp_pause(de, ctb_row); ++ p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + (last_x<<5) + 2); ++ if (de->PicWidthInCtbsY==2) ++ p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP); ++ if (de->PicWidthInCtbsY==1) ++ WriteProb(de, s); ++ else ++ p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD); ++ ctb_addr_ts += pps->column_width[0]; ++ wpp_entry_point(de, s, 0, 1, ctb_addr_ts); ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++// Tiles mode ++ ++static void decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) { ++ const HEVCPPS * const pps = s->ps.pps; ++ int i, resetQPY; ++ ++ if (ctb_addr_ts) end_previous_slice(de, s, ctb_addr_ts); ++ pre_slice_decode(de, s); ++ WriteBitstream(de, s); ++ resetQPY = ctb_addr_ts==0 ++ || pps->tile_id[ctb_addr_ts]!=pps->tile_id[ctb_addr_ts-1] ++ || !s->sh.dependent_slice_segment_flag; ++ if (resetQPY) WriteProb(de, s); ++ program_slicecmds(de, s->slice_idx); ++ new_slice_segment(de, s); ++ new_entry_point(de, s, !s->sh.dependent_slice_segment_flag, resetQPY, ctb_addr_ts); ++ for (i=0; i < s->sh.num_entry_point_offsets; i++) { ++ int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts]; ++ int ctb_col = ctb_addr_rs % 
de->PicWidthInCtbsY; ++ int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY; ++ int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns); ++ int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows); ++ int last_x = pps->col_bd[tile_x+1]-1; ++ int last_y = pps->row_bd[tile_y+1]-1; ++ p1_apb_write(de, RPI_STATUS, 2 + (last_x<<5) + (last_y<<18)); ++ WriteProb(de, s); ++ ctb_addr_ts += pps->column_width[tile_x] * pps->row_height[tile_y]; ++ new_entry_point(de, s, 0, 1, ctb_addr_ts); ++ } ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static int cabac_start_align(HEVCContext *s) ++{ ++ GetBitContext *gb = &s->HEVClc->gb; ++ skip_bits(gb, 1); ++ align_get_bits(gb); ++ // Should look at getting rid of this ++ return ff_init_cabac_decoder(&s->HEVClc->cc, ++ gb->buffer + get_bits_count(gb) / 8, ++ (get_bits_left(gb) + 7) / 8); ++} ++ ++static int rpi_hevc_decode_slice( ++ AVCodecContext *avctx, ++ const uint8_t *buffer, ++ uint32_t size) ++{ ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ HEVCContext * const s = avctx->priv_data; ++ dec_env_t * const de = dec_env_get(avctx, rpi); ++ const HEVCPPS *pps = s->ps.pps; ++ int ctb_addr_ts = pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs]; ++ ++#if TRACE_ENTRY ++ printf("<<< %s[%p]\n", __func__, de); ++#endif ++ if (de == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__); ++ return -1; ++ } ++ ++ if (de->state != RPIVID_DECODE_START && de->state != RPIVID_DECODE_SLICE) { ++ av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state); ++ dec_env_release(rpi, de); return -1; // Balance the dec_env_get() above, as the success exit does ++ } ++ de->state = RPIVID_DECODE_SLICE; ++ ++// ff_hevc_cabac_init(s, ctb_addr_ts); ++ cabac_start_align(s); ++ if (s->ps.sps->scaling_list_enable_flag) ++ populate_scaling_factors(de, s); ++ pps->entropy_coding_sync_enabled_flag? 
wpp_decode_slice(de, s, ctb_addr_ts) ++ : decode_slice(de, s, ctb_addr_ts); ++#if TRACE_ENTRY ++ printf(">>> %s[%p]\n", __func__, de); ++#endif ++ dec_env_release(rpi, de); ++ return 0; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static int rpivid_retrieve_data(void *logctx, AVFrame *frame) ++{ ++ int rv; ++ if ((rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_WAIT_VALID)) != 0) ++ av_log(logctx, AV_LOG_ERROR, "Unable to resolve output frame\n"); ++ return rv; ++} ++ ++static int rpivid_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) ++{ ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ HEVCContext * const s = avctx->priv_data; ++ // Frame buffering + 1 output. Would need thread_count extra but we now ++ // alloc at the start of phase 2 so that is the only thread we need the ++ // extra buffer for. ++ const unsigned int pool_req = s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering + 1; ++ int rv; ++ ++ if (av_rpi_zc_in_use(avctx)) ++ { ++ const AVZcEnvPtr zc = avctx->opaque; ++ av_rpi_zc_set_decoder_pool_size(zc, pool_req); ++ rv = av_rpi_zc_get_buffer(zc, frame); // get_buffer2 would alloc; keep the result so rv is defined below ++ } ++ else ++ { ++ if (rpi->zc == NULL) { ++ pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this ++ // Alloc inside lock to make sure we only ever alloc one ++ if (rpi->zc == NULL) { ++ rpi->zc = av_rpi_zc_int_env_alloc(s); ++ } ++ pthread_mutex_unlock(&rpi->phase_lock); ++ } ++ av_rpi_zc_set_decoder_pool_size(rpi->zc, pool_req); // Ignored by local allocator, but set anyway :-) ++ rv = (rpi->zc == NULL) ? 
AVERROR(ENOMEM) : ++ av_rpi_zc_get_buffer(rpi->zc, frame); ++ } ++ ++ if (rv == 0 && ++ (rv = ff_attach_decode_data(frame)) < 0) ++ { ++ av_frame_unref(frame); ++ } ++ ++ if (rv == 0) ++ { ++ FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data; ++ fdd->post_process = rpivid_retrieve_data; ++ } ++ ++ return rv; ++} ++ ++#if OPT_PHASE_TIMING ++static void log_bin_phase(AVCodecContext * const avctx, const unsigned int * const bins) ++{ ++ av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d %7d\n", ++ bins[0], bins[1], bins[2], bins[3], ++ bins[4], bins[5], bins[6], bins[7], bins[8]); ++} ++#endif ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static int rpi_hevc_free(AVCodecContext *avctx) { ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++ ++#if TRACE_ENTRY ++ printf("<<< %s\n", __func__); ++#endif ++ ++ dec_env_release(rpi, NULL); ++ ++ // Wait for everything else to stop ++ { ++ struct timespec tt; ++ clock_gettime(CLOCK_REALTIME, &tt); ++ tt.tv_sec += 2; ++ while (sem_timedwait(&rpi->ref_zero, &tt) == -1) { ++ const int err = errno; ++ if (err == ETIMEDOUT) { ++ av_log(avctx, AV_LOG_FATAL, "Rpivid worker threads still running\n"); ++ return -1; ++ } ++ if (err != EINTR) { ++ av_log(avctx, AV_LOG_ERROR, "Unexpected error %d waiting for work thread to stop\n", err); ++ break; ++ } ++ } ++ } ++ ++#if OPT_PHASE_TIMING ++ { ++ unsigned int i; ++ for (i = 0; i != RPIVID_PHASES; ++i) { ++ const phase_wait_env_t * const p = rpi->phase_reqs + i; ++ av_log(avctx, AV_LOG_INFO, "Phase %u: In %3u.%06u, Out %3u.%06u\n", i, ++ (unsigned int)(p->time_in_phase / 1000000), (unsigned int)(p->time_in_phase % 1000000), ++ (unsigned int)(p->time_out_phase / 1000000), (unsigned int)(p->time_out_phase % 1000000)); ++ av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d >\n", ++ time_thresholds[0], time_thresholds[1], time_thresholds[2], time_thresholds[3], ++ time_thresholds[4], time_thresholds[5], 
time_thresholds[6], time_thresholds[7]); ++ log_bin_phase(avctx, p->time_bins); ++ log_bin_phase(avctx, p->time_bins3); ++ log_bin_phase(avctx, p->time_bins5); ++ av_log(avctx, AV_LOG_INFO, "Longest duraction: %ums @ frame %u\n", ++ (unsigned int)(p->max_phase_time / 1000), ++ p->max_time_decode_order); ++ } ++ av_log(avctx, AV_LOG_INFO, "PU max=%d\n", rpi->max_pu_msgs); ++ } ++#endif ++ ++ if (rpi->dec_envs != NULL) ++ { ++ for (int i = 0; i < avctx->thread_count && rpi->dec_envs[i] != NULL; ++i) { // Index must start at 0 to walk dec_envs ++ dec_env_delete(rpi->dec_envs[i]); ++ } ++ av_freep(&rpi->dec_envs); ++ } ++ ++ av_rpi_zc_int_env_freep(&rpi->zc); ++ ++ gpu_free(&rpi->gcolbuf); ++ ++ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) { ++ gpu_free(rpi->gbitbufs + i); ++ } ++ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) { ++ gpu_free(rpi->gcoeffbufs + i); ++ } ++ ++ unmap_devp(&rpi->regs, REGS_SIZE); ++ unmap_devp(&rpi->ints, INTS_SIZE); ++ ++ if (rpi->gpu_init_type > 0) ++ rpi_mem_gpu_uninit(); ++ ++ if (rpi->mbox_fd >= 0) { ++ mbox_release_clock(rpi->mbox_fd); ++ mbox_close(rpi->mbox_fd); ++ } ++ ++ sem_destroy(&rpi->ref_zero); ++ sem_destroy(&rpi->coeffbuf_sem); ++ sem_destroy(&rpi->bitbuf_sem); ++ ++#if TRACE_ENTRY ++ printf(">>> %s\n", __func__); ++#endif ++ return 0; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++static int rpi_hevc_init(AVCodecContext *avctx) { ++ RPI_T * const rpi = avctx->internal->hwaccel_priv_data; ++// const char *err; ++ ++#if TRACE_ENTRY ++ printf("<<< %s\n", __func__); ++#endif ++ ++ if (avctx->width>4096 || avctx->height>4096) { ++ av_log(NULL, AV_LOG_FATAL, "Picture size %dx%d exceeds 4096x4096 maximum for HWAccel\n", avctx->width, avctx->height); ++ return AVERROR(ENOTSUP); ++ } ++ ++ memset(rpi, 0, sizeof(*rpi)); ++ ++ rpi->mbox_fd = -1; ++ rpi->decode_order = 0; ++ ++ // Initial PU/COEFF stream buffer split chosen as worst case seen so far ++ rpi->max_pu_msgs = 768; // 7.2 says at most 1611 messages per CTU ++ ++ 
++ atomic_store(&rpi->ref_count, 1); ++ sem_init(&rpi->ref_zero, 0, 0); ++ ++ sem_init(&rpi->bitbuf_sem, 0, RPIVID_BITBUFS); ++ sem_init(&rpi->coeffbuf_sem, 0, RPIVID_COEFFBUFS); ++ ++ pthread_mutex_init(&rpi->phase_lock, NULL); ++ ++ if ((rpi->mbox_fd = mbox_open()) < 0) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Failed to open mailbox\n"); ++ goto fail; ++ } ++ mbox_request_clock(rpi->mbox_fd); ++ ++ if ((rpi->regs = map_dev(avctx, REGS_NAME, REGS_SIZE)) == NULL || ++ (rpi->ints = map_dev(avctx, INTS_NAME, INTS_SIZE)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to open rpivid devices\n"); ++ goto fail; ++ } ++ ++ if ((rpi->gpu_init_type = rpi_mem_gpu_init(0)) < 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to init GPU\n"); ++ goto fail; ++ } ++ ++ if ((rpi->dec_envs = av_mallocz(sizeof(dec_env_t *) * avctx->thread_count)) == NULL) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d dec envs\n", avctx->thread_count); ++ goto fail; ++ } ++ ++ rpi->col_stride = rnd64(avctx->width); ++ rpi->col_picsize = rpi->col_stride * (((avctx->height + 63) & ~63) >> 4); ++ if (gpu_malloc_uncached(rpi->col_picsize * RPIVID_COL_PICS, &rpi->gcolbuf) != 0) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate col mv buffer\n"); ++ goto fail; ++ } ++ ++ for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) { ++ if (gpu_malloc_uncached(RPIVID_BITBUF_SIZE, rpi->gbitbufs + i) != 0) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate bitbuf %d\n", i); ++ goto fail; ++ } ++ } ++ ++ for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) { ++ if (gpu_malloc_uncached(RPIVID_COEFFBUF_SIZE, rpi->gcoeffbufs + i) != 0) ++ { ++ av_log(avctx, AV_LOG_ERROR, "Failed to allocate coeffbuf %d\n", i); ++ goto fail; ++ } ++ } ++ ++ return 0; ++ ++fail: ++ rpi_hevc_free(avctx); ++ return AVERROR_EXTERNAL; ++} ++ ++////////////////////////////////////////////////////////////////////////////// ++ ++const AVHWAccel ff_hevc_rpi4_8_hwaccel = { ++ .name = "hevc_rpi4_8", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = 
AV_CODEC_ID_HEVC, ++ .pix_fmt = AV_PIX_FMT_RPI4_8, ++ .alloc_frame = rpivid_hevc_alloc_frame, ++ .start_frame = rpi_hevc_start_frame, ++ .end_frame = rpi_hevc_end_frame, ++ .abort_frame = rpi_hevc_abort_frame, ++ .decode_slice = rpi_hevc_decode_slice, ++ .init = rpi_hevc_init, ++ .uninit = rpi_hevc_free, ++ .priv_data_size = sizeof(RPI_T), ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, ++}; ++ ++const AVHWAccel ff_hevc_rpi4_10_hwaccel = { ++ .name = "hevc_rpi4_10", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_HEVC, ++ .pix_fmt = AV_PIX_FMT_RPI4_10, ++ .alloc_frame = rpivid_hevc_alloc_frame, ++ .start_frame = rpi_hevc_start_frame, ++ .end_frame = rpi_hevc_end_frame, ++ .abort_frame = rpi_hevc_abort_frame, ++ .decode_slice = rpi_hevc_decode_slice, ++ .init = rpi_hevc_init, ++ .uninit = rpi_hevc_free, ++ .priv_data_size = sizeof(RPI_T), ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, ++}; ++ +diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c +index 02f23d954b..522009ccfb 100644 +--- a/libavcodec/v4l2_buffers.c ++++ b/libavcodec/v4l2_buffers.c +@@ -21,6 +21,7 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include + #include + #include + #include +@@ -30,12 +31,13 @@ + #include "libavcodec/avcodec.h" + #include "libavcodec/internal.h" + #include "libavutil/pixdesc.h" ++#include "libavutil/hwcontext.h" + #include "v4l2_context.h" + #include "v4l2_buffers.h" + #include "v4l2_m2m.h" + + #define USEC_PER_SEC 1000000 +-static AVRational v4l2_timebase = { 1, USEC_PER_SEC }; ++static const AVRational v4l2_timebase = { 1, USEC_PER_SEC }; + + static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf) + { +@@ -52,10 +54,8 @@ static inline AVCodecContext *logger(V4L2Buffer *buf) + static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf) + { + V4L2m2mContext *s = buf_to_m2mctx(avbuf); +- +- if (s->avctx->pkt_timebase.num) +- return s->avctx->pkt_timebase; +- return 
s->avctx->time_base; ++ const AVRational tb = s->avctx->pkt_timebase.num ? s->avctx->pkt_timebase : s->avctx->time_base; ++ return tb.num && tb.den ? tb : v4l2_timebase; + } + + static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts) +@@ -210,7 +210,79 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf) + return AVCOL_TRC_UNSPECIFIED; + } + +-static void v4l2_free_buffer(void *opaque, uint8_t *unused) ++static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf) ++{ ++ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame; ++ AVDRMLayerDescriptor *layer; ++ ++ /* fill the DRM frame descriptor */ ++ drm_desc->nb_objects = avbuf->num_planes; ++ drm_desc->nb_layers = 1; ++ ++ layer = &drm_desc->layers[0]; ++ layer->nb_planes = avbuf->num_planes; ++ ++ for (int i = 0; i < avbuf->num_planes; i++) { ++ layer->planes[i].object_index = i; ++ layer->planes[i].offset = 0; ++ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline; ++ } ++ ++ switch (avbuf->context->av_pix_fmt) { ++ case AV_PIX_FMT_YUYV422: ++ ++ layer->format = DRM_FORMAT_YUYV; ++ layer->nb_planes = 1; ++ ++ break; ++ ++ case AV_PIX_FMT_NV12: ++ case AV_PIX_FMT_NV21: ++ ++ layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ? 
++ DRM_FORMAT_NV12 : DRM_FORMAT_NV21; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 2; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ avbuf->context->format.fmt.pix.height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline; ++ break; ++ ++ case AV_PIX_FMT_YUV420P: ++ ++ layer->format = DRM_FORMAT_YUV420; ++ ++ if (avbuf->num_planes > 1) ++ break; ++ ++ layer->nb_planes = 3; ++ ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline * ++ avbuf->context->format.fmt.pix.height; ++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ ++ layer->planes[2].object_index = 0; ++ layer->planes[2].offset = layer->planes[1].offset + ++ ((avbuf->plane_info[0].bytesperline * ++ avbuf->context->format.fmt.pix.height) >> 2); ++ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1; ++ break; ++ ++ default: ++ drm_desc->nb_layers = 0; ++ break; ++ } ++ ++ return (uint8_t *) drm_desc; ++} ++ ++static void v4l2_free_buffer(void *opaque, uint8_t *data) + { + V4L2Buffer* avbuf = opaque; + V4L2m2mContext *s = buf_to_m2mctx(avbuf); +@@ -226,14 +298,52 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused) + /* no need to queue more buffers to the driver */ + avbuf->status = V4L2BUF_AVAILABLE; + } +- else if (avbuf->context->streamon) ++ else if (avbuf->context->streamon) { ++ avbuf->buf.timestamp.tv_sec = 0; ++ avbuf->buf.timestamp.tv_usec = 0; + ff_v4l2_buffer_enqueue(avbuf); ++ } ++ else { ++ av_log(logger(avbuf), AV_LOG_ERROR, "=== %s: Buffer freed but streamoff\n", avbuf->context->name); ++ } + } + + av_buffer_unref(&avbuf->context_ref); + } + } + ++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf) ++{ ++ struct v4l2_exportbuffer expbuf; ++ int i, ret; ++ ++ for (i = 0; i < avbuf->num_planes; i++) { ++ memset(&expbuf, 0, sizeof(expbuf)); ++ ++ expbuf.index = avbuf->buf.index; ++ expbuf.type = avbuf->buf.type; 
++ expbuf.plane = i; ++ ++ ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf); ++ if (ret < 0) ++ return AVERROR(errno); ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) { ++ /* drm frame */ ++ avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length; ++ avbuf->drm_frame.objects[i].fd = expbuf.fd; ++ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ } else { ++ /* drm frame */ ++ avbuf->drm_frame.objects[0].size = avbuf->buf.length; ++ avbuf->drm_frame.objects[0].fd = expbuf.fd; ++ avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ } ++ } ++ ++ return 0; ++} ++ + static int v4l2_buf_increase_ref(V4L2Buffer *in) + { + V4L2m2mContext *s = buf_to_m2mctx(in); +@@ -254,6 +364,24 @@ static int v4l2_buf_increase_ref(V4L2Buffer *in) + return 0; + } + ++static int v4l2_buf_to_bufref_drm(V4L2Buffer *in, AVBufferRef **buf) ++{ ++ int ret; ++ ++ *buf = av_buffer_create((uint8_t *) &in->drm_frame, ++ sizeof(in->drm_frame), ++ v4l2_free_buffer, ++ in, AV_BUFFER_FLAG_READONLY); ++ if (!*buf) ++ return AVERROR(ENOMEM); ++ ++ ret = v4l2_buf_increase_ref(in); ++ if (ret) ++ av_buffer_unref(buf); ++ ++ return ret; ++} ++ + static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) + { + int ret; +@@ -274,7 +402,18 @@ static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf) + return ret; + } + +-static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset, AVBufferRef* bref) ++static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length) ++{ ++ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { ++ out->planes[plane].bytesused = bytesused; ++ out->planes[plane].length = length; ++ } else { ++ out->buf.bytesused = bytesused; ++ out->buf.length = length; ++ } ++} ++ ++static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset) + { + unsigned int bytesused, length; + +@@ 
-286,13 +425,7 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i + + memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset)); + +- if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) { +- out->planes[plane].bytesused = bytesused; +- out->planes[plane].length = length; +- } else { +- out->buf.bytesused = bytesused; +- out->buf.length = length; +- } ++ set_buf_length(out, plane, bytesused, length); + + return 0; + } +@@ -303,13 +436,25 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) + + frame->format = avbuf->context->av_pix_fmt; + +- for (i = 0; i < avbuf->num_planes; i++) { +- ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); ++ if (buf_to_m2mctx(avbuf)->output_drm) { ++ /* 1. get references to the actual data */ ++ ret = v4l2_buf_to_bufref_drm(avbuf, &frame->buf[0]); + if (ret) + return ret; + +- frame->linesize[i] = avbuf->plane_info[i].bytesperline; +- frame->data[i] = frame->buf[i]->data; ++ frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref); ++ } else { ++ /* 1. get references to the actual data */ ++ for (i = 0; i < avbuf->num_planes; i++) { ++ ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]); ++ if (ret) ++ return ret; ++ ++ frame->linesize[i] = avbuf->plane_info[i].bytesperline; ++ frame->data[i] = frame->buf[i]->data; ++ } + } + + /* fixup special cases */ +@@ -338,68 +483,95 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf) + return 0; + } + +-static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) ++static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h) + { +- int i, ret; +- struct v4l2_format fmt = out->context->format; +- int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? 
+- fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat; +- int height = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ? +- fmt.fmt.pix_mp.height : fmt.fmt.pix.height; +- int is_planar_format = 0; +- +- switch (pixel_format) { +- case V4L2_PIX_FMT_YUV420M: +- case V4L2_PIX_FMT_YVU420M: +-#ifdef V4L2_PIX_FMT_YUV422M +- case V4L2_PIX_FMT_YUV422M: +-#endif +-#ifdef V4L2_PIX_FMT_YVU422M +- case V4L2_PIX_FMT_YVU422M: +-#endif +-#ifdef V4L2_PIX_FMT_YUV444M +- case V4L2_PIX_FMT_YUV444M: +-#endif +-#ifdef V4L2_PIX_FMT_YVU444M +- case V4L2_PIX_FMT_YVU444M: +-#endif +- case V4L2_PIX_FMT_NV12M: +- case V4L2_PIX_FMT_NV21M: +- case V4L2_PIX_FMT_NV12MT_16X16: +- case V4L2_PIX_FMT_NV12MT: +- case V4L2_PIX_FMT_NV16M: +- case V4L2_PIX_FMT_NV61M: +- is_planar_format = 1; ++ if (dst_stride == src_stride && w + 32 >= dst_stride) { ++ memcpy(dst, src, dst_stride * h); ++ } ++ else { ++ while (--h >= 0) { ++ memcpy(dst, src, w); ++ dst += dst_stride; ++ src += src_stride; ++ } + } ++} + +- if (!is_planar_format) { +- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); +- int planes_nb = 0; +- int offset = 0; ++static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes) ++{ ++ return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA)); ++} ++ ++static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out) ++{ ++ int i; ++ int num_planes = 0; ++ int pel_strides[4] = {0}; ++ ++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format); ++ ++ if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) { ++ av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__); ++ return -1; ++ } + +- for (i = 0; i < desc->nb_components; i++) +- planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1); ++ for (i = 0; i != desc->nb_components; ++i) { ++ if (desc->comp[i].plane >= num_planes) ++ num_planes = desc->comp[i].plane + 1; ++ pel_strides[desc->comp[i].plane] = desc->comp[i].step; ++ } + +- for (i = 0; i < planes_nb; i++) { +- 
int size, h = height; +- if (i == 1 || i == 2) { ++ if (out->num_planes > 1) { ++ if (num_planes != out->num_planes) { ++ av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes); ++ return -1; ++ } ++ for (i = 0; i != num_planes; ++i) { ++ int w = frame->width; ++ int h = frame->height; ++ if (is_chroma(desc, i, num_planes)) { ++ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); + h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); + } +- size = frame->linesize[i] * h; +- ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, offset, frame->buf[i]); +- if (ret) +- return ret; +- offset += size; ++ ++ cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline, ++ frame->data[i], frame->linesize[i], ++ w * pel_strides[i], h); ++ set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length); + } +- return 0; + } ++ else ++ { ++ unsigned int offset = 0; ++ ++ for (i = 0; i != num_planes; ++i) { ++ int w = frame->width; ++ int h = frame->height; ++ int dst_stride = out->plane_info[0].bytesperline; ++ uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset; ++ ++ if (is_chroma(desc, i, num_planes)) { ++ // Is chroma ++ dst_stride >>= desc->log2_chroma_w; ++ offset += dst_stride * (out->context->height >> desc->log2_chroma_h); ++ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w); ++ h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h); ++ } ++ else { ++ // Is luma or alpha ++ offset += dst_stride * out->context->height; ++ } ++ if (offset > out->plane_info[0].length) { ++ av_log(NULL, AV_LOG_ERROR, "%s: Plane total %d > buffer size %d\n", __func__, offset, out->plane_info[0].length); ++ return -1; ++ } + +- for (i = 0; i < out->num_planes; i++) { +- ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0, frame->buf[i]); +- if (ret) +- return ret; ++ cpy_2d(dst, dst_stride, ++ frame->data[i], frame->linesize[i], ++ w * pel_strides[i], h); ++ } ++ set_buf_length(out, 0, offset, 
out->plane_info[0].length); + } +- + return 0; + } + +@@ -475,11 +647,17 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf) + return 0; + } + +-int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, const void *extdata, size_t extlen) + { + int ret; + +- ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0, pkt->buf); ++ if (extlen) { ++ ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0); ++ if (ret) ++ return ret; ++ } ++ ++ ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen); + if (ret) + return ret; + +@@ -491,6 +669,11 @@ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) + return 0; + } + ++int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out) ++{ ++ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0); ++} ++ + int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + { + V4L2Context *ctx = avbuf->context; +@@ -500,6 +683,27 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + avbuf->buf.type = ctx->type; + avbuf->buf.index = index; + ++ if (buf_to_m2mctx(avbuf)->output_drm) { ++ AVHWFramesContext *hwframes; ++ ++ av_buffer_unref(&ctx->frames_ref); ++ ++ ctx->frames_ref = av_hwframe_ctx_alloc(buf_to_m2mctx(avbuf)->device_ref); ++ if (!ctx->frames_ref) { ++ ret = AVERROR(ENOMEM); ++ return ret; ++ } ++ ++ hwframes = (AVHWFramesContext*)ctx->frames_ref->data; ++ hwframes->format = AV_PIX_FMT_DRM_PRIME; ++ hwframes->sw_format = ctx->av_pix_fmt; ++ hwframes->width = ctx->width; ++ hwframes->height = ctx->height; ++ ret = av_hwframe_ctx_init(ctx->frames_ref); ++ if (ret < 0) ++ return ret; ++ } ++ + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->buf.length = VIDEO_MAX_PLANES; + avbuf->buf.m.planes = avbuf->planes; +@@ -527,14 +731,22 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->plane_info[i].length = 
avbuf->buf.m.planes[i].length; +- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, +- PROT_READ | PROT_WRITE, MAP_SHARED, +- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); ++ ++ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || ++ !buf_to_m2mctx(avbuf)->output_drm) { ++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset); ++ } + } else { + avbuf->plane_info[i].length = avbuf->buf.length; +- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, +- PROT_READ | PROT_WRITE, MAP_SHARED, +- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); ++ ++ if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) || ++ !buf_to_m2mctx(avbuf)->output_drm) { ++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length, ++ PROT_READ | PROT_WRITE, MAP_SHARED, ++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset); ++ } + } + + if (avbuf->plane_info[i].mm_addr == MAP_FAILED) +@@ -543,9 +755,6 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + + avbuf->status = V4L2BUF_AVAILABLE; + +- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) +- return 0; +- + if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) { + avbuf->buf.m.planes = avbuf->planes; + avbuf->buf.length = avbuf->num_planes; +@@ -555,6 +764,15 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index) + avbuf->buf.length = avbuf->planes[0].length; + } + ++ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) ++ return 0; ++ ++ if (buf_to_m2mctx(avbuf)->output_drm) { ++ ret = v4l2_buffer_export_drm(avbuf); ++ if (ret) ++ return ret; ++ } ++ + return ff_v4l2_buffer_enqueue(avbuf); + } + +@@ -568,6 +786,9 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf) + if (ret < 0) + return AVERROR(errno); + ++ ++avbuf->context->q_count; ++ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, count=%d\n", avbuf->context->name, avbuf->buf.index, 
avbuf->context->q_count); ++ + avbuf->status = V4L2BUF_IN_DRIVER; + + return 0; +diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h +index 8dbc7fc104..7baf618c66 100644 +--- a/libavcodec/v4l2_buffers.h ++++ b/libavcodec/v4l2_buffers.h +@@ -27,6 +27,7 @@ + #include + #include + ++#include "libavutil/hwcontext_drm.h" + #include "avcodec.h" + + enum V4L2Buffer_status { +@@ -42,6 +43,9 @@ typedef struct V4L2Buffer { + /* each buffer needs to have a reference to its context */ + struct V4L2Context *context; + ++ /* DRM descriptor */ ++ AVDRMFrameDescriptor drm_frame; ++ + /* This object is refcounted per-plane, so we need to keep track + * of how many context-refs we are holding. */ + AVBufferRef *context_ref; +@@ -98,6 +102,8 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf); + */ + int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out); + ++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out, const void *extdata, size_t extlen); ++ + /** + * Extracts the data from an AVFrame to a V4L2Buffer + * +diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c +index 29b144ed73..e87b5a4432 100644 +--- a/libavcodec/v4l2_context.c ++++ b/libavcodec/v4l2_context.c +@@ -173,7 +173,8 @@ static int v4l2_handle_event(V4L2Context *ctx) + } + + if (evt.type == V4L2_EVENT_EOS) { +- ctx->done = 1; ++// ctx->done = 1; ++ av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name); + return 0; + } + +@@ -280,6 +281,21 @@ static int v4l2_stop_encode(V4L2Context *ctx) + return 0; + } + ++static int count_in_driver(const V4L2Context * const ctx) ++{ ++ int i; ++ int n = 0; ++ ++ if (!ctx->buffers) ++ return -1; ++ ++ for (i = 0; i < ctx->num_buffers; ++i) { ++ if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) ++ ++n; ++ } ++ return n; ++} ++ + static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) + { + struct v4l2_plane planes[VIDEO_MAX_PLANES]; +@@ -296,11 +312,13 @@ static V4L2Buffer* 
v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout) + if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER) + break; + } ++#if 1 + if (i == ctx->num_buffers) +- av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to " ++ av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to " + "userspace. Increase num_capture_buffers " + "to prevent device deadlock or dropped " +- "packets/frames.\n"); ++ "packets/frames.\n", i); ++#endif + } + + /* if we are draining and there are no more capture buffers queued in the driver we are done */ +@@ -329,11 +347,16 @@ start: + } + + for (;;) { +- ret = poll(&pfd, 1, timeout); ++ int t2 = timeout < 0 ? 3000 : timeout; ++ int e = pfd.events; ++ ret = poll(&pfd, 1, t2); + if (ret > 0) + break; + if (errno == EINTR) + continue; ++ if (timeout == -1) { ++ av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));; ++ } + return NULL; + } + +@@ -398,23 +421,43 @@ dequeue: + if (ret) { + if (errno != EAGAIN) { + ctx->done = 1; +- if (errno != EPIPE) ++// if (errno != EPIPE) + av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n", + ctx->name, av_err2str(AVERROR(errno))); + } + return NULL; + } ++ --ctx->q_count; ++ av_log(logger(ctx), AV_LOG_TRACE, "--- %s VIDIOC_DQBUF OK: index=%d, count=%d\n", ++ ctx->name, buf.index, ctx->q_count); ++ + + if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) { + int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ? 
+ buf.m.planes[0].bytesused : buf.bytesused; + if (bytesused == 0) { ++ av_log(logger(ctx), AV_LOG_TRACE, "Buffer empty - reQ\n"); ++ ++ // Must reQ so we don't leak ++ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_QBUF, &buf); ++ if (ret) { ++ av_log(logger(ctx), AV_LOG_WARNING, "%s VIDIOC_QBUF, errno (%s): reQ empty buf failed\n", ++ ctx->name, av_err2str(AVERROR(errno))); ++ } ++ else { ++ ++ctx->q_count; ++ av_log(logger(ctx), AV_LOG_TRACE, "--- %s VIDIOC_QBUF OK: index=%d, count=%d\n", ++ ctx->name, buf.index, ctx->q_count); ++ } ++ + ctx->done = 1; + return NULL; + } + #ifdef V4L2_BUF_FLAG_LAST +- if (buf.flags & V4L2_BUF_FLAG_LAST) ++ if (buf.flags & V4L2_BUF_FLAG_LAST){ ++ av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n"); + ctx->done = 1; ++ } + #endif + } + +@@ -455,22 +498,54 @@ static int v4l2_release_buffers(V4L2Context* ctx) + struct v4l2_requestbuffers req = { + .memory = V4L2_MEMORY_MMAP, + .type = ctx->type, +- .count = 0, /* 0 -> unmaps buffers from the driver */ ++ .count = 0, /* 0 -> unmap all buffers from the driver */ + }; +- int i, j; ++ int ret, i, j; + + for (i = 0; i < ctx->num_buffers; i++) { + V4L2Buffer *buffer = &ctx->buffers[i]; + + for (j = 0; j < buffer->num_planes; j++) { + struct V4L2Plane_info *p = &buffer->plane_info[j]; ++ ++ if (V4L2_TYPE_IS_OUTPUT(ctx->type)) { ++ /* output buffers are not EXPORTED */ ++ goto unmap; ++ } ++ ++ if (ctx_to_m2mctx(ctx)->output_drm) { ++ /* use the DRM frame to close */ ++ if (buffer->drm_frame.objects[j].fd >= 0) { ++ if (close(buffer->drm_frame.objects[j].fd) < 0) { ++ av_log(logger(ctx), AV_LOG_ERROR, "%s close drm fd " ++ "[buffer=%2d, plane=%d, fd=%2d] - %s \n", ++ ctx->name, i, j, buffer->drm_frame.objects[j].fd, ++ av_err2str(AVERROR(errno))); ++ } ++ } ++ } ++unmap: + if (p->mm_addr && p->length) + if (munmap(p->mm_addr, p->length) < 0) +- av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno))); ++ av_log(logger(ctx), AV_LOG_ERROR, "%s unmap 
plane (%s))\n", ++ ctx->name, av_err2str(AVERROR(errno))); + } + } + +- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); ++ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req); ++ if (ret < 0) { ++ av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n", ++ ctx->name, av_err2str(AVERROR(errno))); ++ ++ if (ctx_to_m2mctx(ctx)->output_drm) ++ av_log(logger(ctx), AV_LOG_ERROR, ++ "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n" ++ "for all buffers: \n" ++ " 1. drmModeRmFB(..)\n" ++ " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n"); ++ } ++ ++ return ret; + } + + static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt) +@@ -499,6 +574,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm + + static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) + { ++ V4L2m2mContext* s = ctx_to_m2mctx(ctx); ++ V4L2m2mPriv *priv = s->avctx->priv_data; + enum AVPixelFormat pixfmt = ctx->av_pix_fmt; + struct v4l2_fmtdesc fdesc; + int ret; +@@ -517,6 +594,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p) + if (ret) + return AVERROR(EINVAL); + ++ if (priv->pix_fmt != AV_PIX_FMT_NONE) { ++ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) { ++ fdesc.index++; ++ continue; ++ } ++ } ++ + pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO); + ret = v4l2_try_raw_format(ctx, pixfmt); + if (ret){ +@@ -575,10 +659,16 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd) + int ret; + + ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type); +- if (ret < 0) +- return AVERROR(errno); ++ if (ret < 0) { ++ const int err = errno; ++ av_log(logger(ctx), AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name, ++ cmd, (cmd == VIDIOC_STREAMON) ? 
"ON" : "OFF", err); ++ return AVERROR(err); ++ } + + ctx->streamon = (cmd == VIDIOC_STREAMON); ++ av_log(logger(ctx), AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name, ++ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF"); + + return 0; + } +@@ -608,7 +698,7 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame) + return ff_v4l2_buffer_enqueue(avbuf); + } + +-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) ++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * extdata, size_t extlen) + { + V4L2m2mContext *s = ctx_to_m2mctx(ctx); + V4L2Buffer* avbuf; +@@ -626,7 +716,7 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt) + if (!avbuf) + return AVERROR(EAGAIN); + +- ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf); ++ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen); + if (ret) + return ret; + +diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h +index 22a9532444..e459c72c45 100644 +--- a/libavcodec/v4l2_context.h ++++ b/libavcodec/v4l2_context.h +@@ -92,6 +92,9 @@ typedef struct V4L2Context { + */ + int done; + ++ AVBufferRef *frames_ref; ++ int q_count; ++ + } V4L2Context; + + /** +@@ -170,7 +173,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout); + * @param[in] pkt A pointer to an AVPacket. + * @return 0 in case of success, a negative error otherwise. 
+ */ +-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt); ++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size); + + /** + * Enqueues a buffer to a V4L2Context from an AVFrame +diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c +index e48b3a8ccf..5543ac77ba 100644 +--- a/libavcodec/v4l2_m2m.c ++++ b/libavcodec/v4l2_m2m.c +@@ -338,6 +338,13 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) + V4L2m2mContext *s = priv->context; + int ret; + ++ if (!s) ++ return 0; ++ ++ if (av_codec_is_decoder(s->avctx->codec)) ++ av_packet_unref(&s->buf_pkt); ++ ++ if (s->fd >= 0) { + ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF); + if (ret) + av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->output.name); +@@ -345,11 +352,16 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv) + ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF); + if (ret) + av_log(s->avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s\n", s->capture.name); ++ } + + ff_v4l2_context_release(&s->output); + + s->self_ref = NULL; ++ // This is only called on avctx close so after this point we don't have that ++ // Crash sooner if we find we are using it (can still log with avctx = NULL) ++ s->avctx = NULL; + av_buffer_unref(&priv->context_ref); ++ priv->context = NULL; + + return 0; + } +diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h +index 456281f48c..b08a5b38ac 100644 +--- a/libavcodec/v4l2_m2m.h ++++ b/libavcodec/v4l2_m2m.h +@@ -30,6 +30,7 @@ + #include + + #include "libavcodec/avcodec.h" ++#include "libavutil/pixfmt.h" + #include "v4l2_context.h" + + #define container_of(ptr, type, member) ({ \ +@@ -38,7 +39,18 @@ + + #define V4L_M2M_DEFAULT_OPTS \ + { "num_output_buffers", "Number of buffers in the output context",\ +- OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS } ++ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS } ++ 
++#define FF_V4L2_M2M_TRACK_SIZE 128 ++typedef struct V4L2m2mTrackEl { ++ int discard; // If we see this buffer its been flushed, so discard ++ int pkt_size; ++ int64_t pts; ++ int64_t reordered_opaque; ++ int64_t pkt_pos; ++ int64_t pkt_duration; ++ int64_t track_pts; ++} V4L2m2mTrackEl; + + typedef struct V4L2m2mContext { + char devname[PATH_MAX]; +@@ -63,6 +75,23 @@ typedef struct V4L2m2mContext { + + /* reference back to V4L2m2mPriv */ + void *priv; ++ ++ AVBufferRef *device_ref; ++ ++ /* generate DRM frames */ ++ int output_drm; ++ ++ /* Frame tracking */ ++ int64_t last_pkt_dts; ++ int64_t last_opaque; ++ unsigned int track_no; ++ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE]; ++ ++ /* req pkt */ ++ int req_pkt; ++ ++ /* Ext data sent */ ++ int extdata_sent; + } V4L2m2mContext; + + typedef struct V4L2m2mPriv { +@@ -73,6 +102,7 @@ typedef struct V4L2m2mPriv { + + int num_output_buffers; + int num_capture_buffers; ++ enum AVPixelFormat pix_fmt; + } V4L2m2mPriv; + + /** +diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c +index 3e17e0fcac..c397f2ca2f 100644 +--- a/libavcodec/v4l2_m2m_dec.c ++++ b/libavcodec/v4l2_m2m_dec.c +@@ -23,6 +23,9 @@ + + #include + #include ++ ++#include "libavutil/hwcontext.h" ++#include "libavutil/hwcontext_drm.h" + #include "libavutil/pixfmt.h" + #include "libavutil/pixdesc.h" + #include "libavutil/opt.h" +@@ -30,26 +33,51 @@ + #include "libavcodec/decode.h" + #include "libavcodec/internal.h" + ++#include "libavcodec/hwaccels.h" ++#include "libavcodec/internal.h" ++#include "libavcodec/hwconfig.h" ++ + #include "v4l2_context.h" + #include "v4l2_m2m.h" + #include "v4l2_fmt.h" + ++static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s) ++{ ++ int ret; ++ struct v4l2_decoder_cmd cmd = { ++ .cmd = V4L2_DEC_CMD_START, ++ .flags = 0, ++ }; ++ ++ if (s->output.streamon) ++ return 0; ++ ++ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON); ++ if (ret < 0) ++ av_log(avctx, 
AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n"); ++ ++ if (!s->capture.streamon || ret < 0) ++ return ret; ++ ++ ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); ++ else ++ av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n", errno); ++ ++ return ret; ++} ++ + static int v4l2_try_start(AVCodecContext *avctx) + { + V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; + V4L2Context *const capture = &s->capture; +- V4L2Context *const output = &s->output; + struct v4l2_selection selection = { 0 }; + int ret; + + /* 1. start the output process */ +- if (!output->streamon) { +- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON); +- if (ret < 0) { +- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n"); +- return ret; +- } +- } ++ if ((ret = check_output_streamon(avctx, s)) != 0) ++ return ret; + + if (capture->streamon) + return 0; +@@ -63,8 +91,14 @@ static int v4l2_try_start(AVCodecContext *avctx) + } + + /* 2.1 update the AVCodecContext */ +- avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); +- capture->av_pix_fmt = avctx->pix_fmt; ++ capture->av_pix_fmt = ++ ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO); ++ if (s->output_drm) { ++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; ++ avctx->sw_pix_fmt = capture->av_pix_fmt; ++ } ++ else ++ avctx->pix_fmt = capture->av_pix_fmt; + + /* 3. set the crop parameters */ + selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; +@@ -133,28 +167,257 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s) + return 0; + } + ++#define XLAT_PTS 1 ++ ++static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n) ++{ ++ const AVRational t = avctx->pkt_timebase.num ? avctx->pkt_timebase : avctx->time_base; ++ return !t.num || !t.den ? 
(int64_t)n * 1000000 : ((int64_t)n * t.den) / (t.num); ++} ++ ++static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts) ++{ ++ const AVRational t = avctx->pkt_timebase.num ? avctx->pkt_timebase : avctx->time_base; ++ return (unsigned int)(!t.num || !t.den ? pts / 1000000 : (pts * t.num) / t.den); ++} ++ ++static void ++xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt) ++{ ++#if XLAT_PTS ++ int64_t track_pts; ++ ++ // Avoid 0 ++ if (++s->track_no == 0) ++ s->track_no = 1; ++ ++ track_pts = track_to_pts(avctx, s->track_no); ++ ++ av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no); ++ s->last_pkt_dts = avpkt->dts; ++ s->track_els[s->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){ ++ .discard = 0, ++ .pkt_size = avpkt->size, ++ .pts = avpkt->pts, ++ .reordered_opaque = avctx->reordered_opaque, ++ .pkt_pos = avpkt->pos, ++ .pkt_duration = avpkt->duration, ++ .track_pts = track_pts ++ }; ++ avpkt->pts = track_pts; ++#endif ++} ++ ++// Returns -1 if we should discard the frame ++static int ++xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame) ++{ ++#if XLAT_PTS ++ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE; ++ const V4L2m2mTrackEl *const t = s->track_els + n; ++ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts) ++ { ++ av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ frame->pts = AV_NOPTS_VALUE; ++ frame->pkt_dts = s->last_pkt_dts; ++ frame->reordered_opaque = s->last_opaque; ++ frame->pkt_pos = -1; ++ frame->pkt_duration = 0; ++ frame->pkt_size = -1; ++ } ++ else if (!t->discard) ++ { ++ frame->pts = t->pts; ++ frame->pkt_dts = s->last_pkt_dts; ++ frame->reordered_opaque = t->reordered_opaque; ++ frame->pkt_pos = t->pkt_pos; ++ frame->pkt_duration = 
t->pkt_duration; ++ frame->pkt_size = t->pkt_size; ++ ++ s->last_opaque = s->track_els[n].reordered_opaque; ++ s->track_els[n].pts = AV_NOPTS_VALUE; // If we hit this again deny accurate knowledge of PTS ++ } ++ else ++ { ++ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts); ++ return -1; ++ } ++ ++#if FF_API_PKT_PTS ++FF_DISABLE_DEPRECATION_WARNINGS ++ frame->pkt_pts = frame->pts; ++FF_ENABLE_DEPRECATION_WARNINGS ++#endif ++ frame->best_effort_timestamp = frame->pts; ++ frame->pkt_dts = frame->pts; // We can't emulate what s/w does in a useful manner? ++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 ", DTS=%" PRId64 "\n", frame->pts, frame->pkt_dts); ++#endif ++ return 0; ++} ++ ++static inline int stream_started(const V4L2m2mContext * const s) { ++ return s->capture.streamon && s->output.streamon; ++} ++ ++ ++// -ve Error ++// 0 OK ++// 1 Dst full (retry if we think V4L2 Q has space now) ++// 2 Src empty (do not retry) ++// 3 Not started (do not retry, do not attempt capture dQ) ++ ++static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s) ++{ ++ AVPacket avpkt = {0}; ++ int ret = 0; ++ int ret2 = 0; ++ ++ if (s->buf_pkt.size) { ++ av_packet_move_ref(&avpkt, &s->buf_pkt); ++ } else { ++ ret = ff_decode_get_packet(avctx, &avpkt); ++ if (ret == AVERROR(EAGAIN)) { ++ if (!stream_started(s)) { ++ av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__); ++ return 3; ++ } ++ return 2; ++ } ++ ++ if (ret == AVERROR_EOF || avpkt.size == 0) { ++ // EOF - enter drain mode ++ av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n", ret, avpkt.size, stream_started(s), s->draining); ++ if (!stream_started(s)) { ++ av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n"); ++ s->draining = 1; ++ s->capture.done = 1; ++ return AVERROR_EOF; ++ } ++ ++ if (!s->draining) { ++ // On the offchance that get_packet left something 
that needs freeing in here ++ av_packet_unref(&avpkt); ++ // Calling enqueue with an empty pkt starts drain ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &avpkt, NULL, 0); ++ if (ret) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret); ++ return ret; ++ } ++ } ++ return 2; ++ } ++ ++ if (ret < 0) ++ return ret; ++ ++ xlat_pts_in(avctx, s, &avpkt); ++ } ++ ++ if ((ret = check_output_streamon(avctx, s)) != 0) ++ return ret; ++ ++ ret = ff_v4l2_context_enqueue_packet(&s->output, &avpkt, ++ avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size); ++ s->extdata_sent = 1; ++ ++ if (ret == AVERROR(EAGAIN)) { ++ // Out of input buffers - stash ++ av_packet_move_ref(&s->buf_pkt, &avpkt); ++ ret = 1; ++ } ++ else { ++ // In all other cases we are done with this packet ++ av_packet_unref(&avpkt); ++ ++ if (ret) { ++ av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret); ++ return ret; ++ } ++ } ++ ++ // Start if we haven't ++ ret2 = v4l2_try_start(avctx); ++ if (ret2) { ++ av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2); ++ ret = (ret2 == AVERROR(ENOMEM)) ? 
ret2 : 3; ++ } ++ ++ return ret; ++} ++ + static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + { ++#if 1 ++ V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context; ++ int src_rv; ++ int dst_rv = 1; ++ ++ do { ++ src_rv = try_enqueue_src(avctx, s); ++ ++ if (src_rv < 0) { ++ av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", src_rv); ++ } ++ ++ if (s->req_pkt && src_rv == 2 && !s->draining) ++ break; ++ ++ if (src_rv == 1 && dst_rv == AVERROR(EAGAIN)) { ++ av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue fail"); ++ src_rv = 2; ++ } ++ ++ if (src_rv >= 0 && src_rv <= 2 && dst_rv != 0) { ++ do { ++ // Dequeue frame will unref any previous contents of frame ++ // so we don't need an explicit unref when discarding ++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, -1); ++ ++ if (dst_rv < 0) { ++ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n", s->draining, s->capture.done, dst_rv); ++ } ++ ++ } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame)); ++ } ++ } while (src_rv == 0 || (src_rv == 1 && dst_rv == AVERROR(EAGAIN)) ); ++ ++ if (dst_rv) ++ av_frame_unref(frame); ++ ++ // If we got a frame this time ask for a pkt next time ++ s->req_pkt = (dst_rv == 0); ++ ++ return dst_rv == 0 ? 0 : ++ src_rv < 0 ? src_rv : ++ dst_rv < 0 ? 
dst_rv : ++ AVERROR(EAGAIN); ++ ++#else + V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context; + V4L2Context *const capture = &s->capture; + V4L2Context *const output = &s->output; + AVPacket avpkt = {0}; +- int ret; ++ int ret = 0; + + if (s->buf_pkt.size) { +- avpkt = s->buf_pkt; +- memset(&s->buf_pkt, 0, sizeof(AVPacket)); ++ av_packet_move_ref(&avpkt, &s->buf_pkt); + } else { + ret = ff_decode_get_packet(avctx, &avpkt); +- if (ret < 0 && ret != AVERROR_EOF) ++ if (ret < 0 && ret != AVERROR_EOF && ret != AVERROR(EAGAIN)) + return ret; ++ if (ret == 0) ++ xlat_pts_in(avctx, s, &avpkt); + } + +- if (s->draining) ++ if (ret) + goto dequeue; + +- ret = ff_v4l2_context_enqueue_packet(output, &avpkt); ++// av_log(avctx, AV_LOG_INFO, "Extdata len=%d, sent=%d\n", avctx->extradata_size, s->extdata_sent); ++ ret = ff_v4l2_context_enqueue_packet(output, &avpkt, ++ avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size); ++ s->extdata_sent = 1; + if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret); + if (ret != AVERROR(EAGAIN)) + return ret; + +@@ -178,9 +441,36 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) + dequeue: + if (!s->buf_pkt.size) + av_packet_unref(&avpkt); +- return ff_v4l2_context_dequeue_frame(capture, frame, -1); ++ ++ ret = ff_v4l2_context_dequeue_frame(capture, frame, -1); ++ if (!ret) ++ xlat_pts_out(avctx, s, frame); ++ return ret; ++#endif + } + ++#if 0 ++#include ++static int64_t us_time(void) ++{ ++ struct timespec ts; ++ clock_gettime(CLOCK_MONOTONIC, &ts); ++ return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000; ++} ++ ++static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame) ++{ ++ int ret; ++ const int64_t now = us_time(); ++ int64_t done; ++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ret = v4l2_receive_frame2(avctx, frame); ++ done = us_time(); ++ av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret); ++ 
return ret; ++} ++#endif ++ + static av_cold int v4l2_decode_init(AVCodecContext *avctx) + { + V4L2Context *capture, *output; +@@ -188,6 +478,9 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + V4L2m2mPriv *priv = avctx->priv_data; + int ret; + ++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME; ++ + ret = ff_v4l2_m2m_create_context(priv, &s); + if (ret < 0) + return ret; +@@ -208,13 +501,32 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + capture->av_codec_id = AV_CODEC_ID_RAWVIDEO; + capture->av_pix_fmt = avctx->pix_fmt; + ++ /* the client requests the codec to generate DRM frames: ++ * - data[0] will therefore point to the returned AVDRMFrameDescriptor ++ * check the ff_v4l2_buffer_to_avframe conversion function. ++ * - the DRM frame format is passed in the DRM frame descriptor layer. ++ * check the v4l2_get_drm_frame function. ++ */ ++ switch (ff_get_format(avctx, avctx->codec->pix_fmts)) { ++ default: ++ s->output_drm = 1; ++ break; ++ } ++ ++ s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM); ++ if (!s->device_ref) { ++ ret = AVERROR(ENOMEM); ++ return ret; ++ } ++ ++ ret = av_hwdevice_ctx_init(s->device_ref); ++ if (ret < 0) ++ return ret; ++ + s->avctx = avctx; + ret = ff_v4l2_m2m_codec_init(priv); + if (ret) { + av_log(avctx, AV_LOG_ERROR, "can't configure decoder\n"); +- s->self_ref = NULL; +- av_buffer_unref(&priv->context_ref); +- + return ret; + } + +@@ -223,10 +535,68 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx) + + static av_cold int v4l2_decode_close(AVCodecContext *avctx) + { ++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ return ff_v4l2_m2m_codec_end(avctx->priv_data); ++ av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__); ++} ++ ++static void v4l2_decode_flush(AVCodecContext *avctx) ++{ ++ ++#if 0 ++ v4l2_decode_close(avctx); ++ v4l2_decode_init(avctx); ++#else + V4L2m2mPriv *priv = avctx->priv_data; +- V4L2m2mContext *s = priv->context; +- 
av_packet_unref(&s->buf_pkt); +- return ff_v4l2_m2m_codec_end(priv); ++ V4L2m2mContext* s = priv->context; ++ V4L2Context* output = &s->output; ++ V4L2Context* capture = &s->capture; ++ int ret, i; ++ ++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__); ++ ++ ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret); ++ ++ for (i = 0; i < output->num_buffers; i++) { ++ if (output->buffers[i].status == V4L2BUF_IN_DRIVER) ++ output->buffers[i].status = V4L2BUF_AVAILABLE; ++ } ++ ++ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) ++ s->track_els[i].discard = 1; ++ ++#if 0 ++ ++ ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", capture->name, ret); ++ ++ ++ ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON %s error: %d\n", capture->name, ret); ++ ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON %s error: %d\n", output->name, ret); ++ ++ struct v4l2_decoder_cmd cmd = { ++ .cmd = V4L2_DEC_CMD_START, ++ .flags = 0, ++ }; ++ ++ ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno); ++#endif ++ ++ s->draining = 0; ++ s->extdata_sent = 0; ++ output->done = 0; ++ capture->done = 0; ++#endif ++ av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__); + } + + #define OFFSET(x) offsetof(V4L2m2mPriv, x) +@@ -235,10 +605,16 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx) + static const AVOption options[] = { + V4L_M2M_DEFAULT_OPTS, + { "num_capture_buffers", "Number of buffers in the capture context", +- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 20, INT_MAX, FLAGS }, ++ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, 
INT_MAX, FLAGS }, ++ { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS }, + { NULL}, + }; + ++static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = { ++ HW_CONFIG_INTERNAL(DRM_PRIME), ++ NULL ++}; ++ + #define M2MDEC_CLASS(NAME) \ + static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \ + .class_name = #NAME "_v4l2m2m_decoder", \ +@@ -259,9 +635,14 @@ static const AVOption options[] = { + .init = v4l2_decode_init, \ + .receive_frame = v4l2_receive_frame, \ + .close = v4l2_decode_close, \ ++ .flush = v4l2_decode_flush, \ + .bsfs = bsf_name, \ + .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \ +- .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS, \ ++ .caps_internal = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \ ++ .pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \ ++ AV_PIX_FMT_NV12, \ ++ AV_PIX_FMT_NONE}, \ ++ .hw_configs = v4l2_m2m_hw_configs, \ + .wrapper_name = "v4l2m2m", \ + } + +diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c +index 32321f392f..9f1b2c2ffc 100644 +--- a/libavcodec/v4l2_m2m_enc.c ++++ b/libavcodec/v4l2_m2m_enc.c +@@ -416,6 +416,7 @@ static const AVCodecDefault v4l2_m2m_defaults[] = { + .close = v4l2_encode_close, \ + .defaults = v4l2_m2m_defaults, \ + .capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY, \ ++ .caps_internal = FF_CODEC_CAP_INIT_CLEANUP, \ + .wrapper_name = "v4l2m2m", \ + } + +diff --git a/libavcodec/v4l2_phase.c b/libavcodec/v4l2_phase.c +new file mode 100644 +index 0000000000..0a7f6abd33 +--- /dev/null ++++ b/libavcodec/v4l2_phase.c +@@ -0,0 +1,140 @@ ++// v4l2_phase.c ++ ++#include ++#include ++#include ++ ++#include "libavutil/log.h" ++#include "v4l2_phase.h" ++ ++typedef struct phase_envss { ++ unsigned int last_order; ++ pthread_mutex_t lock; ++ pthread_cond_t cond; ++} phase_env; ++ ++struct V4L2PhaseControl { ++ 
unsigned int order; ++ unsigned int phase_count; ++ phase_env p[V4L2PHASE_PHASE_COUNT]; ++}; ++ ++ ++unsigned int ff_v4l2_phase_order_next(V4L2PhaseControl * const pc) ++{ ++ return ++pc->order; ++} ++ ++// Phase isn't required but it acts as a check that we know what we are doing ++int ++ff_v4l2_phase_claim(V4L2PhaseInfo * const pi, unsigned int phase) ++{ ++ V4L2PhaseControl *const pc = pi->ctrl; ++ phase_env * const p = pc->p + phase; ++ ++ if (pi->n2 != phase * 2) { ++ av_log(NULL, AV_LOG_ERROR, "%s: Unexpected phase: req=%d, cur=%d/%d\n", __func__, phase, pi->n2 >> 1, pi->n2 & 1); ++ return -1; ++ } ++ ++ pthread_mutex_lock(&p->lock); ++ ++ while (pi->order != p->last_order + 1) { ++ pthread_cond_wait(&p->cond, &p->lock); ++ } ++ ++ pi->n2++; ++ pthread_mutex_unlock(&p->lock); ++ return 0; ++} ++ ++int ++ff_v4l2_phase_release(V4L2PhaseInfo * const pi, unsigned int phase) ++{ ++ V4L2PhaseControl *const pc = pi->ctrl; ++ phase_env * const p = pc->p + phase; ++ ++ if (pi->n2 != ((phase << 1) | 1)) { ++ av_log(NULL, AV_LOG_ERROR, "%s: Unexpected phase: req=%d, cur=%d/%d\n", __func__, phase, pi->n2 >> 1, pi->n2 & 1); ++ return -1; ++ } ++ ++ if (pi->order != p->last_order + 1) { ++ av_log(NULL, AV_LOG_ERROR, "%s: order_mismatch\n", __func__); ++ return -1; ++ } ++ ++ pthread_mutex_lock(&p->lock); ++ p->last_order = pi->order; ++ pi->n2++; ++ pthread_cond_broadcast(&p->cond); ++ pthread_mutex_unlock(&p->lock); ++ return 0; ++} ++ ++// Init the PhaseInfo, assign a new order, claim phase 0 ++int ++ff_v4l2_phase_start(V4L2PhaseInfo * const pi, V4L2PhaseControl * const pc) ++{ ++ pi->n2 = 0; ++ pi->ctrl = pc; ++ pi->order = ff_v4l2_phase_order_next(pc); ++ return ff_v4l2_phase_claim(pi, 0); ++} ++ ++// Release any claimed phase and claim+release all remaining phases ++void ff_v4l2_phase_abort(V4L2PhaseInfo * const pi) ++{ ++ V4L2PhaseControl *const pc = pi->ctrl; ++ ++ // Nothing to do ++ if (pi->n2 == 0 || pi->n2 >= pc->phase_count * 2) ++ return; ++ ++ // Run through 
all remaining phases ++ do { ++ if ((pi->n2 & 1) == 0) ++ ff_v4l2_phase_claim(pi, pi->n2 >> 1); ++ else ++ ff_v4l2_phase_release(pi, pi->n2 >> 1); ++ } while (pi->n2 < pc->phase_count * 2); ++} ++ ++ ++V4L2PhaseControl * ++ff_v4l2_phase_control_new(unsigned int phase_count) ++{ ++ V4L2PhaseControl * pc; ++ unsigned int i; ++ if (phase_count > V4L2PHASE_PHASE_COUNT) ++ return NULL; ++ if ((pc = av_mallocz(sizeof(*pc))) == NULL) ++ return NULL; ++ pc->phase_count = phase_count; ++ for (i = 0; i != phase_count; ++i) { ++ phase_env * const p = pc->p + i; ++ p->last_order = 0; ++ pthread_mutex_init(&p->lock, NULL); ++ pthread_cond_init(&p->cond, NULL); ++ } ++ return pc; ++} ++ ++void ++ff_v4l2_phase_control_deletez(V4L2PhaseControl ** const ppc) ++{ ++ V4L2PhaseControl * const pc = *ppc; ++ unsigned int i; ++ ++ if (pc == NULL) ++ return; ++ *ppc = NULL; ++ ++ for (i = 0; i != pc->phase_count; ++i) { ++ phase_env * const p = pc->p + i; ++ pthread_mutex_destroy(&p->lock); ++ pthread_cond_destroy(&p->cond); ++ } ++} ++ ++ +diff --git a/libavcodec/v4l2_phase.h b/libavcodec/v4l2_phase.h +new file mode 100644 +index 0000000000..392f22b988 +--- /dev/null ++++ b/libavcodec/v4l2_phase.h +@@ -0,0 +1,37 @@ ++// v4l2_phase.h ++#ifndef AVCODEC_V4L2_PHASE_H ++#define AVCODEC_V4L2_PHASE_H ++ ++#define V4L2PHASE_PHASE_COUNT 2 ++ ++struct V4L2PhaseControl; ++typedef struct V4L2PhaseControl V4L2PhaseControl; ++ ++typedef struct V4L2PhaseInfo { ++ unsigned int n2; // (phase << 1) | (claimed) ++ unsigned int order; ++ V4L2PhaseControl * ctrl; ++} V4L2PhaseInfo; ++ ++unsigned int ff_v4l2_phase_order_next(V4L2PhaseControl * const pc); ++ ++static inline int ff_v4l2_phase_started(const V4L2PhaseInfo * const pi) ++{ ++ return pi->n2 != 0; ++} ++ ++// Init the PhaseInfo, assign a new order, claim phase 0 ++int ff_v4l2_phase_start(V4L2PhaseInfo * const pi, V4L2PhaseControl * const pc); ++ ++// Phase isn't required but it acts as a check that we know what we are doing ++int 
ff_v4l2_phase_claim(V4L2PhaseInfo * const pi, unsigned int phase); ++int ff_v4l2_phase_release(V4L2PhaseInfo * const pi, unsigned int phase); ++ ++// Release any claimed phase and claim+release all remaining phases ++void ff_v4l2_phase_abort(V4L2PhaseInfo * const pi); ++ ++ ++V4L2PhaseControl * ff_v4l2_phase_control_new(unsigned int phase_count); ++void ff_v4l2_phase_control_deletez(V4L2PhaseControl ** const ppc); ++ ++#endif +diff --git a/libavcodec/v4l2_request.c b/libavcodec/v4l2_request.c +new file mode 100644 +index 0000000000..bfd2f22ab4 +--- /dev/null ++++ b/libavcodec/v4l2_request.c +@@ -0,0 +1,1102 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include "decode.h" ++#include "internal.h" ++#include "v4l2_request.h" ++#include "v4l2_phase.h" ++ ++#ifndef DRM_FORMAT_NV15 ++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') ++#endif ++ ++#ifndef DRM_FORMAT_NV20 ++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') ++#endif ++ ++// P030 should be defined in drm_fourcc.h and hopefully will be sometime ++// in the future but until then... 
++#ifndef DRM_FORMAT_P030 ++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') ++#endif ++ ++#ifndef DRM_FORMAT_NV15 ++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') ++#endif ++ ++#ifndef DRM_FORMAT_NV20 ++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') ++#endif ++ ++#ifndef V4L2_PIX_FMT_NV12_10_COL128 ++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0') ++#endif ++ ++#ifndef V4L2_PIX_FMT_NV12_COL128 ++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */ ++#endif ++ ++uint64_t ff_v4l2_request_get_capture_timestamp(AVFrame *frame) ++{ ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ return req ? v4l2_timeval_to_ns(&req->capture.buffer.timestamp) : 0; ++} ++ ++int ff_v4l2_request_start_phase_control(AVFrame *frame, struct V4L2PhaseControl * ctrl) ++{ ++ V4L2RequestDescriptor * const req = (V4L2RequestDescriptor*)frame->data[0]; ++ return ff_v4l2_phase_start(&req->phase, ctrl); ++} ++ ++void ff_v4l2_request_abort_phase_control(AVFrame *frame) ++{ ++ if (frame != NULL && frame->data[0] != NULL) { ++ V4L2RequestDescriptor *const req = (V4L2RequestDescriptor *)frame->data[0]; ++ ff_v4l2_phase_abort(&req->phase); ++ } ++} ++ ++int ff_v4l2_request_reset_frame(AVCodecContext *avctx, AVFrame *frame) ++{ ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ memset(&req->drm, 0, sizeof(AVDRMFrameDescriptor)); ++ req->output.used = 0; ++ return 0; ++} ++ ++int ff_v4l2_request_append_output_buffer(AVCodecContext *avctx, AVFrame *frame, const uint8_t *data, uint32_t size) ++{ ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ if (req->output.used + size + (AV_INPUT_BUFFER_PADDING_SIZE * 4) <= req->output.size) { ++ memcpy(req->output.addr + req->output.used, data, size); ++ req->output.used += size; ++ } else { ++ av_log(avctx, AV_LOG_ERROR, "%s: output.used=%u output.size=%u size=%u\n", __func__, req->output.used, 
req->output.size, size); ++ } ++ return 0; ++} ++ ++static int v4l2_request_controls(V4L2RequestContext *ctx, int request_fd, unsigned long type, struct v4l2_ext_control *control, int count) ++{ ++ struct v4l2_ext_controls controls = { ++ .controls = control, ++ .count = count, ++ .request_fd = request_fd, ++ .which = (request_fd >= 0) ? V4L2_CTRL_WHICH_REQUEST_VAL : 0, ++ }; ++ ++ if (!control || !count) ++ return 0; ++ ++ return ioctl(ctx->video_fd, type, &controls); ++} ++ ++static int v4l2_request_set_controls(V4L2RequestContext *ctx, int request_fd, struct v4l2_ext_control *control, int count) ++{ ++ return v4l2_request_controls(ctx, request_fd, VIDIOC_S_EXT_CTRLS, control, count); ++} ++ ++int ff_v4l2_request_set_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ ++ ret = v4l2_request_controls(ctx, -1, VIDIOC_S_EXT_CTRLS, control, count); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return AVERROR(EINVAL); ++ } ++ ++ return ret; ++} ++ ++int ff_v4l2_request_get_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ ++ ret = v4l2_request_controls(ctx, -1, VIDIOC_G_EXT_CTRLS, control, count); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get controls failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return AVERROR(EINVAL); ++ } ++ ++ return ret; ++} ++ ++int ff_v4l2_request_query_control(AVCodecContext *avctx, struct v4l2_query_ext_ctrl *control) ++{ ++ int ret; ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_QUERY_EXT_CTRL, control); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: query control failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return AVERROR(EINVAL); ++ } ++ ++ return 
0; ++} ++ ++int ff_v4l2_request_query_control_default_value(AVCodecContext *avctx, uint32_t id) ++{ ++ int ret; ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ struct v4l2_queryctrl control = { ++ .id = id, ++ }; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_QUERYCTRL, &control); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: query control failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return AVERROR(EINVAL); ++ } ++ ++ return control.default_value; ++} ++ ++static int v4l2_request_queue_buffer(V4L2RequestContext *ctx, int request_fd, V4L2RequestBuffer *buf, uint32_t flags) ++{ ++ struct v4l2_plane planes[1] = {}; ++ struct v4l2_buffer buffer = { ++ .type = buf->buffer.type, ++ .memory = buf->buffer.memory, ++ .index = buf->index, ++ .timestamp.tv_usec = ctx->timestamp, ++ .bytesused = buf->used, ++ .request_fd = request_fd, ++ .flags = ((request_fd >= 0) ? V4L2_BUF_FLAG_REQUEST_FD : 0) | flags, ++ }; ++ ++ buf->buffer.timestamp = buffer.timestamp; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) { ++ planes[0].bytesused = buf->used; ++ buffer.bytesused = 0; ++ buffer.length = 1; ++ buffer.m.planes = planes; ++ } ++ ++ return ioctl(ctx->video_fd, VIDIOC_QBUF, &buffer); ++} ++ ++static int v4l2_request_dequeue_buffer(V4L2RequestContext *ctx, V4L2RequestBuffer *buf) ++{ ++ int ret; ++ struct v4l2_plane planes[1] = {}; ++ struct v4l2_buffer buffer = { ++ .type = buf->buffer.type, ++ .memory = buf->buffer.memory, ++ .index = buf->index, ++ }; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type)) { ++ buffer.length = 1; ++ buffer.m.planes = planes; ++ } ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_DQBUF, &buffer); ++ if (ret < 0) ++ return ret; ++ ++ buf->buffer.timestamp = buffer.timestamp; ++ buf->buffer.flags = buffer.flags; ++ return 0; ++} ++ ++const uint32_t v4l2_request_capture_pixelformats[] = { ++#if CONFIG_SAND ++ V4L2_PIX_FMT_NV12_COL128, ++ V4L2_PIX_FMT_NV12_10_COL128, ++#endif ++ V4L2_PIX_FMT_NV12, ++#ifdef 
DRM_FORMAT_MOD_ALLWINNER_TILED ++ V4L2_PIX_FMT_SUNXI_TILED_NV12, ++#endif ++#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15) ++ V4L2_PIX_FMT_NV15, ++#endif ++ V4L2_PIX_FMT_NV16, ++#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20) ++ V4L2_PIX_FMT_NV20, ++#endif ++}; ++ ++static int v4l2_request_set_drm_descriptor(V4L2RequestDescriptor *req, struct v4l2_format *format) ++{ ++ AVDRMFrameDescriptor *desc = &req->drm; ++ AVDRMLayerDescriptor *layer = &desc->layers[0]; ++ uint32_t pixelformat = V4L2_TYPE_IS_MULTIPLANAR(format->type) ? format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat; ++ ++ switch (pixelformat) { ++ case V4L2_PIX_FMT_NV12: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#if CONFIG_SAND ++ case V4L2_PIX_FMT_NV12_COL128: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(format->fmt.pix.bytesperline); ++ break; ++ case V4L2_PIX_FMT_NV12_10_COL128: ++ layer->format = DRM_FORMAT_P030; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(format->fmt.pix.bytesperline); ++ break; ++#endif ++#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED ++ case V4L2_PIX_FMT_SUNXI_TILED_NV12: ++ layer->format = DRM_FORMAT_NV12; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED; ++ break; ++#endif ++#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15) ++ case V4L2_PIX_FMT_NV15: ++ layer->format = DRM_FORMAT_NV15; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#endif ++ case V4L2_PIX_FMT_NV16: ++ layer->format = DRM_FORMAT_NV16; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20) ++ case V4L2_PIX_FMT_NV20: ++ layer->format = DRM_FORMAT_NV20; ++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR; ++ break; ++#endif ++ default: ++ return -1; ++ } ++ ++ desc->nb_objects = 1; 
++ desc->objects[0].fd = req->capture.fd; ++ desc->objects[0].size = req->capture.size; ++ ++ desc->nb_layers = 1; ++ layer->nb_planes = 2; ++ ++ layer->planes[0].object_index = 0; ++ layer->planes[0].offset = 0; ++ layer->planes[0].pitch = V4L2_TYPE_IS_MULTIPLANAR(format->type) ? format->fmt.pix_mp.plane_fmt[0].bytesperline : format->fmt.pix.bytesperline; ++#if CONFIG_SAND ++ if (pixelformat == V4L2_PIX_FMT_NV12_COL128) { ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = format->fmt.pix.height * 128; ++ layer->planes[0].pitch = format->fmt.pix.width; ++ layer->planes[1].pitch = format->fmt.pix.width; ++ } ++ else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = format->fmt.pix.height * 128; ++ layer->planes[0].pitch = format->fmt.pix.width * 2; // Lies but it keeps DRM import happy ++ layer->planes[1].pitch = format->fmt.pix.width * 2; ++ } ++ else ++#endif ++ { ++ layer->planes[1].object_index = 0; ++ layer->planes[1].offset = layer->planes[0].pitch * (V4L2_TYPE_IS_MULTIPLANAR(format->type) ? 
format->fmt.pix_mp.height : format->fmt.pix.height); ++ layer->planes[1].pitch = layer->planes[0].pitch; ++ } ++ ++ return 0; ++} ++ ++static int v4l2_request_queue_decode(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ struct timeval tv = { 2, 0 }; ++ fd_set except_fds; ++ int ret; ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p used=%u controls=%d index=%d fd=%d request_fd=%d first_slice=%d last_slice=%d\n", __func__, avctx, req->output.used, count, req->capture.index, req->capture.fd, req->request_fd, first_slice, last_slice); ++ ++ if (first_slice) ++ ctx->timestamp++; ++ ++ ret = v4l2_request_set_controls(ctx, req->request_fd, control, count); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed for request %d, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ ++ memset(req->output.addr + req->output.used, 0, AV_INPUT_BUFFER_PADDING_SIZE * 4); ++ ++ ret = v4l2_request_queue_buffer(ctx, req->request_fd, &req->output, last_slice ? 0 : V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: queue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ ++ if (first_slice) { ++ ret = v4l2_request_queue_buffer(ctx, -1, &req->capture, 0); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: queue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ } ++ ++ // NOTE: do we need to dequeue when request fails/timeout? ++ ++ // 4. 
queue request and wait ++ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_QUEUE, NULL); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: queue request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ goto fail; ++ } ++ ++ FD_ZERO(&except_fds); ++ FD_SET(req->request_fd, &except_fds); ++ ++ ret = select(req->request_fd + 1, NULL, NULL, &except_fds, &tv); ++ if (ret == 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: request %d timeout\n", __func__, req->request_fd); ++ goto fail; ++ } else if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: select request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ goto fail; ++ } ++ ++ ret = v4l2_request_dequeue_buffer(ctx, &req->output); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: dequeue output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ ++ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_REINIT, NULL); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: reinit request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ ++ if (last_slice) { ++ if (ff_v4l2_phase_started(&req->phase)) { ++ ff_v4l2_phase_release(&req->phase, 0); ++ ff_v4l2_phase_claim(&req->phase, 1); ++ } ++ ++ ret = v4l2_request_dequeue_buffer(ctx, &req->capture); ++ ++ if (ff_v4l2_phase_started(&req->phase)) { ++ ff_v4l2_phase_release(&req->phase, 1); ++ } ++ ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: dequeue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); ++ return -1; ++ } ++ } ++ ++ // TODO: check errors ++ // buffer.flags & V4L2_BUF_FLAG_ERROR ++ ++ if (last_slice) ++ return v4l2_request_set_drm_descriptor(req, &ctx->format); ++ ++ return 0; ++ ++fail: ++ ret = v4l2_request_dequeue_buffer(ctx, &req->output); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "%s: dequeue 
output buffer %d failed for request %d, %s (%d)\n", __func__, req->output.index, req->request_fd, strerror(errno), errno); ++ ++ ret = v4l2_request_dequeue_buffer(ctx, &req->capture); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "%s: dequeue capture buffer %d failed for request %d, %s (%d)\n", __func__, req->capture.index, req->request_fd, strerror(errno), errno); ++ ++ ret = ioctl(req->request_fd, MEDIA_REQUEST_IOC_REINIT, NULL); ++ if (ret < 0) ++ av_log(avctx, AV_LOG_ERROR, "%s: reinit request %d failed, %s (%d)\n", __func__, req->request_fd, strerror(errno), errno); ++ ++ return -1; ++} ++ ++int ff_v4l2_request_decode_slice(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice) ++{ ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ ++ // fall back to queue each slice as a full frame ++ if ((req->output.capabilities & V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) != V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF) ++ return v4l2_request_queue_decode(avctx, frame, control, count, 1, 1); ++ ++ return v4l2_request_queue_decode(avctx, frame, control, count, first_slice, last_slice); ++} ++ ++int ff_v4l2_request_decode_frame(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count) ++{ ++ return v4l2_request_queue_decode(avctx, frame, control, count, 1, 1); ++} ++ ++static int v4l2_request_try_format(AVCodecContext *avctx, enum v4l2_buf_type type, uint32_t pixelformat) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ struct v4l2_fmtdesc fmtdesc = { ++ .index = 0, ++ .type = type, ++ }; ++ ++ if (V4L2_TYPE_IS_OUTPUT(type)) { ++ struct v4l2_create_buffers buffers = { ++ .count = 0, ++ .memory = V4L2_MEMORY_MMAP, ++ .format.type = type, ++ }; ++ ++ if (ioctl(ctx->video_fd, VIDIOC_CREATE_BUFS, &buffers) < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: create buffers failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno); ++ return 
-1; ++ } ++ ++ if ((buffers.capabilities & V4L2_BUF_CAP_SUPPORTS_REQUESTS) != V4L2_BUF_CAP_SUPPORTS_REQUESTS) { ++ av_log(avctx, AV_LOG_INFO, "%s: output buffer type do not support requests, capabilities %u\n", __func__, buffers.capabilities); ++ return -1; ++ } ++ } ++ ++ while (ioctl(ctx->video_fd, VIDIOC_ENUM_FMT, &fmtdesc) >= 0) { ++ if (fmtdesc.pixelformat == pixelformat) ++ return 0; ++ ++ fmtdesc.index++; ++ } ++ ++ av_log(avctx, AV_LOG_INFO, "%s: pixelformat %u not supported for type %u\n", __func__, pixelformat, type); ++ return -1; ++} ++ ++static int v4l2_request_set_format(AVCodecContext *avctx, enum v4l2_buf_type type, uint32_t pixelformat, uint32_t buffersize) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ struct v4l2_format format = { ++ .type = type, ++ }; ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(type)) { ++ format.fmt.pix_mp.width = avctx->coded_width; ++ format.fmt.pix_mp.height = avctx->coded_height; ++ format.fmt.pix_mp.pixelformat = pixelformat; ++ format.fmt.pix_mp.plane_fmt[0].sizeimage = buffersize; ++ format.fmt.pix_mp.num_planes = 1; ++ } else { ++ format.fmt.pix.width = avctx->coded_width; ++ format.fmt.pix.height = avctx->coded_height; ++ format.fmt.pix.pixelformat = pixelformat; ++ format.fmt.pix.sizeimage = buffersize; ++ } ++ ++ return ioctl(ctx->video_fd, VIDIOC_S_FMT, &format); ++} ++ ++static int v4l2_request_select_capture_format(AVCodecContext *avctx) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ enum v4l2_buf_type type = ctx->format.type; ++ ++#if 0 ++ struct v4l2_format format = { ++ .type = type, ++ }; ++ struct v4l2_fmtdesc fmtdesc = { ++ .index = 0, ++ .type = type, ++ }; ++ uint32_t pixelformat; ++ int i; ++ ++ if (ioctl(ctx->video_fd, VIDIOC_G_FMT, &format) < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get capture format failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return -1; ++ } ++ ++ pixelformat = V4L2_TYPE_IS_MULTIPLANAR(type) ? 
format.fmt.pix_mp.pixelformat : format.fmt.pix.pixelformat; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { ++ if (pixelformat == v4l2_request_capture_pixelformats[i]) ++ return v4l2_request_set_format(avctx, type, pixelformat, 0); ++ } ++ ++ while (ioctl(ctx->video_fd, VIDIOC_ENUM_FMT, &fmtdesc) >= 0) { ++ for (i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { ++ if (fmtdesc.pixelformat == v4l2_request_capture_pixelformats[i]) ++ return v4l2_request_set_format(avctx, type, fmtdesc.pixelformat, 0); ++ } ++ ++ fmtdesc.index++; ++ } ++#else ++ for (int i = 0; i < FF_ARRAY_ELEMS(v4l2_request_capture_pixelformats); i++) { ++ uint32_t pixelformat = v4l2_request_capture_pixelformats[i]; ++ if (!v4l2_request_try_format(avctx, type, pixelformat)) ++ return v4l2_request_set_format(avctx, type, pixelformat, 0); ++ } ++#endif ++ ++ return -1; ++} ++ ++static int v4l2_request_probe_video_device(struct udev_device *device, AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret = AVERROR(EINVAL); ++ struct v4l2_capability capability = {0}; ++ unsigned int capabilities = 0; ++ ++ const char *path = udev_device_get_devnode(device); ++ if (!path) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get video device devnode failed\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++// ctx->video_fd = open(path, O_RDWR | O_NONBLOCK, 0); ++ ctx->video_fd = open(path, O_RDWR, 0); ++ if (ctx->video_fd < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_QUERYCAP, &capability); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get video capability failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ if 
(capability.capabilities & V4L2_CAP_DEVICE_CAPS) ++ capabilities = capability.device_caps; ++ else ++ capabilities = capability.capabilities; ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p path=%s capabilities=%u\n", __func__, avctx, ctx, path, capabilities); ++ ++ if ((capabilities & V4L2_CAP_STREAMING) != V4L2_CAP_STREAMING) { ++ av_log(avctx, AV_LOG_ERROR, "%s: missing required streaming capability\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) == V4L2_CAP_VIDEO_M2M_MPLANE) { ++ ctx->output_type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ ctx->format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; ++ } else if ((capabilities & V4L2_CAP_VIDEO_M2M) == V4L2_CAP_VIDEO_M2M) { ++ ctx->output_type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ ctx->format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ } else { ++ av_log(avctx, AV_LOG_ERROR, "%s: missing required mem2mem capability\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = v4l2_request_try_format(avctx, ctx->output_type, pixelformat); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_WARNING, "%s: try output format failed\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = v4l2_request_set_format(avctx, ctx->output_type, pixelformat, buffersize); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: set output format failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = v4l2_request_set_controls(ctx, -1, control, count); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: set controls failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = v4l2_request_select_capture_format(avctx); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_WARNING, "%s: select capture format failed\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ return 0; ++ ++fail: ++ if (ctx->video_fd >= 0) { ++ close(ctx->video_fd); ++ ctx->video_fd = 
-1;
++    }
++    return ret;
++}
++
++/*
++ * Finish bringing up the decoder once a video node has been selected.
++ *
++ * Reads back the capture format into ctx->format, logs it, obtains a DRM
++ * hw frames context and then starts streaming on the output queue followed
++ * by the capture queue.
++ *
++ * Returns 0 on success. On any failure ff_v4l2_request_uninit() is called
++ * and a negative AVERROR code is returned.
++ */
++static int v4l2_request_init_context(AVCodecContext *avctx)
++{
++    V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data;
++    struct v4l2_format *fmt = &ctx->format;
++    int err;
++
++    /* ctx->format.type was chosen during probing; fetch the rest of it. */
++    err = ioctl(ctx->video_fd, VIDIOC_G_FMT, fmt);
++    if (err < 0) {
++        av_log(avctx, AV_LOG_ERROR, "%s: get capture format failed, %s (%d)\n", __func__, strerror(errno), errno);
++        err = AVERROR(EINVAL);
++        goto fail;
++    }
++
++    if (!V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++        av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u\n", __func__,
++               fmt->fmt.pix.pixelformat, fmt->fmt.pix.width, fmt->fmt.pix.height,
++               fmt->fmt.pix.bytesperline, fmt->fmt.pix.sizeimage);
++    } else {
++        av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u num_planes=%u\n", __func__,
++               fmt->fmt.pix_mp.pixelformat, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
++               fmt->fmt.pix_mp.plane_fmt[0].bytesperline, fmt->fmt.pix_mp.plane_fmt[0].sizeimage,
++               fmt->fmt.pix_mp.num_planes);
++    }
++
++    err = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM);
++    if (err < 0)
++        goto fail;
++
++    /* Output (bitstream) queue first, then the capture (picture) queue. */
++    err = ioctl(ctx->video_fd, VIDIOC_STREAMON, &ctx->output_type);
++    if (err < 0) {
++        av_log(avctx, AV_LOG_ERROR, "%s: output stream on failed, %s (%d)\n", __func__, strerror(errno), errno);
++        err = AVERROR(EINVAL);
++        goto fail;
++    }
++
++    err = ioctl(ctx->video_fd, VIDIOC_STREAMON, &fmt->type);
++    if (err < 0) {
++        av_log(avctx, AV_LOG_ERROR, "%s: capture stream on failed, %s (%d)\n", __func__, strerror(errno), errno);
++        err = AVERROR(EINVAL);
++        goto fail;
++    }
++
++    return 0;
++
++fail:
++    ff_v4l2_request_uninit(avctx);
++    return err;
++}
++
++static int v4l2_request_probe_media_device(struct udev_device *device, AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count)
++{
++    V4L2RequestContext *ctx =
avctx->internal->hwaccel_priv_data; ++ int ret; ++ struct media_device_info device_info = {0}; ++ struct media_v2_topology topology = {0}; ++ struct media_v2_interface *interfaces = NULL; ++ struct udev *udev = udev_device_get_udev(device); ++ struct udev_device *video_device; ++ dev_t devnum; ++ ++ const char *path = udev_device_get_devnode(device); ++ if (!path) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get media device devnode failed\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ctx->media_fd = open(path, O_RDWR, 0); ++ if (ctx->media_fd < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = ioctl(ctx->media_fd, MEDIA_IOC_DEVICE_INFO, &device_info); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get media device info failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p path=%s driver=%s\n", __func__, avctx, ctx, path, device_info.driver); ++ ++ ret = ioctl(ctx->media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get media topology failed, %s (%d)\n", __func__, strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ if (topology.num_interfaces <= 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: media device has no interfaces\n", __func__); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ interfaces = av_mallocz(topology.num_interfaces * sizeof(struct media_v2_interface)); ++ if (!interfaces) { ++ av_log(avctx, AV_LOG_ERROR, "%s: allocating media interface struct failed\n", __func__); ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } ++ ++ topology.ptr_interfaces = (__u64)(uintptr_t)interfaces; ++ ret = ioctl(ctx->media_fd, MEDIA_IOC_G_TOPOLOGY, &topology); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: get media topology failed, %s (%d)\n", __func__, 
strerror(errno), errno); ++ ret = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ ret = AVERROR(EINVAL); ++ for (int i = 0; i < topology.num_interfaces; i++) { ++ if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO) ++ continue; ++ ++ devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor); ++ video_device = udev_device_new_from_devnum(udev, 'c', devnum); ++ if (!video_device) { ++ av_log(avctx, AV_LOG_ERROR, "%s: video_device=%p\n", __func__, video_device); ++ continue; ++ } ++ ++ ret = v4l2_request_probe_video_device(video_device, avctx, pixelformat, buffersize, control, count); ++ udev_device_unref(video_device); ++ ++ if (!ret) ++ break; ++ } ++ ++ av_freep(&interfaces); ++ return ret; ++ ++fail: ++ av_freep(&interfaces); ++ if (ctx->media_fd >= 0) { ++ close(ctx->media_fd); ++ ctx->media_fd = -1; ++ } ++ return ret; ++} ++ ++int ff_v4l2_request_init(AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count) ++{ ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ int ret = AVERROR(EINVAL); ++ struct udev *udev; ++ struct udev_enumerate *enumerate; ++ struct udev_list_entry *devices; ++ struct udev_list_entry *entry; ++ struct udev_device *device; ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p hw_device_ctx=%p hw_frames_ctx=%p\n", __func__, avctx, avctx->hw_device_ctx, avctx->hw_frames_ctx); ++ ++ ctx->media_fd = -1; ++ ctx->video_fd = -1; ++ ctx->timestamp = 0; ++ ++ udev = udev_new(); ++ if (!udev) { ++ av_log(avctx, AV_LOG_ERROR, "%s: allocating udev context failed\n", __func__); ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } ++ ++ enumerate = udev_enumerate_new(udev); ++ if (!enumerate) { ++ av_log(avctx, AV_LOG_ERROR, "%s: allocating udev enumerator failed\n", __func__); ++ ret = AVERROR(ENOMEM); ++ goto fail; ++ } ++ ++ udev_enumerate_add_match_subsystem(enumerate, "media"); ++ udev_enumerate_scan_devices(enumerate); ++ ++ devices = udev_enumerate_get_list_entry(enumerate); 
++    udev_list_entry_foreach(entry, devices) {
++        const char *path = udev_list_entry_get_name(entry);
++        if (!path)
++            continue;
++
++        device = udev_device_new_from_syspath(udev, path);
++        if (!device)
++            continue;
++
++        ret = v4l2_request_probe_media_device(device, avctx, pixelformat, buffersize, control, count);
++        udev_device_unref(device);
++
++        /* First media device that probes successfully wins. */
++        if (!ret)
++            break;
++    }
++
++    udev_enumerate_unref(enumerate);
++
++    if (!ret)
++        ret = v4l2_request_init_context(avctx);
++
++fail:
++    udev_unref(udev);
++    return ret;
++}
++
++/*
++ * Tear down the decoder: stop both queues, flush the frame pool of the
++ * attached hw frames context (if any) and close the video and media
++ * descriptors.
++ *
++ * The descriptors are reset to -1 after closing. This function is also
++ * used as the failure path of v4l2_request_init_context(), so leaving the
++ * old numbers behind would let a later teardown issue ioctls on, and
++ * close, descriptors that may since have been reused for something else.
++ *
++ * Always returns 0; STREAMOFF failures are only logged.
++ */
++int ff_v4l2_request_uninit(AVCodecContext *avctx)
++{
++    V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data;
++    int ret;
++
++    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p\n", __func__, avctx, ctx);
++
++    if (ctx->video_fd >= 0) {
++        ret = ioctl(ctx->video_fd, VIDIOC_STREAMOFF, &ctx->output_type);
++        if (ret < 0)
++            av_log(avctx, AV_LOG_ERROR, "%s: output stream off failed, %s (%d)\n", __func__, strerror(errno), errno);
++
++        ret = ioctl(ctx->video_fd, VIDIOC_STREAMOFF, &ctx->format.type);
++        if (ret < 0)
++            av_log(avctx, AV_LOG_ERROR, "%s: capture stream off failed, %s (%d)\n", __func__, strerror(errno), errno);
++    }
++
++    /* Flush pooled frames while the descriptors are still open. */
++    if (avctx->hw_frames_ctx) {
++        AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
++        av_buffer_pool_flush(hwfc->pool);
++    }
++
++    if (ctx->video_fd >= 0) {
++        close(ctx->video_fd);
++        ctx->video_fd = -1;
++    }
++
++    if (ctx->media_fd >= 0) {
++        close(ctx->media_fd);
++        ctx->media_fd = -1;
++    }
++
++    return 0;
++}
++
++static int v4l2_request_buffer_alloc(AVCodecContext *avctx, V4L2RequestBuffer *buf, enum v4l2_buf_type type)
++{
++    V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data;
++    int ret;
++    struct v4l2_plane planes[1] = {};
++    struct v4l2_create_buffers buffers = {
++        .count = 1,
++        .memory = V4L2_MEMORY_MMAP,
++        .format.type = type,
++    };
++
++    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p buf=%p type=%u\n", __func__, avctx, buf, type);
++
++    ret = ioctl(ctx->video_fd, VIDIOC_G_FMT, &buffers.format);
++    if (ret < 0) {
++        av_log(avctx,
AV_LOG_ERROR, "%s: get format failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno); ++ return ret; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(buffers.format.type)) { ++ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u num_planes=%u\n", __func__, buffers.format.fmt.pix_mp.pixelformat, buffers.format.fmt.pix_mp.width, buffers.format.fmt.pix_mp.height, buffers.format.fmt.pix_mp.plane_fmt[0].bytesperline, buffers.format.fmt.pix_mp.plane_fmt[0].sizeimage, buffers.format.fmt.pix_mp.num_planes); ++ } else { ++ av_log(avctx, AV_LOG_DEBUG, "%s: pixelformat=%d width=%u height=%u bytesperline=%u sizeimage=%u\n", __func__, buffers.format.fmt.pix.pixelformat, buffers.format.fmt.pix.width, buffers.format.fmt.pix.height, buffers.format.fmt.pix.bytesperline, buffers.format.fmt.pix.sizeimage); ++ } ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_CREATE_BUFS, &buffers); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: create buffers failed for type %u, %s (%d)\n", __func__, type, strerror(errno), errno); ++ return ret; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(type)) { ++ buf->width = buffers.format.fmt.pix_mp.width; ++ buf->height = buffers.format.fmt.pix_mp.height; ++ buf->size = buffers.format.fmt.pix_mp.plane_fmt[0].sizeimage; ++ buf->buffer.length = 1; ++ buf->buffer.m.planes = planes; ++ } else { ++ buf->width = buffers.format.fmt.pix.width; ++ buf->height = buffers.format.fmt.pix.height; ++ buf->size = buffers.format.fmt.pix.sizeimage; ++ } ++ ++ buf->index = buffers.index; ++ buf->capabilities = buffers.capabilities; ++ buf->used = 0; ++ ++ buf->buffer.type = type; ++ buf->buffer.memory = V4L2_MEMORY_MMAP; ++ buf->buffer.index = buf->index; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_QUERYBUF, &buf->buffer); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: query buffer %d failed, %s (%d)\n", __func__, buf->index, strerror(errno), errno); ++ return ret; ++ } ++ ++ if (V4L2_TYPE_IS_OUTPUT(type)) { ++ void *addr = 
mmap(NULL, buf->size, PROT_READ | PROT_WRITE, MAP_SHARED, ctx->video_fd, V4L2_TYPE_IS_MULTIPLANAR(type) ? buf->buffer.m.planes[0].m.mem_offset : buf->buffer.m.offset); ++ if (addr == MAP_FAILED) { ++ av_log(avctx, AV_LOG_ERROR, "%s: mmap failed, %s (%d)\n", __func__, strerror(errno), errno); ++ return -1; ++ } ++ ++ buf->addr = (uint8_t*)addr; ++ } else { ++ struct v4l2_exportbuffer exportbuffer = { ++ .type = type, ++ .index = buf->index, ++ .flags = O_RDONLY, ++ }; ++ ++ ret = ioctl(ctx->video_fd, VIDIOC_EXPBUF, &exportbuffer); ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s: export buffer %d failed, %s (%d)\n", __func__, buf->index, strerror(errno), errno); ++ return ret; ++ } ++ ++ buf->fd = exportbuffer.fd; ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: buf=%p index=%d fd=%d addr=%p width=%u height=%u size=%u\n", __func__, buf, buf->index, buf->fd, buf->addr, buf->width, buf->height, buf->size); ++ return 0; ++} ++ ++static void v4l2_request_buffer_free(V4L2RequestBuffer *buf) ++{ ++ av_log(NULL, AV_LOG_DEBUG, "%s: buf=%p index=%d fd=%d addr=%p width=%u height=%u size=%u\n", __func__, buf, buf->index, buf->fd, buf->addr, buf->width, buf->height, buf->size); ++ ++ if (buf->addr) ++ munmap(buf->addr, buf->size); ++ ++ if (buf->fd >= 0) ++ close(buf->fd); ++} ++ ++static void v4l2_request_frame_free(void *opaque, uint8_t *data) ++{ ++ AVCodecContext *avctx = opaque; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)data; ++ ++ av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p request_fd=%d\n", __func__, avctx, data, req->request_fd); ++ ++ if (req->request_fd >= 0) ++ close(req->request_fd); ++ ++ v4l2_request_buffer_free(&req->capture); ++ v4l2_request_buffer_free(&req->output); ++ ++ av_free(data); ++} ++ ++static AVBufferRef *v4l2_request_frame_alloc(void *opaque, int size) ++{ ++ AVCodecContext *avctx = opaque; ++ V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data; ++ V4L2RequestDescriptor *req; ++ AVBufferRef *ref; ++ uint8_t *data; ++ int 
ret;
++
++    data = av_mallocz(size);
++    if (data == NULL)
++        return NULL;
++
++    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data);
++
++    /* From here on, dropping ref runs v4l2_request_frame_free() on data. */
++    ref = av_buffer_create(data, size, v4l2_request_frame_free, avctx, 0);
++    if (ref == NULL) {
++        av_freep(&data);
++        return NULL;
++    }
++
++    /* Mark every descriptor as "not open" so the free callback can run at any point. */
++    req = (V4L2RequestDescriptor*)data;
++    req->request_fd = -1;
++    req->output.fd = -1;
++    req->capture.fd = -1;
++
++    /* Output buffer first; the capture buffer is only attempted if that worked. */
++    if (v4l2_request_buffer_alloc(avctx, &req->output, ctx->output_type) < 0 ||
++        v4l2_request_buffer_alloc(avctx, &req->capture, ctx->format.type) < 0) {
++        av_buffer_unref(&ref);
++        return NULL;
++    }
++
++    ret = ioctl(ctx->media_fd, MEDIA_IOC_REQUEST_ALLOC, &req->request_fd);
++    if (ret < 0) {
++        av_log(avctx, AV_LOG_ERROR, "%s: request alloc failed, %s (%d)\n", __func__, strerror(errno), errno);
++        av_buffer_unref(&ref);
++        return NULL;
++    }
++
++    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p request_fd=%d\n", __func__, avctx, size, data, req->request_fd);
++    return ref;
++}
++
++/* Pool free callback: nothing to release, it only traces the call. */
++static void v4l2_request_pool_free(void *opaque)
++{
++    av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque);
++}
++
++/* Frames-context free callback: flush the buffer pool, then uninitialise it. */
++static void v4l2_request_hwframe_ctx_free(AVHWFramesContext *hwfc)
++{
++    av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool);
++
++    av_buffer_pool_flush(hwfc->pool);
++    av_buffer_pool_uninit(&hwfc->pool);
++}
++
++int ff_v4l2_request_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
++{
++    V4L2RequestContext *ctx = avctx->internal->hwaccel_priv_data;
++    AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data;
++
++    hwfc->format = AV_PIX_FMT_DRM_PRIME;
++    hwfc->sw_format = AV_PIX_FMT_NV12;
++    if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) {
++        hwfc->width = ctx->format.fmt.pix_mp.width;
++        hwfc->height = ctx->format.fmt.pix_mp.height;
++    } else {
++        hwfc->width = ctx->format.fmt.pix.width;
++        hwfc->height =
ctx->format.fmt.pix.height; ++#if CONFIG_SAND ++ if (ctx->format.fmt.pix.pixelformat == V4L2_PIX_FMT_NV12_COL128) { ++ hwfc->sw_format = AV_PIX_FMT_RPI4_8; ++ } ++ else if (ctx->format.fmt.pix.pixelformat == V4L2_PIX_FMT_NV12_10_COL128) { ++ hwfc->sw_format = AV_PIX_FMT_RPI4_10; ++ } ++#endif ++ } ++ ++ hwfc->pool = av_buffer_pool_init2(sizeof(V4L2RequestDescriptor), avctx, v4l2_request_frame_alloc, v4l2_request_pool_free); ++ if (!hwfc->pool) ++ return AVERROR(ENOMEM); ++ ++ hwfc->free = v4l2_request_hwframe_ctx_free; ++ ++ hwfc->initial_pool_size = 1; ++ ++ switch (avctx->codec_id) { ++ case AV_CODEC_ID_VP9: ++ hwfc->initial_pool_size += 8; ++ break; ++ case AV_CODEC_ID_VP8: ++ hwfc->initial_pool_size += 3; ++ break; ++ default: ++ hwfc->initial_pool_size += 2; ++ } ++ ++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size); ++ ++ return 0; ++} +diff --git a/libavcodec/v4l2_request.h b/libavcodec/v4l2_request.h +new file mode 100644 +index 0000000000..20b56cfbfb +--- /dev/null ++++ b/libavcodec/v4l2_request.h +@@ -0,0 +1,96 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#ifndef AVCODEC_V4L2_REQUEST_H ++#define AVCODEC_V4L2_REQUEST_H ++ ++#include ++ ++#include "libavutil/hwcontext_drm.h" ++#include "v4l2_phase.h" ++ ++typedef struct V4L2RequestContext { ++ int video_fd; ++ int media_fd; ++ enum v4l2_buf_type output_type; ++ struct v4l2_format format; ++ int timestamp; ++} V4L2RequestContext; ++ ++typedef struct V4L2RequestBuffer { ++ int index; ++ int fd; ++ uint8_t *addr; ++ uint32_t width; ++ uint32_t height; ++ uint32_t size; ++ uint32_t used; ++ uint32_t capabilities; ++ struct v4l2_buffer buffer; ++} V4L2RequestBuffer; ++ ++struct V4l2PhaseControl; ++ ++typedef struct V4L2PhaseEnv { ++ struct V4L2PhaseEnv * next; ++ struct V4L2PhaseControl * ctrl; ++ unsigned int order; ++} V4L2PhaseEnv; ++ ++typedef struct V4L2RequestDescriptor { ++ AVDRMFrameDescriptor drm; ++ int request_fd; ++ V4L2RequestBuffer output; ++ V4L2RequestBuffer capture; ++ ++ // Phase control ++ V4L2PhaseInfo phase; ++} V4L2RequestDescriptor; ++ ++uint64_t ff_v4l2_request_get_capture_timestamp(AVFrame *frame); ++ ++// Sets phase control on this frame & gives it an order ++int ff_v4l2_request_start_phase_control(AVFrame *frame, struct V4L2PhaseControl * phase); ++ ++// Had error - release all phases ++void ff_v4l2_request_abort_phase_control(AVFrame *frame); ++ ++ ++int ff_v4l2_request_reset_frame(AVCodecContext *avctx, AVFrame *frame); ++ ++int ff_v4l2_request_append_output_buffer(AVCodecContext *avctx, AVFrame *frame, const uint8_t *data, uint32_t size); ++ ++int ff_v4l2_request_set_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count); ++ ++int ff_v4l2_request_get_controls(AVCodecContext *avctx, struct v4l2_ext_control *control, int count); ++ ++int ff_v4l2_request_query_control(AVCodecContext *avctx, 
struct v4l2_query_ext_ctrl *control); ++ ++int ff_v4l2_request_query_control_default_value(AVCodecContext *avctx, uint32_t id); ++ ++int ff_v4l2_request_decode_slice(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count, int first_slice, int last_slice); ++ ++int ff_v4l2_request_decode_frame(AVCodecContext *avctx, AVFrame *frame, struct v4l2_ext_control *control, int count); ++ ++int ff_v4l2_request_init(AVCodecContext *avctx, uint32_t pixelformat, uint32_t buffersize, struct v4l2_ext_control *control, int count); ++ ++int ff_v4l2_request_uninit(AVCodecContext *avctx); ++ ++int ff_v4l2_request_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx); ++ ++#endif /* AVCODEC_V4L2_REQUEST_H */ +diff --git a/libavcodec/v4l2_request_h264.c b/libavcodec/v4l2_request_h264.c +new file mode 100644 +index 0000000000..d6332c01c7 +--- /dev/null ++++ b/libavcodec/v4l2_request_h264.c +@@ -0,0 +1,456 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "h264dec.h" ++#include "hwconfig.h" ++#include "v4l2_request.h" ++#include "h264-ctrls.h" ++ ++typedef struct V4L2RequestControlsH264 { ++ struct v4l2_ctrl_h264_sps sps; ++ struct v4l2_ctrl_h264_pps pps; ++ struct v4l2_ctrl_h264_scaling_matrix scaling_matrix; ++ struct v4l2_ctrl_h264_decode_params decode_params; ++ struct v4l2_ctrl_h264_slice_params slice_params; ++ struct v4l2_ctrl_h264_pred_weights pred_weights; ++ int pred_weights_required; ++ int first_slice; ++ int num_slices; ++} V4L2RequestControlsH264; ++ ++typedef struct V4L2RequestContextH264 { ++ V4L2RequestContext base; ++ int decode_mode; ++ int start_code; ++} V4L2RequestContextH264; ++ ++static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 }; ++ ++static void fill_weight_factors(struct v4l2_h264_weight_factors *factors, int list, const H264SliceContext *sl) ++{ ++ for (int i = 0; i < sl->ref_count[list]; i++) { ++ if (sl->pwt.luma_weight_flag[list]) { ++ factors->luma_weight[i] = sl->pwt.luma_weight[i][list][0]; ++ factors->luma_offset[i] = sl->pwt.luma_weight[i][list][1]; ++ } else { ++ factors->luma_weight[i] = 1 << sl->pwt.luma_log2_weight_denom; ++ factors->luma_offset[i] = 0; ++ } ++ for (int j = 0; j < 2; j++) { ++ if (sl->pwt.chroma_weight_flag[list]) { ++ factors->chroma_weight[i][j] = sl->pwt.chroma_weight[i][list][j][0]; ++ factors->chroma_offset[i][j] = sl->pwt.chroma_weight[i][list][j][1]; ++ } else { ++ factors->chroma_weight[i][j] = 1 << sl->pwt.chroma_log2_weight_denom; ++ factors->chroma_offset[i][j] = 0; ++ } ++ } ++ } ++} ++ ++static void fill_dpb_entry(struct v4l2_h264_dpb_entry *entry, const H264Picture *pic) ++{ ++ entry->reference_ts = ff_v4l2_request_get_capture_timestamp(pic->f); ++ entry->pic_num = pic->pic_id; ++ 
entry->frame_num = pic->frame_num;
++    entry->fields = pic->reference & V4L2_H264_FRAME_REF;
++    entry->flags = V4L2_H264_DPB_ENTRY_FLAG_VALID;
++    if (entry->fields)
++        entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_ACTIVE;
++    if (pic->long_ref)
++        entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_LONG_TERM;
++    if (pic->field_picture)
++        entry->flags |= V4L2_H264_DPB_ENTRY_FLAG_FIELD;
++    /* INT_MAX marks a field order count that has not been set. */
++    if (pic->field_poc[0] != INT_MAX)
++        entry->top_field_order_cnt = pic->field_poc[0];
++    if (pic->field_poc[1] != INT_MAX)
++        entry->bottom_field_order_cnt = pic->field_poc[1];
++}
++
++/*
++ * Populate decode->dpb from the decoder's reference lists: short-term
++ * references first, then (if any exist) the long-term ones. Pictures with
++ * neither field order count set are skipped.
++ *
++ * Filling stops once decode->dpb is full. The scan covers short_ref_count
++ * entries plus the whole long_ref array, so without the bound an
++ * over-populated reference state would write past the end of the
++ * fixed-size dpb array in the control structure.
++ */
++static void fill_dpb(struct v4l2_ctrl_h264_decode_params *decode, const H264Context *h)
++{
++    const int max_entries = FF_ARRAY_ELEMS(decode->dpb);
++    int entries = 0;
++
++    for (int i = 0; i < h->short_ref_count && entries < max_entries; i++) {
++        const H264Picture *pic = h->short_ref[i];
++        if (pic && (pic->field_poc[0] != INT_MAX || pic->field_poc[1] != INT_MAX))
++            fill_dpb_entry(&decode->dpb[entries++], pic);
++    }
++
++    if (!h->long_ref_count)
++        return;
++
++    for (int i = 0; i < FF_ARRAY_ELEMS(h->long_ref) && entries < max_entries; i++) {
++        const H264Picture *pic = h->long_ref[i];
++        if (pic && (pic->field_poc[0] != INT_MAX || pic->field_poc[1] != INT_MAX))
++            fill_dpb_entry(&decode->dpb[entries++], pic);
++    }
++}
++
++/*
++ * Resolve one reference-list slot to an index into decode->dpb.
++ *
++ * The DPB entry is matched by capture timestamp. When ref has no parent
++ * picture, or no valid DPB entry carries the same timestamp, *reference is
++ * left untouched.
++ */
++static void fill_ref_list(struct v4l2_h264_reference *reference, struct v4l2_ctrl_h264_decode_params *decode, const H264Ref *ref)
++{
++    uint64_t timestamp;
++
++    if (!ref->parent)
++        return;
++
++    timestamp = ff_v4l2_request_get_capture_timestamp(ref->parent->f);
++
++    for (uint8_t i = 0; i < FF_ARRAY_ELEMS(decode->dpb); i++) {
++        struct v4l2_h264_dpb_entry *entry = &decode->dpb[i];
++        if ((entry->flags & V4L2_H264_DPB_ENTRY_FLAG_VALID) &&
++            entry->reference_ts == timestamp) {
++            reference->fields = ref->reference & V4L2_H264_FRAME_REF;
++            reference->index = i;
++            return;
++        }
++    }
++}
++
++static void fill_sps(struct v4l2_ctrl_h264_sps *ctrl, const H264Context *h)
++{
++    const SPS *sps = h->ps.sps;
++
++    *ctrl = (struct v4l2_ctrl_h264_sps) {
++        .profile_idc = sps->profile_idc,
++
.constraint_set_flags = sps->constraint_set_flags, ++ .level_idc = sps->level_idc, ++ .seq_parameter_set_id = sps->sps_id, ++ .chroma_format_idc = sps->chroma_format_idc, ++ .bit_depth_luma_minus8 = sps->bit_depth_luma - 8, ++ .bit_depth_chroma_minus8 = sps->bit_depth_chroma - 8, ++ .log2_max_frame_num_minus4 = sps->log2_max_frame_num - 4, ++ .pic_order_cnt_type = sps->poc_type, ++ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, ++ .max_num_ref_frames = sps->ref_frame_count, ++ .num_ref_frames_in_pic_order_cnt_cycle = sps->poc_cycle_length, ++ .offset_for_non_ref_pic = sps->offset_for_non_ref_pic, ++ .offset_for_top_to_bottom_field = sps->offset_for_top_to_bottom_field, ++ .pic_width_in_mbs_minus1 = h->mb_width - 1, ++ .pic_height_in_map_units_minus1 = sps->frame_mbs_only_flag ? h->mb_height - 1 : h->mb_height / 2 - 1, ++ }; ++ ++ if (sps->poc_cycle_length > 0 && sps->poc_cycle_length <= 255) ++ memcpy(ctrl->offset_for_ref_frame, sps->offset_for_ref_frame, sps->poc_cycle_length * sizeof(ctrl->offset_for_ref_frame[0])); ++ ++ if (sps->residual_color_transform_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_SEPARATE_COLOUR_PLANE; ++ if (sps->transform_bypass) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_QPPRIME_Y_ZERO_TRANSFORM_BYPASS; ++ if (sps->delta_pic_order_always_zero_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_DELTA_PIC_ORDER_ALWAYS_ZERO; ++ if (sps->gaps_in_frame_num_allowed_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_GAPS_IN_FRAME_NUM_VALUE_ALLOWED; ++ if (sps->frame_mbs_only_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_FRAME_MBS_ONLY; ++ if (sps->mb_aff) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_MB_ADAPTIVE_FRAME_FIELD; ++ if (sps->direct_8x8_inference_flag) ++ ctrl->flags |= V4L2_H264_SPS_FLAG_DIRECT_8X8_INFERENCE; ++} ++ ++static void fill_pps(struct v4l2_ctrl_h264_pps *ctrl, const H264Context *h) ++{ ++ const SPS *sps = h->ps.sps; ++ const PPS *pps = h->ps.pps; ++ const H264SliceContext *sl = &h->slice_ctx[0]; ++ int qp_bd_offset = 6 * (sps->bit_depth_luma - 
8); ++ ++ *ctrl = (struct v4l2_ctrl_h264_pps) { ++ .pic_parameter_set_id = sl->pps_id, ++ .seq_parameter_set_id = pps->sps_id, ++ .num_slice_groups_minus1 = pps->slice_group_count - 1, ++ .num_ref_idx_l0_default_active_minus1 = pps->ref_count[0] - 1, ++ .num_ref_idx_l1_default_active_minus1 = pps->ref_count[1] - 1, ++ .weighted_bipred_idc = pps->weighted_bipred_idc, ++ .pic_init_qp_minus26 = pps->init_qp - 26 - qp_bd_offset, ++ .pic_init_qs_minus26 = pps->init_qs - 26 - qp_bd_offset, ++ .chroma_qp_index_offset = pps->chroma_qp_index_offset[0], ++ .second_chroma_qp_index_offset = pps->chroma_qp_index_offset[1], ++ }; ++ ++ if (pps->cabac) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_ENTROPY_CODING_MODE; ++ if (pps->pic_order_present) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_BOTTOM_FIELD_PIC_ORDER_IN_FRAME_PRESENT; ++ if (pps->weighted_pred) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_WEIGHTED_PRED; ++ if (pps->deblocking_filter_parameters_present) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT; ++ if (pps->constrained_intra_pred) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_CONSTRAINED_INTRA_PRED; ++ if (pps->redundant_pic_cnt_present) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_REDUNDANT_PIC_CNT_PRESENT; ++ if (pps->transform_8x8_mode) ++ ctrl->flags |= V4L2_H264_PPS_FLAG_TRANSFORM_8X8_MODE; ++ /* FFmpeg always provide a scaling matrix */ ++ ctrl->flags |= V4L2_H264_PPS_FLAG_SCALING_MATRIX_PRESENT; ++} ++ ++static int v4l2_request_h264_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const H264Context *h = avctx->priv_data; ++ const PPS *pps = h->ps.pps; ++ const SPS *sps = h->ps.sps; ++ const H264SliceContext *sl = &h->slice_ctx[0]; ++ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; ++ ++ fill_sps(&controls->sps, h); ++ fill_pps(&controls->pps, h); ++ ++ memcpy(controls->scaling_matrix.scaling_list_4x4, pps->scaling_matrix4, sizeof(controls->scaling_matrix.scaling_list_4x4)); ++ 
memcpy(controls->scaling_matrix.scaling_list_8x8[0], pps->scaling_matrix8[0], sizeof(controls->scaling_matrix.scaling_list_8x8[0])); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[1], pps->scaling_matrix8[3], sizeof(controls->scaling_matrix.scaling_list_8x8[1])); ++ ++ if (sps->chroma_format_idc == 3) { ++ memcpy(controls->scaling_matrix.scaling_list_8x8[2], pps->scaling_matrix8[1], sizeof(controls->scaling_matrix.scaling_list_8x8[2])); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[3], pps->scaling_matrix8[4], sizeof(controls->scaling_matrix.scaling_list_8x8[3])); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[4], pps->scaling_matrix8[2], sizeof(controls->scaling_matrix.scaling_list_8x8[4])); ++ memcpy(controls->scaling_matrix.scaling_list_8x8[5], pps->scaling_matrix8[5], sizeof(controls->scaling_matrix.scaling_list_8x8[5])); ++ } ++ ++ controls->decode_params = (struct v4l2_ctrl_h264_decode_params) { ++ .nal_ref_idc = h->nal_ref_idc, ++ .frame_num = h->poc.frame_num, ++ .top_field_order_cnt = h->cur_pic_ptr->field_poc[0] != INT_MAX ? h->cur_pic_ptr->field_poc[0] : 0, ++ .bottom_field_order_cnt = h->cur_pic_ptr->field_poc[1] != INT_MAX ? h->cur_pic_ptr->field_poc[1] : 0, ++ .idr_pic_id = sl->idr_pic_id, ++ .pic_order_cnt_lsb = sl->poc_lsb, ++ .delta_pic_order_cnt_bottom = sl->delta_poc_bottom, ++ .delta_pic_order_cnt0 = sl->delta_poc[0], ++ .delta_pic_order_cnt1 = sl->delta_poc[1], ++ /* Size in bits of dec_ref_pic_marking() syntax element. */ ++ .dec_ref_pic_marking_bit_size = sl->ref_pic_marking_size_in_bits, ++ /* Size in bits of pic order count syntax. 
*/ ++ .pic_order_cnt_bit_size = sl->pic_order_cnt_bit_size, ++ .slice_group_change_cycle = 0, /* slice group not supported by FFmpeg */ ++ }; ++ ++ if (h->picture_idr) ++ controls->decode_params.flags |= V4L2_H264_DECODE_PARAM_FLAG_IDR_PIC; ++ if (FIELD_PICTURE(h)) ++ controls->decode_params.flags |= V4L2_H264_DECODE_PARAM_FLAG_FIELD_PIC; ++ if (h->picture_structure == PICT_BOTTOM_FIELD) ++ controls->decode_params.flags |= V4L2_H264_DECODE_PARAM_FLAG_BOTTOM_FIELD; ++ ++ fill_dpb(&controls->decode_params, h); ++ ++ controls->first_slice = !FIELD_PICTURE(h) || h->first_field; ++ controls->num_slices = 0; ++ ++ return ff_v4l2_request_reset_frame(avctx, h->cur_pic_ptr->f); ++} ++ ++static int v4l2_request_h264_queue_decode(AVCodecContext *avctx, int last_slice) ++{ ++ const H264Context *h = avctx->priv_data; ++ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; ++ V4L2RequestContextH264 *ctx = avctx->internal->hwaccel_priv_data; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_SPS, ++ .ptr = &controls->sps, ++ .size = sizeof(controls->sps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_PPS, ++ .ptr = &controls->pps, ++ .size = sizeof(controls->pps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_SCALING_MATRIX, ++ .ptr = &controls->scaling_matrix, ++ .size = sizeof(controls->scaling_matrix), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_DECODE_PARAMS, ++ .ptr = &controls->decode_params, ++ .size = sizeof(controls->decode_params), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_SLICE_PARAMS, ++ .ptr = &controls->slice_params, ++ .size = sizeof(controls->slice_params), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_H264_PRED_WEIGHTS, ++ .ptr = &controls->pred_weights, ++ .size = sizeof(controls->pred_weights), ++ }, ++ }; ++ ++ if (ctx->decode_mode == V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED) { ++ int count = FF_ARRAY_ELEMS(control) - (controls->pred_weights_required ? 
0 : 1); ++ return ff_v4l2_request_decode_slice(avctx, h->cur_pic_ptr->f, control, count, controls->first_slice, last_slice); ++ } ++ ++ return ff_v4l2_request_decode_frame(avctx, h->cur_pic_ptr->f, control, FF_ARRAY_ELEMS(control) - 2); ++} ++ ++static int v4l2_request_h264_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ const H264Context *h = avctx->priv_data; ++ const PPS *pps = h->ps.pps; ++ const H264SliceContext *sl = &h->slice_ctx[0]; ++ V4L2RequestControlsH264 *controls = h->cur_pic_ptr->hwaccel_picture_private; ++ V4L2RequestContextH264 *ctx = avctx->internal->hwaccel_priv_data; ++ int i, ret, count; ++ ++ if (ctx->decode_mode == V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED && controls->num_slices) { ++ ret = v4l2_request_h264_queue_decode(avctx, 0); ++ if (ret) ++ return ret; ++ ++ ff_v4l2_request_reset_frame(avctx, h->cur_pic_ptr->f); ++ controls->first_slice = 0; ++ } ++ ++ if (ctx->start_code == V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B) { ++ ret = ff_v4l2_request_append_output_buffer(avctx, h->cur_pic_ptr->f, nalu_slice_start_code, 3); ++ if (ret) ++ return ret; ++ } ++ ++ ret = ff_v4l2_request_append_output_buffer(avctx, h->cur_pic_ptr->f, buffer, size); ++ if (ret) ++ return ret; ++ ++ if (ctx->decode_mode != V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED) ++ return 0; ++ ++ controls->slice_params = (struct v4l2_ctrl_h264_slice_params) { ++ /* Offset in bits to slice_data() from the beginning of this slice. */ ++ .header_bit_size = get_bits_count(&sl->gb), ++ ++ .first_mb_in_slice = sl->first_mb_addr, ++ ++ .slice_type = ff_h264_get_slice_type(sl), ++ .colour_plane_id = 0, /* separate colour plane not supported by FFmpeg */ ++ .redundant_pic_cnt = sl->redundant_pic_count, ++ .cabac_init_idc = sl->cabac_init_idc, ++ .slice_qp_delta = sl->qscale - pps->init_qp, ++ .slice_qs_delta = 0, /* not implemented by FFmpeg */ ++ .disable_deblocking_filter_idc = sl->deblocking_filter < 2 ? 
!sl->deblocking_filter : sl->deblocking_filter, ++ .slice_alpha_c0_offset_div2 = sl->slice_alpha_c0_offset / 2, ++ .slice_beta_offset_div2 = sl->slice_beta_offset / 2, ++ .num_ref_idx_l0_active_minus1 = sl->list_count > 0 ? sl->ref_count[0] - 1 : 0, ++ .num_ref_idx_l1_active_minus1 = sl->list_count > 1 ? sl->ref_count[1] - 1 : 0, ++ }; ++ ++ if (sl->slice_type == AV_PICTURE_TYPE_B && sl->direct_spatial_mv_pred) ++ controls->slice_params.flags |= V4L2_H264_SLICE_FLAG_DIRECT_SPATIAL_MV_PRED; ++ /* V4L2_H264_SLICE_FLAG_SP_FOR_SWITCH: not implemented by FFmpeg */ ++ ++ controls->pred_weights_required = V4L2_H264_CTRL_PRED_WEIGHTS_REQUIRED(&controls->pps, &controls->slice_params); ++ if (controls->pred_weights_required) { ++ controls->pred_weights.chroma_log2_weight_denom = sl->pwt.chroma_log2_weight_denom; ++ controls->pred_weights.luma_log2_weight_denom = sl->pwt.luma_log2_weight_denom; ++ } ++ ++ count = sl->list_count > 0 ? sl->ref_count[0] : 0; ++ for (i = 0; i < count; i++) ++ fill_ref_list(&controls->slice_params.ref_pic_list0[i], &controls->decode_params, &sl->ref_list[0][i]); ++ if (count && controls->pred_weights_required) ++ fill_weight_factors(&controls->pred_weights.weight_factors[0], 0, sl); ++ ++ count = sl->list_count > 1 ? 
sl->ref_count[1] : 0; ++ for (i = 0; i < count; i++) ++ fill_ref_list(&controls->slice_params.ref_pic_list1[i], &controls->decode_params, &sl->ref_list[1][i]); ++ if (count && controls->pred_weights_required) ++ fill_weight_factors(&controls->pred_weights.weight_factors[1], 1, sl); ++ ++ controls->num_slices++; ++ return 0; ++} ++ ++static int v4l2_request_h264_end_frame(AVCodecContext *avctx) ++{ ++ const H264Context *h = avctx->priv_data; ++ return v4l2_request_h264_queue_decode(avctx, !FIELD_PICTURE(h) || !h->first_field); ++} ++ ++static int v4l2_request_h264_set_controls(AVCodecContext *avctx) ++{ ++ V4L2RequestContextH264 *ctx = avctx->internal->hwaccel_priv_data; ++ ++ struct v4l2_ext_control control[] = { ++ { .id = V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE, }, ++ { .id = V4L2_CID_MPEG_VIDEO_H264_START_CODE, }, ++ }; ++ ++ ctx->decode_mode = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_H264_DECODE_MODE); ++ if (ctx->decode_mode != V4L2_MPEG_VIDEO_H264_DECODE_MODE_SLICE_BASED && ++ ctx->decode_mode != V4L2_MPEG_VIDEO_H264_DECODE_MODE_FRAME_BASED) { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); ++ return AVERROR(EINVAL); ++ } ++ ++ ctx->start_code = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_H264_START_CODE); ++ if (ctx->start_code != V4L2_MPEG_VIDEO_H264_START_CODE_NONE && ++ ctx->start_code != V4L2_MPEG_VIDEO_H264_START_CODE_ANNEX_B) { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); ++ return AVERROR(EINVAL); ++ } ++ ++ control[0].value = ctx->decode_mode; ++ control[1].value = ctx->start_code; ++ ++ return ff_v4l2_request_set_controls(avctx, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_h264_init(AVCodecContext *avctx) ++{ ++ const H264Context *h = avctx->priv_data; ++ struct v4l2_ctrl_h264_sps sps; ++ int ret; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = 
V4L2_CID_MPEG_VIDEO_H264_SPS, ++ .ptr = &sps, ++ .size = sizeof(sps), ++ }, ++ }; ++ ++ fill_sps(&sps, h); ++ ++ ret = ff_v4l2_request_init(avctx, V4L2_PIX_FMT_H264_SLICE, 4 * 1024 * 1024, control, FF_ARRAY_ELEMS(control)); ++ if (ret) ++ return ret; ++ ++ return v4l2_request_h264_set_controls(avctx); ++} ++ ++const AVHWAccel ff_h264_v4l2request_hwaccel = { ++ .name = "h264_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_H264, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .start_frame = v4l2_request_h264_start_frame, ++ .decode_slice = v4l2_request_h264_decode_slice, ++ .end_frame = v4l2_request_h264_end_frame, ++ .frame_priv_data_size = sizeof(V4L2RequestControlsH264), ++ .init = v4l2_request_h264_init, ++ .uninit = ff_v4l2_request_uninit, ++ .priv_data_size = sizeof(V4L2RequestContextH264), ++ .frame_params = ff_v4l2_request_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, ++}; +diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c +new file mode 100644 +index 0000000000..1c675d6dee +--- /dev/null ++++ b/libavcodec/v4l2_request_hevc.c +@@ -0,0 +1,652 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "decode.h" ++#include "hevcdec.h" ++#include "hwconfig.h" ++#include "v4l2_request.h" ++#include "hevc-ctrls.h" ++#include "v4l2_phase.h" ++ ++#define MAX_SLICES 16 ++ ++typedef struct V4L2RequestControlsHEVC { ++ struct v4l2_ctrl_hevc_sps sps; ++ struct v4l2_ctrl_hevc_pps pps; ++ struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix; ++ struct v4l2_ctrl_hevc_slice_params slice_params[MAX_SLICES]; ++ int first_slice; ++ int num_slices; //TODO: this should be in control ++} V4L2RequestControlsHEVC; ++ ++typedef struct V4L2RequestContextHEVC { ++ V4L2RequestContext base; ++ int decode_mode; ++ int start_code; ++ int max_slices; ++ ++ unsigned int order; ++ V4L2PhaseControl * pctrl; ++} V4L2RequestContextHEVC; ++ ++static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 }; ++ ++static void v4l2_request_hevc_fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table) ++{ ++ int32_t luma_weight_denom, chroma_weight_denom; ++ const SliceHeader *sh = &h->sh; ++ ++ if (sh->slice_type == HEVC_SLICE_I || ++ (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) || ++ (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag)) ++ return; ++ ++ table->luma_log2_weight_denom = sh->luma_log2_weight_denom; ++ ++ if (h->ps.sps->chroma_format_idc) ++ table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom; ++ ++ luma_weight_denom = (1 << sh->luma_log2_weight_denom); ++ chroma_weight_denom = (1 << sh->chroma_log2_weight_denom); ++ ++ for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) { ++ table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom; ++ table->luma_offset_l0[i] = sh->luma_offset_l0[i]; ++ table->delta_chroma_weight_l0[i][0] = 
sh->chroma_weight_l0[i][0] - chroma_weight_denom; ++ table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom; ++ table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0]; ++ table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1]; ++ } ++ ++ if (sh->slice_type != HEVC_SLICE_B) ++ return; ++ ++ for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) { ++ table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom; ++ table->luma_offset_l1[i] = sh->luma_offset_l1[i]; ++ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom; ++ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom; ++ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0]; ++ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1]; ++ } ++} ++ ++static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp) ++{ ++ const HEVCFrame *frame; ++ int i; ++ ++ for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) { ++ frame = h->rps[ST_CURR_BEF].ref[i]; ++ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE; ++ } ++ ++ for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) { ++ frame = h->rps[ST_CURR_AFT].ref[i]; ++ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER; ++ } ++ ++ for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) { ++ frame = h->rps[LT_CURR].ref[i]; ++ if (frame && timestamp == ff_v4l2_request_get_capture_timestamp(frame->frame)) ++ return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR; ++ } ++ ++ return 0; ++} ++ ++static uint8_t get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame, ++ struct v4l2_ctrl_hevc_slice_params *slice_params) ++{ ++ uint64_t timestamp; ++ ++ if (!frame) ++ return 0; ++ ++ timestamp = ff_v4l2_request_get_capture_timestamp(frame->frame); ++ ++ for (uint8_t i = 0; i < slice_params->num_active_dpb_entries; i++) { 
++ struct v4l2_hevc_dpb_entry *entry = &slice_params->dpb[i]; ++ if (entry->timestamp == timestamp) ++ return i; ++ } ++ ++ return 0; ++} ++ ++static void v4l2_request_hevc_fill_slice_params(const HEVCContext *h, ++ struct v4l2_ctrl_hevc_slice_params *slice_params) ++{ ++ const HEVCFrame *pic = h->ref; ++ const SliceHeader *sh = &h->sh; ++ int i, entries = 0; ++ RefPicList *rpl; ++ ++ *slice_params = (struct v4l2_ctrl_hevc_slice_params) { ++ .bit_size = 0, ++ .data_bit_offset = get_bits_count(&h->HEVClc->gb), ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ .slice_segment_addr = sh->slice_segment_addr, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */ ++ .nal_unit_type = h->nal_unit_type, ++ .nuh_temporal_id_plus1 = h->temporal_id + 1, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */ ++ .slice_type = sh->slice_type, ++ .colour_plane_id = sh->colour_plane_id, ++ .slice_pic_order_cnt = pic->poc, ++ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0, ++ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0, ++ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0, ++ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand, ++ .slice_qp_delta = sh->slice_qp_delta, ++ .slice_cb_qp_offset = sh->slice_cb_qp_offset, ++ .slice_cr_qp_offset = sh->slice_cr_qp_offset, ++ .slice_act_y_qp_offset = 0, ++ .slice_act_cb_qp_offset = 0, ++ .slice_act_cr_qp_offset = 0, ++ .slice_beta_offset_div2 = sh->beta_offset / 2, ++ .slice_tc_offset_div2 = sh->tc_offset / 2, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */ ++ .pic_struct = h->sei.picture_timing.picture_struct, ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: General slice segment header */ ++ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs, ++ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs, ++ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs, ++ }; ++ ++ if (sh->slice_sample_adaptive_offset_flag[0]) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA; ++ ++ if (sh->slice_sample_adaptive_offset_flag[1]) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA; ++ ++ if (sh->slice_temporal_mvp_enabled_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED; ++ ++ if (sh->mvd_l1_zero_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO; ++ ++ if (sh->cabac_init_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT; ++ ++ if (sh->collocated_list == L0) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0; ++ ++ if (sh->disable_deblocking_filter_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED; ++ ++ if (sh->slice_loop_filter_across_slices_enabled_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED; ++ ++ if (sh->dependent_slice_segment_flag) ++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT; ++ ++ for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) { ++ const HEVCFrame *frame = &h->DPB[i]; ++ if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) { ++ struct v4l2_hevc_dpb_entry *entry = &slice_params->dpb[entries++]; ++ ++ entry->timestamp = ff_v4l2_request_get_capture_timestamp(frame->frame); ++ entry->rps = find_frame_rps_type(h, entry->timestamp); ++ entry->field_pic = frame->frame->interlaced_frame; ++ ++ /* TODO: Interleaved: Get the POC for each field. 
*/ ++ entry->pic_order_cnt[0] = frame->poc; ++ entry->pic_order_cnt[1] = frame->poc; ++ } ++ } ++ ++ slice_params->num_active_dpb_entries = entries; ++ ++ if (sh->slice_type != HEVC_SLICE_I) { ++ rpl = &h->ref->refPicList[0]; ++ for (i = 0; i < rpl->nb_refs; i++) ++ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], slice_params); ++ } ++ ++ if (sh->slice_type == HEVC_SLICE_B) { ++ rpl = &h->ref->refPicList[1]; ++ for (i = 0; i < rpl->nb_refs; i++) ++ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], slice_params); ++ } ++ ++ v4l2_request_hevc_fill_pred_table(h, &slice_params->pred_weight_table); ++ ++ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets; ++ if (slice_params->num_entry_point_offsets > 256) { ++ slice_params->num_entry_point_offsets = 256; ++ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets); ++ } ++ ++ for (i = 0; i < slice_params->num_entry_point_offsets; i++) ++ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1; ++} ++ ++static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCContext *h) ++{ ++ const HEVCSPS *sps = h->ps.sps; ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Sequence parameter set */ ++ *ctrl = (struct v4l2_ctrl_hevc_sps) { ++ .chroma_format_idc = sps->chroma_format_idc, ++ .pic_width_in_luma_samples = sps->width, ++ .pic_height_in_luma_samples = sps->height, ++ .bit_depth_luma_minus8 = sps->bit_depth - 8, ++ .bit_depth_chroma_minus8 = sps->bit_depth - 8, ++ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4, ++ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1, ++ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics, ++ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1, ++ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3, ++ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size, ++ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2, ++ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size, ++ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter, ++ .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra, ++ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1, ++ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1, ++ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3, ++ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size, ++ .num_short_term_ref_pic_sets = sps->nb_st_rps, ++ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps, ++ }; ++ ++ if (sps->separate_colour_plane_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE; ++ ++ if (sps->scaling_list_enable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED; ++ ++ if (sps->amp_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED; ++ ++ if (sps->sao_enabled) ++ ctrl->flags |= 
V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET; ++ ++ if (sps->pcm_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED; ++ ++ if (sps->pcm.loop_filter_disable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED; ++ ++ if (sps->long_term_ref_pics_present_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT; ++ ++ if (sps->sps_temporal_mvp_enabled_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED; ++ ++ if (sps->sps_strong_intra_smoothing_enable_flag) ++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED; ++} ++ ++static int v4l2_request_hevc_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ const HEVCSPS *sps = h->ps.sps; ++ const HEVCPPS *pps = h->ps.pps; ++ const ScalingList *sl = pps->scaling_list_data_present_flag ? ++ &pps->scaling_list : ++ sps->scaling_list_enable_flag ? ++ &sps->scaling_list : NULL; ++ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ int rv; ++ ++ fill_sps(&controls->sps, h); ++ ++ if (sl) { ++ for (int i = 0; i < 6; i++) { ++ for (int j = 0; j < 16; j++) ++ controls->scaling_matrix.scaling_list_4x4[i][j] = sl->sl[0][i][j]; ++ for (int j = 0; j < 64; j++) { ++ controls->scaling_matrix.scaling_list_8x8[i][j] = sl->sl[1][i][j]; ++ controls->scaling_matrix.scaling_list_16x16[i][j] = sl->sl[2][i][j]; ++ if (i < 2) ++ controls->scaling_matrix.scaling_list_32x32[i][j] = sl->sl[3][i * 3][j]; ++ } ++ controls->scaling_matrix.scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i]; ++ if (i < 2) ++ controls->scaling_matrix.scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3]; ++ } ++ } ++ ++ /* ISO/IEC 23008-2, ITU-T Rec. 
H.265: Picture parameter set */ ++ controls->pps = (struct v4l2_ctrl_hevc_pps) { ++ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits, ++ .init_qp_minus26 = pps->pic_init_qp_minus26, ++ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth, ++ .pps_cb_qp_offset = pps->cb_qp_offset, ++ .pps_cr_qp_offset = pps->cr_qp_offset, ++ .pps_beta_offset_div2 = pps->beta_offset / 2, ++ .pps_tc_offset_div2 = pps->tc_offset / 2, ++ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2, ++ }; ++ ++ if (pps->dependent_slice_segments_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT; ++ ++ if (pps->output_flag_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT; ++ ++ if (pps->sign_data_hiding_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED; ++ ++ if (pps->cabac_init_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT; ++ ++ if (pps->constrained_intra_pred_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED; ++ ++ if (pps->transform_skip_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED; ++ ++ if (pps->cu_qp_delta_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED; ++ ++ if (pps->pic_slice_level_chroma_qp_offsets_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT; ++ ++ if (pps->weighted_pred_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED; ++ ++ if (pps->weighted_bipred_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED; ++ ++ if (pps->transquant_bypass_enable_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED; ++ ++ if (pps->tiles_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED; ++ ++ if (pps->entropy_coding_sync_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED; ++ ++ if 
(pps->loop_filter_across_tiles_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED; ++ ++ if (pps->seq_loop_filter_across_slices_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED; ++ ++ if (pps->deblocking_filter_override_enabled_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED; ++ ++ if (pps->disable_dbf) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER; ++ ++ if (pps->lists_modification_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT; ++ ++ if (pps->slice_header_extension_present_flag) ++ controls->pps.flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT; ++ ++ if (pps->tiles_enabled_flag) { ++ controls->pps.num_tile_columns_minus1 = pps->num_tile_columns - 1; ++ controls->pps.num_tile_rows_minus1 = pps->num_tile_rows - 1; ++ ++ for (int i = 0; i < pps->num_tile_columns; i++) ++ controls->pps.column_width_minus1[i] = pps->column_width[i] - 1; ++ ++ for (int i = 0; i < pps->num_tile_rows; i++) ++ controls->pps.row_height_minus1[i] = pps->row_height[i] - 1; ++ } ++ ++ controls->first_slice = 1; ++ controls->num_slices = 0; ++ ++ if ((rv = ff_v4l2_request_reset_frame(avctx, h->ref->frame)) != 0) ++ return rv; ++ ++ ff_v4l2_request_start_phase_control(h->ref->frame, ctx->pctrl); ++ ++ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame ++ ++ return 0; ++} ++ ++static int v4l2_request_hevc_queue_decode(AVCodecContext *avctx, int last_slice) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; ++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, ++ .ptr = &controls->sps, ++ .size = sizeof(controls->sps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS, ++ .ptr = 
&controls->pps, ++ .size = sizeof(controls->pps), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX, ++ .ptr = &controls->scaling_matrix, ++ .size = sizeof(controls->scaling_matrix), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, ++ .ptr = &controls->slice_params, ++ .size = sizeof(controls->slice_params[0]) * FFMAX(FFMIN(controls->num_slices, MAX_SLICES), ctx->max_slices), ++ }, ++ }; ++ ++ if (ctx->decode_mode == V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED) ++ return ff_v4l2_request_decode_slice(avctx, h->ref->frame, control, FF_ARRAY_ELEMS(control), controls->first_slice, last_slice); ++ ++ return ff_v4l2_request_decode_frame(avctx, h->ref->frame, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ V4L2RequestControlsHEVC *controls = h->ref->hwaccel_picture_private; ++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)h->ref->frame->data[0]; ++ int ret, slice = FFMIN(controls->num_slices, MAX_SLICES - 1); ++ ++ if (ctx->decode_mode == V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && slice) { ++ ret = v4l2_request_hevc_queue_decode(avctx, 0); ++ if (ret) ++ return ret; ++ ++ ff_v4l2_request_reset_frame(avctx, h->ref->frame); ++ slice = controls->num_slices = 0; ++ controls->first_slice = 0; ++ } ++ ++ v4l2_request_hevc_fill_slice_params(h, &controls->slice_params[slice]); ++ ++ if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { ++ ret = ff_v4l2_request_append_output_buffer(avctx, h->ref->frame, nalu_slice_start_code, 3); ++ if (ret) ++ return ret; ++ } ++ ++ ret = ff_v4l2_request_append_output_buffer(avctx, h->ref->frame, buffer, size); ++ if (ret) ++ return ret; ++ ++ controls->slice_params[slice].bit_size = req->output.used * 8; //FIXME ++ controls->num_slices++; ++ return 0; ++} ++ ++static void 
v4l2_request_hevc_abort_frame(AVCodecContext * const avctx) { ++ const HEVCContext *h = avctx->priv_data; ++ ++ if (h->ref != NULL) ++ ff_v4l2_request_abort_phase_control(h->ref->frame); ++} ++ ++static int v4l2_request_hevc_end_frame(AVCodecContext *avctx) ++{ ++ int rv = v4l2_request_hevc_queue_decode(avctx, 1); ++ if (rv < 0) ++ v4l2_request_hevc_abort_frame(avctx); ++ return rv; ++} ++ ++// Called before finally returning the frame to the user ++// Set corrupt flag here as this is actually the frame structure that ++// is going to the user (in MT land each thread has its own pool) ++static int v4l2_request_post_process(void *logctx, AVFrame *frame) ++{ ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)frame->data[0]; ++ if (req) { ++ av_log(logctx, AV_LOG_DEBUG, "%s: flags=%#x, ts=%ld.%06ld\n", __func__, req->capture.buffer.flags, ++ req->capture.buffer.timestamp.tv_sec, req->capture.buffer.timestamp.tv_usec); ++ frame->flags = (req->capture.buffer.flags & V4L2_BUF_FLAG_ERROR) == 0 ? 
0 : AV_FRAME_FLAG_CORRUPT; ++ } ++ ++ return 0; ++} ++ ++static int v4l2_request_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame) ++{ ++ int ret; ++ ++ // This dups the remainder of ff_get_buffer but adds a post_process callback ++ ret = avctx->get_buffer2(avctx, frame, AV_GET_BUFFER_FLAG_REF); ++ if (ret < 0) ++ goto fail; ++ ++ ret = ff_attach_decode_data(frame); ++ if (ret < 0) ++ goto fail; ++ ++ { ++ FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data; ++ fdd->post_process = v4l2_request_post_process; ++ } ++ ++ return 0; ++ ++fail: ++ if (ret < 0) { ++ av_log(avctx, AV_LOG_ERROR, "%s failed\n", __func__); ++ av_frame_unref(frame); ++ } ++ ++ return ret; ++} ++ ++static int v4l2_request_hevc_set_controls(AVCodecContext *avctx) ++{ ++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data; ++ int ret; ++ ++ struct v4l2_ext_control control[] = { ++ { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, }, ++ { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, }, ++ }; ++ struct v4l2_query_ext_ctrl slice_params = { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, ++ }; ++ ++ ctx->decode_mode = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE); ++ if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED && ++ ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode); ++ return AVERROR(EINVAL); ++ } ++ ++ ctx->start_code = ff_v4l2_request_query_control_default_value(avctx, V4L2_CID_MPEG_VIDEO_HEVC_START_CODE); ++ if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE && ++ ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code); ++ return AVERROR(EINVAL); ++ } ++ ++ ret = ff_v4l2_request_query_control(avctx, &slice_params); ++ if (ret) ++ return ret; ++ ++ ctx->max_slices = slice_params.elems; 
++ if (ctx->max_slices > MAX_SLICES) { ++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices); ++ return AVERROR(EINVAL); ++ } ++ ++ control[0].value = ctx->decode_mode; ++ control[1].value = ctx->start_code; ++ ++ return ff_v4l2_request_set_controls(avctx, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_hevc_uninit(AVCodecContext *avctx) ++{ ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ ff_v4l2_phase_control_deletez(&ctx->pctrl); ++ return ff_v4l2_request_uninit(avctx); ++} ++ ++static int v4l2_request_hevc_init(AVCodecContext *avctx) ++{ ++ const HEVCContext *h = avctx->priv_data; ++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data; ++ struct v4l2_ctrl_hevc_sps sps; ++ int ret; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS, ++ .ptr = &sps, ++ .size = sizeof(sps), ++ }, ++ }; ++ ++ if ((ctx->pctrl = ff_v4l2_phase_control_new(2)) == NULL) ++ return AVERROR(ENOMEM); ++ ++ fill_sps(&sps, h); ++ ++ ret = ff_v4l2_request_init(avctx, V4L2_PIX_FMT_HEVC_SLICE, 4 * 1024 * 1024, control, FF_ARRAY_ELEMS(control)); ++ if (ret) ++ return ret; ++ ++ return v4l2_request_hevc_set_controls(avctx); ++} ++ ++const AVHWAccel ff_hevc_v4l2request_hwaccel = { ++ .name = "hevc_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_HEVC, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .alloc_frame = v4l2_request_hevc_alloc_frame, ++ .start_frame = v4l2_request_hevc_start_frame, ++ .decode_slice = v4l2_request_hevc_decode_slice, ++ .end_frame = v4l2_request_hevc_end_frame, ++ .abort_frame = v4l2_request_hevc_abort_frame, ++ .frame_priv_data_size = sizeof(V4L2RequestControlsHEVC), ++ .init = v4l2_request_hevc_init, ++ .uninit = v4l2_request_hevc_uninit, ++ .priv_data_size = sizeof(V4L2RequestContextHEVC), ++ .frame_params = ff_v4l2_request_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE, ++}; +diff --git 
a/libavcodec/v4l2_request_mpeg2.c b/libavcodec/v4l2_request_mpeg2.c +new file mode 100644 +index 0000000000..bc251a6fd2 +--- /dev/null ++++ b/libavcodec/v4l2_request_mpeg2.c +@@ -0,0 +1,155 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "hwconfig.h" ++#include "mpegvideo.h" ++#include "v4l2_request.h" ++#include "mpeg2-ctrls.h" ++ ++typedef struct V4L2RequestControlsMPEG2 { ++ struct v4l2_ctrl_mpeg2_slice_params slice_params; ++ struct v4l2_ctrl_mpeg2_quantization quantization; ++} V4L2RequestControlsMPEG2; ++ ++static int v4l2_request_mpeg2_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const MpegEncContext *s = avctx->priv_data; ++ V4L2RequestControlsMPEG2 *controls = s->current_picture_ptr->hwaccel_picture_private; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)s->current_picture_ptr->f->data[0]; ++ ++ controls->slice_params = (struct v4l2_ctrl_mpeg2_slice_params) { ++ .bit_size = 0, ++ .data_bit_offset = 0, ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Slice */ ++ .quantiser_scale_code = s->qscale >> 1, ++ ++ .sequence = { ++ /* ISO/IEC 13818-2, ITU-T Rec. 
H.262: Sequence header */ ++ .horizontal_size = s->width, ++ .vertical_size = s->height, ++ .vbv_buffer_size = req->output.size, ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Sequence extension */ ++ .profile_and_level_indication = 0, ++ .progressive_sequence = s->progressive_sequence, ++ .chroma_format = s->chroma_format, ++ }, ++ ++ .picture = { ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture header */ ++ .picture_coding_type = s->pict_type, ++ ++ /* ISO/IEC 13818-2, ITU-T Rec. H.262: Picture coding extension */ ++ .f_code[0][0] = s->mpeg_f_code[0][0], ++ .f_code[0][1] = s->mpeg_f_code[0][1], ++ .f_code[1][0] = s->mpeg_f_code[1][0], ++ .f_code[1][1] = s->mpeg_f_code[1][1], ++ .intra_dc_precision = s->intra_dc_precision, ++ .picture_structure = s->picture_structure, ++ .top_field_first = s->top_field_first, ++ .frame_pred_frame_dct = s->frame_pred_frame_dct, ++ .concealment_motion_vectors = s->concealment_motion_vectors, ++ .q_scale_type = s->q_scale_type, ++ .intra_vlc_format = s->intra_vlc_format, ++ .alternate_scan = s->alternate_scan, ++ .repeat_first_field = s->repeat_first_field, ++ .progressive_frame = s->progressive_frame, ++ }, ++ }; ++ ++ switch (s->pict_type) { ++ case AV_PICTURE_TYPE_B: ++ controls->slice_params.backward_ref_ts = ff_v4l2_request_get_capture_timestamp(s->next_picture.f); ++ // fall-through ++ case AV_PICTURE_TYPE_P: ++ controls->slice_params.forward_ref_ts = ff_v4l2_request_get_capture_timestamp(s->last_picture.f); ++ } ++ ++ controls->quantization = (struct v4l2_ctrl_mpeg2_quantization) { ++ /* ISO/IEC 13818-2, ITU-T Rec. 
H.262: Quant matrix extension */ ++ .load_intra_quantiser_matrix = 1, ++ .load_non_intra_quantiser_matrix = 1, ++ .load_chroma_intra_quantiser_matrix = 1, ++ .load_chroma_non_intra_quantiser_matrix = 1, ++ }; ++ ++ for (int i = 0; i < 64; i++) { ++ int n = s->idsp.idct_permutation[ff_zigzag_direct[i]]; ++ controls->quantization.intra_quantiser_matrix[i] = s->intra_matrix[n]; ++ controls->quantization.non_intra_quantiser_matrix[i] = s->inter_matrix[n]; ++ controls->quantization.chroma_intra_quantiser_matrix[i] = s->chroma_intra_matrix[n]; ++ controls->quantization.chroma_non_intra_quantiser_matrix[i] = s->chroma_inter_matrix[n]; ++ } ++ ++ return ff_v4l2_request_reset_frame(avctx, s->current_picture_ptr->f); ++} ++ ++static int v4l2_request_mpeg2_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ const MpegEncContext *s = avctx->priv_data; ++ ++ return ff_v4l2_request_append_output_buffer(avctx, s->current_picture_ptr->f, buffer, size); ++} ++ ++static int v4l2_request_mpeg2_end_frame(AVCodecContext *avctx) ++{ ++ const MpegEncContext *s = avctx->priv_data; ++ V4L2RequestControlsMPEG2 *controls = s->current_picture_ptr->hwaccel_picture_private; ++ V4L2RequestDescriptor *req = (V4L2RequestDescriptor*)s->current_picture_ptr->f->data[0]; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_MPEG2_SLICE_PARAMS, ++ .ptr = &controls->slice_params, ++ .size = sizeof(controls->slice_params), ++ }, ++ { ++ .id = V4L2_CID_MPEG_VIDEO_MPEG2_QUANTIZATION, ++ .ptr = &controls->quantization, ++ .size = sizeof(controls->quantization), ++ }, ++ }; ++ ++ controls->slice_params.bit_size = req->output.used * 8; ++ ++ return ff_v4l2_request_decode_frame(avctx, s->current_picture_ptr->f, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_mpeg2_init(AVCodecContext *avctx) ++{ ++ return ff_v4l2_request_init(avctx, V4L2_PIX_FMT_MPEG2_SLICE, 1024 * 1024, NULL, 0); ++} ++ ++const AVHWAccel ff_mpeg2_v4l2request_hwaccel = { 
++ .name = "mpeg2_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_MPEG2VIDEO, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .start_frame = v4l2_request_mpeg2_start_frame, ++ .decode_slice = v4l2_request_mpeg2_decode_slice, ++ .end_frame = v4l2_request_mpeg2_end_frame, ++ .frame_priv_data_size = sizeof(V4L2RequestControlsMPEG2), ++ .init = v4l2_request_mpeg2_init, ++ .uninit = ff_v4l2_request_uninit, ++ .priv_data_size = sizeof(V4L2RequestContext), ++ .frame_params = ff_v4l2_request_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, ++}; +diff --git a/libavcodec/v4l2_request_vp8.c b/libavcodec/v4l2_request_vp8.c +new file mode 100644 +index 0000000000..ea2c55fa2f +--- /dev/null ++++ b/libavcodec/v4l2_request_vp8.c +@@ -0,0 +1,181 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "hwconfig.h" ++#include "v4l2_request.h" ++#include "vp8.h" ++#include "vp8-ctrls.h" ++ ++typedef struct V4L2RequestControlsVP8 { ++ struct v4l2_ctrl_vp8_frame_header ctrl; ++} V4L2RequestControlsVP8; ++ ++static int v4l2_request_vp8_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const VP8Context *s = avctx->priv_data; ++ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; ++ ++ memset(&controls->ctrl, 0, sizeof(controls->ctrl)); ++ return ff_v4l2_request_reset_frame(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f); ++} ++ ++static int v4l2_request_vp8_end_frame(AVCodecContext *avctx) ++{ ++ const VP8Context *s = avctx->priv_data; ++ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER, ++ .ptr = &controls->ctrl, ++ .size = sizeof(controls->ctrl), ++ }, ++ }; ++ ++ return ff_v4l2_request_decode_frame(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f, ++ control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_vp8_decode_slice(AVCodecContext *avctx, ++ const uint8_t *buffer, ++ uint32_t size) ++{ ++ const VP8Context *s = avctx->priv_data; ++ V4L2RequestControlsVP8 *controls = s->framep[VP56_FRAME_CURRENT]->hwaccel_picture_private; ++ struct v4l2_ctrl_vp8_frame_header *hdr = &controls->ctrl; ++ const uint8_t *data = buffer + 3 + 7 * s->keyframe; ++ unsigned int i, j, k; ++ ++ hdr->version = s->profile & 0x3; ++ hdr->width = avctx->width; ++ hdr->height = avctx->height; ++ /* FIXME: set ->xx_scale */ ++ hdr->prob_skip_false = s->prob->mbskip; ++ hdr->prob_intra = s->prob->intra; ++ hdr->prob_gf = 
s->prob->golden; ++ hdr->prob_last = s->prob->last; ++ hdr->first_part_size = s->header_partition_size; ++ hdr->first_part_header_bits = (8 * (s->coder_state_at_header_end.input - data) - ++ s->coder_state_at_header_end.bit_count - 8); ++ hdr->num_dct_parts = s->num_coeff_partitions; ++ for (i = 0; i < 8; i++) ++ hdr->dct_part_sizes[i] = s->coeff_partition_size[i]; ++ ++ hdr->coder_state.range = s->coder_state_at_header_end.range; ++ hdr->coder_state.value = s->coder_state_at_header_end.value; ++ hdr->coder_state.bit_count = s->coder_state_at_header_end.bit_count; ++ if (s->framep[VP56_FRAME_PREVIOUS]) ++ hdr->last_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_PREVIOUS]->tf.f); ++ if (s->framep[VP56_FRAME_GOLDEN]) ++ hdr->golden_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_GOLDEN]->tf.f); ++ if (s->framep[VP56_FRAME_GOLDEN2]) ++ hdr->alt_frame_ts = ff_v4l2_request_get_capture_timestamp(s->framep[VP56_FRAME_GOLDEN2]->tf.f); ++ hdr->flags |= s->invisible ? 0 : V4L2_VP8_FRAME_HEADER_FLAG_SHOW_FRAME; ++ hdr->flags |= s->mbskip_enabled ? V4L2_VP8_FRAME_HEADER_FLAG_MB_NO_SKIP_COEFF : 0; ++ hdr->flags |= (s->profile & 0x4) ? V4L2_VP8_FRAME_HEADER_FLAG_EXPERIMENTAL : 0; ++ hdr->flags |= s->keyframe ? V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME : 0; ++ hdr->flags |= s->sign_bias[VP56_FRAME_GOLDEN] ? V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_GOLDEN : 0; ++ hdr->flags |= s->sign_bias[VP56_FRAME_GOLDEN2] ? V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_ALT : 0; ++ hdr->segment_header.flags |= s->segmentation.enabled ? V4L2_VP8_SEGMENT_HEADER_FLAG_ENABLED : 0; ++ hdr->segment_header.flags |= s->segmentation.update_map ? V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_MAP : 0; ++ hdr->segment_header.flags |= s->segmentation.update_feature_data ? V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_FEATURE_DATA : 0; ++ hdr->segment_header.flags |= s->segmentation.absolute_vals ? 
0 : V4L2_VP8_SEGMENT_HEADER_FLAG_DELTA_VALUE_MODE; ++ for (i = 0; i < 4; i++) { ++ hdr->segment_header.quant_update[i] = s->segmentation.base_quant[i]; ++ hdr->segment_header.lf_update[i] = s->segmentation.filter_level[i]; ++ } ++ ++ for (i = 0; i < 3; i++) ++ hdr->segment_header.segment_probs[i] = s->prob->segmentid[i]; ++ ++ hdr->lf_header.level = s->filter.level; ++ hdr->lf_header.sharpness_level = s->filter.sharpness; ++ hdr->lf_header.flags |= s->lf_delta.enabled ? V4L2_VP8_LF_HEADER_ADJ_ENABLE : 0; ++ hdr->lf_header.flags |= s->lf_delta.update ? V4L2_VP8_LF_HEADER_DELTA_UPDATE : 0; ++ hdr->lf_header.flags |= s->filter.simple ? V4L2_VP8_LF_FILTER_TYPE_SIMPLE : 0; ++ for (i = 0; i < 4; i++) { ++ hdr->lf_header.ref_frm_delta[i] = s->lf_delta.ref[i]; ++ hdr->lf_header.mb_mode_delta[i] = s->lf_delta.mode[i + MODE_I4x4]; ++ } ++ ++ // Probabilities ++ if (s->keyframe) { ++ static const uint8_t keyframe_y_mode_probs[4] = { ++ 145, 156, 163, 128 ++ }; ++ static const uint8_t keyframe_uv_mode_probs[3] = { ++ 142, 114, 183 ++ }; ++ ++ memcpy(hdr->entropy_header.y_mode_probs, keyframe_y_mode_probs, 4); ++ memcpy(hdr->entropy_header.uv_mode_probs, keyframe_uv_mode_probs, 3); ++ } else { ++ for (i = 0; i < 4; i++) ++ hdr->entropy_header.y_mode_probs[i] = s->prob->pred16x16[i]; ++ for (i = 0; i < 3; i++) ++ hdr->entropy_header.uv_mode_probs[i] = s->prob->pred8x8c[i]; ++ } ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < 19; j++) ++ hdr->entropy_header.mv_probs[i][j] = s->prob->mvc[i][j]; ++ ++ for (i = 0; i < 4; i++) { ++ for (j = 0; j < 8; j++) { ++ static const int coeff_bands_inverse[8] = { ++ 0, 1, 2, 3, 5, 6, 4, 15 ++ }; ++ int coeff_pos = coeff_bands_inverse[j]; ++ ++ for (k = 0; k < 3; k++) { ++ memcpy(hdr->entropy_header.coeff_probs[i][j][k], ++ s->prob->token[i][coeff_pos][k], 11); ++ } ++ } ++ } ++ ++ hdr->quant_header.y_ac_qi = s->quant.yac_qi; ++ hdr->quant_header.y_dc_delta = s->quant.ydc_delta; ++ hdr->quant_header.y2_dc_delta = s->quant.y2dc_delta; ++
hdr->quant_header.y2_ac_delta = s->quant.y2ac_delta; ++ hdr->quant_header.uv_dc_delta = s->quant.uvdc_delta; ++ hdr->quant_header.uv_ac_delta = s->quant.uvac_delta; ++ ++ return ff_v4l2_request_append_output_buffer(avctx, s->framep[VP56_FRAME_CURRENT]->tf.f, buffer, size); ++} ++ ++static int v4l2_request_vp8_init(AVCodecContext *avctx) ++{ ++ return ff_v4l2_request_init(avctx, V4L2_PIX_FMT_VP8_FRAME, 2 * 1024 * 1024, NULL, 0); ++} ++ ++const AVHWAccel ff_vp8_v4l2request_hwaccel = { ++ .name = "vp8_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_VP8, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .start_frame = v4l2_request_vp8_start_frame, ++ .decode_slice = v4l2_request_vp8_decode_slice, ++ .end_frame = v4l2_request_vp8_end_frame, ++ .frame_priv_data_size = sizeof(V4L2RequestControlsVP8), ++ .init = v4l2_request_vp8_init, ++ .uninit = ff_v4l2_request_uninit, ++ .priv_data_size = sizeof(V4L2RequestContext), ++ .frame_params = ff_v4l2_request_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, ++}; +diff --git a/libavcodec/v4l2_request_vp9.c b/libavcodec/v4l2_request_vp9.c +new file mode 100644 +index 0000000000..2e10b7ad1a +--- /dev/null ++++ b/libavcodec/v4l2_request_vp9.c +@@ -0,0 +1,353 @@ ++/* ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++#include "hwconfig.h" ++#include "v4l2_request.h" ++#include "vp9dec.h" ++#include "vp9-ctrls.h" ++ ++typedef struct V4L2RequestControlsVP9 { ++ struct v4l2_ctrl_vp9_frame_decode_params decode_params; ++} V4L2RequestControlsVP9; ++ ++static const uint8_t ff_to_v4l2_intramode[] = { ++ [VERT_PRED] = V4L2_VP9_INTRA_PRED_MODE_V, ++ [HOR_PRED] = V4L2_VP9_INTRA_PRED_MODE_H, ++ [DC_PRED] = V4L2_VP9_INTRA_PRED_MODE_DC, ++ [DIAG_DOWN_LEFT_PRED] = V4L2_VP9_INTRA_PRED_MODE_D45, ++ [DIAG_DOWN_RIGHT_PRED] = V4L2_VP9_INTRA_PRED_MODE_D135, ++ [VERT_RIGHT_PRED] = V4L2_VP9_INTRA_PRED_MODE_D117, ++ [HOR_DOWN_PRED] = V4L2_VP9_INTRA_PRED_MODE_D153, ++ [VERT_LEFT_PRED] = V4L2_VP9_INTRA_PRED_MODE_D63, ++ [HOR_UP_PRED] = V4L2_VP9_INTRA_PRED_MODE_D207, ++ [TM_VP8_PRED] = V4L2_VP9_INTRA_PRED_MODE_TM, ++}; ++ ++static int v4l2_request_vp9_set_frame_ctx(AVCodecContext *avctx, unsigned int id) ++{ ++ VP9Context *s = avctx->priv_data; ++ struct v4l2_ctrl_vp9_frame_ctx fctx = {}; ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_VP9_FRAME_CONTEXT(id), ++ .ptr = &fctx, ++ .size = sizeof(fctx), ++ }, ++ }; ++ ++ memcpy(fctx.probs.tx8, s->prob_ctx[id].p.tx8p, sizeof(s->prob_ctx[id].p.tx8p)); ++ memcpy(fctx.probs.tx16, s->prob_ctx[id].p.tx16p, sizeof(s->prob_ctx[id].p.tx16p)); ++ memcpy(fctx.probs.tx32, s->prob_ctx[id].p.tx32p, sizeof(s->prob_ctx[id].p.tx32p)); ++ memcpy(fctx.probs.coef, s->prob_ctx[id].coef, sizeof(s->prob_ctx[id].coef)); ++ memcpy(fctx.probs.skip, s->prob_ctx[id].p.skip, sizeof(s->prob_ctx[id].p.skip)); ++ memcpy(fctx.probs.inter_mode, s->prob_ctx[id].p.mv_mode, sizeof(s->prob_ctx[id].p.mv_mode)); ++ memcpy(fctx.probs.interp_filter, s->prob_ctx[id].p.filter, sizeof(s->prob_ctx[id].p.filter)); ++ memcpy(fctx.probs.is_inter, 
s->prob_ctx[id].p.intra, sizeof(s->prob_ctx[id].p.intra)); ++ memcpy(fctx.probs.comp_mode, s->prob_ctx[id].p.comp, sizeof(s->prob_ctx[id].p.comp)); ++ memcpy(fctx.probs.single_ref, s->prob_ctx[id].p.single_ref, sizeof(s->prob_ctx[id].p.single_ref)); ++ memcpy(fctx.probs.comp_ref, s->prob_ctx[id].p.comp_ref, sizeof(s->prob_ctx[id].p.comp_ref)); ++ memcpy(fctx.probs.y_mode, s->prob_ctx[id].p.y_mode, sizeof(s->prob_ctx[id].p.y_mode)); ++ for (unsigned i = 0; i < 10; i++) ++ memcpy(fctx.probs.uv_mode[ff_to_v4l2_intramode[i]], s->prob_ctx[id].p.uv_mode[i], sizeof(s->prob_ctx[id].p.uv_mode[0])); ++ for (unsigned i = 0; i < 4; i++) ++ memcpy(fctx.probs.partition[i * 4], s->prob_ctx[id].p.partition[3 - i], sizeof(s->prob_ctx[id].p.partition[0])); ++ memcpy(fctx.probs.mv.joint, s->prob_ctx[id].p.mv_joint, sizeof(s->prob_ctx[id].p.mv_joint)); ++ for (unsigned i = 0; i < 2; i++) { ++ fctx.probs.mv.sign[i] = s->prob_ctx[id].p.mv_comp[i].sign; ++ memcpy(fctx.probs.mv.class[i], s->prob_ctx[id].p.mv_comp[i].classes, sizeof(s->prob_ctx[id].p.mv_comp[0].classes)); ++ fctx.probs.mv.class0_bit[i] = s->prob_ctx[id].p.mv_comp[i].class0; ++ memcpy(fctx.probs.mv.bits[i], s->prob_ctx[id].p.mv_comp[i].bits, sizeof(s->prob_ctx[id].p.mv_comp[0].bits)); ++ memcpy(fctx.probs.mv.class0_fr[i], s->prob_ctx[id].p.mv_comp[i].class0_fp, sizeof(s->prob_ctx[id].p.mv_comp[0].class0_fp)); ++ memcpy(fctx.probs.mv.fr[i], s->prob_ctx[id].p.mv_comp[i].fp, sizeof(s->prob_ctx[id].p.mv_comp[0].fp)); ++ fctx.probs.mv.class0_hp[i] = s->prob_ctx[id].p.mv_comp[i].class0_hp; ++ fctx.probs.mv.hp[i] = s->prob_ctx[id].p.mv_comp[i].hp; ++ } ++ ++ return ff_v4l2_request_set_controls(avctx, control, FF_ARRAY_ELEMS(control)); ++} ++ ++static int v4l2_request_vp9_get_frame_ctx(AVCodecContext *avctx, unsigned int id) ++{ ++ VP9Context *s = avctx->priv_data; ++ struct v4l2_ctrl_vp9_frame_ctx fctx = {}; ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_VP9_FRAME_CONTEXT(id), ++ .ptr = &fctx, ++ .size 
= sizeof(fctx), ++ }, ++ }; ++ ++ int ret = ff_v4l2_request_get_controls(avctx, control, FF_ARRAY_ELEMS(control)); ++ if (ret) ++ return ret; ++ ++ memcpy(s->prob_ctx[id].p.tx8p, fctx.probs.tx8, sizeof(s->prob_ctx[id].p.tx8p)); ++ memcpy(s->prob_ctx[id].p.tx16p, fctx.probs.tx16, sizeof(s->prob_ctx[id].p.tx16p)); ++ memcpy(s->prob_ctx[id].p.tx32p, fctx.probs.tx32, sizeof(s->prob_ctx[id].p.tx32p)); ++ memcpy(s->prob_ctx[id].coef, fctx.probs.coef, sizeof(s->prob_ctx[id].coef)); ++ memcpy(s->prob_ctx[id].p.skip, fctx.probs.skip, sizeof(s->prob_ctx[id].p.skip)); ++ memcpy(s->prob_ctx[id].p.mv_mode, fctx.probs.inter_mode, sizeof(s->prob_ctx[id].p.mv_mode)); ++ memcpy(s->prob_ctx[id].p.filter, fctx.probs.interp_filter, sizeof(s->prob_ctx[id].p.filter)); ++ memcpy(s->prob_ctx[id].p.intra, fctx.probs.is_inter, sizeof(s->prob_ctx[id].p.intra)); ++ memcpy(s->prob_ctx[id].p.comp, fctx.probs.comp_mode, sizeof(s->prob_ctx[id].p.comp)); ++ memcpy(s->prob_ctx[id].p.single_ref, fctx.probs.single_ref, sizeof(s->prob_ctx[id].p.single_ref)); ++ memcpy(s->prob_ctx[id].p.comp_ref, fctx.probs.comp_ref, sizeof(s->prob_ctx[id].p.comp_ref)); ++ memcpy(s->prob_ctx[id].p.y_mode, fctx.probs.y_mode, sizeof(s->prob_ctx[id].p.y_mode)); ++ for (unsigned i = 0; i < 10; i++) ++ memcpy(s->prob_ctx[id].p.uv_mode[i], fctx.probs.uv_mode[ff_to_v4l2_intramode[i]], sizeof(s->prob_ctx[id].p.uv_mode[0])); ++ for (unsigned i = 0; i < 4; i++) ++ memcpy(s->prob_ctx[id].p.partition[3 - i], fctx.probs.partition[i * 4], sizeof(s->prob_ctx[id].p.partition[0])); ++ memcpy(s->prob_ctx[id].p.mv_joint, fctx.probs.mv.joint, sizeof(s->prob_ctx[id].p.mv_joint)); ++ for (unsigned i = 0; i < 2; i++) { ++ s->prob_ctx[id].p.mv_comp[i].sign = fctx.probs.mv.sign[i]; ++ memcpy(s->prob_ctx[id].p.mv_comp[i].classes, fctx.probs.mv.class[i], sizeof(s->prob_ctx[id].p.mv_comp[0].classes)); ++ s->prob_ctx[id].p.mv_comp[i].class0 = fctx.probs.mv.class0_bit[i]; ++ memcpy(s->prob_ctx[id].p.mv_comp[i].bits, fctx.probs.mv.bits[i], 
sizeof(s->prob_ctx[id].p.mv_comp[0].bits)); ++ memcpy(s->prob_ctx[id].p.mv_comp[i].class0_fp, fctx.probs.mv.class0_fr[i], sizeof(s->prob_ctx[id].p.mv_comp[0].class0_fp)); ++ memcpy(s->prob_ctx[id].p.mv_comp[i].fp, fctx.probs.mv.fr[i], sizeof(s->prob_ctx[id].p.mv_comp[0].fp)); ++ s->prob_ctx[id].p.mv_comp[i].class0_hp = fctx.probs.mv.class0_hp[i]; ++ s->prob_ctx[id].p.mv_comp[i].hp = fctx.probs.mv.hp[i]; ++ } ++ ++ return 0; ++} ++ ++static int v4l2_request_vp9_start_frame(AVCodecContext *avctx, ++ av_unused const uint8_t *buffer, ++ av_unused uint32_t size) ++{ ++ const VP9Context *s = avctx->priv_data; ++ const VP9Frame *f = &s->s.frames[CUR_FRAME]; ++ V4L2RequestControlsVP9 *controls = f->hwaccel_picture_private; ++ struct v4l2_ctrl_vp9_frame_decode_params *dec_params = &controls->decode_params; ++ int ret; ++ ++ if (s->s.h.keyframe || s->s.h.errorres || (s->s.h.intraonly && s->s.h.resetctx == 3)) { ++ for (unsigned i = 0; i < 4; i++) { ++ ret = v4l2_request_vp9_set_frame_ctx(avctx, i); ++ if (ret) ++ return ret; ++ } ++ } else if (s->s.h.intraonly && s->s.h.resetctx == 2) { ++ ret = v4l2_request_vp9_set_frame_ctx(avctx, s->s.h.framectxid); ++ if (ret) ++ return ret; ++ } ++ ++ if (s->s.h.keyframe) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_KEY_FRAME; ++ if (!s->s.h.invisible) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_SHOW_FRAME; ++ if (s->s.h.errorres) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT; ++ if (s->s.h.intraonly) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_INTRA_ONLY; ++ if (!s->s.h.keyframe && s->s.h.highprecisionmvs) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_ALLOW_HIGH_PREC_MV; ++ if (s->s.h.refreshctx) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX; ++ if (s->s.h.parallelmode) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_PARALLEL_DEC_MODE; ++ if (s->ss_h) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING; ++ if (s->ss_v) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING; ++ if (avctx->color_range == 
AVCOL_RANGE_JPEG) ++ dec_params->flags |= V4L2_VP9_FRAME_FLAG_COLOR_RANGE_FULL_SWING; ++ ++ dec_params->compressed_header_size = s->s.h.compressed_header_size; ++ dec_params->uncompressed_header_size = s->s.h.uncompressed_header_size; ++ dec_params->profile = s->s.h.profile; ++ dec_params->reset_frame_context = s->s.h.resetctx > 0 ? s->s.h.resetctx - 1 : 0; ++ dec_params->frame_context_idx = s->s.h.framectxid; ++ dec_params->bit_depth = s->s.h.bpp; ++ ++ dec_params->interpolation_filter = s->s.h.filtermode ^ (s->s.h.filtermode <= 1); ++ dec_params->tile_cols_log2 = s->s.h.tiling.log2_tile_cols; ++ dec_params->tile_rows_log2 = s->s.h.tiling.log2_tile_rows; ++ dec_params->tx_mode = s->s.h.txfmmode; ++ dec_params->reference_mode = s->s.h.comppredmode; ++ dec_params->frame_width_minus_1 = s->w - 1; ++ dec_params->frame_height_minus_1 = s->h - 1; ++ //dec_params->render_width_minus_1 = avctx->width - 1; ++ //dec_params->render_height_minus_1 = avctx->height - 1; ++ ++ for (unsigned i = 0; i < 3; i++) { ++ const ThreadFrame *ref = &s->s.refs[s->s.h.refidx[i]]; ++ if (ref->f && ref->f->buf[0]) ++ dec_params->refs[i] = ff_v4l2_request_get_capture_timestamp(ref->f); ++ } ++ ++ if (s->s.h.lf_delta.enabled) ++ dec_params->lf.flags |= V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED; ++ if (s->s.h.lf_delta.updated) ++ dec_params->lf.flags |= V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE; ++ ++ dec_params->lf.level = s->s.h.filter.level; ++ dec_params->lf.sharpness = s->s.h.filter.sharpness; ++ for (unsigned i = 0; i < 4; i++) ++ dec_params->lf.ref_deltas[i] = s->s.h.lf_delta.ref[i]; ++ for (unsigned i = 0; i < 2; i++) ++ dec_params->lf.mode_deltas[i] = s->s.h.lf_delta.mode[i]; ++ for (unsigned i = 0; i < 8; i++) { ++ for (unsigned j = 0; j < 4; j++) ++ memcpy(dec_params->lf.level_lookup[i][j], s->s.h.segmentation.feat[i].lflvl[j], sizeof(dec_params->lf.level_lookup[0][0])); ++ } ++ ++ dec_params->quant.base_q_idx = s->s.h.yac_qi; ++ dec_params->quant.delta_q_y_dc = s->s.h.ydc_qdelta; ++ 
dec_params->quant.delta_q_uv_dc = s->s.h.uvdc_qdelta; ++ dec_params->quant.delta_q_uv_ac = s->s.h.uvac_qdelta; ++ ++ if (s->s.h.segmentation.enabled) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_ENABLED; ++ if (s->s.h.segmentation.update_map) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_UPDATE_MAP; ++ if (s->s.h.segmentation.temporal) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_TEMPORAL_UPDATE; ++ if (s->s.h.segmentation.update_data) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_UPDATE_DATA; ++ if (s->s.h.segmentation.absolute_vals) ++ dec_params->seg.flags |= V4L2_VP9_SEGMENTATION_FLAG_ABS_OR_DELTA_UPDATE; ++ ++ for (unsigned i = 0; i < 7; i++) ++ dec_params->seg.tree_probs[i] = s->s.h.segmentation.prob[i]; ++ ++ if (s->s.h.segmentation.temporal) { ++ for (unsigned i = 0; i < 3; i++) ++ dec_params->seg.pred_probs[i] = s->s.h.segmentation.pred_prob[i]; ++ } else { ++ memset(dec_params->seg.pred_probs, 255, sizeof(dec_params->seg.pred_probs)); ++ } ++ ++ for (unsigned i = 0; i < 8; i++) { ++ if (s->s.h.segmentation.feat[i].q_enabled) { ++ dec_params->seg.feature_enabled[i] |= 1 << V4L2_VP9_SEGMENT_FEATURE_QP_DELTA; ++ dec_params->seg.feature_data[i][V4L2_VP9_SEGMENT_FEATURE_QP_DELTA] = s->s.h.segmentation.feat[i].q_val; ++ } ++ ++ if (s->s.h.segmentation.feat[i].lf_enabled) { ++ dec_params->seg.feature_enabled[i] |= 1 << V4L2_VP9_SEGMENT_FEATURE_LF; ++ dec_params->seg.feature_data[i][V4L2_VP9_SEGMENT_FEATURE_LF] = s->s.h.segmentation.feat[i].lf_val; ++ } ++ ++ if (s->s.h.segmentation.feat[i].ref_enabled) { ++ dec_params->seg.feature_enabled[i] |= 1 << V4L2_VP9_SEGMENT_FEATURE_REF_FRAME; ++ dec_params->seg.feature_data[i][V4L2_VP9_SEGMENT_FEATURE_REF_FRAME] = s->s.h.segmentation.feat[i].ref_val; ++ } ++ ++ if (s->s.h.segmentation.feat[i].skip_enabled) ++ dec_params->seg.feature_enabled[i] |= 1 << V4L2_VP9_SEGMENT_FEATURE_SKIP; ++ } ++ ++ memcpy(dec_params->probs.tx8, s->prob.p.tx8p, sizeof(s->prob.p.tx8p)); ++ 
memcpy(dec_params->probs.tx16, s->prob.p.tx16p, sizeof(s->prob.p.tx16p)); ++ memcpy(dec_params->probs.tx32, s->prob.p.tx32p, sizeof(s->prob.p.tx32p)); ++ for (unsigned i = 0; i < 4; i++) { ++ for (unsigned j = 0; j < 2; j++) { ++ for (unsigned k = 0; k < 2; k++) { ++ for (unsigned l = 0; l < 6; l++) { ++ for (unsigned m = 0; m < 6; m++) { ++ memcpy(dec_params->probs.coef[i][j][k][l][m], s->prob.coef[i][j][k][l][m], sizeof(dec_params->probs.coef[0][0][0][0][0])); ++ } ++ } ++ } ++ } ++ } ++ memcpy(dec_params->probs.skip, s->prob.p.skip, sizeof(s->prob.p.skip)); ++ memcpy(dec_params->probs.inter_mode, s->prob.p.mv_mode, sizeof(s->prob.p.mv_mode)); ++ memcpy(dec_params->probs.interp_filter, s->prob.p.filter, sizeof(s->prob.p.filter)); ++ memcpy(dec_params->probs.is_inter, s->prob.p.intra, sizeof(s->prob.p.intra)); ++ memcpy(dec_params->probs.comp_mode, s->prob.p.comp, sizeof(s->prob.p.comp)); ++ memcpy(dec_params->probs.single_ref, s->prob.p.single_ref, sizeof(s->prob.p.single_ref)); ++ memcpy(dec_params->probs.comp_ref, s->prob.p.comp_ref, sizeof(s->prob.p.comp_ref)); ++ memcpy(dec_params->probs.y_mode, s->prob.p.y_mode, sizeof(s->prob.p.y_mode)); ++ for (unsigned i = 0; i < 10; i++) ++ memcpy(dec_params->probs.uv_mode[ff_to_v4l2_intramode[i]], s->prob.p.uv_mode[i], sizeof(s->prob.p.uv_mode[0])); ++ for (unsigned i = 0; i < 4; i++) ++ memcpy(dec_params->probs.partition[i * 4], s->prob.p.partition[3 - i], sizeof(s->prob.p.partition[0])); ++ memcpy(dec_params->probs.mv.joint, s->prob.p.mv_joint, sizeof(s->prob.p.mv_joint)); ++ for (unsigned i = 0; i < 2; i++) { ++ dec_params->probs.mv.sign[i] = s->prob.p.mv_comp[i].sign; ++ memcpy(dec_params->probs.mv.class[i], s->prob.p.mv_comp[i].classes, sizeof(s->prob.p.mv_comp[0].classes)); ++ dec_params->probs.mv.class0_bit[i] = s->prob.p.mv_comp[i].class0; ++ memcpy(dec_params->probs.mv.bits[i], s->prob.p.mv_comp[i].bits, sizeof(s->prob.p.mv_comp[0].bits)); ++ memcpy(dec_params->probs.mv.class0_fr[i], 
s->prob.p.mv_comp[i].class0_fp, sizeof(s->prob.p.mv_comp[0].class0_fp)); ++ memcpy(dec_params->probs.mv.fr[i], s->prob.p.mv_comp[i].fp, sizeof(s->prob.p.mv_comp[0].fp)); ++ dec_params->probs.mv.class0_hp[i] = s->prob.p.mv_comp[i].class0_hp; ++ dec_params->probs.mv.hp[i] = s->prob.p.mv_comp[i].hp; ++ } ++ ++ return ff_v4l2_request_reset_frame(avctx, f->tf.f); ++} ++ ++static int v4l2_request_vp9_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size) ++{ ++ const VP9Context *s = avctx->priv_data; ++ const VP9Frame *f = &s->s.frames[CUR_FRAME]; ++ ++ return ff_v4l2_request_append_output_buffer(avctx, f->tf.f, buffer, size); ++} ++ ++static int v4l2_request_vp9_end_frame(AVCodecContext *avctx) ++{ ++ const VP9Context *s = avctx->priv_data; ++ const VP9Frame *f = &s->s.frames[CUR_FRAME]; ++ V4L2RequestControlsVP9 *controls = f->hwaccel_picture_private; ++ int ret; ++ ++ struct v4l2_ext_control control[] = { ++ { ++ .id = V4L2_CID_MPEG_VIDEO_VP9_FRAME_DECODE_PARAMS, ++ .ptr = &controls->decode_params, ++ .size = sizeof(controls->decode_params), ++ }, ++ }; ++ ++ ret = ff_v4l2_request_decode_frame(avctx, f->tf.f, control, FF_ARRAY_ELEMS(control)); ++ if (ret) ++ return ret; ++ ++ if (!s->s.h.refreshctx) ++ return 0; ++ ++ return v4l2_request_vp9_get_frame_ctx(avctx, s->s.h.framectxid); ++} ++ ++static int v4l2_request_vp9_init(AVCodecContext *avctx) ++{ ++ // TODO: check V4L2_CID_MPEG_VIDEO_VP9_PROFILE ++ return ff_v4l2_request_init(avctx, V4L2_PIX_FMT_VP9_FRAME, 3 * 1024 * 1024, NULL, 0); ++} ++ ++const AVHWAccel ff_vp9_v4l2request_hwaccel = { ++ .name = "vp9_v4l2request", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .id = AV_CODEC_ID_VP9, ++ .pix_fmt = AV_PIX_FMT_DRM_PRIME, ++ .start_frame = v4l2_request_vp9_start_frame, ++ .decode_slice = v4l2_request_vp9_decode_slice, ++ .end_frame = v4l2_request_vp9_end_frame, ++ .frame_priv_data_size = sizeof(V4L2RequestControlsVP9), ++ .init = v4l2_request_vp9_init, ++ .uninit = ff_v4l2_request_uninit, ++ .priv_data_size = 
sizeof(V4L2RequestContext), ++ .frame_params = ff_v4l2_request_frame_params, ++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE, ++}; +diff --git a/libavcodec/vp8-ctrls.h b/libavcodec/vp8-ctrls.h +new file mode 100644 +index 0000000000..53cba826e4 +--- /dev/null ++++ b/libavcodec/vp8-ctrls.h +@@ -0,0 +1,112 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the VP8 state controls for use with stateless VP8 ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _VP8_CTRLS_H_ ++#define _VP8_CTRLS_H_ ++ ++#include ++ ++#define V4L2_PIX_FMT_VP8_FRAME v4l2_fourcc('V', 'P', '8', 'F') ++ ++#define V4L2_CID_MPEG_VIDEO_VP8_FRAME_HEADER (V4L2_CID_MPEG_BASE + 2000) ++#define V4L2_CTRL_TYPE_VP8_FRAME_HEADER 0x301 ++ ++#define V4L2_VP8_SEGMENT_HEADER_FLAG_ENABLED 0x01 ++#define V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_MAP 0x02 ++#define V4L2_VP8_SEGMENT_HEADER_FLAG_UPDATE_FEATURE_DATA 0x04 ++#define V4L2_VP8_SEGMENT_HEADER_FLAG_DELTA_VALUE_MODE 0x08 ++ ++struct v4l2_vp8_segment_header { ++ __s8 quant_update[4]; ++ __s8 lf_update[4]; ++ __u8 segment_probs[3]; ++ __u8 padding; ++ __u32 flags; ++}; ++ ++#define V4L2_VP8_LF_HEADER_ADJ_ENABLE 0x01 ++#define V4L2_VP8_LF_HEADER_DELTA_UPDATE 0x02 ++#define V4L2_VP8_LF_FILTER_TYPE_SIMPLE 0x04 ++struct v4l2_vp8_loopfilter_header { ++ __s8 ref_frm_delta[4]; ++ __s8 mb_mode_delta[4]; ++ __u8 sharpness_level; ++ __u8 level; ++ __u16 padding; ++ __u32 flags; ++}; ++ ++struct v4l2_vp8_quantization_header { ++ __u8 y_ac_qi; ++ __s8 y_dc_delta; ++ __s8 y2_dc_delta; ++ __s8 y2_ac_delta; ++ __s8 uv_dc_delta; ++ __s8 uv_ac_delta; ++ __u16 padding; ++}; ++ ++struct v4l2_vp8_entropy_header { ++ __u8 coeff_probs[4][8][3][11]; ++ __u8 y_mode_probs[4]; ++ __u8 uv_mode_probs[3]; ++ __u8 mv_probs[2][19]; ++ __u8 padding[3]; ++}; ++ ++struct v4l2_vp8_entropy_coder_state { ++ 
__u8 range; ++ __u8 value; ++ __u8 bit_count; ++ __u8 padding; ++}; ++ ++#define V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME 0x01 ++#define V4L2_VP8_FRAME_HEADER_FLAG_EXPERIMENTAL 0x02 ++#define V4L2_VP8_FRAME_HEADER_FLAG_SHOW_FRAME 0x04 ++#define V4L2_VP8_FRAME_HEADER_FLAG_MB_NO_SKIP_COEFF 0x08 ++#define V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_GOLDEN 0x10 ++#define V4L2_VP8_FRAME_HEADER_FLAG_SIGN_BIAS_ALT 0x20 ++ ++#define VP8_FRAME_IS_KEY_FRAME(hdr) \ ++ (!!((hdr)->flags & V4L2_VP8_FRAME_HEADER_FLAG_KEY_FRAME)) ++ ++struct v4l2_ctrl_vp8_frame_header { ++ struct v4l2_vp8_segment_header segment_header; ++ struct v4l2_vp8_loopfilter_header lf_header; ++ struct v4l2_vp8_quantization_header quant_header; ++ struct v4l2_vp8_entropy_header entropy_header; ++ struct v4l2_vp8_entropy_coder_state coder_state; ++ ++ __u16 width; ++ __u16 height; ++ ++ __u8 horizontal_scale; ++ __u8 vertical_scale; ++ ++ __u8 version; ++ __u8 prob_skip_false; ++ __u8 prob_intra; ++ __u8 prob_last; ++ __u8 prob_gf; ++ __u8 num_dct_parts; ++ ++ __u32 first_part_size; ++ __u32 first_part_header_bits; ++ __u32 dct_part_sizes[8]; ++ ++ __u64 last_frame_ts; ++ __u64 golden_frame_ts; ++ __u64 alt_frame_ts; ++ ++ __u64 flags; ++}; ++ ++#endif +diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c +index bab4223aca..0e1edb46fb 100644 +--- a/libavcodec/vp8.c ++++ b/libavcodec/vp8.c +@@ -175,6 +175,9 @@ static enum AVPixelFormat get_pixel_format(VP8Context *s) + #endif + #if CONFIG_VP8_NVDEC_HWACCEL + AV_PIX_FMT_CUDA, ++#endif ++#if CONFIG_VP8_V4L2REQUEST_HWACCEL ++ AV_PIX_FMT_DRM_PRIME, + #endif + AV_PIX_FMT_YUV420P, + AV_PIX_FMT_NONE, +@@ -198,7 +201,7 @@ int update_dimensions(VP8Context *s, int width, int height, int is_vp7) + return ret; + } + +- if (!s->actually_webp && !is_vp7) { ++ if (!s->actually_webp && !is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) { + s->pix_fmt = get_pixel_format(s); + if (s->pix_fmt < 0) + return AVERROR(EINVAL); +@@ -2968,6 +2971,9 @@ AVCodec ff_vp8_decoder = { + #endif + #if 
CONFIG_VP8_NVDEC_HWACCEL + HWACCEL_NVDEC(vp8), ++#endif ++#if CONFIG_VP8_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(vp8), + #endif + NULL + }, +diff --git a/libavcodec/vp9-ctrls.h b/libavcodec/vp9-ctrls.h +new file mode 100644 +index 0000000000..0cdea8a18b +--- /dev/null ++++ b/libavcodec/vp9-ctrls.h +@@ -0,0 +1,485 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * These are the VP9 state controls for use with stateless VP9 ++ * codec drivers. ++ * ++ * It turns out that these structs are not stable yet and will undergo ++ * more changes. So keep them private until they are stable and ready to ++ * become part of the official public API. ++ */ ++ ++#ifndef _VP9_CTRLS_H_ ++#define _VP9_CTRLS_H_ ++ ++#include ++ ++#define V4L2_PIX_FMT_VP9_FRAME v4l2_fourcc('V', 'P', '9', 'F') ++ ++#define V4L2_CID_MPEG_VIDEO_VP9_FRAME_CONTEXT(i) (V4L2_CID_MPEG_BASE + 4000 + (i)) ++#define V4L2_CID_MPEG_VIDEO_VP9_FRAME_DECODE_PARAMS (V4L2_CID_MPEG_BASE + 4004) ++#define V4L2_CTRL_TYPE_VP9_FRAME_CONTEXT 0x400 ++#define V4L2_CTRL_TYPE_VP9_FRAME_DECODE_PARAMS 0x404 ++ ++/** ++ * enum v4l2_vp9_loop_filter_flags - VP9 loop filter flags ++ * ++ * @V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED: the filter level depends on ++ * the mode and reference frame used ++ * to predict a block ++ * @V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE: the bitstream contains additional ++ * syntax elements that specify which ++ * mode and reference frame deltas ++ * are to be updated ++ * ++ * Those are the flags you should pass to &v4l2_vp9_loop_filter.flags. See ++ * section '7.2.8 Loop filter semantics' of the VP9 specification for more ++ * details. 
++ */ ++enum v4l2_vp9_loop_filter_flags { ++ V4L2_VP9_LOOP_FILTER_FLAG_DELTA_ENABLED = 1 << 0, ++ V4L2_VP9_LOOP_FILTER_FLAG_DELTA_UPDATE = 1 << 1, ++}; ++ ++/** ++ * struct v4l2_vp9_loop_filter - VP9 loop filter parameters ++ * ++ * @flags: combination of V4L2_VP9_LOOP_FILTER_FLAG_* flags ++ * @level: indicates the loop filter strength ++ * @sharpness: indicates the sharpness level ++ * @ref_deltas: contains the adjustment needed for the filter level based on ++ * the chosen reference frame ++ * @mode_deltas: contains the adjustment needed for the filter level based on ++ * the chosen mode ++ * @level_lookup: level lookup table ++ * ++ * This structure contains all loop filter related parameters. See sections ++ * '7.2.8 Loop filter semantics' and '8.8.1 Loop filter frame init process' ++ * of the VP9 specification for more details. ++ */ ++struct v4l2_vp9_loop_filter { ++ __u8 flags; ++ __u8 level; ++ __u8 sharpness; ++ __s8 ref_deltas[4]; ++ __s8 mode_deltas[2]; ++ __u8 level_lookup[8][4][2]; ++}; ++ ++/** ++ * struct v4l2_vp9_quantization - VP9 quantization parameters ++ * ++ * @base_q_idx: indicates the base frame qindex ++ * @delta_q_y_dc: indicates the Y DC quantizer relative to base_q_idx ++ * @delta_q_uv_dc: indicates the UV DC quantizer relative to base_q_idx ++ * @delta_q_uv_ac indicates the UV AC quantizer relative to base_q_idx ++ * @padding: padding bytes to align things on 64 bits. Must be set to 0 ++ * ++ * Encodes the quantization parameters. See section '7.2.9 Quantization params ++ * syntax' of the VP9 specification for more details. 
++ */ ++struct v4l2_vp9_quantization { ++ __u8 base_q_idx; ++ __s8 delta_q_y_dc; ++ __s8 delta_q_uv_dc; ++ __s8 delta_q_uv_ac; ++ __u8 padding[4]; ++}; ++ ++/** ++ * enum v4l2_vp9_segmentation_flags - VP9 segmentation flags ++ * ++ * @V4L2_VP9_SEGMENTATION_FLAG_ENABLED: indicates that this frame makes use of ++ * the segmentation tool ++ * @V4L2_VP9_SEGMENTATION_FLAG_UPDATE_MAP: indicates that the segmentation map ++ * should be updated during the ++ * decoding of this frame ++ * @V4L2_VP9_SEGMENTATION_FLAG_TEMPORAL_UPDATE: indicates that the updates to ++ * the segmentation map are coded ++ * relative to the existing ++ * segmentation map ++ * @V4L2_VP9_SEGMENTATION_FLAG_UPDATE_DATA: indicates that new parameters are ++ * about to be specified for each ++ * segment ++ * @V4L2_VP9_SEGMENTATION_FLAG_ABS_OR_DELTA_UPDATE: indicates that the ++ * segmentation parameters ++ * represent the actual values ++ * to be used ++ * ++ * Those are the flags you should pass to &v4l2_vp9_segmentation.flags. See ++ * section '7.2.10 Segmentation params syntax' of the VP9 specification for ++ * more details. ++ */ ++enum v4l2_vp9_segmentation_flags { ++ V4L2_VP9_SEGMENTATION_FLAG_ENABLED = 1 << 0, ++ V4L2_VP9_SEGMENTATION_FLAG_UPDATE_MAP = 1 << 1, ++ V4L2_VP9_SEGMENTATION_FLAG_TEMPORAL_UPDATE = 1 << 2, ++ V4L2_VP9_SEGMENTATION_FLAG_UPDATE_DATA = 1 << 3, ++ V4L2_VP9_SEGMENTATION_FLAG_ABS_OR_DELTA_UPDATE = 1 << 4, ++}; ++ ++#define V4L2_VP9_SEGMENT_FEATURE_ENABLED(id) (1 << (id)) ++#define V4L2_VP9_SEGMENT_FEATURE_ENABLED_MASK 0xf ++ ++/** ++ * enum v4l2_vp9_segment_feature - VP9 segment feature IDs ++ * ++ * @V4L2_VP9_SEGMENT_FEATURE_QP_DELTA: QP delta segment feature ++ * @V4L2_VP9_SEGMENT_FEATURE_LF: loop filter segment feature ++ * @V4L2_VP9_SEGMENT_FEATURE_REF_FRAME: reference frame segment feature ++ * @V4L2_VP9_SEGMENT_FEATURE_SKIP: skip segment feature ++ * @V4L2_VP9_SEGMENT_FEATURE_CNT: number of segment features ++ * ++ * Segment feature IDs. 
See section '7.2.10 Segmentation params syntax' of the ++ * VP9 specification for more details. ++ */ ++enum v4l2_vp9_segment_feature { ++ V4L2_VP9_SEGMENT_FEATURE_QP_DELTA, ++ V4L2_VP9_SEGMENT_FEATURE_LF, ++ V4L2_VP9_SEGMENT_FEATURE_REF_FRAME, ++ V4L2_VP9_SEGMENT_FEATURE_SKIP, ++ V4L2_VP9_SEGMENT_FEATURE_CNT, ++}; ++ ++/** ++ * struct v4l2_vp9_segmentation - VP9 segmentation parameters ++ * ++ * @flags: combination of V4L2_VP9_SEGMENTATION_FLAG_* flags ++ * @tree_probs: specifies the probability values to be used when ++ * decoding a Segment-ID. See '5.15. Segmentation map' ++ * section of the VP9 specification for more details. ++ * @pred_probs: specifies the probability values to be used when decoding a ++ * Predicted-Segment-ID. See '6.4.14. Get segment id syntax' ++ * section of :ref:`vp9` for more details. ++ * @padding: padding used to make things aligned on 64 bits. Shall be zero ++ * filled ++ * @feature_enabled: bitmask defining which features are enabled in each ++ * segment ++ * @feature_data: data attached to each feature. Data entry is only valid if ++ * the feature is enabled ++ * ++ * Encodes the segmentation parameters. See section '7.2.10 Segmentation ++ * params syntax' of the VP9 specification for more details.
++ */ ++struct v4l2_vp9_segmentation { ++ __u8 flags; ++ __u8 tree_probs[7]; ++ __u8 pred_probs[3]; ++ __u8 padding[5]; ++ __u8 feature_enabled[8]; ++ __s16 feature_data[8][4]; ++}; ++ ++/** ++ * enum v4l2_vp9_intra_prediction_mode - VP9 Intra prediction modes ++ * ++ * @V4L2_VP9_INTRA_PRED_DC: DC intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_V: vertical intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_H: horizontal intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_D45: D45 intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_D135: D135 intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_D117: D117 intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_D153: D153 intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_D207: D207 intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_D63: D63 intra prediction ++ * @V4L2_VP9_INTRA_PRED_MODE_TM: True Motion intra prediction ++ * ++ * See section '7.4.5 Intra frame mode info semantics' for more details. ++ */ ++enum v4l2_vp9_intra_prediction_mode { ++ V4L2_VP9_INTRA_PRED_MODE_DC, ++ V4L2_VP9_INTRA_PRED_MODE_V, ++ V4L2_VP9_INTRA_PRED_MODE_H, ++ V4L2_VP9_INTRA_PRED_MODE_D45, ++ V4L2_VP9_INTRA_PRED_MODE_D135, ++ V4L2_VP9_INTRA_PRED_MODE_D117, ++ V4L2_VP9_INTRA_PRED_MODE_D153, ++ V4L2_VP9_INTRA_PRED_MODE_D207, ++ V4L2_VP9_INTRA_PRED_MODE_D63, ++ V4L2_VP9_INTRA_PRED_MODE_TM, ++}; ++ ++/** ++ * struct v4l2_vp9_mv_probabilities - VP9 Motion vector probabilities ++ * @joint: motion vector joint probabilities ++ * @sign: motion vector sign probabilities ++ * @class: motion vector class probabilities ++ * @class0_bit: motion vector class0 bit probabilities ++ * @bits: motion vector bits probabilities ++ * @class0_fr: motion vector class0 fractional bit probabilities ++ * @fr: motion vector fractional bit probabilities ++ * @class0_hp: motion vector class0 high precision fractional bit probabilities ++ * @hp: motion vector high precision fractional bit probabilities ++ */ ++struct v4l2_vp9_mv_probabilities { ++ __u8 joint[3]; ++ __u8 sign[2]; ++ __u8 
class[2][10]; ++ __u8 class0_bit[2]; ++ __u8 bits[2][10]; ++ __u8 class0_fr[2][2][3]; ++ __u8 fr[2][3]; ++ __u8 class0_hp[2]; ++ __u8 hp[2]; ++}; ++ ++/** ++ * struct v4l2_vp9_probabilities - VP9 Probabilities ++ * ++ * @tx8: TX 8x8 probabilities ++ * @tx16: TX 16x16 probabilities ++ * @tx32: TX 32x32 probabilities ++ * @coef: coefficient probabilities ++ * @skip: skip probabilities ++ * @inter_mode: inter mode probabilities ++ * @interp_filter: interpolation filter probabilities ++ * @is_inter: is inter-block probabilities ++ * @comp_mode: compound prediction mode probabilities ++ * @single_ref: single ref probabilities ++ * @comp_ref: compound ref probabilities ++ * @y_mode: Y prediction mode probabilities ++ * @uv_mode: UV prediction mode probabilities ++ * @partition: partition probabilities ++ * @mv: motion vector probabilities ++ * ++ * Structure containing most VP9 probabilities. See the VP9 specification ++ * for more details. ++ */ ++struct v4l2_vp9_probabilities { ++ __u8 tx8[2][1]; ++ __u8 tx16[2][2]; ++ __u8 tx32[2][3]; ++ __u8 coef[4][2][2][6][6][3]; ++ __u8 skip[3]; ++ __u8 inter_mode[7][3]; ++ __u8 interp_filter[4][2]; ++ __u8 is_inter[4]; ++ __u8 comp_mode[5]; ++ __u8 single_ref[5][2]; ++ __u8 comp_ref[5]; ++ __u8 y_mode[4][9]; ++ __u8 uv_mode[10][9]; ++ __u8 partition[16][3]; ++ ++ struct v4l2_vp9_mv_probabilities mv; ++}; ++ ++/** ++ * enum v4l2_vp9_reset_frame_context - Valid values for ++ * &v4l2_ctrl_vp9_frame_decode_params->reset_frame_context ++ * ++ * @V4L2_VP9_RESET_FRAME_CTX_NONE: don't reset any frame context ++ * @V4L2_VP9_RESET_FRAME_CTX_SPEC: reset the frame context pointed by ++ * &v4l2_ctrl_vp9_frame_decode_params.frame_context_idx ++ * @V4L2_VP9_RESET_FRAME_CTX_ALL: reset all frame contexts ++ * ++ * See section '7.2 Uncompressed header semantics' of the VP9 specification ++ * for more details. 
++ */ ++enum v4l2_vp9_reset_frame_context { ++ V4L2_VP9_RESET_FRAME_CTX_NONE, ++ V4L2_VP9_RESET_FRAME_CTX_SPEC, ++ V4L2_VP9_RESET_FRAME_CTX_ALL, ++}; ++ ++/** ++ * enum v4l2_vp9_interpolation_filter - VP9 interpolation filter types ++ * ++ * @V4L2_VP9_INTERP_FILTER_8TAP: eight tap filter ++ * @V4L2_VP9_INTERP_FILTER_8TAP_SMOOTH: eight tap smooth filter ++ * @V4L2_VP9_INTERP_FILTER_8TAP_SHARP: eight tap sharp filter ++ * @V4L2_VP9_INTERP_FILTER_BILINEAR: bilinear filter ++ * @V4L2_VP9_INTERP_FILTER_SWITCHABLE: filter selection is signaled at the ++ * block level ++ * ++ * See section '7.2.7 Interpolation filter semantics' of the VP9 specification ++ * for more details. ++ */ ++enum v4l2_vp9_interpolation_filter { ++ V4L2_VP9_INTERP_FILTER_8TAP, ++ V4L2_VP9_INTERP_FILTER_8TAP_SMOOTH, ++ V4L2_VP9_INTERP_FILTER_8TAP_SHARP, ++ V4L2_VP9_INTERP_FILTER_BILINEAR, ++ V4L2_VP9_INTERP_FILTER_SWITCHABLE, ++}; ++ ++/** ++ * enum v4l2_vp9_reference_mode - VP9 reference modes ++ * ++ * @V4L2_VP9_REF_MODE_SINGLE: indicates that all the inter blocks use only a ++ * single reference frame to generate motion ++ * compensated prediction ++ * @V4L2_VP9_REF_MODE_COMPOUND: requires all the inter blocks to use compound ++ * mode. Single reference frame prediction is not ++ * allowed ++ * @V4L2_VP9_REF_MODE_SELECT: allows each individual inter block to select ++ * between single and compound prediction modes ++ * ++ * See section '7.3.6 Frame reference mode semantics' of the VP9 specification ++ * for more details.
++ */ ++enum v4l2_vp9_reference_mode { ++ V4L2_VP9_REF_MODE_SINGLE, ++ V4L2_VP9_REF_MODE_COMPOUND, ++ V4L2_VP9_REF_MODE_SELECT, ++}; ++ ++/** ++ * enum v4l2_vp9_tx_mode - VP9 TX modes ++ * ++ * @V4L2_VP9_TX_MODE_ONLY_4X4: transform size is 4x4 ++ * @V4L2_VP9_TX_MODE_ALLOW_8X8: transform size can be up to 8x8 ++ * @V4L2_VP9_TX_MODE_ALLOW_16X16: transform size can be up to 16x16 ++ * @V4L2_VP9_TX_MODE_ALLOW_32X32: transform size can be up to 32x32 ++ * @V4L2_VP9_TX_MODE_SELECT: bitstream contains transform size for each block ++ * ++ * See section '7.3.1 Tx mode semantics' of the VP9 specification for more ++ * details. ++ */ ++enum v4l2_vp9_tx_mode { ++ V4L2_VP9_TX_MODE_ONLY_4X4, ++ V4L2_VP9_TX_MODE_ALLOW_8X8, ++ V4L2_VP9_TX_MODE_ALLOW_16X16, ++ V4L2_VP9_TX_MODE_ALLOW_32X32, ++ V4L2_VP9_TX_MODE_SELECT, ++}; ++ ++/** ++ * enum v4l2_vp9_ref_id - VP9 Reference frame IDs ++ * ++ * @V4L2_REF_ID_LAST: last reference frame ++ * @V4L2_REF_ID_GOLDEN: golden reference frame ++ * @V4L2_REF_ID_ALTREF: alternative reference frame ++ * @V4L2_REF_ID_CNT: number of reference frames ++ * ++ * See section '7.4.12 Ref frames semantics' of the VP9 specification for more ++ * details. 
++ */ ++enum v4l2_vp9_ref_id { ++ V4L2_REF_ID_LAST, ++ V4L2_REF_ID_GOLDEN, ++ V4L2_REF_ID_ALTREF, ++ V4L2_REF_ID_CNT, ++}; ++ ++/** ++ * enum v4l2_vp9_frame_flags - VP9 frame flags ++ * @V4L2_VP9_FRAME_FLAG_KEY_FRAME: the frame is a key frame ++ * @V4L2_VP9_FRAME_FLAG_SHOW_FRAME: the frame should be displayed ++ * @V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT: the decoding should be error resilient ++ * @V4L2_VP9_FRAME_FLAG_INTRA_ONLY: the frame does not reference other frames ++ * @V4L2_VP9_FRAME_FLAG_ALLOW_HIGH_PREC_MV: the frame may use high precision ++ * motion vectors ++ * @V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX: frame context should be updated ++ * after decoding ++ * @V4L2_VP9_FRAME_FLAG_PARALLEL_DEC_MODE: parallel decoding is used ++ * @V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING: vertical subsampling is enabled ++ * @V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING: horizontal subsampling is enabled ++ * @V4L2_VP9_FRAME_FLAG_COLOR_RANGE_FULL_SWING: full UV range is used ++ * ++ * Check the VP9 specification for more details. ++ */ ++enum v4l2_vp9_frame_flags { ++ V4L2_VP9_FRAME_FLAG_KEY_FRAME = 1 << 0, ++ V4L2_VP9_FRAME_FLAG_SHOW_FRAME = 1 << 1, ++ V4L2_VP9_FRAME_FLAG_ERROR_RESILIENT = 1 << 2, ++ V4L2_VP9_FRAME_FLAG_INTRA_ONLY = 1 << 3, ++ V4L2_VP9_FRAME_FLAG_ALLOW_HIGH_PREC_MV = 1 << 4, ++ V4L2_VP9_FRAME_FLAG_REFRESH_FRAME_CTX = 1 << 5, ++ V4L2_VP9_FRAME_FLAG_PARALLEL_DEC_MODE = 1 << 6, ++ V4L2_VP9_FRAME_FLAG_X_SUBSAMPLING = 1 << 7, ++ V4L2_VP9_FRAME_FLAG_Y_SUBSAMPLING = 1 << 8, ++ V4L2_VP9_FRAME_FLAG_COLOR_RANGE_FULL_SWING = 1 << 9, ++}; ++ ++#define V4L2_VP9_PROFILE_MAX 3 ++ ++/** ++ * struct v4l2_ctrl_vp9_frame_decode_params - VP9 frame decoding control ++ * ++ * @flags: combination of V4L2_VP9_FRAME_FLAG_* flags ++ * @compressed_header_size: compressed header size in bytes ++ * @uncompressed_header_size: uncompressed header size in bytes ++ * @profile: VP9 profile.
Can be 0, 1, 2 or 3 ++ * @reset_frame_context: specifies whether the frame context should be reset ++ * to default values. See &v4l2_vp9_reset_frame_context ++ * for more details ++ * @frame_context_idx: frame context that should be used/updated ++ * @bit_depth: bits per component. Can be 8, 10 or 12. Note that not all ++ * profiles support 10 and/or 12 bit depths ++ * @interpolation_filter: specifies the filter selection used for performing ++ * inter prediction. See &v4l2_vp9_interpolation_filter ++ * for more details ++ * @tile_cols_log2: specifies the base 2 logarithm of the width of each tile ++ * (where the width is measured in units of 8x8 blocks). ++ * Shall be less than or equal to 6 ++ * @tile_rows_log2: specifies the base 2 logarithm of the height of each tile ++ * (where the height is measured in units of 8x8 blocks) ++ * @tx_mode: specifies the TX mode. See &v4l2_vp9_tx_mode for more details ++ * @reference_mode: specifies the type of inter prediction to be used. See ++ * &v4l2_vp9_reference_mode for more details ++ * @padding: needed to make this struct 64 bit aligned. Shall be filled with ++ * zeros ++ * @frame_width_minus_1: add 1 to it and you'll get the frame width expressed ++ * in pixels ++ * @frame_height_minus_1: add 1 to it and you'll get the frame height expressed ++ * in pixels ++ * @render_width_minus_1: add 1 to it and you'll get the expected render width ++ * expressed in pixels. This is not used during the ++ * decoding process but might be used by HW scalers to ++ * prepare a frame that's ready for scanout ++ * @render_height_minus_1: add 1 to it and you'll get the expected render height ++ * expressed in pixels. This is not used during the ++ * decoding process but might be used by HW scalers to ++ * prepare a frame that's ready for scanout ++ * @refs: array of reference frames. See &v4l2_vp9_ref_id for more details ++ * @lf: loop filter parameters. See &v4l2_vp9_loop_filter for more details ++ * @quant: quantization parameters.
See &v4l2_vp9_quantization for more details ++ * @seg: segmentation parameters. See &v4l2_vp9_segmentation for more details ++ * @probs: probabilities. See &v4l2_vp9_probabilities for more details ++ */ ++struct v4l2_ctrl_vp9_frame_decode_params { ++ __u32 flags; ++ __u16 compressed_header_size; ++ __u16 uncompressed_header_size; ++ __u8 profile; ++ __u8 reset_frame_context; ++ __u8 frame_context_idx; ++ __u8 bit_depth; ++ __u8 interpolation_filter; ++ __u8 tile_cols_log2; ++ __u8 tile_rows_log2; ++ __u8 tx_mode; ++ __u8 reference_mode; ++ __u8 padding[6]; ++ __u16 frame_width_minus_1; ++ __u16 frame_height_minus_1; ++ __u16 render_width_minus_1; ++ __u16 render_height_minus_1; ++ __u64 refs[V4L2_REF_ID_CNT]; ++ struct v4l2_vp9_loop_filter lf; ++ struct v4l2_vp9_quantization quant; ++ struct v4l2_vp9_segmentation seg; ++ struct v4l2_vp9_probabilities probs; ++}; ++ ++#define V4L2_VP9_NUM_FRAME_CTX 4 ++ ++/** ++ * struct v4l2_ctrl_vp9_frame_ctx - VP9 frame context control ++ * ++ * @probs: VP9 probabilities ++ * ++ * This control is accessed in both direction. The user should initialize the ++ * 4 contexts with default values just after starting the stream. Then before ++ * decoding a frame it should query the current frame context (the one passed ++ * through &v4l2_ctrl_vp9_frame_decode_params.frame_context_idx) to initialize ++ * &v4l2_ctrl_vp9_frame_decode_params.probs. The probs are then adjusted based ++ * on the bitstream info and passed to the kernel. The codec should update ++ * the frame context after the frame has been decoded, so that next time ++ * userspace query this context it contains the updated probabilities. 
++ */ ++struct v4l2_ctrl_vp9_frame_ctx { ++ struct v4l2_vp9_probabilities probs; ++}; ++ ++#endif /* _VP9_CTRLS_H_ */ +diff --git a/libavcodec/vp9.c b/libavcodec/vp9.c +index fd0bab14a2..434f905c62 100644 +--- a/libavcodec/vp9.c ++++ b/libavcodec/vp9.c +@@ -191,6 +191,7 @@ static int update_size(AVCodecContext *avctx, int w, int h) + #define HWACCEL_MAX (CONFIG_VP9_DXVA2_HWACCEL + \ + CONFIG_VP9_D3D11VA_HWACCEL * 2 + \ + CONFIG_VP9_NVDEC_HWACCEL + \ ++ CONFIG_VP9_V4L2REQUEST_HWACCEL + \ + CONFIG_VP9_VAAPI_HWACCEL + \ + CONFIG_VP9_VDPAU_HWACCEL) + enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmtp = pix_fmts; +@@ -223,6 +224,9 @@ static int update_size(AVCodecContext *avctx, int w, int h) + #endif + #if CONFIG_VP9_VAAPI_HWACCEL + *fmtp++ = AV_PIX_FMT_VAAPI; ++#endif ++#if CONFIG_VP9_V4L2REQUEST_HWACCEL ++ *fmtp++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + case AV_PIX_FMT_YUV420P12: +@@ -231,6 +235,9 @@ static int update_size(AVCodecContext *avctx, int w, int h) + #endif + #if CONFIG_VP9_VAAPI_HWACCEL + *fmtp++ = AV_PIX_FMT_VAAPI; ++#endif ++#if CONFIG_VP9_V4L2REQUEST_HWACCEL ++ *fmtp++ = AV_PIX_FMT_DRM_PRIME; + #endif + break; + } +@@ -700,7 +707,8 @@ static int decode_frame_header(AVCodecContext *avctx, + get_bits(&s->gb, 8) : 255; + } + +- if (get_bits1(&s->gb)) { ++ s->s.h.segmentation.update_data = get_bits1(&s->gb); ++ if (s->s.h.segmentation.update_data) { + s->s.h.segmentation.absolute_vals = get_bits1(&s->gb); + for (i = 0; i < 8; i++) { + if ((s->s.h.segmentation.feat[i].q_enabled = get_bits1(&s->gb))) +@@ -1909,6 +1917,9 @@ AVCodec ff_vp9_decoder = { + #endif + #if CONFIG_VP9_VDPAU_HWACCEL + HWACCEL_VDPAU(vp9), ++#endif ++#if CONFIG_VP9_V4L2REQUEST_HWACCEL ++ HWACCEL_V4L2REQUEST(vp9), + #endif + NULL + }, +diff --git a/libavcodec/vp9shared.h b/libavcodec/vp9shared.h +index 54726df742..fee3568736 100644 +--- a/libavcodec/vp9shared.h ++++ b/libavcodec/vp9shared.h +@@ -131,6 +131,7 @@ typedef struct VP9BitstreamHeader { + uint8_t temporal; + uint8_t 
absolute_vals; + uint8_t update_map; ++ uint8_t update_data; + uint8_t prob[7]; + uint8_t pred_prob[3]; + struct { +diff --git a/libavdevice/Makefile b/libavdevice/Makefile +index 6ea62b914e..c8c9eeb22b 100644 +--- a/libavdevice/Makefile ++++ b/libavdevice/Makefile +@@ -45,6 +45,9 @@ OBJS-$(CONFIG_SNDIO_INDEV) += sndio_dec.o sndio.o + OBJS-$(CONFIG_SNDIO_OUTDEV) += sndio_enc.o sndio.o + OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o + OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o ++OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o ++OBJS-$(CONFIG_VOUT_EGL_OUTDEV) += egl_vout.o ++OBJS-$(CONFIG_VOUT_RPI_OUTDEV) += rpi_vout.o + OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o + OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o + OBJS-$(CONFIG_XV_OUTDEV) += xv.o +diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c +index 8633433254..bc15112a00 100644 +--- a/libavdevice/alldevices.c ++++ b/libavdevice/alldevices.c +@@ -52,6 +52,9 @@ extern AVOutputFormat ff_sndio_muxer; + extern AVInputFormat ff_v4l2_demuxer; + extern AVOutputFormat ff_v4l2_muxer; + extern AVInputFormat ff_vfwcap_demuxer; ++extern AVOutputFormat ff_vout_drm_muxer; ++extern AVOutputFormat ff_vout_egl_muxer; ++extern AVOutputFormat ff_vout_rpi_muxer; + extern AVInputFormat ff_xcbgrab_demuxer; + extern AVOutputFormat ff_xv_muxer; + +diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c +new file mode 100644 +index 0000000000..c427b60d30 +--- /dev/null ++++ b/libavdevice/drm_vout.c +@@ -0,0 +1,613 @@ ++/* ++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. 
++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++// *** This module is a work in progress and its utility is strictly ++// limited to testing. ++// Amongst other issues it doesn't wait for the pic to be displayed before ++// returning the buffer so flikering does occur. ++ ++#include "libavutil/opt.h" ++#include "libavutil/avassert.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/imgutils.h" ++#include "libavutil/hwcontext_drm.h" ++#include "libavformat/internal.h" ++#include "avdevice.h" ++ ++#include "pthread.h" ++#include ++#include ++ ++#include "drm_fourcc.h" ++#include ++#include ++#include ++#include ++ ++#include "libavutil/rpi_sand_fns.h" ++ ++#define TRACE_ALL 0 ++ ++#define NUM_BUFFERS 4 ++#define RPI_DISPLAY_ALL 0 ++ ++#define DRM_MODULE "vc4" ++ ++#define ERRSTR strerror(errno) ++ ++struct drm_setup { ++ int conId; ++ uint32_t crtcId; ++ int crtcIdx; ++ uint32_t planeId; ++ unsigned int out_fourcc; ++ struct { ++ int x, y, width, height; ++ } compose; ++}; ++ ++typedef struct drm_aux_s { ++ int fd; ++ uint32_t bo_handles[4]; ++ unsigned int fb_handle; ++} drm_aux_t; ++ ++typedef struct drm_display_env_s ++{ ++ AVClass *class; ++ ++ int drm_fd; ++ uint32_t con_id; ++ struct drm_setup setup; ++ enum AVPixelFormat avfmt; ++ ++ drm_aux_t aux[32]; ++ ++ pthread_t q_thread; ++ pthread_mutex_t q_lock; ++ sem_t q_sem; ++ int q_terminate; ++ AVFrame * q_this; ++ AVFrame * q_next; ++ ++} drm_display_env_t; ++ ++ ++static int drm_vout_write_trailer(AVFormatContext *s) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, 
"%s\n", __func__); ++#endif ++ ++ return 0; ++} ++ ++static int drm_vout_write_header(AVFormatContext *s) ++{ ++ const AVCodecParameters * const par = s->streams[0]->codecpar; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ if ( s->nb_streams > 1 ++ || par->codec_type != AVMEDIA_TYPE_VIDEO ++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { ++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ ++ ++static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * const frame) ++{ ++ int ret = 0; ++ ++ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; ++ drm_aux_t * da = NULL; ++ unsigned int i; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++#endif ++ ++ for (i = 0; i != 32; ++i) { ++ if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) { ++ da = de->aux + i; ++ break; ++ } ++ } ++ ++ if (da == NULL) { ++ av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__); ++ return AVERROR(EINVAL); ++ } ++ ++ if (da->fd == -1) { ++ uint32_t pitches[4] = {0}; ++ uint32_t offsets[4] = {0}; ++ uint64_t modifiers[4] = {0}; ++ uint32_t bo_plane_handles[4] = {0}; ++ int i, j, n; ++ ++ for (i = 0; i < desc->nb_objects; ++i) { ++ if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) { ++ av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle failed: %s\n", ERRSTR); ++ return -1; ++ } ++ } ++ ++ n = 0; ++ for (i = 0; i < desc->nb_layers; ++i) { ++ for (j = 0; j < desc->layers[i].nb_planes; ++j) { ++ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; ++ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; ++ pitches[n] = p->pitch; ++ offsets[n] = p->offset; ++ modifiers[n] = obj->format_modifier; ++ bo_plane_handles[n] = da->bo_handles[p->object_index]; ++ ++n; ++ } ++ } ++ ++#if 0 ++ av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, 
pitch=%d,%d,%d,%d," ++ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", ++ av_frame_cropped_width(frame), ++ av_frame_cropped_height(frame), ++ desc->layers[0].format, ++ bo_plane_handles[0], ++ bo_plane_handles[1], ++ bo_plane_handles[2], ++ bo_plane_handles[3], ++ pitches[0], ++ pitches[1], ++ pitches[2], ++ pitches[3], ++ offsets[0], ++ offsets[1], ++ offsets[2], ++ offsets[3], ++ (long long)modifiers[0], ++ (long long)modifiers[1], ++ (long long)modifiers[2], ++ (long long)modifiers[3] ++ ); ++#endif ++ ++ if (drmModeAddFB2WithModifiers(de->drm_fd, ++ av_frame_cropped_width(frame), ++ av_frame_cropped_height(frame), ++ desc->layers[0].format, bo_plane_handles, ++ pitches, offsets, modifiers, ++ &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) { ++ av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR); ++ return -1; ++ } ++ ++ da->fd = desc->objects[0].fd; ++ } ++ ++ ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId, ++ da->fb_handle, 0, ++ de->setup.compose.x, de->setup.compose.y, ++ de->setup.compose.width, ++ de->setup.compose.height, ++ 0, 0, ++ av_frame_cropped_width(frame) << 16, ++ av_frame_cropped_height(frame) << 16); ++ ++ if (ret != 0) { ++ av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR); ++ } ++ ++ return ret; ++} ++ ++static void * display_thread(void * v) ++{ ++ AVFormatContext * const s = v; ++ drm_display_env_t * const de = s->priv_data; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++#endif ++ ++ for (;;) { ++ AVFrame * frame; ++ ++ while (sem_wait(&de->q_sem) != 0) { ++ av_assert0(errno == EINTR); ++ } ++ ++ if (de->q_terminate) ++ break; ++ ++ pthread_mutex_lock(&de->q_lock); ++ frame = de->q_next; ++ de->q_next = NULL; ++ pthread_mutex_unlock(&de->q_lock); ++ ++ do_display(s, de, frame); ++ ++ av_frame_free(&de->q_this); ++ de->q_this = frame; ++ } ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); ++#endif ++ ++ return NULL; 
++} ++ ++static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt) ++{ ++ const AVFrame * const src_frame = (AVFrame *)pkt->data; ++ AVFrame * frame; ++ drm_display_env_t * const de = s->priv_data; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ ++ if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) { ++ av_log(s, AV_LOG_WARNING, "Discard corrupt frame: ts=%" PRId64 "\n", src_frame->pts); /* exactly one argument for the single PRId64 conversion */ ++ return 0; ++ } ++ ++ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) { ++ frame = av_frame_alloc(); ++ av_frame_ref(frame, src_frame); ++ } ++ else if (src_frame->format == AV_PIX_FMT_VAAPI) { ++ frame = av_frame_alloc(); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ if (av_hwframe_map(frame, src_frame, 0) != 0) ++ { ++ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); ++ av_frame_free(&frame); ++ return AVERROR(EINVAL); ++ } ++ } ++ else { ++ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format); ++ return AVERROR(EINVAL); ++ } ++ ++ ++ pthread_mutex_lock(&de->q_lock); ++ { ++ AVFrame * const t = de->q_next; ++ de->q_next = frame; ++ frame = t; ++ } ++ pthread_mutex_unlock(&de->q_lock); ++ ++ if (frame == NULL) ++ sem_post(&de->q_sem); ++ else ++ av_frame_free(&frame); ++ ++ return 0; ++} ++ ++static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, ++ unsigned flags) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags); ++#endif ++ ++ /* drm_vout_write_header() should have accepted only supported formats */ ++ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY)) ++ return 0; ++ ++ return 0; ++} ++ ++static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); ++#endif ++ switch(type) { ++ case AV_APP_TO_DEV_WINDOW_REPAINT: ++ return 0; ++ default: ++ break; ++ } ++
return AVERROR(ENOSYS); ++} ++ ++static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId) ++{ ++ int ret = -1; ++ int i; ++ drmModeRes *res = drmModeGetResources(drmfd); ++ drmModeConnector *c; ++ ++ if(!res) ++ { ++ printf( "drmModeGetResources failed: %s\n", ERRSTR); ++ return -1; ++ } ++ ++ if (res->count_crtcs <= 0) ++ { ++ printf( "drm: no crts\n"); ++ goto fail_res; ++ } ++ ++ if (!s->conId) { ++ fprintf(stderr, ++ "No connector ID specified. Choosing default from list:\n"); ++ ++ for (i = 0; i < res->count_connectors; i++) { ++ drmModeConnector *con = ++ drmModeGetConnector(drmfd, res->connectors[i]); ++ drmModeEncoder *enc = NULL; ++ drmModeCrtc *crtc = NULL; ++ ++ if (con->encoder_id) { ++ enc = drmModeGetEncoder(drmfd, con->encoder_id); ++ if (enc->crtc_id) { ++ crtc = drmModeGetCrtc(drmfd, enc->crtc_id); ++ } ++ } ++ ++ if (!s->conId && crtc) { ++ s->conId = con->connector_id; ++ s->crtcId = crtc->crtc_id; ++ } ++ ++ av_log(avctx, AV_LOG_INFO, "Connector %d (crtc %d): type %d, %dx%d%s\n", ++ con->connector_id, ++ crtc ? crtc->crtc_id : 0, ++ con->connector_type, ++ crtc ? crtc->width : 0, ++ crtc ? crtc->height : 0, ++ (s->conId == (int)con->connector_id ? 
++ " (chosen)" : "")); ++ } ++ ++ if (!s->conId) { ++ av_log(avctx, AV_LOG_ERROR, ++ "No suitable enabled connector found.\n"); ++ goto fail_res; /* ret is still -1 here; also releases res */ ++ } ++ } ++ ++ s->crtcIdx = -1; ++ ++ for (i = 0; i < res->count_crtcs; ++i) { ++ if (s->crtcId == res->crtcs[i]) { ++ s->crtcIdx = i; ++ break; ++ } ++ } ++ ++ if (s->crtcIdx == -1) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId); ++ goto fail_res; ++ } ++ ++ if (res->count_connectors <= 0) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n"); ++ goto fail_res; ++ } ++ ++ c = drmModeGetConnector(drmfd, s->conId); ++ if (!c) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR); ++ goto fail_res; ++ } ++ ++ if (!c->count_modes) ++ { ++ av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n"); ++ goto fail_conn; ++ } ++ ++ { ++ drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId); ++ s->compose.x = crtc->x; ++ s->compose.y = crtc->y; ++ s->compose.width = crtc->width; ++ s->compose.height = crtc->height; ++ drmModeFreeCrtc(crtc); ++ } ++ ++ if (pConId) ++ *pConId = c->connector_id; ++ ret = 0; ++ ++fail_conn: ++ drmModeFreeConnector(c); ++ ++fail_res: ++ drmModeFreeResources(res); ++ ++ return ret; ++} ++ ++static int find_plane(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s) ++{ ++ drmModePlaneResPtr planes; ++ drmModePlanePtr plane; ++ unsigned int i; ++ unsigned int j; ++ int ret = 0; ++ ++ planes = drmModeGetPlaneResources(drmfd); ++ if (!planes) ++ { ++ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR); ++ return -1; ++ } ++ ++ for (i = 0; i < planes->count_planes; ++i) { ++ plane = drmModeGetPlane(drmfd, planes->planes[i]); ++ if (!plane) /* test the plane just fetched, not the plane list */ ++ { ++ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR); ++ continue; /* try the next plane; a break here would leave ret == 0 with no plane chosen */ ++ } ++ ++ if (!(plane->possible_crtcs & (1 << s->crtcIdx))) { ++ drmModeFreePlane(plane); ++ continue; ++ } ++ ++ for (j = 0; j < plane->count_formats; ++j) { ++ if
(plane->formats[j] == s->out_fourcc) ++ break; ++ } ++ ++ if (j == plane->count_formats) { ++ drmModeFreePlane(plane); ++ continue; ++ } ++ ++ s->planeId = plane->plane_id; ++ drmModeFreePlane(plane); ++ break; ++ } ++ ++ if (i == planes->count_planes) ++ ret = -1; ++ ++ drmModeFreePlaneResources(planes); ++ return ret; ++} ++ ++// deinit is called if init fails so no need to clean up explicity here ++static int drm_vout_init(struct AVFormatContext * s) ++{ ++ drm_display_env_t * const de = s->priv_data; ++ unsigned int i; ++ ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++ ++ de->drm_fd = -1; ++ de->con_id = 0; ++ de->setup = (struct drm_setup){0}; ++ ++ de->setup.out_fourcc = DRM_FORMAT_NV12; // **** Need some sort of select ++ ++ for (i = 0; i != 32; ++i) { ++ de->aux[i].fd = -1; ++ } ++ ++ if ((de->drm_fd = drmOpen(DRM_MODULE, NULL)) < 0) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s\n", DRM_MODULE); ++ return -1; ++ } ++ ++ if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0) ++ { ++ av_log(s, AV_LOG_ERROR, "failed to find valid mode\n"); ++ return -1; ++ } ++ ++ if (find_plane(s, de->drm_fd, &de->setup) != 0) ++ { ++ av_log(s, AV_LOG_ERROR, "failed to find compatible plane\n"); ++ return -1; ++ } ++ ++ de->q_terminate = 0; ++ pthread_mutex_init(&de->q_lock, NULL); ++ sem_init(&de->q_sem, 0, 0); ++ av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0); ++ ++ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); ++ ++ return 0; ++} ++ ++static void drm_vout_deinit(struct AVFormatContext * s) ++{ ++ drm_display_env_t * const de = s->priv_data; ++ ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++ ++ de->q_terminate = 1; ++ sem_post(&de->q_sem); ++ pthread_join(de->q_thread, NULL); ++ sem_destroy(&de->q_sem); ++ pthread_mutex_destroy(&de->q_lock); ++ ++ av_frame_free(&de->q_next); ++ av_frame_free(&de->q_this); ++ ++ if (de->drm_fd >= 0) { ++ close(de->drm_fd); ++ de->drm_fd = -1; ++ } ++ ++ av_log(s, AV_LOG_INFO, ">>> %s\n", 
__func__); ++} ++ ++ ++#define OFFSET(x) offsetof(drm_display_env_t, x) ++static const AVOption options[] = { ++#if 0 ++ { "display_name", "set display name", OFFSET(display_name), AV_OPT_TYPE_STRING, {.str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_id", "set existing window id", OFFSET(window_id), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, INT64_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_title", "set window title", OFFSET(window_title), AV_OPT_TYPE_STRING, {.str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++#endif ++ { NULL } ++ ++}; ++ ++static const AVClass drm_vout_class = { ++ .class_name = "drm vid outdev", ++ .item_name = av_default_item_name, ++ .option = options, ++ .version = LIBAVUTIL_VERSION_INT, ++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, ++}; ++ ++AVOutputFormat ff_vout_drm_muxer = { ++ .name = "vout_drm", ++ .long_name = NULL_IF_CONFIG_SMALL("Drm video output device"), ++ .priv_data_size = sizeof(drm_display_env_t), ++ .audio_codec = AV_CODEC_ID_NONE, ++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, ++ .write_header = drm_vout_write_header, ++ .write_packet = drm_vout_write_packet, ++ .write_uncoded_frame = drm_vout_write_frame, ++ .write_trailer = drm_vout_write_trailer, ++ .control_message = drm_vout_control_message, ++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, ++ .priv_class = &drm_vout_class, ++ .init = drm_vout_init, ++ .deinit = drm_vout_deinit, ++}; +diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c +new file mode 100644 +index 0000000000..85bda396d7 +--- /dev/null ++++ 
b/libavdevice/egl_vout.c +@@ -0,0 +1,782 @@ ++/* ++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++ ++// *** This module is a work in progress and its utility is strictly ++// limited to testing. ++// Amongst other issues it doesn't wait for the pic to be displayed before ++// returning the buffer so flikering does occur. 
++ ++#include ++#include ++ ++#include "libavutil/opt.h" ++#include "libavutil/avassert.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/imgutils.h" ++#include "libavutil/hwcontext_drm.h" ++#include "libavformat/internal.h" ++#include "avdevice.h" ++ ++#include "pthread.h" ++#include ++#include ++ ++#include "drm_fourcc.h" ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "libavutil/rpi_sand_fns.h" ++ ++#define TRACE_ALL 1 ++ ++struct egl_setup { ++ int conId; ++ ++ Display *dpy; ++ EGLDisplay egl_dpy; ++ EGLContext ctx; ++ EGLSurface surf; ++ Window win; ++ ++ uint32_t crtcId; ++ int crtcIdx; ++ uint32_t planeId; ++ struct { ++ int x, y, width, height; ++ } compose; ++}; ++ ++typedef struct egl_aux_s { ++ int fd; ++ GLuint texture; ++ ++} egl_aux_t; ++ ++typedef struct egl_display_env_s ++{ ++ AVClass *class; ++ ++ struct egl_setup setup; ++ enum AVPixelFormat avfmt; ++ ++ egl_aux_t aux[32]; ++ ++ pthread_t q_thread; ++ pthread_mutex_t q_lock; ++ sem_t display_start_sem; ++ sem_t q_sem; ++ int q_terminate; ++ AVFrame * q_this; ++ AVFrame * q_next; ++ ++} egl_display_env_t; ++ ++ ++/** ++ * Remove window border/decorations. ++ */ ++static void ++no_border( Display *dpy, Window w) ++{ ++ static const unsigned MWM_HINTS_DECORATIONS = (1 << 1); ++ static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5; ++ ++ typedef struct ++ { ++ unsigned long flags; ++ unsigned long functions; ++ unsigned long decorations; ++ long inputMode; ++ unsigned long status; ++ } PropMotifWmHints; ++ ++ PropMotifWmHints motif_hints; ++ Atom prop, proptype; ++ unsigned long flags = 0; ++ ++ /* setup the property */ ++ motif_hints.flags = MWM_HINTS_DECORATIONS; ++ motif_hints.decorations = flags; ++ ++ /* get the atom for the property */ ++ prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True ); ++ if (!prop) { ++ /* something went wrong! 
*/ ++ return; ++ } ++ ++ /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */ ++ proptype = prop; ++ ++ XChangeProperty( dpy, w, /* display, window */ ++ prop, proptype, /* property, type */ ++ 32, /* format: 32-bit datums */ ++ PropModeReplace, /* mode */ ++ (unsigned char *) &motif_hints, /* data */ ++ PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */ ++ ); ++} ++ ++ ++/* ++ * Create an RGB, double-buffered window. ++ * Return the window and context handles. ++ */ ++static int ++make_window(struct AVFormatContext * const s, ++ Display *dpy, EGLDisplay egl_dpy, const char *name, ++ int x, int y, int width, int height, ++ Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet) ++{ ++ int scrnum = DefaultScreen( dpy ); ++ XSetWindowAttributes attr; ++ unsigned long mask; ++ Window root = RootWindow( dpy, scrnum ); ++ Window win; ++ EGLContext ctx; ++ bool fullscreen = false; /* Hook this up to a command line arg */ ++ ++ if (fullscreen) { ++ int scrnum = DefaultScreen(dpy); ++ ++ x = 0; y = 0; ++ width = DisplayWidth(dpy, scrnum); ++ height = DisplayHeight(dpy, scrnum); ++ } ++ ++ static const EGLint attribs[] = { ++ EGL_RED_SIZE, 1, ++ EGL_GREEN_SIZE, 1, ++ EGL_BLUE_SIZE, 1, ++ EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT, ++ EGL_NONE ++ }; ++ EGLConfig config; ++ EGLint num_configs; ++ if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) { ++ av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n"); ++ return -1; ++ } ++ ++ EGLint vid; ++ if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) { ++ av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n"); ++ return -1; ++ } ++ ++ XVisualInfo visTemplate = { ++ .visualid = vid, ++ }; ++ int num_visuals; ++ XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask, ++ &visTemplate, &num_visuals); ++ ++ /* window attributes */ ++ attr.background_pixel = 0; ++ attr.border_pixel = 0; ++ attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone); ++ 
attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask; ++ /* XXX this is a bad way to get a borderless window! */ ++ mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask; ++ ++ win = XCreateWindow( dpy, root, x, y, width, height, ++ 0, visinfo->depth, InputOutput, ++ visinfo->visual, mask, &attr ); ++ ++ if (fullscreen) ++ no_border(dpy, win); ++ ++ /* set hints and properties */ ++ { ++ XSizeHints sizehints; ++ sizehints.x = x; ++ sizehints.y = y; ++ sizehints.width = width; ++ sizehints.height = height; ++ sizehints.flags = USSize | USPosition; ++ XSetNormalHints(dpy, win, &sizehints); ++ XSetStandardProperties(dpy, win, name, name, ++ None, (char **)NULL, 0, &sizehints); ++ } ++ ++ eglBindAPI(EGL_OPENGL_ES_API); ++ ++ static const EGLint ctx_attribs[] = { ++ EGL_CONTEXT_CLIENT_VERSION, 2, ++ EGL_NONE ++ }; ++ ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs ); ++ if (!ctx) { ++ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); ++ return -1; ++ } ++ ++ XFree(visinfo); ++ ++ XMapWindow(dpy, win); ++ ++ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, ++ (void *)(uintptr_t)win, NULL); ++ if (!surf) { ++ av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n"); ++ return -1; ++ } ++ ++ if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) { ++ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n"); ++ return -1; ++ } ++ ++ *winRet = win; ++ *ctxRet = ctx; ++ *surfRet = surf; ++ ++ return 0; ++} ++ ++static GLint ++compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source) ++{ ++ GLuint s = glCreateShader(target); ++ ++ if (s == 0) { ++ av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n"); ++ return 0; ++ } ++ ++ glShaderSource(s, 1, (const GLchar **) &source, NULL); ++ glCompileShader(s); ++ ++ GLint ok; ++ glGetShaderiv(s, GL_COMPILE_STATUS, &ok); ++ ++ if (!ok) { ++ GLchar *info; ++ GLint size; ++ ++ glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size); ++ info = 
malloc(size); ++ ++ glGetShaderInfoLog(s, size, NULL, info); ++ av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source); ++ ++ return 0; ++ } ++ ++ return s; ++} ++ ++static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs) ++{ ++ GLuint prog = glCreateProgram(); ++ ++ if (prog == 0) { ++ av_log(s, AV_LOG_ERROR, "Failed to create program\n"); ++ return 0; ++ } ++ ++ glAttachShader(prog, vs); ++ glAttachShader(prog, fs); ++ glLinkProgram(prog); ++ ++ GLint ok; ++ glGetProgramiv(prog, GL_LINK_STATUS, &ok); ++ if (!ok) { ++ /* Some drivers return a size of 1 for an empty log. This is the size ++ * of a log that contains only a terminating NUL character. ++ */ ++ GLint size; ++ GLchar *info = NULL; ++ glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size); ++ if (size > 1) { ++ info = malloc(size); ++ glGetProgramInfoLog(prog, size, NULL, info); ++ } ++ ++ av_log(s, AV_LOG_ERROR, "Failed to link: %s\n", ++ (info != NULL) ? info : ""); ++ return 0; ++ } ++ ++ return prog; ++} ++ ++static int ++gl_setup(struct AVFormatContext * const s) ++{ ++ const char *vs = ++ "attribute vec4 pos;\n" ++ "varying vec2 texcoord;\n" ++ "\n" ++ "void main() {\n" ++ " gl_Position = pos;\n" ++ " texcoord.x = (pos.x + 1.0) / 2.0;\n" ++ " texcoord.y = (-pos.y + 1.0) / 2.0;\n" ++ "}\n"; ++ const char *fs = ++ "#extension GL_OES_EGL_image_external : enable\n" ++ "precision mediump float;\n" ++ "uniform samplerExternalOES s;\n" ++ "varying vec2 texcoord;\n" ++ "void main() {\n" ++ " gl_FragColor = texture2D(s, texcoord);\n" ++ "}\n"; ++ ++ GLuint vs_s; ++ GLuint fs_s; ++ GLuint prog; ++ ++ if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) || ++ !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) || ++ !(prog = link_program(s, vs_s, fs_s))) ++ return -1; ++ ++ glUseProgram(prog); ++ ++ static const float verts[] = { ++ -1, -1, ++ 1, -1, ++ 1, 1, ++ -1, 1, ++ }; ++ glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts); ++ 
glEnableVertexAttribArray(0); ++ return 0; ++} ++ ++static int egl_vout_write_trailer(AVFormatContext *s) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ ++ return 0; ++} ++ ++static int egl_vout_write_header(AVFormatContext *s) ++{ ++ const AVCodecParameters * const par = s->streams[0]->codecpar; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ if ( s->nb_streams > 1 ++ || par->codec_type != AVMEDIA_TYPE_VIDEO ++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { ++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ return 0; ++} ++ ++ ++static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame) ++{ ++ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0]; ++ egl_aux_t * da = NULL; ++ unsigned int i; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++#endif ++ ++ for (i = 0; i != 32; ++i) { ++ if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) { ++ da = de->aux + i; ++ break; ++ } ++ } ++ ++ if (da == NULL) { ++ av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__); ++ return AVERROR(EINVAL); ++ } ++ ++ if (da->texture == 0) { ++ EGLint attribs[50]; ++ EGLint * a = attribs; ++ int i, j; ++ static const EGLint anames[] = { ++ EGL_DMA_BUF_PLANE0_FD_EXT, ++ EGL_DMA_BUF_PLANE0_OFFSET_EXT, ++ EGL_DMA_BUF_PLANE0_PITCH_EXT, ++ EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT, ++ EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT, ++ EGL_DMA_BUF_PLANE1_FD_EXT, ++ EGL_DMA_BUF_PLANE1_OFFSET_EXT, ++ EGL_DMA_BUF_PLANE1_PITCH_EXT, ++ EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT, ++ EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT, ++ EGL_DMA_BUF_PLANE2_FD_EXT, ++ EGL_DMA_BUF_PLANE2_OFFSET_EXT, ++ EGL_DMA_BUF_PLANE2_PITCH_EXT, ++ EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT, ++ EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT, ++ }; ++ const EGLint * b = anames; ++ ++ *a++ = EGL_WIDTH; ++ *a++ = av_frame_cropped_width(frame); ++ *a++ = 
EGL_HEIGHT; ++ *a++ = av_frame_cropped_height(frame); ++ *a++ = EGL_LINUX_DRM_FOURCC_EXT; ++ *a++ = desc->layers[0].format; ++ ++ for (i = 0; i < desc->nb_layers; ++i) { ++ for (j = 0; j < desc->layers[i].nb_planes; ++j) { ++ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j; ++ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index; ++ *a++ = *b++; ++ *a++ = obj->fd; ++ *a++ = *b++; ++ *a++ = p->offset; ++ *a++ = *b++; ++ *a++ = p->pitch; ++ if (obj->format_modifier == 0) { ++ b += 2; ++ } ++ else { ++ *a++ = *b++; ++ *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF); ++ *a++ = *b++; ++ *a++ = (EGLint)(obj->format_modifier >> 32); ++ } ++ } ++ } ++ ++ *a = EGL_NONE; ++ ++ for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) { ++ av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]); ++ } ++ ++ EGLImage image = eglCreateImageKHR(de->setup.egl_dpy, ++ EGL_NO_CONTEXT, ++ EGL_LINUX_DMA_BUF_EXT, ++ NULL, attribs); ++ if (!image) { ++ fprintf(stderr, "Failed to import fd %d\n", desc->objects[0].fd); ++ exit(1); ++ } ++ ++ glGenTextures(1, &da->texture); ++ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); ++ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR); ++ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR); ++ glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image); ++ ++ eglDestroyImageKHR(de->setup.egl_dpy, image); ++ ++ da->fd = desc->objects[0].fd; ++ ++#if 0 ++ av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d," ++ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n", ++ av_frame_cropped_width(frame), ++ av_frame_cropped_height(frame), ++ desc->layers[0].format, ++ bo_plane_handles[0], ++ bo_plane_handles[1], ++ bo_plane_handles[2], ++ bo_plane_handles[3], ++ pitches[0], ++ pitches[1], ++ pitches[2], ++ pitches[3], ++ offsets[0], ++ offsets[1], ++ offsets[2], ++ offsets[3], ++ (long long)modifiers[0], ++ (long long)modifiers[1], ++ (long 
long)modifiers[2], ++ (long long)modifiers[3] ++ ); ++#endif ++ } ++ ++ glClearColor(0.5, 0.5, 0.5, 0.5); ++ glClear(GL_COLOR_BUFFER_BIT); ++ ++ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture); ++ glDrawArrays(GL_TRIANGLE_FAN, 0, 4); ++ eglSwapBuffers(de->setup.egl_dpy, de->setup.surf); ++ ++ return 0; ++} ++ ++static void * display_thread(void * v) ++{ ++ AVFormatContext * const s = v; ++ egl_display_env_t * const de = s->priv_data; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++#endif ++ { ++ EGLint egl_major, egl_minor; ++ ++ de->setup.dpy = XOpenDisplay(NULL); ++ if (!de->setup.dpy) { ++ av_log(s, AV_LOG_ERROR, "Couldn't open X display\n"); ++ goto fail; ++ } ++ ++ de->setup.egl_dpy = eglGetDisplay(de->setup.dpy); ++ if (!de->setup.egl_dpy) { ++ av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n"); ++ goto fail; ++ } ++ ++ if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) { ++ av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n"); ++ goto fail; ++ } ++ ++ av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor); ++ ++ if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) { ++ av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n"); ++ goto fail; ++ } ++ } ++ ++ if (make_window(s, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout", ++ 0, 0, 1280, 720, &de->setup.win, &de->setup.ctx, &de->setup.surf)) { ++ av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__); ++ goto fail; ++ } ++ ++ if (gl_setup(s)) { ++ av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__); ++ goto fail; ++ } ++ ++ av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__); ++ sem_post(&de->display_start_sem); ++ ++ for (;;) { ++ AVFrame * frame; ++ ++ while (sem_wait(&de->q_sem) != 0) { ++ av_assert0(errno == EINTR); ++ } ++ ++ if (de->q_terminate) ++ break; ++ ++ pthread_mutex_lock(&de->q_lock); ++ frame = de->q_next; ++ de->q_next = NULL; ++ pthread_mutex_unlock(&de->q_lock); ++ ++ do_display(s, de, frame); 
++ ++ av_frame_free(&de->q_this); ++ de->q_this = frame; ++ } ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); ++#endif ++ ++ return NULL; ++ ++fail: ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__); ++#endif ++ de->q_terminate = 1; ++ sem_post(&de->display_start_sem); ++ ++ return NULL; ++} ++ ++static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt) ++{ ++ const AVFrame * const src_frame = (AVFrame *)pkt->data; ++ AVFrame * frame; ++ egl_display_env_t * const de = s->priv_data; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ ++ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) { ++ frame = av_frame_alloc(); ++ av_frame_ref(frame, src_frame); ++ } ++ else if (src_frame->format == AV_PIX_FMT_VAAPI) { ++ frame = av_frame_alloc(); ++ frame->format = AV_PIX_FMT_DRM_PRIME; ++ if (av_hwframe_map(frame, src_frame, 0) != 0) ++ { ++ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format); ++ av_frame_free(&frame); ++ return AVERROR(EINVAL); ++ } ++ } ++ else { ++ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format); ++ return AVERROR(EINVAL); ++ } ++ ++ ++ pthread_mutex_lock(&de->q_lock); ++ { ++ AVFrame * const t = de->q_next; ++ de->q_next = frame; ++ frame = t; ++ } ++ pthread_mutex_unlock(&de->q_lock); ++ ++ if (frame == NULL) ++ sem_post(&de->q_sem); ++ else ++ av_frame_free(&frame); ++ ++ return 0; ++} ++ ++static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, ++ unsigned flags) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags); ++#endif ++ ++ /* egl_vout_write_header() should have accepted only supported formats */ ++ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY)) ++ return 0; ++ ++ return 0; ++} ++ ++static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size) ++{ ++#if TRACE_ALL ++ av_log(s, 
AV_LOG_INFO, "%s: %d\n", __func__, type); ++#endif ++ switch(type) { ++ case AV_APP_TO_DEV_WINDOW_REPAINT: ++ return 0; ++ default: ++ break; ++ } ++ return AVERROR(ENOSYS); ++} ++ ++// deinit is called if init fails so no need to clean up explicity here ++static int egl_vout_init(struct AVFormatContext * s) ++{ ++ egl_display_env_t * const de = s->priv_data; ++ unsigned int i; ++ ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++ ++ de->setup = (struct egl_setup){0}; ++ ++ for (i = 0; i != 32; ++i) { ++ de->aux[i].fd = -1; ++ } ++ ++ de->q_terminate = 0; ++ pthread_mutex_init(&de->q_lock, NULL); ++ sem_init(&de->q_sem, 0, 0); ++ sem_init(&de->display_start_sem, 0, 0); ++ av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0); ++ ++ sem_wait(&de->display_start_sem); ++ if (de->q_terminate) { ++ av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__); ++ return -1; ++ } ++ ++ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); ++ ++ return 0; ++} ++ ++static void egl_vout_deinit(struct AVFormatContext * s) ++{ ++ egl_display_env_t * const de = s->priv_data; ++ ++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__); ++ ++ de->q_terminate = 1; ++ sem_post(&de->q_sem); ++ pthread_join(de->q_thread, NULL); ++ sem_destroy(&de->q_sem); ++ pthread_mutex_destroy(&de->q_lock); ++ ++ av_frame_free(&de->q_next); ++ av_frame_free(&de->q_this); ++ ++ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__); ++} ++ ++#define OFFSET(x) offsetof(egl_display_env_t, x) ++static const AVOption options[] = { ++#if 0 ++ { "display_name", "set display name", OFFSET(display_name), AV_OPT_TYPE_STRING, {.str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_id", "set existing window id", OFFSET(window_id), AV_OPT_TYPE_INT64, {.i64 = 0 }, 0, INT64_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_title", "set window title", 
OFFSET(window_title), AV_OPT_TYPE_STRING, {.str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++#endif ++ { NULL } ++ ++}; ++ ++static const AVClass egl_vout_class = { ++ .class_name = "egl vid outdev", ++ .item_name = av_default_item_name, ++ .option = options, ++ .version = LIBAVUTIL_VERSION_INT, ++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, ++}; ++ ++AVOutputFormat ff_vout_egl_muxer = { ++ .name = "vout_egl", ++ .long_name = NULL_IF_CONFIG_SMALL("Egl video output device"), ++ .priv_data_size = sizeof(egl_display_env_t), ++ .audio_codec = AV_CODEC_ID_NONE, ++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, ++ .write_header = egl_vout_write_header, ++ .write_packet = egl_vout_write_packet, ++ .write_uncoded_frame = egl_vout_write_frame, ++ .write_trailer = egl_vout_write_trailer, ++ .control_message = egl_vout_control_message, ++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, ++ .priv_class = &egl_vout_class, ++ .init = egl_vout_init, ++ .deinit = egl_vout_deinit, ++}; ++ +diff --git a/libavdevice/rpi_vout.c b/libavdevice/rpi_vout.c +new file mode 100644 +index 0000000000..60fe8a7075 +--- /dev/null ++++ b/libavdevice/rpi_vout.c +@@ -0,0 +1,534 @@ ++/* ++ * Copyright (c) 2013 Jeff Moguillansky ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. 
++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. ++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * XVideo output device ++ * ++ * TODO: ++ * - add support to more formats ++ */ ++ ++#include "libavutil/opt.h" ++#include "libavutil/avassert.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/imgutils.h" ++#include "libavformat/internal.h" ++#include "avdevice.h" ++ ++#include ++ ++#pragma GCC diagnostic push ++// Many many redundant decls in the header files ++#pragma GCC diagnostic ignored "-Wredundant-decls" ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#pragma GCC diagnostic pop ++#include "libavutil/rpi_sand_fns.h" ++#include "libavcodec/rpi_zc.h" ++ ++#define TRACE_ALL 0 ++ ++#define RPI_DISPLAY_ALL 0 ++#define DISPLAY_PORT_DEPTH 4 ++ ++typedef struct rpi_display_env_s ++{ ++ AVClass *class; ++ ++ MMAL_COMPONENT_T* display; ++ MMAL_COMPONENT_T* isp; ++ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup ++ MMAL_CONNECTION_T * conn; ++ ++ MMAL_POOL_T *rpi_pool; ++ volatile int rpi_display_count; ++ ++ MMAL_FOURCC_T req_fmt; ++ MMAL_VIDEO_FORMAT_T req_vfmt; ++ ++ AVZcEnvPtr zc; ++ ++ int window_width, window_height; ++ int window_x, window_y; ++ int layer, fullscreen; ++} rpi_display_env_t; ++ ++ ++static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) { ++ mmal_buffer_header_release(buffer); ++} ++ ++static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) { ++ mmal_buffer_header_release(buffer); ++} ++ ++ ++static 
MMAL_FOURCC_T mmfmt_from_avfmt(const enum AVPixelFormat fmt) ++{ ++ switch (fmt) { ++ case AV_PIX_FMT_SAND128: ++ case AV_PIX_FMT_RPI4_8: ++ return MMAL_ENCODING_YUVUV128; ++ case AV_PIX_FMT_RPI4_10: ++ return MMAL_ENCODING_YUV10_COL; ++ case AV_PIX_FMT_SAND64_10: ++ return MMAL_ENCODING_YUVUV64_10; ++ case AV_PIX_FMT_SAND64_16: ++ return MMAL_ENCODING_YUVUV64_16; ++ case AV_PIX_FMT_YUV420P: ++ return MMAL_ENCODING_I420; ++ ++ default: ++ break; ++ } ++ return 0; ++} ++ ++ ++static void video_format_from_zc_frame(MMAL_ES_FORMAT_T* const es_fmt, ++ const AVFrame * const frame, const AVRpiZcRefPtr fr_ref) ++{ ++ MMAL_VIDEO_FORMAT_T *const vfmt = &es_fmt->es->video; ++ const AVRpiZcFrameGeometry * geo = av_rpi_zc_geometry(fr_ref); ++ if (av_rpi_is_sand_format(geo->format)) { ++ // Sand formats are a bit "special" ++ // stride1 implicit in format ++ // width = stride2 ++ vfmt->width = geo->stripe_is_yc ? ++ geo->height_y + geo->height_c : geo->height_y; ++// es->height = geo->video_height; //*** When we get the FLAG this will change ++ vfmt->height = geo->height_y; ++ es_fmt->flags = MMAL_ES_FORMAT_FLAG_COL_FMTS_WIDTH_IS_COL_STRIDE; ++ } ++ else { ++ vfmt->width = geo->stride_y / geo->bytes_per_pel; ++ vfmt->height = geo->height_y; ++ es_fmt->flags = 0; ++ } ++ ++ es_fmt->type = MMAL_ES_TYPE_VIDEO; ++ es_fmt->encoding = mmfmt_from_avfmt(geo->format); ++ es_fmt->encoding_variant = 0; ++ es_fmt->bitrate = 0; ++ ++ vfmt->crop.x = frame->crop_left; ++ vfmt->crop.y = frame->crop_top; ++ vfmt->crop.width = av_frame_cropped_width(frame); ++ vfmt->crop.height = av_frame_cropped_height(frame); ++ ++ vfmt->frame_rate.den = 0; // Don't think I know it here ++ vfmt->frame_rate.num = 0; ++ ++ vfmt->par.den = frame->sample_aspect_ratio.den; ++ vfmt->par.num = frame->sample_aspect_ratio.num; ++ ++ vfmt->color_space = 0; // Unknown currently ++} ++ ++static MMAL_BOOL_T buf_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata) ++{ ++ rpi_display_env_t * const de = userdata; ++ if 
(buf->user_data != NULL) { ++ av_rpi_zc_unref((AVRpiZcRefPtr)buf->user_data); ++ buf->user_data = NULL; ++ } ++ atomic_fetch_add(&de->rpi_display_count, -1); ++ return MMAL_FALSE; ++} ++ ++static inline int avfmt_needs_isp(const enum AVPixelFormat avfmt) ++{ ++ return avfmt == AV_PIX_FMT_SAND64_10; ++} ++ ++static void isp_remove(AVFormatContext * const s, rpi_display_env_t * const de) ++{ ++ if (de->isp != NULL) ++ { ++ if (de->isp->input[0]->is_enabled) ++ mmal_port_disable(de->isp->input[0]); ++ if (de->isp->control->is_enabled) ++ mmal_port_disable(de->isp->control); ++ } ++ if (de->conn != NULL) { ++ mmal_connection_destroy(de->conn); ++ de->conn = NULL; ++ } ++ if (de->isp != NULL) { ++ mmal_component_destroy(de->isp); ++ de->isp = NULL; ++ } ++} ++ ++static void display_frame(AVFormatContext * const s, rpi_display_env_t * const de, const AVFrame* const fr) ++{ ++ MMAL_BUFFER_HEADER_T* buf = NULL; ++ AVRpiZcRefPtr fr_buf = NULL; ++ ++ if (de == NULL) ++ return; ++ ++ if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { ++ av_log(s, AV_LOG_VERBOSE, "Frame dropped\n"); ++ return; ++ } ++ ++ if ((fr_buf = av_rpi_zc_ref(s, de->zc, fr, fr->format, 1)) == NULL) { ++ return; ++ } ++ ++ buf = mmal_queue_get(de->rpi_pool->queue); ++ if (!buf) { ++ // Running too fast so drop the frame (unexpected) ++ goto fail; ++ } ++ ++ buf->cmd = 0; ++ buf->offset = 0; ++ buf->flags = 0; ++ mmal_buffer_header_reset(buf); ++ ++ atomic_fetch_add(&de->rpi_display_count, 1); // Deced on release ++ mmal_buffer_header_pre_release_cb_set(buf, buf_release_cb, de); ++ ++ buf->user_data = fr_buf; ++ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal ++ buf->offset = av_rpi_zc_offset(fr_buf); ++ buf->length = av_rpi_zc_length(fr_buf); ++ buf->alloc_size = av_rpi_zc_numbytes(fr_buf); ++ ++#if RPI_DISPLAY_ALL ++ while (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) { ++ usleep(5000); ++ } ++#endif ++ ++ { ++ 
MMAL_ES_SPECIFIC_FORMAT_T new_ess = {.video = {0}}; ++ MMAL_ES_FORMAT_T new_es = {.es = &new_ess}; ++ MMAL_VIDEO_FORMAT_T * const new_vfmt = &new_ess.video; ++ ++ video_format_from_zc_frame(&new_es, fr, fr_buf); ++ if (de->req_fmt != new_es.encoding || ++ de->req_vfmt.width != new_vfmt->width || ++ de->req_vfmt.height != new_vfmt->height || ++ de->req_vfmt.crop.x != new_vfmt->crop.x || ++ de->req_vfmt.crop.y != new_vfmt->crop.y || ++ de->req_vfmt.crop.width != new_vfmt->crop.width || ++ de->req_vfmt.crop.height != new_vfmt->crop.height) { ++ // Something has changed ++ ++ // If we have an ISP tear it down ++ isp_remove(s, de); ++ de->port_in = de->display->input[0]; ++ ++ // If we still need an ISP create it now ++ if (avfmt_needs_isp(fr->format)) ++ { ++ if (mmal_component_create("vc.ril.isp", &de->isp) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "ISP creation failed\n"); ++ goto fail; ++ } ++ de->port_in = de->isp->input[0]; ++ } ++ ++ mmal_format_copy(de->port_in->format, &new_es); ++ ++ if (mmal_port_format_commit(de->port_in)) { ++ av_log(s, AV_LOG_ERROR, "Failed to commit input format\n"); ++ goto fail; ++ } ++ ++ // If we have an ISP then we must want to use it ++ if (de->isp != NULL) { ++ MMAL_PORT_T * const port_out = de->isp->output[0]; ++ MMAL_VIDEO_FORMAT_T* vfmt_in = &de->port_in->format->es->video; ++ MMAL_VIDEO_FORMAT_T* vfmt_out = &port_out->format->es->video; ++ ++ port_out->format->type = MMAL_ES_TYPE_VIDEO; ++ port_out->format->encoding = MMAL_ENCODING_YUVUV128; ++ port_out->format->encoding_variant = 0; ++ port_out->format->bitrate = 0; ++ port_out->format->flags = 0; ++ port_out->format->extradata = NULL; ++ port_out->format->extradata_size = 0; ++ ++ vfmt_out->width = (vfmt_in->crop.width + 31) & ~31; ++ vfmt_out->height = (vfmt_in->crop.height + 15) & ~15; ++ vfmt_out->crop.x = 0; ++ vfmt_out->crop.y = 0; ++ vfmt_out->crop.width = vfmt_in->crop.width; ++ vfmt_out->crop.height = vfmt_in->crop.height; ++ vfmt_out->frame_rate = 
vfmt_in->frame_rate; ++ vfmt_out->par = vfmt_in->par; ++ vfmt_out->color_space = vfmt_in->color_space; ++ ++ if (mmal_port_format_commit(port_out)) { ++ av_log(s, AV_LOG_ERROR, "Failed to commit output format\n"); ++ goto fail; ++ } ++ ++ if (mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING) != MMAL_SUCCESS) { ++ av_log(s, AV_LOG_ERROR, "Failed to create connection\n"); ++ goto fail; ++ } ++ if (mmal_connection_enable(de->conn) != MMAL_SUCCESS) { ++ av_log(s, AV_LOG_ERROR, "Failed to enable connection\n"); ++ goto fail; ++ } ++ mmal_port_enable(de->isp->control,display_cb_control); ++ mmal_component_enable(de->isp); ++ } ++ ++ // Number of slots in my port Q ++ de->port_in->buffer_num = DISPLAY_PORT_DEPTH; ++ // Size to keep it happy - isn't used for anything other than error checking ++ de->port_in->buffer_size = buf->alloc_size; ++ if (!de->port_in->is_enabled) ++ { ++ mmal_port_parameter_set_boolean(de->port_in, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image? 
++ if (mmal_port_enable(de->port_in, display_cb_input) != MMAL_SUCCESS) { ++ av_log(s, AV_LOG_ERROR, "Failed to enable input port\n"); ++ goto fail; ++ } ++ } ++ ++ de->req_fmt = new_es.encoding; ++ de->req_vfmt = *new_vfmt; ++ } ++ } ++ ++ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count); ++ goto fail; ++ } ++ return; ++ ++fail: ++ // If we have a buf then fr_buf is held by that ++ if (buf != NULL) ++ mmal_buffer_header_release(buf); ++ else if (fr_buf != NULL) ++ av_rpi_zc_unref(fr_buf); ++} ++ ++ ++static int xv_write_trailer(AVFormatContext *s) ++{ ++ rpi_display_env_t * const de = s->priv_data; ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ if (de->port_in != NULL && de->port_in->is_enabled) { ++ mmal_port_disable(de->port_in); ++ } ++ ++ // The above disable should kick out all buffers - check that ++ if (atomic_load(&de->rpi_display_count) != 0) { ++ av_log(s, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count)); ++ } ++ ++ isp_remove(s, de); ++ if (de->rpi_pool != NULL) { ++ mmal_pool_destroy(de->rpi_pool); ++ de->rpi_pool = NULL; ++ } ++ if (de->display != NULL) { ++ mmal_component_destroy(de->display); ++ de->display = NULL; ++ } ++ ++ return 0; ++} ++ ++static int xv_write_header(AVFormatContext *s) ++{ ++ rpi_display_env_t * const de = s->priv_data; ++ const AVCodecParameters * const par = s->streams[0]->codecpar; ++ const unsigned int w = de->window_width ? de->window_width : par->width; ++ const unsigned int h = de->window_height ? de->window_height : par->height; ++ const unsigned int x = de->window_x; ++ const unsigned int y = de->window_y; ++ const int layer = de->layer ? 
de->layer : 2; ++ const MMAL_BOOL_T fullscreen = de->fullscreen; ++ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: %dx%d\n", __func__, w, h); ++#endif ++ if ( s->nb_streams > 1 ++ || par->codec_type != AVMEDIA_TYPE_VIDEO ++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) { ++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ { ++ MMAL_DISPLAYREGION_T region = ++ { ++ .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)}, ++ .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | ++ MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_ALPHA, ++ .layer = layer, ++ .fullscreen = fullscreen, ++ .dest_rect = {x, y, w, h}, ++ .alpha = !fullscreen ? 0xff : 0xff | MMAL_DISPLAY_ALPHA_FLAGS_DISCARD_LOWER_LAYERS, ++ }; ++ ++ bcm_host_init(); // Needs to be done by someone... ++ ++ if (mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to create display component\n"); ++ goto fail; ++ } ++ de->port_in = de->display->input[0]; ++ ++ mmal_port_parameter_set(de->display->input[0], &region.hdr); ++ ++ if (mmal_component_enable(de->display) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to enable display component\n"); ++ goto fail; ++ } ++ if (mmal_port_enable(de->display->control,display_cb_control) != MMAL_SUCCESS) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to enable display control port\n"); ++ goto fail; ++ } ++ ++ if ((de->rpi_pool = mmal_pool_create(DISPLAY_PORT_DEPTH, 0)) == NULL) ++ { ++ av_log(s, AV_LOG_ERROR, "Failed to create pool\n"); ++ goto fail; ++ } ++ } ++ ++ return 0; ++ ++fail: ++ xv_write_trailer(s); ++ return AVERROR_UNKNOWN; ++} ++ ++static int xv_write_packet(AVFormatContext *s, AVPacket *pkt) ++{ ++ AVFrame * const frame = (AVFrame *)pkt->data; ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s\n", __func__); ++#endif ++ display_frame(s, s->priv_data, frame); ++ return 0; ++} ++ ++static int 
xv_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe, ++ unsigned flags) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags); ++#endif ++ ++ /* xv_write_header() should have accepted only supported formats */ ++ if ((flags & AV_WRITE_UNCODED_FRAME_QUERY)) ++ return 0; ++// return write_picture(s, (*frame)->data, (*frame)->linesize); ++ ++ display_frame(s, s->priv_data, *ppframe); ++ return 0; ++} ++ ++static int xv_control_message(AVFormatContext *s, int type, void *data, size_t data_size) ++{ ++#if TRACE_ALL ++ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type); ++#endif ++ switch(type) { ++ case AV_APP_TO_DEV_WINDOW_REPAINT: ++ return 0; ++ default: ++ break; ++ } ++ return AVERROR(ENOSYS); ++} ++ ++// Muxer init: returns 0 on success or a negative AVERROR; deinit is called if init fails so no need to clean up explicitly here ++static int rpi_vout_init(struct AVFormatContext * s) ++{ ++ rpi_display_env_t * const de = s->priv_data; ++ ++ // Get a ZC context in case we need one - has little overhead if unused ++ if ((de->zc = av_rpi_zc_int_env_alloc(s)) == NULL) ++ return AVERROR(ENOMEM); ++ ++ return 0; ++} ++ ++static void rpi_vout_deinit(struct AVFormatContext * s) ++{ ++ rpi_display_env_t * const de = s->priv_data; ++ ++ av_rpi_zc_int_env_freep(&de->zc); ++} ++ ++ ++#define OFFSET(x) offsetof(rpi_display_env_t, x) ++static const AVOption options[] = { ++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "display_layer","set display layer", OFFSET(layer), AV_OPT_TYPE_INT, {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM }, ++ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), 
AV_OPT_TYPE_BOOL, {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM }, ++ { NULL } ++ ++}; ++ ++static const AVClass xv_class = { ++ .class_name = "rpi vid outdev", ++ .item_name = av_default_item_name, ++ .option = options, ++ .version = LIBAVUTIL_VERSION_INT, ++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT, ++}; ++ ++AVOutputFormat ff_vout_rpi_muxer = { ++ .name = "vout_rpi", ++ .long_name = NULL_IF_CONFIG_SMALL("Rpi (mmal) video output device"), ++ .priv_data_size = sizeof(rpi_display_env_t), ++ .audio_codec = AV_CODEC_ID_NONE, ++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME, ++ .write_header = xv_write_header, ++ .write_packet = xv_write_packet, ++ .write_uncoded_frame = xv_write_frame, ++ .write_trailer = xv_write_trailer, ++ .control_message = xv_control_message, ++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS, ++ .priv_class = &xv_class, ++ .init = rpi_vout_init, ++ .deinit = rpi_vout_deinit, ++}; +diff --git a/libavfilter/Makefile b/libavfilter/Makefile +index 5123540653..17ccea3150 100644 +--- a/libavfilter/Makefile ++++ b/libavfilter/Makefile +@@ -434,6 +434,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER) += vf_transpose_opencl.o opencl.o o + OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o + OBJS-$(CONFIG_TRIM_FILTER) += trim.o + OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o ++OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o + OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o + OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \ + opencl/unsharp.o +diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c +index 1183e40267..2f569057dd 100644 +--- a/libavfilter/allfilters.c ++++ b/libavfilter/allfilters.c +@@ -414,6 +414,7 @@ extern AVFilter ff_vf_transpose_opencl; + extern AVFilter ff_vf_transpose_vaapi; + extern AVFilter ff_vf_trim; + extern AVFilter ff_vf_unpremultiply; ++extern AVFilter ff_vf_unsand; + extern AVFilter ff_vf_unsharp; + extern AVFilter ff_vf_unsharp_opencl; + 
extern AVFilter ff_vf_untile; +diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c +index 2fe4f0b0f9..5a8e6b3f24 100644 +--- a/libavfilter/avfiltergraph.c ++++ b/libavfilter/avfiltergraph.c +@@ -32,6 +32,9 @@ + #include "libavutil/internal.h" + #include "libavutil/opt.h" + #include "libavutil/pixdesc.h" ++#if CONFIG_UNSAND_FILTER ++#include "libavutil/rpi_sand_fns.h" ++#endif + + #define FF_INTERNAL_FIELDS 1 + #include "framequeue.h" +@@ -429,6 +432,19 @@ static int can_merge_formats(AVFilterFormats *a_arg, + } + } + ++#if CONFIG_UNSAND_FILTER ++static int has_sand_format(const AVFilterFormats * const ff) ++{ ++ int i; ++ for (i = 0; i != ff->nb_formats; ++i) { ++ if (av_rpi_is_sand_format(ff->formats[i])) { ++ return 1; ++ } ++ } ++ return 0; ++} ++#endif ++ + /** + * Perform one round of query_formats() and merging formats lists on the + * filter graph. +@@ -469,6 +485,7 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + for (j = 0; j < filter->nb_inputs; j++) { + AVFilterLink *link = filter->inputs[j]; + int convert_needed = 0; ++ unsigned int extra_convert_tried = 0; + + if (!link) + continue; +@@ -516,11 +533,14 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + ) + #undef MERGE_DISPATCH + +- if (convert_needed) { ++ while (convert_needed) { + AVFilterContext *convert; + const AVFilter *filter; + AVFilterLink *inlink, *outlink; + char inst_name[30]; ++ int can_retry = 0; ++ ++ convert_needed = 0; + + if (graph->disable_auto_convert) { + av_log(log_ctx, AV_LOG_ERROR, +@@ -533,19 +553,45 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + /* couldn't merge format lists. 
auto-insert conversion filter */ + switch (link->type) { + case AVMEDIA_TYPE_VIDEO: +- if (!(filter = avfilter_get_by_name("scale"))) { +- av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " +- "not present, cannot convert pixel formats.\n"); +- return AVERROR(EINVAL); +- } +- +- snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", +- scaler_count++); ++#if CONFIG_UNSAND_FILTER ++ // Only try each extra conversion once ++ // The unsand output pad should never trigger has_sand_format ++ // but it is better to be safe ++ if ((extra_convert_tried & 1) == 0 && has_sand_format(link->in_formats)) { ++ if (!(filter = avfilter_get_by_name("unsand"))) { ++ av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter " ++ "not present, cannot convert pixel formats.\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d", ++ scaler_count++); ++ ++ if ((ret = avfilter_graph_create_filter(&convert, filter, ++ inst_name, "", NULL, ++ graph)) < 0) ++ return ret; + +- if ((ret = avfilter_graph_create_filter(&convert, filter, +- inst_name, graph->scale_sws_opts, NULL, +- graph)) < 0) +- return ret; ++ extra_convert_tried |= 1; ++ can_retry = 1; ++ } ++ else ++#endif ++ { ++ if (!(filter = avfilter_get_by_name("scale"))) { ++ av_log(log_ctx, AV_LOG_ERROR, "'scale' filter " ++ "not present, cannot convert pixel formats.\n"); ++ return AVERROR(EINVAL); ++ } ++ ++ snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d", ++ scaler_count++); ++ ++ if ((ret = avfilter_graph_create_filter(&convert, filter, ++ inst_name, graph->scale_sws_opts, NULL, ++ graph)) < 0) ++ return ret; ++ } + break; + case AVMEDIA_TYPE_AUDIO: + if (!(filter = avfilter_get_by_name("aresample"))) { +@@ -587,9 +633,19 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx) + av_assert0(outlink-> in_channel_layouts->refcount > 0); + av_assert0(outlink->out_channel_layouts->refcount > 0); + } +- if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type) || +- 
!ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type)) ++ // If we have added an extra filter we must merge the input ++ // side but we can have another go at the output ++ if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type)) ++ ret = AVERROR(ENOSYS); ++ else if (!ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type)) ++ { ++ if (can_retry) { ++ link = outlink; ++ convert_needed = 1; ++ continue; ++ } + ret = AVERROR(ENOSYS); ++ } + if (inlink->type == AVMEDIA_TYPE_AUDIO && + (!ff_merge_samplerates(inlink->in_samplerates, + inlink->out_samplerates) || +diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c +index bf30f54177..eb5dfa22f8 100644 +--- a/libavfilter/buffersrc.c ++++ b/libavfilter/buffersrc.c +@@ -210,7 +210,7 @@ static int av_buffersrc_add_frame_internal(AVFilterContext *ctx, + + switch (ctx->outputs[0]->type) { + case AVMEDIA_TYPE_VIDEO: +- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height, ++ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame), + frame->format, frame->pts); + break; + case AVMEDIA_TYPE_AUDIO: +diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c +new file mode 100644 +index 0000000000..fbea56dd09 +--- /dev/null ++++ b/libavfilter/vf_unsand.c +@@ -0,0 +1,234 @@ ++/* ++ * Copyright (c) 2007 Bobby Bingham ++ * ++ * This file is part of FFmpeg. ++ * ++ * FFmpeg is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU Lesser General Public ++ * License as published by the Free Software Foundation; either ++ * version 2.1 of the License, or (at your option) any later version. ++ * ++ * FFmpeg is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * Lesser General Public License for more details. 
++ * ++ * You should have received a copy of the GNU Lesser General Public ++ * License along with FFmpeg; if not, write to the Free Software ++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ++ */ ++ ++/** ++ * @file ++ * format and noformat video filters ++ */ ++ ++#include ++ ++#include "libavutil/internal.h" ++#include "libavutil/mem.h" ++#include "libavutil/pixdesc.h" ++#include "libavutil/opt.h" ++#include "libavutil/rpi_sand_fns.h" ++ ++#include "avfilter.h" ++#include "formats.h" ++#include "internal.h" ++#include "video.h" ++ ++typedef struct UnsandContext { ++ const AVClass *class; ++} UnsandContext; ++ ++static av_cold void uninit(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++} ++ ++static av_cold int init(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++ ++ return 0; ++} ++ ++ ++static int filter_frame(AVFilterLink *link, AVFrame *in) ++{ ++ AVFilterLink * const outlink = link->dst->outputs[0]; ++ AVFrame *out = NULL; ++ int rv = 0; ++ ++ if (outlink->format == in->format) { ++ // If nothing to do then do nothing ++ out = in; ++ } ++ else ++ { ++ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL) ++ { ++ rv = AVERROR(ENOMEM); ++ goto fail; ++ } ++ if (av_rpi_sand_to_planar_frame(out, in) != 0) ++ { ++ rv = -1; ++ goto fail; ++ } ++ ++ av_frame_free(&in); ++ } ++ ++ return ff_filter_frame(outlink, out); ++ ++fail: ++ av_frame_free(&out); ++ av_frame_free(&in); ++ return rv; ++} ++ ++#if 0 ++static void dump_fmts(const AVFilterFormats * fmts) ++{ ++ int i; ++ if (fmts== NULL) { ++ printf("NULL\n"); ++ return; ++ } ++ for (i = 0; i < fmts->nb_formats; ++i) { ++ printf(" %d", fmts->formats[i]); ++ } ++ printf("\n"); ++} ++#endif ++ ++static int query_formats(AVFilterContext *ctx) ++{ ++// UnsandContext *s = ctx->priv; ++ int ret; ++ ++ // If we aren't connected at both ends then just do nothing ++ if (ctx->inputs[0] == NULL || 
ctx->outputs[0] == NULL) ++ return 0; ++ ++// printf("Unsand: %s in: ", __func__); ++// dump_fmts(ctx->inputs[0]->in_formats); ++// printf("Unsand: %s out: ", __func__); ++// dump_fmts(ctx->outputs[0]->out_formats); ++ ++ // Our output formats depend on our input formats and we can't/don't ++ // want to convert between bit depths so we need to wait for the source ++ // to have an opinion before we do ++ if (ctx->inputs[0]->in_formats == NULL) ++ return AVERROR(EAGAIN); ++ ++ // Accept anything ++ if (ctx->inputs[0]->out_formats == NULL && ++ (ret = ff_formats_ref(ctx->inputs[0]->in_formats, &ctx->inputs[0]->out_formats)) < 0) ++ return ret; ++ ++ // Filter out sand formats ++ ++ // Generate a container if we don't already have one ++ if (ctx->outputs[0]->in_formats == NULL) ++ { ++ // Somewhat rubbish way of ensuring we have a good structure ++ const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE}; ++ AVFilterFormats *formats = ff_make_format_list(out_fmts); ++ ++ if (formats == NULL) ++ return AVERROR(ENOMEM); ++ if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->in_formats)) < 0) ++ return ret; ++ } ++ ++ // Replace old format list with new filtered list derived from what our ++ // input says it can do ++ { ++ const AVFilterFormats * const src_ff = ctx->inputs[0]->out_formats; ++ AVFilterFormats * const dst_ff = ctx->outputs[0]->in_formats; ++ enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats); ++ int i; ++ int n = 0; ++ int seen_420p = 0; ++ int seen_420p10 = 0; ++ if (dst_fmts == NULL) return AVERROR(ENOMEM); /* allocation failed: bail out before the list is written */ ++ for (i = 0; i < src_ff->nb_formats; ++i) { ++ const enum AVPixelFormat f = src_ff->formats[i]; ++ ++ switch (f){ ++ case AV_PIX_FMT_YUV420P: ++ case AV_PIX_FMT_SAND128: ++ case AV_PIX_FMT_RPI4_8: ++ if (!seen_420p) { ++ seen_420p = 1; ++ dst_fmts[n++] = AV_PIX_FMT_YUV420P; ++ } ++ break; ++ case AV_PIX_FMT_SAND64_10: ++ case AV_PIX_FMT_YUV420P10: ++ case AV_PIX_FMT_RPI4_10: ++ if (!seen_420p10) { ++ 
seen_420p10 = 1; ++ dst_fmts[n++] = AV_PIX_FMT_YUV420P10; ++ } ++ break; ++ default: ++ dst_fmts[n++] = f; ++ break; ++ } ++ } ++ ++ av_freep(&dst_ff->formats); ++ dst_ff->formats = dst_fmts; ++ dst_ff->nb_formats = n; ++ } ++ ++// printf("Unsand: %s calc: ", __func__); ++// dump_fmts(ctx->outputs[0]->in_formats); ++ ++ return 0; ++} ++ ++ ++#define OFFSET(x) offsetof(UnsandContext, x) ++static const AVOption unsand_options[] = { ++ { NULL } ++}; ++ ++ ++AVFILTER_DEFINE_CLASS(unsand); ++ ++static const AVFilterPad avfilter_vf_unsand_inputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO, ++ .filter_frame = filter_frame, ++ }, ++ { NULL } ++}; ++ ++static const AVFilterPad avfilter_vf_unsand_outputs[] = { ++ { ++ .name = "default", ++ .type = AVMEDIA_TYPE_VIDEO ++ }, ++ { NULL } ++}; ++ ++AVFilter ff_vf_unsand = { ++ .name = "unsand", ++ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"), ++ ++ .init = init, ++ .uninit = uninit, ++ ++ .query_formats = query_formats, ++ ++ .priv_size = sizeof(UnsandContext), ++ .priv_class = &unsand_class, ++ ++ .inputs = avfilter_vf_unsand_inputs, ++ .outputs = avfilter_vf_unsand_outputs, ++}; ++ +diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c +index 1da81a0fe6..089ace9f36 100644 +--- a/libavformat/mpegts.c ++++ b/libavformat/mpegts.c +@@ -2352,7 +2352,7 @@ static void pmt_cb(MpegTSFilter *filter, const uint8_t *section, int section_len + goto out; + + // stop parsing after pmt, we found header +- if (!ts->pkt) ++ if (!ts->stream->nb_streams) + ts->stop_parse = 2; + + set_pmt_found(ts, h->id); +@@ -2562,6 +2562,10 @@ static void eit_cb(MpegTSFilter *filter, const uint8_t *section, int section_len + const uint8_t *p, *p_end; + SectionHeader h1, *h = &h1; + ++ // Something in kodi breaks with seeking when EIT EPG data is included in the stream ++ // As we figure this out lets just skip any EIT data. 
++ return; ++ + /* + * Sometimes we receive EPG packets but SDT table do not have + * eit_pres_following or eit_sched turned on, so we open EPG +diff --git a/libavformat/utils.c b/libavformat/utils.c +index ba8aaebfb7..4c7bd7f5e1 100644 +--- a/libavformat/utils.c ++++ b/libavformat/utils.c +@@ -3044,6 +3044,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr) + return 1; + } + ++#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER ++// This should be quite general purpose but avoid possible conflicts ++// by limiting usage to cases wehere we know it works. ++static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts) ++{ ++ // Only try fallback if we know it is supported (HEVC only) ++ const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL : ++ avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE); ++ int err; ++ ++ // Failed to find fallback or we are already at the fallback ++ if (new_codec == NULL || new_codec == old_codec) ++ { ++ return AVERROR_DECODER_NOT_FOUND; ++ } ++ ++ // * This may be dodgy - header says to not use this fn, ++ // especially if we are going to reopen the context... ++ // (but it does seem to work for our cases) ++ if (avcodec_is_open(avctx)) { ++ avcodec_close(avctx); ++ } ++ ++ if ((err = avcodec_open2(avctx, new_codec, opts)) < 0) ++ { ++ return err; ++ } ++ ++ return 0; ++} ++#else ++#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND) ++#endif ++ + /* returns 1 or 0 if or if not decoded data was returned, or a negative error */ + static int try_decode_frame(AVFormatContext *s, AVStream *st, + const AVPacket *avpkt, AVDictionary **options) +@@ -3078,7 +3112,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, + av_dict_set(options ? options : &thread_opt, "threads", "1", 0); + if (s->codec_whitelist) + av_dict_set(options ? 
options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0); +- ret = avcodec_open2(avctx, codec, options ? options : &thread_opt); ++ if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND) ++ { ++ // Try fallback if it looks worth a try ++ ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt); ++ } + if (!options) + av_dict_free(&thread_opt); + if (ret < 0) { +@@ -3109,6 +3147,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, + if (avctx->codec_type == AVMEDIA_TYPE_VIDEO || + avctx->codec_type == AVMEDIA_TYPE_AUDIO) { + ret = avcodec_send_packet(avctx, &pkt); ++ ++ // If we are going to want to fall back we should know here ++ if (ret == AVERROR_DECODER_NOT_FOUND) { ++ if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0) ++ break; ++ continue; ++ } ++ + if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) + break; + if (ret >= 0) +@@ -3719,9 +3765,20 @@ FF_ENABLE_DEPRECATION_WARNINGS + // Try to just open decoders, in case this is enough to get parameters. + if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) { + if (codec && !avctx->codec) +- if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0) +- av_log(ic, AV_LOG_WARNING, +- "Failed to open codec in %s\n",__FUNCTION__); ++ { ++ int err; ++ ++ if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0) ++ { ++ if (err == AVERROR_DECODER_NOT_FOUND) { ++ err = try_fallback_decoder(avctx, codec, options ? 
&options[i] : &thread_opt); ++ } ++ if (err < 0) { ++ av_log(ic, AV_LOG_WARNING, ++ "Failed to open codec in %s\n",__FUNCTION__); ++ } ++ } ++ } + } + if (!options) + av_dict_free(&thread_opt); +diff --git a/libavutil/Makefile b/libavutil/Makefile +index 9b08372eb2..b0b5be0fa6 100644 +--- a/libavutil/Makefile ++++ b/libavutil/Makefile +@@ -68,6 +68,7 @@ HEADERS = adler32.h \ + rational.h \ + replaygain.h \ + ripemd.h \ ++ rpi_sand_fns.h \ + samplefmt.h \ + sha.h \ + sha512.h \ +@@ -86,6 +87,7 @@ HEADERS = adler32.h \ + tx.h \ + + HEADERS-$(CONFIG_LZO) += lzo.h ++HEADERS-$(CONFIG-RPI) += rpi_sand_fn_pw.h + + ARCH_HEADERS = bswap.h \ + intmath.h \ +@@ -180,10 +182,12 @@ OBJS-$(CONFIG_LZO) += lzo.o + OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o + OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o + OBJS-$(CONFIG_QSV) += hwcontext_qsv.o ++OBJS-$(CONFIG_SAND) += rpi_sand_fns.o + OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o + OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o + OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o + OBJS-$(CONFIG_VULKAN) += hwcontext_vulkan.o ++OBJS-$(CONFIG_RPI) += rpi_sand_fns.o + + OBJS += $(COMPAT_OBJS:%=../compat/%) + +diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile +index 5da44b0542..b74b7c4e2f 100644 +--- a/libavutil/arm/Makefile ++++ b/libavutil/arm/Makefile +@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \ + + NEON-OBJS += arm/float_dsp_init_neon.o \ + arm/float_dsp_neon.o \ ++ arm/rpi_sand_neon.o \ +diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S +new file mode 100644 +index 0000000000..80890fe985 +--- /dev/null ++++ b/libavutil/arm/rpi_sand_neon.S +@@ -0,0 +1,768 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. 
++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox ++*/ ++ ++#include "libavutil/arm/asm.S" ++ ++ ++@ General notes: ++@ Having done some timing on this in sand8->y8 (Pi4) ++@ vst1 (680fps) is a bit faster than vstm (660fps) ++@ vldm (680fps) is noticeably faster than vld1 (480fps) ++@ (or it might be that a mix is what is required) ++@ ++@ At least on a Pi4 it is no more expensive to have a single auto-inc register ++@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted ++@ the latter was better) ++@ ++@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless ++@ the memory is uncached. ++@ As these are Sand -> planar we can assume that src is going to be aligned but ++@ it is possible that dest isn't (converting to .yuv or other packed format). ++@ Luckily vst1 is faster than vstm :-) so all is well ++@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4 ++@ .8 stores would let us do non-word aligned stores into uncached but it ++@ probably isn't worth it. 
++ ++ ++ ++ ++@ void ff_rpi_sand128b_stripe_to_8_10( ++@ uint8_t * dest, // [r0] ++@ const uint8_t * src1, // [r1] ++@ const uint8_t * src2, // [r2] ++@ unsigned int lines); // [r3] ++ ++.macro stripe2_to_8, bit_depth ++ vpush {q4-q7} ++1: ++ vldm r1!, {q0-q7} ++ subs r3, #1 ++ vldm r2!, {q8-q15} ++ vqrshrn.u16 d0, q0, #\bit_depth - 8 ++ vqrshrn.u16 d1, q1, #\bit_depth - 8 ++ vqrshrn.u16 d2, q2, #\bit_depth - 8 ++ vqrshrn.u16 d3, q3, #\bit_depth - 8 ++ vqrshrn.u16 d4, q4, #\bit_depth - 8 ++ vqrshrn.u16 d5, q5, #\bit_depth - 8 ++ vqrshrn.u16 d6, q6, #\bit_depth - 8 ++ vqrshrn.u16 d7, q7, #\bit_depth - 8 ++ vqrshrn.u16 d8, q8, #\bit_depth - 8 ++ vqrshrn.u16 d9, q9, #\bit_depth - 8 ++ vqrshrn.u16 d10, q10, #\bit_depth - 8 ++ vqrshrn.u16 d11, q11, #\bit_depth - 8 ++ vqrshrn.u16 d12, q12, #\bit_depth - 8 ++ vqrshrn.u16 d13, q13, #\bit_depth - 8 ++ vqrshrn.u16 d14, q14, #\bit_depth - 8 ++ vqrshrn.u16 d15, q15, #\bit_depth - 8 ++ vstm r0!, {q0-q7} ++ bne 1b ++ vpop {q4-q7} ++ bx lr ++.endm ++ ++function ff_rpi_sand128b_stripe_to_8_10, export=1 ++ stripe2_to_8 10 ++endfunc ++ ++@ void ff_rpi_sand8_lines_to_planar_y8( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. 
However it does respect the dest size for writing ++ ++function ff_rpi_sand8_lines_to_planar_y8, export=1 ++ push {r4-r8, lr} @ +24 L ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ lsl r3, #7 ++ sub r1, r6 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++ mov lr, #0 ++1: ++ vldm r2, {q8-q15} ++ add r2, r3 ++ subs r5, #128 ++ blt 2f ++ vst1.8 {d16, d17, d18, d19}, [r0]! ++ vst1.8 {d20, d21, d22, d23}, [r0]! ++ vst1.8 {d24, d25, d26, d27}, [r0]! ++ vst1.8 {d28, d29, d30, d31}, [r0]! ++ bne 1b ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #64-128 ++ blt 1f ++ vst1.8 {d16, d17, d18, d19}, [r0]! ++ vst1.8 {d20, d21, d22, d23}, [r0]! ++ beq 11b ++ vmov q8, q12 ++ vmov q9, q13 ++ sub r5, #64 ++ vmov q10, q14 ++ vmov q11, q15 ++1: ++ cmp r5, #32-128 ++ blt 1f ++ vst1.8 {d16, d17, d18, d19}, [r0]! ++ beq 11b ++ vmov q8, q10 ++ sub r5, #32 ++ vmov q9, q11 ++1: ++ cmp r5, #16-128 ++ blt 1f ++ vst1.8 {d16, d17}, [r0]! ++ beq 11b ++ sub r5, #16 ++ vmov q8, q9 ++1: ++ cmp r5, #8-128 ++ blt 1f ++ vst1.8 {d16}, [r0]! ++ beq 11b ++ sub r5, #8 ++ vmov d16, d17 ++1: ++ cmp r5, #4-128 ++ blt 1f ++ vst1.32 {d16[0]}, [r0]! ++ beq 11b ++ sub r5, #4 ++ vshr.u64 d16, #32 ++1: ++ cmp r5, #2-128 ++ blt 1f ++ vst1.16 {d16[0]}, [r0]! ++ beq 11b ++ vst1.8 {d16[2]}, [r0]! ++ b 11b ++1: ++ vst1.8 {d16[0]}, [r0]! 
++ b 11b ++endfunc ++ ++@ void ff_rpi_sand8_lines_to_planar_c8( ++@ uint8_t * dst_u, // [r0] ++@ unsigned int dst_stride_u, // [r1] ++@ uint8_t * dst_v, // [r2] ++@ unsigned int dst_stride_v, // [r3] ++@ const uint8_t * src, // [sp, #0] -> r4, r5 ++@ unsigned int stride1, // [sp, #4] 128 ++@ unsigned int stride2, // [sp, #8] -> r8 ++@ unsigned int _x, // [sp, #12] 0 ++@ unsigned int y, // [sp, #16] (r7 in prefix) ++@ unsigned int _w, // [sp, #20] -> r12, r6 ++@ unsigned int h); // [sp, #24] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. However it does respect the dest size for writing ++ ++function ff_rpi_sand8_lines_to_planar_c8, export=1 ++ push {r4-r8, lr} @ +24 ++ ++ ldr r5, [sp, #24] ++ ldr r8, [sp, #32] ++ ldr r7, [sp, #40] ++ ldr r6, [sp, #44] ++ lsl r8, #7 ++ add r5, r5, r7, lsl #7 ++ sub r1, r1, r6 ++ sub r3, r3, r6 ++ ldr r7, [sp, #48] ++ vpush {q4-q7} ++ ++10: ++ mov r4, r5 ++ mov r12, r6 ++1: ++ subs r12, #64 ++ vldm r4, {q0-q7} ++ add r4, r8 ++ it gt ++ vldmgt r4, {q8-q15} ++ add r4, r8 ++ ++ vuzp.8 q0, q1 ++ vuzp.8 q2, q3 ++ vuzp.8 q4, q5 ++ vuzp.8 q6, q7 ++ ++ vuzp.8 q8, q9 ++ vuzp.8 q10, q11 ++ vuzp.8 q12, q13 ++ vuzp.8 q14, q15 ++ subs r12, #64 ++ ++ @ Rearrange regs so we can use vst1 with 4 regs ++ vswp q1, q2 ++ vswp q5, q6 ++ vswp q9, q10 ++ vswp q13, q14 ++ blt 2f ++ ++ vst1.8 {d0, d1, d2, d3 }, [r0]! ++ vst1.8 {d8, d9, d10, d11}, [r0]! ++ vst1.8 {d16, d17, d18, d19}, [r0]! ++ vst1.8 {d24, d25, d26, d27}, [r0]! ++ ++ vst1.8 {d4, d5, d6, d7 }, [r2]! ++ vst1.8 {d12, d13, d14, d15}, [r2]! ++ vst1.8 {d20, d21, d22, d23}, [r2]! ++ vst1.8 {d28, d29, d30, d31}, [r2]! ++ bne 1b ++11: ++ subs r7, #1 ++ add r5, #128 ++ add r0, r1 ++ add r2, r3 ++ bne 10b ++ vpop {q4-q7} ++ pop {r4-r8,pc} ++ ++2: ++ cmp r12, #64-128 ++ blt 1f ++ vst1.8 {d0, d1, d2, d3 }, [r0]! ++ vst1.8 {d8, d9, d10, d11}, [r0]! ++ vst1.8 {d4, d5, d6, d7 }, [r2]! ++ vst1.8 {d12, d13, d14, d15}, [r2]! 
++ beq 11b ++ sub r12, #64 ++ vmov q0, q8 ++ vmov q1, q9 ++ vmov q2, q10 ++ vmov q3, q11 ++ vmov q4, q12 ++ vmov q5, q13 ++ vmov q6, q14 ++ vmov q7, q15 ++1: ++ cmp r12, #32-128 ++ blt 1f ++ vst1.8 {d0, d1, d2, d3 }, [r0]! ++ vst1.8 {d4, d5, d6, d7 }, [r2]! ++ beq 11b ++ sub r12, #32 ++ vmov q0, q4 ++ vmov q1, q5 ++ vmov q2, q6 ++ vmov q3, q7 ++1: ++ cmp r12, #16-128 ++ blt 1f ++ vst1.8 {d0, d1 }, [r0]! ++ vst1.8 {d4, d5 }, [r2]! ++ beq 11b ++ sub r12, #16 ++ vmov q0, q1 ++ vmov q2, q3 ++1: ++ cmp r12, #8-128 ++ blt 1f ++ vst1.8 {d0}, [r0]! ++ vst1.8 {d4}, [r2]! ++ beq 11b ++ sub r12, #8 ++ vmov d0, d1 ++ vmov d4, d5 ++1: ++ cmp r12, #4-128 ++ blt 1f ++ vst1.32 {d0[0]}, [r0]! ++ vst1.32 {d4[0]}, [r2]! ++ beq 11b ++ sub r12, #4 ++ vmov s0, s1 ++ vmov s8, s9 ++1: ++ cmp r12, #2-128 ++ blt 1f ++ vst1.16 {d0[0]}, [r0]! ++ vst1.16 {d4[0]}, [r2]! ++ beq 11b ++ vst1.8 {d0[2]}, [r0]! ++ vst1.8 {d4[2]}, [r2]! ++ b 11b ++1: ++ vst1.8 {d0[0]}, [r0]! ++ vst1.8 {d4[0]}, [r2]! ++ b 11b ++endfunc ++ ++ ++ ++@ void ff_rpi_sand30_lines_to_planar_y16( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. 
However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_y16, export=1 ++ push {r4-r8, lr} @ +24 ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ mov r12, #48 ++ vmov.u16 q15, #0x3ff ++ sub r3, #1 ++ lsl r3, #7 ++ sub r1, r1, r6, lsl #1 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++ mov lr, #0 ++1: ++ vldm r2!, {q10-q13} ++ add lr, #64 ++ ++ vshr.u32 q14, q10, #20 @ Cannot vshrn.u32 #20! ++ ands lr, #127 ++ vshrn.u32 d2, q10, #10 ++ vmovn.u32 d0, q10 ++ vmovn.u32 d4, q14 ++ ++ vshr.u32 q14, q11, #20 ++ it eq ++ addeq r2, r3 ++ vshrn.u32 d3, q11, #10 ++ vmovn.u32 d1, q11 ++ vmovn.u32 d5, q14 ++ ++ subs r5, #48 ++ vand q0, q15 ++ vand q1, q15 ++ vand q2, q15 ++ ++ vshr.u32 q14, q12, #20 ++ vshrn.u32 d18, q12, #10 ++ vmovn.u32 d16, q12 ++ vmovn.u32 d20, q14 ++ ++ vshr.u32 q14, q13, #20 ++ vshrn.u32 d19, q13, #10 ++ vmovn.u32 d17, q13 ++ vmovn.u32 d21, q14 ++ ++ vand q8, q15 ++ vand q9, q15 ++ vand q10, q15 ++ blt 2f ++ ++ vst3.16 {d0, d2, d4}, [r0], r12 ++ vst3.16 {d1, d3, d5}, [r4], r12 ++ vst3.16 {d16, d18, d20}, [r0], r12 ++ vst3.16 {d17, d19, d21}, [r4], r12 ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #24-48 ++ blt 1f ++ vst3.16 {d0, d2, d4}, [r0], r12 ++ vst3.16 {d1, d3, d5}, [r4] ++ beq 11b ++ vmov q0, q8 ++ sub r5, #24 ++ vmov q1, q9 ++ vmov q2, q10 ++1: ++ cmp r5, #12-48 ++ blt 1f ++ vst3.16 {d0, d2, d4}, [r0]! ++ beq 11b ++ vmov d0, d1 ++ sub r5, #12 ++ vmov d2, d3 ++ vmov d4, d5 ++1: ++ cmp r5, #6-48 ++ add r4, r0, #6 @ avoid [r0]! on sequential instructions ++ blt 1f ++ vst3.16 {d0[0], d2[0], d4[0]}, [r0] ++ vst3.16 {d0[1], d2[1], d4[1]}, [r4] ++ add r0, #12 ++ beq 11b ++ vmov s0, s1 ++ sub r5, #6 ++ vmov s4, s5 ++ vmov s8, s9 ++1: ++ cmp r5, #3-48 ++ blt 1f ++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]! 
++ beq 11b ++ sub r5, #3 ++ vshr.u32 d0, #16 ++ vshr.u32 d2, #16 ++1: ++ cmp r5, #2-48 ++ blt 1f ++ vst2.16 {d0[0], d2[0]}, [r0]! ++ b 11b ++1: ++ vst1.16 {d0[0]}, [r0]! ++ b 11b ++ ++endfunc ++ ++ ++@ void ff_rpi_sand30_lines_to_planar_c16( ++@ uint8_t * dst_u, // [r0] ++@ unsigned int dst_stride_u, // [r1] ++@ uint8_t * dst_v, // [r2] ++@ unsigned int dst_stride_v, // [r3] ++@ const uint8_t * src, // [sp, #0] -> r4, r5 ++@ unsigned int stride1, // [sp, #4] 128 ++@ unsigned int stride2, // [sp, #8] -> r8 ++@ unsigned int _x, // [sp, #12] 0 ++@ unsigned int y, // [sp, #16] (r7 in prefix) ++@ unsigned int _w, // [sp, #20] -> r6, r9 ++@ unsigned int h); // [sp, #24] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_c16, export=1 ++ push {r4-r10, lr} @ +32 ++ ldr r5, [sp, #32] ++ ldr r8, [sp, #40] ++ ldr r7, [sp, #48] ++ ldr r9, [sp, #52] ++ mov r12, #48 ++ vmov.u16 q15, #0x3ff ++ sub r8, #1 ++ lsl r8, #7 ++ add r5, r5, r7, lsl #7 ++ sub r1, r1, r9, lsl #1 ++ sub r3, r3, r9, lsl #1 ++ ldr r7, [sp, #56] ++10: ++ mov lr, #0 ++ mov r4, r5 ++ mov r6, r9 ++1: ++ vldm r4!, {q0-q3} ++ add lr, #64 ++ ++ @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2 ++ vshr.u32 q14, q0, #20 ++ vshrn.u32 d16, q0, #10 ++ vmovn.u32 d18, q0 ++ ands lr, #127 ++ vmovn.u32 d20, q14 ++ ++ vshr.u32 q14, q1, #20 ++ vshrn.u32 d17, q1, #10 ++ vmovn.u32 d19, q1 ++ vmovn.u32 d21, q14 ++ ++ vshr.u32 q14, q2, #20 ++ vshrn.u32 d22, q2, #10 ++ vmovn.u32 d24, q2 ++ vmovn.u32 d26, q14 ++ ++ vshr.u32 q14, q3, #20 ++ vshrn.u32 d23, q3, #10 ++ vmovn.u32 d25, q3 ++ add r10, r0, #24 ++ vmovn.u32 d27, q14 ++ ++ it eq ++ addeq r4, r8 ++ vuzp.16 q8, q11 ++ vuzp.16 q9, q12 ++ vuzp.16 q10, q13 ++ ++ @ q8 V0, V3,.. -> q0 ++ @ q9 U0, U3... ++ @ q10 U1, U4... ++ @ q11 U2, U5,.. ++ @ q12 V1, V4,.. -> q1 ++ @ q13 V2, V5,.. 
-> q2 ++ ++ subs r6, #24 ++ vand q11, q15 ++ vand q9, q15 ++ vand q10, q15 ++ vand q0, q8, q15 ++ vand q1, q12, q15 ++ vand q2, q13, q15 ++ ++ blt 2f ++ ++ vst3.16 {d18, d20, d22}, [r0], r12 ++ vst3.16 {d19, d21, d23}, [r10] ++ add r10, r2, #24 ++ vst3.16 {d0, d2, d4}, [r2], r12 ++ vst3.16 {d1, d3, d5}, [r10] ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r5, #128 ++ add r0, r1 ++ add r2, r3 ++ bne 10b ++ ++ pop {r4-r10, pc} ++ ++@ Partial final write ++2: ++ cmp r6, #-12 ++ blt 1f ++ vst3.16 {d18, d20, d22}, [r0]! ++ vst3.16 {d0, d2, d4}, [r2]! ++ beq 11b ++ vmov d18, d19 ++ vmov d20, d21 ++ vmov d22, d23 ++ sub r6, #12 ++ vmov d0, d1 ++ vmov d2, d3 ++ vmov d4, d5 ++1: ++ cmp r6, #-18 ++ @ Rezip here as it makes the remaining tail handling easier ++ vzip.16 d0, d18 ++ vzip.16 d2, d20 ++ vzip.16 d4, d22 ++ blt 1f ++ vst3.16 {d0[1], d2[1], d4[1]}, [r0]! ++ vst3.16 {d0[0], d2[0], d4[0]}, [r2]! ++ vst3.16 {d0[3], d2[3], d4[3]}, [r0]! ++ vst3.16 {d0[2], d2[2], d4[2]}, [r2]! ++ beq 11b ++ vmov d0, d18 ++ vmov d2, d20 ++ sub r6, #6 ++ vmov d4, d22 ++1: ++ cmp r6, #-21 ++ blt 1f ++ vst3.16 {d0[1], d2[1], d4[1]}, [r0]! ++ vst3.16 {d0[0], d2[0], d4[0]}, [r2]! ++ beq 11b ++ vmov s4, s5 ++ sub r6, #3 ++ vmov s0, s1 ++1: ++ cmp r6, #-22 ++ blt 1f ++ vst2.16 {d0[1], d2[1]}, [r0]! ++ vst2.16 {d0[0], d2[0]}, [r2]! ++ b 11b ++1: ++ vst1.16 {d0[1]}, [r0]! ++ vst1.16 {d0[0]}, [r2]! ++ b 11b ++ ++endfunc ++ ++@ void ff_rpi_sand30_lines_to_planar_p010( ++@ uint8_t * dest, // [r0] ++@ unsigned int dst_stride, // [r1] ++@ const uint8_t * src, // [r2] ++@ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++@ unsigned int src_stride2, // [sp, #0] -> r3 ++@ unsigned int _x, // [sp, #4] Ignored - 0 ++@ unsigned int y, // [sp, #8] (r7 in prefix) ++@ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++@ unsigned int h); // [sp, #16] -> r7 ++@ ++@ Assumes that we are starting on a stripe boundary and that overreading ++@ within the stripe is OK. 
However it does respect the dest size for writing ++ ++function ff_rpi_sand30_lines_to_planar_p010, export=1 ++ push {r4-r8, lr} @ +24 ++ ldr r3, [sp, #24] ++ ldr r6, [sp, #36] ++ ldr r7, [sp, #32] @ y ++ mov r12, #48 ++ vmov.u16 q15, #0xffc0 ++ sub r3, #1 ++ lsl r3, #7 ++ sub r1, r1, r6, lsl #1 ++ add r8, r2, r7, lsl #7 ++ ldr r7, [sp, #40] ++ ++10: ++ mov r2, r8 ++ add r4, r0, #24 ++ mov r5, r6 ++ mov lr, #0 ++1: ++ vldm r2!, {q10-q13} ++ add lr, #64 ++ ++ vshl.u32 q14, q10, #6 ++ ands lr, #127 ++ vshrn.u32 d4, q10, #14 ++ vshrn.u32 d2, q10, #4 ++ vmovn.u32 d0, q14 ++ ++ vshl.u32 q14, q11, #6 ++ it eq ++ addeq r2, r3 ++ vshrn.u32 d5, q11, #14 ++ vshrn.u32 d3, q11, #4 ++ vmovn.u32 d1, q14 ++ ++ subs r5, #48 ++ vand q2, q15 ++ vand q1, q15 ++ vand q0, q15 ++ ++ vshl.u32 q14, q12, #6 ++ vshrn.u32 d20, q12, #14 ++ vshrn.u32 d18, q12, #4 ++ vmovn.u32 d16, q14 ++ ++ vshl.u32 q14, q13, #6 ++ vshrn.u32 d21, q13, #14 ++ vshrn.u32 d19, q13, #4 ++ vmovn.u32 d17, q14 ++ ++ vand q10, q15 ++ vand q9, q15 ++ vand q8, q15 ++ blt 2f ++ ++ vst3.16 {d0, d2, d4}, [r0], r12 ++ vst3.16 {d1, d3, d5}, [r4], r12 ++ vst3.16 {d16, d18, d20}, [r0], r12 ++ vst3.16 {d17, d19, d21}, [r4], r12 ++ ++ bne 1b ++ ++11: ++ subs r7, #1 ++ add r0, r1 ++ add r8, #128 ++ bne 10b ++ ++ pop {r4-r8, pc} ++ ++@ Partial final write ++2: ++ cmp r5, #24-48 ++ blt 1f ++ vst3.16 {d0, d2, d4}, [r0], r12 ++ vst3.16 {d1, d3, d5}, [r4] ++ beq 11b ++ vmov q0, q8 ++ sub r5, #24 ++ vmov q1, q9 ++ vmov q2, q10 ++1: ++ cmp r5, #12-48 ++ blt 1f ++ vst3.16 {d0, d2, d4}, [r0]! ++ beq 11b ++ vmov d0, d1 ++ sub r5, #12 ++ vmov d2, d3 ++ vmov d4, d5 ++1: ++ cmp r5, #6-48 ++ add r4, r0, #6 @ avoid [r0]! on sequential instructions ++ blt 1f ++ vst3.16 {d0[0], d2[0], d4[0]}, [r0] ++ vst3.16 {d0[1], d2[1], d4[1]}, [r4] ++ add r0, #12 ++ beq 11b ++ vmov s0, s1 ++ sub r5, #6 ++ vmov s4, s5 ++ vmov s8, s9 ++1: ++ cmp r5, #3-48 ++ blt 1f ++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]! 
++ beq 11b ++ sub r5, #3 ++ vshr.u32 d0, #16 ++ vshr.u32 d2, #16 ++1: ++ cmp r5, #2-48 ++ blt 1f ++ vst2.16 {d0[0], d2[0]}, [r0]! ++ b 11b ++1: ++ vst1.16 {d0[0]}, [r0]! ++ b 11b ++ ++endfunc ++ ++ ++ +diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h +new file mode 100644 +index 0000000000..447f367bea +--- /dev/null ++++ b/libavutil/arm/rpi_sand_neon.h +@@ -0,0 +1,99 @@ ++/* ++Copyright (c) 2020 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox ++*/ ++ ++#ifndef AVUTIL_ARM_SAND_NEON_H ++#define AVUTIL_ARM_SAND_NEON_H ++ ++void ff_rpi_sand128b_stripe_to_8_10( ++ uint8_t * dest, // [r0] ++ const uint8_t * src1, // [r1] ++ const uint8_t * src2, // [r2] ++ unsigned int lines); // [r3] ++ ++void ff_rpi_sand8_lines_to_planar_y8( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, #8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ ++void ff_rpi_sand8_lines_to_planar_c8( ++ uint8_t * dst_u, // [r0] ++ unsigned int dst_stride_u, // [r1] ++ uint8_t * dst_v, // [r2] ++ unsigned int dst_stride_v, // [r3] ++ const uint8_t * src, // [sp, #0] -> r4, r5 ++ unsigned int stride1, // [sp, #4] 128 ++ unsigned int stride2, // [sp, #8] -> r8 ++ unsigned int _x, // [sp, #12] 0 ++ unsigned int y, // [sp, #16] (r7 in prefix) ++ unsigned int _w, // [sp, #20] -> r12, r6 ++ unsigned int h); // [sp, #24] -> r7 ++ ++void ff_rpi_sand30_lines_to_planar_y16( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, #8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ ++void ff_rpi_sand30_lines_to_planar_c16( ++ uint8_t * dst_u, // [r0] ++ unsigned int dst_stride_u, // [r1] ++ uint8_t * dst_v, // [r2] ++ unsigned int dst_stride_v, // [r3] ++ const uint8_t * src, // [sp, #0] -> r4, r5 ++ unsigned int stride1, // [sp, #4] 128 ++ unsigned int stride2, // [sp, #8] -> r8 ++ unsigned int _x, // [sp, #12] 0 ++ unsigned int y, // [sp, #16] (r7 in prefix) ++ unsigned int _w, // [sp, #20] 
-> r6, r9 ++ unsigned int h); // [sp, #24] -> r7 ++ ++void ff_rpi_sand30_lines_to_planar_p010( ++ uint8_t * dest, // [r0] ++ unsigned int dst_stride, // [r1] ++ const uint8_t * src, // [r2] ++ unsigned int src_stride1, // [r3] Ignored - assumed 128 ++ unsigned int src_stride2, // [sp, #0] -> r3 ++ unsigned int _x, // [sp, #4] Ignored - 0 ++ unsigned int y, // [sp, #8] (r7 in prefix) ++ unsigned int _w, // [sp, #12] -> r6 (cur r5) ++ unsigned int h); // [sp, #16] -> r7 ++ ++#endif // AVUTIL_ARM_SAND_NEON_H ++ +diff --git a/libavutil/buffer.c b/libavutil/buffer.c +index 38a554208a..b0fedabc3e 100644 +--- a/libavutil/buffer.c ++++ b/libavutil/buffer.c +@@ -273,6 +273,19 @@ static void buffer_pool_free(AVBufferPool *pool) + av_freep(&pool); + } + ++void av_buffer_pool_flush(AVBufferPool *pool) ++{ ++ ff_mutex_lock(&pool->mutex); ++ while (pool->pool) { ++ BufferPoolEntry *buf = pool->pool; ++ pool->pool = buf->next; ++ ++ buf->free(buf->opaque, buf->data); ++ av_freep(&buf); ++ } ++ ff_mutex_unlock(&pool->mutex); ++} ++ + void av_buffer_pool_uninit(AVBufferPool **ppool) + { + AVBufferPool *pool; +diff --git a/libavutil/buffer.h b/libavutil/buffer.h +index c0f3f6cc9a..998beec9ac 100644 +--- a/libavutil/buffer.h ++++ b/libavutil/buffer.h +@@ -267,6 +267,11 @@ AVBufferPool *av_buffer_pool_init2(int size, void *opaque, + AVBufferRef* (*alloc)(void *opaque, int size), + void (*pool_free)(void *opaque)); + ++/** ++ * Free all available buffers in a buffer pool. ++ */ ++ void av_buffer_pool_flush(AVBufferPool *pool); ++ + /** + * Mark the pool as being available for freeing. It will actually be freed only + * once all the allocated buffers associated with the pool are released. 
Thus it +diff --git a/libavutil/frame.c b/libavutil/frame.c +index 2e952edd29..96e8bf5b3e 100644 +--- a/libavutil/frame.c ++++ b/libavutil/frame.c +@@ -16,6 +16,8 @@ + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + ++#include "config.h" ++ + #include "channel_layout.h" + #include "avassert.h" + #include "buffer.h" +@@ -26,6 +28,9 @@ + #include "mem.h" + #include "samplefmt.h" + #include "hwcontext.h" ++#if CONFIG_SAND ++#include "rpi_sand_fns.h" ++#endif + + #if FF_API_FRAME_GET_SET + MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp) +@@ -902,6 +907,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags) + (frame->crop_top + frame->crop_bottom) >= frame->height) + return AVERROR(ERANGE); + ++#if CONFIG_SAND ++ // Sand cannot be cropped - do not try ++ if (av_rpi_is_sand_format(frame->format)) ++ return 0; ++#endif ++ + desc = av_pix_fmt_desc_get(frame->format); + if (!desc) + return AVERROR_BUG; +diff --git a/libavutil/frame.h b/libavutil/frame.h +index fc67db0f6c..b1a7eb4858 100644 +--- a/libavutil/frame.h ++++ b/libavutil/frame.h +@@ -968,6 +968,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags); + */ + const char *av_frame_side_data_name(enum AVFrameSideDataType type); + ++ ++static inline int av_frame_cropped_width(const AVFrame * const frame) ++{ ++ return frame->width - (frame->crop_left + frame->crop_right); ++} ++static inline int av_frame_cropped_height(const AVFrame * const frame) ++{ ++ return frame->height - (frame->crop_top + frame->crop_bottom); ++} ++ + /** + * @} + */ +diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c +index 32cbde82eb..9ba8b7b2dd 100644 +--- a/libavutil/hwcontext_drm.c ++++ b/libavutil/hwcontext_drm.c +@@ -21,6 +21,7 @@ + #include + + #include ++#include + #include + + #include "avassert.h" +@@ -28,6 +29,7 @@ + #include "hwcontext_drm.h" + #include "hwcontext_internal.h" + #include "imgutils.h" ++#include "libavutil/rpi_sand_fns.h" + + + static void 
drm_device_free(AVHWDeviceContext *hwdev) +@@ -43,6 +45,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device, + AVDRMDeviceContext *hwctx = hwdev->hwctx; + drmVersionPtr version; + ++ if (device == NULL) { ++ hwctx->fd = -1; ++ return 0; ++ } ++ + hwctx->fd = open(device, O_RDWR); + if (hwctx->fd < 0) + return AVERROR(errno); +@@ -120,6 +127,9 @@ static int drm_map_frame(AVHWFramesContext *hwfc, + if (flags & AV_HWFRAME_MAP_WRITE) + mmap_prot |= PROT_WRITE; + ++ if (dst->format == AV_PIX_FMT_NONE) ++ dst->format = hwfc->sw_format; ++ + av_assert0(desc->nb_objects <= AV_DRM_MAX_PLANES); + for (i = 0; i < desc->nb_objects; i++) { + addr = mmap(NULL, desc->objects[i].size, mmap_prot, MAP_SHARED, +@@ -151,6 +161,23 @@ static int drm_map_frame(AVHWFramesContext *hwfc, + + dst->width = src->width; + dst->height = src->height; ++ dst->crop_top = src->crop_top; ++ dst->crop_bottom = src->crop_bottom; ++ dst->crop_left = src->crop_left; ++ dst->crop_right = src->crop_right; ++ ++#if CONFIG_SAND ++ // Rework for sand frames ++ if (av_rpi_is_sand_frame(dst)) { ++ // As it stands the sand formats hold stride2 in linesize[3] ++ // linesize[0] & [1] contain stride1 which is always 128 for everything we do ++ // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1] ++ dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier); ++ dst->linesize[0] = 128; ++ dst->linesize[1] = 128; ++ // *** Are we sure src->height is actually what we want ??? ++ } ++#endif + + err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src, + &drm_unmap_frame, map); +@@ -178,7 +205,15 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx, + if (!pix_fmts) + return AVERROR(ENOMEM); + +- pix_fmts[0] = ctx->sw_format; ++ // **** Offer native sand too ???? ++ pix_fmts[0] = ++#if CONFIG_SAND ++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ? ++ AV_PIX_FMT_YUV420P : ++ ctx->sw_format == AV_PIX_FMT_RPI4_10 ? 
++ AV_PIX_FMT_YUV420P10LE : ++#endif ++ ctx->sw_format; + pix_fmts[1] = AV_PIX_FMT_NONE; + + *formats = pix_fmts; +@@ -197,18 +232,82 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc, + map = av_frame_alloc(); + if (!map) + return AVERROR(ENOMEM); +- map->format = dst->format; + ++ // Map to default ++ map->format = AV_PIX_FMT_NONE; + err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ); + if (err) + goto fail; + +- map->width = dst->width; +- map->height = dst->height; ++#if 0 ++ av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__, ++ hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE, ++ map->width, map->height, ++ map->linesize[0], ++ map->linesize[1], ++ map->linesize[2], ++ map->linesize[3], ++ dst->width, dst->height, ++ dst->linesize[0], ++ dst->linesize[1], ++ dst->linesize[2]); ++#endif ++#if CONFIG_SAND ++ if (av_rpi_is_sand_frame(map)) { ++ unsigned int stride2 = map->linesize[3]; ++ const unsigned int w = FFMIN(dst->width, av_frame_cropped_width(map)); ++ const unsigned int h = FFMIN(dst->height, av_frame_cropped_height(map)); ++ ++ if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) { ++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], ++ map->data[0], ++ 128, stride2, ++ map->crop_left, map->crop_top, ++ w, h); ++ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ map->data[1], ++ 128, stride2, ++ map->crop_left / 2, map->crop_top / 2, ++ w / 2, h / 2); ++ } ++ else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) { ++ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], ++ map->data[0], ++ 128, stride2, ++ map->crop_left, map->crop_top, ++ w, h); // *** ??? 
crop ++ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ map->data[1], ++ 128, stride2, ++ map->crop_left / 2, map->crop_top / 2, ++ w / 2, h / 2); ++ } ++ else ++ { ++ av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__); ++ err = AVERROR(EINVAL); ++ goto fail; ++ } ++ ++ dst->width = w; ++ dst->height = h; ++ } ++ else ++#endif ++ { ++ // Kludge mapped h/w s.t. frame_copy works ++ map->width = dst->width; ++ map->height = dst->height; ++ err = av_frame_copy(dst, map); ++ } + +- err = av_frame_copy(dst, map); + if (err) ++ { ++ av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__); + goto fail; ++ } + + err = 0; + fail: +@@ -223,7 +322,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc, + int err; + + if (src->width > hwfc->width || src->height > hwfc->height) ++ { ++ av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, src->width, hwfc->width, src->height, hwfc->height); + return AVERROR(EINVAL); ++ } + + map = av_frame_alloc(); + if (!map) +diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c +index 9d61c52567..4e36a110c1 100644 +--- a/libavutil/pixdesc.c ++++ b/libavutil/pixdesc.c +@@ -2073,6 +2073,18 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { + .name = "cuda", + .flags = AV_PIX_FMT_FLAG_HWACCEL, + }, ++ [AV_PIX_FMT_RPI] = { ++ .name = "rpi", ++ .flags = AV_PIX_FMT_FLAG_HWACCEL, ++ }, ++ [AV_PIX_FMT_RPI4_10] = { ++ .name = "rpi4_10", ++ .flags = AV_PIX_FMT_FLAG_HWACCEL, ++ }, ++ [AV_PIX_FMT_RPI4_8] = { ++ .name = "rpi4_8", ++ .flags = AV_PIX_FMT_FLAG_HWACCEL, ++ }, + [AV_PIX_FMT_AYUV64LE] = { + .name = "ayuv64le", + .nb_components = 4, +@@ -2371,6 +2383,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = { + .name = "vulkan", + .flags = AV_PIX_FMT_FLAG_HWACCEL, + }, ++ [AV_PIX_FMT_SAND128] = { ++ .name = "sand128", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++
{ 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */ ++ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */ ++ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */ ++ }, ++ .flags = 0, ++ }, ++ [AV_PIX_FMT_SAND64_10] = { ++ .name = "sand64_10", ++ .nb_components = 3, ++ .log2_chroma_w = 1, ++ .log2_chroma_h = 1, ++ .comp = { ++ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */ ++ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */ ++ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */ ++ }, ++ .flags = 0, ++ }, + }; + #if FF_API_PLUS1_MINUS1 + FF_ENABLE_DEPRECATION_WARNINGS +diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h +index 1c625cfc8a..3400390a77 100644 +--- a/libavutil/pixfmt.h ++++ b/libavutil/pixfmt.h +@@ -234,6 +234,11 @@ enum AVPixelFormat { + */ + AV_PIX_FMT_CUDA, + ++ /** ++ * HW acceleration through RPI. ++ */ ++ AV_PIX_FMT_RPI, ++ + AV_PIX_FMT_0RGB, ///< packed RGB 8:8:8, 32bpp, XRGBXRGB... X=unused/undefined + AV_PIX_FMT_RGB0, ///< packed RGB 8:8:8, 32bpp, RGBXRGBX... X=unused/undefined + AV_PIX_FMT_0BGR, ///< packed BGR 8:8:8, 32bpp, XBGRXBGR... X=unused/undefined +@@ -357,6 +362,12 @@ enum AVPixelFormat { + + AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian + AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian ++// RPI - not on ifdef so can be got at by calling progs ++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding ++ AV_PIX_FMT_RPI4_8, ++ AV_PIX_FMT_RPI4_10, + + AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions + }; +diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h +new file mode 100644 +index 0000000000..0d5d203dc3 +--- /dev/null ++++ 
b/libavutil/rpi_sand_fn_pw.h +@@ -0,0 +1,227 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox ++*/ ++ ++// * Included twice from rpi_sand_fn with different PW ++ ++#define STRCAT(x,y) x##y ++ ++#if PW == 1 ++#define pixel uint8_t ++#define FUNC(f) STRCAT(f, 8) ++#elif PW == 2 ++#define pixel uint16_t ++#define FUNC(f) STRCAT(f, 16) ++#else ++#error Unexpected PW ++#endif ++ ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// unclipped ++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x; ++ const unsigned int w = _w; ++ const unsigned int mask = stride1 - 1; ++ ++#if PW == 1 && HAVE_SAND_ASM ++ if (_x == 0) { ++ ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride, ++ src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) { ++ memcpy(dst, p, w); ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const uint8_t * p = p2; ++ uint8_t * d = dst; ++ memcpy(d, p1, w1); ++ d += w1; ++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) { ++ memcpy(d, p, stride1); ++ } ++ memcpy(d, p, w3); ++ } ++ } ++} ++ ++// x & w in bytes but not of interleave (i.e. 
offset = x*2 for U&V) ++ ++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ ++#if PW == 1 && HAVE_SAND_ASM ++ if (_x == 0) { ++ ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v, ++ src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) { ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ const pixel * p = (const pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ const uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * p = (const pixel *)p1; ++ pixel * du = (pixel *)dst_u; ++ pixel * dv = (pixel *)dst_v; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ 
} ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *du++ = *p++; ++ *dv++ = *p++; ++ } ++ } ++ } ++} ++ ++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x = _x * 2; ++ const unsigned int w = _w * 2; ++ const unsigned int mask = stride1 - 1; ++ if ((x & ~mask) == ((x + w) & ~mask)) { ++ // All in one sand stripe ++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) { ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ } ++ else ++ { ++ // Two+ stripe ++ const unsigned int sstride = stride1 * stride2; ++ const unsigned int sstride_p = (sstride - stride1) / PW; ++ ++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2; ++ uint8_t * p2 = p1 + sstride - (x & mask); ++ const unsigned int w1 = stride1 - (x & mask); ++ const unsigned int w3 = (x + w) & mask; ++ const unsigned int w2 = w - (w1 + w3); ++ ++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) { ++ unsigned int j; ++ const pixel * su = (const pixel *)src_u; ++ const pixel * sv = (const pixel *)src_v; ++ pixel * p = (pixel *)p1; ++ for (unsigned int k = 0; k < w1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) { ++ for (unsigned int k = 0; k < stride1; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++ } ++ } ++ for (unsigned int k = 0; k < w3; k += 2 * PW) { ++ *p++ = *su++; ++ *p++ = *sv++; ++
} ++ } ++ } ++} ++ ++ ++#undef pixel ++#undef STRCAT ++#undef FUNC ++ +diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c +new file mode 100644 +index 0000000000..ed0261b02f +--- /dev/null ++++ b/libavutil/rpi_sand_fns.c +@@ -0,0 +1,353 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
++ ++Authors: John Cox ++*/ ++ ++#include "config.h" ++#include ++#include ++#include "rpi_sand_fns.h" ++#include "avassert.h" ++#include "frame.h" ++ ++#if ARCH_ARM && HAVE_NEON ++#include "arm/rpi_sand_neon.h" ++#define HAVE_SAND_ASM 1 ++#else ++#define HAVE_SAND_ASM 0 ++#endif ++ ++#define PW 1 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#define PW 2 ++#include "rpi_sand_fn_pw.h" ++#undef PW ++ ++#if 1 ++// Simple round ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ const unsigned int rnd = (1 << shr) >> 1; ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n) { ++ *dst++ = (*src++ + rnd) >> shr; ++ } ++} ++#else ++// Dithered variation ++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr) ++{ ++ unsigned int rnd = (1 << shr) >> 1; ++ const unsigned int mask = ((1 << shr) - 1); ++ const uint16_t * src = (const uint16_t *)_src; ++ ++ for (; n != 0; --n) { ++ rnd = *src++ + (rnd & mask); ++ *dst++ = rnd >> shr; ++ } ++} ++#endif ++ ++// Fetches a single patch - offscreen fixup not done here ++// w <= stride1 ++// unclipped ++// _x & _w in pixels, strides in bytes ++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word ++ const unsigned int xskip0 = _x - (x0 >> 2) * 3; ++ const unsigned int x1 = ((_x + _w) / 3) * 4; ++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3; ++ const unsigned int mask = stride1 - 1; ++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; ++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words ++ ++#if HAVE_SAND_ASM ++ if (_x == 0) { ++ ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, 
stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if (x0 == x1) { ++ // ******************* ++ // Partial single word xfer ++ return; ++ } ++ ++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1) ++ { ++ unsigned int x = x0; ++ const uint32_t * p = (const uint32_t *)p0; ++ uint16_t * d = (uint16_t *)dst; ++ ++ if (xskip0 != 0) { ++ const uint32_t p3 = *p++; ++ ++ if (xskip0 == 1) ++ *d++ = (p3 >> 10) & 0x3ff; ++ *d++ = (p3 >> 20) & 0x3ff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ while (x != x1) { ++ const uint32_t p3 = *p++; ++ *d++ = p3 & 0x3ff; ++ *d++ = (p3 >> 10) & 0x3ff; ++ *d++ = (p3 >> 20) & 0x3ff; ++ ++ if (((x += 4) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ if (xrem1 != 0) { ++ const uint32_t p3 = *p; ++ ++ *d++ = p3 & 0x3ff; ++ if (xrem1 == 2) ++ *d++ = (p3 >> 10) & 0x3ff; ++ } ++ } ++} ++ ++ ++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h) ++{ ++ const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word ++ const unsigned int xskip0 = _x - (x0 >> 3) * 3; ++ const unsigned int x1 = ((_x + _w) / 3) * 8; ++ const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3; ++ const unsigned int mask = stride1 - 1; ++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2; ++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words ++ ++#if HAVE_SAND_ASM ++ if (_x == 0) { ++ ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v, ++ src, stride1, stride2, _x, y, _w, h); ++ return; ++ } ++#endif ++ ++ if (x0 == x1) { ++ // ******************* ++ // Partial single word xfer ++ return; ++ } ++ ++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1) 
++ { ++ unsigned int x = x0; ++ const uint32_t * p = (const uint32_t *)p0; ++ uint16_t * du = (uint16_t *)dst_u; ++ uint16_t * dv = (uint16_t *)dst_v; ++ ++ if (xskip0 != 0) { ++ const uint32_t p3a = *p++; ++ const uint32_t p3b = *p++; ++ ++ if (xskip0 == 1) ++ { ++ *du++ = (p3a >> 20) & 0x3ff; ++ *dv++ = (p3b >> 0) & 0x3ff; ++ } ++ *du++ = (p3b >> 10) & 0x3ff; ++ *dv++ = (p3b >> 20) & 0x3ff; ++ ++ if (((x += 8) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ while (x != x1) { ++ const uint32_t p3a = *p++; ++ const uint32_t p3b = *p++; ++ ++ *du++ = p3a & 0x3ff; ++ *dv++ = (p3a >> 10) & 0x3ff; ++ *du++ = (p3a >> 20) & 0x3ff; ++ *dv++ = p3b & 0x3ff; ++ *du++ = (p3b >> 10) & 0x3ff; ++ *dv++ = (p3b >> 20) & 0x3ff; ++ ++ if (((x += 8) & mask) == 0) ++ p += slice_inc; ++ } ++ ++ if (xrem1 != 0) { ++ const uint32_t p3a = *p++; ++ const uint32_t p3b = *p++; ++ ++ *du++ = p3a & 0x3ff; ++ *dv++ = (p3a >> 10) & 0x3ff; ++ if (xrem1 == 2) ++ { ++ *du++ = (p3a >> 20) & 0x3ff; ++ *dv++ = p3b & 0x3ff; ++ } ++ } ++ } ++} ++ ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr) ++{ ++ const unsigned int n = dst_stride1 / 2; ++ unsigned int j; ++ ++ // This is true for our current layouts ++ av_assert0(dst_stride1 == src_stride1); ++ ++ // As we have the same stride1 for src & dest and src is wider than dest ++ // then if we loop on src we can always write contiguously to dest ++ // We make no effort to copy an exact width - round up to nearest src stripe ++ // as we will always have storage in dest for that ++ ++#if ARCH_ARM && HAVE_NEON ++ if (shr == 3 && src_stride1 == 128) { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * 
src_stride2; ++ ++ ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h); ++ } ++ } ++ else ++#endif ++ { ++ for (j = 0; j + n < w; j += dst_stride1) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ const uint8_t * s2 = s1 + src_stride1 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ cpy16_to_8(d + n, s2, n, shr); ++ } ++ } ++ } ++ ++ // Fix up a trailing dest half stripe ++ if (j < w) { ++ uint8_t * d = dst + j * dst_stride2; ++ const uint8_t * s1 = src + j * 2 * src_stride2; ++ ++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) { ++ cpy16_to_8(d, s1, n, shr); ++ } ++ } ++} ++ ++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src) ++{ ++ const int w = av_frame_cropped_width(src); ++ const int h = av_frame_cropped_height(src); ++ const int x = src->crop_left; ++ const int y = src->crop_top; ++ ++ // We will crop as part of the conversion ++ dst->crop_top = 0; ++ dst->crop_left = 0; ++ dst->crop_bottom = 0; ++ dst->crop_right = 0; ++ ++ switch (src->format){ ++ case AV_PIX_FMT_SAND128: ++ case AV_PIX_FMT_RPI4_8: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P: ++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w/2, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ case AV_PIX_FMT_SAND64_10: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P10: ++ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x*2, y, w*2, h); ++ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1], 
++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y/2, w, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ case AV_PIX_FMT_RPI4_10: ++ switch (dst->format){ ++ case AV_PIX_FMT_YUV420P10: ++ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0], ++ src->data[0], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x, y, w, h); ++ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1], ++ dst->data[2], dst->linesize[2], ++ src->data[1], ++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src), ++ x/2, y/2, w/2, h/2); ++ break; ++ default: ++ return -1; ++ } ++ break; ++ default: ++ return -1; ++ } ++ ++ return av_frame_copy_props(dst, src); ++} +diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h +new file mode 100644 +index 0000000000..634b55e800 +--- /dev/null ++++ b/libavutil/rpi_sand_fns.h +@@ -0,0 +1,183 @@ ++/* ++Copyright (c) 2018 Raspberry Pi (Trading) Ltd. ++All rights reserved. ++ ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are met: ++ * Redistributions of source code must retain the above copyright ++ notice, this list of conditions and the following disclaimer. ++ * Redistributions in binary form must reproduce the above copyright ++ notice, this list of conditions and the following disclaimer in the ++ documentation and/or other materials provided with the distribution. ++ * Neither the name of the copyright holder nor the ++ names of its contributors may be used to endorse or promote products ++ derived from this software without specific prior written permission. ++ ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED ++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE ++DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY ++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES ++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; ++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++ ++Authors: John Cox ++*/ ++ ++#ifndef AVUTIL_RPI_SAND_FNS ++#define AVUTIL_RPI_SAND_FNS ++ ++#include "libavutil/frame.h" ++ ++// For all these fns _x & _w are measured as coord * PW ++// For the C fns coords are in chroma pels (so luma / 2) ++// Strides are in bytes ++ ++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_planar_to_sand_c8(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const 
unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_planar_to_sand_c16(uint8_t * dst_c, ++ unsigned int stride1, unsigned int stride2, ++ const uint8_t * src_u, const unsigned int src_stride_u, ++ const uint8_t * src_v, const unsigned int src_stride_v, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u, ++ uint8_t * dst_v, const unsigned int dst_stride_v, ++ const uint8_t * src, ++ unsigned int stride1, unsigned int stride2, ++ unsigned int _x, unsigned int y, ++ unsigned int _w, unsigned int h); ++ ++ ++// w/h in pixels ++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2, ++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2, ++ unsigned int w, unsigned int h, const unsigned int shr); ++ ++ ++// dst must contain required pixel format & allocated data buffers ++// Cropping on the src buffer will be honoured and dst crop will be set to zero ++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src); ++ ++ ++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame) ++{ ++#ifdef RPI_ZC_SAND128_ONLY ++ // If we are sure we only only support 128 byte sand formats replace the ++ // var with a constant which should allow for better optimisation ++ return 128; ++#else ++ return frame->linesize[0]; ++#endif ++} ++ ++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame) ++{ ++ return frame->linesize[3]; ++} ++ ++ ++static inline int av_rpi_is_sand_format(const int format) ++{ ++ return (format >= AV_PIX_FMT_SAND128 && format 
<= AV_PIX_FMT_RPI4_10); ++} ++ ++static inline int av_rpi_is_sand_frame(const AVFrame * const frame) ++{ ++ return av_rpi_is_sand_format(frame->format); ++} ++ ++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame) ++{ ++ return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8); ++} ++ ++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame) ++{ ++ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16); ++} ++ ++static inline int av_rpi_is_sand30_frame(const AVFrame * const frame) ++{ ++ return (frame->format == AV_PIX_FMT_RPI4_10); ++} ++ ++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame) ++{ ++ return av_rpi_is_sand8_frame(frame) ? 0 : 1; ++} ++ ++// If x is measured in bytes (not pixels) then this works for sand64_16 as ++// well as sand128 - but in the general case we work that out ++ ++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y) ++{ ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame); ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y + stride2 * x2; ++} ++ ++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c) ++{ ++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame); ++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame); ++ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1); ++ const unsigned int x1 = x & (stride1 - 1); ++ const unsigned int x2 = x ^ x1; ++ ++ return x1 + stride1 * y_c + stride2 * x2; ++} ++ ++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[0] + 
av_rpi_sand_frame_off_y(frame, x, y); ++} ++ ++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y) ++{ ++ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y); ++} ++ ++#endif ++ +diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt +new file mode 100644 +index 0000000000..7f16dff6a2 +--- /dev/null ++++ b/pi-util/BUILD.txt +@@ -0,0 +1,29 @@ ++Building Pi FFmpeg ++================== ++ ++Configuration: ++============= ++ ++These instructions work for cross compiles from Ubuntu 16.04 & Ubuntu ++18.04. I would expect most other linux environments to work but I haven't ++tried them. ++ ++pi-util/conf_pi2.sh ++ ++contains suitable options to build the code for Pi2/3. It expects to find ++git clones of ++ ++https://github.com/raspberrypi/tools ++https://github.com/raspberrypi/firmware ++ ++in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a ++lot of history you don't want. ++ ++If you have a copy of qasm.py in ../local/bin then the .qasm sources will be ++rebuilt. Otherwise the prebuilt .c & .h files will be used. ++Likewise ../local/bin/vasmvidcore_std will enable VPU code rebuild ++ ++pi-util/conf_p1.sh should configure for Pi1. Beware that as of this time ++H265 QPU acceleration is broken on Pi1 and so it is disabled. ++ ++ +diff --git a/pi-util/NOTES.txt b/pi-util/NOTES.txt +new file mode 100644 +index 0000000000..fcce72226a +--- /dev/null ++++ b/pi-util/NOTES.txt +@@ -0,0 +1,69 @@ ++Notes on the hevc_rpi decoder & associated support code ++------------------------------------------------------- ++ ++There are 3 main parts to the existing code: ++ ++1) The decoder - this is all in libavcodec as rpi_hevc*. ++ ++2) A few filters to deal with Sand frames and a small patch to ++automatically select the sand->i420 converter when required. ++ ++3) A kludge in ffmpeg.c to display the decoded video. This could & should ++be converted into a proper ffmpeg display module. 
++ ++Decoder ++------- ++ ++The decoder is a modified version of the existing ffmpeg hevc decoder. ++Generally it is ~100% faster than the existing ffmpeg hevc s/w decoder. ++More complex bitstreams can be up to ~200% faster but particularly easy ++streams can cut its advantage down to ~50%. This means that a Pi3+ can ++display nearly all 8-bit 1080p30 streams and with some overclocking it can ++display most lower bitrate 10-bit 1080p30 streams - this latter case is ++not helped by the requirement to downsample to 8-bit before display on a ++Pi. ++ ++It has had co-processor offload added for inter-pred and large block ++residual transform. Various parts have had optimized ARM NEON assembler ++added and the existing ARM asm sections have been profiled and ++re-optimized for A53. The main C code has been substantially reworked at ++its lower levels in an attempt to optimize it and minimize memory ++bandwidth. To some extent code paths that deal with frame types that it ++doesn't support have been pruned. ++ ++It outputs frames in Broadcom Sand format. This is a somewhat annoying ++layout that doesn't fit into ffmpeg's standard frame descriptions. It has ++vertical stripes of 128 horizontal pixels (64 in 10-bit forms) with Y for ++the stripe followed by interleaved U & V, that is then followed by the Y ++for the next stripe, etc. The final stripe is always padded to ++stripe-width. This is used in an attempt to help with cache locality and ++cut down on the number of DRAM bank switches. It is annoying to use for ++inter-pred with conventional processing but the way the Pi QPU (which is ++used for inter-pred) works means that it has negligible downsides here and ++the improved memory performance exceeds the overhead of the increased ++complexity in the rest of the code. ++ ++Frames must be allocated out of GPU memory (as otherwise they can't be ++accessed by the co-processors). Utility functions (in rpi_zc.c) have been ++written to make this easier. 
As the frames are already in GPU memory they ++can be displayed by the Pi h/w without any further copying. ++ ++ ++Known non-features ++------------------ ++ ++Frame allocation should probably be done in some other way in order to fit ++into the standard framework better. ++ ++Sand frames are currently declared as software frames, there is an ++argument that they should be hardware frames but they aren't really. ++ ++There must be a better way of auto-selecting the hevc_rpi decoder over the ++normal s/w hevc decoder, but I became confused by the existing h/w ++acceleration framework and what I wanted to do didn't seem to fit in ++neatly. ++ ++Display should be a proper device rather than a kludge in ffmpeg.c ++ ++ +diff --git a/pi-util/conf_arm64_native.sh b/pi-util/conf_arm64_native.sh +new file mode 100644 +index 0000000000..9e3bbfa190 +--- /dev/null ++++ b/pi-util/conf_arm64_native.sh +@@ -0,0 +1,45 @@ ++echo "Configure for ARM64 native build" ++ ++#RPI_KEEPS="-save-temps=obj" ++ ++SHARED_LIBS="--enable-shared" ++if [ "$1" == "--noshared" ]; then ++ SHARED_LIBS="--disable-shared" ++ echo Static libs ++ OUT=out/arm64-static-rel ++else ++ echo Shared libs ++ OUT=out/arm64-shared-rel ++fi ++ ++mkdir -p $OUT ++cd $OUT ++ ++A=aarch64-linux-gnu ++USR_PREFIX=`pwd`/install ++LIB_PREFIX=$USR_PREFIX/lib/$A ++INC_PREFIX=$USR_PREFIX/include/$A ++ ++../../configure \ ++ --prefix=$USR_PREFIX\ ++ --libdir=$LIB_PREFIX\ ++ --incdir=$INC_PREFIX\ ++ --disable-stripping\ ++ --disable-thumb\ ++ --disable-mmal\ ++ --enable-sand\ ++ --enable-v4l2-request\ ++ --enable-libdrm\ ++ --enable-epoxy\ ++ --enable-libudev\ ++ --enable-vout-drm\ ++ --enable-vout-egl\ ++ $SHARED_LIBS\ ++ --extra-cflags="-ggdb" ++ ++# --enable-decoder=hevc_rpi\ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls +diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv +new file mode 100644 +index 0000000000..4efd5d1c67 +--- /dev/null ++++ 
b/pi-util/conf_h265.2016.csv +@@ -0,0 +1,195 @@ ++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8 ++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8 ++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8 ++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8 ++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8 ++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8 ++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8 ++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8 ++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8 ++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8 ++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8 ++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8 ++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8 ++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8 ++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8 ++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8 ++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8 ++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10 ++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8 
++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8 ++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8 ++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8 ++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8 ++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8 ++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8 ++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8 ++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8 ++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8 ++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8 ++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8 ++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8 ++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8 ++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8 ++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8 ++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10 ++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8 ++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8 ++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8 ++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8 ++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8 ++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8 ++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8 ++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8 ++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8 ++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8 
++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8 ++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8 ++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8 ++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8 ++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8 ++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8 ++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8 ++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8 ++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8 ++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8 ++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8 ++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8 ++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8 ++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8 ++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8 ++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8 ++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8 ++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8 ++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8 ++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8 ++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8 ++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8 ++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8 ++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8 ++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8 ++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8 
++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8 ++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8 ++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8 ++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8 ++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8 ++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8 ++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8 ++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8 ++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8 ++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8 ++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8 ++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8 ++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8 ++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8 ++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8 ++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8 ++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8 ++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8 ++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8 ++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8 ++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8 ++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8 ++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8 ++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8 ++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8 ++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8 ++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8 ++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8 ++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8 
++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8 ++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8 ++1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8 ++1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8 ++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8 ++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8 ++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8 ++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8 ++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8 ++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8 ++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8 ++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8 ++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8 ++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8 ++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8 ++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8 ++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8 ++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8 ++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8 ++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10 ++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8 ++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8 ++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8 ++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10 ++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8 
++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8 ++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10 ++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8 ++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10 ++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8 ++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0 ++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8 ++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8 ++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10 ++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8 ++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8 ++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0 ++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8 
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8 ++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8 ++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10 ++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0 ++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0 
++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0 ++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0 ++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8 ++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8 ++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0 ++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8 ++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0 ++1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0 ++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0 ++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0 ++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0 ++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0 ++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0 ++0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8 ++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10 ++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10 ++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8 ++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8 ++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8 ++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8 
++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8 ++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8 ++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8 ++1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8 ++1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8 ++1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8 +diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv +new file mode 100644 +index 0000000000..6082641271 +--- /dev/null ++++ b/pi-util/conf_h265.2016_HEVC_v1.csv +@@ -0,0 +1,147 @@ ++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5 ++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5 ++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 ++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 ++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 ++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 ++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 ++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5 ++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 ++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 ++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 ++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 ++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 ++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 ++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 ++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 ++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 ++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 ++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 
++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 ++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 ++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5 ++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 ++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 ++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 ++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 
++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5 ++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5 ++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5 ++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 ++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 ++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 
++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 ++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5 ++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt ++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt ++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5 
++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5 ++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5 ++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5 ++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5 ++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 ++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth ++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ??? 
++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 ++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 +diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv +new file mode 100644 +index 0000000000..fc14f2a3c2 +--- /dev/null ++++ b/pi-util/conf_h265.csv +@@ -0,0 +1,144 @@ ++1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5 ++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5 ++1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5 ++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5 ++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5 ++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5 ++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5 
++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5 ++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5 ++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5 ++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5 ++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5 ++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5 ++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5 ++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5 ++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5 ++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5 ++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5 ++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5 ++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5 ++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5 ++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5 ++1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5 ++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5 ++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5 ++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5 ++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5 ++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5 ++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5 ++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5 ++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5 ++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5 ++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5 ++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5 ++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5 ++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5 ++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5 ++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5 
++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5 ++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5 ++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5 ++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5 ++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5 ++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5 ++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5 ++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5 ++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5 ++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5 ++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5 ++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5 ++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5 ++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5 ++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5 ++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5 ++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5 ++1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5 ++1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5 ++1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5 ++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5 ++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5 ++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5 ++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5 ++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5 ++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5 ++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5 ++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5 ++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5 ++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5 ++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5 ++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5 
++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5 ++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5 ++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5 ++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5 ++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5 ++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5 ++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5 ++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5 ++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5 ++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5 ++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5 ++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5 ++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5 ++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5 ++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5 ++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5 ++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5 ++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5 ++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5 ++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5 ++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5 ++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5 ++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5 ++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5 ++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5 ++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5 ++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5 ++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5 ++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5 ++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5 ++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5 ++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5 ++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5 
++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5 ++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5 ++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5 ++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5 ++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5 ++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5 ++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5 ++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5 ++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5 ++1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5 ++1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5 ++1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5 ++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5 ++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5 ++1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5 ++1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5 ++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5 ++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5 ++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5 ++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5 ++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5 ++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5 ++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched ++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5 ++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5 ++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5 ++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5 ++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5 ++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5 ++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5 
++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5 ++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5 ++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5 ++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5 ++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5 ++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5 ++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5 ++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5 ++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5 ++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5 ++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5 +diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh +new file mode 100755 +index 0000000000..063edbf8af +--- /dev/null ++++ b/pi-util/conf_native.sh +@@ -0,0 +1,56 @@ ++echo "Configure for native build" ++ ++FFSRC=`pwd` ++ ++RPI_OPT_VC=/opt/vc ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_LIBDIRS="-L$RPI_OPT_VC/lib" ++RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++SHARED_LIBS="--enable-shared" ++if [ "$1" == "--noshared" ]; then ++ SHARED_LIBS="--disable-shared" ++ OUT=out/armv7-static-rel ++ echo Static libs ++else ++ echo Shared libs ++ OUT=out/armv7-shared-rel ++fi ++ ++USR_PREFIX=$FFSRC/$OUT/install ++LIB_PREFIX=$USR_PREFIX/lib/arm-linux-gnueabihf ++INC_PREFIX=$USR_PREFIX/include/arm-linux-gnueabihf ++ ++mkdir -p $FFSRC/$OUT ++cd $FFSRC/$OUT ++ ++$FFSRC/configure \ ++ --prefix=$USR_PREFIX\ ++ --libdir=$LIB_PREFIX\ ++ --incdir=$INC_PREFIX\ ++ --arch=armv6t2\ ++ --cpu=cortex-a7\ ++ 
--disable-stripping\ ++ --disable-thumb\ ++ --enable-mmal\ ++ --enable-rpi\ ++ --enable-v4l2-request\ ++ --enable-libdrm\ ++ --enable-epoxy\ ++ --enable-libudev\ ++ --enable-vout-drm\ ++ --enable-vout-egl\ ++ $SHARED_LIBS\ ++ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ ++ ++# --enable-decoder=hevc_rpi\ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls +diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh +new file mode 100755 +index 0000000000..29fa9fa68d +--- /dev/null ++++ b/pi-util/conf_pi1.sh +@@ -0,0 +1,39 @@ ++echo "Configure for Pi1" ++ ++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=`pwd`/../firmware/hardfp/opt/vc ++ ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++SHARED_LIBS="--enable-shared" ++if [ "$1" == "--noshared" ]; then ++ SHARED_LIBS="--disable-shared" ++ echo Static libs ++else ++ echo Shared libs ++fi ++ ++./configure --enable-cross-compile\ ++ --cpu=arm1176jzf-s\ ++ --arch=arm\ ++ --disable-neon\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --enable-mmal\ ++ $SHARED_LIBS\ ++ --extra-cflags="-g $RPI_KEEPS $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++ ++# --enable-extra-warnings\ ++# 
--arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls +diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh +new file mode 100755 +index 0000000000..3dd5edcf83 +--- /dev/null ++++ b/pi-util/conf_pi2.sh +@@ -0,0 +1,50 @@ ++echo "Configure for Pi2/3" ++ ++RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf ++RPI_OPT_VC=`pwd`/../firmware/hardfp/opt/vc ++ ++RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux" ++RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib" ++RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4" ++#RPI_KEEPS="-save-temps=obj" ++RPI_KEEPS="" ++ ++USR_PREFIX=`pwd`/install ++LIB_PREFIX=$USR_PREFIX/lib/arm-linux-gnueabihf ++INC_PREFIX=$USR_PREFIX/include/arm-linux-gnueabihf ++ ++SHARED_LIBS="--enable-shared" ++if [ "$1" == "--noshared" ]; then ++ SHARED_LIBS="--disable-shared" ++ echo Static libs ++else ++ echo Shared libs ++fi ++ ++./configure --enable-cross-compile\ ++ --prefix=$USR_PREFIX\ ++ --libdir=$LIB_PREFIX\ ++ --incdir=$INC_PREFIX\ ++ --arch=armv6t2\ ++ --cpu=cortex-a7\ ++ --target-os=linux\ ++ --disable-stripping\ ++ --disable-thumb\ ++ --enable-mmal\ ++ --enable-rpi\ ++ $SHARED_LIBS\ ++ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\ ++ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\ ++ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\ ++ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\ ++ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf- ++ ++# --enable-shared\ ++ ++# --enable-decoder=hevc_rpi\ ++# --enable-extra-warnings\ ++# --arch=armv71\ ++# --enable-shared\ ++ ++# gcc option for getting asm listing ++# -Wa,-ahls +diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py +new file mode 100755 +index 
0000000000..2e59e6ceb5 +--- /dev/null ++++ b/pi-util/ffconf.py +@@ -0,0 +1,216 @@ ++#!/usr/bin/env python ++ ++import string ++import os ++import subprocess ++import re ++import argparse ++import sys ++import csv ++from stat import * ++ ++ffmpeg_exec = "./ffmpeg" ++ ++CODEC_HEVC_RPI = 1 ++HWACCEL_RPI = 2 ++HWACCEL_DRM = 3 ++HWACCEL_VAAPI = 4 ++ ++def testone(fileroot, srcname, es_file, md5_file, pix, dectype, vcodec): ++ hwaccel = "" ++ if dectype == HWACCEL_RPI: ++ hwaccel = "rpi" ++ elif dectype == HWACCEL_DRM: ++ hwaccel = "drm" ++ elif dectype == HWACCEL_VAAPI: ++ hwaccel = "vaapi" ++ ++ pix_fmt = [] ++ if pix == "8": ++ pix_fmt = ["-pix_fmt", "yuv420p"] ++ elif pix == "10": ++ pix_fmt = ["-pix_fmt", "yuv420p10le"] ++ elif pix == "12": ++ pix_fmt = ["-pix_fmt", "yuv420p12le"] ++ ++ tmp_root = "/tmp" ++ ++ names = srcname.split('/') ++ while len(names) > 1: ++ tmp_root = os.path.join(tmp_root, names[0]) ++ del names[0] ++ name = names[0] ++ ++ if not os.path.exists(tmp_root): ++ os.makedirs(tmp_root) ++ ++ dec_file = os.path.join(tmp_root, name + ".dec.md5") ++ try: ++ os.remove(dec_file) ++ except: ++ pass ++ ++ flog = open(os.path.join(tmp_root, name + ".log"), "wt") ++ ++ ffargs = [ffmpeg_exec, "-flags", "unaligned", "-hwaccel", hwaccel, "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file)] + pix_fmt + ["-f", "md5", dec_file] ++ ++ # Unaligned needed for cropping conformance ++ if hwaccel: ++ rstr = subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT) ++ else: ++ rstr = subprocess.call( ++ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file], ++ stdout=flog, stderr=subprocess.STDOUT) ++ ++ try: ++ m1 = None ++ m2 = None ++ with open(os.path.join(fileroot, md5_file)) as f: ++ for line in f: ++ m1 = re.search("[0-9a-f]{32}", line.lower()) ++ if m1: ++ break ++ ++ with open(dec_file) as f: ++ m2 = re.search("[0-9a-f]{32}", f.readline()) ++ except: ++ pass ++ ++ if m1 and m2 and 
m1.group() == m2.group(): ++ print >> flog, "Match: " + m1.group() ++ rv = 0 ++ elif not m1: ++ print >> flog, "****** Cannot find m1" ++ rv = 3 ++ elif not m2: ++ print >> flog, "****** Cannot find m2" ++ rv = 2 ++ else: ++ print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group() ++ rv = 1 ++ flog.close() ++ return rv ++ ++def scandir(root): ++ aconf = [] ++ ents = os.listdir(root) ++ ents.sort(key=str.lower) ++ for name in ents: ++ test_path = os.path.join(root, name) ++ if S_ISDIR(os.stat(test_path).st_mode): ++ files = os.listdir(test_path) ++ es_file = "?" ++ md5_file = "?" ++ for f in files: ++ (base, ext) = os.path.splitext(f) ++ if base[0] == '.': ++ pass ++ elif ext == ".bit" or ext == ".bin": ++ es_file = f ++ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")): ++ if md5_file == "?": ++ md5_file = f ++ elif base[-3:] == "yuv": ++ md5_file = f ++ aconf.append((1, name, es_file, md5_file)) ++ return aconf ++ ++def runtest(name, tests): ++ if not tests: ++ return True ++ for t in tests: ++ if name[0:len(t)] == t or name.find("/" + t) != -1: ++ return True ++ return False ++ ++def doconf(csva, tests, test_root, vcodec, dectype): ++ unx_failures = [] ++ unx_success = [] ++ failures = 0 ++ successes = 0 ++ for a in csva: ++ exp_test = int(a[0]) ++ if (exp_test and runtest(a[1], tests)): ++ name = a[1] ++ print "==== ", name, ++ sys.stdout.flush() ++ ++ rv = testone(os.path.join(test_root, name), name, a[2], a[3], a[4], dectype=dectype, vcodec=vcodec) ++ if (rv == 0): ++ successes += 1 ++ else: ++ failures += 1 ++ ++ if (rv == 0): ++ if exp_test == 2: ++ print ": * OK *" ++ unx_success.append(name) ++ else: ++ print ": ok" ++ elif exp_test == 2 and rv == 1: ++ print ": fail" ++ elif exp_test == 3 and rv == 2: ++ # Call an expected "crash" an abort ++ print ": abort" ++ else: ++ unx_failures.append(name) ++ if rv == 1: ++ print ": * FAIL *" ++ elif (rv == 2) : ++ print ": * CRASH *" ++ elif (rv == 3) : ++ print 
": * MD5 MISSING *" ++ else : ++ print ": * BANG *" ++ ++ if unx_failures or unx_success: ++ print "Unexpected Failures:", unx_failures ++ print "Unexpected Success: ", unx_success ++ else: ++ print "All tests normal:", successes, "ok,", failures, "failed" ++ ++ ++class ConfCSVDialect(csv.Dialect): ++ delimiter = ',' ++ doublequote = True ++ lineterminator = '\n' ++ quotechar='"' ++ quoting = csv.QUOTE_MINIMAL ++ skipinitialspace = True ++ strict = True ++ ++if __name__ == '__main__': ++ ++ argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester") ++ argp.add_argument("tests", nargs='*') ++ argp.add_argument("--pi4", action='store_true', help="Force pi4 cmd line") ++ argp.add_argument("--drm", action='store_true', help="Force v4l2 drm cmd line") ++ argp.add_argument("--vaapi", action='store_true', help="Force vaapi cmd line") ++ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test") ++ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir") ++ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename") ++ argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use") ++ args = argp.parse_args() ++ ++ if args.csvgen: ++ csv.writer(sys.stdout).writerows(scandir(args.test_root)) ++ exit(0) ++ ++ with open(args.csv, 'rt') as csvfile: ++ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())] ++ ++ dectype = CODEC_HEVC_RPI ++ if os.path.exists("/dev/rpivid-hevcmem"): ++ dectype = HWACCEL_RPI ++ if args.drm or os.path.exists("/sys/module/rpivid_hevc"): ++ dectype = HWACCEL_DRM ++ ++ if args.pi4: ++ dectype = HWACCEL_RPI ++ elif args.drm: ++ dectype = HWACCEL_DRM ++ elif args.vaapi: ++ dectype = HWACCEL_VAAPI ++ ++ doconf(csva, args.tests, args.test_root, args.vcodec, dectype) ++ +diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py +new file mode 100755 +index 0000000000..2fabe98c32 +--- /dev/null ++++ b/pi-util/ffperf.py +@@ -0,0 +1,127 @@ 
++#!/usr/bin/env python3
++
++import time
++import string
++import os
++import tempfile
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++class tstats:  # timing record for one stream: name, elapsed wall time, user and system CPU time
++    close_threshold = 0.01
++
++    def __init__(self, stats_dict=None):
++        if stats_dict != None:
++            self.name = stats_dict["name"]
++            self.elapsed = float(stats_dict["elapsed"])
++            self.user = float(stats_dict["user"])
++            self.sys = float(stats_dict["sys"])
++
++    def times_str(self):
++        ctime = self.sys + self.user
++        return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
++
++    def dict(self):
++        return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
++
++    def is_close(self, other):
++        return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
++
++    def __lt__(self, other):
++        return self.elapsed < other.elapsed
++    def __gt__(self, other):
++        return self.elapsed > other.elapsed
++
++    def time_file(name, prefix):  # no self: main() calls it as tstats.time_file(...); runs ./ffmpeg once and returns its timings
++        stats = tstats()
++        stats.name = name
++        start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++        cproc = subprocess.Popen(["./ffmpeg",
++                                  "-hwaccel", "rpi",
++                                  "-t", "30", "-i", prefix + name,
++                                  "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
++        pinfo = os.wait4(cproc.pid, 0)
++        end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++        stats.elapsed = end_time - start_time
++        stats.user = pinfo[2].ru_utime
++        stats.sys = pinfo[2].ru_stime
++        return stats
++
++
++def common_prefix(s1, s2):  # longest leading substring shared by s1 and s2
++    for i in range(min(len(s1),len(s2))):
++        if s1[i] != s2[i]:
++            return s1[:i]
++    return s1[:min(len(s1),len(s2))]  # no mismatch: the shorter string is the prefix; also safe when either is empty (i unbound)
++
++def main():
++    global flog  # log file handle shared with tstats.time_file
++
++    argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog="""
++To blank the screen before starting use "xdg-screensaver activate"
++(For some reason this doesn't seem to work from within python).
++""")
++
++    argp.add_argument("streams", nargs='*')
++    argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
++    argp.add_argument("--csv_in", help="CSV input filename")
++    argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
++    argp.add_argument("--repeat", default=3, type=int, help="Run repeat count")
++
++    args = argp.parse_args()
++
++    csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"])
++    csv_out.writeheader()
++
++    stats_in = {}
++    if args.csv_in != None:
++        with open(args.csv_in, 'r', newline='') as f_in:
++            stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
++
++    flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt")
++
++    streams = args.streams
++    if not streams:
++        if not stats_in:
++            print ("No source streams specified")
++            return 1
++        prefix = "" if args.prefix == None else args.prefix
++        streams = [k for k in stats_in]
++    elif args.prefix != None:
++        prefix = args.prefix
++    else:
++        prefix = streams[0]
++        for f in streams[1:]:
++            prefix = common_prefix(prefix, f)  # prefix can already be "" here when earlier streams share nothing
++        pp = prefix.rpartition(os.sep)
++        prefix = pp[0] + pp[1]
++        streams = [s[len(prefix):] for s in streams]
++
++    for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()):
++        print ("====", f)
++
++        t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
++        for i in range(args.repeat):
++            t = tstats.time_file(f, prefix)
++            print ("...", t.times_str())
++            if t0 > t:  # keep the fastest run
++                t0 = t
++
++        if t0.name in stats_in:
++            pstat = stats_in[t0.name]
++            print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str())
++
++        csv_out.writerow(t0.dict())
++
++        print ()
++
++    return 0
++
++
++if __name__ == '__main__':
++    sys.exit(main())  # sys.exit rather than the site-provided exit() helper
++
+diff --git a/pi-util/genpatch.sh b/pi-util/genpatch.sh
+new file mode 100755
+index 0000000000..0948a68a7a
+--- /dev/null
++++ b/pi-util/genpatch.sh
+@@ -0,0 +1,35 @@
++set -e
++
++NOPATCH=
++if [ "$1" = "--notag" ]; then  # POSIX test compares strings with '='; '==' is a bash extension
++  shift
++  NOPATCH=1
++fi
++
++if [ "$1" = "" ]; then
++  echo Usage: $0 [--notag] \<patch_tag\>
++  echo e.g.: $0 mmal_4
++  exit 1
++fi
++
++VERSION=`cat RELEASE`
++if [ "$VERSION" = "" ]; then
++  echo Can\'t find version RELEASE
++  exit 1
++fi
++
++PATCHFILE=../ffmpeg-$VERSION-$1.patch
++
++if [ $NOPATCH ]; then
++  echo Not tagged
++else
++  # Only continue if we are all committed
++  git diff --name-status --exit-code
++
++  PATCHTAG=pi/$VERSION/$1
++  echo Tagging: $PATCHTAG
++
++  git tag $PATCHTAG
++fi
++echo Generating patch: $PATCHFILE
++git diff n$VERSION -- > $PATCHFILE
+diff --git a/pi-util/make_array.py b/pi-util/make_array.py
+new file mode 100755
+index 0000000000..67b22d2d51
+--- /dev/null
++++ b/pi-util/make_array.py
+@@ -0,0 +1,23 @@
++#!/usr/bin/env python
++
++# Usage
++#   make_array file.bin
++#   Produces file.h with array of bytes.
++#
++import sys
++for file in sys.argv[1:]:
++    prefix,suffix = file.rsplit('.', 1)  # split on the last dot only, so paths such as ./dir.d/x.bin still unpack
++    assert suffix=='bin'
++    name=prefix.split('/')[-1]
++    print 'Converting',file
++    with open(prefix+'.h','wb') as out:
++        print >>out, 'static const unsigned char',name,'[] = {'
++        with open(file,'rb') as fd:
++            i = 0
++            for byte in fd.read():
++                print >>out, '0x%02x, ' % ord(byte),
++                i = i + 1
++                if i % 8 == 0:
++                    print >>out, ' // %04x' % (i - 8)
++        print >>out,'};'
++
+diff --git a/pi-util/mkinst.sh b/pi-util/mkinst.sh
+new file mode 100755
+index 0000000000..271a39e846
+--- /dev/null
++++ b/pi-util/mkinst.sh
+@@ -0,0 +1,5 @@
++set -e
++
++make install
++
++cp -r install/* ../vlc/sysroot/raspian_stretch_pi1-sysroot/usr
+diff --git a/pi-util/patkodi.sh b/pi-util/patkodi.sh
+new file mode 100644
+index 0000000000..dcd05a606e
+--- /dev/null
++++ b/pi-util/patkodi.sh
+@@ -0,0 +1,9 @@
++set -e
++KODIBASE=/home/jc/rpi/kodi/xbmc
++JOBS=-j20
++make $JOBS
++git diff xbmc/release/4.3-kodi > $KODIBASE/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
++make -C $KODIBASE/tools/depends/target/ffmpeg $JOBS
++make -C 
$KODIBASE/build install
++
++
+diff --git a/pi-util/perfcmp.py b/pi-util/perfcmp.py
+new file mode 100755
+index 0000000000..e44cfa0c3c
+--- /dev/null
++++ b/pi-util/perfcmp.py
+@@ -0,0 +1,101 @@
++#!/usr/bin/env python3
++
++import time
++import string
++import os
++import tempfile
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++class tstats:  # timing record for one stream: name, elapsed wall time, user and system CPU time
++    close_threshold = 0.01
++
++    def __init__(self, stats_dict=None):
++        if stats_dict != None:
++            self.name = stats_dict["name"]
++            self.elapsed = float(stats_dict["elapsed"])
++            self.user = float(stats_dict["user"])
++            self.sys = float(stats_dict["sys"])
++
++    def times_str(self):
++        ctime = self.sys + self.user
++        return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
++
++    def dict(self):
++        return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
++
++    def is_close(self, other):
++        return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
++
++    def __lt__(self, other):
++        return self.elapsed < other.elapsed
++    def __gt__(self, other):
++        return self.elapsed > other.elapsed
++
++    def time_file(name, prefix):  # NOTE(review): reads a global flog that nothing in this script defines - confirm it is unused here
++        stats = tstats()
++        stats.name = name
++        start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++        cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
++                                  "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
++        pinfo = os.wait4(cproc.pid, 0)
++        end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++        stats.elapsed = end_time - start_time
++        stats.user = pinfo[2].ru_utime
++        stats.sys = pinfo[2].ru_stime
++        return stats
++
++
++def common_prefix(s1, s2):  # longest leading substring shared by s1 and s2
++    for i in range(min(len(s1),len(s2))):
++        if s1[i] != s2[i]:
++            return s1[:i]
++    return s1[:min(len(s1),len(s2))]  # no mismatch: the shorter string is the prefix; also safe when either is empty (i unbound)
++
++def main():
++    argp = argparse.ArgumentParser(description="FFmpeg performance compare")
++
++    argp.add_argument("stream0", help="CSV to compare")
++    argp.add_argument("stream1", nargs='?', default="ffperf_out.csv", help="CSV to compare")
++
++    args = argp.parse_args()
++
++    with open(args.stream0, 'r', newline='') as f_in:
++        stats0 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
++    with open(args.stream1, 'r', newline='') as f_in:
++        stats1 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
++
++    print (args.stream0, "<<-->>", args.stream1)
++    print ()
++
++    for f in sorted(stats0.keys() | stats1.keys(), key=lambda x : "~" * x.count(os.sep) + x.lower()):
++        if not (f in stats0) :
++            print (" XX :", f)
++            continue
++        if not (f in stats1) :
++            print (" XX :", f)
++            continue
++
++        s0 = stats0[f]
++        s1 = stats1[f]
++
++        pcent = ((s0.elapsed - s1.elapsed) / s0.elapsed) * 100.0
++        thresh = 0.3
++        tc = 6
++
++        nchar = min(tc - 1, int(abs(pcent) / thresh))
++        cc = " -- " if nchar == 0 else "<" * nchar + " " * (tc - nchar) if pcent < 0 else " " * (tc - nchar) + ">" * nchar
++
++        print ("%6.2f %s%6.2f (%+5.2f) : %s" %
++            (s0.elapsed, cc, s1.elapsed, pcent, f))
++
++    return 0
++
++
++if __name__ == '__main__':
++    sys.exit(main())  # sys.exit rather than the site-provided exit() helper
++
+diff --git a/pi-util/qem.sh b/pi-util/qem.sh
+new file mode 100755
+index 0000000000..a4dbb6eacd
+--- /dev/null
++++ b/pi-util/qem.sh
+@@ -0,0 +1,9 @@
++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
++QASM=python\ ../local/bin/qasm.py
++SRC_FILE=libavcodec/rpi_hevc_shader.qasm
++DST_BASE=shader
++
++cp libavcodec/rpi_hevc_shader_cmd.h $TARGET_DIR
++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
++
+diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py
+new file mode 100755
+index 0000000000..5935a11ca5
+--- /dev/null
++++ b/pi-util/v3dusage.py
+@@ -0,0 +1,128 @@
++#!/usr/bin/env python
++
++import sys
++import argparse
++import re
++
++def do_logparse(logname):  # parse a VC log file and print per-unit busy time plus QPU cycle / L2 cache totals
++
++    rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
++    rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
++    rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
++    rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
++
++    ttotal = {'idle':0.0}
++    tstart = {}
++    qctotal = {}
++    qtstotal = {}
++    l2hits = {}
++    l2total = {}
++    time0 = None
++    idle_start = None
++    qpu_op_no = 0
++    op_count = 0
++
++    with open(logname, "rt") as infile:
++        for line in infile:
++            match = rmatch.match(line)
++            if match:
++#                print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
++                time = float(match.group(1))
++                unit = match.group(3)
++                opstart = not match.group(2)  # lines without the "done " marker start an operation
++                optype = match.group(7)
++                hascb = match.group(8) != "0"
++
++                if unit == 'qpu1':
++                    unit = unit + "." + str(qpu_op_no)
++                    if not opstart:
++                        if hascb or optype == 'EXECUTE_SYNC':
++                            qpu_op_no = 0
++                        else:
++                            qpu_op_no += 1
++
++                # Ignore sync type
++                if optype == 'EXECUTE_SYNC':
++                    continue
++
++                if not time0:
++                    time0 = time
++
++                if opstart:
++                    tstart[unit] = time;
++                elif unit in tstart:
++                    op_count += 1
++                    if not unit in ttotal:
++                        ttotal[unit] = 0.0
++                    ttotal[unit] += time - tstart[unit]
++                    del tstart[unit]
++
++                if not idle_start and not tstart:
++                    idle_start = time
++                elif idle_start and tstart:
++                    ttotal['idle'] += time - idle_start
++                    idle_start = None
++
++            match = rqcycle.match(line)
++            if match:
++                unit = "qpu1." + str(qpu_op_no)
++                if not unit in qctotal:
++                    qctotal[unit] = 0
++                qctotal[unit] += int(match.group(2))
++
++            match = rqtscycle.match(line)
++            if match:
++                unit = "qpu1." + str(qpu_op_no)
++                if not unit in qtstotal:
++                    qtstotal[unit] = 0
++                qtstotal[unit] += int(match.group(2))
++
++            match = rl2hits.match(line)
++            if match:
++                unit = "qpu1." + str(qpu_op_no)
++                if not unit in l2total:
++                    l2total[unit] = 0
++                    l2hits[unit] = 0
++                l2total[unit] += int(match.group(3))
++                if match.group(2) == "hits":
++                    l2hits[unit] += int(match.group(3))
++
++
++    if not time0:
++        print "No v3d profile records found"
++    else:
++        tlogged = time - time0  # time still holds the timestamp of the last matched op line
++
++        print "Logged time:", tlogged, " Op count:", op_count
++        for unit in sorted(ttotal):
++            print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
++        print
++        for unit in sorted(qctotal):
++            if not unit in qtstotal:
++                qtstotal[unit] = 0;
++            print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
++            if unit in l2total:
++                print b' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
++
++
++
++if __name__ == '__main__':
++    argp = argparse.ArgumentParser(
++        formatter_class=argparse.RawDescriptionHelpFormatter,
++        description="QPU/VPU perf summary from VC logging",
++        epilog = """
++Will also summarise TMU stalls if logging requests set in qpu noflush param
++in the profiled code.
++
++Example use:
++  vcgencmd set_logging level=0xc0
++
++  sudo vcdbg log msg >& t.log
++  v3dusage.py t.log
++""")
++
++    argp.add_argument("logfile")
++    args = argp.parse_args()
++
++    do_logparse(args.logfile)
++
diff --git a/tools/depends/target/ffmpeg/CMakeLists.txt b/tools/depends/target/ffmpeg/CMakeLists.txt
index e35877591173a..df2fa1e8b4d83 100644
--- a/tools/depends/target/ffmpeg/CMakeLists.txt
+++ b/tools/depends/target/ffmpeg/CMakeLists.txt
@@ -18,6 +18,13 @@ if(CROSSCOMPILING)
   message(STATUS "CROSS: ${ffmpeg_conf}")
 endif()
 
+#if(CORE_PLATFORM_NAME STREQUAL rbpi)
+  string(CONCAT CMAKE_C_FLAGS ${CMAKE_C_FLAGS} " -I/opt/vc/include -I/opt/vc/include/interface/vcos/pthreads -I/opt/vc/include/interface/vmcs_host/linux")
+  string(CONCAT CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS} " -L/opt/vc/lib")
+  string(CONCAT CMAKE_MODULE_LINKER_FLAGS ${CMAKE_MODULE_LINKER_FLAGS} " -L/opt/vc/lib")
+  list(APPEND ffmpeg_conf --enable-sand --enable-v4l2-request --enable-libdrm --enable-libudev --disable-hwaccel=h264_v4l2request --disable-hwaccel=mpeg2_v4l2request --disable-hwaccel=vp8_v4l2request)
+#endif()
+
 if(CMAKE_C_FLAGS)
   list(APPEND ffmpeg_conf --extra-cflags=${CMAKE_C_FLAGS})
 endif()
@@ -111,10 +118,9 @@ externalproject_add(ffmpeg
                     CONFIGURE_COMMAND ${pkgconf} ${pkgconf_path} <SOURCE_DIR>/configure
                       --prefix=${CMAKE_INSTALL_PREFIX}
                       --extra-version="kodi-${FFMPEG_VER}"
-                      --disable-devices
                       --disable-doc
                       --disable-ffplay
-                      --disable-ffmpeg
+                      --enable-ffmpeg
                       --disable-ffprobe
                       --enable-gpl
                       --enable-runtime-cpudetect
diff --git a/xbmc/cores/VideoPlayer/Buffers/VideoBufferDRMPRIME.cpp b/xbmc/cores/VideoPlayer/Buffers/VideoBufferDRMPRIME.cpp
index b85097d37b867..037f66db72b50 100644
--- a/xbmc/cores/VideoPlayer/Buffers/VideoBufferDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/Buffers/VideoBufferDRMPRIME.cpp
@@ -19,6 +19,18 @@ extern "C"
 namespace DRMPRIME
 {
 
+std::string GetColorimetry(const VideoPicture& picture)
+{
+  switch (picture.color_space)
+  {
+    case AVCOL_SPC_BT2020_CL:
+    case AVCOL_SPC_BT2020_NCL:
+      return "BT2020_RGB";
+  }
+
+  return "Default";
+}
+
 std::string GetColorEncoding(const VideoPicture& picture)
 {
   switch (picture.color_space)
diff --git a/xbmc/cores/VideoPlayer/Buffers/VideoBufferDRMPRIME.h b/xbmc/cores/VideoPlayer/Buffers/VideoBufferDRMPRIME.h
index ebced6f5eff9b..8345f833114ee 100644
--- a/xbmc/cores/VideoPlayer/Buffers/VideoBufferDRMPRIME.h
+++ b/xbmc/cores/VideoPlayer/Buffers/VideoBufferDRMPRIME.h
@@ -34,6 +34,7 @@ enum hdmi_eotf
   HDMI_EOTF_BT_2100_HLG,
 };
 
+std::string GetColorimetry(const VideoPicture& picture);
 std::string GetColorEncoding(const VideoPicture& picture);
 std::string GetColorRange(const VideoPicture& picture);
 uint8_t GetEOTF(const VideoPicture& picture);
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Overlay/DVDOverlayCodecFFmpeg.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Overlay/DVDOverlayCodecFFmpeg.cpp
index 59479c049a813..9baf98719e703 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Overlay/DVDOverlayCodecFFmpeg.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Overlay/DVDOverlayCodecFFmpeg.cpp
@@ -233,7 +233,8 @@ CDVDOverlay* CDVDOverlayCodecFFmpeg::GetOverlay()
     }
 
     RENDER_STEREO_MODE render_stereo_mode = CServiceBroker::GetWinSystem()->GetGfxContext().GetStereoMode();
-    if (render_stereo_mode != RENDER_STEREO_MODE_OFF)
+    if (render_stereo_mode != RENDER_STEREO_MODE_OFF &&
+        m_pCodecContext->codec_id != AV_CODEC_ID_HDMV_PGS_SUBTITLE)
     {
       if (rect.h > m_height / 2)
       {
@@ -262,8 +263,19 @@ CDVDOverlay* CDVDOverlayCodecFFmpeg::GetOverlay()
     overlay->height = rect.h;
     overlay->bForced = rect.flags != 0;
 
-    overlay->source_width = m_width;
-    overlay->source_height = m_height;
+    if (render_stereo_mode != RENDER_STEREO_MODE_OFF &&
+        m_pCodecContext->codec_id == AV_CODEC_ID_HDMV_PGS_SUBTITLE)
+    {
+      // For PGS subtitles we don't set source_width and source_height here.
+      // Later this will lead to 'video alignment' being chosen for that subtitle.
+      overlay->source_width = 0;
+      overlay->source_height = 0;
+    }
+    else
+    {
+      overlay->source_width = m_width;
+      overlay->source_height = m_height;
+    }
 
     uint8_t* s = rect.data[0];
     uint8_t* t = overlay->data;
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index 8024c20816ffe..028512e028715 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -37,6 +37,7 @@ namespace
 {
 
 constexpr const char* SETTING_VIDEOPLAYER_USEPRIMEDECODERFORHW{"videoplayer.useprimedecoderforhw"};
+constexpr const char* SETTING_VIDEOPLAYER_DISABLENONHEVC{"videoplayer.disablenonhevc"};
 
 static void ReleaseBuffer(void* opaque, uint8_t* data)
 {
@@ -123,6 +124,15 @@ void CDVDVideoCodecDRMPRIME::Register()
 
   setting->SetVisible(true);
 
+  setting = settings->GetSetting(SETTING_VIDEOPLAYER_DISABLENONHEVC);
+  if (!setting)
+  {
+    CLog::Log(LOGERROR, "Failed to load setting for: {}", SETTING_VIDEOPLAYER_DISABLENONHEVC);
+    return;
+  }
+
+  setting->SetVisible(true);
+
   CDVDFactoryCodec::RegisterHWVideoCodec("drm_prime", CDVDVideoCodecDRMPRIME::Create);
 }
 
@@ -144,6 +154,9 @@ static const AVCodecHWConfig* FindHWConfig(const AVCodec* codec)
   if (!CServiceBroker::GetSettingsComponent()->GetSettings()->GetBool(
           SETTING_VIDEOPLAYER_USEPRIMEDECODERFORHW))
     return nullptr;
+  if (CServiceBroker::GetSettingsComponent()->GetSettings()->GetBool(
+          SETTING_VIDEOPLAYER_DISABLENONHEVC) && codec->id != AV_CODEC_ID_HEVC)
+    return nullptr;
 
   const AVCodecHWConfig* config = nullptr;
   for (int n = 0; (config = avcodec_get_hw_config(codec, n)); n++)
@@ -515,6 +528,7 @@ void CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
   pVideoPicture->iRepeatPicture = 0;
 
   pVideoPicture->iFlags = 0;
+  pVideoPicture->iFlags |= !(m_pFrame->flags & AV_FRAME_FLAG_CORRUPT) ? 0 : DVP_FLAG_DROPPED;
   pVideoPicture->iFlags |= m_pFrame->interlaced_frame ? DVP_FLAG_INTERLACED : 0;
   pVideoPicture->iFlags |= m_pFrame->top_field_first ? DVP_FLAG_TOP_FIELD_FIRST : 0;
 
@@ -523,6 +537,8 @@ void CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
                            ? DVD_NOPTS_VALUE
                            : static_cast<double>(pts) * DVD_TIME_BASE / AV_TIME_BASE;
   pVideoPicture->dts = DVD_NOPTS_VALUE;
+
+  CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::{} - iFlags:{} flags:{} pts:{}", __FUNCTION__, pVideoPicture->iFlags, m_pFrame->flags, pts);
 }
 
 CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideoPicture)
diff --git a/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/VideoLayerBridgeDRMPRIME.cpp b/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/VideoLayerBridgeDRMPRIME.cpp
index 4f07ffc0b84aa..666c15e6aae22 100644
--- a/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/VideoLayerBridgeDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/VideoLayerBridgeDRMPRIME.cpp
@@ -35,8 +35,18 @@ void CVideoLayerBridgeDRMPRIME::Disable()
   m_DRM->AddProperty(plane, "FB_ID", 0);
   m_DRM->AddProperty(plane, "CRTC_ID", 0);
 
-  // disable HDR metadata
   auto connector = m_DRM->GetConnector();
+
+  bool result;
+  uint64_t value;
+  std::tie(result, value) = connector->GetPropertyValue("Colorspace", "Default");
+  if (result)
+  {
+    CLog::Log(LOGDEBUG, "CVideoLayerBridgeDRMPRIME::{} - setting connector colorspace to Default", __FUNCTION__);
+    m_DRM->AddProperty(connector, "Colorspace", value);
+  }
+
+  // disable HDR metadata
   if (connector->SupportsProperty("HDR_OUTPUT_METADATA"))
   {
     m_DRM->AddProperty(connector, "HDR_OUTPUT_METADATA", 0);
@@ -173,6 +183,16 @@ void CVideoLayerBridgeDRMPRIME::Configure(CVideoBufferDRMPRIME* buffer)
     m_DRM->AddProperty(plane, "COLOR_RANGE", value);
 
   auto connector = m_DRM->GetConnector();
+
+  std::tie(result, value) = connector->GetPropertyValue("Colorspace", GetColorimetry(picture));
+  if (result)
+  {
+    CLog::Log(LOGDEBUG, "CVideoLayerBridgeDRMPRIME::{} - setting connector colorspace to {}", __FUNCTION__,
+              GetColorimetry(picture));
+    m_DRM->AddProperty(connector, "Colorspace", value);
+    m_DRM->SetActive(true);
+  }
+
   if (connector->SupportsProperty("HDR_OUTPUT_METADATA"))
   {
     m_hdr_metadata.metadata_type = HDMI_STATIC_METADATA_TYPE1;
@@ -246,11 +266,11 @@ void CVideoLayerBridgeDRMPRIME::SetVideoPlane(CVideoBufferDRMPRIME* buffer, cons
   m_DRM->AddProperty(plane, "CRTC_ID", m_DRM->GetCrtc()->GetCrtcId());
   m_DRM->AddProperty(plane, "SRC_X", 0);
   m_DRM->AddProperty(plane, "SRC_Y", 0);
-  m_DRM->AddProperty(plane, "SRC_W", buffer->GetWidth() << 16);
+  m_DRM->AddProperty(plane, "SRC_W", (buffer->GetWidth()-2) << 16);
   m_DRM->AddProperty(plane, "SRC_H", buffer->GetHeight() << 16);
   m_DRM->AddProperty(plane, "CRTC_X", static_cast<int32_t>(destRect.x1) & ~1);
   m_DRM->AddProperty(plane, "CRTC_Y", static_cast<int32_t>(destRect.y1) & ~1);
-  m_DRM->AddProperty(plane, "CRTC_W", (static_cast<uint32_t>(destRect.Width()) + 1) & ~1);
+  m_DRM->AddProperty(plane, "CRTC_W", ((static_cast<uint32_t>(destRect.Width()) + 1) & ~1) - 2); // -2 moved inside the call; it was applied to AddProperty's discarded result
   m_DRM->AddProperty(plane, "CRTC_H", (static_cast<uint32_t>(destRect.Height()) + 1) & ~1);
 }
 
diff --git a/xbmc/filesystem/SpecialProtocol.cpp b/xbmc/filesystem/SpecialProtocol.cpp
index 21789557509c6..14e5ffab1c774 100644
--- a/xbmc/filesystem/SpecialProtocol.cpp
+++ b/xbmc/filesystem/SpecialProtocol.cpp
@@ -296,6 +296,6 @@ std::string CSpecialProtocol::GetPath(const std::string &key)
   std::map<std::string, std::string>::iterator it = m_pathMap.find(key);
   if (it != m_pathMap.end())
     return it->second;
-  assert(false);
+  //assert(false);
   return "";
 }
diff --git a/xbmc/settings/Settings.cpp b/xbmc/settings/Settings.cpp
index 2b487339d6c33..b9fbe306ce7ac 100644
--- a/xbmc/settings/Settings.cpp
+++ b/xbmc/settings/Settings.cpp
@@ -140,6 +140,7 @@ constexpr const char* CSettings::SETTING_VIDEOPLAYER_RENDERMETHOD;
 constexpr const char* CSettings::SETTING_VIDEOPLAYER_HQSCALERS;
 constexpr const char* CSettings::SETTING_VIDEOPLAYER_USEMEDIACODEC;
 constexpr const char* CSettings::SETTING_VIDEOPLAYER_USEMEDIACODECSURFACE;
+constexpr const char* CSettings::SETTING_VIDEOPLAYER_DISABLE_NON_HEVC;
 constexpr const char* CSettings::SETTING_VIDEOPLAYER_USEVDPAU;
 constexpr const char* CSettings::SETTING_VIDEOPLAYER_USEVDPAUMIXER;
 constexpr const char* CSettings::SETTING_VIDEOPLAYER_USEVDPAUMPEG2;
@@ -982,6 +983,7 @@ void CSettings::InitializeISettingCallbacks()
   settingSet.insert(CSettings::SETTING_VIDEOSCREEN_TESTPATTERN);
   settingSet.insert(CSettings::SETTING_VIDEOPLAYER_USEMEDIACODEC);
   settingSet.insert(CSettings::SETTING_VIDEOPLAYER_USEMEDIACODECSURFACE);
+  settingSet.insert(CSettings::SETTING_VIDEOPLAYER_DISABLE_NON_HEVC);
   settingSet.insert(CSettings::SETTING_AUDIOOUTPUT_VOLUMESTEPS);
   settingSet.insert(CSettings::SETTING_SOURCE_VIDEOS);
   settingSet.insert(CSettings::SETTING_SOURCE_MUSIC);
diff --git a/xbmc/settings/Settings.h b/xbmc/settings/Settings.h
index 8276c8a5aa737..00967cf908497 100644
--- a/xbmc/settings/Settings.h
+++ b/xbmc/settings/Settings.h
@@ -118,6 +118,7 @@ class CSettings : public CSettingsBase, public CSettingCreator, public CSettingC
   static constexpr auto SETTING_VIDEOPLAYER_USEMEDIACODEC = "videoplayer.usemediacodec";
   static constexpr auto SETTING_VIDEOPLAYER_USEMEDIACODECSURFACE =
       "videoplayer.usemediacodecsurface";
+  static constexpr auto SETTING_VIDEOPLAYER_DISABLE_NON_HEVC = "videoplayer.disablenonhevc";
   static constexpr auto SETTING_VIDEOPLAYER_USEVDPAU = "videoplayer.usevdpau";
   static constexpr auto SETTING_VIDEOPLAYER_USEVDPAUMIXER = "videoplayer.usevdpaumixer";
   static constexpr auto SETTING_VIDEOPLAYER_USEVDPAUMPEG2 = "videoplayer.usevdpaumpeg2";
diff --git a/xbmc/windowing/gbm/drm/DRMAtomic.cpp b/xbmc/windowing/gbm/drm/DRMAtomic.cpp
index 37166b8dcddca..45b15b9ed9037 100644
--- a/xbmc/windowing/gbm/drm/DRMAtomic.cpp
+++ b/xbmc/windowing/gbm/drm/DRMAtomic.cpp
@@ -98,7 +98,7 @@ void CDRMAtomic::DrmAtomicCommit(int fb_id, int flags, bool rendered, bool video
     AddProperty(m_gui_plane, "CRTC_ID", m_crtc->GetCrtcId());
     AddProperty(m_gui_plane, "SRC_X", 0);
     AddProperty(m_gui_plane, "SRC_Y", 0);
-    AddProperty(m_gui_plane, "SRC_W", m_width << 16);
+    AddProperty(m_gui_plane, "SRC_W", (m_width-2) << 16);
     AddProperty(m_gui_plane, "SRC_H", m_height << 16);
     AddProperty(m_gui_plane, "CRTC_X", 0);
     AddProperty(m_gui_plane, "CRTC_Y", 0);
@@ -109,7 +109,7 @@ void CDRMAtomic::DrmAtomicCommit(int fb_id, int flags, bool rendered, bool video
     //   }
     //   else
     {
-      AddProperty(m_gui_plane, "CRTC_W", m_mode->hdisplay);
+      AddProperty(m_gui_plane, "CRTC_W", m_mode->hdisplay-2);
       AddProperty(m_gui_plane, "CRTC_H", m_mode->vdisplay);
     }
 