Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

aarch64: Fix for support of xonly #3812

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions codec/common/arm64/mc_aarch64_neon.S
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,10 @@

#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"
// Constant table loaded as one 128-bit vector by the McHorVer20*/McHorVer22*
// routines in this file. It is placed in read-only data rather than in the
// code section so that it can still be read when code pages are not readable.
// The section is selected with the generic .section directive because the
// bare .rodata shorthand is not a directive that GNU as recognizes.
// NOTE(review): .section .rodata and .previous are ELF spellings - confirm
// how Mach-O (Apple) builds of this file are expected to place the table.
.section .rodata
// 2^4 = 16-byte boundary; the table is fetched with a single ldr q load.
.align 4
// Eight 16-bit lanes (16 bytes). The 1, -5, 20 entries are presumably the
// 6-tap interpolation filter taps - TODO confirm against the filter macros.
filter_para: .short 0, 1, -5, 20, 0, 0, 0, 0
// Return to the section that was active before the table.
.previous

.macro FILTER_6TAG_8BITS1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
// { // input:src[-2], src[-1], src[0], src[1], src[2], src[3], dst_d, multiplier a/b; working: v18, v19
Expand Down Expand Up @@ -1912,7 +1914,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width17_AArch64_neon
mov x5, #16
movi v0.8h, #20, lsl #0
movi v1.8h, #5, lsl #0
ldr q22, filter_para
adrp x6, filter_para
ldr q22, [x6, #:lo12:filter_para]
w17_h_mc_luma_loop:
ld1 {v2.16b, v3.16b}, [x0], x1 //only use 22(17+5); v2=src[-2]

Expand Down Expand Up @@ -1946,7 +1949,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer20Width9_AArch64_neon
mov x5, #8
movi v0.8h, #20, lsl #0
movi v1.8h, #5, lsl #0
ldr q22, filter_para
adrp x6, filter_para
ldr q22, [x6, #:lo12:filter_para]
w9_h_mc_luma_loop:
ld1 {v2.16b}, [x0], x1 //only use 14(9+5); v2=src[-2]
mov v3.d[0], v2.d[1]
Expand Down Expand Up @@ -2012,7 +2016,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width17_AArch64_neon
movi v1.8h, #5, lsl #0
sub x3, x3, #16
mov x5, #16
ldr q29, filter_para
adrp x6, filter_para
ldr q29, [x6, #:lo12:filter_para]

sub x4, x4, #1

Expand Down Expand Up @@ -2215,7 +2220,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width9_AArch64_neon
movi v1.8h, #5, lsl #0
sub x3, x3, #8
mov x5, #8
ldr q29, filter_para
adrp x6, filter_para
ldr q29, [x6, #:lo12:filter_para]
sub x4, x4, #1

//prfm pldl1strm, [x0]
Expand Down Expand Up @@ -2315,7 +2321,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN McHorVer22Width5_AArch64_neon
movi v1.8h, #5, lsl #0
sub x3, x3, #4
mov x5, #4
ldr q29, filter_para
adrp x6, filter_para
ldr q29, [x6, #:lo12:filter_para]
sub x4, x4, #1

//prfm pldl1strm, [x0]
Expand Down
19 changes: 14 additions & 5 deletions codec/decoder/core/arm64/intra_pred_aarch64_neon.S
Original file line number Diff line number Diff line change
Expand Up @@ -307,9 +307,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredDcTop_AArch64_neon
.endr
WELS_ASM_AARCH64_FUNC_END

// Constant tables loaded by WelsDecoderIChromaPredPlane_AArch64_neon, kept in
// read-only data instead of the code section. The section is selected with
// the generic .section directive because the bare .rodata shorthand is not a
// directive that GNU as recognizes.
// NOTE(review): .section .rodata and .previous are ELF spellings - confirm
// how Mach-O (Apple) builds of this file are expected to place the tables.
.section .rodata
// 2^4 = 16-byte boundary. Each table below is exactly 16 bytes, so the second
// label is 16-byte aligned as well; both are fetched with ldr q loads.
.align 4
// Eight 16-bit lanes: 17*1 .. 17*4, repeated twice.
intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4
// Eight 16-bit lanes: the integers -3 .. 4.
intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4
// Return to the section that was active before the tables.
.previous

WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredPlane_AArch64_neon
sxtw x1, w1
Expand Down Expand Up @@ -339,8 +341,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderIChromaPredPlane_AArch64_neon

uxtl v1.8h, v1.8b
uxtl v0.8h, v0.8b
ldr q2, intra_1_to_4
ldr q3, intra_m3_to_p4
adrp x4, intra_1_to_4
adrp x5, intra_m3_to_p4
ldr q2, [x4, #:lo12:intra_1_to_4]
ldr q3, [x5, #:lo12:intra_m3_to_p4]
dup v4.8h, v0.h[3]
dup v5.8h, v0.h[7]
add v4.8h, v4.8h, v5.8h
Expand Down Expand Up @@ -456,9 +460,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredDcLeft_AArch64_neon
WELS_ASM_AARCH64_FUNC_END


// Constant tables loaded by WelsDecoderI16x16LumaPredPlane_AArch64_neon, kept
// in read-only data instead of the code section. The section is selected with
// the generic .section directive because the bare .rodata shorthand is not a
// directive that GNU as recognizes.
// NOTE(review): .section .rodata and .previous are ELF spellings - confirm
// how Mach-O (Apple) builds of this file are expected to place the tables.
.section .rodata
// 2^4 = 16-byte boundary. intra_1_to_8 is exactly 16 bytes, so
// intra_m7_to_p8 and its second half (+16) are 16-byte aligned as well.
.align 4
// Eight 16-bit lanes: 5*1 .. 5*8.
intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40
// Sixteen 16-bit lanes (32 bytes): the integers -7 .. 8. Read as two
// 128-bit halves, at offset 0 and offset 16.
intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8
// Return to the section that was active before the tables.
.previous

WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_AArch64_neon
sxtw x1, w1
Expand Down Expand Up @@ -492,7 +498,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_AArch64_neon
uxtl v3.8h, v3.8b
sub v0.8h, v1.8h, v0.8h
sub v2.8h, v3.8h, v2.8h
ldr q4, intra_1_to_8
adrp x4, intra_1_to_8
ldr q4, [x4, #:lo12:intra_1_to_8]
mul v0.8h, v0.8h, v4.8h
mul v2.8h, v2.8h, v4.8h
saddlv s0, v0.8h
Expand All @@ -501,8 +508,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_AArch64_neon
sqrshrn v0.4h, v0.4S, #6 // b is in v0.h[0]
sqrshrn v2.4h, v2.4S, #6 // c is in v2.h[0]
shl v1.8h, v1.8h, #4 // a is in v1.h[7]
ldr q4, intra_m7_to_p8
ldr q5, intra_m7_to_p8 + 16
adrp x4, intra_m7_to_p8
add x5, x4, 16
ldr q4, [x4, #:lo12:intra_m7_to_p8]
ldr q5, [x5, #:lo12:intra_m7_to_p8]
dup v1.8h, v1.h[7]
dup v3.8h, v1.h[7]
mla v1.8h, v4.8h, v0.h[0]
Expand Down
19 changes: 14 additions & 5 deletions codec/encoder/core/arm64/intra_pred_aarch64_neon.S
Original file line number Diff line number Diff line change
Expand Up @@ -307,9 +307,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredDcTop_AArch64_neon
.endr
WELS_ASM_AARCH64_FUNC_END

// Constant tables loaded by WelsIChromaPredPlane_AArch64_neon, kept in
// read-only data instead of the code section. The section is selected with
// the generic .section directive because the bare .rodata shorthand is not a
// directive that GNU as recognizes.
// NOTE(review): .section .rodata and .previous are ELF spellings - confirm
// how Mach-O (Apple) builds of this file are expected to place the tables.
.section .rodata
// 2^4 = 16-byte boundary. Each table below is exactly 16 bytes, so the second
// label is 16-byte aligned as well; both are fetched with ldr q loads.
.align 4
// Eight 16-bit lanes: 17*1 .. 17*4, repeated twice.
intra_1_to_4: .short 17*1, 17*2, 17*3, 17*4, 17*1, 17*2, 17*3, 17*4
// Eight 16-bit lanes: the integers -3 .. 4.
intra_m3_to_p4: .short -3, -2, -1, 0, 1, 2, 3, 4
// Return to the section that was active before the tables.
.previous

WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch64_neon
SIGN_EXTENSION x2,w2
Expand Down Expand Up @@ -339,8 +341,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsIChromaPredPlane_AArch64_neon

uxtl v1.8h, v1.8b
uxtl v0.8h, v0.8b
ldr q2, intra_1_to_4
ldr q3, intra_m3_to_p4
adrp x4, intra_1_to_4
adrp x5, intra_m3_to_p4
ldr q2, [x4, #:lo12:intra_1_to_4]
ldr q3, [x5, #:lo12:intra_m3_to_p4]
dup v4.8h, v0.h[3]
dup v5.8h, v0.h[7]
add v4.8h, v4.8h, v5.8h
Expand Down Expand Up @@ -437,9 +441,11 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredDcLeft_AArch64_neon
WELS_ASM_AARCH64_FUNC_END


// Constant tables loaded by WelsI16x16LumaPredPlane_AArch64_neon, kept in
// read-only data instead of the code section. The section is selected with
// the generic .section directive because the bare .rodata shorthand is not a
// directive that GNU as recognizes.
// NOTE(review): .section .rodata and .previous are ELF spellings - confirm
// how Mach-O (Apple) builds of this file are expected to place the tables.
.section .rodata
// 2^4 = 16-byte boundary. intra_1_to_8 is exactly 16 bytes, so
// intra_m7_to_p8 and its second half (+16) are 16-byte aligned as well.
.align 4
// Eight 16-bit lanes: 5*1 .. 5*8.
intra_1_to_8: .short 5, 10, 15, 20, 25, 30, 35, 40
// Sixteen 16-bit lanes (32 bytes): the integers -7 .. 8. Read as two
// 128-bit halves, at offset 0 and offset 16.
intra_m7_to_p8: .short -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8
// Return to the section that was active before the tables.
.previous
//void WelsI16x16LumaPredPlane_AArch64_neon (uint8_t* pPred, uint8_t* pRef, const int32_t kiStride);
WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon
SIGN_EXTENSION x2,w2
Expand Down Expand Up @@ -473,7 +479,8 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon
uxtl v3.8h, v3.8b
sub v0.8h, v1.8h, v0.8h
sub v2.8h, v3.8h, v2.8h
ldr q4, intra_1_to_8
adrp x4, intra_1_to_8
ldr q4, [x4, #:lo12:intra_1_to_8]
mul v0.8h, v0.8h, v4.8h
mul v2.8h, v2.8h, v4.8h
saddlv s0, v0.8h
Expand All @@ -482,8 +489,10 @@ WELS_ASM_AARCH64_FUNC_BEGIN WelsI16x16LumaPredPlane_AArch64_neon
sqrshrn v0.4h, v0.4S, #6 // b is in v0.h[0]
sqrshrn v2.4h, v2.4S, #6 // c is in v2.h[0]
shl v1.8h, v1.8h, #4 // a is in v1.h[7]
ldr q4, intra_m7_to_p8
ldr q5, intra_m7_to_p8 + 16
adrp x4, intra_m7_to_p8
add x5, x4, 16
ldr q4, [x4, #:lo12:intra_m7_to_p8]
ldr q5, [x5, #:lo12:intra_m7_to_p8]
dup v1.8h, v1.h[7]
dup v3.8h, v1.h[7]
mla v1.8h, v4.8h, v0.h[0]
Expand Down
11 changes: 8 additions & 3 deletions codec/encoder/core/arm64/svc_motion_estimation_aarch64_neon.S
Original file line number Diff line number Diff line change
Expand Up @@ -283,16 +283,21 @@ _hash_assign_loop_x4_rem:
_hash_assign_end:
WELS_ASM_AARCH64_FUNC_END

// Constant tables loaded at the top of
// FillQpelLocationByFeatureValue_AArch64_neon, kept in read-only data instead
// of the code section. The section is selected with the generic .section
// directive because the bare .rodata shorthand is not a directive that GNU as
// recognizes.
// NOTE(review): .section .rodata and .previous are ELF spellings - confirm
// how Mach-O (Apple) builds of this file are expected to place the tables.
.section .rodata
// 2^4 = 16-byte boundary. Every table below is exactly 16 bytes, so all three
// labels are 16-byte aligned; each is fetched with a single ldr q load.
.align 4
// Eight 16-bit lanes each; only the low four lanes are non-zero in the two
// increment tables. Presumably per-iteration motion-vector steps for four
// positions at a time - TODO confirm against the loop body.
mv_x_inc_x4: .short 0x10, 0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00
mv_y_inc_x4: .short 0x04, 0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00
// Low four lanes hold 0, 4, 8, 12; the high four lanes are zero.
mx_x_offset_x4: .short 0x00, 0x04, 0x08, 0x0c, 0x00, 0x00, 0x00, 0x00
// Return to the section that was active before the tables.
.previous

WELS_ASM_AARCH64_FUNC_BEGIN FillQpelLocationByFeatureValue_AArch64_neon
// void (uint16_t* pFeatureOfBlock, const int32_t kiWidth, const int32_t kiHeight, uint16_t** pFeatureValuePointerList)
ldr q7, mv_x_inc_x4
ldr q6, mv_y_inc_x4
ldr q5, mx_x_offset_x4
adrp x4, mv_x_inc_x4
adrp x5, mv_y_inc_x4
adrp x6, mx_x_offset_x4
ldr q7, [x4, #:lo12:mv_x_inc_x4]
ldr q6, [x5, #:lo12:mv_y_inc_x4]
ldr q5, [x6, #:lo12:mx_x_offset_x4]
SIGN_EXTENSION x1,w1
SIGN_EXTENSION x2,w2
eor v4.16b, v4.16b, v4.16b
Expand Down