diff options
Diffstat (limited to 'src/libffmpeg/libavcodec/armv4l')
-rw-r--r-- | src/libffmpeg/libavcodec/armv4l/dsputil_arm_s.S | 696 | ||||
-rw-r--r-- | src/libffmpeg/libavcodec/armv4l/dsputil_iwmmxt.c | 188 | ||||
-rw-r--r-- | src/libffmpeg/libavcodec/armv4l/dsputil_iwmmxt_rnd.h | 1114 | ||||
-rw-r--r-- | src/libffmpeg/libavcodec/armv4l/mathops.h | 49 | ||||
-rw-r--r-- | src/libffmpeg/libavcodec/armv4l/mpegvideo_armv5te.c | 213 | ||||
-rw-r--r-- | src/libffmpeg/libavcodec/armv4l/mpegvideo_iwmmxt.c | 119 | ||||
-rw-r--r-- | src/libffmpeg/libavcodec/armv4l/simple_idct_armv5te.S | 718 |
7 files changed, 0 insertions, 3097 deletions
diff --git a/src/libffmpeg/libavcodec/armv4l/dsputil_arm_s.S b/src/libffmpeg/libavcodec/armv4l/dsputil_arm_s.S deleted file mode 100644 index 2a3ee9c50..000000000 --- a/src/libffmpeg/libavcodec/armv4l/dsputil_arm_s.S +++ /dev/null @@ -1,696 +0,0 @@ -@ -@ ARMv4L optimized DSP utils -@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp> -@ -@ This file is part of FFmpeg. -@ -@ FFmpeg is free software; you can redistribute it and/or -@ modify it under the terms of the GNU Lesser General Public -@ License as published by the Free Software Foundation; either -@ version 2.1 of the License, or (at your option) any later version. -@ -@ FFmpeg is distributed in the hope that it will be useful, -@ but WITHOUT ANY WARRANTY; without even the implied warranty of -@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -@ Lesser General Public License for more details. -@ -@ You should have received a copy of the GNU Lesser General Public -@ License along with FFmpeg; if not, write to the Free Software -@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -@ - -.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4 - mov \Rd0, \Rn0, lsr #(\shift * 8) - mov \Rd1, \Rn1, lsr #(\shift * 8) - mov \Rd2, \Rn2, lsr #(\shift * 8) - mov \Rd3, \Rn3, lsr #(\shift * 8) - orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8) - orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8) - orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8) - orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8) -.endm -.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2 - mov \R0, \R0, lsr #(\shift * 8) - orr \R0, \R0, \R1, lsl #(32 - \shift * 8) - mov \R1, \R1, lsr #(\shift * 8) - orr \R1, \R1, \R2, lsl #(32 - \shift * 8) -.endm -.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2 - mov \Rdst0, \Rsrc0, lsr #(\shift * 8) - mov \Rdst1, \Rsrc1, lsr #(\shift * 8) - orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8)) - orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8)) -.endm - -.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask - @ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) - @ Rmask = 0xFEFEFEFE - @ Rn = destroy - eor \Rd0, \Rn0, \Rm0 - eor \Rd1, \Rn1, \Rm1 - orr \Rn0, \Rn0, \Rm0 - orr \Rn1, \Rn1, \Rm1 - and \Rd0, \Rd0, \Rmask - and \Rd1, \Rd1, \Rmask - sub \Rd0, \Rn0, \Rd0, lsr #1 - sub \Rd1, \Rn1, \Rd1, lsr #1 -.endm - -.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask - @ Rd = (Rn & Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1) - @ Rmask = 0xFEFEFEFE - @ Rn = destroy - eor \Rd0, \Rn0, \Rm0 - eor \Rd1, \Rn1, \Rm1 - and \Rn0, \Rn0, \Rm0 - and \Rn1, \Rn1, \Rm1 - and \Rd0, \Rd0, \Rmask - and \Rd1, \Rd1, \Rmask - add \Rd0, \Rn0, \Rd0, lsr #1 - add \Rd1, \Rn1, \Rd1, lsr #1 -.endm - -@ ---------------------------------------------------------------- - .align 8 - .global put_pixels16_arm -put_pixels16_arm: - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r11, lr} @ R14 is also called LR - adr r5, 5f - ands r4, r1, #3 - bic r1, r1, #3 - add r5, r5, r4, lsl #2 - ldrne pc, [r5] -1: - ldmia r1, {r4-r7} - add r1, r1, r2 - stmia r0, {r4-r7} - pld [r1] - subs r3, r3, #1 - add r0, r0, r2 - bne 1b - ldmfd sp!, {r4-r11, pc} - .align 8 -2: - ldmia r1, {r4-r8} - add r1, r1, r2 - ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stmia r0, {r9-r12} - add r0, r0, r2 - bne 2b - ldmfd sp!, {r4-r11, pc} - .align 8 -3: - ldmia r1, {r4-r8} - add r1, r1, r2 - ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stmia r0, {r9-r12} - add r0, r0, r2 - bne 3b - ldmfd sp!, {r4-r11, pc} - .align 8 -4: - ldmia r1, {r4-r8} - add r1, r1, r2 - ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8 - pld [r1] - subs r3, r3, #1 - stmia r0, {r9-r12} - add r0, r0, r2 - bne 4b - ldmfd sp!, {r4-r11,pc} - .align 8 -5: - .word 1b - .word 2b - .word 3b - .word 4b - -@ ---------------------------------------------------------------- - .align 8 - .global put_pixels8_arm -put_pixels8_arm: - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r5,lr} @ R14 is also called LR - adr r5, 5f - ands r4, r1, #3 - bic r1, r1, #3 - add r5, r5, r4, lsl #2 - ldrne pc, [r5] -1: - ldmia r1, {r4-r5} - add r1, r1, r2 - subs r3, r3, #1 - pld [r1] - stmia r0, {r4-r5} - add r0, r0, r2 - bne 1b - ldmfd sp!, {r4-r5,pc} - .align 8 -2: - ldmia r1, {r4-r5, r12} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 2b - ldmfd sp!, {r4-r5,pc} - .align 8 -3: - ldmia r1, {r4-r5, r12} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 3b - ldmfd sp!, {r4-r5,pc} - .align 8 -4: - ldmia r1, {r4-r5, r12} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12 - pld [r1] - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 4b - ldmfd sp!, {r4-r5,pc} - .align 8 -5: - .word 1b - .word 2b - .word 3b - .word 4b - -@ ---------------------------------------------------------------- - .align 8 - .global put_pixels8_x2_arm -put_pixels8_x2_arm: - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r10,lr} @ R14 is also called LR - adr r5, 5f - ands r4, r1, #3 - ldr r12, [r5] - add r5, r5, r4, lsl #2 - bic r1, r1, #3 - ldrne pc, [r5] -1: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 - pld [r1] - RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - subs r3, r3, #1 - stmia r0, {r8-r9} - add r0, r0, r2 - bne 1b - ldmfd sp!, {r4-r10,pc} - .align 8 -2: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 - ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 - pld [r1] - RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 2b - ldmfd sp!, {r4-r10,pc} - .align 8 -3: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 - ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 - pld [r1] - RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 3b - ldmfd sp!, {r4-r10,pc} - .align 8 -4: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 - pld [r1] - RND_AVG32 r8, r9, r6, r7, r5, r10, r12 - subs r3, r3, #1 - stmia r0, {r8-r9} - add r0, r0, r2 - bne 4b - ldmfd sp!, {r4-r10,pc} @@ update PC with LR content. - .align 8 -5: - .word 0xFEFEFEFE - .word 2b - .word 3b - .word 4b - - .align 8 - .global put_no_rnd_pixels8_x2_arm -put_no_rnd_pixels8_x2_arm: - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r10,lr} @ R14 is also called LR - adr r5, 5f - ands r4, r1, #3 - ldr r12, [r5] - add r5, r5, r4, lsl #2 - bic r1, r1, #3 - ldrne pc, [r5] -1: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - subs r3, r3, #1 - stmia r0, {r8-r9} - add r0, r0, r2 - bne 1b - ldmfd sp!, {r4-r10,pc} - .align 8 -2: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10 - ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 2b - ldmfd sp!, {r4-r10,pc} - .align 8 -3: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10 - ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12 - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 3b - ldmfd sp!, {r4-r10,pc} - .align 8 -4: - ldmia r1, {r4-r5, r10} - add r1, r1, r2 - ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10 - pld [r1] - NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12 - subs r3, r3, #1 - stmia r0, {r8-r9} - add r0, r0, r2 - bne 4b - ldmfd sp!, {r4-r10,pc} @@ update PC with LR content. - .align 8 -5: - .word 0xFEFEFEFE - .word 2b - .word 3b - .word 4b - - -@ ---------------------------------------------------------------- - .align 8 - .global put_pixels8_y2_arm -put_pixels8_y2_arm: - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r11,lr} @ R14 is also called LR - adr r5, 5f - ands r4, r1, #3 - mov r3, r3, lsr #1 - ldr r12, [r5] - add r5, r5, r4, lsl #2 - bic r1, r1, #3 - ldrne pc, [r5] -1: - ldmia r1, {r4-r5} - add r1, r1, r2 -6: ldmia r1, {r6-r7} - add r1, r1, r2 - pld [r1] - RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - ldmia r1, {r4-r5} - add r1, r1, r2 - stmia r0, {r8-r9} - add r0, r0, r2 - pld [r1] - RND_AVG32 r8, r9, r6, r7, r4, r5, r12 - subs r3, r3, #1 - stmia r0, {r8-r9} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -2: - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 -6: ldmia r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -3: - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 -6: ldmia r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -4: - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 -6: ldmia r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9 - RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 - subs r3, r3, #1 - RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - - .align 8 -5: - .word 0xFEFEFEFE - .word 2b - .word 3b - .word 4b - - .align 8 - .global put_no_rnd_pixels8_y2_arm -put_no_rnd_pixels8_y2_arm: - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r11,lr} @ R14 is also called LR - adr r5, 5f - ands r4, r1, #3 - mov r3, r3, lsr #1 - ldr r12, [r5] - add r5, r5, r4, lsl #2 - bic r1, r1, #3 - ldrne pc, [r5] -1: - ldmia r1, {r4-r5} - add r1, r1, r2 -6: ldmia r1, {r6-r7} - add r1, r1, r2 - pld [r1] - NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12 - ldmia r1, {r4-r5} - add r1, r1, r2 - stmia r0, {r8-r9} - add r0, r0, r2 - pld [r1] - NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12 - subs r3, r3, #1 - stmia r0, {r8-r9} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -2: - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 -6: ldmia r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -3: - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 -6: ldmia r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -4: - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 -6: ldmia r1, {r7-r9} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9 - NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - ldmia r1, {r4-r6} - add r1, r1, r2 - pld [r1] - ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6 - subs r3, r3, #1 - NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12 - stmia r0, {r10-r11} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} - .align 8 -5: - .word 0xFEFEFEFE - .word 2b - .word 3b - .word 4b - -@ ---------------------------------------------------------------- -.macro RND_XY2_IT align, rnd - @ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202) - @ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2) -.if \align == 0 - ldmia r1, {r6-r8} -.elseif \align == 3 - ldmia r1, {r5-r7} -.else - ldmia r1, {r8-r10} -.endif - add r1, r1, r2 - pld [r1] -.if \align == 0 - ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8 -.elseif \align == 1 - ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10 - ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10 -.elseif \align == 2 - ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10 - ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10 -.elseif \align == 3 - ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7 -.endif - ldr r14, [r12, #0] @ 0x03030303 - tst r3, #1 - and r8, r4, r14 - and r9, r5, r14 - and r10, r6, r14 - and r11, r7, r14 -.if \rnd == 1 - ldreq r14, [r12, #16] @ 0x02020202 -.else - ldreq r14, [r12, #28] @ 0x01010101 -.endif - add r8, r8, r10 - add r9, r9, r11 - addeq r8, r8, r14 - addeq r9, r9, r14 - ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2 - and r4, r14, r4, lsr #2 - and r5, r14, r5, lsr #2 - and r6, r14, r6, lsr #2 - and r7, r14, r7, lsr #2 - add r10, r4, r6 - add r11, r5, r7 -.endm - -.macro RND_XY2_EXPAND align, rnd - RND_XY2_IT \align, \rnd -6: stmfd sp!, {r8-r11} - RND_XY2_IT \align, \rnd - ldmfd sp!, {r4-r7} - add r4, r4, r8 - add r5, r5, r9 - add r6, r6, r10 - add r7, r7, r11 - ldr r14, [r12, #24] @ 0x0F0F0F0F - and r4, r14, r4, lsr #2 - and r5, r14, r5, lsr #2 - add r4, r4, r6 - add r5, r5, r7 - subs r3, r3, #1 - stmia r0, {r4-r5} - add r0, r0, r2 - bne 6b - ldmfd sp!, {r4-r11,pc} -.endm - - .align 8 - .global put_pixels8_xy2_arm -put_pixels8_xy2_arm: - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r11,lr} @ R14 is also called LR - adrl r12, 5f - ands r4, r1, #3 - add r5, r12, r4, lsl #2 - bic r1, r1, #3 - ldrne pc, [r5] -1: - RND_XY2_EXPAND 0, 1 - - .align 8 -2: - RND_XY2_EXPAND 1, 1 - - .align 8 -3: - RND_XY2_EXPAND 2, 1 - - .align 8 -4: - RND_XY2_EXPAND 3, 1 - -5: - .word 0x03030303 - .word 2b - .word 3b - .word 4b - .word 0x02020202 - .word 0xFCFCFCFC >> 2 - .word 0x0F0F0F0F - .word 0x01010101 - - .align 8 - .global put_no_rnd_pixels8_xy2_arm -put_no_rnd_pixels8_xy2_arm: - @ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h) - @ block = word aligned, pixles = unaligned - pld [r1] - stmfd sp!, {r4-r11,lr} @ R14 is also called LR - adrl r12, 5f - ands r4, r1, #3 - add r5, r12, r4, lsl #2 - bic r1, r1, #3 - ldrne pc, [r5] -1: - RND_XY2_EXPAND 0, 0 - - .align 8 -2: - RND_XY2_EXPAND 1, 0 - - .align 8 -3: - RND_XY2_EXPAND 2, 0 - - .align 8 -4: - RND_XY2_EXPAND 3, 0 - -5: - .word 0x03030303 - .word 2b - .word 3b - .word 4b - .word 0x02020202 - .word 0xFCFCFCFC >> 2 - .word 0x0F0F0F0F - .word 0x01010101 diff --git a/src/libffmpeg/libavcodec/armv4l/dsputil_iwmmxt.c b/src/libffmpeg/libavcodec/armv4l/dsputil_iwmmxt.c deleted file mode 100644 index d7401e760..000000000 --- a/src/libffmpeg/libavcodec/armv4l/dsputil_iwmmxt.c +++ /dev/null @@ -1,188 +0,0 @@ -/* - * iWMMXt optimized DSP utils - * Copyright (c) 2004 AGAWA Koji - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "../dsputil.h" - -#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt -#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12"); -#define WAVG2B "wavg2b" -#include "dsputil_iwmmxt_rnd.h" -#undef DEF -#undef SET_RND -#undef WAVG2B - -#define DEF(x, y) x ## _ ## y ##_iwmmxt -#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12"); -#define WAVG2B "wavg2br" -#include "dsputil_iwmmxt_rnd.h" -#undef DEF -#undef SET_RND -#undef WAVG2BR - -// need scheduling -#define OP(AVG) \ - asm volatile ( \ - /* alignment */ \ - "and r12, %[pixels], #7 \n\t" \ - "bic %[pixels], %[pixels], #7 \n\t" \ - "tmcr wcgr1, r12 \n\t" \ - \ - "wldrd wr0, [%[pixels]] \n\t" \ - "wldrd wr1, [%[pixels], #8] \n\t" \ - "add %[pixels], %[pixels], %[line_size] \n\t" \ - "walignr1 wr4, wr0, wr1 \n\t" \ - \ - "1: \n\t" \ - \ - "wldrd wr2, [%[pixels]] \n\t" \ - "wldrd wr3, [%[pixels], #8] \n\t" \ - "add %[pixels], %[pixels], %[line_size] \n\t" \ - "pld [%[pixels]] \n\t" \ - "walignr1 wr5, wr2, wr3 \n\t" \ - AVG " wr6, wr4, wr5 \n\t" \ - "wstrd wr6, [%[block]] \n\t" \ - "add %[block], %[block], %[line_size] \n\t" \ - \ - "wldrd wr0, [%[pixels]] \n\t" \ - "wldrd wr1, [%[pixels], #8] \n\t" \ - "add %[pixels], %[pixels], %[line_size] \n\t" \ - "walignr1 wr4, wr0, wr1 \n\t" \ - "pld [%[pixels]] \n\t" \ - AVG " wr6, wr4, wr5 \n\t" \ - "wstrd wr6, [%[block]] \n\t" \ - "add %[block], %[block], %[line_size] \n\t" \ - \ - "subs %[h], %[h], #2 \n\t" \ - "bne 1b \n\t" \ - : [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \ - : [line_size]"r"(line_size) \ - : "memory", "r12"); -void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - OP("wavg2br"); -} -void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - OP("wavg2b"); -} -#undef OP - -void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size) -{ - uint8_t *pixels2 = pixels + line_size; - - __asm__ __volatile__ ( - "mov r12, #4 \n\t" - "1: \n\t" - "pld [%[pixels], %[line_size2]] \n\t" - "pld [%[pixels2], %[line_size2]] \n\t" - "wldrd wr4, [%[pixels]] \n\t" - "wldrd wr5, [%[pixels2]] \n\t" - "pld [%[block], #32] \n\t" - "wunpckelub wr6, wr4 \n\t" - "wldrd wr0, [%[block]] \n\t" - "wunpckehub wr7, wr4 \n\t" - "wldrd wr1, [%[block], #8] \n\t" - "wunpckelub wr8, wr5 \n\t" - "wldrd wr2, [%[block], #16] \n\t" - "wunpckehub wr9, wr5 \n\t" - "wldrd wr3, [%[block], #24] \n\t" - "add %[block], %[block], #32 \n\t" - "waddhss wr10, wr0, wr6 \n\t" - "waddhss wr11, wr1, wr7 \n\t" - "waddhss wr12, wr2, wr8 \n\t" - "waddhss wr13, wr3, wr9 \n\t" - "wpackhus wr14, wr10, wr11 \n\t" - "wpackhus wr15, wr12, wr13 \n\t" - "wstrd wr14, [%[pixels]] \n\t" - "add %[pixels], %[pixels], %[line_size2] \n\t" - "subs r12, r12, #1 \n\t" - "wstrd wr15, [%[pixels2]] \n\t" - "add %[pixels2], %[pixels2], %[line_size2] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(pixels2) - : [line_size2]"r"(line_size << 1) - : "cc", "memory", "r12"); -} - -static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ - return; -} - -int mm_flags; /* multimedia extension flags */ - -int mm_support(void) -{ - return 0; /* TODO, implement proper detection */ -} - -void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx) -{ - mm_flags = mm_support(); - - if (avctx->dsp_mask) { - if (avctx->dsp_mask & FF_MM_FORCE) - mm_flags |= (avctx->dsp_mask & 0xffff); - else - mm_flags &= ~(avctx->dsp_mask & 0xffff); - } - - if (!(mm_flags & MM_IWMMXT)) return; - - c->add_pixels_clamped = add_pixels_clamped_iwmmxt; - - c->put_pixels_tab[0][0] = put_pixels16_iwmmxt; - c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt; - c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt; - c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt; - c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt; - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt; - c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt; - - c->put_pixels_tab[1][0] = put_pixels8_iwmmxt; - c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt; - c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt; - c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt; - c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt; - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt; - c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt; - - c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt; - c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt; - c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt; - c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt; - c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt; - c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt; - c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt; - - c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt; - c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt; - c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt; - c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt; - c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt; - c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt; - c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt; -} diff --git a/src/libffmpeg/libavcodec/armv4l/dsputil_iwmmxt_rnd.h b/src/libffmpeg/libavcodec/armv4l/dsputil_iwmmxt_rnd.h deleted file mode 100644 index 51ba61c47..000000000 --- a/src/libffmpeg/libavcodec/armv4l/dsputil_iwmmxt_rnd.h +++ /dev/null @@ -1,1114 +0,0 @@ -/* - * iWMMXt optimized DSP utils - * copyright (c) 2004 AGAWA Koji - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - __asm__ __volatile__ ( - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r4, %[pixels], %[line_size] \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "1: \n\t" - "wldrd wr0, [%[pixels]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wldrd wr1, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr3, [r4] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wldrd wr4, [r4, #8] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr8, wr0, wr1 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr10, wr3, wr4 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr10, [r5] \n\t" - "add r5, r5, %[line_size] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) - : - : "memory", "r4", "r5", "r12"); -} - -void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - __asm__ __volatile__ ( - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r4, %[pixels], %[line_size] \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "1: \n\t" - "wldrd wr0, [%[pixels]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wldrd wr1, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr3, [r4] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wldrd wr4, [r4, #8] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr8, wr0, wr1 \n\t" - "wldrd wr0, [%[block]] \n\t" - "wldrd wr2, [r5] \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr10, wr3, wr4 \n\t" - WAVG2B" wr8, wr8, wr0 \n\t" - WAVG2B" wr10, wr10, wr2 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr10, [r5] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "add r5, r5, %[line_size] \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) - : - : "memory", "r4", "r5", "r12"); -} - -void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - __asm__ __volatile__ ( - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r4, %[pixels], %[line_size] \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "1: \n\t" - "wldrd wr0, [%[pixels]] \n\t" - "wldrd wr1, [%[pixels], #8] \n\t" - "subs %[h], %[h], #2 \n\t" - "wldrd wr2, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr3, [r4] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr8, wr0, wr1 \n\t" - "wldrd wr4, [r4, #8] \n\t" - "walignr1 wr9, wr1, wr2 \n\t" - "wldrd wr5, [r4, #16] \n\t" - "add r4, r4, %[line_size] \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr10, wr3, wr4 \n\t" - "wstrd wr8, [%[block]] \n\t" - "walignr1 wr11, wr4, wr5 \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr10, [r5] \n\t" - "wstrd wr11, [r5, #8] \n\t" - "add r5, r5, %[line_size] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) - : - : "memory", "r4", "r5", "r12"); -} - -void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - __asm__ __volatile__ ( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "1: \n\t" - "wldrd wr0, [%[pixels]] \n\t" - "wldrd wr1, [%[pixels], #8] \n\t" - "subs %[h], %[h], #2 \n\t" - "wldrd wr2, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr3, [r4] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr8, wr0, wr1 \n\t" - "wldrd wr4, [r4, #8] \n\t" - "walignr1 wr9, wr1, wr2 \n\t" - "wldrd wr5, [r4, #16] \n\t" - "add r4, r4, %[line_size] \n\t" - "wldrd wr0, [%[block]] \n\t" - "pld [r4] \n\t" - "wldrd wr1, [%[block], #8] \n\t" - "pld [r4, #32] \n\t" - "wldrd wr2, [r5] \n\t" - "walignr1 wr10, wr3, wr4 \n\t" - "wldrd wr3, [r5, #8] \n\t" - WAVG2B" wr8, wr8, wr0 \n\t" - WAVG2B" wr9, wr9, wr1 \n\t" - WAVG2B" wr10, wr10, wr2 \n\t" - "wstrd wr8, [%[block]] \n\t" - "walignr1 wr11, wr4, wr5 \n\t" - WAVG2B" wr11, wr11, wr3 \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr10, [r5] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "wstrd wr11, [r5, #8] \n\t" - "add r5, r5, %[line_size] \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - "bne 1b \n\t" - : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) - : - : "memory", "r4", "r5", "r12"); -} - -void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ __volatile__( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "tmcr wcgr2, r12 \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr13, [r4] \n\t" - "pld [%[pixels]] \n\t" - "wldrd wr14, [r4, #8] \n\t" - "pld [%[pixels], #32] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr2, wr13, wr14 \n\t" - "wmoveq wr4, wr11 \n\t" - "wmoveq wr6, wr14 \n\t" - "walignr2ne wr4, wr10, wr11 \n\t" - "walignr2ne wr6, wr13, wr14 \n\t" - WAVG2B" wr0, wr0, wr4 \n\t" - WAVG2B" wr2, wr2, wr6 \n\t" - "wstrd wr0, [%[block]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr2, [r5] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "add r5, r5, %[line_size] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ __volatile__( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "tmcr wcgr2, r12 \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr13, [r4] \n\t" - "pld [%[pixels]] \n\t" - "wldrd wr14, [r4, #8] \n\t" - "pld [%[pixels], #32] \n\t" - "wldrd wr15, [r4, #16] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - "walignr1 wr2, wr13, wr14 \n\t" - "walignr1 wr3, wr14, wr15 \n\t" - "wmoveq wr4, wr11 \n\t" - "wmoveq wr5, wr12 \n\t" - "wmoveq wr6, wr14 \n\t" - "wmoveq wr7, wr15 \n\t" - "walignr2ne wr4, wr10, wr11 \n\t" - "walignr2ne wr5, wr11, wr12 \n\t" - "walignr2ne wr6, wr13, wr14 \n\t" - "walignr2ne wr7, wr14, wr15 \n\t" - WAVG2B" wr0, wr0, wr4 \n\t" - WAVG2B" wr1, wr1, wr5 \n\t" - "wstrd wr0, [%[block]] \n\t" - WAVG2B" wr2, wr2, wr6 \n\t" - "wstrd wr1, [%[block], #8] \n\t" - WAVG2B" wr3, wr3, wr7 \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr2, [r5] \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr3, [r5, #8] \n\t" - "add r5, r5, %[line_size] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ __volatile__( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "tmcr wcgr2, r12 \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr13, [r4] \n\t" - "pld [%[pixels]] \n\t" - "wldrd wr14, [r4, #8] \n\t" - "pld [%[pixels], #32] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr2, wr13, wr14 \n\t" - "wmoveq wr4, wr11 \n\t" - "wmoveq wr6, wr14 \n\t" - "walignr2ne wr4, wr10, wr11 \n\t" - "wldrd wr10, [%[block]] \n\t" - "walignr2ne wr6, wr13, wr14 \n\t" - "wldrd wr12, [r5] \n\t" - WAVG2B" wr0, wr0, wr4 \n\t" - WAVG2B" wr2, wr2, wr6 \n\t" - WAVG2B" wr0, wr0, wr10 \n\t" - WAVG2B" wr2, wr2, wr12 \n\t" - "wstrd wr0, [%[block]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr2, [r5] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "add r5, r5, %[line_size] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ __volatile__( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "add r4, %[pixels], %[line_size]\n\t" - "tmcr wcgr2, r12 \n\t" - "add r5, %[block], %[line_size] \n\t" - "mov %[line_size], %[line_size], lsl #1 \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "wldrd wr13, [r4] \n\t" - "pld [%[pixels]] \n\t" - "wldrd wr14, [r4, #8] \n\t" - "pld [%[pixels], #32] \n\t" - "wldrd wr15, [r4, #16] \n\t" - "add r4, r4, %[line_size] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "pld [r4] \n\t" - "pld [r4, #32] \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - "walignr1 wr2, wr13, wr14 \n\t" - "walignr1 wr3, wr14, wr15 \n\t" - "wmoveq wr4, wr11 \n\t" - "wmoveq wr5, wr12 \n\t" - "wmoveq wr6, wr14 \n\t" - "wmoveq wr7, wr15 \n\t" - "walignr2ne wr4, wr10, wr11 \n\t" - "walignr2ne wr5, wr11, wr12 \n\t" - "walignr2ne wr6, wr13, wr14 \n\t" - "walignr2ne wr7, wr14, wr15 \n\t" - "wldrd wr10, [%[block]] \n\t" - WAVG2B" wr0, wr0, wr4 \n\t" - "wldrd wr11, [%[block], #8] \n\t" - WAVG2B" wr1, wr1, wr5 \n\t" - "wldrd wr12, [r5] \n\t" - WAVG2B" wr2, wr2, wr6 \n\t" - "wldrd wr13, [r5, #8] \n\t" - WAVG2B" wr3, wr3, wr7 \n\t" - WAVG2B" wr0, wr0, wr10 \n\t" - WAVG2B" wr1, wr1, wr11 \n\t" - WAVG2B" wr2, wr2, wr12 \n\t" - WAVG2B" wr3, wr3, wr13 \n\t" - "wstrd wr0, [%[block]] \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr1, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wstrd wr2, [r5] \n\t" - "pld [%[block]] \n\t" - "wstrd wr3, [r5, #8] \n\t" - "add r5, r5, %[line_size] \n\t" - "pld [%[block], #32] \n\t" - "pld [r5] \n\t" - "pld [r5, #32] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - :"r4", "r5", "r12", "memory"); -} - -void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - __asm__ __volatile__( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "pld [%[block]] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr4, wr10, wr11 \n\t" - "wldrd wr10, [%[block]] \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr8, wr8, wr10 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "pld [%[block]] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "wldrd wr10, [%[block]] \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr8, wr8, wr10 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "subs %[h], %[h], #2 \n\t" - "pld [%[block]] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "cc", "memory", "r12"); -} - -void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - __asm__ __volatile__( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr4, wr10, wr11 \n\t" - "walignr1 wr5, wr11, wr12 \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr9, wr1, wr5 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr9, wr1, wr5 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "subs %[h], %[h], #2 \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - int stride = line_size; - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - __asm__ __volatile__( - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "and r12, %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "pld [%[block]] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - - "1: \n\t" - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr4, wr10, wr11 \n\t" - "walignr1 wr5, wr11, wr12 \n\t" - "wldrd wr10, [%[block]] \n\t" - "wldrd wr11, [%[block], #8] \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr9, wr1, wr5 \n\t" - WAVG2B" wr8, wr8, wr10 \n\t" - WAVG2B" wr9, wr9, wr11 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "wldrd wr10, [%[pixels]] \n\t" - "wldrd wr11, [%[pixels], #8] \n\t" - "pld [%[block]] \n\t" - "wldrd wr12, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr0, wr10, wr11 \n\t" - "walignr1 wr1, wr11, wr12 \n\t" - "wldrd wr10, [%[block]] \n\t" - "wldrd wr11, [%[block], #8] \n\t" - WAVG2B" wr8, wr0, wr4 \n\t" - WAVG2B" wr9, wr1, wr5 \n\t" - WAVG2B" wr8, wr8, wr10 \n\t" - WAVG2B" wr9, wr9, wr11 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "subs %[h], %[h], #2 \n\t" - "pld [%[block]] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) - : - : "r4", "r5", "r12", "memory"); -} - -void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ __volatile__( - "pld [%[pixels]] \n\t" - "mov r12, #2 \n\t" - "pld [%[pixels], #32] \n\t" - "tmcr wcgr0, r12 \n\t" /* for shift value */ - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "add r12, r12, #1 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "tmcr wcgr2, r12 \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "cmp r12, #8 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - - "1: \n\t" - // [wr0 wr1 wr2 wr3] - // [wr4 wr5 wr6 wr7] <= * - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr6, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr4, wr6 \n\t" - "wunpckehub wr5, wr6 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr4, wr4, wr8 \n\t" - "waddhus wr5, wr5, wr9 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "subs %[h], %[h], #2 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) - : [line_size]"r"(line_size) - : "r12", "memory"); -} - -void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ __volatile__( - "pld [%[pixels]] \n\t" - "mov r12, #2 \n\t" - "pld [%[pixels], #32] \n\t" - "tmcr wcgr0, r12 \n\t" /* for shift value */ - /* alignment */ - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "tmcr wcgr2, r12 \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr3, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr2, wr3 \n\t" - "wunpckehub wr3, wr3 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr2, wr2, wr10 \n\t" - "waddhus wr3, wr3, wr11 \n\t" - - "1: \n\t" - // [wr0 wr1 wr2 wr3] - // [wr4 wr5 wr6 wr7] <= * - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr6, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr7, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr4, wr6 \n\t" - "wunpckehub wr5, wr6 \n\t" - "wunpckelub wr6, wr7 \n\t" - "wunpckehub wr7, wr7 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr4, wr4, wr8 \n\t" - "waddhus wr5, wr5, wr9 \n\t" - "waddhus wr6, wr6, wr10 \n\t" - "waddhus wr7, wr7, wr11 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr10, wr2, wr6 \n\t" - "waddhus wr11, wr3, wr7 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "waddhus wr10, wr10, wr15 \n\t" - "waddhus wr11, wr11, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wsrlhg wr10, wr10, wcgr0 \n\t" - "wsrlhg wr11, wr11, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wpackhus wr9, wr10, wr11 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr3, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr2, wr3 \n\t" - "wunpckehub wr3, wr3 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr2, wr2, wr10 \n\t" - "waddhus wr3, wr3, wr11 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr10, wr2, wr6 \n\t" - "waddhus wr11, wr3, wr7 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "waddhus wr10, wr10, wr15 \n\t" - "waddhus wr11, wr11, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wsrlhg wr10, wr10, wcgr0 \n\t" - "wsrlhg wr11, wr11, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wpackhus wr9, wr10, wr11 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - "subs %[h], %[h], #2 \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) - : [line_size]"r"(line_size) - : "r12", "memory"); -} - -void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ __volatile__( - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "pld [%[pixels]] \n\t" - "mov r12, #2 \n\t" - "pld [%[pixels], #32] \n\t" - "tmcr wcgr0, r12 \n\t" /* for shift value */ - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "add r12, r12, #1 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "tmcr wcgr2, r12 \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "cmp r12, #8 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - - "1: \n\t" - // [wr0 wr1 wr2 wr3] - // [wr4 wr5 wr6 wr7] <= * - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr6, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr4, wr6 \n\t" - "wunpckehub wr5, wr6 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr4, wr4, wr8 \n\t" - "waddhus wr5, wr5, wr9 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "wldrd wr12, [%[block]] \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - WAVG2B" wr8, wr8, wr12 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "wldrd wr12, [%[pixels]] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr13, [%[pixels], #8] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "wmoveq wr10, wr13 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "wldrd wr12, [%[block]] \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "subs %[h], %[h], #2 \n\t" - WAVG2B" wr8, wr8, wr12 \n\t" - "wstrd wr8, [%[block]] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) - : [line_size]"r"(line_size) - : "r12", "memory"); -} - -void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h) -{ - // [wr0 wr1 wr2 wr3] for previous line - // [wr4 wr5 wr6 wr7] for current line - SET_RND(wr15); // =2 for rnd and =1 for no_rnd version - __asm__ __volatile__( - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "pld [%[pixels]] \n\t" - "mov r12, #2 \n\t" - "pld [%[pixels], #32] \n\t" - "tmcr wcgr0, r12 \n\t" /* for shift value */ - /* alignment */ - "and r12, %[pixels], #7 \n\t" - "bic %[pixels], %[pixels], #7 \n\t" - "tmcr wcgr1, r12 \n\t" - "add r12, r12, #1 \n\t" - "tmcr wcgr2, r12 \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "pld [%[pixels]] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr3, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr2, wr3 \n\t" - "wunpckehub wr3, wr3 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr2, wr2, wr10 \n\t" - "waddhus wr3, wr3, wr11 \n\t" - - "1: \n\t" - // [wr0 wr1 wr2 wr3] - // [wr4 wr5 wr6 wr7] <= * - "wldrd wr12, [%[pixels]] \n\t" - "cmp r12, #8 \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr6, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr7, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr4, wr6 \n\t" - "wunpckehub wr5, wr6 \n\t" - "wunpckelub wr6, wr7 \n\t" - "wunpckehub wr7, wr7 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr4, wr4, wr8 \n\t" - "waddhus wr5, wr5, wr9 \n\t" - "waddhus wr6, wr6, wr10 \n\t" - "waddhus wr7, wr7, wr11 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr10, wr2, wr6 \n\t" - "waddhus wr11, wr3, wr7 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "waddhus wr10, wr10, wr15 \n\t" - "waddhus wr11, wr11, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wldrd wr12, [%[block]] \n\t" - "wldrd wr13, [%[block], #8] \n\t" - "wsrlhg wr10, wr10, wcgr0 \n\t" - "wsrlhg wr11, wr11, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wpackhus wr9, wr10, wr11 \n\t" - WAVG2B" wr8, wr8, wr12 \n\t" - WAVG2B" wr9, wr9, wr13 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - - // [wr0 wr1 wr2 wr3] <= * - // [wr4 wr5 wr6 wr7] - "wldrd wr12, [%[pixels]] \n\t" - "pld [%[block]] \n\t" - "wldrd wr13, [%[pixels], #8] \n\t" - "pld [%[block], #32] \n\t" - "wldrd wr14, [%[pixels], #16] \n\t" - "add %[pixels], %[pixels], %[line_size] \n\t" - "walignr1 wr2, wr12, wr13 \n\t" - "pld [%[pixels]] \n\t" - "pld [%[pixels], #32] \n\t" - "walignr1 wr3, wr13, wr14 \n\t" - "wmoveq wr10, wr13 \n\t" - "wmoveq wr11, wr14 \n\t" - "walignr2ne wr10, wr12, wr13 \n\t" - "walignr2ne wr11, wr13, wr14 \n\t" - "wunpckelub wr0, wr2 \n\t" - "wunpckehub wr1, wr2 \n\t" - "wunpckelub wr2, wr3 \n\t" - "wunpckehub wr3, wr3 \n\t" - "wunpckelub wr8, wr10 \n\t" - "wunpckehub wr9, wr10 \n\t" - "wunpckelub wr10, wr11 \n\t" - "wunpckehub wr11, wr11 \n\t" - "waddhus wr0, wr0, wr8 \n\t" - "waddhus wr1, wr1, wr9 \n\t" - "waddhus wr2, wr2, wr10 \n\t" - "waddhus wr3, wr3, wr11 \n\t" - "waddhus wr8, wr0, wr4 \n\t" - "waddhus wr9, wr1, wr5 \n\t" - "waddhus wr10, wr2, wr6 \n\t" - "waddhus wr11, wr3, wr7 \n\t" - "waddhus wr8, wr8, wr15 \n\t" - "waddhus wr9, wr9, wr15 \n\t" - "waddhus wr10, wr10, wr15 \n\t" - "waddhus wr11, wr11, wr15 \n\t" - "wsrlhg wr8, wr8, wcgr0 \n\t" - "wsrlhg wr9, wr9, wcgr0 \n\t" - "wldrd wr12, [%[block]] \n\t" - "wldrd wr13, [%[block], #8] \n\t" - "wsrlhg wr10, wr10, wcgr0 \n\t" - "wsrlhg wr11, wr11, wcgr0 \n\t" - "wpackhus wr8, wr8, wr9 \n\t" - "wpackhus wr9, wr10, wr11 \n\t" - WAVG2B" wr8, wr8, wr12 \n\t" - WAVG2B" wr9, wr9, wr13 \n\t" - "wstrd wr8, [%[block]] \n\t" - "wstrd wr9, [%[block], #8] \n\t" - "add %[block], %[block], %[line_size] \n\t" - "subs %[h], %[h], #2 \n\t" - "pld [%[block]] \n\t" - "pld [%[block], #32] \n\t" - "bne 1b \n\t" - : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block) - : [line_size]"r"(line_size) - : "r12", "memory"); -} diff --git a/src/libffmpeg/libavcodec/armv4l/mathops.h b/src/libffmpeg/libavcodec/armv4l/mathops.h deleted file mode 100644 index 7ddd0ec6e..000000000 --- a/src/libffmpeg/libavcodec/armv4l/mathops.h +++ /dev/null @@ -1,49 +0,0 @@ -/* - * simple math operations - * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifdef FRAC_BITS -# define MULL(a, b) \ - ({ int lo, hi;\ - asm("smull %0, %1, %2, %3 \n\t"\ - "mov %0, %0, lsr %4\n\t"\ - "add %1, %0, %1, lsl %5\n\t"\ - : "=&r"(lo), "=&r"(hi)\ - : "r"(b), "r"(a), "i"(FRAC_BITS), "i"(32-FRAC_BITS));\ - hi; }) -#endif - -#define MULH(a, b) \ - ({ int lo, hi;\ - asm ("smull %0, %1, %2, %3" : "=&r"(lo), "=&r"(hi) : "r"(b), "r"(a));\ - hi; }) - -#if defined(HAVE_ARMV5TE) - -/* signed 16x16 -> 32 multiply add accumulate */ -# define MAC16(rt, ra, rb) \ - asm ("smlabb %0, %2, %3, %0" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); -/* signed 16x16 -> 32 multiply */ -# define MUL16(ra, rb) \ - ({ int __rt; \ - asm ("smulbb %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); \ - __rt; }) - -#endif diff --git a/src/libffmpeg/libavcodec/armv4l/mpegvideo_armv5te.c b/src/libffmpeg/libavcodec/armv4l/mpegvideo_armv5te.c deleted file mode 100644 index a8d09b8ce..000000000 --- a/src/libffmpeg/libavcodec/armv4l/mpegvideo_armv5te.c +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Optimization of some functions from mpegvideo.c for armv5te - * Copyright (c) 2007 Siarhei Siamashka <ssvb@users.sourceforge.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * Some useful links for those who may be interested in optimizing code for ARM. - * ARM Architecture Reference Manual: http://www.arm.com/community/academy/resources.html - * Instructions timings and optimization guide for ARM9E: http://www.arm.com/pdfs/DDI0222B_9EJS_r1p2.pdf - */ - -#include "../dsputil.h" -#include "../mpegvideo.h" -#include "../avcodec.h" - - -#ifdef ENABLE_ARM_TESTS -/** - * h263 dequantizer supplementary function, it is performance critical and needs to - * have optimized implementations for each architecture. Is also used as a reference - * implementation in regression tests - */ -static inline void dct_unquantize_h263_helper_c(DCTELEM *block, int qmul, int qadd, int count) -{ - int i, level; - for (i = 0; i < count; i++) { - level = block[i]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; - } - block[i] = level; - } - } -} -#endif - -/* GCC 3.1 or higher is required to support symbolic names in assembly code */ -#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) - -/** - * Special optimized version of dct_unquantize_h263_helper_c, it requires the block - * to be at least 8 bytes aligned, and may process more elements than requested. - * But it is guaranteed to never process more than 64 elements provided that - * xxcount argument is <= 64, so it is safe. This macro is optimized for a common - * distribution of values for nCoeffs (they are mostly multiple of 8 plus one or - * two extra elements). So this macro processes data as 8 elements per loop iteration - * and contains optional 2 elements processing in the end. - * - * Inner loop should take 6 cycles per element on arm926ej-s (Nokia 770) - */ -#define dct_unquantize_h263_special_helper_armv5te(xxblock, xxqmul, xxqadd, xxcount) \ -({ DCTELEM *xblock = xxblock; \ - int xqmul = xxqmul, xqadd = xxqadd, xcount = xxcount, xtmp; \ - int xdata1, xdata2; \ -__asm__ __volatile__( \ - "subs %[count], #2 \n\t" \ - "ble 2f \n\t" \ - "ldrd r4, [%[block], #0] \n\t" \ - "1: \n\t" \ - "ldrd r6, [%[block], #8] \n\t" \ -\ - "rsbs %[data1], %[zero], r4, asr #16 \n\t" \ - "addgt %[data1], %[qadd], #0 \n\t" \ - "rsblt %[data1], %[qadd], #0 \n\t" \ - "smlatbne %[data1], r4, %[qmul], %[data1] \n\t" \ -\ - "rsbs %[data2], %[zero], r5, asr #16 \n\t" \ - "addgt %[data2], %[qadd], #0 \n\t" \ - "rsblt %[data2], %[qadd], #0 \n\t" \ - "smlatbne %[data2], r5, %[qmul], %[data2] \n\t" \ -\ - "rsbs %[tmp], %[zero], r4, asl #16 \n\t" \ - "addgt %[tmp], %[qadd], #0 \n\t" \ - "rsblt %[tmp], %[qadd], #0 \n\t" \ - "smlabbne r4, r4, %[qmul], %[tmp] \n\t" \ -\ - "rsbs %[tmp], %[zero], r5, asl #16 \n\t" \ - "addgt %[tmp], %[qadd], #0 \n\t" \ - "rsblt %[tmp], %[qadd], #0 \n\t" \ - "smlabbne r5, r5, %[qmul], %[tmp] \n\t" \ -\ - "strh r4, [%[block]], #2 \n\t" \ - "strh %[data1], [%[block]], #2 \n\t" \ - "strh r5, [%[block]], #2 \n\t" \ - "strh %[data2], [%[block]], #2 \n\t" \ -\ - "rsbs %[data1], %[zero], r6, asr #16 \n\t" \ - "addgt %[data1], %[qadd], #0 \n\t" \ - "rsblt %[data1], %[qadd], #0 \n\t" \ - "smlatbne %[data1], r6, %[qmul], %[data1] \n\t" \ -\ - "rsbs %[data2], %[zero], r7, asr #16 \n\t" \ - "addgt %[data2], %[qadd], #0 \n\t" \ - "rsblt %[data2], %[qadd], #0 \n\t" \ - "smlatbne %[data2], r7, %[qmul], %[data2] \n\t" \ -\ - "rsbs %[tmp], %[zero], r6, asl #16 \n\t" \ - "addgt %[tmp], %[qadd], #0 \n\t" \ - "rsblt %[tmp], %[qadd], #0 \n\t" \ - "smlabbne r6, r6, %[qmul], %[tmp] \n\t" \ -\ - "rsbs %[tmp], %[zero], r7, asl #16 \n\t" \ - "addgt %[tmp], %[qadd], #0 \n\t" \ - "rsblt %[tmp], %[qadd], #0 \n\t" \ - "smlabbne r7, r7, %[qmul], %[tmp] \n\t" \ -\ - "strh r6, [%[block]], #2 \n\t" \ - "strh %[data1], [%[block]], #2 \n\t" \ - "strh r7, [%[block]], #2 \n\t" \ - "strh %[data2], [%[block]], #2 \n\t" \ -\ - "subs %[count], #8 \n\t" \ - "ldrgtd r4, [%[block], #0] \n\t" /* load data early to avoid load/use pipeline stall */ \ - "bgt 1b \n\t" \ -\ - "adds %[count], #2 \n\t" \ - "ble 3f \n\t" \ - "2: \n\t" \ - "ldrsh %[data1], [%[block], #0] \n\t" \ - "ldrsh %[data2], [%[block], #2] \n\t" \ - "mov %[tmp], %[qadd] \n\t" \ - "cmp %[data1], #0 \n\t" \ - "rsblt %[tmp], %[qadd], #0 \n\t" \ - "smlabbne %[data1], %[data1], %[qmul], %[tmp] \n\t" \ - "mov %[tmp], %[qadd] \n\t" \ - "cmp %[data2], #0 \n\t" \ - "rsblt %[tmp], %[qadd], #0 \n\t" \ - "smlabbne %[data2], %[data2], %[qmul], %[tmp] \n\t" \ - "strh %[data1], [%[block]], #2 \n\t" \ - "strh %[data2], [%[block]], #2 \n\t" \ - "3: \n\t" \ - : [block] "+&r" (xblock), [count] "+&r" (xcount), [tmp] "=&r" (xtmp), \ - [data1] "=&r" (xdata1), [data2] "=&r" (xdata2) \ - : [qmul] "r" (xqmul), [qadd] "r" (xqadd), [zero] "r" (0) \ - : "r4", "r5", "r6", "r7", "cc", "memory" \ -); \ -}) - -static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - int i, level, qmul, qadd; - int nCoeffs; - - assert(s->block_last_index[n]>=0); - - qmul = qscale << 1; - - if (!s->h263_aic) { - if (n < 4) - level = block[0] * s->y_dc_scale; - else - level = block[0] * s->c_dc_scale; - qadd = (qscale - 1) | 1; - }else{ - qadd = 0; - level = block[0]; - } - if(s->ac_pred) - nCoeffs=63; - else - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - dct_unquantize_h263_special_helper_armv5te(block, qmul, qadd, nCoeffs + 1); - block[0] = level; -} - -static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - int i, level, qmul, qadd; - int nCoeffs; - - assert(s->block_last_index[n]>=0); - - qadd = (qscale - 1) | 1; - qmul = qscale << 1; - - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - dct_unquantize_h263_special_helper_armv5te(block, qmul, qadd, nCoeffs + 1); -} - -#define HAVE_DCT_UNQUANTIZE_H263_ARMV5TE_OPTIMIZED - -#endif - -void MPV_common_init_armv5te(MpegEncContext *s) -{ -#ifdef HAVE_DCT_UNQUANTIZE_H263_ARMV5TE_OPTIMIZED - s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_armv5te; - s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_armv5te; -#endif -} diff --git a/src/libffmpeg/libavcodec/armv4l/mpegvideo_iwmmxt.c b/src/libffmpeg/libavcodec/armv4l/mpegvideo_iwmmxt.c deleted file mode 100644 index 1336ac5f8..000000000 --- a/src/libffmpeg/libavcodec/armv4l/mpegvideo_iwmmxt.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - * copyright (c) 2004 AGAWA Koji - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "../dsputil.h" -#include "../mpegvideo.h" -#include "../avcodec.h" - -static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - int level, qmul, qadd; - int nCoeffs; - DCTELEM *block_orig = block; - - assert(s->block_last_index[n]>=0); - - qmul = qscale << 1; - - if (!s->h263_aic) { - if (n < 4) - level = block[0] * s->y_dc_scale; - else - level = block[0] * s->c_dc_scale; - qadd = (qscale - 1) | 1; - }else{ - qadd = 0; - level = block[0]; - } - if(s->ac_pred) - nCoeffs=63; - else - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - __asm__ __volatile__ ( -/* "movd %1, %%mm6 \n\t" //qmul */ -/* "packssdw %%mm6, %%mm6 \n\t" */ -/* "packssdw %%mm6, %%mm6 \n\t" */ - "tbcsth wr6, %[qmul] \n\t" -/* "movd %2, %%mm5 \n\t" //qadd */ -/* "packssdw %%mm5, %%mm5 \n\t" */ -/* "packssdw %%mm5, %%mm5 \n\t" */ - "tbcsth wr5, %[qadd] \n\t" - "wzero wr7 \n\t" /* "pxor %%mm7, %%mm7 \n\t" */ - "wzero wr4 \n\t" /* "pxor %%mm4, %%mm4 \n\t" */ - "wsubh wr7, wr5, wr7 \n\t" /* "psubw %%mm5, %%mm7 \n\t" */ - "1: \n\t" - "wldrd wr2, [%[block]] \n\t" /* "movq (%0, %3), %%mm0 \n\t" */ - "wldrd wr3, [%[block], #8] \n\t" /* "movq 8(%0, %3), %%mm1 \n\t" */ - "wmulsl wr0, wr6, wr2 \n\t" /* "pmullw %%mm6, %%mm0 \n\t" */ - "wmulsl wr1, wr6, wr3 \n\t" /* "pmullw %%mm6, %%mm1 \n\t" */ -/* "movq (%0, %3), %%mm2 \n\t" */ -/* "movq 8(%0, %3), %%mm3 \n\t" */ - "wcmpgtsh wr2, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 */ - "wcmpgtsh wr3, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 */ - "wxor wr0, wr2, wr0 \n\t" /* "pxor %%mm2, %%mm0 \n\t" */ - "wxor wr1, wr3, wr1 \n\t" /* "pxor %%mm3, %%mm1 \n\t" */ - "waddh wr0, wr7, wr0 \n\t" /* "paddw %%mm7, %%mm0 \n\t" */ - "waddh wr1, wr7, wr1 \n\t" /* "paddw %%mm7, %%mm1 \n\t" */ - "wxor wr2, wr0, wr2 \n\t" /* "pxor %%mm0, %%mm2 \n\t" */ - "wxor wr3, wr1, wr3 \n\t" /* "pxor %%mm1, %%mm3 \n\t" */ - "wcmpeqh wr0, wr7, wr0 \n\t" /* "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 */ - "wcmpeqh wr1, wr7, wr1 \n\t" /* "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 */ - "wandn wr0, wr2, wr0 \n\t" /* "pandn %%mm2, %%mm0 \n\t" */ - "wandn wr1, wr3, wr1 \n\t" /* "pandn %%mm3, %%mm1 \n\t" */ - "wstrd wr0, [%[block]] \n\t" /* "movq %%mm0, (%0, %3) \n\t" */ - "wstrd wr1, [%[block], #8] \n\t" /* "movq %%mm1, 8(%0, %3) \n\t" */ - "add %[block], %[block], #16 \n\t" /* "addl $16, %3 \n\t" */ - "subs %[i], %[i], #1 \n\t" - "bne 1b \n\t" /* "jng 1b \n\t" */ - :[block]"+r"(block) - :[i]"r"((nCoeffs + 8) / 8), [qmul]"r"(qmul), [qadd]"r"(qadd) - :"memory"); - - block_orig[0] = level; -} - -#if 0 -static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ - int nCoeffs; - - assert(s->block_last_index[n]>=0); - - if(s->ac_pred) - nCoeffs=63; - else - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - - ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale); -} -#endif - -void MPV_common_init_iwmmxt(MpegEncContext *s) -{ - if (!(mm_flags & MM_IWMMXT)) return; - - s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt; -#if 0 - s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt; -#endif -} diff --git a/src/libffmpeg/libavcodec/armv4l/simple_idct_armv5te.S b/src/libffmpeg/libavcodec/armv4l/simple_idct_armv5te.S deleted file mode 100644 index 28bee0643..000000000 --- a/src/libffmpeg/libavcodec/armv4l/simple_idct_armv5te.S +++ /dev/null @@ -1,718 +0,0 @@ -/* - * Simple IDCT - * - * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> - * Copyright (c) 2006 Mans Rullgard <mru@inprovide.com> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#define W1 22725 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W2 21407 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W3 19266 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W4 16383 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W5 12873 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W6 8867 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define W7 4520 /* cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */ -#define ROW_SHIFT 11 -#define COL_SHIFT 20 - -#define W13 (W1 | (W3 << 16)) -#define W26 (W2 | (W6 << 16)) -#define W57 (W5 | (W7 << 16)) - - .text - .align -w13: .long W13 -w26: .long W26 -w57: .long W57 - - .align - .func idct_row_armv5te -idct_row_armv5te: - str lr, [sp, #-4]! - - ldrd v1, [a1, #8] - ldrd a3, [a1] /* a3 = row[1:0], a4 = row[3:2] */ - orrs v1, v1, v2 - cmpeq v1, a4 - cmpeq v1, a3, lsr #16 - beq row_dc_only - - mov v1, #(1<<(ROW_SHIFT-1)) - mov ip, #16384 - sub ip, ip, #1 /* ip = W4 */ - smlabb v1, ip, a3, v1 /* v1 = W4*row[0]+(1<<(RS-1)) */ - ldr ip, [pc, #(w26-.-8)] /* ip = W2 | (W6 << 16) */ - smultb a2, ip, a4 - smulbb lr, ip, a4 - add v2, v1, a2 - sub v3, v1, a2 - sub v4, v1, lr - add v1, v1, lr - - ldr ip, [pc, #(w13-.-8)] /* ip = W1 | (W3 << 16) */ - ldr lr, [pc, #(w57-.-8)] /* lr = W5 | (W7 << 16) */ - smulbt v5, ip, a3 - smultt v6, lr, a4 - smlatt v5, ip, a4, v5 - smultt a2, ip, a3 - smulbt v7, lr, a3 - sub v6, v6, a2 - smulbt a2, ip, a4 - smultt fp, lr, a3 - sub v7, v7, a2 - smulbt a2, lr, a4 - ldrd a3, [a1, #8] /* a3=row[5:4] a4=row[7:6] */ - sub fp, fp, a2 - - orrs a2, a3, a4 - beq 1f - - smlabt v5, lr, a3, v5 - smlabt v6, ip, a3, v6 - smlatt v5, lr, a4, v5 - smlabt v6, lr, a4, v6 - smlatt v7, lr, a3, v7 - smlatt fp, ip, a3, fp - smulbt a2, ip, a4 - smlatt v7, ip, a4, v7 - sub fp, fp, a2 - - ldr ip, [pc, #(w26-.-8)] /* ip = W2 | (W6 << 16) */ - mov a2, #16384 - sub a2, a2, #1 /* a2 = W4 */ - smulbb a2, a2, a3 /* a2 = W4*row[4] */ - smultb lr, ip, a4 /* lr = W6*row[6] */ - add v1, v1, a2 /* v1 += W4*row[4] */ - add v1, v1, lr /* v1 += W6*row[6] */ - add v4, v4, a2 /* v4 += W4*row[4] */ - sub v4, v4, lr /* v4 -= W6*row[6] */ - smulbb lr, ip, a4 /* lr = W2*row[6] */ - sub v2, v2, a2 /* v2 -= W4*row[4] */ - sub v2, v2, lr /* v2 -= W2*row[6] */ - sub v3, v3, a2 /* v3 -= W4*row[4] */ - add v3, v3, lr /* v3 += W2*row[6] */ - -1: add a2, v1, v5 - mov a3, a2, lsr #11 - bic a3, a3, #0x1f0000 - sub a2, v2, v6 - mov a2, a2, lsr #11 - add a3, a3, a2, lsl #16 - add a2, v3, v7 - mov a4, a2, lsr #11 - bic a4, a4, #0x1f0000 - add a2, v4, fp - mov a2, a2, lsr #11 - add a4, a4, a2, lsl #16 - strd a3, [a1] - - sub a2, v4, fp - mov a3, a2, lsr #11 - bic a3, a3, #0x1f0000 - sub a2, v3, v7 - mov a2, a2, lsr #11 - add a3, a3, a2, lsl #16 - add a2, v2, v6 - mov a4, a2, lsr #11 - bic a4, a4, #0x1f0000 - sub a2, v1, v5 - mov a2, a2, lsr #11 - add a4, a4, a2, lsl #16 - strd a3, [a1, #8] - - ldr pc, [sp], #4 - -row_dc_only: - orr a3, a3, a3, lsl #16 - bic a3, a3, #0xe000 - mov a3, a3, lsl #3 - mov a4, a3 - strd a3, [a1] - strd a3, [a1, #8] - - ldr pc, [sp], #4 - .endfunc - - .macro idct_col - ldr a4, [a1] /* a4 = col[1:0] */ - mov ip, #16384 - sub ip, ip, #1 /* ip = W4 */ -#if 0 - mov v1, #(1<<(COL_SHIFT-1)) - smlabt v2, ip, a4, v1 /* v2 = W4*col[1] + (1<<(COL_SHIFT-1)) */ - smlabb v1, ip, a4, v1 /* v1 = W4*col[0] + (1<<(COL_SHIFT-1)) */ - ldr a4, [a1, #(16*4)] -#else - mov v1, #((1<<(COL_SHIFT-1))/W4) /* this matches the C version */ - add v2, v1, a4, asr #16 - rsb v2, v2, v2, lsl #14 - mov a4, a4, lsl #16 - add v1, v1, a4, asr #16 - ldr a4, [a1, #(16*4)] - rsb v1, v1, v1, lsl #14 -#endif - - smulbb lr, ip, a4 - smulbt a3, ip, a4 - sub v3, v1, lr - sub v5, v1, lr - add v7, v1, lr - add v1, v1, lr - sub v4, v2, a3 - sub v6, v2, a3 - add fp, v2, a3 - ldr ip, [pc, #(w26-.-8)] - ldr a4, [a1, #(16*2)] - add v2, v2, a3 - - smulbb lr, ip, a4 - smultb a3, ip, a4 - add v1, v1, lr - sub v7, v7, lr - add v3, v3, a3 - sub v5, v5, a3 - smulbt lr, ip, a4 - smultt a3, ip, a4 - add v2, v2, lr - sub fp, fp, lr - add v4, v4, a3 - ldr a4, [a1, #(16*6)] - sub v6, v6, a3 - - smultb lr, ip, a4 - smulbb a3, ip, a4 - add v1, v1, lr - sub v7, v7, lr - sub v3, v3, a3 - add v5, v5, a3 - smultt lr, ip, a4 - smulbt a3, ip, a4 - add v2, v2, lr - sub fp, fp, lr - sub v4, v4, a3 - add v6, v6, a3 - - stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp} - - ldr ip, [pc, #(w13-.-8)] - ldr a4, [a1, #(16*1)] - ldr lr, [pc, #(w57-.-8)] - smulbb v1, ip, a4 - smultb v3, ip, a4 - smulbb v5, lr, a4 - smultb v7, lr, a4 - smulbt v2, ip, a4 - smultt v4, ip, a4 - smulbt v6, lr, a4 - smultt fp, lr, a4 - rsb v4, v4, #0 - ldr a4, [a1, #(16*3)] - rsb v3, v3, #0 - - smlatb v1, ip, a4, v1 - smlatb v3, lr, a4, v3 - smulbb a3, ip, a4 - smulbb a2, lr, a4 - sub v5, v5, a3 - sub v7, v7, a2 - smlatt v2, ip, a4, v2 - smlatt v4, lr, a4, v4 - smulbt a3, ip, a4 - smulbt a2, lr, a4 - sub v6, v6, a3 - ldr a4, [a1, #(16*5)] - sub fp, fp, a2 - - smlabb v1, lr, a4, v1 - smlabb v3, ip, a4, v3 - smlatb v5, lr, a4, v5 - smlatb v7, ip, a4, v7 - smlabt v2, lr, a4, v2 - smlabt v4, ip, a4, v4 - smlatt v6, lr, a4, v6 - ldr a3, [a1, #(16*7)] - smlatt fp, ip, a4, fp - - smlatb v1, lr, a3, v1 - smlabb v3, lr, a3, v3 - smlatb v5, ip, a3, v5 - smulbb a4, ip, a3 - smlatt v2, lr, a3, v2 - sub v7, v7, a4 - smlabt v4, lr, a3, v4 - smulbt a4, ip, a3 - smlatt v6, ip, a3, v6 - sub fp, fp, a4 - .endm - - .align - .func idct_col_armv5te -idct_col_armv5te: - str lr, [sp, #-4]! - - idct_col - - ldmfd sp!, {a3, a4} - adds a2, a3, v1 - mov a2, a2, lsr #20 - orrmi a2, a2, #0xf000 - add ip, a4, v2 - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1] - subs a3, a3, v1 - mov a2, a3, lsr #20 - orrmi a2, a2, #0xf000 - sub a4, a4, v2 - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - ldmfd sp!, {a3, a4} - str a2, [a1, #(16*7)] - - subs a2, a3, v3 - mov a2, a2, lsr #20 - orrmi a2, a2, #0xf000 - sub ip, a4, v4 - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1, #(16*1)] - adds a3, a3, v3 - mov a2, a3, lsr #20 - orrmi a2, a2, #0xf000 - add a4, a4, v4 - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - ldmfd sp!, {a3, a4} - str a2, [a1, #(16*6)] - - adds a2, a3, v5 - mov a2, a2, lsr #20 - orrmi a2, a2, #0xf000 - add ip, a4, v6 - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1, #(16*2)] - subs a3, a3, v5 - mov a2, a3, lsr #20 - orrmi a2, a2, #0xf000 - sub a4, a4, v6 - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - ldmfd sp!, {a3, a4} - str a2, [a1, #(16*5)] - - adds a2, a3, v7 - mov a2, a2, lsr #20 - orrmi a2, a2, #0xf000 - add ip, a4, fp - mov ip, ip, asr #20 - orr a2, a2, ip, lsl #16 - str a2, [a1, #(16*3)] - subs a3, a3, v7 - mov a2, a3, lsr #20 - orrmi a2, a2, #0xf000 - sub a4, a4, fp - mov a4, a4, asr #20 - orr a2, a2, a4, lsl #16 - str a2, [a1, #(16*4)] - - ldr pc, [sp], #4 - .endfunc - - .align - .func idct_col_put_armv5te -idct_col_put_armv5te: - str lr, [sp, #-4]! - - idct_col - - ldmfd sp!, {a3, a4} - ldr lr, [sp, #32] - add a2, a3, v1 - movs a2, a2, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add ip, a4, v2 - movs ip, ip, asr #20 - movmi ip, #0 - cmp ip, #255 - movgt ip, #255 - orr a2, a2, ip, lsl #8 - sub a3, a3, v1 - movs a3, a3, asr #20 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - sub a4, a4, v2 - movs a4, a4, asr #20 - movmi a4, #0 - cmp a4, #255 - ldr v1, [sp, #28] - movgt a4, #255 - strh a2, [v1] - add a2, v1, #2 - str a2, [sp, #28] - orr a2, a3, a4, lsl #8 - rsb v2, lr, lr, lsl #3 - ldmfd sp!, {a3, a4} - strh a2, [v2, v1]! - - sub a2, a3, v3 - movs a2, a2, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - sub ip, a4, v4 - movs ip, ip, asr #20 - movmi ip, #0 - cmp ip, #255 - movgt ip, #255 - orr a2, a2, ip, lsl #8 - strh a2, [v1, lr]! - add a3, a3, v3 - movs a2, a3, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add a4, a4, v4 - movs a4, a4, asr #20 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a2, a4, lsl #8 - ldmfd sp!, {a3, a4} - strh a2, [v2, -lr]! - - add a2, a3, v5 - movs a2, a2, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add ip, a4, v6 - movs ip, ip, asr #20 - movmi ip, #0 - cmp ip, #255 - movgt ip, #255 - orr a2, a2, ip, lsl #8 - strh a2, [v1, lr]! - sub a3, a3, v5 - movs a2, a3, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - sub a4, a4, v6 - movs a4, a4, asr #20 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a2, a4, lsl #8 - ldmfd sp!, {a3, a4} - strh a2, [v2, -lr]! - - add a2, a3, v7 - movs a2, a2, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add ip, a4, fp - movs ip, ip, asr #20 - movmi ip, #0 - cmp ip, #255 - movgt ip, #255 - orr a2, a2, ip, lsl #8 - strh a2, [v1, lr] - sub a3, a3, v7 - movs a2, a3, asr #20 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - sub a4, a4, fp - movs a4, a4, asr #20 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a2, a4, lsl #8 - strh a2, [v2, -lr] - - ldr pc, [sp], #4 - .endfunc - - .align - .func idct_col_add_armv5te -idct_col_add_armv5te: - str lr, [sp, #-4]! - - idct_col - - ldr lr, [sp, #36] - - ldmfd sp!, {a3, a4} - ldrh ip, [lr] - add a2, a3, v1 - mov a2, a2, asr #20 - sub a3, a3, v1 - and v1, ip, #255 - adds a2, a2, v1 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add v1, a4, v2 - mov v1, v1, asr #20 - adds v1, v1, ip, lsr #8 - movmi v1, #0 - cmp v1, #255 - movgt v1, #255 - orr a2, a2, v1, lsl #8 - ldr v1, [sp, #32] - sub a4, a4, v2 - rsb v2, v1, v1, lsl #3 - ldrh ip, [v2, lr]! - strh a2, [lr] - mov a3, a3, asr #20 - and a2, ip, #255 - adds a3, a3, a2 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - mov a4, a4, asr #20 - adds a4, a4, ip, lsr #8 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - add a2, lr, #2 - str a2, [sp, #28] - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldmfd sp!, {a3, a4} - ldrh ip, [lr, v1]! - sub a2, a3, v3 - mov a2, a2, asr #20 - add a3, a3, v3 - and v3, ip, #255 - adds a2, a2, v3 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - sub v3, a4, v4 - mov v3, v3, asr #20 - adds v3, v3, ip, lsr #8 - movmi v3, #0 - cmp v3, #255 - movgt v3, #255 - orr a2, a2, v3, lsl #8 - add a4, a4, v4 - ldrh ip, [v2, -v1]! - strh a2, [lr] - mov a3, a3, asr #20 - and a2, ip, #255 - adds a3, a3, a2 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - mov a4, a4, asr #20 - adds a4, a4, ip, lsr #8 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldmfd sp!, {a3, a4} - ldrh ip, [lr, v1]! - add a2, a3, v5 - mov a2, a2, asr #20 - sub a3, a3, v5 - and v3, ip, #255 - adds a2, a2, v3 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add v3, a4, v6 - mov v3, v3, asr #20 - adds v3, v3, ip, lsr #8 - movmi v3, #0 - cmp v3, #255 - movgt v3, #255 - orr a2, a2, v3, lsl #8 - sub a4, a4, v6 - ldrh ip, [v2, -v1]! - strh a2, [lr] - mov a3, a3, asr #20 - and a2, ip, #255 - adds a3, a3, a2 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - mov a4, a4, asr #20 - adds a4, a4, ip, lsr #8 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldmfd sp!, {a3, a4} - ldrh ip, [lr, v1]! - add a2, a3, v7 - mov a2, a2, asr #20 - sub a3, a3, v7 - and v3, ip, #255 - adds a2, a2, v3 - movmi a2, #0 - cmp a2, #255 - movgt a2, #255 - add v3, a4, fp - mov v3, v3, asr #20 - adds v3, v3, ip, lsr #8 - movmi v3, #0 - cmp v3, #255 - movgt v3, #255 - orr a2, a2, v3, lsl #8 - sub a4, a4, fp - ldrh ip, [v2, -v1]! - strh a2, [lr] - mov a3, a3, asr #20 - and a2, ip, #255 - adds a3, a3, a2 - movmi a3, #0 - cmp a3, #255 - movgt a3, #255 - mov a4, a4, asr #20 - adds a4, a4, ip, lsr #8 - movmi a4, #0 - cmp a4, #255 - movgt a4, #255 - orr a2, a3, a4, lsl #8 - strh a2, [v2] - - ldr pc, [sp], #4 - .endfunc - - .align - .global simple_idct_armv5te - .func simple_idct_armv5te -simple_idct_armv5te: - stmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, lr} - - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - - sub a1, a1, #(16*7) - - bl idct_col_armv5te - add a1, a1, #4 - bl idct_col_armv5te - add a1, a1, #4 - bl idct_col_armv5te - add a1, a1, #4 - bl idct_col_armv5te - - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} - .endfunc - - .align - .global simple_idct_add_armv5te - .func simple_idct_add_armv5te -simple_idct_add_armv5te: - stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} - - mov a1, a3 - - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - - sub a1, a1, #(16*7) - - bl idct_col_add_armv5te - add a1, a1, #4 - bl idct_col_add_armv5te - add a1, a1, #4 - bl idct_col_add_armv5te - add a1, a1, #4 - bl idct_col_add_armv5te - - add sp, sp, #8 - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} - .endfunc - - .align - .global simple_idct_put_armv5te - .func simple_idct_put_armv5te -simple_idct_put_armv5te: - stmfd sp!, {a1, a2, v1, v2, v3, v4, v5, v6, v7, fp, lr} - - mov a1, a3 - - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - add a1, a1, #16 - bl idct_row_armv5te - - sub a1, a1, #(16*7) - - bl idct_col_put_armv5te - add a1, a1, #4 - bl idct_col_put_armv5te - add a1, a1, #4 - bl idct_col_put_armv5te - add a1, a1, #4 - bl idct_col_put_armv5te - - add sp, sp, #8 - ldmfd sp!, {v1, v2, v3, v4, v5, v6, v7, fp, pc} - .endfunc |