From e1488382e13935f11061fc35d2b2220a96751491 Mon Sep 17 00:00:00 2001 From: Miguel Freitas Date: Thu, 27 Dec 2001 20:02:23 +0000 Subject: updating ffmpeg to 26-12-2001 CVS version CVS patchset: 1310 CVS date: 2001/12/27 20:02:23 --- src/libffmpeg/libavcodec/armv4l/Makefile.am | 38 + src/libffmpeg/libavcodec/armv4l/dsputil_arm.c | 27 + src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S | 386 ++++++ src/libffmpeg/libavcodec/i386/Makefile.am | 43 + src/libffmpeg/libavcodec/i386/dsputil_mmx.c | 1056 ++++++++++++++++ src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h | 342 ++++++ src/libffmpeg/libavcodec/i386/fdct_mmx.c | 291 +++++ src/libffmpeg/libavcodec/i386/idct_mmx.c | 590 +++++++++ src/libffmpeg/libavcodec/i386/mmx.h | 1 + src/libffmpeg/libavcodec/i386/motion_est_mmx.c | 244 ++++ src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c | 326 +++++ src/libffmpeg/libavcodec/i386/simple_idct_mmx.c | 1455 +++++++++++++++++++++++ src/libffmpeg/libavcodec/mlib/Makefile.am | 39 + src/libffmpeg/libavcodec/mlib/dsputil_mlib.c | 144 +++ 14 files changed, 4982 insertions(+) create mode 100644 src/libffmpeg/libavcodec/armv4l/Makefile.am create mode 100644 src/libffmpeg/libavcodec/armv4l/dsputil_arm.c create mode 100644 src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S create mode 100644 src/libffmpeg/libavcodec/i386/Makefile.am create mode 100644 src/libffmpeg/libavcodec/i386/dsputil_mmx.c create mode 100644 src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h create mode 100644 src/libffmpeg/libavcodec/i386/fdct_mmx.c create mode 100644 src/libffmpeg/libavcodec/i386/idct_mmx.c create mode 100644 src/libffmpeg/libavcodec/i386/mmx.h create mode 100644 src/libffmpeg/libavcodec/i386/motion_est_mmx.c create mode 100644 src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c create mode 100644 src/libffmpeg/libavcodec/i386/simple_idct_mmx.c create mode 100644 src/libffmpeg/libavcodec/mlib/Makefile.am create mode 100644 src/libffmpeg/libavcodec/mlib/dsputil_mlib.c (limited to 'src') diff --git a/src/libffmpeg/libavcodec/armv4l/Makefile.am b/src/libffmpeg/libavcodec/armv4l/Makefile.am new file mode 100644 index 000000000..1fbda388f --- /dev/null +++ b/src/libffmpeg/libavcodec/armv4l/Makefile.am @@ -0,0 +1,38 @@ +## +## Process this file with automake to produce Makefile.in +## + +CFLAGS = @GLOBAL_CFLAGS@ @LIBFFMPEG_CFLAGS@ -DCONFIG_DECODERS -DHAVE_AV_CONFIG_H +ASFLAGS = + +LIBTOOL = $(SHELL) $(top_builddir)/libtool-nofpic + +noinst_LTLIBRARIES = libavcodec_armv4l.la + +EXTRA_DIST = jrevdct_arm.S dsputil_arm.c + +#if HAVE_ARMV4L +#armv4l_modules = $(EXTRA_DIST) +#endif +armv4l_modules = + +libavcodec_armv4l_la_SOURCES = $(armv4l_modules) + +noinst_HEADERS = + +.s.lo: + $(ASCOMPILE) -o $@ `test -f $< || echo '$(srcdir)/'`$< + +debug: + @$(MAKE) CFLAGS="@DEBUG_CFLAGS@ @LIBFFMPEG_CFLAGS@ -DCONFIG_DECODERS -DHAVE_AV_CONFIG_H" + +install-debug: debug + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +mostlyclean-generic: + -rm -f *~ \#* .*~ .\#* + +maintainer-clean-generic: + -@echo "This command is intended for maintainers to use;" + -@echo "it deletes files that may require special tools to rebuild." + -rm -f Makefile.in diff --git a/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c b/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c new file mode 100644 index 000000000..1cf7b4fba --- /dev/null +++ b/src/libffmpeg/libavcodec/armv4l/dsputil_arm.c @@ -0,0 +1,27 @@ +/* + * ARMv4L optimized DSP utils + * Copyright (c) 2001 Lionel Ulmer. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "../dsputil.h" + +extern void j_rev_dct_ARM(DCTELEM *data); + +void dsputil_init_armv4l(void) +{ + ff_idct = j_rev_dct_ARM; +} diff --git a/src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S b/src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S new file mode 100644 index 000000000..76eda57ea --- /dev/null +++ b/src/libffmpeg/libavcodec/armv4l/jrevdct_arm.S @@ -0,0 +1,386 @@ +/* + C-like prototype : + void j_rev_dct_ARM(DCTBLOCK data) + + With DCTBLOCK being a pointer to an array of 64 'signed shorts' + + Copyright (c) 2001 Lionel Ulmer (lionel.ulmer@free.fr / bbrox@bbrox.org) + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
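  (Note on the constants that follow this header: the FIX_x_yyy values appear to be the
  libjpeg-style LLM IDCT multipliers in 13-bit fixed point, i.e. round(c * 2^13) with
  CONST_BITS = 13; for example FIX_0_541196100 = round(0.541196100 * 8192) = 4433. The
  matching *_ID defines are the 4-byte offsets of each multiplier inside const_array at
  the end of this file, so they can be loaded relative to a single base register.)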
+ +*/ +#define FIX_0_298631336 2446 +#define FIX_0_541196100 4433 +#define FIX_0_765366865 6270 +#define FIX_1_175875602 9633 +#define FIX_1_501321110 12299 +#define FIX_2_053119869 16819 +#define FIX_3_072711026 25172 +#define FIX_M_0_390180644 -3196 +#define FIX_M_0_899976223 -7373 +#define FIX_M_1_847759065 -15137 +#define FIX_M_1_961570560 -16069 +#define FIX_M_2_562915447 -20995 +#define FIX_0xFFFF 0xFFFF + +#define FIX_0_298631336_ID 0 +#define FIX_0_541196100_ID 4 +#define FIX_0_765366865_ID 8 +#define FIX_1_175875602_ID 12 +#define FIX_1_501321110_ID 16 +#define FIX_2_053119869_ID 20 +#define FIX_3_072711026_ID 24 +#define FIX_M_0_390180644_ID 28 +#define FIX_M_0_899976223_ID 32 +#define FIX_M_1_847759065_ID 36 +#define FIX_M_1_961570560_ID 40 +#define FIX_M_2_562915447_ID 44 +#define FIX_0xFFFF_ID 48 + .text + .align + + .global j_rev_dct_ARM +j_rev_dct_ARM: + stmdb sp!, { r4 - r12, lr } @ all callee saved regs + + sub sp, sp, #4 @ reserve some space on the stack + str r0, [ sp ] @ save the DCT pointer to the stack + + mov lr, r0 @ lr = pointer to the current row + mov r12, #8 @ r12 = row-counter + add r11, pc, #(const_array-.-8) @ r11 = base pointer to the constants array +row_loop: + ldrsh r0, [lr, # 0] @ r0 = 'd0' + ldrsh r1, [lr, # 8] @ r1 = 'd1' + + @ Optimization for row that have all items except the first set to 0 + @ (this works as the DCTELEMS are always 4-byte aligned) + ldr r5, [lr, # 0] + ldr r2, [lr, # 4] + ldr r3, [lr, # 8] + ldr r4, [lr, #12] + orr r3, r3, r4 + orr r3, r3, r2 + orrs r5, r3, r5 + beq end_of_row_loop @ nothing to be done as ALL of them are '0' + orrs r2, r3, r1 + beq empty_row + + ldrsh r2, [lr, # 2] @ r2 = 'd2' + ldrsh r4, [lr, # 4] @ r4 = 'd4' + ldrsh r6, [lr, # 6] @ r6 = 'd6' + + ldr r3, [r11, #FIX_0_541196100_ID] + add r7, r2, r6 + ldr r5, [r11, #FIX_M_1_847759065_ID] + mul r7, r3, r7 @ r7 = z1 + ldr r3, [r11, #FIX_0_765366865_ID] + mla r6, r5, r6, r7 @ r6 = tmp2 + add r5, r0, r4 @ r5 = tmp0 + mla r2, r3, r2, r7 @ r2 = tmp3 + sub r3, r0, r4 @ r3 = tmp1 + + add r0, r2, r5, lsl #13 @ r0 = tmp10 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 + add r4, r6, r3, lsl #13 @ r4 = tmp11 + rsb r3, r6, r3, lsl #13 @ r3 = tmp12 + + stmdb sp!, { r0, r2, r3, r4 } @ save on the stack tmp10, tmp13, tmp12, tmp11 + + ldrsh r3, [lr, #10] @ r3 = 'd3' + ldrsh r5, [lr, #12] @ r5 = 'd5' + ldrsh r7, [lr, #14] @ r7 = 'd7' + + add r0, r3, r5 @ r0 = 'z2' + add r2, r1, r7 @ r2 = 'z1' + add r4, r3, r7 @ r4 = 'z3' + add r6, r1, r5 @ r6 = 'z4' + ldr r9, [r11, #FIX_1_175875602_ID] + add r8, r4, r6 @ r8 = z3 + z4 + ldr r10, [r11, #FIX_M_0_899976223_ID] + mul r8, r9, r8 @ r8 = 'z5' + ldr r9, [r11, #FIX_M_2_562915447_ID] + mul r2, r10, r2 @ r2 = 'z1' + ldr r10, [r11, #FIX_M_1_961570560_ID] + mul r0, r9, r0 @ r0 = 'z2' + ldr r9, [r11, #FIX_M_0_390180644_ID] + mla r4, r10, r4, r8 @ r4 = 'z3' + ldr r10, [r11, #FIX_0_298631336_ID] + mla r6, r9, r6, r8 @ r6 = 'z4' + ldr r9, [r11, #FIX_2_053119869_ID] + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 + ldr r10, [r11, #FIX_3_072711026_ID] + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 + ldr r9, [r11, #FIX_1_501321110_ID] + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 + add r7, r7, r4 @ r7 = tmp0 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 + add r5, r5, r6 @ r5 = tmp1 + add r3, r3, r4 @ r3 = tmp2 + add r1, r1, r6 @ r1 = tmp3 + + ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp12 / r6 = tmp11 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 + + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS) + add r8, r0, r1 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + 
strh r8, [lr, # 0] + + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS) + sub r8, r0, r1 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #14] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS) + add r8, r6, r3 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 2] + + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS) + sub r8, r6, r3 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #12] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS) + add r8, r4, r5 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 4] + + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS) + sub r8, r4, r5 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, #10] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS) + add r8, r2, r7 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 6] + + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS) + sub r8, r2, r7 + add r8, r8, #(1<<10) + mov r8, r8, asr #11 + strh r8, [lr, # 8] + + @ End of row loop + add lr, lr, #16 + subs r12, r12, #1 + bne row_loop + beq start_column_loop + +empty_row: + ldr r1, [r11, #FIX_0xFFFF_ID] + mov r0, r0, lsl #2 + and r0, r0, r1 + add r0, r0, r0, lsl #16 + str r0, [lr, # 0] + str r0, [lr, # 4] + str r0, [lr, # 8] + str r0, [lr, #12] + +end_of_row_loop: + @ End of loop + add lr, lr, #16 + subs r12, r12, #1 + bne row_loop + +start_column_loop: + @ Start of column loop + ldr lr, [ sp ] + mov r12, #8 +column_loop: + ldrsh r0, [lr, #( 0*8)] @ r0 = 'd0' + ldrsh r2, [lr, #( 4*8)] @ r2 = 'd2' + ldrsh r4, [lr, #( 8*8)] @ r4 = 'd4' + ldrsh r6, [lr, #(12*8)] @ r6 = 'd6' + + ldr r3, [r11, #FIX_0_541196100_ID] + add r1, r2, r6 + ldr r5, [r11, #FIX_M_1_847759065_ID] + mul r1, r3, r1 @ r1 = z1 + ldr r3, [r11, #FIX_0_765366865_ID] + mla r6, r5, r6, r1 @ r6 = tmp2 + add r5, r0, r4 @ r5 = tmp0 + mla r2, r3, r2, r1 @ r2 = tmp3 + sub r3, r0, r4 @ r3 = tmp1 + + add r0, r2, r5, lsl #13 @ r0 = tmp10 + rsb r2, r2, r5, lsl #13 @ r2 = tmp13 + add r4, r6, r3, lsl #13 @ r4 = tmp11 + rsb r6, r6, r3, lsl #13 @ r6 = tmp12 + + ldrsh r1, [lr, #( 2*8)] @ r1 = 'd1' + ldrsh r3, [lr, #( 6*8)] @ r3 = 'd3' + ldrsh r5, [lr, #(10*8)] @ r5 = 'd5' + ldrsh r7, [lr, #(14*8)] @ r7 = 'd7' + + @ Check for empty odd column (happens about 20 to 25 % of the time according to my stats) + orr r9, r1, r3 + orr r10, r5, r7 + orrs r10, r9, r10 + beq empty_odd_column + + stmdb sp!, { r0, r2, r4, r6 } @ save on the stack tmp10, tmp13, tmp12, tmp11 + + add r0, r3, r5 @ r0 = 'z2' + add r2, r1, r7 @ r2 = 'z1' + add r4, r3, r7 @ r4 = 'z3' + add r6, r1, r5 @ r6 = 'z4' + ldr r9, [r11, #FIX_1_175875602_ID] + add r8, r4, r6 + ldr r10, [r11, #FIX_M_0_899976223_ID] + mul r8, r9, r8 @ r8 = 'z5' + ldr r9, [r11, #FIX_M_2_562915447_ID] + mul r2, r10, r2 @ r2 = 'z1' + ldr r10, [r11, #FIX_M_1_961570560_ID] + mul r0, r9, r0 @ r0 = 'z2' + ldr r9, [r11, #FIX_M_0_390180644_ID] + mla r4, r10, r4, r8 @ r4 = 'z3' + ldr r10, [r11, #FIX_0_298631336_ID] + mla r6, r9, r6, r8 @ r6 = 'z4' + ldr r9, [r11, #FIX_2_053119869_ID] + mla r7, r10, r7, r2 @ r7 = tmp0 + z1 + ldr r10, [r11, #FIX_3_072711026_ID] + mla r5, r9, r5, r0 @ r5 = tmp1 + z2 + ldr r9, [r11, #FIX_1_501321110_ID] + mla r3, r10, r3, r0 @ r3 = tmp2 + z2 + add r7, r7, r4 @ r7 = tmp0 + mla r1, r9, r1, r2 @ r1 = tmp3 + z1 + add r5, r5, r6 @ r5 = tmp1 + add r3, r3, r4 @ r3 = tmp2 + add r1, r1, r6 @ r1 = tmp3 + + ldmia sp!, { r0, r2, r4, r6 } @ r0 = tmp10 / r2 = tmp13 / r4 = tmp11 / r6 = tmp12 + @ r1 = tmp3 / r3 = tmp2 / r5 = tmp1 / r7 = tmp0 + + @ Compute DESCALE(tmp10 + 
tmp3, CONST_BITS+PASS1_BITS+3) + add r8, r0, r1 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 0*8)] + + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) + sub r8, r0, r1 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(14*8)] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) + add r8, r4, r3 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 2*8)] + + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) + sub r8, r4, r3 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(12*8)] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) + add r8, r6, r5 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 4*8)] + + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) + sub r8, r6, r5 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #(10*8)] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) + add r8, r2, r7 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 6*8)] + + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) + sub r8, r2, r7 + add r8, r8, #(1<<17) + mov r8, r8, asr #18 + strh r8, [lr, #( 8*8)] + + @ End of row loop + add lr, lr, #2 + subs r12, r12, #1 + bne column_loop + beq the_end + +empty_odd_column: + @ Compute DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3) + add r0, r0, #(1<<17) + mov r0, r0, asr #18 + strh r0, [lr, #( 0*8)] + strh r0, [lr, #(14*8)] + + @ Compute DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3) + add r4, r4, #(1<<17) + mov r4, r4, asr #18 + strh r4, [lr, #( 2*8)] + strh r4, [lr, #(12*8)] + + @ Compute DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3) + add r6, r6, #(1<<17) + mov r6, r6, asr #18 + strh r6, [lr, #( 4*8)] + strh r6, [lr, #(10*8)] + + @ Compute DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3) + @ Compute DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3) + add r2, r2, #(1<<17) + mov r2, r2, asr #18 + strh r2, [lr, #( 6*8)] + strh r2, [lr, #( 8*8)] + + @ End of row loop + add lr, lr, #2 + subs r12, r12, #1 + bne column_loop + +the_end: + @ The end.... 
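	@ (The epilogue below pops the 4-byte scratch slot that held the DCTBLOCK pointer,
	@ then restores r4 - r12 and returns by loading the saved lr straight into pc.
	@ The DESCALE shifts used above assume the usual libjpeg values CONST_BITS = 13 and
	@ PASS1_BITS = 2: asr #11 with a 1<<10 rounding bias in the row pass, and
	@ asr #18 with a 1<<17 rounding bias in the column pass.)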
+ add sp, sp, #4 + ldmia sp!, { r4 - r12, pc } @ restore callee saved regs and return + +const_array: + .align + .word FIX_0_298631336 + .word FIX_0_541196100 + .word FIX_0_765366865 + .word FIX_1_175875602 + .word FIX_1_501321110 + .word FIX_2_053119869 + .word FIX_3_072711026 + .word FIX_M_0_390180644 + .word FIX_M_0_899976223 + .word FIX_M_1_847759065 + .word FIX_M_1_961570560 + .word FIX_M_2_562915447 + .word FIX_0xFFFF diff --git a/src/libffmpeg/libavcodec/i386/Makefile.am b/src/libffmpeg/libavcodec/i386/Makefile.am new file mode 100644 index 000000000..72cbcc92c --- /dev/null +++ b/src/libffmpeg/libavcodec/i386/Makefile.am @@ -0,0 +1,43 @@ +## +## Process this file with automake to produce Makefile.in +## + +#CFLAGS = -D_FILE_OFFSET_BITS=64 @GLOBAL_CFLAGS@ -DCONFIG_DECODERS -DHAVE_AV_CONFIG_H + +CFLAGS = @GLOBAL_CFLAGS@ @LIBFFMPEG_CFLAGS@ -DCONFIG_DECODERS -DHAVE_AV_CONFIG_H +ASFLAGS = + +LIBTOOL = $(SHELL) $(top_builddir)/libtool-nofpic + +noinst_LTLIBRARIES = libavcodec_mmx.la + +EXTRA_DIST = fdct_mmx.c \ + dsputil_mmx.c mpegvideo_mmx.c \ + idct_mmx.c motion_est_mmx.c \ + simple_idct_mmx.c + +if HAVE_FFMMX +mmx_modules = $(EXTRA_DIST) +endif + + +libavcodec_mmx_la_SOURCES = $(mmx_modules) + +noinst_HEADERS = dsputil_mmx_avg.h mmx.h + +.s.lo: + $(ASCOMPILE) -o $@ `test -f $< || echo '$(srcdir)/'`$< + +debug: + @$(MAKE) CFLAGS="@DEBUG_CFLAGS@ @LIBFFMPEG_CFLAGS@ -DCONFIG_DECODERS -DHAVE_AV_CONFIG_H" + +install-debug: debug + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +mostlyclean-generic: + -rm -f *~ \#* .*~ .\#* + +maintainer-clean-generic: + -@echo "This command is intended for maintainers to use;" + -@echo "it deletes files that may require special tools to rebuild." + -rm -f Makefile.in diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c new file mode 100644 index 000000000..8647ed187 --- /dev/null +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c @@ -0,0 +1,1056 @@ +/* + * MMX optimized DSP utils + * Copyright (c) 2000, 2001 Gerard Lantau. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * MMX optimization by Nick Kurshev + */ + +#include "../dsputil.h" +#include "../simple_idct.h" + +int mm_flags; /* multimedia extension flags */ + +int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); +int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h); +int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); +int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); +int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h); + +/* external functions, from idct_mmx.c */ +void ff_mmx_idct(DCTELEM *block); +void ff_mmxext_idct(DCTELEM *block); + +/* pixel operations */ +static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001; +static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002; +//static const unsigned short mm_wone[4] __attribute__ ((aligned(8))) = { 0x1, 0x1, 0x1, 0x1 }; +//static const unsigned short mm_wtwo[4] __attribute__ ((aligned(8))) = { 0x2, 0x2, 0x2, 0x2 }; + +/***********************************/ +/* 3Dnow specific */ + +#define DEF(x) x ## _3dnow +/* for Athlons PAVGUSB is prefered */ +#define PAVGB "pavgusb" + +#include "dsputil_mmx_avg.h" + +#undef DEF +#undef PAVGB + +/***********************************/ +/* MMX2 specific */ + +#define DEF(x) x ## _sse + +/* Introduced only in MMX2 set */ +#define PAVGB "pavgb" + +#include "dsputil_mmx_avg.h" + +#undef DEF +#undef PAVGB + +/***********************************/ +/* standard MMX */ + +static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) +{ + DCTELEM *p; + const UINT8 *pix; + int i; + + /* read the pixels */ + p = block; + pix = pixels; + __asm __volatile("pxor %%mm7, %%mm7":); + for(i=0;i<4;i++) { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %2, %%mm1\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "movq %%mm0, %0\n\t" + "movq %%mm2, 8%0\n\t" + "movq %%mm1, 16%0\n\t" + "movq %%mm3, 24%0\n\t" + :"=m"(*p) + :"m"(*pix), "m"(*(pix+line_size)) + :"memory"); + pix += line_size*2; + p += 16; + } +} + +static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) +{ + const DCTELEM *p; + UINT8 *pix; + int i; + + /* read the pixels */ + p = block; + pix = pixels; + for(i=0;i<2;i++) { + __asm __volatile( + "movq %3, %%mm0\n\t" + "movq 8%3, %%mm1\n\t" + "movq 16%3, %%mm2\n\t" + "movq 24%3, %%mm3\n\t" + "movq 32%3, %%mm4\n\t" + "movq 40%3, %%mm5\n\t" + "movq 48%3, %%mm6\n\t" + "movq 56%3, %%mm7\n\t" + "packuswb %%mm1, %%mm0\n\t" + "packuswb %%mm3, %%mm2\n\t" + "packuswb %%mm5, %%mm4\n\t" + "packuswb %%mm7, %%mm6\n\t" + "movq %%mm0, (%0)\n\t" + "movq %%mm2, (%0, %1)\n\t" + "movq %%mm4, (%0, %1, 2)\n\t" + "movq %%mm6, (%0, %2)\n\t" + ::"r" (pix), "r" (line_size), "r" (line_size*3), "m"(*p) + :"memory"); + pix += line_size*4; + p += 32; + } +} + +static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) +{ + const DCTELEM *p; + UINT8 *pix; + int i; + + /* read the pixels */ + p = block; + pix = pixels; + __asm __volatile("pxor %%mm7, %%mm7":); + for(i=0;i<4;i++) { + __asm __volatile( + "movq %2, %%mm0\n\t" + "movq 8%2, %%mm1\n\t" + "movq 16%2, %%mm2\n\t" + "movq 24%2, %%mm3\n\t" + "movq %0, %%mm4\n\t" + "movq %1, %%mm6\n\t" + "movq %%mm4, %%mm5\n\t" + "punpcklbw %%mm7, %%mm4\n\t" + "punpckhbw %%mm7, %%mm5\n\t" + "paddsw %%mm4, %%mm0\n\t" + "paddsw %%mm5, %%mm1\n\t" + "movq %%mm6, %%mm5\n\t" + 
"punpcklbw %%mm7, %%mm6\n\t" + "punpckhbw %%mm7, %%mm5\n\t" + "paddsw %%mm6, %%mm2\n\t" + "paddsw %%mm5, %%mm3\n\t" + "packuswb %%mm1, %%mm0\n\t" + "packuswb %%mm3, %%mm2\n\t" + "movq %%mm0, %0\n\t" + "movq %%mm2, %1\n\t" + :"+m"(*pix), "+m"(*(pix+line_size)) + :"m"(*p) + :"memory"); + pix += line_size*2; + p += 16; + } +} + +static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + int dh, hh; + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + hh=h>>2; + dh=h&3; + while(hh--) { + __asm __volatile( + "movq (%1), %%mm0 \n\t" + "movq (%1, %2), %%mm1 \n\t" + "movq (%1, %2, 2), %%mm2 \n\t" + "movq (%1, %3), %%mm3 \n\t" + "movq %%mm0, (%0) \n\t" + "movq %%mm1, (%0, %2) \n\t" + "movq %%mm2, (%0, %2, 2) \n\t" + "movq %%mm3, (%0, %3) \n\t" + ::"r"(p), "r"(pix), "r"(line_size), "r"(line_size*3) + :"memory"); + pix = pix + line_size*4; + p = p + line_size*4; + } + while(dh--) { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"=m"(*p) + :"m"(*pix) + :"memory"); + pix = pix + line_size; + p = p + line_size; + } +} + +static void put_pixels_x2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t" + "movq %0, %%mm4\n\t" + ::"m"(mm_wone)); + do { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq 1%1, %%mm1\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "paddusw %%mm4, %%mm0\n\t" + "paddusw %%mm4, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"=m"(*p) + :"m"(*pix) + :"memory"); + pix += line_size; p += line_size; + } while (--h); +} + +static void put_pixels_y2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t" + "movq %0, %%mm4\n\t" + ::"m"(mm_wone)); + do { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %2, %%mm1\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "paddusw %%mm4, %%mm0\n\t" + "paddusw %%mm4, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"=m"(*p) + :"m"(*pix), + "m"(*(pix+line_size)) + :"memory"); + pix += line_size; + p += line_size; + } while (--h); +} + +static void put_pixels_xy2_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t" + "movq %0, %%mm6\n\t" + ::"m"(mm_wtwo)); + do { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %2, %%mm1\n\t" + "movq 1%1, %%mm4\n\t" + "movq 1%2, %%mm5\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "movq %%mm4, %%mm1\n\t" + "movq %%mm5, %%mm3\n\t" + "punpcklbw %%mm7, %%mm4\n\t" + "punpcklbw %%mm7, %%mm5\n\t" + "punpckhbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm5, %%mm4\n\t" + "paddusw %%mm3, %%mm1\n\t" + 
"paddusw %%mm6, %%mm4\n\t" + "paddusw %%mm6, %%mm1\n\t" + "paddusw %%mm4, %%mm0\n\t" + "paddusw %%mm1, %%mm2\n\t" + "psrlw $2, %%mm0\n\t" + "psrlw $2, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"=m"(*p) + :"m"(*pix), + "m"(*(pix+line_size)) + :"memory"); + pix += line_size; + p += line_size; + } while(--h); +} + +static void put_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile("pxor %%mm7, %%mm7\n\t":); + do { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq 1%1, %%mm1\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"=m"(*p) + :"m"(*pix) + :"memory"); + pix += line_size; + p += line_size; + } while (--h); +} + +static void put_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile("pxor %%mm7, %%mm7\n\t":); + do { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %2, %%mm1\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"=m"(*p) + :"m"(*pix), + "m"(*(pix+line_size)) + :"memory"); + pix += line_size; + p += line_size; + } while(--h); +} + +static void put_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t" + "movq %0, %%mm6\n\t" + ::"m"(mm_wone)); + do { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %2, %%mm1\n\t" + "movq 1%1, %%mm4\n\t" + "movq 1%2, %%mm5\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "movq %%mm4, %%mm1\n\t" + "movq %%mm5, %%mm3\n\t" + "punpcklbw %%mm7, %%mm4\n\t" + "punpcklbw %%mm7, %%mm5\n\t" + "punpckhbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm5, %%mm4\n\t" + "paddusw %%mm3, %%mm1\n\t" + "paddusw %%mm6, %%mm4\n\t" + "paddusw %%mm6, %%mm1\n\t" + "paddusw %%mm4, %%mm0\n\t" + "paddusw %%mm1, %%mm2\n\t" + "psrlw $2, %%mm0\n\t" + "psrlw $2, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"=m"(*p) + :"m"(*pix), + "m"(*(pix+line_size)) + :"memory"); + pix += line_size; + p += line_size; + } while(--h); +} + +static void avg_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t" + "movq %0, %%mm6\n\t" + ::"m"(mm_wone)); + do { + __asm __volatile( + "movq %0, %%mm0\n\t" + "movq %1, %%mm1\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "paddusw %%mm6, %%mm0\n\t" + "paddusw 
%%mm6, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"+m"(*p) + :"m"(*pix) + :"memory"); + pix += line_size; + p += line_size; + } + while (--h); +} + +static void avg_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t" + "movq %0, %%mm6\n\t" + ::"m"(mm_wone)); + do { + __asm __volatile( + "movq %1, %%mm1\n\t" + "movq %0, %%mm0\n\t" + "movq 1%1, %%mm4\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "movq %%mm4, %%mm5\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "punpcklbw %%mm7, %%mm4\n\t" + "punpckhbw %%mm7, %%mm5\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "paddusw %%mm4, %%mm1\n\t" + "paddusw %%mm5, %%mm3\n\t" + "paddusw %%mm6, %%mm1\n\t" + "paddusw %%mm6, %%mm3\n\t" + "psrlw $1, %%mm1\n\t" + "psrlw $1, %%mm3\n\t" + "paddusw %%mm6, %%mm0\n\t" + "paddusw %%mm6, %%mm2\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"+m"(*p) + :"m"(*pix) + :"memory"); + pix += line_size; + p += line_size; + } while (--h); +} + +static void avg_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t" + "movq %0, %%mm6\n\t" + ::"m"(mm_wone)); + do { + __asm __volatile( + "movq %1, %%mm1\n\t" + "movq %0, %%mm0\n\t" + "movq %2, %%mm4\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "movq %%mm4, %%mm5\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "punpcklbw %%mm7, %%mm4\n\t" + "punpckhbw %%mm7, %%mm5\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "paddusw %%mm4, %%mm1\n\t" + "paddusw %%mm5, %%mm3\n\t" + "paddusw %%mm6, %%mm1\n\t" + "paddusw %%mm6, %%mm3\n\t" + "psrlw $1, %%mm1\n\t" + "psrlw $1, %%mm3\n\t" + "paddusw %%mm6, %%mm0\n\t" + "paddusw %%mm6, %%mm2\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"+m"(*p) + :"m"(*pix), "m"(*(pix+line_size)) + :"memory"); + pix += line_size; + p += line_size ; + } while(--h); +} + +static void avg_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t" + "movq %0, %%mm6\n\t" + ::"m"(mm_wtwo)); + do { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %2, %%mm1\n\t" + "movq 1%1, %%mm4\n\t" + "movq 1%2, %%mm5\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "movq %%mm4, %%mm1\n\t" + "movq %%mm5, %%mm3\n\t" + "punpcklbw %%mm7, %%mm4\n\t" + "punpcklbw %%mm7, %%mm5\n\t" + "punpckhbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm5, %%mm4\n\t" + "paddusw %%mm3, %%mm1\n\t" + "paddusw %%mm6, %%mm4\n\t" + "paddusw %%mm6, %%mm1\n\t" + "paddusw %%mm4, %%mm0\n\t" + "paddusw %%mm1, %%mm2\n\t" + "movq %3, %%mm5\n\t" + "psrlw $2, %%mm0\n\t" + "movq %0, %%mm1\n\t" + "psrlw $2, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, 
%%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "paddusw %%mm5, %%mm0\n\t" + "paddusw %%mm5, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"+m"(*p) + :"m"(*pix), + "m"(*(pix+line_size)), "m"(mm_wone) + :"memory"); + pix += line_size; + p += line_size ; + } while(--h); +} + +static void avg_no_rnd_pixels_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile("pxor %%mm7, %%mm7\n\t":); + do { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %0, %%mm1\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"+m"(*p) + :"m"(*pix) + :"memory"); + pix += line_size; + p += line_size ; + } while (--h); +} + +static void avg_no_rnd_pixels_x2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t":); + do { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq 1%1, %%mm1\n\t" + "movq %0, %%mm4\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "movq %%mm4, %%mm5\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "punpcklbw %%mm7, %%mm4\n\t" + "punpckhbw %%mm7, %%mm5\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "paddusw %%mm4, %%mm0\n\t" + "paddusw %%mm5, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"+m"(*p) + :"m"(*pix) + :"memory"); + pix += line_size; + p += line_size; + } while (--h); +} + +static void avg_no_rnd_pixels_y2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t":); + do { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %2, %%mm1\n\t" + "movq %0, %%mm4\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "movq %%mm4, %%mm5\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "punpcklbw %%mm7, %%mm4\n\t" + "punpckhbw %%mm7, %%mm5\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "paddusw %%mm4, %%mm0\n\t" + "paddusw %%mm5, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"+m"(*p) + :"m"(*pix), "m"(*(pix+line_size)) + :"memory"); + pix += line_size; + p += line_size ; + } while(--h); +} + +static void avg_no_rnd_pixels_xy2_mmx( UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t" + "movq %0, %%mm6\n\t" + ::"m"(mm_wone)); + do { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %2, %%mm1\n\t" + "movq 1%1, %%mm4\n\t" + "movq 1%2, %%mm5\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "movq 
%%mm4, %%mm1\n\t" + "movq %%mm5, %%mm3\n\t" + "punpcklbw %%mm7, %%mm4\n\t" + "punpcklbw %%mm7, %%mm5\n\t" + "punpckhbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm5, %%mm4\n\t" + "paddusw %%mm3, %%mm1\n\t" + "paddusw %%mm6, %%mm4\n\t" + "paddusw %%mm6, %%mm1\n\t" + "paddusw %%mm4, %%mm0\n\t" + "paddusw %%mm1, %%mm2\n\t" + "movq %0, %%mm1\n\t" + "psrlw $2, %%mm0\n\t" + "movq %%mm1, %%mm3\n\t" + "psrlw $2, %%mm2\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "psrlw $1, %%mm0\n\t" + "psrlw $1, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"+m"(*p) + :"m"(*pix), + "m"(*(pix+line_size)) + :"memory"); + pix += line_size; + p += line_size; + } while(--h); +} + +static void sub_pixels_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) +{ + DCTELEM *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile("pxor %%mm7, %%mm7":); + do { + __asm __volatile( + "movq %0, %%mm0\n\t" + "movq %1, %%mm2\n\t" + "movq 8%0, %%mm1\n\t" + "movq %%mm2, %%mm3\n\t" + "punpcklbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "psubsw %%mm2, %%mm0\n\t" + "psubsw %%mm3, %%mm1\n\t" + "movq %%mm0, %0\n\t" + "movq %%mm1, 8%0\n\t" + :"+m"(*p) + :"m"(*pix) + :"memory"); + pix += line_size; + p += 8; + } while (--h); +} + +static void sub_pixels_x2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) +{ + DCTELEM *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t" + "movq %0, %%mm6" + ::"m"(mm_wone)); + do { + __asm __volatile( + "movq %0, %%mm0\n\t" + "movq %1, %%mm2\n\t" + "movq 8%0, %%mm1\n\t" + "movq 1%1, %%mm4\n\t" + "movq %%mm2, %%mm3\n\t" + "movq %%mm4, %%mm5\n\t" + "punpcklbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "punpcklbw %%mm7, %%mm4\n\t" + "punpckhbw %%mm7, %%mm5\n\t" + "paddusw %%mm4, %%mm2\n\t" + "paddusw %%mm5, %%mm3\n\t" + "paddusw %%mm6, %%mm2\n\t" + "paddusw %%mm6, %%mm3\n\t" + "psrlw $1, %%mm2\n\t" + "psrlw $1, %%mm3\n\t" + "psubsw %%mm2, %%mm0\n\t" + "psubsw %%mm3, %%mm1\n\t" + "movq %%mm0, %0\n\t" + "movq %%mm1, 8%0\n\t" + :"+m"(*p) + :"m"(*pix) + :"memory"); + pix += line_size; + p += 8; + } while (--h); +} + +static void sub_pixels_y2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) +{ + DCTELEM *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t" + "movq %0, %%mm6" + ::"m"(mm_wone)); + do { + __asm __volatile( + "movq %0, %%mm0\n\t" + "movq %1, %%mm2\n\t" + "movq 8%0, %%mm1\n\t" + "movq %2, %%mm4\n\t" + "movq %%mm2, %%mm3\n\t" + "movq %%mm4, %%mm5\n\t" + "punpcklbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "punpcklbw %%mm7, %%mm4\n\t" + "punpckhbw %%mm7, %%mm5\n\t" + "paddusw %%mm4, %%mm2\n\t" + "paddusw %%mm5, %%mm3\n\t" + "paddusw %%mm6, %%mm2\n\t" + "paddusw %%mm6, %%mm3\n\t" + "psrlw $1, %%mm2\n\t" + "psrlw $1, %%mm3\n\t" + "psubsw %%mm2, %%mm0\n\t" + "psubsw %%mm3, %%mm1\n\t" + "movq %%mm0, %0\n\t" + "movq %%mm1, 8%0\n\t" + :"+m"(*p) + :"m"(*pix), "m"(*(pix+line_size)) + :"memory"); + pix += line_size; + p += 8; + } while (--h); +} + +static void sub_pixels_xy2_mmx( DCTELEM *block, const UINT8 *pixels, int line_size, int h) +{ + DCTELEM *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t" + "movq %0, %%mm6\n\t" + ::"m"(mm_wtwo)); + do { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %2, %%mm1\n\t" + "movq 1%1, %%mm4\n\t" + "movq 1%2, %%mm5\n\t" + "movq %%mm0, 
%%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "movq %%mm4, %%mm1\n\t" + "movq %%mm5, %%mm3\n\t" + "punpcklbw %%mm7, %%mm4\n\t" + "punpcklbw %%mm7, %%mm5\n\t" + "punpckhbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm5, %%mm4\n\t" + "paddusw %%mm3, %%mm1\n\t" + "paddusw %%mm6, %%mm4\n\t" + "paddusw %%mm6, %%mm1\n\t" + "paddusw %%mm4, %%mm0\n\t" + "paddusw %%mm1, %%mm2\n\t" + "movq %0, %%mm1\n\t" + "movq 8%0, %%mm3\n\t" + "psrlw $2, %%mm0\n\t" + "psrlw $2, %%mm2\n\t" + "psubsw %%mm0, %%mm1\n\t" + "psubsw %%mm2, %%mm3\n\t" + "movq %%mm1, %0\n\t" + "movq %%mm3, 8%0\n\t" + :"+m"(*p) + :"m"(*pix), + "m"(*(pix+line_size)) + :"memory"); + pix += line_size; + p += 8 ; + } while(--h); +} + +void dsputil_init_mmx(void) +{ + mm_flags = mm_support(); +#if 0 + printf("CPU flags:"); + if (mm_flags & MM_MMX) + printf(" mmx"); + if (mm_flags & MM_MMXEXT) + printf(" mmxext"); + if (mm_flags & MM_3DNOW) + printf(" 3dnow"); + if (mm_flags & MM_SSE) + printf(" sse"); + if (mm_flags & MM_SSE2) + printf(" sse2"); + printf("\n"); +#endif + + if (mm_flags & MM_MMX) { + get_pixels = get_pixels_mmx; + put_pixels_clamped = put_pixels_clamped_mmx; + add_pixels_clamped = add_pixels_clamped_mmx; + + pix_abs16x16 = pix_abs16x16_mmx; + pix_abs16x16_x2 = pix_abs16x16_x2_mmx; + pix_abs16x16_y2 = pix_abs16x16_y2_mmx; + pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; + av_fdct = fdct_mmx; + + put_pixels_tab[0] = put_pixels_mmx; + put_pixels_tab[1] = put_pixels_x2_mmx; + put_pixels_tab[2] = put_pixels_y2_mmx; + put_pixels_tab[3] = put_pixels_xy2_mmx; + + put_no_rnd_pixels_tab[0] = put_pixels_mmx; + put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; + put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; + put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; + + avg_pixels_tab[0] = avg_pixels_mmx; + avg_pixels_tab[1] = avg_pixels_x2_mmx; + avg_pixels_tab[2] = avg_pixels_y2_mmx; + avg_pixels_tab[3] = avg_pixels_xy2_mmx; + + avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; + avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; + avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; + avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; + + sub_pixels_tab[0] = sub_pixels_mmx; + sub_pixels_tab[1] = sub_pixels_x2_mmx; + sub_pixels_tab[2] = sub_pixels_y2_mmx; + sub_pixels_tab[3] = sub_pixels_xy2_mmx; + + if (mm_flags & MM_MMXEXT) { + pix_abs16x16 = pix_abs16x16_sse; + } + + if (mm_flags & MM_SSE) { + put_pixels_tab[1] = put_pixels_x2_sse; + put_pixels_tab[2] = put_pixels_y2_sse; + + avg_pixels_tab[0] = avg_pixels_sse; + avg_pixels_tab[1] = avg_pixels_x2_sse; + avg_pixels_tab[2] = avg_pixels_y2_sse; + avg_pixels_tab[3] = avg_pixels_xy2_sse; + + sub_pixels_tab[1] = sub_pixels_x2_sse; + sub_pixels_tab[2] = sub_pixels_y2_sse; + } else if (mm_flags & MM_3DNOW) { + put_pixels_tab[1] = put_pixels_x2_3dnow; + put_pixels_tab[2] = put_pixels_y2_3dnow; + + avg_pixels_tab[0] = avg_pixels_3dnow; + avg_pixels_tab[1] = avg_pixels_x2_3dnow; + avg_pixels_tab[2] = avg_pixels_y2_3dnow; + avg_pixels_tab[3] = avg_pixels_xy2_3dnow; + + sub_pixels_tab[1] = sub_pixels_x2_3dnow; + sub_pixels_tab[2] = sub_pixels_y2_3dnow; + } + + /* idct */ + if (mm_flags & MM_MMXEXT) { + ff_idct = ff_mmxext_idct; + } else { + ff_idct = ff_mmx_idct; + } +#ifdef SIMPLE_IDCT +// ff_idct = simple_idct; + ff_idct = simple_idct_mmx; +#endif + } +} diff --git 
a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h new file mode 100644 index 000000000..830fe9f3b --- /dev/null +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h @@ -0,0 +1,342 @@ +/* + * DSP utils : average functions are compiled twice for 3dnow/mmx2 + * Copyright (c) 2000, 2001 Gerard Lantau. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * MMX optimization by Nick Kurshev + */ + +static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + int dh, hh; + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + hh=h>>2; + dh=h&3; + while(hh--) { + __asm __volatile( + "movq (%1), %%mm0\n\t" + "movq 1(%1), %%mm1\n\t" + "movq (%1, %2), %%mm2\n\t" + "movq 1(%1, %2), %%mm3\n\t" + "movq (%1, %2, 2), %%mm4\n\t" + "movq 1(%1, %2, 2), %%mm5\n\t" + "movq (%1, %3), %%mm6\n\t" + "movq 1(%1, %3), %%mm7\n\t" + PAVGB" %%mm1, %%mm0\n\t" + PAVGB" %%mm3, %%mm2\n\t" + PAVGB" %%mm5, %%mm4\n\t" + PAVGB" %%mm7, %%mm6\n\t" + "movq %%mm0, (%0)\n\t" + "movq %%mm2, (%0, %2)\n\t" + "movq %%mm4, (%0, %2, 2)\n\t" + "movq %%mm6, (%0, %3)\n\t" + ::"r"(p), "r"(pix), "r" (line_size), "r" (line_size*3) + :"memory"); + pix += line_size*4; p += line_size*4; + } + while(dh--) { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq 1%1, %%mm1\n\t" + PAVGB" %%mm1, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"=m"(*p) + :"m"(*pix) + :"memory"); + pix += line_size; p += line_size; + } +} + +static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + int dh, hh; + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + + hh=h>>1; + dh=h&1; + while(hh--) { + __asm __volatile( + "movq %2, %%mm0\n\t" + "movq %3, %%mm1\n\t" + "movq %4, %%mm2\n\t" + PAVGB" %%mm1, %%mm0\n\t" + PAVGB" %%mm2, %%mm1\n\t" + "movq %%mm0, %0\n\t" + "movq %%mm1, %1\n\t" + :"=m"(*p), "=m"(*(p+line_size)) + :"m"(*pix), "m"(*(pix+line_size)), + "m"(*(pix+line_size*2)) + :"memory"); + pix += line_size*2; + p += line_size*2; + } + if(dh) { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %2, %%mm1\n\t" + PAVGB" %%mm1, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"=m"(*p) + :"m"(*pix), + "m"(*(pix+line_size)) + :"memory"); + } +} + +static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + int dh, hh; + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + hh=h>>2; + dh=h&3; + while(hh--) { + __asm __volatile( + "movq (%0), %%mm0\n\t" + "movq (%1), %%mm1\n\t" + "movq (%0, %2), %%mm2\n\t" + "movq (%1, %2), %%mm3\n\t" + "movq (%0, %2, 2), %%mm4\n\t" + "movq (%1, %2, 2), %%mm5\n\t" + "movq (%0, %3), %%mm6\n\t" + "movq (%1, %3), %%mm7\n\t" + PAVGB" %%mm1, %%mm0\n\t" + PAVGB" %%mm3, %%mm2\n\t" + PAVGB" %%mm5, %%mm4\n\t" + PAVGB" %%mm7, %%mm6\n\t" + "movq %%mm0, (%0)\n\t" + "movq %%mm2, (%0, %2)\n\t" + "movq %%mm4, (%0, %2, 2)\n\t" + "movq %%mm6, (%0, %3)\n\t" + 
::"r"(p), "r"(pix), "r" (line_size), "r" (line_size*3) + :"memory"); + pix += line_size*4; p += line_size*4; + } + while(dh--) { + __asm __volatile( + "movq %0, %%mm0\n\t" + "movq %1, %%mm1\n\t" + PAVGB" %%mm1, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"+m"(*p) + :"m"(*pix) + :"memory"); + pix += line_size; p += line_size; + } +} + +static void DEF(avg_pixels_x2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + int dh, hh; + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + hh=h>>1; + dh=h&1; + while(hh--) { + __asm __volatile( + "movq %2, %%mm2\n\t" + "movq 1%2, %%mm3\n\t" + "movq %3, %%mm4\n\t" + "movq 1%3, %%mm5\n\t" + "movq %0, %%mm0\n\t" + "movq %1, %%mm1\n\t" + PAVGB" %%mm3, %%mm2\n\t" + PAVGB" %%mm2, %%mm0\n\t" + PAVGB" %%mm5, %%mm4\n\t" + PAVGB" %%mm4, %%mm1\n\t" + "movq %%mm0, %0\n\t" + "movq %%mm1, %1\n\t" + :"+m"(*p), "+m"(*(p+line_size)) + :"m"(*pix), "m"(*(pix+line_size)) + :"memory"); + pix += line_size*2; + p += line_size*2; + } + if(dh) { + __asm __volatile( + "movq %1, %%mm1\n\t" + "movq 1%1, %%mm2\n\t" + "movq %0, %%mm0\n\t" + PAVGB" %%mm2, %%mm1\n\t" + PAVGB" %%mm1, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"+m"(*p) + :"m"(*pix) + :"memory"); + } +} + +static void DEF(avg_pixels_y2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + int dh, hh; + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + hh=h>>1; + dh=h&1; + while(hh--) { + __asm __volatile( + "movq %2, %%mm2\n\t" + "movq %3, %%mm3\n\t" + "movq %3, %%mm4\n\t" + "movq %4, %%mm5\n\t" + "movq %0, %%mm0\n\t" + "movq %1, %%mm1\n\t" + PAVGB" %%mm3, %%mm2\n\t" + PAVGB" %%mm2, %%mm0\n\t" + PAVGB" %%mm5, %%mm4\n\t" + PAVGB" %%mm4, %%mm1\n\t" + "movq %%mm0, %0\n\t" + "movq %%mm1, %1\n\t" + :"+m"(*p), "+m"(*(p+line_size)) + :"m"(*pix), "m"(*(pix+line_size)), "m"(*(pix+line_size*2)) + :"memory"); + pix += line_size*2; + p += line_size*2; + } + if(dh) { + __asm __volatile( + "movq %1, %%mm1\n\t" + "movq %2, %%mm2\n\t" + "movq %0, %%mm0\n\t" + PAVGB" %%mm2, %%mm1\n\t" + PAVGB" %%mm1, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"+m"(*p) + :"m"(*pix), "m"(*(pix+line_size)) + :"memory"); + } +} + +static void DEF(avg_pixels_xy2)( UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + UINT8 *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7\n\t" + "movq %0, %%mm6\n\t" + ::"m"(mm_wtwo)); + do { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %2, %%mm1\n\t" + "movq 1%1, %%mm4\n\t" + "movq 1%2, %%mm5\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm1, %%mm3\n\t" + "punpcklbw %%mm7, %%mm0\n\t" + "punpcklbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm1, %%mm0\n\t" + "paddusw %%mm3, %%mm2\n\t" + "movq %%mm4, %%mm1\n\t" + "movq %%mm5, %%mm3\n\t" + "punpcklbw %%mm7, %%mm4\n\t" + "punpcklbw %%mm7, %%mm5\n\t" + "punpckhbw %%mm7, %%mm1\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "paddusw %%mm5, %%mm4\n\t" + "paddusw %%mm3, %%mm1\n\t" + "paddusw %%mm6, %%mm4\n\t" + "paddusw %%mm6, %%mm1\n\t" + "paddusw %%mm4, %%mm0\n\t" + "paddusw %%mm1, %%mm2\n\t" + "psrlw $2, %%mm0\n\t" + "psrlw $2, %%mm2\n\t" + "packuswb %%mm2, %%mm0\n\t" + PAVGB" %0, %%mm0\n\t" + "movq %%mm0, %0\n\t" + :"+m"(*p) + :"m"(*pix), + "m"(*(pix+line_size)) + :"memory"); + pix += line_size; + p += line_size ; + } while(--h); +} + +static void DEF(sub_pixels_x2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h) +{ + DCTELEM *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7":); + do { + __asm __volatile( + 
"movq 1%1, %%mm2\n\t" + "movq %0, %%mm0\n\t" + PAVGB" %1, %%mm2\n\t" + "movq 8%0, %%mm1\n\t" + "movq %%mm2, %%mm3\n\t" + "punpcklbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "psubsw %%mm2, %%mm0\n\t" + "psubsw %%mm3, %%mm1\n\t" + "movq %%mm0, %0\n\t" + "movq %%mm1, 8%0\n\t" + :"+m"(*p) + :"m"(*pix) + :"memory"); + pix += line_size; + p += 8; + } while (--h); +} + +static void DEF(sub_pixels_y2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h) +{ + DCTELEM *p; + const UINT8 *pix; + p = block; + pix = pixels; + __asm __volatile( + "pxor %%mm7, %%mm7":); + do { + __asm __volatile( + "movq %2, %%mm2\n\t" + "movq %0, %%mm0\n\t" + PAVGB" %1, %%mm2\n\t" + "movq 8%0, %%mm1\n\t" + "movq %%mm2, %%mm3\n\t" + "punpcklbw %%mm7, %%mm2\n\t" + "punpckhbw %%mm7, %%mm3\n\t" + "psubsw %%mm2, %%mm0\n\t" + "psubsw %%mm3, %%mm1\n\t" + "movq %%mm0, %0\n\t" + "movq %%mm1, 8%0\n\t" + :"+m"(*p) + :"m"(*pix), "m"(*(pix+line_size)) + :"memory"); + pix += line_size; + p += 8; + } while (--h); +} + diff --git a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c new file mode 100644 index 000000000..a71b89bb1 --- /dev/null +++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c @@ -0,0 +1,291 @@ +/* + * MMX optimized forward DCT + * The gcc porting is Copyright (c) 2001 Gerard Lantau. + * + * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT + * + * Intel Application Note AP-922 - fast, precise implementation of DCT + * http://developer.intel.com/vtune/cbts/appnotes.htm + */ +#include "../common.h" +#include "mmx.h" + +#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) + +////////////////////////////////////////////////////////////////////// +// +// constants for the forward DCT +// ----------------------------- +// +// Be sure to check that your compiler is aligning all constants to QWORD +// (8-byte) memory boundaries! Otherwise the unaligned memory access will +// severely stall MMX execution. 
+// +////////////////////////////////////////////////////////////////////// + +#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy +#define SHIFT_FRW_COL BITS_FRW_ACC +#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) +//#define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1) +#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) +//#define RND_FRW_COL (2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1) +#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) + +//concatenated table, for forward DCT transformation +const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = { + 13036, 13036, 13036, 13036, // tg * (2<<16) + 0.5 + 27146, 27146, 27146, 27146, // tg * (2<<16) + 0.5 + -21746, -21746, -21746, -21746, // tg * (2<<16) + 0.5 +}; +const int16_t cos_4_16[4] = { + -19195, -19195, -19195, -19195, //cos * (2<<16) + 0.5 +}; + +const int16_t ocos_4_16[4] = { + 23170, 23170, 23170, 23170, //cos * (2<<15) + 0.5 +}; + +const mmx_t fdct_one_corr = {0x0001000100010001LL}; +const mmx_t fdct_r_row = {d:{RND_FRW_ROW, RND_FRW_ROW} }; + +const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table + //row0 + 16384, 16384, 21407, -8867, // w09 w01 w08 w00 + 16384, 16384, 8867, -21407, // w13 w05 w12 w04 + 16384, -16384, 8867, 21407, // w11 w03 w10 w02 + -16384, 16384, -21407, -8867, // w15 w07 w14 w06 + 22725, 12873, 19266, -22725, // w22 w20 w18 w16 + 19266, 4520, -4520, -12873, // w23 w21 w19 w17 + 12873, 4520, 4520, 19266, // w30 w28 w26 w24 + -22725, 19266, -12873, -22725, // w31 w29 w27 w25 + + //row1 + 22725, 22725, 29692, -12299, // w09 w01 w08 w00 + 22725, 22725, 12299, -29692, // w13 w05 w12 w04 + 22725, -22725, 12299, 29692, // w11 w03 w10 w02 + -22725, 22725, -29692, -12299, // w15 w07 w14 w06 + 31521, 17855, 26722, -31521, // w22 w20 w18 w16 + 26722, 6270, -6270, -17855, // w23 w21 w19 w17 + 17855, 6270, 6270, 26722, // w30 w28 w26 w24 + -31521, 26722, -17855, -31521, // w31 w29 w27 w25 + + //row2 + 21407, 21407, 27969, -11585, // w09 w01 w08 w00 + 21407, 21407, 11585, -27969, // w13 w05 w12 w04 + 21407, -21407, 11585, 27969, // w11 w03 w10 w02 + -21407, 21407, -27969, -11585, // w15 w07 w14 w06 + 29692, 16819, 25172, -29692, // w22 w20 w18 w16 + 25172, 5906, -5906, -16819, // w23 w21 w19 w17 + 16819, 5906, 5906, 25172, // w30 w28 w26 w24 + -29692, 25172, -16819, -29692, // w31 w29 w27 w25 + + //row3 + 19266, 19266, 25172, -10426, // w09 w01 w08 w00 + 19266, 19266, 10426, -25172, // w13 w05 w12 w04 + 19266, -19266, 10426, 25172, // w11 w03 w10 w02 + -19266, 19266, -25172, -10426, // w15 w07 w14 w06, + 26722, 15137, 22654, -26722, // w22 w20 w18 w16 + 22654, 5315, -5315, -15137, // w23 w21 w19 w17 + 15137, 5315, 5315, 22654, // w30 w28 w26 w24 + -26722, 22654, -15137, -26722, // w31 w29 w27 w25, + + //row4 + 16384, 16384, 21407, -8867, // w09 w01 w08 w00 + 16384, 16384, 8867, -21407, // w13 w05 w12 w04 + 16384, -16384, 8867, 21407, // w11 w03 w10 w02 + -16384, 16384, -21407, -8867, // w15 w07 w14 w06 + 22725, 12873, 19266, -22725, // w22 w20 w18 w16 + 19266, 4520, -4520, -12873, // w23 w21 w19 w17 + 12873, 4520, 4520, 19266, // w30 w28 w26 w24 + -22725, 19266, -12873, -22725, // w31 w29 w27 w25 + + //row5 + 19266, 19266, 25172, -10426, // w09 w01 w08 w00 + 19266, 19266, 10426, -25172, // w13 w05 w12 w04 + 19266, -19266, 10426, 25172, // w11 w03 w10 w02 + -19266, 19266, -25172, -10426, // w15 w07 w14 w06 + 26722, 15137, 22654, -26722, // w22 w20 w18 w16 + 22654, 5315, -5315, -15137, // w23 w21 w19 w17 + 15137, 5315, 5315, 22654, // w30 w28 w26 w24 + -26722, 22654, -15137, -26722, // w31 w29 w27 
w25 + + //row6 + 21407, 21407, 27969, -11585, // w09 w01 w08 w00 + 21407, 21407, 11585, -27969, // w13 w05 w12 w04 + 21407, -21407, 11585, 27969, // w11 w03 w10 w02 + -21407, 21407, -27969, -11585, // w15 w07 w14 w06, + 29692, 16819, 25172, -29692, // w22 w20 w18 w16 + 25172, 5906, -5906, -16819, // w23 w21 w19 w17 + 16819, 5906, 5906, 25172, // w30 w28 w26 w24 + -29692, 25172, -16819, -29692, // w31 w29 w27 w25, + + //row7 + 22725, 22725, 29692, -12299, // w09 w01 w08 w00 + 22725, 22725, 12299, -29692, // w13 w05 w12 w04 + 22725, -22725, 12299, 29692, // w11 w03 w10 w02 + -22725, 22725, -29692, -12299, // w15 w07 w14 w06, + 31521, 17855, 26722, -31521, // w22 w20 w18 w16 + 26722, 6270, -6270, -17855, // w23 w21 w19 w17 + 17855, 6270, 6270, 26722, // w30 w28 w26 w24 + -31521, 26722, -17855, -31521 // w31 w29 w27 w25 +}; + + +static inline void fdct_col(const int16_t *in, int16_t *out, int offset) +{ + movq_m2r(*(in + offset + 1 * 8), mm0); + movq_m2r(*(in + offset + 6 * 8), mm1); + movq_r2r(mm0, mm2); + movq_m2r(*(in + offset + 2 * 8), mm3); + paddsw_r2r(mm1, mm0); + movq_m2r(*(in + offset + 5 * 8), mm4); + psllw_i2r(SHIFT_FRW_COL, mm0); + movq_m2r(*(in + offset + 0 * 8), mm5); + paddsw_r2r(mm3, mm4); + paddsw_m2r(*(in + offset + 7 * 8), mm5); + psllw_i2r(SHIFT_FRW_COL, mm4); + movq_r2r(mm0, mm6); + psubsw_r2r(mm1, mm2); + movq_m2r(*(fdct_tg_all_16 + 4), mm1); + psubsw_r2r(mm4, mm0); + movq_m2r(*(in + offset + 3 * 8), mm7); + pmulhw_r2r(mm0, mm1); + paddsw_m2r(*(in + offset + 4 * 8), mm7); + psllw_i2r(SHIFT_FRW_COL, mm5); + paddsw_r2r(mm4, mm6); + psllw_i2r(SHIFT_FRW_COL, mm7); + movq_r2r(mm5, mm4); + psubsw_r2r(mm7, mm5); + paddsw_r2r(mm5, mm1); + paddsw_r2r(mm7, mm4); + por_m2r(fdct_one_corr, mm1); + psllw_i2r(SHIFT_FRW_COL + 1, mm2); + pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5); + movq_r2r(mm4, mm7); + psubsw_m2r(*(in + offset + 5 * 8), mm3); + psubsw_r2r(mm6, mm4); + movq_r2m(mm1, *(out + offset + 2 * 8)); + paddsw_r2r(mm6, mm7); + movq_m2r(*(in + offset + 3 * 8), mm1); + psllw_i2r(SHIFT_FRW_COL + 1, mm3); + psubsw_m2r(*(in + offset + 4 * 8), mm1); + movq_r2r(mm2, mm6); + movq_r2m(mm4, *(out + offset + 4 * 8)); + paddsw_r2r(mm3, mm2); + pmulhw_m2r(*ocos_4_16, mm2); + psubsw_r2r(mm3, mm6); + pmulhw_m2r(*ocos_4_16, mm6); + psubsw_r2r(mm0, mm5); + por_m2r(fdct_one_corr, mm5); + psllw_i2r(SHIFT_FRW_COL, mm1); + por_m2r(fdct_one_corr, mm2); + movq_r2r(mm1, mm4); + movq_m2r(*(in + offset + 0 * 8), mm3); + paddsw_r2r(mm6, mm1); + psubsw_m2r(*(in + offset + 7 * 8), mm3); + psubsw_r2r(mm6, mm4); + movq_m2r(*(fdct_tg_all_16 + 0), mm0); + psllw_i2r(SHIFT_FRW_COL, mm3); + movq_m2r(*(fdct_tg_all_16 + 8), mm6); + pmulhw_r2r(mm1, mm0); + movq_r2m(mm7, *(out + offset + 0 * 8)); + pmulhw_r2r(mm4, mm6); + movq_r2m(mm5, *(out + offset + 6 * 8)); + movq_r2r(mm3, mm7); + movq_m2r(*(fdct_tg_all_16 + 8), mm5); + psubsw_r2r(mm2, mm7); + paddsw_r2r(mm2, mm3); + pmulhw_r2r(mm7, mm5); + paddsw_r2r(mm3, mm0); + paddsw_r2r(mm4, mm6); + pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3); + por_m2r(fdct_one_corr, mm0); + paddsw_r2r(mm7, mm5); + psubsw_r2r(mm6, mm7); + movq_r2m(mm0, *(out + offset + 1 * 8)); + paddsw_r2r(mm4, mm5); + movq_r2m(mm7, *(out + offset + 3 * 8)); + psubsw_r2r(mm1, mm3); + movq_r2m(mm5, *(out + offset + 5 * 8)); + movq_r2m(mm3, *(out + offset + 7 * 8)); +} + +static inline void fdct_row(const int16_t *in, int16_t *out, const int16_t *table) +{ + movd_m2r(*(in + 6), mm5); + punpcklwd_m2r(*(in + 4), mm5); + movq_r2r(mm5, mm2); + psrlq_i2r(0x20, mm5); + movq_m2r(*(in + 0), mm0); + punpcklwd_r2r(mm2, mm5); + 
movq_r2r(mm0, mm1); + paddsw_r2r(mm5, mm0); + psubsw_r2r(mm5, mm1); + movq_r2r(mm0, mm2); + punpcklwd_r2r(mm1, mm0); + punpckhwd_r2r(mm1, mm2); + movq_r2r(mm2, mm1); + movq_r2r(mm0, mm2); + movq_m2r(*(table + 0), mm3); + punpcklwd_r2r(mm1, mm0); + movq_r2r(mm0, mm5); + punpckldq_r2r(mm0, mm0); + movq_m2r(*(table + 4), mm4); + punpckhwd_r2r(mm1, mm2); + pmaddwd_r2r(mm0, mm3); + movq_r2r(mm2, mm6); + movq_m2r(*(table + 16), mm1); + punpckldq_r2r(mm2, mm2); + pmaddwd_r2r(mm2, mm4); + punpckhdq_r2r(mm5, mm5); + pmaddwd_m2r(*(table + 8), mm0); + punpckhdq_r2r(mm6, mm6); + movq_m2r(*(table + 20), mm7); + pmaddwd_r2r(mm5, mm1); + paddd_m2r(fdct_r_row, mm3); + pmaddwd_r2r(mm6, mm7); + pmaddwd_m2r(*(table + 12), mm2); + paddd_r2r(mm4, mm3); + pmaddwd_m2r(*(table + 24), mm5); + pmaddwd_m2r(*(table + 28), mm6); + paddd_r2r(mm7, mm1); + paddd_m2r(fdct_r_row, mm0); + psrad_i2r(SHIFT_FRW_ROW, mm3); + paddd_m2r(fdct_r_row, mm1); + paddd_r2r(mm2, mm0); + paddd_m2r(fdct_r_row, mm5); + psrad_i2r(SHIFT_FRW_ROW, mm1); + paddd_r2r(mm6, mm5); + psrad_i2r(SHIFT_FRW_ROW, mm0); + psrad_i2r(SHIFT_FRW_ROW, mm5); + packssdw_r2r(mm0, mm3); + packssdw_r2r(mm5, mm1); + movq_r2r(mm3, mm6); + punpcklwd_r2r(mm1, mm3); + punpckhwd_r2r(mm1, mm6); + movq_r2m(mm3, *(out + 0)); + movq_r2m(mm6, *(out + 4)); +} + +void fdct_mmx(int16_t *block) +{ + /* XXX: not thread safe */ + static int16_t block_tmp[64] ATTR_ALIGN(8); + int16_t *block1, *out; + const int16_t *table; + int i; + + block1 = block_tmp; + fdct_col(block, block1, 0); + fdct_col(block, block1, 4); + + block1 = block_tmp; + table = tab_frw_01234567; + out = block; + for(i=8;i>0;i--) { + fdct_row(block1, out, table); + block1 += 8; + table += 32; + out += 8; + } +} diff --git a/src/libffmpeg/libavcodec/i386/idct_mmx.c b/src/libffmpeg/libavcodec/i386/idct_mmx.c new file mode 100644 index 000000000..618c1cfde --- /dev/null +++ b/src/libffmpeg/libavcodec/i386/idct_mmx.c @@ -0,0 +1,590 @@ +/* + * Note: For libavcodec, this code can also be used under the LGPL license + */ +/* + * idct_mmx.c + * Copyright (C) 1999-2001 Aaron Holtzman + * + * This file is part of mpeg2dec, a free MPEG-2 video stream decoder. + * + * mpeg2dec is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * mpeg2dec is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "../common.h" + +#include "mmx.h" + +#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) + +#define ROW_SHIFT 11 +#define COL_SHIFT 6 + +#define round(bias) ((int)(((bias)+0.5) * (1<> ROW_SHIFT; + row[1] = (a1 + b1) >> ROW_SHIFT; + row[2] = (a2 + b2) >> ROW_SHIFT; + row[3] = (a3 + b3) >> ROW_SHIFT; + row[4] = (a3 - b3) >> ROW_SHIFT; + row[5] = (a2 - b2) >> ROW_SHIFT; + row[6] = (a1 - b1) >> ROW_SHIFT; + row[7] = (a0 - b0) >> ROW_SHIFT; +} +#endif + + +/* MMXEXT row IDCT */ + +#define mmxext_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, -c4, -c2, \ + c4, c6, c4, c6, \ + c1, c3, -c1, -c5, \ + c5, c7, c3, -c7, \ + c4, -c6, c4, -c6, \ + -c4, c2, c4, -c2, \ + c5, -c1, c3, -c1, \ + c7, c3, c7, -c5 } + +static inline void mmxext_row_head (int16_t * row, int offset, int16_t * table) +{ + movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 + + movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 + movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 + + movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 + movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 + + movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 + pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 + + pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 +} + +static inline void mmxext_row (int16_t * table, int32_t * rounder) +{ + movq_m2r (*(table+8), mm1); // mm1 = -C5 -C1 C3 C1 + pmaddwd_r2r (mm2, mm4); // mm4 = C4*x0+C6*x2 C4*x4+C6*x6 + + pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x4-C6*x6 C4*x0-C6*x2 + pshufw_r2r (mm6, mm6, 0x4e); // mm6 = x3 x1 x7 x5 + + movq_m2r (*(table+12), mm7); // mm7 = -C7 C3 C7 C5 + pmaddwd_r2r (mm5, mm1); // mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 + + paddd_m2r (*rounder, mm3); // mm3 += rounder + pmaddwd_r2r (mm6, mm7); // mm7 = C3*x1-C7*x3 C5*x5+C7*x7 + + pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 + paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder + + pmaddwd_m2r (*(table+24), mm5); // mm5 = C3*x5-C1*x7 C5*x1-C1*x3 + movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder + + pmaddwd_m2r (*(table+28), mm6); // mm6 = C7*x1-C5*x3 C7*x5+C3*x7 + paddd_r2r (mm7, mm1); // mm1 = b1 b0 + + paddd_m2r (*rounder, mm0); // mm0 += rounder + psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder + + psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 + paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder + + paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder + psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 + + paddd_r2r (mm6, mm5); // mm5 = b3 b2 + movq_r2r (mm0, mm4); // mm4 = a3 a2 + rounder + + paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder + psubd_r2r (mm5, mm4); // mm4 = a3-b3 a2-b2 + rounder +} + +static inline void mmxext_row_tail (int16_t * row, int store) +{ + psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 + + psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5 + + packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 + + packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 + + movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 + pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 + + /* slot */ + + movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 +} + +static inline void mmxext_row_mid (int16_t * row, int store, + int offset, int16_t * table) +{ + movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 + psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 + + movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 + psrad_i2r (ROW_SHIFT, mm4); // 
mm4 = y4 y5 + + packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 + movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 + + packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 + movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 + + movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 + pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 + + movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 + movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 + + pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 + + movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 + pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 +} + + +/* MMX row IDCT */ + +#define mmx_table(c1,c2,c3,c4,c5,c6,c7) { c4, c2, c4, c6, \ + c4, c6, -c4, -c2, \ + c1, c3, c3, -c7, \ + c5, c7, -c1, -c5, \ + c4, -c6, c4, -c2, \ + -c4, c2, c4, -c6, \ + c5, -c1, c7, -c5, \ + c7, c3, c3, -c1 } + +static inline void mmx_row_head (int16_t * row, int offset, int16_t * table) +{ + movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 + + movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 + movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 + + movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 + movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 + + punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 + + movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 + pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 + + movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 + punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 +} + +static inline void mmx_row (int16_t * table, int32_t * rounder) +{ + pmaddwd_r2r (mm2, mm4); // mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 + punpckldq_r2r (mm5, mm5); // mm5 = x3 x1 x3 x1 + + pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x0-C2*x2 C4*x0-C6*x2 + punpckhdq_r2r (mm6, mm6); // mm6 = x7 x5 x7 x5 + + movq_m2r (*(table+12), mm7); // mm7 = -C5 -C1 C7 C5 + pmaddwd_r2r (mm5, mm1); // mm1 = C3*x1-C7*x3 C1*x1+C3*x3 + + paddd_m2r (*rounder, mm3); // mm3 += rounder + pmaddwd_r2r (mm6, mm7); // mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 + + pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 + paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder + + pmaddwd_m2r (*(table+24), mm5); // mm5 = C7*x1-C5*x3 C5*x1-C1*x3 + movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder + + pmaddwd_m2r (*(table+28), mm6); // mm6 = C3*x5-C1*x7 C7*x5+C3*x7 + paddd_r2r (mm7, mm1); // mm1 = b1 b0 + + paddd_m2r (*rounder, mm0); // mm0 += rounder + psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder + + psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 + paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder + + paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder + psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 + + paddd_r2r (mm6, mm5); // mm5 = b3 b2 + movq_r2r (mm0, mm7); // mm7 = a3 a2 + rounder + + paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder + psubd_r2r (mm5, mm7); // mm7 = a3-b3 a2-b2 + rounder +} + +static inline void mmx_row_tail (int16_t * row, int store) +{ + psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 + + psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 + + packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 + + packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 + + movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 + movq_r2r (mm7, mm4); // mm4 = y6 y7 y4 y5 + + pslld_i2r (16, mm7); // mm7 = y7 0 y5 0 + + psrld_i2r (16, mm4); // mm4 = 0 y6 0 y4 + + por_r2r (mm4, mm7); // mm7 = y7 y6 y5 y4 + + /* slot */ + + movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 +} + +static inline void mmx_row_mid (int16_t * row, int store, + int offset, int16_t * table) +{ + movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 + psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 + + 
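+    /* The head/row/tail/mid split software-pipelines the 8 row
+     * transforms: this "mid" step packs and stores the y0..y7 results of
+     * the row computed just before, while it is already loading and
+     * premultiplying the x0..x7 of the next row, so stores, loads and
+     * pmaddwd work from adjacent rows overlap.
+     */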
movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 + psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 + + packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 + movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 + + packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 + movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 + + movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 + movq_r2r (mm7, mm1); // mm1 = y6 y7 y4 y5 + + punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 + psrld_i2r (16, mm7); // mm7 = 0 y6 0 y4 + + movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 + pslld_i2r (16, mm1); // mm1 = y7 0 y5 0 + + movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 + por_r2r (mm1, mm7); // mm7 = y7 y6 y5 y4 + + movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 + punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 + + movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 + pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 +} + + +#if 0 +// C column IDCT - its just here to document the MMXEXT and MMX versions +static inline void idct_col (int16_t * col, int offset) +{ +/* multiplication - as implemented on mmx */ +#define F(c,x) (((c) * (x)) >> 16) + +/* saturation - it helps us handle torture test cases */ +#define S(x) (((x)>32767) ? 32767 : ((x)<-32768) ? -32768 : (x)) + + int16_t x0, x1, x2, x3, x4, x5, x6, x7; + int16_t y0, y1, y2, y3, y4, y5, y6, y7; + int16_t a0, a1, a2, a3, b0, b1, b2, b3; + int16_t u04, v04, u26, v26, u17, v17, u35, v35, u12, v12; + + col += offset; + + x0 = col[0*8]; + x1 = col[1*8]; + x2 = col[2*8]; + x3 = col[3*8]; + x4 = col[4*8]; + x5 = col[5*8]; + x6 = col[6*8]; + x7 = col[7*8]; + + u04 = S (x0 + x4); + v04 = S (x0 - x4); + u26 = S (F (T2, x6) + x2); + v26 = S (F (T2, x2) - x6); + + a0 = S (u04 + u26); + a1 = S (v04 + v26); + a2 = S (v04 - v26); + a3 = S (u04 - u26); + + u17 = S (F (T1, x7) + x1); + v17 = S (F (T1, x1) - x7); + u35 = S (F (T3, x5) + x3); + v35 = S (F (T3, x3) - x5); + + b0 = S (u17 + u35); + b3 = S (v17 - v35); + u12 = S (u17 - u35); + v12 = S (v17 + v35); + u12 = S (2 * F (C4, u12)); + v12 = S (2 * F (C4, v12)); + b1 = S (u12 + v12); + b2 = S (u12 - v12); + + y0 = S (a0 + b0) >> COL_SHIFT; + y1 = S (a1 + b1) >> COL_SHIFT; + y2 = S (a2 + b2) >> COL_SHIFT; + y3 = S (a3 + b3) >> COL_SHIFT; + + y4 = S (a3 - b3) >> COL_SHIFT; + y5 = S (a2 - b2) >> COL_SHIFT; + y6 = S (a1 - b1) >> COL_SHIFT; + y7 = S (a0 - b0) >> COL_SHIFT; + + col[0*8] = y0; + col[1*8] = y1; + col[2*8] = y2; + col[3*8] = y3; + col[4*8] = y4; + col[5*8] = y5; + col[6*8] = y6; + col[7*8] = y7; +} +#endif + + +// MMX column IDCT +static inline void idct_col (int16_t * col, int offset) +{ +#define T1 13036 +#define T2 27146 +#define T3 43790 +#define C4 23170 + + static short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1}; + static short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2}; + static short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3}; + static short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4}; + + /* column code adapted from peter gubanov */ + /* http://www.elecard.com/peter/idct.shtml */ + + movq_m2r (*_T1, mm0); // mm0 = T1 + + movq_m2r (*(col+offset+1*8), mm1); // mm1 = x1 + movq_r2r (mm0, mm2); // mm2 = T1 + + movq_m2r (*(col+offset+7*8), mm4); // mm4 = x7 + pmulhw_r2r (mm1, mm0); // mm0 = T1*x1 + + movq_m2r (*_T3, mm5); // mm5 = T3 + pmulhw_r2r (mm4, mm2); // mm2 = T1*x7 + + movq_m2r (*(col+offset+5*8), mm6); // mm6 = x5 + movq_r2r (mm5, mm7); // mm7 = T3-1 + + movq_m2r (*(col+offset+3*8), mm3); // mm3 = x3 + psubsw_r2r (mm4, mm0); // mm0 = v17 + + movq_m2r (*_T2, mm4); // mm4 = T2 + pmulhw_r2r (mm3, mm5); // mm5 = (T3-1)*x3 + + paddsw_r2r (mm2, mm1); // 
mm1 = u17 + pmulhw_r2r (mm6, mm7); // mm7 = (T3-1)*x5 + + /* slot */ + + movq_r2r (mm4, mm2); // mm2 = T2 + paddsw_r2r (mm3, mm5); // mm5 = T3*x3 + + pmulhw_m2r (*(col+offset+2*8), mm4);// mm4 = T2*x2 + paddsw_r2r (mm6, mm7); // mm7 = T3*x5 + + psubsw_r2r (mm6, mm5); // mm5 = v35 + paddsw_r2r (mm3, mm7); // mm7 = u35 + + movq_m2r (*(col+offset+6*8), mm3); // mm3 = x6 + movq_r2r (mm0, mm6); // mm6 = v17 + + pmulhw_r2r (mm3, mm2); // mm2 = T2*x6 + psubsw_r2r (mm5, mm0); // mm0 = b3 + + psubsw_r2r (mm3, mm4); // mm4 = v26 + paddsw_r2r (mm6, mm5); // mm5 = v12 + + movq_r2m (mm0, *(col+offset+3*8)); // save b3 in scratch0 + movq_r2r (mm1, mm6); // mm6 = u17 + + paddsw_m2r (*(col+offset+2*8), mm2);// mm2 = u26 + paddsw_r2r (mm7, mm6); // mm6 = b0 + + psubsw_r2r (mm7, mm1); // mm1 = u12 + movq_r2r (mm1, mm7); // mm7 = u12 + + movq_m2r (*(col+offset+0*8), mm3); // mm3 = x0 + paddsw_r2r (mm5, mm1); // mm1 = u12+v12 + + movq_m2r (*_C4, mm0); // mm0 = C4/2 + psubsw_r2r (mm5, mm7); // mm7 = u12-v12 + + movq_r2m (mm6, *(col+offset+5*8)); // save b0 in scratch1 + pmulhw_r2r (mm0, mm1); // mm1 = b1/2 + + movq_r2r (mm4, mm6); // mm6 = v26 + pmulhw_r2r (mm0, mm7); // mm7 = b2/2 + + movq_m2r (*(col+offset+4*8), mm5); // mm5 = x4 + movq_r2r (mm3, mm0); // mm0 = x0 + + psubsw_r2r (mm5, mm3); // mm3 = v04 + paddsw_r2r (mm5, mm0); // mm0 = u04 + + paddsw_r2r (mm3, mm4); // mm4 = a1 + movq_r2r (mm0, mm5); // mm5 = u04 + + psubsw_r2r (mm6, mm3); // mm3 = a2 + paddsw_r2r (mm2, mm5); // mm5 = a0 + + paddsw_r2r (mm1, mm1); // mm1 = b1 + psubsw_r2r (mm2, mm0); // mm0 = a3 + + paddsw_r2r (mm7, mm7); // mm7 = b2 + movq_r2r (mm3, mm2); // mm2 = a2 + + movq_r2r (mm4, mm6); // mm6 = a1 + paddsw_r2r (mm7, mm3); // mm3 = a2+b2 + + psraw_i2r (COL_SHIFT, mm3); // mm3 = y2 + paddsw_r2r (mm1, mm4); // mm4 = a1+b1 + + psraw_i2r (COL_SHIFT, mm4); // mm4 = y1 + psubsw_r2r (mm1, mm6); // mm6 = a1-b1 + + movq_m2r (*(col+offset+5*8), mm1); // mm1 = b0 + psubsw_r2r (mm7, mm2); // mm2 = a2-b2 + + psraw_i2r (COL_SHIFT, mm6); // mm6 = y6 + movq_r2r (mm5, mm7); // mm7 = a0 + + movq_r2m (mm4, *(col+offset+1*8)); // save y1 + psraw_i2r (COL_SHIFT, mm2); // mm2 = y5 + + movq_r2m (mm3, *(col+offset+2*8)); // save y2 + paddsw_r2r (mm1, mm5); // mm5 = a0+b0 + + movq_m2r (*(col+offset+3*8), mm4); // mm4 = b3 + psubsw_r2r (mm1, mm7); // mm7 = a0-b0 + + psraw_i2r (COL_SHIFT, mm5); // mm5 = y0 + movq_r2r (mm0, mm3); // mm3 = a3 + + movq_r2m (mm2, *(col+offset+5*8)); // save y5 + psubsw_r2r (mm4, mm3); // mm3 = a3-b3 + + psraw_i2r (COL_SHIFT, mm7); // mm7 = y7 + paddsw_r2r (mm0, mm4); // mm4 = a3+b3 + + movq_r2m (mm5, *(col+offset+0*8)); // save y0 + psraw_i2r (COL_SHIFT, mm3); // mm3 = y4 + + movq_r2m (mm6, *(col+offset+6*8)); // save y6 + psraw_i2r (COL_SHIFT, mm4); // mm4 = y3 + + movq_r2m (mm7, *(col+offset+7*8)); // save y7 + + movq_r2m (mm3, *(col+offset+4*8)); // save y4 + + movq_r2m (mm4, *(col+offset+3*8)); // save y3 +} + + +static int32_t rounder0[] ATTR_ALIGN(8) = + rounder ((1 << (COL_SHIFT - 1)) - 0.5); +static int32_t rounder4[] ATTR_ALIGN(8) = rounder (0); +static int32_t rounder1[] ATTR_ALIGN(8) = + rounder (1.25683487303); /* C1*(C1/C4+C1+C7)/2 */ +static int32_t rounder7[] ATTR_ALIGN(8) = + rounder (-0.25); /* C1*(C7/C4+C7-C1)/2 */ +static int32_t rounder2[] ATTR_ALIGN(8) = + rounder (0.60355339059); /* C2 * (C6+C2)/2 */ +static int32_t rounder6[] ATTR_ALIGN(8) = + rounder (-0.25); /* C2 * (C6-C2)/2 */ +static int32_t rounder3[] ATTR_ALIGN(8) = + rounder (0.087788325588); /* C3*(-C3/C4+C3+C5)/2 */ +static int32_t rounder5[] 
ATTR_ALIGN(8) = + rounder (-0.441341716183); /* C3*(-C5/C4+C5-C3)/2 */ + + +#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ +void idct (int16_t * block) \ +{ \ + static int16_t table04[] ATTR_ALIGN(16) = \ + table (22725, 21407, 19266, 16384, 12873, 8867, 4520); \ + static int16_t table17[] ATTR_ALIGN(16) = \ + table (31521, 29692, 26722, 22725, 17855, 12299, 6270); \ + static int16_t table26[] ATTR_ALIGN(16) = \ + table (29692, 27969, 25172, 21407, 16819, 11585, 5906); \ + static int16_t table35[] ATTR_ALIGN(16) = \ + table (26722, 25172, 22654, 19266, 15137, 10426, 5315); \ + \ + idct_row_head (block, 0*8, table04); \ + idct_row (table04, rounder0); \ + idct_row_mid (block, 0*8, 4*8, table04); \ + idct_row (table04, rounder4); \ + idct_row_mid (block, 4*8, 1*8, table17); \ + idct_row (table17, rounder1); \ + idct_row_mid (block, 1*8, 7*8, table17); \ + idct_row (table17, rounder7); \ + idct_row_mid (block, 7*8, 2*8, table26); \ + idct_row (table26, rounder2); \ + idct_row_mid (block, 2*8, 6*8, table26); \ + idct_row (table26, rounder6); \ + idct_row_mid (block, 6*8, 3*8, table35); \ + idct_row (table35, rounder3); \ + idct_row_mid (block, 3*8, 5*8, table35); \ + idct_row (table35, rounder5); \ + idct_row_tail (block, 5*8); \ + \ + idct_col (block, 0); \ + idct_col (block, 4); \ +} + + +declare_idct (ff_mmxext_idct, mmxext_table, + mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid) + +declare_idct (ff_mmx_idct, mmx_table, + mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid) diff --git a/src/libffmpeg/libavcodec/i386/mmx.h b/src/libffmpeg/libavcodec/i386/mmx.h new file mode 100644 index 000000000..2ba28898d --- /dev/null +++ b/src/libffmpeg/libavcodec/i386/mmx.h @@ -0,0 +1 @@ +#include "xineutils.h" diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c new file mode 100644 index 000000000..35b16b711 --- /dev/null +++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c @@ -0,0 +1,244 @@ +/* + * MMX optimized motion estimation + * Copyright (c) 2001 Gerard Lantau. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ +#include "../dsputil.h" +#include "mmx.h" + +static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001; +static const unsigned long long int mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002; + +/* mm7 is accumulator, mm6 is zero */ +static inline void sad_add(const UINT8 *p1, const UINT8 *p2) +{ + movq_m2r(*p1, mm0); + movq_m2r(*p2, mm1); + movq_r2r(mm0, mm2); + psubusb_r2r(mm1, mm0); + psubusb_r2r(mm2, mm1); + por_r2r(mm1, mm0); /* mm0 is absolute value */ + + movq_r2r(mm0, mm1); + punpcklbw_r2r(mm6, mm0); + punpckhbw_r2r(mm6, mm1); + paddusw_r2r(mm0, mm7); + paddusw_r2r(mm1, mm7); +} + +/* convert mm7 to value */ +static inline int sad_end(void) +{ + int res; + + movq_r2r(mm7, mm0); + psrlq_i2r(32, mm7); + paddusw_r2r(mm0, mm7); + + movq_r2r(mm7, mm0); + psrlq_i2r(16, mm7); + paddusw_r2r(mm0, mm7); + __asm __volatile ("movd %%mm7, %0" : "=a" (res)); + return res & 0xffff; +} + +int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) +{ + const UINT8 *p1, *p2; + + h >>= 1; + p1 = blk1; + p2 = blk2; + pxor_r2r(mm7, mm7); /* mm7 is accumulator */ + pxor_r2r(mm6, mm6); /* mm7 is zero constant */ + do { + sad_add(p1, p2); + sad_add(p1 + 8, p2 + 8); + p1 += lx; + p2 += lx; + sad_add(p1, p2); + sad_add(p1 + 8, p2 + 8); + p1 += lx; + p2 += lx; + } while (--h); + return sad_end(); +} + +/* please test it ! */ +static inline void sad_add_sse(const UINT8 *p1, const UINT8 *p2) +{ + movq_m2r(*(p1 + 0), mm0); + movq_m2r(*(p1 + 8), mm1); + psadbw_m2r(*(p2 + 0), mm0); + psadbw_m2r(*(p2 + 8), mm1); + paddusw_r2r(mm0, mm7); + paddusw_r2r(mm1, mm7); +} + +int pix_abs16x16_sse(UINT8 *blk1, UINT8 *blk2, int lx, int h) +{ + const UINT8 *p1, *p2; + + h >>= 1; + p1 = blk1; + p2 = blk2; + pxor_r2r(mm7, mm7); /* mm7 is accumulator */ + do { + sad_add_sse(p1, p2); + p1 += lx; + p2 += lx; + sad_add_sse(p1, p2); + p1 += lx; + p2 += lx; + } while (--h); + return sad_end(); +} + +#define DUMP(reg) { mmx_t tmp; movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq); } + +/* mm7 is accumulator, mm6 is zero */ +static inline void sad_add_x2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3) +{ + movq_m2r(*(p2 + 0), mm0); + movq_m2r(*(p3 + 0), mm1); + movq_r2r(mm0, mm2); + movq_r2r(mm1, mm3); + punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */ + punpcklbw_r2r(mm6, mm1); + punpckhbw_r2r(mm6, mm2); /* high */ + punpckhbw_r2r(mm6, mm3); + paddusw_r2r(mm1, mm0); + paddusw_r2r(mm3, mm2); + movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */ + paddusw_r2r(mm5, mm0); /* + 1 */ + paddusw_r2r(mm5, mm2); /* + 1 */ + psrlw_i2r(1, mm0); + psrlw_i2r(1, mm2); + packuswb_r2r(mm2, mm0); /* average is in mm0 */ + + movq_r2r(mm1, mm2); + psubusb_r2r(mm0, mm1); + psubusb_r2r(mm2, mm0); + por_r2r(mm1, mm0); /* mm0 is absolute value */ + + movq_r2r(mm0, mm1); + punpcklbw_r2r(mm6, mm0); + punpckhbw_r2r(mm6, mm1); + paddusw_r2r(mm0, mm7); + paddusw_r2r(mm1, mm7); +} + +int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) +{ + const UINT8 *p1, *p2; + + p1 = blk1; + p2 = blk2; + pxor_r2r(mm7, mm7); /* mm7 is accumulator */ + pxor_r2r(mm6, mm6); /* mm7 is zero constant */ + movq_m2r(mm_wone, mm5); /* one constant */ + do { + sad_add_x2(p1, p2, p2 + 1); + sad_add_x2(p1 + 8, p2 + 8, p2 + 9); + p1 += lx; + p2 += lx; + } while (--h); + return sad_end(); +} + +int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) +{ + const UINT8 *p1, *p2; + + p1 = blk1; + p2 = blk2; + pxor_r2r(mm7, mm7); /* mm7 is accumulator */ + pxor_r2r(mm6, mm6); /* mm7 is zero constant */ + 
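+    /* per-byte scalar sketch of what sad_add_x2 accumulates below
+       (p3 = p2 + lx, the line underneath; mm5 provides the +1 rounding):
+
+           avg  = (p2[i] + p3[i] + 1) >> 1;
+           sad += abs(p1[i] - avg);
+
+       i.e. the SAD against a vertically half-pel averaged prediction */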
movq_m2r(mm_wone, mm5); /* one constant */ + do { + sad_add_x2(p1, p2, p2 + lx); + sad_add_x2(p1 + 8, p2 + 8, p2 + 8 + lx); + p1 += lx; + p2 += lx; + } while (--h); + return sad_end(); +} + +/* mm7 is accumulator, mm6 is zero */ +static inline void sad_add_xy2(const UINT8 *p1, const UINT8 *p2, const UINT8 *p3) +{ + movq_m2r(*(p2 + 0), mm0); + movq_m2r(*(p3 + 0), mm1); + movq_r2r(mm0, mm2); + movq_r2r(mm1, mm3); + punpcklbw_r2r(mm6, mm0); /* extract 4 bytes low */ + punpcklbw_r2r(mm6, mm1); + punpckhbw_r2r(mm6, mm2); /* high */ + punpckhbw_r2r(mm6, mm3); + paddusw_r2r(mm1, mm0); + paddusw_r2r(mm3, mm2); + + movq_m2r(*(p2 + 1), mm1); + movq_m2r(*(p3 + 1), mm3); + movq_r2r(mm1, mm4); + punpcklbw_r2r(mm6, mm1); /* low */ + punpckhbw_r2r(mm6, mm4); /* high */ + paddusw_r2r(mm1, mm0); + paddusw_r2r(mm4, mm2); + movq_r2r(mm3, mm4); + punpcklbw_r2r(mm6, mm3); /* low */ + punpckhbw_r2r(mm6, mm4); /* high */ + paddusw_r2r(mm3, mm0); + paddusw_r2r(mm4, mm2); + + movq_m2r(*(p1 + 0), mm1); /* mm1 : other value */ + paddusw_r2r(mm5, mm0); /* + 2 */ + paddusw_r2r(mm5, mm2); /* + 2 */ + psrlw_i2r(2, mm0); + psrlw_i2r(2, mm2); + packuswb_r2r(mm2, mm0); /* average is in mm0 */ + + movq_r2r(mm1, mm2); + psubusb_r2r(mm0, mm1); + psubusb_r2r(mm2, mm0); + por_r2r(mm1, mm0); /* mm0 is absolute value */ + + movq_r2r(mm0, mm1); + punpcklbw_r2r(mm6, mm0); + punpckhbw_r2r(mm6, mm1); + paddusw_r2r(mm0, mm7); + paddusw_r2r(mm1, mm7); +} + +int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx, int h) +{ + const UINT8 *p1, *p2, *p3; + + p1 = blk1; + p2 = blk2; + p3 = blk2 + lx; + pxor_r2r(mm7, mm7); /* mm7 is accumulator */ + pxor_r2r(mm6, mm6); /* mm7 is zero constant */ + movq_m2r(mm_wtwo, mm5); /* one constant */ + do { + sad_add_xy2(p1, p2, p2 + lx); + sad_add_xy2(p1 + 8, p2 + 8, p2 + 8 + lx); + p1 += lx; + p2 += lx; + } while (--h); + return sad_end(); +} diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c new file mode 100644 index 000000000..b34d4eb03 --- /dev/null +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c @@ -0,0 +1,326 @@ +/* + * The simplest mpeg encoder (well, it was the simplest!) + * Copyright (c) 2000,2001 Gerard Lantau. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * Optimized for ia32 cpus by Nick Kurshev + */ + +#include "../dsputil.h" +#include "../mpegvideo.h" + +#if 0 + +/* XXX: GL: I don't understand why this function needs optimization + (it is called only once per frame!), so I disabled it */ + +void MPV_frame_start(MpegEncContext *s) +{ + if (s->pict_type == B_TYPE) { + __asm __volatile( + "movl (%1), %%eax\n\t" + "movl 4(%1), %%edx\n\t" + "movl 8(%1), %%ecx\n\t" + "movl %%eax, (%0)\n\t" + "movl %%edx, 4(%0)\n\t" + "movl %%ecx, 8(%0)\n\t" + : + :"r"(s->current_picture), "r"(s->aux_picture) + :"eax","edx","ecx","memory"); + } else { + /* swap next and last */ + __asm __volatile( + "movl (%1), %%eax\n\t" + "movl 4(%1), %%edx\n\t" + "movl 8(%1), %%ecx\n\t" + "xchgl (%0), %%eax\n\t" + "xchgl 4(%0), %%edx\n\t" + "xchgl 8(%0), %%ecx\n\t" + "movl %%eax, (%1)\n\t" + "movl %%edx, 4(%1)\n\t" + "movl %%ecx, 8(%1)\n\t" + "movl %%eax, (%2)\n\t" + "movl %%edx, 4(%2)\n\t" + "movl %%ecx, 8(%2)\n\t" + : + :"r"(s->last_picture), "r"(s->next_picture), "r"(s->current_picture) + :"eax","edx","ecx","memory"); + } +} +#endif + +static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; +static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; + + +static void dct_unquantize_h263_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int i, level, qmul, qadd; + + qmul = s->qscale << 1; + qadd = (s->qscale - 1) | 1; + + if (s->mb_intra) { + if (n < 4) + block[0] = block[0] * s->y_dc_scale; + else + block[0] = block[0] * s->c_dc_scale; + + for(i=1; i<8; i++) { + level = block[i]; + if (level) { + if (level < 0) { + level = level * qmul - qadd; + } else { + level = level * qmul + qadd; + } + block[i] = level; + } + } + } else { + i = 0; + } + +asm volatile( + "movd %1, %%mm6 \n\t" //qmul + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "movd %2, %%mm5 \n\t" //qadd + "pxor %%mm7, %%mm7 \n\t" + "packssdw %%mm5, %%mm5 \n\t" + "packssdw %%mm5, %%mm5 \n\t" + "psubw %%mm5, %%mm7 \n\t" + "pxor %%mm4, %%mm4 \n\t" + ".balign 16\n\t" + "1: \n\t" + "movq (%0, %3), %%mm0 \n\t" + "movq 8(%0, %3), %%mm1 \n\t" + + "pmullw %%mm6, %%mm0 \n\t" + "pmullw %%mm6, %%mm1 \n\t" + + "movq (%0, %3), %%mm2 \n\t" + "movq 8(%0, %3), %%mm3 \n\t" + + "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + + "paddw %%mm7, %%mm0 \n\t" + "paddw %%mm7, %%mm1 \n\t" + + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + + "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 + + "pandn %%mm2, %%mm0 \n\t" + "pandn %%mm3, %%mm1 \n\t" + + "movq %%mm0, (%0, %3) \n\t" + "movq %%mm1, 8(%0, %3) \n\t" + + "addl $16, %3 \n\t" + "cmpl $128, %3 \n\t" + "jb 1b \n\t" + ::"r" (block), "g"(qmul), "g" (qadd), "r" (2*i) + : "memory" + ); +} + + +/* + NK: + Note: looking at PARANOID: + "enable all paranoid tests for rounding, overflows, etc..." + +#ifdef PARANOID + if (level < -2048 || level > 2047) + fprintf(stderr, "unquant error %d %d\n", i, level); +#endif + We can suppose that result of two multiplications can't be greate of 0xFFFF + i.e. is 16-bit, so we use here only PMULLW instruction and can avoid + a complex multiplication. 
+===================================================== + Full formula for multiplication of 2 integer numbers + which are represent as high:low words: + input: value1 = high1:low1 + value2 = high2:low2 + output: value3 = value1*value2 + value3=high3:low3 (on overflow: modulus 2^32 wrap-around) + this mean that for 0x123456 * 0x123456 correct result is 0x766cb0ce4 + but this algorithm will compute only 0x66cb0ce4 + this limited by 16-bit size of operands + --------------------------------- + tlow1 = high1*low2 + tlow2 = high2*low1 + tlow1 = tlow1 + tlow2 + high3:low3 = low1*low2 + high3 += tlow1 +*/ +static void dct_unquantize_mpeg1_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int i, level; + const UINT16 *quant_matrix; + if (s->mb_intra) { + if (n < 4) + block[0] = block[0] * s->y_dc_scale; + else + block[0] = block[0] * s->c_dc_scale; + if (s->out_format == FMT_H263) { + i = 1; + goto unquant_even; + } + /* XXX: only mpeg1 */ + quant_matrix = s->intra_matrix; + i=1; + /* Align on 4 elements boundary */ + while(i&3) + { + level = block[i]; + if (level) { + if (level < 0) level = -level; + level = (int)(level * qscale * quant_matrix[i]) >> 3; + level = (level - 1) | 1; + if (block[i] < 0) level = -level; + block[i] = level; + } + i++; + } + __asm __volatile( + "movd %0, %%mm6\n\t" /* mm6 = qscale | 0 */ + "punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */ + "movq %2, %%mm4\n\t" + "movq %%mm6, %%mm7\n\t" + "movq %1, %%mm5\n\t" + "packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */ + "pxor %%mm6, %%mm6\n\t" + ::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory"); + for(;i<64;i+=4) { + __asm __volatile( + "movq %1, %%mm0\n\t" + "movq %%mm7, %%mm1\n\t" + "movq %%mm0, %%mm2\n\t" + "movq %%mm0, %%mm3\n\t" + "pcmpgtw %%mm6, %%mm2\n\t" + "pmullw %2, %%mm1\n\t" + "pandn %%mm4, %%mm2\n\t" + "por %%mm5, %%mm2\n\t" + "pmullw %%mm2, %%mm0\n\t" /* mm0 = abs(block[i]). */ + + "pcmpeqw %%mm6, %%mm3\n\t" + "pmullw %%mm0, %%mm1\n\t" + "psraw $3, %%mm1\n\t" + "psubw %%mm5, %%mm1\n\t" /* block[i] --; */ + "pandn %%mm4, %%mm3\n\t" /* fake of pcmpneqw : mm0 != 0 then mm1 = -1 */ + "por %%mm5, %%mm1\n\t" /* block[i] |= 1 */ + "pmullw %%mm2, %%mm1\n\t" /* change signs again */ + + "pand %%mm3, %%mm1\n\t" /* nullify if was zero */ + "movq %%mm1, %0" + :"=m"(block[i]) + :"m"(block[i]), "m"(quant_matrix[i]) + :"memory"); + } + } else { + i = 0; + unquant_even: + quant_matrix = s->non_intra_matrix; + /* Align on 4 elements boundary */ + while(i&7) + { + level = block[i]; + if (level) { + if (level < 0) level = -level; + level = (((level << 1) + 1) * qscale * + ((int) quant_matrix[i])) >> 4; + level = (level - 1) | 1; + if(block[i] < 0) level = -level; + block[i] = level; + } + i++; + } + +asm volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $15, %%mm7 \n\t" + "movd %2, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + ".balign 16\n\t" + "1: \n\t" + "movq (%0, %3), %%mm0 \n\t" + "movq 8(%0, %3), %%mm1 \n\t" + "movq (%1, %3), %%mm4 \n\t" + "movq 8(%1, %3), %%mm5 \n\t" + "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] + "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] + "pxor %%mm2, %%mm2 \n\t" + "pxor %%mm3, %%mm3 \n\t" + "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 
-1 : 0 + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) + "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) + "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 + "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 + "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 + "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 + "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q + "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" // FIXME slow + "pcmpeqw (%0, %3), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %3), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 + "psraw $4, %%mm0 \n\t" + "psraw $4, %%mm1 \n\t" + "psubw %%mm7, %%mm0 \n\t" + "psubw %%mm7, %%mm1 \n\t" + "por %%mm7, %%mm0 \n\t" + "por %%mm7, %%mm1 \n\t" + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "pandn %%mm0, %%mm4 \n\t" + "pandn %%mm1, %%mm5 \n\t" + "movq %%mm4, (%0, %3) \n\t" + "movq %%mm5, 8(%0, %3) \n\t" + + "addl $16, %3 \n\t" + "cmpl $128, %3 \n\t" + "jb 1b \n\t" + ::"r" (block), "r"(quant_matrix), "g" (qscale), "r" (2*i) + : "memory" + ); + } +} + +void MPV_common_init_mmx(MpegEncContext *s) +{ + if (mm_flags & MM_MMX) { + if (s->out_format == FMT_H263) + s->dct_unquantize = dct_unquantize_h263_mmx; + else + s->dct_unquantize = dct_unquantize_mpeg1_mmx; + } +} diff --git a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c new file mode 100644 index 000000000..297f23724 --- /dev/null +++ b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c @@ -0,0 +1,1455 @@ +/* + Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include +#include "../dsputil.h" + +#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C4 16384 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + +#define ROW_SHIFT 11 +#define COL_SHIFT 20 // 6 + +static uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL; +static uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL; +static int16_t __attribute__((aligned(8))) temp[64]; +static int16_t __attribute__((aligned(8))) coeffs[]= { + 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, +// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0, +// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16), + 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, + // the 1 = ((1<<(COL_SHIFT-1))/C4)<> COL_SHIFT; + col[8*1] = (a1 + b1) >> COL_SHIFT; + col[8*2] = (a2 + b2) >> COL_SHIFT; + col[8*3] = (a3 + b3) >> COL_SHIFT; + col[8*4] = (a3 - b3) >> COL_SHIFT; + col[8*5] = (a2 - b2) >> COL_SHIFT; + col[8*6] = (a1 - b1) >> COL_SHIFT; + col[8*7] = (a0 - b0) >> COL_SHIFT; +} + +static void inline idctRow (int16_t * output, int16_t * input) +{ + int16_t row[8]; + + int a0, a1, a2, a3, b0, b1, b2, b3; + const int C0 = 23170; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C1 = 22725; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C2 = 21407; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C3 = 19266; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C4 = 16384; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C5 = 12873; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C6 = 8867; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + const int C7 = 4520; //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 + +row[0] = input[0]; +row[2] = input[1]; +row[4] = input[4]; +row[6] = input[5]; +row[1] = input[8]; +row[3] = input[9]; +row[5] = input[12]; +row[7] = input[13]; + + if( !(row[1] | row[2] |row[3] |row[4] |row[5] |row[6] | row[7]) ) { + row[0] = row[1] = row[2] = row[3] = row[4] = + row[5] = row[6] = row[7] = row[0]<<3; + output[0] = row[0]; + output[2] = row[1]; + output[4] = row[2]; + output[6] = row[3]; + output[8] = row[4]; + output[10] = row[5]; + output[12] = row[6]; + output[14] = row[7]; + return; + } + + a0 = C4*row[0] + C2*row[2] + C4*row[4] + C6*row[6] + (1<<(ROW_SHIFT-1)); + a1 = C4*row[0] + C6*row[2] - C4*row[4] - C2*row[6] + (1<<(ROW_SHIFT-1)); + a2 = C4*row[0] - C6*row[2] - C4*row[4] + C2*row[6] + (1<<(ROW_SHIFT-1)); + a3 = C4*row[0] - C2*row[2] + C4*row[4] - C6*row[6] + (1<<(ROW_SHIFT-1)); + + b0 = C1*row[1] + C3*row[3] + C5*row[5] + C7*row[7]; + b1 = C3*row[1] - C7*row[3] - C1*row[5] - C5*row[7]; + b2 = C5*row[1] - C1*row[3] + C7*row[5] + C3*row[7]; + b3 = C7*row[1] - C5*row[3] + C3*row[5] - C1*row[7]; + + row[0] = (a0 + b0) >> ROW_SHIFT; + row[1] = (a1 + b1) >> ROW_SHIFT; + row[2] = (a2 + b2) >> ROW_SHIFT; + row[3] = (a3 + b3) >> ROW_SHIFT; + row[4] = (a3 - b3) >> ROW_SHIFT; + row[5] = (a2 - b2) >> ROW_SHIFT; + row[6] = (a1 - b1) >> ROW_SHIFT; + row[7] = (a0 - b0) >> ROW_SHIFT; + + output[0] = row[0]; + output[2] = row[1]; + output[4] = row[2]; + output[6] = row[3]; + 
output[8] = row[4]; + output[10] = row[5]; + output[12] = row[6]; + output[14] = row[7]; +} +#endif + +static inline void idct(int16_t *block) +{ + int i; +//for(i=0; i<64; i++) temp[i]= block[ block_permute_op(i) ]; +//for(i=0; i<64; i++) temp[block_permute_op(i)]= block[ i ]; +//for(i=0; i<64; i++) block[i]= temp[i]; +//block_permute(block); +/* +idctRow(temp, block); +idctRow(temp+16, block+16); +idctRow(temp+1, block+2); +idctRow(temp+17, block+18); +idctRow(temp+32, block+32); +idctRow(temp+48, block+48); +idctRow(temp+33, block+34); +idctRow(temp+49, block+50); +*/ + + asm volatile( +// "lea 64(%0), %%eax \n\t" +//r0,r2,R0,R2 r4,r6,R4,R6 r1,r3,R1,R3 r5,r7,R5,R7 +//src0 src4 src1 src5 +//r0,R0,r7,R7 r1,R1,r6,R6 r2,R2,r5,R5 r3,R3,r4,R4 +//dst0 dst1 dst2 dst3 +#if 0 //Alternative, simpler variant +#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ + "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + #rounder ", %%mm4 \n\t"\ +\ + "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ + "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE0(%%mm6, %%mm4, dst) \ +\ + "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ + "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ + #rounder ", %%mm4 \n\t"\ +\ + "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE1(%%mm6, %%mm4, dst, %%mm7) \ +\ + "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ + "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ + #rounder ", %%mm4 \n\t"\ +\ + "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 
*/\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ +\ + "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ + "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ + "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ + "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst) + +#define DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq wm1010, %%mm4 \n\t"\ + "pand %%mm0, %%mm4 \n\t"\ + "por %%mm1, %%mm4 \n\t"\ + "por %%mm2, %%mm4 \n\t"\ + "por %%mm3, %%mm4 \n\t"\ + "packssdw %%mm4,%%mm4 \n\t"\ + "movd %%mm4, %%eax \n\t"\ + "orl %%eax, %%eax \n\t"\ + "jz 1f \n\t"\ + "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ + "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + #rounder ", %%mm4 \n\t"\ +\ + "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ + "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE0(%%mm6, %%mm4, dst) \ +\ + "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ + "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ + #rounder ", %%mm4 \n\t"\ +\ + "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE1(%%mm6, %%mm4, dst, %%mm7) \ +\ + "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ + "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ + #rounder ", %%mm4 \n\t"\ +\ + "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ +\ + "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ + #rounder 
", %%mm0 \n\t"\ + "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ + "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ + "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ + "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\ + "jmp 2f \n\t"\ + "1: \n\t"\ + WRITE3(%%mm0, dst)\ + "2: \n\t"\ + + +#define WRITE0(s0, s7, dst)\ + "movq " #s0 ", " #dst " \n\t" /* R0 r0 */\ + "movq " #s7 ", 24+" #dst " \n\t" /* R7 r7 */ + +#define WRITE1(s1, s6, dst, tmp)\ + "movq " #dst ", " #tmp " \n\t" /* R0 r0 */\ + "packssdw " #s1 ", " #tmp " \n\t" /* R1 r1 R0 r0*/\ + "movq " #tmp ", " #dst " \n\t"\ + "movq 24+" #dst ", " #tmp " \n\t" /* R7 r7 */\ + "packssdw " #tmp ", " #s6 " \n\t" /* R7 r7 R6 r6*/\ + "movq " #s6 ", 24+" #dst " \n\t" + +#define WRITE2(s2, s5, s3, s4, dst)\ + "packssdw " #s3 ", " #s2 " \n\t" /* R3 r3 R2 r2*/\ + "packssdw " #s5 ", " #s4 " \n\t" /* R5 r5 R4 r4*/\ + "movq " #s2 ", 8+" #dst " \n\t"\ + "movq " #s4 ", 16+" #dst " \n\t" + +#define WRITE3(a, dst)\ + "pslld $16, " #a " \n\t"\ + "psrad $13, " #a " \n\t"\ + "packssdw " #a ", " #a " \n\t"\ + "movq " #a ", " #dst " \n\t"\ + "movq " #a ", 8+" #dst " \n\t"\ + "movq " #a ", 16+" #dst " \n\t"\ + "movq " #a ", 24+" #dst " \n\t"\ + +//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) +IDCT_CORE( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) +/* +DC_COND_IDCT_CORE( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) +DC_COND_IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) +DC_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) +*/ +IDCT_CORE( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) +IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) +IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) + +#undef WRITE0 +#undef WRITE1 +#undef WRITE2 + +#define WRITE0(s0, s7, dst)\ + "packssdw " #s0 ", " #s0 " \n\t" /* C0, c0, C0, c0 */\ + "packssdw " #s7 ", " #s7 " \n\t" /* C7, c7, C7, c7 */\ + "movd " #s0 ", " #dst " \n\t" /* C0, c0 */\ + "movd " #s7 ", 112+" #dst " \n\t" /* C7, c7 */ + +#define WRITE1(s1, s6, dst, tmp)\ + "packssdw " #s1 ", " #s1 " \n\t" /* C1, c1, C1, c1 */\ + "packssdw " #s6 ", " #s6 " \n\t" /* C6, c6, C6, c6 */\ + "movd " #s1 ", 16+" #dst " \n\t" /* C1, c1 */\ + "movd " #s6 ", 96+" #dst " \n\t" /* C6, c6 */ + +#define WRITE2(s2, s5, s3, s4, dst)\ + "packssdw " #s2 ", " #s2 " \n\t" /* C2, c2, C2, c2 */\ + "packssdw " #s3 ", " #s3 " \n\t" /* C3, c3, C3, c3 */\ + "movd " #s2 ", 32+" #dst " \n\t" /* C2, c2 */\ + "movd " #s3 ", 48+" #dst " \n\t" /* C3, c3 */\ + "packssdw " #s4 ", " #s4 " \n\t" /* C4, c4, C4, c4 */\ + "packssdw " #s5 ", " #s5 " \n\t" /* C5, c5, C5, c5 */\ + "movd " #s4 ", 64+" #dst " \n\t" /* C4, c4 */\ + "movd " #s5 ", 80+" #dst " \n\t" /* C5, c5 */\ + +//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) +IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + +#else + +#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* C6 C4 
C6 C4 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ + "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + #rounder ", %%mm4 \n\t"\ +\ + "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ + "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE0(%%mm6, %%mm4, dst) \ +\ + "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ + "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ + #rounder ", %%mm4 \n\t"\ +\ + "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE1(%%mm6, %%mm4, dst, %%mm7) \ +\ + "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ + "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ + #rounder ", %%mm4 \n\t"\ +\ + "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ +\ + "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ + "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ + "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ + "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst) + +#define DC_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq wm1010, %%mm4 \n\t"\ + "pand %%mm0, %%mm4 \n\t"\ + "por %%mm1, %%mm4 \n\t"\ + "por %%mm2, %%mm4 \n\t"\ + "por %%mm3, %%mm4 \n\t"\ + "packssdw %%mm4,%%mm4 \n\t"\ + "movd %%mm4, %%eax \n\t"\ + "orl %%eax, %%eax \n\t"\ + "jz 1f \n\t"\ + "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ + "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd 
%%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + #rounder ", %%mm4 \n\t"\ +\ + "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ + "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE0(%%mm6, %%mm4, dst) \ +\ + "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ + "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ + #rounder ", %%mm4 \n\t"\ +\ + "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE1(%%mm6, %%mm4, dst, %%mm7) \ +\ + "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ + "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ + #rounder ", %%mm4 \n\t"\ +\ + "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ +\ + "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ + "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ + "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ + "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\ + "jmp 2f \n\t"\ + "#.balign 16 \n\t"\ + "1: \n\t"\ + WRITE3(%%mm0, dst)\ + "2: \n\t"\ + +#define Z_COND_IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift, bt) \ + "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq %%mm0, %%mm4 \n\t"\ + "por %%mm1, %%mm4 \n\t"\ + "por %%mm2, %%mm4 \n\t"\ + "por %%mm3, %%mm4 \n\t"\ + "packssdw %%mm4, %%mm4 \n\t"\ + "movd %%mm4, %%eax \n\t"\ + "orl %%eax, %%eax \n\t"\ + "jz " #bt " \n\t"\ + "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ + "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "movq 
40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + #rounder ", %%mm4 \n\t"\ +\ + "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ + "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE0(%%mm6, %%mm4, dst) \ +\ + "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ + "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ + #rounder ", %%mm4 \n\t"\ +\ + "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE1(%%mm6, %%mm4, dst, %%mm7) \ +\ + "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ + "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ + #rounder ", %%mm4 \n\t"\ +\ + "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ +\ + "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ + #rounder ", %%mm0 \n\t"\ + "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ + "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ + "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ + "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + WRITE2(%%mm6, %%mm4, %%mm2, %%mm0, dst)\ + + +#define WRITE0(s0, s7, dst)\ + "movq " #s0 ", " #dst " \n\t" /* R0 r0 */\ + "movq " #s7 ", 24+" #dst " \n\t" /* R7 r7 */ + +#define WRITE1(s1, s6, dst, tmp)\ + "movq " #dst ", " #tmp " \n\t" /* R0 r0 */\ + "packssdw " #s1 ", " #tmp " \n\t" /* R1 r1 R0 r0*/\ + "movq " #tmp ", " #dst " \n\t"\ + "movq 24+" #dst ", " #tmp " \n\t" /* R7 r7 */\ + "packssdw " #tmp ", " #s6 " \n\t" /* R7 r7 R6 r6*/\ + "movq " #s6 ", 24+" #dst " \n\t" + +#define WRITE2(s2, s5, s3, s4, dst)\ + "packssdw " #s3 ", " #s2 " \n\t" /* R3 r3 R2 r2*/\ + "packssdw " #s5 ", " #s4 " \n\t" /* R5 r5 R4 r4*/\ + "movq " #s2 ", 8+" #dst " \n\t"\ + "movq " #s4 ", 16+" #dst " \n\t" + +#define WRITE3(a, dst)\ + "pslld $16, " #a " \n\t"\ + "paddd d40000, " #a " \n\t"\ + "psrad $13, " #a " \n\t"\ + "packssdw " #a ", " #a " \n\t"\ + "movq " #a ", " #dst " \n\t"\ + "movq " #a ", 8+" #dst " \n\t"\ + "movq " #a ", 16+" #dst " \n\t"\ + "movq " #a ", 24+" #dst " \n\t"\ + +#define WRITE0b(s0, s7, 
dst)\ + "packssdw " #s0 ", " #s0 " \n\t" /* C0, c0, C0, c0 */\ + "packssdw " #s7 ", " #s7 " \n\t" /* C7, c7, C7, c7 */\ + "movd " #s0 ", " #dst " \n\t" /* C0, c0 */\ + "movd " #s7 ", 112+" #dst " \n\t" /* C7, c7 */ + +#define WRITE1b(s1, s6, dst, tmp)\ + "packssdw " #s1 ", " #s1 " \n\t" /* C1, c1, C1, c1 */\ + "packssdw " #s6 ", " #s6 " \n\t" /* C6, c6, C6, c6 */\ + "movd " #s1 ", 16+" #dst " \n\t" /* C1, c1 */\ + "movd " #s6 ", 96+" #dst " \n\t" /* C6, c6 */ + +#define WRITE2b(s2, s5, s3, s4, dst)\ + "packssdw " #s2 ", " #s2 " \n\t" /* C2, c2, C2, c2 */\ + "packssdw " #s3 ", " #s3 " \n\t" /* C3, c3, C3, c3 */\ + "movd " #s2 ", 32+" #dst " \n\t" /* C2, c2 */\ + "movd " #s3 ", 48+" #dst " \n\t" /* C3, c3 */\ + "packssdw " #s4 ", " #s4 " \n\t" /* C4, c4, C4, c4 */\ + "packssdw " #s5 ", " #s5 " \n\t" /* C5, c5, C5, c5 */\ + "movd " #s4 ", 64+" #dst " \n\t" /* C4, c4 */\ + "movd " #s5 ", 80+" #dst " \n\t" /* C5, c5 */\ + + +//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) +DC_COND_IDCT_CORE( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) +Z_COND_IDCT_CORE( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) +Z_COND_IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) +Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) + +#undef IDCT_CORE +#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ + "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ +\ + "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ + "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE0b(%%mm6, %%mm4, dst) \ +\ + "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ + "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ +\ + "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE1b(%%mm6, %%mm4, dst, %%mm7) \ +\ + "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ + "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A2 
a2 */\ +\ + "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ +\ + "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ + "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ + "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ + "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ + "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst) + +//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) +IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "jmp 9f \n\t" + + "#.balign 16 \n\t"\ + "4: \n\t" +Z_COND_IDCT_CORE( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) +Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) + +#undef IDCT_CORE +#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ + "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ +\ + "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE0b(%%mm7, %%mm4, dst) \ +\ + "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ + "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ +\ + "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ + "paddd %%mm4, %%mm7 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm7, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE1b(%%mm7, %%mm4, dst, %%mm6) \ +\ + "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ + "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ + "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ +\ + "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ + "paddd %%mm4, %%mm7 \n\t" /* A1+B1 a1+b1 */\ + "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm7, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ +\ + 
"paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ + "paddd %%mm0, %%mm3 \n\t" /* A3+B3 a3+b3 */\ + "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ + "psubd %%mm3, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm3 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst) + +//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) +IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "jmp 9f \n\t" + + "#.balign 16 \n\t"\ + "6: \n\t" +Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) + +#undef IDCT_CORE +#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ + "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ +\ + "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ + "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE0b(%%mm7, %%mm4, dst) \ +\ + "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ +\ + "movq 80(%2), %%mm4 \n\t" /* -C6 C4 -C6 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ + "paddd %%mm5, %%mm7 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm5, %%mm5 \n\t" /* 2A1 2a1 */\ + "psubd %%mm7, %%mm5 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + WRITE1b(%%mm7, %%mm5, dst, %%mm6) \ +\ + "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ +\ + "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ + "paddd %%mm4, %%mm7 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm7, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ +\ + "paddd %%mm0, %%mm3 \n\t" /* A3+B3 a3+b3 */\ + "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ + "psubd %%mm3, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm3 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + WRITE2b(%%mm7, %%mm4, %%mm3, %%mm0, dst) + +//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) +IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "jmp 9f \n\t" + + "#.balign 16 \n\t"\ + "2: \n\t" +Z_COND_IDCT_CORE( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) + +#undef IDCT_CORE +#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ + "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ + "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "movq 40(%2), %%mm7 \n\t" /* C7 C5 C7 C5 */\ + 
"pmaddwd %%mm3, %%mm7 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ +\ + "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B0 b0 */\ + "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE0b(%%mm6, %%mm4, dst) \ +\ + "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ +\ + "movq 80(%2), %%mm4 \n\t" /* -C6 C4 -C6 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm5, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm5, %%mm5 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm5 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + WRITE1b(%%mm6, %%mm5, dst, %%mm7) \ +\ + "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "movq 104(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ +\ + "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ + "paddd %%mm7, %%mm6 \n\t" /* B1 b1 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "pmaddwd 136(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ +\ + "paddd %%mm3, %%mm2 \n\t" /* B3 b3 */\ + "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ + "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ + "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst) + +//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) +IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "jmp 9f \n\t" + + "#.balign 16 \n\t"\ + "3: \n\t" +#undef IDCT_CORE +#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ + "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ +\ + "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ + "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE0b(%%mm6, %%mm4, dst) \ +\ + "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ +\ + "movq 80(%2), %%mm4 \n\t" /* -C6 C4 -C6 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ + "paddd %%mm5, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm5, %%mm5 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm5 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + WRITE1b(%%mm6, %%mm5, dst, 
%%mm7) \ +\ + "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ +\ + "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ +\ + "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ + "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ + "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst) + +//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) +IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "jmp 9f \n\t" + + "#.balign 16 \n\t"\ + "5: \n\t" +#undef IDCT_CORE +#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ + "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ + "movq %%mm4, %%mm6\n\t"\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ + "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "movq %%mm5, %%mm7\n\t"\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ + "movq 8+" #src0 ", %%mm2 \n\t" /*2R2 R0 r2 r0 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /*2C2R2+C4R0 C2r2+C4r0 */\ + "movq 8+" #src4 ", %%mm3 \n\t" /*2R6 R4 r6 r4 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /*2C6R6+C4R4 C6r6+C4r4 */\ +\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ + "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "psrad $" #shift ", %%mm4 \n\t"\ + "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ +\ + "paddd %%mm7, %%mm6 \n\t" /*2A0 a0 */\ + "movq 56(%2), %%mm7 \n\t" /* -C2 -C4 -C2 -C4 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "pmaddwd %%mm1, %%mm7 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ +\ + "packssdw %%mm6, %%mm4 \n\t" /* C0, c0, C0, c0 */\ + "movq 48(%2), %%mm6 \n\t" /* C6 C4 C6 C4 */\ + "movq %%mm4, " #dst " \n\t" /* C0, c0 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /*2C6R2+C4R0 C6r2+C4r0 */\ +\ + "movq %%mm4, 112+" #dst " \n\t" /* C0, c0 */\ + "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ + "pmaddwd %%mm3, %%mm4 \n\t" /*2-C2R6-C4R4 -C2r6-C4r4 */\ +\ + "paddd %%mm5, %%mm7 \n\t" /* A1 a1 */\ + "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ +\ + "paddd %%mm4, %%mm6 \n\t" /*2A1 a1 */\ + "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ +\ + "psrad $" #shift ", %%mm6 \n\t"\ + "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ +\ + "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ + "packssdw %%mm6, %%mm7 \n\t" /* C1, c1, C1, c1 */\ +\ + "movq 80(%2), %%mm6 \n\t" /* -C6 C4 -C6 C4 */\ + "movq %%mm7, 16+" #dst " \n\t" /* C1, c1 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /*2-C6R2+C4R0 -C6r2+C4r0 */\ +\ + "movq %%mm7, 96+" #dst " \n\t" /* C1, c1 */\ + "movq 88(%2), %%mm7 \n\t" /* C2 -C4 C2 -C4 */\ + "pmaddwd %%mm3, %%mm7 \n\t" /*2C2R6-C4R4 C2r6-C4r4 */\ +\ + "pmaddwd 112(%2), %%mm2 \n\t" /*2-C2R2+C4R0 -C2r2+C4r0 */\ + "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ +\ + "pmaddwd 120(%2), %%mm3 \n\t" /*2-C6R6+C4R4 -C6r6+C4r4 */\ + "psrad $" #shift ", %%mm4 \n\t"\ +\ + "paddd %%mm7, 
%%mm6 \n\t" /*2A2 a2 */\ + "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ +\ + "psrad $" #shift ", %%mm6 \n\t"\ +\ + "packssdw %%mm6, %%mm4 \n\t" /* C2, c2, C2, c2 */\ + "movq %%mm4, 32+" #dst " \n\t" /* C2, c2 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "paddd %%mm3, %%mm2 \n\t" /*2A3 a3 */\ +\ + "movq %%mm4, 80+" #dst " \n\t" /* C2, c2 */\ + "psrad $" #shift ", %%mm2 \n\t"\ +\ + "packssdw %%mm2, %%mm0 \n\t" /* C3, c3, C3, c3 */\ + "movq %%mm0, 48+" #dst " \n\t" /* C3, c3 */\ + "movq %%mm0, 64+" #dst " \n\t" /* C3, c3 */\ + +//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) +IDCT_CORE( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +//IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +//IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "jmp 9f \n\t" + + + "#.balign 16 \n\t"\ + "1: \n\t" +#undef IDCT_CORE +#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ + "movq " #src4 ", %%mm1 \n\t" /* R6 R4 r6 r4 */\ + "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ + "movq 16(%2), %%mm4 \n\t" /* C2 C4 C2 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ + "movq 24(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C4R4 C6r6+C4r4 */\ + "movq 32(%2), %%mm6 \n\t" /* C3 C1 C3 C1 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ + "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ +\ + "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ + "paddd %%mm4, %%mm6 \n\t" /* A0+B0 a0+b0 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ + "psubd %%mm6, %%mm4 \n\t" /* A0-B0 a0-b0 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE0b(%%mm6, %%mm4, dst) \ +\ + "movq 56(%2), %%mm4 \n\t" /* -C2 -C4 -C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* -C2R6-C4R4 -C2r6-C4r4 */\ + "movq 64(%2), %%mm6 \n\t" /* -C7 C3 -C7 C3 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ + "paddd %%mm5, %%mm4 \n\t" /* A1 a1 */\ +\ + "movq 80(%2), %%mm5 \n\t" /* -C6 C4 -C6 C4 */\ + "pmaddwd %%mm0, %%mm5 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ + WRITE1b(%%mm6, %%mm4, dst, %%mm7) \ +\ + "movq 88(%2), %%mm4 \n\t" /* C2 -C4 C2 -C4 */\ + "pmaddwd %%mm1, %%mm4 \n\t" /* C2R6-C4R4 C2r6-C4r4 */\ + "movq 96(%2), %%mm6 \n\t" /* -C1 C5 -C1 C5 */\ + "pmaddwd %%mm2, %%mm6 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ + "paddd %%mm5, %%mm4 \n\t" /* A2 a2 */\ +\ + "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ + "paddd %%mm4, %%mm6 \n\t" /* A1+B1 a1+b1 */\ + "pmaddwd 120(%2), %%mm1 \n\t" /* -C6R6+C4R4 -C6r6+C4r4 */\ + "paddd %%mm4, %%mm4 \n\t" /* 2A1 2a1 */\ + "psubd %%mm6, %%mm4 \n\t" /* A1-B1 a1-b1 */\ + "pmaddwd 128(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ + "psrad $" #shift ", %%mm6 \n\t"\ + "psrad $" #shift ", %%mm4 \n\t"\ +\ + "paddd %%mm1, %%mm0 \n\t" /* A3 a3 */\ + "paddd %%mm0, %%mm2 \n\t" /* A3+B3 a3+b3 */\ + "paddd %%mm0, %%mm0 \n\t" /* 2A3 2a3 */\ + "psubd %%mm2, %%mm0 \n\t" /* A3-B3 a3-b3 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm0 \n\t"\ + WRITE2b(%%mm6, %%mm4, %%mm2, %%mm0, dst) + +//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) +IDCT_CORE( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) 
+IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + "jmp 9f \n\t" + + + "#.balign 16 \n\t" + "7: \n\t" +#undef IDCT_CORE +#define IDCT_CORE(src0, src4, src1, src5, dst, rounder, shift) \ + "movq " #src0 ", %%mm0 \n\t" /* R2 R0 r2 r0 */\ + "movq 16(%2), %%mm2 \n\t" /* C2 C4 C2 C4 */\ + "movq 8+" #src0 ", %%mm1 \n\t" /* R2 R0 r2 r0 */\ + "pmaddwd %%mm0, %%mm2 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ + "movq 16(%2), %%mm3 \n\t" /* C2 C4 C2 C4 */\ + "pmaddwd %%mm1, %%mm3 \n\t" /* C2R2+C4R0 C2r2+C4r0 */\ +\ + "movq 48(%2), %%mm4 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm0, %%mm4 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ + "movq 48(%2), %%mm5 \n\t" /* C6 C4 C6 C4 */\ + "pmaddwd %%mm1, %%mm5 \n\t" /* C6R2+C4R0 C6r2+C4r0 */\ + "movq 80(%2), %%mm6 \n\t" /* -C6 C4 -C6 C4 */\ + "pmaddwd %%mm0, %%mm6 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ + "movq 80(%2), %%mm7 \n\t" /* -C6 C4 -C6 C4 */\ + "pmaddwd %%mm1, %%mm7 \n\t" /* -C6R2+C4R0 -C6r2+C4r0 */\ + "pmaddwd 112(%2), %%mm0 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ + "psrad $" #shift ", %%mm2 \n\t"\ + "psrad $" #shift ", %%mm3 \n\t"\ + "pmaddwd 112(%2), %%mm1 \n\t" /* -C2R2+C4R0 -C2r2+C4r0 */\ + "packssdw %%mm3, %%mm2 \n\t" /* C0, c0, C0, c0 */\ + "movq %%mm2, " #dst " \n\t" /* C0, c0 */\ + "psrad $" #shift ", %%mm4 \n\t"\ + "psrad $" #shift ", %%mm5 \n\t"\ + "movq %%mm2, 112+" #dst " \n\t" /* C0, c0 */\ + "packssdw %%mm5, %%mm4 \n\t" /* C1, c1, C1, c1 */\ + "movq %%mm4, 16+" #dst " \n\t" /* C0, c0 */\ + "psrad $" #shift ", %%mm7 \n\t"\ + "psrad $" #shift ", %%mm6 \n\t"\ + "movq %%mm4, 96+" #dst " \n\t" /* C0, c0 */\ + "packssdw %%mm7, %%mm6 \n\t" /* C2, c2, C2, c2 */\ + "movq %%mm6, 32+" #dst " \n\t" /* C0, c0 */\ + "psrad $" #shift ", %%mm0 \n\t"\ + "movq %%mm6, 80+" #dst " \n\t" /* C0, c0 */\ + "psrad $" #shift ", %%mm1 \n\t"\ + "packssdw %%mm1, %%mm0 \n\t" /* C3, c3, C3, c3 */\ + "movq %%mm0, 48+" #dst " \n\t" /* C0, c0 */\ + "movq %%mm0, 64+" #dst " \n\t" /* C0, c0 */\ + +//IDCT_CORE( src0, src4, src1, src5, dst, rounder, shift) +IDCT_CORE( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) +//IDCT_CORE( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) +IDCT_CORE( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) +//IDCT_CORE( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) + + +#endif + +/* +Input + 00 20 02 22 40 60 42 62 + 10 30 12 32 50 70 52 72 + 01 21 03 23 41 61 43 63 + 11 31 13 33 51 71 53 73 + 04 24 06 26 44 64 46 66 + 14 34 16 36 54 74 56 76 +... +*/ +/* +Temp + 00 02 10 12 20 22 30 32 + 40 42 50 52 60 62 70 72 + 01 03 11 13 21 23 31 33 + 41 43 51 53 61 63 71 73 + 04 06 14 16 24 26 34 36 + 44 46 54 56 64 66 74 76 + 05 07 15 17 25 27 35 37 + 45 47 55 57 65 67 75 77 +*/ + +/* +Output + 00 10 20 30 40 50 60 70 + 01 11 21 31 41 51 61 71 +... 
+*/
+
+"9: \n\t"
+ :: "r" (block), "r" (temp), "r" (coeffs)
+ : "%eax"
+ );
+/*
+idctCol(block, temp);
+idctCol(block+1, temp+2);
+idctCol(block+2, temp+4);
+idctCol(block+3, temp+6);
+idctCol(block+4, temp+8);
+idctCol(block+5, temp+10);
+idctCol(block+6, temp+12);
+idctCol(block+7, temp+14);
+*/
+}
+
+void simple_idct_mmx(int16_t *block)
+{
+    static int imax=0, imin=0;
+    static int omax=0, omin=0;
+    int i, j;
+/*
+    for(i=0; i<64; i++)
+    {
+        if(block[i] > imax)
+        {
+            imax= block[i];
+            printf("Input-Max: %d\n", imax);
+            printf("Input-Min: %d\n", imin);
+            printf("Output-Max: %d\n", omax);
+            printf("Output-Min: %d\n", omin);
+        }
+        if(block[i] < imin)
+        {
+            imin= block[i];
+            printf("Input-Max: %d\n", imax);
+            printf("Input-Min: %d\n", imin);
+            printf("Output-Max: %d\n", omax);
+            printf("Output-Min: %d\n", omin);
+        }
+    }*/
+/*  static int stat[64];
+    for(j=0; j<4; j++)
+    {
+        static int line[8]={0,2,1,3,4,6,5,7};
+        for(i=0; i<16; i++)
+        {
+            if(block[j*16+i])
+            {
+                stat[j*16+1]++;
+                break;
+            }
+        }
+        for(i=0; i<16; i++)
+        {
+            if(block[j*16+i] && i!=0 && i!=2)
+            {
+                stat[j*16+2]++;
+                break;
+            }
+        }
+    }
+    stat[0]++;*/
+/*  for(i=1; i<8; i++)
+    {
+        if(block[i] != 0)
+        {
+            stat[1]++;
+            break;
+        }
+    }
+    for(i=32; i<64; i++)
+    {
+        if(block[i] != 0)
+        {
+            stat[2]++;
+            break;
+        }
+    }
+    stat[0]++;
+*/
+// return;
+    idct(block);
+// memset(block, 0, 128);
+/*
+    if(stat[0] > 100000)
+        for(i=0; i<64; i++)
+        {
+            if((i&7) == 0) printf("\n");
+            printf("%06d ", stat[i]);
+        }
+*/
+/*
+    for(i=0; i<4; i++) printf("%d", stat[1+i*16]);
+    printf(" ");
+    for(i=0; i<4; i++) printf("%d", stat[2+i*16]);
+    printf("\n");
+*/
+// printf("%d", stat[2]);
+
+// memset(stat, 0, 256);
+
+/*
+    for(i=0; i<64; i++)
+    {
+        if(block[i] > omax)
+        {
+            omax= block[i];
+            printf("Input-Max: %d\n", imax);
+            printf("Input-Min: %d\n", imin);
+            printf("Output-Max: %d\n", omax);
+            printf("Output-Min: %d\n", omin);
+        }
+        if(block[i] < omin)
+        {
+            omin= block[i];
+            printf("Input-Max: %d\n", imax);
+            printf("Input-Min: %d\n", imin);
+            printf("Output-Max: %d\n", omax);
+            printf("Output-Min: %d\n", omin);
+        }
+    }*/
+}
diff --git a/src/libffmpeg/libavcodec/mlib/Makefile.am b/src/libffmpeg/libavcodec/mlib/Makefile.am
new file mode 100644
index 000000000..92dc8cc30
--- /dev/null
+++ b/src/libffmpeg/libavcodec/mlib/Makefile.am
@@ -0,0 +1,39 @@
+##
+## Process this file with automake to produce Makefile.in
+##
+
+#CFLAGS = -D_FILE_OFFSET_BITS=64 @GLOBAL_CFLAGS@ -DCONFIG_DECODERS -DHAVE_AV_CONFIG_H
+
+CFLAGS = @GLOBAL_CFLAGS@ @LIBFFMPEG_CFLAGS@ -DCONFIG_DECODERS -DHAVE_AV_CONFIG_H
+ASFLAGS =
+
+LIBTOOL = $(SHELL) $(top_builddir)/libtool-nofpic
+
+noinst_LTLIBRARIES = libavcodec_mlib.la
+
+EXTRA_DIST = dsputil_mlib.c
+
+if HAVE_MLIB
+mlib_modules = $(EXTRA_DIST)
+endif
+
+libavcodec_mlib_la_SOURCES = $(mlib_modules)
+
+noinst_HEADERS =
+
+.s.lo:
+	$(ASCOMPILE) -o $@ `test -f $< || echo '$(srcdir)/'`$<
+
+debug:
+	@$(MAKE) CFLAGS="@DEBUG_CFLAGS@ @LIBFFMPEG_CFLAGS@ -DCONFIG_DECODERS -DHAVE_AV_CONFIG_H"
+
+install-debug: debug
+	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
+
+mostlyclean-generic:
+	-rm -f *~ \#* .*~ .\#*
+
+maintainer-clean-generic:
+	-@echo "This command is intended for maintainers to use;"
+	-@echo "it deletes files that may require special tools to rebuild."
+	-rm -f Makefile.in
diff --git a/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c b/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c
new file mode 100644
index 000000000..236c9206a
--- /dev/null
+++ b/src/libffmpeg/libavcodec/mlib/dsputil_mlib.c
@@ -0,0 +1,144 @@
+/*
+ * Sun mediaLib optimized DSP utils
+ * Copyright (c) 2001 Gerard Lantau.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "../dsputil.h"
+
+#include <mlib_types.h>
+#include <mlib_status.h>
+#include <mlib_sys.h>
+#include <mlib_video.h>
+
+
+static void put_pixels_mlib (uint8_t * dest, const uint8_t * ref,
+                             int stride, int height)
+{
+    assert(height == 16 || height == 8);
+    if (height == 16)
+        mlib_VideoCopyRef_U8_U8_8x16(dest, (uint8_t *)ref, stride);
+    else
+        mlib_VideoCopyRef_U8_U8_8x8 (dest, (uint8_t *)ref, stride);
+}
+
+static void put_pixels_x2_mlib (uint8_t * dest, const uint8_t * ref,
+                                int stride, int height)
+{
+    assert(height == 16 || height == 8);
+    if (height == 16)
+        mlib_VideoInterpX_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
+    else
+        mlib_VideoInterpX_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
+}
+
+static void put_pixels_y2_mlib (uint8_t * dest, const uint8_t * ref,
+                                int stride, int height)
+{
+    assert(height == 16 || height == 8);
+    if (height == 16)
+        mlib_VideoInterpY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
+    else
+        mlib_VideoInterpY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
+}
+
+static void put_pixels_xy2_mlib(uint8_t * dest, const uint8_t * ref,
+                                int stride, int height)
+{
+    assert(height == 16 || height == 8);
+    if (height == 16)
+        mlib_VideoInterpXY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
+    else
+        mlib_VideoInterpXY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
+}
+
+static void avg_pixels_mlib (uint8_t * dest, const uint8_t * ref,
+                             int stride, int height)
+{
+    assert(height == 16 || height == 8);
+    if (height == 16)
+        mlib_VideoCopyRefAve_U8_U8_8x16(dest, (uint8_t *)ref, stride);
+    else
+        mlib_VideoCopyRefAve_U8_U8_8x8 (dest, (uint8_t *)ref, stride);
+}
+
+static void avg_pixels_x2_mlib (uint8_t * dest, const uint8_t * ref,
+                                int stride, int height)
+{
+    assert(height == 16 || height == 8);
+    if (height == 16)
+        mlib_VideoInterpAveX_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
+    else
+        mlib_VideoInterpAveX_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
+}
+
+static void avg_pixels_y2_mlib (uint8_t * dest, const uint8_t * ref,
+                                int stride, int height)
+{
+    assert(height == 16 || height == 8);
+    if (height == 16)
+        mlib_VideoInterpAveY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
+    else
+        mlib_VideoInterpAveY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
+}
+
+static void avg_pixels_xy2_mlib (uint8_t * dest, const uint8_t * ref,
+                                 int stride, int height)
+{
+    assert(height == 16 || height == 8);
+    if (height == 16)
+        mlib_VideoInterpAveXY_U8_U8_8x16(dest, (uint8_t *)ref, stride, stride);
+    else
+        mlib_VideoInterpAveXY_U8_U8_8x8 (dest, (uint8_t *)ref, stride, stride);
+}
+
+
+static void add_pixels_clamped_mlib(const DCTELEM *block, UINT8 *pixels, int line_size)
+{
+    mlib_VideoAddBlock_U8_S16(pixels, (mlib_s16 *)block, line_size);
+}
+
+
+void ff_idct_mlib(DCTELEM *data)
+{
+    mlib_VideoIDCT8x8_S16_S16 (data, data);
+}
+
+
+void ff_fdct_mlib(DCTELEM *data)
+{
+    mlib_VideoDCT8x8_S16_S16 (data, data);
+}
+
+void dsputil_init_mlib(void)
+{
+    av_fdct = ff_fdct_mlib;
+    ff_idct = ff_idct_mlib;
+
+    put_pixels_tab[0] = put_pixels_mlib;
+    put_pixels_tab[1] = put_pixels_x2_mlib;
+    put_pixels_tab[2] = put_pixels_y2_mlib;
+    put_pixels_tab[3] = put_pixels_xy2_mlib;
+
+    avg_pixels_tab[0] = avg_pixels_mlib;
+    avg_pixels_tab[1] = avg_pixels_x2_mlib;
+    avg_pixels_tab[2] = avg_pixels_y2_mlib;
+    avg_pixels_tab[3] = avg_pixels_xy2_mlib;
+
+    put_no_rnd_pixels_tab[0] = put_pixels_mlib;
+
+    add_pixels_clamped = add_pixels_clamped_mlib;
+}
-- 
cgit v1.2.3