Diffstat (limited to 'src/libffmpeg/libavcodec/i386')
20 files changed, 1412 insertions, 452 deletions
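A note on the alignment changes: throughout the diff below, hard-coded ".balign 8" / ".balign 16" directives inside inline asm are replaced by ASMALIGN(3) / ASMALIGN(4). The macro exists because ".align N" means N bytes on some assemblers but 2^N bytes on others, and some assemblers (notably older Apple as) do not accept ".balign" at all. A minimal sketch of such a macro follows; the real definition is emitted by configure into the build configuration, and the ASMALIGN_POT guard name is used here only for illustration.

/* Hedged sketch -- the actual macro comes from the build system. */
#ifdef ASMALIGN_POT                          /* .align takes a power of two */
#   define ASMALIGN(ZEROBITS) ".align "     #ZEROBITS "\n\t"
#else                                        /* .align takes a byte count   */
#   define ASMALIGN(ZEROBITS) ".align 1<<"  #ZEROBITS "\n\t"
#endif

With either definition, ASMALIGN(3) aligns the following loop label to 8 bytes and ASMALIGN(4) to 16 bytes, matching the old .balign values.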
diff --git a/src/libffmpeg/libavcodec/i386/Makefile.am b/src/libffmpeg/libavcodec/i386/Makefile.am index 0d649ae24..3eae6fe0f 100644 --- a/src/libffmpeg/libavcodec/i386/Makefile.am +++ b/src/libffmpeg/libavcodec/i386/Makefile.am @@ -22,6 +22,8 @@ libavcodec_mmx_src = \ cputest.c \ dsputil_mmx.c \ fdct_mmx.c \ + fft_3dn.c \ + fft_3dn2.c \ fft_sse.c \ idct_mmx.c \ idct_mmx_xvid.c \ diff --git a/src/libffmpeg/libavcodec/i386/cputest.c b/src/libffmpeg/libavcodec/i386/cputest.c index a66bdbe98..262786b71 100644 --- a/src/libffmpeg/libavcodec/i386/cputest.c +++ b/src/libffmpeg/libavcodec/i386/cputest.c @@ -1,9 +1,30 @@ -/* Cpu detection code, extracted from mmx.h ((c)1997-99 by H. Dietz - and R. Fisher). Converted to C and improved by Fabrice Bellard */ +/* + * CPU detection code, extracted from mmx.h + * (c)1997-99 by H. Dietz and R. Fisher + * Converted to C and improved by Fabrice Bellard. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ #include <stdlib.h> #include "../dsputil.h" +#undef printf + #ifdef ARCH_X86_64 # define REG_b "rbx" # define REG_S "rsi" diff --git a/src/libffmpeg/libavcodec/i386/dsputil_h264_template_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_h264_template_mmx.c index b49c880a7..e09a1007e 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_h264_template_mmx.c +++ b/src/libffmpeg/libavcodec/i386/dsputil_h264_template_mmx.c @@ -2,18 +2,20 @@ * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>, * Loren Merritt * - * This library is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * version 2.1 of the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c index a2cbab8ce..673e749c4 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c @@ -3,18 +3,20 @@ * Copyright (c) 2000, 2001 Fabrice Bellard. 
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * - * This library is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * version 2.1 of the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * MMX optimization by Nick Kurshev <nickols_k@mail.ru> @@ -29,7 +31,6 @@ //#undef NDEBUG //#include <assert.h> -extern const uint8_t ff_h263_loop_filter_strength[32]; extern void ff_idct_xvid_mmx(short *block); extern void ff_idct_xvid_mmx2(short *block); @@ -40,6 +41,9 @@ static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x01 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL; static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL; +static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) = +{0x8000000080000000ULL, 0x8000000080000000ULL}; + static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL; static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL; static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL; @@ -50,10 +54,15 @@ static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0 static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL; static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; +static const uint64_t ff_pb_1 attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL; +static const uint64_t ff_pb_3 attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL; +static const uint64_t ff_pb_7 attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL; static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL; +static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL; +static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL; static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL; -#define JUMPALIGN() __asm __volatile (".balign 8"::) +#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::) #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::) #define MOVQ_WONE(regd) \ @@ -201,7 +210,7 @@ static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) asm volatile( "mov $-128, %%"REG_a" \n\t" "pxor %%mm7, %%mm7 \n\t" - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "movq (%0), %%mm0 \n\t" "movq (%0, %2), 
%%mm2 \n\t" @@ -229,7 +238,7 @@ static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint asm volatile( "pxor %%mm7, %%mm7 \n\t" "mov $-128, %%"REG_a" \n\t" - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "movq (%0), %%mm0 \n\t" "movq (%1), %%mm2 \n\t" @@ -372,7 +381,7 @@ static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size { __asm __volatile( "lea (%3, %3), %%"REG_a" \n\t" - ".balign 8 \n\t" + ASMALIGN(3) "1: \n\t" "movd (%1), %%mm0 \n\t" "movd (%1, %3), %%mm1 \n\t" @@ -398,7 +407,7 @@ static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size { __asm __volatile( "lea (%3, %3), %%"REG_a" \n\t" - ".balign 8 \n\t" + ASMALIGN(3) "1: \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm1 \n\t" @@ -424,7 +433,7 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_siz { __asm __volatile( "lea (%3, %3), %%"REG_a" \n\t" - ".balign 8 \n\t" + ASMALIGN(3) "1: \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm4 \n\t" @@ -625,22 +634,10 @@ static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ asm volatile( //FIXME could save 1 instruction if done as 8x4 ... - "movd %0, %%mm0 \n\t" - "movd %1, %%mm1 \n\t" - "movd %2, %%mm2 \n\t" - - : - : "m" (*(uint32_t*)(src + 0*src_stride)), - "m" (*(uint32_t*)(src + 1*src_stride)), - "m" (*(uint32_t*)(src + 2*src_stride)) - ); - asm volatile( //FIXME could save 1 instruction if done as 8x4 ... - "movd %0, %%mm3 \n\t" - - : - : "m" (*(uint32_t*)(src + 3*src_stride)) - ); - asm volatile( //FIXME could save 1 instruction if done as 8x4 ... + "movd %4, %%mm0 \n\t" + "movd %5, %%mm1 \n\t" + "movd %6, %%mm2 \n\t" + "movd %7, %%mm3 \n\t" "punpcklbw %%mm1, %%mm0 \n\t" "punpcklbw %%mm3, %%mm2 \n\t" "movq %%mm0, %%mm1 \n\t" @@ -657,6 +654,10 @@ static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int "=m" (*(uint32_t*)(dst + 1*dst_stride)), "=m" (*(uint32_t*)(dst + 2*dst_stride)), "=m" (*(uint32_t*)(dst + 3*dst_stride)) + : "m" (*(uint32_t*)(src + 0*src_stride)), + "m" (*(uint32_t*)(src + 1*src_stride)), + "m" (*(uint32_t*)(src + 2*src_stride)), + "m" (*(uint32_t*)(src + 3*src_stride)) ); } @@ -1185,8 +1186,8 @@ static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, in else score1 = sse16_mmx(c, pix1, pix2, line_size, h); score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); - if(c) return score1 + ABS(score2)*c->avctx->nsse_weight; - else return score1 + ABS(score2)*8; + if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; + else return score1 + FFABS(score2)*8; } static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { @@ -1194,8 +1195,8 @@ static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int int score1= sse8_mmx(c, pix1, pix2, line_size, h); int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); - if(c) return score1 + ABS(score2)*c->avctx->nsse_weight; - else return score1 + ABS(score2)*8; + if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; + else return score1 + FFABS(score2)*8; } static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { @@ -2403,6 +2404,53 @@ QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2) QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2) QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2) +/***********************************/ +/* bilinear qpel: not 
compliant to any spec, only for -lavdopts fast */ + +#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\ +static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\ +} +#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\ +static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\ +} + +#define QPEL_2TAP(OPNAME, SIZE, MMX)\ +QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\ +QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\ +QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\ +static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\ + OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\ +static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\ + OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\ +static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\ + OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\ +static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\ +}\ +static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ + OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\ +}\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\ +QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\ + +QPEL_2TAP(put_, 16, mmx2) +QPEL_2TAP(avg_, 16, mmx2) +QPEL_2TAP(put_, 8, mmx2) +QPEL_2TAP(avg_, 8, mmx2) +QPEL_2TAP(put_, 16, 3dnow) +QPEL_2TAP(avg_, 16, 3dnow) +QPEL_2TAP(put_, 8, 3dnow) +QPEL_2TAP(avg_, 8, 3dnow) + + #if 0 static void just_return() { return; } #endif @@ -2523,9 +2571,10 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o "m"(src[stride]), "m"(src[stride+1]), "m"(*r4), "m"(shift2) ); - + asm volatile( "movd %%mm0, %0 \n\t" + : "=m"(dst[x+y*stride]) : ); @@ -2535,10 +2584,11 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o } } +#ifdef CONFIG_ENCODERS static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){ long i=0; - assert(ABS(scale) < 256); + assert(FFABS(scale) < 256); scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; asm volatile( @@ -2586,7 +2636,7 @@ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[6 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){ long i=0; - if(ABS(scale) < 256){ + if(FFABS(scale) < 256){ scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT; asm volatile( "pcmpeqw %%mm6, %%mm6 \n\t" // -1w @@ -2620,9 +2670,10 @@ static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){ } } } +#endif /* CONFIG_ENCODERS */ #define PREFETCH(name, op) \ -void name(void *mem, int stride, int h){\ +static void name(void *mem, int stride, int h){\ const uint8_t *p= mem;\ do{\ asm volatile(#op" %0" :: "m"(*p));\ @@ -2661,6 +2712,7 @@ void ff_vp3_dsp_init_mmx(void); /* XXX: those functions should be suppressed ASAP when all IDCTs are converted */ +#ifdef CONFIG_GPL static 
void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block) { ff_mmx_idct (block); @@ -2681,6 +2733,7 @@ static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *bloc ff_mmxext_idct (block); add_pixels_clamped_mmx(block, dest, line_size); } +#endif static void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block) { ff_vp3_idct_sse2(block); @@ -2701,7 +2754,6 @@ static void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) ff_vp3_idct_mmx(block); add_pixels_clamped_mmx(block, dest, line_size); } -#ifdef CONFIG_GPL static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block) { ff_idct_xvid_mmx (block); @@ -2722,7 +2774,274 @@ static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block) ff_idct_xvid_mmx2 (block); add_pixels_clamped_mmx(block, dest, line_size); } -#endif + +static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize) +{ + int i; + asm volatile("pxor %%mm7, %%mm7":); + for(i=0; i<blocksize; i+=2) { + asm volatile( + "movq %0, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0 + "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0 + "pslld $31, %%mm2 \n\t" // keep only the sign bit + "pxor %%mm2, %%mm1 \n\t" + "movq %%mm3, %%mm4 \n\t" + "pand %%mm1, %%mm3 \n\t" + "pandn %%mm1, %%mm4 \n\t" + "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) + "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) + "movq %%mm3, %1 \n\t" + "movq %%mm0, %0 \n\t" + :"+m"(mag[i]), "+m"(ang[i]) + ::"memory" + ); + } + asm volatile("femms"); +} +static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) +{ + int i; + + asm volatile( + "movaps %0, %%xmm5 \n\t" + ::"m"(ff_pdw_80000000[0]) + ); + for(i=0; i<blocksize; i+=4) { + asm volatile( + "movaps %0, %%xmm0 \n\t" + "movaps %1, %%xmm1 \n\t" + "xorps %%xmm2, %%xmm2 \n\t" + "xorps %%xmm3, %%xmm3 \n\t" + "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0 + "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0 + "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit + "xorps %%xmm2, %%xmm1 \n\t" + "movaps %%xmm3, %%xmm4 \n\t" + "andps %%xmm1, %%xmm3 \n\t" + "andnps %%xmm1, %%xmm4 \n\t" + "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m))) + "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m))) + "movaps %%xmm3, %1 \n\t" + "movaps %%xmm0, %0 \n\t" + :"+m"(mag[i]), "+m"(ang[i]) + ::"memory" + ); + } +} + +static void vector_fmul_3dnow(float *dst, const float *src, int len){ + long i = (len-4)*4; + asm volatile( + "1: \n\t" + "movq (%1,%0), %%mm0 \n\t" + "movq 8(%1,%0), %%mm1 \n\t" + "pfmul (%2,%0), %%mm0 \n\t" + "pfmul 8(%2,%0), %%mm1 \n\t" + "movq %%mm0, (%1,%0) \n\t" + "movq %%mm1, 8(%1,%0) \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + "femms \n\t" + :"+r"(i) + :"r"(dst), "r"(src) + :"memory" + ); +} +static void vector_fmul_sse(float *dst, const float *src, int len){ + long i = (len-8)*4; + asm volatile( + "1: \n\t" + "movaps (%1,%0), %%xmm0 \n\t" + "movaps 16(%1,%0), %%xmm1 \n\t" + "mulps (%2,%0), %%xmm0 \n\t" + "mulps 16(%2,%0), %%xmm1 \n\t" + "movaps %%xmm0, (%1,%0) \n\t" + "movaps %%xmm1, 16(%1,%0) \n\t" + "sub $32, %0 \n\t" + "jge 1b \n\t" + :"+r"(i) + :"r"(dst), "r"(src) + :"memory" + ); +} + +static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){ + long i = len*4-16; + asm volatile( + "1: \n\t" + "pswapd 8(%1), %%mm0 \n\t" + "pswapd (%1), %%mm1 \n\t" + "pfmul (%3,%0), %%mm0 \n\t" + 
"pfmul 8(%3,%0), %%mm1 \n\t" + "movq %%mm0, (%2,%0) \n\t" + "movq %%mm1, 8(%2,%0) \n\t" + "add $16, %1 \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + :"+r"(i), "+r"(src1) + :"r"(dst), "r"(src0) + ); + asm volatile("femms"); +} +static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){ + long i = len*4-32; + asm volatile( + "1: \n\t" + "movaps 16(%1), %%xmm0 \n\t" + "movaps (%1), %%xmm1 \n\t" + "shufps $0x1b, %%xmm0, %%xmm0 \n\t" + "shufps $0x1b, %%xmm1, %%xmm1 \n\t" + "mulps (%3,%0), %%xmm0 \n\t" + "mulps 16(%3,%0), %%xmm1 \n\t" + "movaps %%xmm0, (%2,%0) \n\t" + "movaps %%xmm1, 16(%2,%0) \n\t" + "add $32, %1 \n\t" + "sub $32, %0 \n\t" + "jge 1b \n\t" + :"+r"(i), "+r"(src1) + :"r"(dst), "r"(src0) + ); +} + +static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1, + const float *src2, int src3, int len, int step){ + long i = (len-4)*4; + if(step == 2 && src3 == 0){ + dst += (len-4)*2; + asm volatile( + "1: \n\t" + "movq (%2,%0), %%mm0 \n\t" + "movq 8(%2,%0), %%mm1 \n\t" + "pfmul (%3,%0), %%mm0 \n\t" + "pfmul 8(%3,%0), %%mm1 \n\t" + "pfadd (%4,%0), %%mm0 \n\t" + "pfadd 8(%4,%0), %%mm1 \n\t" + "movd %%mm0, (%1) \n\t" + "movd %%mm1, 16(%1) \n\t" + "psrlq $32, %%mm0 \n\t" + "psrlq $32, %%mm1 \n\t" + "movd %%mm0, 8(%1) \n\t" + "movd %%mm1, 24(%1) \n\t" + "sub $32, %1 \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + :"+r"(i), "+r"(dst) + :"r"(src0), "r"(src1), "r"(src2) + :"memory" + ); + } + else if(step == 1 && src3 == 0){ + asm volatile( + "1: \n\t" + "movq (%2,%0), %%mm0 \n\t" + "movq 8(%2,%0), %%mm1 \n\t" + "pfmul (%3,%0), %%mm0 \n\t" + "pfmul 8(%3,%0), %%mm1 \n\t" + "pfadd (%4,%0), %%mm0 \n\t" + "pfadd 8(%4,%0), %%mm1 \n\t" + "movq %%mm0, (%1,%0) \n\t" + "movq %%mm1, 8(%1,%0) \n\t" + "sub $16, %0 \n\t" + "jge 1b \n\t" + :"+r"(i) + :"r"(dst), "r"(src0), "r"(src1), "r"(src2) + :"memory" + ); + } + else + ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); + asm volatile("femms"); +} +static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1, + const float *src2, int src3, int len, int step){ + long i = (len-8)*4; + if(step == 2 && src3 == 0){ + dst += (len-8)*2; + asm volatile( + "1: \n\t" + "movaps (%2,%0), %%xmm0 \n\t" + "movaps 16(%2,%0), %%xmm1 \n\t" + "mulps (%3,%0), %%xmm0 \n\t" + "mulps 16(%3,%0), %%xmm1 \n\t" + "addps (%4,%0), %%xmm0 \n\t" + "addps 16(%4,%0), %%xmm1 \n\t" + "movss %%xmm0, (%1) \n\t" + "movss %%xmm1, 32(%1) \n\t" + "movhlps %%xmm0, %%xmm2 \n\t" + "movhlps %%xmm1, %%xmm3 \n\t" + "movss %%xmm2, 16(%1) \n\t" + "movss %%xmm3, 48(%1) \n\t" + "shufps $0xb1, %%xmm0, %%xmm0 \n\t" + "shufps $0xb1, %%xmm1, %%xmm1 \n\t" + "movss %%xmm0, 8(%1) \n\t" + "movss %%xmm1, 40(%1) \n\t" + "movhlps %%xmm0, %%xmm2 \n\t" + "movhlps %%xmm1, %%xmm3 \n\t" + "movss %%xmm2, 24(%1) \n\t" + "movss %%xmm3, 56(%1) \n\t" + "sub $64, %1 \n\t" + "sub $32, %0 \n\t" + "jge 1b \n\t" + :"+r"(i), "+r"(dst) + :"r"(src0), "r"(src1), "r"(src2) + :"memory" + ); + } + else if(step == 1 && src3 == 0){ + asm volatile( + "1: \n\t" + "movaps (%2,%0), %%xmm0 \n\t" + "movaps 16(%2,%0), %%xmm1 \n\t" + "mulps (%3,%0), %%xmm0 \n\t" + "mulps 16(%3,%0), %%xmm1 \n\t" + "addps (%4,%0), %%xmm0 \n\t" + "addps 16(%4,%0), %%xmm1 \n\t" + "movaps %%xmm0, (%1,%0) \n\t" + "movaps %%xmm1, 16(%1,%0) \n\t" + "sub $32, %0 \n\t" + "jge 1b \n\t" + :"+r"(i) + :"r"(dst), "r"(src0), "r"(src1), "r"(src2) + :"memory" + ); + } + else + ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); +} + +static void 
float_to_int16_3dnow(int16_t *dst, const float *src, int len){ + // not bit-exact: pf2id uses different rounding than C and SSE + int i; + for(i=0; i<len; i+=4) { + asm volatile( + "pf2id %1, %%mm0 \n\t" + "pf2id %2, %%mm1 \n\t" + "packssdw %%mm1, %%mm0 \n\t" + "movq %%mm0, %0 \n\t" + :"=m"(dst[i]) + :"m"(src[i]), "m"(src[i+2]) + ); + } + asm volatile("femms"); +} +static void float_to_int16_sse(int16_t *dst, const float *src, int len){ + int i; + for(i=0; i<len; i+=4) { + asm volatile( + "cvtps2pi %1, %%mm0 \n\t" + "cvtps2pi %2, %%mm1 \n\t" + "packssdw %%mm1, %%mm0 \n\t" + "movq %%mm0, %0 \n\t" + :"=m"(dst[i]) + :"m"(src[i]), "m"(src[i+2]) + ); + } + asm volatile("emms"); +} #ifdef CONFIG_SNOW_ENCODER extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width); @@ -2782,6 +3101,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->idct_add= ff_simple_idct_add_mmx; c->idct = ff_simple_idct_mmx; c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; +#ifdef CONFIG_GPL }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ if(mm_flags & MM_MMXEXT){ c->idct_put= ff_libmpeg2mmx2_idct_put; @@ -2793,8 +3113,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->idct = ff_mmx_idct; } c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; -#if 0 - }else if(idct_algo==FF_IDCT_VP3){ +#endif + }else if(idct_algo==FF_IDCT_VP3 && + avctx->codec->id!=CODEC_ID_THEORA && + !(avctx->flags & CODEC_FLAG_BITEXACT)){ if(mm_flags & MM_SSE2){ c->idct_put= ff_vp3_idct_put_sse2; c->idct_add= ff_vp3_idct_add_sse2; @@ -2807,10 +3129,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->idct = ff_vp3_idct_mmx; c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM; } -#endif }else if(idct_algo==FF_IDCT_CAVS){ c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; -#ifdef CONFIG_GPL }else if(idct_algo==FF_IDCT_XVIDMMX){ if(mm_flags & MM_MMXEXT){ c->idct_put= ff_idct_xvid_mmx2_put; @@ -2821,7 +3141,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->idct_add= ff_idct_xvid_mmx_add; c->idct = ff_idct_xvid_mmx; } -#endif } } @@ -3012,6 +3331,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) dspfunc(avg_h264_qpel, 0, 16); dspfunc(avg_h264_qpel, 1, 8); dspfunc(avg_h264_qpel, 2, 4); + + dspfunc(put_2tap_qpel, 0, 16); + dspfunc(put_2tap_qpel, 1, 8); + dspfunc(avg_2tap_qpel, 0, 16); + dspfunc(avg_2tap_qpel, 1, 8); #undef dspfunc c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2; @@ -3024,6 +3348,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; + c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; @@ -3134,6 +3459,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) dspfunc(avg_h264_qpel, 1, 8); dspfunc(avg_h264_qpel, 2, 4); + dspfunc(put_2tap_qpel, 0, 16); + dspfunc(put_2tap_qpel, 1, 8); + dspfunc(avg_2tap_qpel, 0, 16); + dspfunc(avg_2tap_qpel, 1, 8); + c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow; c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow; } @@ -3150,6 +3480,24 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; } #endif + + if(mm_flags & MM_3DNOW){ + c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; + 
c->vector_fmul = vector_fmul_3dnow; + if(!(avctx->flags & CODEC_FLAG_BITEXACT)) + c->float_to_int16 = float_to_int16_3dnow; + } + if(mm_flags & MM_3DNOWEXT) + c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; + if(mm_flags & MM_SSE){ + c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; + c->vector_fmul = vector_fmul_sse; + c->float_to_int16 = float_to_int16_sse; + c->vector_fmul_reverse = vector_fmul_reverse_sse; + c->vector_fmul_add_add = vector_fmul_add_add_sse; + } + if(mm_flags & MM_3DNOW) + c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse } #ifdef CONFIG_ENCODERS diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h index 440c5bb9c..b365cea57 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h @@ -3,18 +3,20 @@ * Copyright (c) 2000, 2001 Fabrice Bellard. * Copyright (c) 2002-2004 Michael Niedermayer * - * This library is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * version 2.1 of the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * MMX optimization by Nick Kurshev <nickols_k@mail.ru> @@ -754,7 +756,7 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line "lea (%3, %3), %%"REG_a" \n\t" "movq (%1), %%mm0 \n\t" PAVGB" 1(%1), %%mm0 \n\t" - ".balign 8 \n\t" + ASMALIGN(3) "1: \n\t" "movq (%1, %%"REG_a"), %%mm2 \n\t" "movq (%1, %3), %%mm1 \n\t" @@ -818,3 +820,51 @@ static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int lin DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); } +#define QPEL_2TAP_L3(OPNAME) \ +static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ + asm volatile(\ + "1: \n\t"\ + "movq (%1,%2), %%mm0 \n\t"\ + "movq 8(%1,%2), %%mm1 \n\t"\ + PAVGB" (%1,%3), %%mm0 \n\t"\ + PAVGB" 8(%1,%3), %%mm1 \n\t"\ + PAVGB" (%1), %%mm0 \n\t"\ + PAVGB" 8(%1), %%mm1 \n\t"\ + STORE_OP( (%1,%4),%%mm0)\ + STORE_OP(8(%1,%4),%%mm1)\ + "movq %%mm0, (%1,%4) \n\t"\ + "movq %%mm1, 8(%1,%4) \n\t"\ + "add %5, %1 \n\t"\ + "decl %0 \n\t"\ + "jnz 1b \n\t"\ + :"+g"(h), "+r"(src)\ + :"r"((long)off1), "r"((long)off2),\ + "r"((long)(dst-src)), "r"((long)stride)\ + :"memory"\ + );\ +}\ +static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\ + asm volatile(\ + "1: \n\t"\ + "movq (%1,%2), %%mm0 \n\t"\ + PAVGB" (%1,%3), %%mm0 \n\t"\ + PAVGB" (%1), %%mm0 \n\t"\ + STORE_OP((%1,%4),%%mm0)\ + "movq %%mm0, (%1,%4) \n\t"\ + "add %5, %1 \n\t"\ + "decl %0 \n\t"\ + "jnz 1b \n\t"\ + :"+g"(h), "+r"(src)\ + :"r"((long)off1), 
"r"((long)off2),\ + "r"((long)(dst-src)), "r"((long)stride)\ + :"memory"\ + );\ +} + +#define STORE_OP(a,b) PAVGB" "#a","#b" \n\t" +QPEL_2TAP_L3(avg_) +#undef STORE_OP +#define STORE_OP(a,b) +QPEL_2TAP_L3(put_) +#undef STORE_OP +#undef QPEL_2TAP_L3 diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h index 3ecd776b8..f53b34662 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h @@ -3,18 +3,20 @@ * Copyright (c) 2000, 2001 Fabrice Bellard. * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> * - * This library is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * version 2.1 of the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * MMX optimization by Nick Kurshev <nickols_k@mail.ru> @@ -28,7 +30,7 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line MOVQ_BFE(mm6); __asm __volatile( "lea (%3, %3), %%"REG_a" \n\t" - ".balign 8 \n\t" + ASMALIGN(3) "1: \n\t" "movq (%1), %%mm0 \n\t" "movq 1(%1), %%mm1 \n\t" @@ -69,7 +71,7 @@ static void attribute_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, u "movq %%mm4, (%3) \n\t" "add %5, %3 \n\t" "decl %0 \n\t" - ".balign 8 \n\t" + ASMALIGN(3) "1: \n\t" "movq (%1), %%mm0 \n\t" "movq (%2), %%mm1 \n\t" @@ -110,7 +112,7 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int lin MOVQ_BFE(mm6); __asm __volatile( "lea (%3, %3), %%"REG_a" \n\t" - ".balign 8 \n\t" + ASMALIGN(3) "1: \n\t" "movq (%1), %%mm0 \n\t" "movq 1(%1), %%mm1 \n\t" @@ -168,7 +170,7 @@ static void attribute_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, "movq %%mm5, 8(%3) \n\t" "add %5, %3 \n\t" "decl %0 \n\t" - ".balign 8 \n\t" + ASMALIGN(3) "1: \n\t" "movq (%1), %%mm0 \n\t" "movq (%2), %%mm1 \n\t" @@ -206,7 +208,7 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line __asm __volatile( "lea (%3, %3), %%"REG_a" \n\t" "movq (%1), %%mm0 \n\t" - ".balign 8 \n\t" + ASMALIGN(3) "1: \n\t" "movq (%1, %3), %%mm1 \n\t" "movq (%1, %%"REG_a"),%%mm2 \n\t" @@ -246,7 +248,7 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin "paddusw %%mm1, %%mm5 \n\t" "xor %%"REG_a", %%"REG_a" \n\t" "add %3, %1 \n\t" - ".balign 8 \n\t" + ASMALIGN(3) "1: \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t" "movq 1(%1, %%"REG_a"), %%mm2 \n\t" @@ -458,7 +460,7 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line __asm __volatile( "lea (%3, %3), %%"REG_a" \n\t" "movq (%1), %%mm0 \n\t" - ".balign 8 \n\t" + ASMALIGN(3) "1: \n\t" "movq (%1, %3), %%mm1 \n\t" "movq (%1, 
%%"REG_a"), %%mm2 \n\t" @@ -509,7 +511,7 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin "paddusw %%mm1, %%mm5 \n\t" "xor %%"REG_a", %%"REG_a" \n\t" "add %3, %1 \n\t" - ".balign 8 \n\t" + ASMALIGN(3) "1: \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t" "movq 1(%1, %%"REG_a"), %%mm2 \n\t" diff --git a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c index f6150c83c..2ffbfecf6 100644 --- a/src/libffmpeg/libavcodec/i386/fdct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c @@ -12,6 +12,22 @@ * Also of inspiration: * a page about fdct at http://www.geocities.com/ssavekar/dct.htm * Skal's fdct at http://skal.planet-d.net/coding/dct.html + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "common.h" #include "../dsputil.h" @@ -51,7 +67,7 @@ static const int64_t fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL; static const int32_t fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; -struct +static struct { const int32_t fdct_r_row_sse2[4] ATTR_ALIGN(16); } fdct_r_row_sse2 ATTR_ALIGN(16)= @@ -134,7 +150,7 @@ static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff 29692, -12299, 26722, -31521, }; -struct +static struct { const int16_t tab_frw_01234567_sse2[256] ATTR_ALIGN(16); } tab_frw_01234567_sse2 ATTR_ALIGN(16) = @@ -351,60 +367,60 @@ static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset) static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) { asm volatile( - ".macro FDCT_ROW_SSE2_H1 i t \n\t" - "movq \\i(%0), %%xmm2 \n\t" - "movq \\i+8(%0), %%xmm0 \n\t" - "movdqa \\t+32(%1), %%xmm3 \n\t" - "movdqa \\t+48(%1), %%xmm7 \n\t" - "movdqa \\t(%1), %%xmm4 \n\t" - "movdqa \\t+16(%1), %%xmm5 \n\t" - ".endm \n\t" - ".macro FDCT_ROW_SSE2_H2 i t \n\t" - "movq \\i(%0), %%xmm2 \n\t" - "movq \\i+8(%0), %%xmm0 \n\t" - "movdqa \\t+32(%1), %%xmm3 \n\t" - "movdqa \\t+48(%1), %%xmm7 \n\t" - ".endm \n\t" - ".macro FDCT_ROW_SSE2 i \n\t" - "movq %%xmm2, %%xmm1 \n\t" - "pshuflw $27, %%xmm0, %%xmm0 \n\t" - "paddsw %%xmm0, %%xmm1 \n\t" - "psubsw %%xmm0, %%xmm2 \n\t" - "punpckldq %%xmm2, %%xmm1 \n\t" - "pshufd $78, %%xmm1, %%xmm2 \n\t" - "pmaddwd %%xmm2, %%xmm3 \n\t" - "pmaddwd %%xmm1, %%xmm7 \n\t" - "pmaddwd %%xmm5, %%xmm2 \n\t" - "pmaddwd %%xmm4, %%xmm1 \n\t" - "paddd %%xmm7, %%xmm3 \n\t" - "paddd %%xmm2, %%xmm1 \n\t" - "paddd %%xmm6, %%xmm3 \n\t" - "paddd %%xmm6, %%xmm1 \n\t" - "psrad %3, %%xmm3 \n\t" - "psrad %3, %%xmm1 \n\t" - "packssdw %%xmm3, %%xmm1 \n\t" - "movdqa %%xmm1, \\i(%4) \n\t" - ".endm \n\t" +#define FDCT_ROW_SSE2_H1(i,t) \ + "movq " #i "(%0), %%xmm2 \n\t" \ + "movq " #i "+8(%0), %%xmm0 \n\t" \ + "movdqa " #t "+32(%1), %%xmm3 \n\t" \ + "movdqa " #t "+48(%1), %%xmm7 \n\t" \ + "movdqa " #t "(%1), %%xmm4 \n\t" \ + "movdqa " #t "+16(%1), %%xmm5 
\n\t" + +#define FDCT_ROW_SSE2_H2(i,t) \ + "movq " #i "(%0), %%xmm2 \n\t" \ + "movq " #i "+8(%0), %%xmm0 \n\t" \ + "movdqa " #t "+32(%1), %%xmm3 \n\t" \ + "movdqa " #t "+48(%1), %%xmm7 \n\t" + +#define FDCT_ROW_SSE2(i) \ + "movq %%xmm2, %%xmm1 \n\t" \ + "pshuflw $27, %%xmm0, %%xmm0 \n\t" \ + "paddsw %%xmm0, %%xmm1 \n\t" \ + "psubsw %%xmm0, %%xmm2 \n\t" \ + "punpckldq %%xmm2, %%xmm1 \n\t" \ + "pshufd $78, %%xmm1, %%xmm2 \n\t" \ + "pmaddwd %%xmm2, %%xmm3 \n\t" \ + "pmaddwd %%xmm1, %%xmm7 \n\t" \ + "pmaddwd %%xmm5, %%xmm2 \n\t" \ + "pmaddwd %%xmm4, %%xmm1 \n\t" \ + "paddd %%xmm7, %%xmm3 \n\t" \ + "paddd %%xmm2, %%xmm1 \n\t" \ + "paddd %%xmm6, %%xmm3 \n\t" \ + "paddd %%xmm6, %%xmm1 \n\t" \ + "psrad %3, %%xmm3 \n\t" \ + "psrad %3, %%xmm1 \n\t" \ + "packssdw %%xmm3, %%xmm1 \n\t" \ + "movdqa %%xmm1, " #i "(%4) \n\t" + "movdqa (%2), %%xmm6 \n\t" - "FDCT_ROW_SSE2_H1 0 0 \n\t" - "FDCT_ROW_SSE2 0 \n\t" - "FDCT_ROW_SSE2_H2 64 0 \n\t" - "FDCT_ROW_SSE2 64 \n\t" - - "FDCT_ROW_SSE2_H1 16 64 \n\t" - "FDCT_ROW_SSE2 16 \n\t" - "FDCT_ROW_SSE2_H2 112 64 \n\t" - "FDCT_ROW_SSE2 112 \n\t" - - "FDCT_ROW_SSE2_H1 32 128 \n\t" - "FDCT_ROW_SSE2 32 \n\t" - "FDCT_ROW_SSE2_H2 96 128 \n\t" - "FDCT_ROW_SSE2 96 \n\t" - - "FDCT_ROW_SSE2_H1 48 192 \n\t" - "FDCT_ROW_SSE2 48 \n\t" - "FDCT_ROW_SSE2_H2 80 192 \n\t" - "FDCT_ROW_SSE2 80 \n\t" + FDCT_ROW_SSE2_H1(0,0) + FDCT_ROW_SSE2(0) + FDCT_ROW_SSE2_H2(64,0) + FDCT_ROW_SSE2(64) + + FDCT_ROW_SSE2_H1(16,64) + FDCT_ROW_SSE2(16) + FDCT_ROW_SSE2_H2(112,64) + FDCT_ROW_SSE2(112) + + FDCT_ROW_SSE2_H1(32,128) + FDCT_ROW_SSE2(32) + FDCT_ROW_SSE2_H2(96,128) + FDCT_ROW_SSE2(96) + + FDCT_ROW_SSE2_H1(48,192) + FDCT_ROW_SSE2(48) + FDCT_ROW_SSE2_H2(80,192) + FDCT_ROW_SSE2(80) : : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) ); @@ -504,56 +520,44 @@ static always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const in void ff_fdct_mmx(int16_t *block) { int64_t align_tmp[16] ATTR_ALIGN(8); - int16_t * const block_tmp= (int16_t*)align_tmp; - int16_t *block1, *out; - const int16_t *table; + int16_t * block1= (int16_t*)align_tmp; + const int16_t *table= tab_frw_01234567; int i; - block1 = block_tmp; fdct_col(block, block1, 0); fdct_col(block, block1, 4); - block1 = block_tmp; - table = tab_frw_01234567; - out = block; for(i=8;i>0;i--) { - fdct_row_mmx(block1, out, table); + fdct_row_mmx(block1, block, table); block1 += 8; table += 32; - out += 8; + block += 8; } } void ff_fdct_mmx2(int16_t *block) { int64_t align_tmp[16] ATTR_ALIGN(8); - int16_t * const block_tmp= (int16_t*)align_tmp; - int16_t *block1, *out; - const int16_t *table; + int16_t *block1= (int16_t*)align_tmp; + const int16_t *table= tab_frw_01234567; int i; - block1 = block_tmp; fdct_col(block, block1, 0); fdct_col(block, block1, 4); - block1 = block_tmp; - table = tab_frw_01234567; - out = block; for(i=8;i>0;i--) { - fdct_row_mmx2(block1, out, table); + fdct_row_mmx2(block1, block, table); block1 += 8; table += 32; - out += 8; + block += 8; } } void ff_fdct_sse2(int16_t *block) { - int64_t align_tmp[16] ATTR_ALIGN(8); - int16_t * const block_tmp= (int16_t*)align_tmp; - int16_t *block1; + int64_t align_tmp[16] ATTR_ALIGN(16); + int16_t * const block1= (int16_t*)align_tmp; - block1 = block_tmp; fdct_col(block, block1, 0); fdct_col(block, block1, 4); diff --git a/src/libffmpeg/libavcodec/i386/fft_3dn.c b/src/libffmpeg/libavcodec/i386/fft_3dn.c new file mode 100644 index 000000000..8087f1932 --- /dev/null +++ 
b/src/libffmpeg/libavcodec/i386/fft_3dn.c @@ -0,0 +1,125 @@ +/* + * FFT/MDCT transform with 3DNow! optimizations + * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt + * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "../dsputil.h" + +static const int p1m1[2] __attribute__((aligned(8))) = + { 0, 1 << 31 }; + +static const int m1p1[2] __attribute__((aligned(8))) = + { 1 << 31, 0 }; + +void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z) +{ + int ln = s->nbits; + long i, j; + long nblocks, nloops; + FFTComplex *p, *cptr; + + asm volatile( + /* FEMMS is not a must here but recommended by AMD */ + "femms \n\t" + "movq %0, %%mm7 \n\t" + ::"m"(*(s->inverse ? m1p1 : p1m1)) + ); + + i = 8 << ln; + asm volatile( + "1: \n\t" + "sub $32, %0 \n\t" + "movq (%0,%1), %%mm0 \n\t" + "movq 16(%0,%1), %%mm1 \n\t" + "movq 8(%0,%1), %%mm2 \n\t" + "movq 24(%0,%1), %%mm3 \n\t" + "movq %%mm0, %%mm4 \n\t" + "movq %%mm1, %%mm5 \n\t" + "pfadd %%mm2, %%mm0 \n\t" + "pfadd %%mm3, %%mm1 \n\t" + "pfsub %%mm2, %%mm4 \n\t" + "pfsub %%mm3, %%mm5 \n\t" + "movq %%mm0, %%mm2 \n\t" + "punpckldq %%mm5, %%mm6 \n\t" + "punpckhdq %%mm6, %%mm5 \n\t" + "movq %%mm4, %%mm3 \n\t" + "pxor %%mm7, %%mm5 \n\t" + "pfadd %%mm1, %%mm0 \n\t" + "pfadd %%mm5, %%mm4 \n\t" + "pfsub %%mm1, %%mm2 \n\t" + "pfsub %%mm5, %%mm3 \n\t" + "movq %%mm0, (%0,%1) \n\t" + "movq %%mm4, 8(%0,%1) \n\t" + "movq %%mm2, 16(%0,%1) \n\t" + "movq %%mm3, 24(%0,%1) \n\t" + "jg 1b \n\t" + :"+r"(i) + :"r"(z) + ); + /* pass 2 .. 
ln-1 */ + + nblocks = 1 << (ln-3); + nloops = 1 << 2; + cptr = s->exptab1; + do { + p = z; + j = nblocks; + do { + i = nloops*8; + asm volatile( + "1: \n\t" + "sub $16, %0 \n\t" + "movq (%1,%0), %%mm0 \n\t" + "movq 8(%1,%0), %%mm1 \n\t" + "movq (%2,%0), %%mm2 \n\t" + "movq 8(%2,%0), %%mm3 \n\t" + "movq %%mm2, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "punpckldq %%mm2, %%mm2 \n\t" + "punpckldq %%mm3, %%mm3 \n\t" + "punpckhdq %%mm4, %%mm4 \n\t" + "punpckhdq %%mm5, %%mm5 \n\t" + "pfmul (%3,%0,2), %%mm2 \n\t" // cre*re cim*re + "pfmul 8(%3,%0,2), %%mm3 \n\t" + "pfmul 16(%3,%0,2), %%mm4 \n\t" // -cim*im cre*im + "pfmul 24(%3,%0,2), %%mm5 \n\t" + "pfadd %%mm2, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im + "pfadd %%mm3, %%mm5 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + "pfadd %%mm4, %%mm0 \n\t" + "pfadd %%mm5, %%mm1 \n\t" + "pfsub %%mm4, %%mm2 \n\t" + "pfsub %%mm5, %%mm3 \n\t" + "movq %%mm0, (%1,%0) \n\t" + "movq %%mm1, 8(%1,%0) \n\t" + "movq %%mm2, (%2,%0) \n\t" + "movq %%mm3, 8(%2,%0) \n\t" + "jg 1b \n\t" + :"+r"(i) + :"r"(p), "r"(p + nloops), "r"(cptr) + ); + p += nloops*2; + } while (--j); + cptr += nloops*2; + nblocks >>= 1; + nloops <<= 1; + } while (nblocks != 0); + asm volatile("femms"); +} diff --git a/src/libffmpeg/libavcodec/i386/fft_3dn2.c b/src/libffmpeg/libavcodec/i386/fft_3dn2.c new file mode 100644 index 000000000..a4fe5f0b6 --- /dev/null +++ b/src/libffmpeg/libavcodec/i386/fft_3dn2.c @@ -0,0 +1,210 @@ +/* + * FFT/MDCT transform with Extended 3DNow! optimizations + * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt + * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "../dsputil.h" + +static const int p1m1[2] __attribute__((aligned(8))) = + { 0, 1 << 31 }; + +static const int m1p1[2] __attribute__((aligned(8))) = + { 1 << 31, 0 }; + +void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) +{ + int ln = s->nbits; + long i, j; + long nblocks, nloops; + FFTComplex *p, *cptr; + + asm volatile( + /* FEMMS is not a must here but recommended by AMD */ + "femms \n\t" + "movq %0, %%mm7 \n\t" + ::"m"(*(s->inverse ? 
m1p1 : p1m1)) + ); + + i = 8 << ln; + asm volatile( + "1: \n\t" + "sub $32, %0 \n\t" + "movq (%0,%1), %%mm0 \n\t" + "movq 16(%0,%1), %%mm1 \n\t" + "movq 8(%0,%1), %%mm2 \n\t" + "movq 24(%0,%1), %%mm3 \n\t" + "movq %%mm0, %%mm4 \n\t" + "movq %%mm1, %%mm5 \n\t" + "pfadd %%mm2, %%mm0 \n\t" + "pfadd %%mm3, %%mm1 \n\t" + "pfsub %%mm2, %%mm4 \n\t" + "pfsub %%mm3, %%mm5 \n\t" + "movq %%mm0, %%mm2 \n\t" + "pswapd %%mm5, %%mm5 \n\t" + "movq %%mm4, %%mm3 \n\t" + "pxor %%mm7, %%mm5 \n\t" + "pfadd %%mm1, %%mm0 \n\t" + "pfadd %%mm5, %%mm4 \n\t" + "pfsub %%mm1, %%mm2 \n\t" + "pfsub %%mm5, %%mm3 \n\t" + "movq %%mm0, (%0,%1) \n\t" + "movq %%mm4, 8(%0,%1) \n\t" + "movq %%mm2, 16(%0,%1) \n\t" + "movq %%mm3, 24(%0,%1) \n\t" + "jg 1b \n\t" + :"+r"(i) + :"r"(z) + ); + /* pass 2 .. ln-1 */ + + nblocks = 1 << (ln-3); + nloops = 1 << 2; + cptr = s->exptab1; + do { + p = z; + j = nblocks; + do { + i = nloops*8; + asm volatile( + "1: \n\t" + "sub $16, %0 \n\t" + "movq (%1,%0), %%mm0 \n\t" + "movq 8(%1,%0), %%mm1 \n\t" + "movq (%2,%0), %%mm2 \n\t" + "movq 8(%2,%0), %%mm3 \n\t" + "movq (%3,%0,2), %%mm4 \n\t" + "movq 8(%3,%0,2), %%mm5 \n\t" + "pswapd %%mm4, %%mm6 \n\t" // no need for cptr[2] & cptr[3] + "pswapd %%mm5, %%mm7 \n\t" + "pfmul %%mm2, %%mm4 \n\t" // cre*re cim*im + "pfmul %%mm3, %%mm5 \n\t" + "pfmul %%mm2, %%mm6 \n\t" // cim*re cre*im + "pfmul %%mm3, %%mm7 \n\t" + "pfpnacc %%mm6, %%mm4 \n\t" // cre*re-cim*im cim*re+cre*im + "pfpnacc %%mm7, %%mm5 \n\t" + "movq %%mm0, %%mm2 \n\t" + "movq %%mm1, %%mm3 \n\t" + "pfadd %%mm4, %%mm0 \n\t" + "pfadd %%mm5, %%mm1 \n\t" + "pfsub %%mm4, %%mm2 \n\t" + "pfsub %%mm5, %%mm3 \n\t" + "movq %%mm0, (%1,%0) \n\t" + "movq %%mm1, 8(%1,%0) \n\t" + "movq %%mm2, (%2,%0) \n\t" + "movq %%mm3, 8(%2,%0) \n\t" + "jg 1b \n\t" + :"+r"(i) + :"r"(p), "r"(p + nloops), "r"(cptr) + ); + p += nloops*2; + } while (--j); + cptr += nloops*2; + nblocks >>= 1; + nloops <<= 1; + } while (nblocks != 0); + asm volatile("femms"); +} + +void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp) +{ + long k, n8, n4, n2, n; + const uint16_t *revtab = s->fft.revtab; + const FFTSample *tcos = s->tcos; + const FFTSample *tsin = s->tsin; + const FFTSample *in1, *in2; + FFTComplex *z = (FFTComplex *)tmp; + + n = 1 << s->nbits; + n2 = n >> 1; + n4 = n >> 2; + n8 = n >> 3; + + /* pre rotation */ + in1 = input; + in2 = input + n2 - 1; + for(k = 0; k < n4; k++) { + // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it + asm volatile( + "movd %0, %%mm0 \n\t" + "movd %2, %%mm1 \n\t" + "punpckldq %1, %%mm0 \n\t" + "punpckldq %3, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "pfmul %%mm1, %%mm0 \n\t" + "pswapd %%mm1, %%mm1 \n\t" + "pfmul %%mm1, %%mm2 \n\t" + "pfpnacc %%mm2, %%mm0 \n\t" + ::"m"(in2[-2*k]), "m"(in1[2*k]), + "m"(tcos[k]), "m"(tsin[k]) + ); + asm volatile( + "movq %%mm0, %0 \n\t" + :"=m"(z[revtab[k]]) + ); + } + + ff_fft_calc(&s->fft, z); + + /* post rotation + reordering */ + for(k = 0; k < n4; k++) { + asm volatile( + "movq %0, %%mm0 \n\t" + "movd %1, %%mm1 \n\t" + "punpckldq %2, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "pfmul %%mm1, %%mm0 \n\t" + "pswapd %%mm1, %%mm1 \n\t" + "pfmul %%mm1, %%mm2 \n\t" + "pfpnacc %%mm2, %%mm0 \n\t" + "movq %%mm0, %0 \n\t" + :"+m"(z[k]) + :"m"(tcos[k]), "m"(tsin[k]) + ); + } + + k = n-8; + asm volatile("movd %0, %%mm7" ::"r"(1<<31)); + asm volatile( + "1: \n\t" + "movq (%4,%0), %%mm0 \n\t" // z[n8+k] + "neg %0 \n\t" + "pswapd -8(%4,%0), %%mm1 \n\t" // z[n8-1-k] + "movq %%mm0, %%mm2 \n\t" + "pxor %%mm7, %%mm2 
\n\t" + "punpckldq %%mm1, %%mm2 \n\t" + "pswapd %%mm2, %%mm3 \n\t" + "punpckhdq %%mm1, %%mm0 \n\t" + "pswapd %%mm0, %%mm4 \n\t" + "pxor %%mm7, %%mm0 \n\t" + "pxor %%mm7, %%mm4 \n\t" + "movq %%mm3, -8(%3,%0) \n\t" // output[n-2-2*k] = { z[n8-1-k].im, -z[n8+k].re } + "movq %%mm4, -8(%2,%0) \n\t" // output[n2-2-2*k]= { -z[n8-1-k].re, z[n8+k].im } + "neg %0 \n\t" + "movq %%mm0, (%1,%0) \n\t" // output[2*k] = { -z[n8+k].im, z[n8-1-k].re } + "movq %%mm2, (%2,%0) \n\t" // output[n2+2*k] = { -z[n8+k].re, z[n8-1-k].im } + "sub $8, %0 \n\t" + "jge 1b \n\t" + :"+r"(k) + :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8) + :"memory" + ); + asm volatile("femms"); +} + diff --git a/src/libffmpeg/libavcodec/i386/fft_sse.c b/src/libffmpeg/libavcodec/i386/fft_sse.c index 631848265..0dc0c61c1 100644 --- a/src/libffmpeg/libavcodec/i386/fft_sse.c +++ b/src/libffmpeg/libavcodec/i386/fft_sse.c @@ -2,26 +2,23 @@ * FFT/MDCT transform with SSE optimizations * Copyright (c) 2002 Fabrice Bellard. * - * This library is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * version 2.1 of the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "../dsputil.h" -#include <math.h> - -#ifdef HAVE_BUILTIN_VECTOR - -#include <xmmintrin.h> static const int p1p1p1m1[4] __attribute__((aligned(16))) = { 0, 0, 0, 1 << 31 }; @@ -32,6 +29,12 @@ static const int p1p1m1p1[4] __attribute__((aligned(16))) = static const int p1p1m1m1[4] __attribute__((aligned(16))) = { 0, 0, 1 << 31, 1 << 31 }; +static const int p1m1p1m1[4] __attribute__((aligned(16))) = + { 0, 1 << 31, 0, 1 << 31 }; + +static const int m1m1m1m1[4] __attribute__((aligned(16))) = + { 1 << 31, 1 << 31, 1 << 31, 1 << 31 }; + #if 0 static void print_v4sf(const char *str, __m128 a) { @@ -45,96 +48,200 @@ static void print_v4sf(const char *str, __m128 a) void ff_fft_calc_sse(FFTContext *s, FFTComplex *z) { int ln = s->nbits; - int j, np, np2; - int nblocks, nloops; - register FFTComplex *p, *q; - FFTComplex *cptr, *cptr1; - int k; - - np = 1 << ln; - - { - __m128 *r, a, b, a1, c1, c2; - - r = (__m128 *)&z[0]; - c1 = *(__m128 *)p1p1m1m1; - if (s->inverse) - c2 = *(__m128 *)p1p1m1p1; - else - c2 = *(__m128 *)p1p1p1m1; - - j = (np >> 2); - do { - a = r[0]; - b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); - a = _mm_xor_ps(a, c1); - /* do the pass 0 butterfly */ - a = _mm_add_ps(a, b); - - a1 = r[1]; - b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2)); - a1 = _mm_xor_ps(a1, c1); - /* do the pass 0 butterfly */ - b = _mm_add_ps(a1, b); - - /* multiply third by -i */ - /* by toggling the sign bit */ - b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0)); - b = _mm_xor_ps(b, c2); - - /* do the pass 1 
butterfly */ - r[0] = _mm_add_ps(a, b); - r[1] = _mm_sub_ps(a, b); - r += 2; - } while (--j != 0); - } + long i, j; + long nblocks, nloops; + FFTComplex *p, *cptr; + + asm volatile( + "movaps %0, %%xmm4 \n\t" + "movaps %1, %%xmm5 \n\t" + ::"m"(*p1p1m1m1), + "m"(*(s->inverse ? p1p1m1p1 : p1p1p1m1)) + ); + + i = 8 << ln; + asm volatile( + "1: \n\t" + "sub $32, %0 \n\t" + /* do the pass 0 butterfly */ + "movaps (%0,%1), %%xmm0 \n\t" + "movaps %%xmm0, %%xmm1 \n\t" + "shufps $0x4E, %%xmm0, %%xmm0 \n\t" + "xorps %%xmm4, %%xmm1 \n\t" + "addps %%xmm1, %%xmm0 \n\t" + "movaps 16(%0,%1), %%xmm2 \n\t" + "movaps %%xmm2, %%xmm3 \n\t" + "shufps $0x4E, %%xmm2, %%xmm2 \n\t" + "xorps %%xmm4, %%xmm3 \n\t" + "addps %%xmm3, %%xmm2 \n\t" + /* multiply third by -i */ + /* by toggling the sign bit */ + "shufps $0xB4, %%xmm2, %%xmm2 \n\t" + "xorps %%xmm5, %%xmm2 \n\t" + /* do the pass 1 butterfly */ + "movaps %%xmm0, %%xmm1 \n\t" + "addps %%xmm2, %%xmm0 \n\t" + "subps %%xmm2, %%xmm1 \n\t" + "movaps %%xmm0, (%0,%1) \n\t" + "movaps %%xmm1, 16(%0,%1) \n\t" + "jg 1b \n\t" + :"+r"(i) + :"r"(z) + ); /* pass 2 .. ln-1 */ - nblocks = np >> 3; + nblocks = 1 << (ln-3); nloops = 1 << 2; - np2 = np >> 1; - - cptr1 = s->exptab1; + cptr = s->exptab1; do { p = z; - q = z + nloops; j = nblocks; do { - cptr = cptr1; - k = nloops >> 1; - do { - __m128 a, b, c, t1, t2; - - a = *(__m128 *)p; - b = *(__m128 *)q; - - /* complex mul */ - c = *(__m128 *)cptr; - /* cre*re cim*re */ - t1 = _mm_mul_ps(c, - _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0))); - c = *(__m128 *)(cptr + 2); - /* -cim*im cre*im */ - t2 = _mm_mul_ps(c, - _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1))); - b = _mm_add_ps(t1, t2); - - /* butterfly */ - *(__m128 *)p = _mm_add_ps(a, b); - *(__m128 *)q = _mm_sub_ps(a, b); - - p += 2; - q += 2; - cptr += 4; - } while (--k); - - p += nloops; - q += nloops; + i = nloops*8; + asm volatile( + "1: \n\t" + "sub $16, %0 \n\t" + "movaps (%2,%0), %%xmm1 \n\t" + "movaps (%1,%0), %%xmm0 \n\t" + "movaps %%xmm1, %%xmm2 \n\t" + "shufps $0xA0, %%xmm1, %%xmm1 \n\t" + "shufps $0xF5, %%xmm2, %%xmm2 \n\t" + "mulps (%3,%0,2), %%xmm1 \n\t" // cre*re cim*re + "mulps 16(%3,%0,2), %%xmm2 \n\t" // -cim*im cre*im + "addps %%xmm2, %%xmm1 \n\t" + "movaps %%xmm0, %%xmm3 \n\t" + "addps %%xmm1, %%xmm0 \n\t" + "subps %%xmm1, %%xmm3 \n\t" + "movaps %%xmm0, (%1,%0) \n\t" + "movaps %%xmm3, (%2,%0) \n\t" + "jg 1b \n\t" + :"+r"(i) + :"r"(p), "r"(p + nloops), "r"(cptr) + ); + p += nloops*2; } while (--j); - cptr1 += nloops * 2; - nblocks = nblocks >> 1; - nloops = nloops << 1; + cptr += nloops*2; + nblocks >>= 1; + nloops <<= 1; } while (nblocks != 0); } -#endif +void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, + const FFTSample *input, FFTSample *tmp) +{ + long k, n8, n4, n2, n; + const uint16_t *revtab = s->fft.revtab; + const FFTSample *tcos = s->tcos; + const FFTSample *tsin = s->tsin; + const FFTSample *in1, *in2; + FFTComplex *z = (FFTComplex *)tmp; + + n = 1 << s->nbits; + n2 = n >> 1; + n4 = n >> 2; + n8 = n >> 3; + + asm volatile ("movaps %0, %%xmm7\n\t"::"m"(*p1m1p1m1)); + + /* pre rotation */ + in1 = input; + in2 = input + n2 - 4; + + /* Complex multiplication + Two complex products per iteration, we could have 4 with 8 xmm + registers, 8 with 16 xmm registers. + Maybe we should unroll more. 
+ */ + for (k = 0; k < n4; k += 2) { + asm volatile ( + "movaps %0, %%xmm0 \n\t" // xmm0 = r0 X r1 X : in2 + "movaps %1, %%xmm3 \n\t" // xmm3 = X i1 X i0: in1 + "movlps %2, %%xmm1 \n\t" // xmm1 = X X R1 R0: tcos + "movlps %3, %%xmm2 \n\t" // xmm2 = X X I1 I0: tsin + "shufps $95, %%xmm0, %%xmm0 \n\t" // xmm0 = r1 r1 r0 r0 + "shufps $160,%%xmm3, %%xmm3 \n\t" // xmm3 = i1 i1 i0 i0 + "unpcklps %%xmm2, %%xmm1 \n\t" // xmm1 = I1 R1 I0 R0 + "movaps %%xmm1, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0 + "xorps %%xmm7, %%xmm2 \n\t" // xmm2 = -I1 R1 -I0 R0 + "mulps %%xmm1, %%xmm0 \n\t" // xmm0 = rI rR rI rR + "shufps $177,%%xmm2, %%xmm2 \n\t" // xmm2 = R1 -I1 R0 -I0 + "mulps %%xmm2, %%xmm3 \n\t" // xmm3 = Ri -Ii Ri -Ii + "addps %%xmm3, %%xmm0 \n\t" // xmm0 = result + ::"m"(in2[-2*k]), "m"(in1[2*k]), + "m"(tcos[k]), "m"(tsin[k]) + ); + /* Should be in the same block, hack for gcc2.95 & gcc3 */ + asm ( + "movlps %%xmm0, %0 \n\t" + "movhps %%xmm0, %1 \n\t" + :"=m"(z[revtab[k]]), "=m"(z[revtab[k + 1]]) + ); + } + + ff_fft_calc_sse(&s->fft, z); + + /* Not currently needed, added for safety */ + asm volatile ("movaps %0, %%xmm7\n\t"::"m"(*p1m1p1m1)); + + /* post rotation + reordering */ + for (k = 0; k < n4; k += 2) { + asm ( + "movaps %0, %%xmm0 \n\t" // xmm0 = i1 r1 i0 r0: z + "movlps %1, %%xmm1 \n\t" // xmm1 = X X R1 R0: tcos + "movaps %%xmm0, %%xmm3 \n\t" // xmm3 = i1 r1 i0 r0 + "movlps %2, %%xmm2 \n\t" // xmm2 = X X I1 I0: tsin + "shufps $160,%%xmm0, %%xmm0 \n\t" // xmm0 = r1 r1 r0 r0 + "shufps $245,%%xmm3, %%xmm3 \n\t" // xmm3 = i1 i1 i0 i0 + "unpcklps %%xmm2, %%xmm1 \n\t" // xmm1 = I1 R1 I0 R0 + "movaps %%xmm1, %%xmm2 \n\t" // xmm2 = I1 R1 I0 R0 + "xorps %%xmm7, %%xmm2 \n\t" // xmm2 = -I1 R1 -I0 R0 + "mulps %%xmm1, %%xmm0 \n\t" // xmm0 = rI rR rI rR + "shufps $177,%%xmm2, %%xmm2 \n\t" // xmm2 = R1 -I1 R0 -I0 + "mulps %%xmm2, %%xmm3 \n\t" // xmm3 = Ri -Ii Ri -Ii + "addps %%xmm3, %%xmm0 \n\t" // xmm0 = result + "movaps %%xmm0, %0 \n\t" + :"+m"(z[k]) + :"m"(tcos[k]), "m"(tsin[k]) + ); + } + + /* + Mnemonics: + 0 = z[k].re + 1 = z[k].im + 2 = z[k + 1].re + 3 = z[k + 1].im + 4 = z[-k - 2].re + 5 = z[-k - 2].im + 6 = z[-k - 1].re + 7 = z[-k - 1].im + */ + k = 16-n; + asm volatile("movaps %0, %%xmm7 \n\t"::"m"(*m1m1m1m1)); + asm volatile( + "1: \n\t" + "movaps -16(%4,%0), %%xmm1 \n\t" // xmm1 = 4 5 6 7 = z[-2-k] + "neg %0 \n\t" + "movaps (%4,%0), %%xmm0 \n\t" // xmm0 = 0 1 2 3 = z[k] + "xorps %%xmm7, %%xmm0 \n\t" // xmm0 = -0 -1 -2 -3 + "movaps %%xmm0, %%xmm2 \n\t" // xmm2 = -0 -1 -2 -3 + "shufps $141,%%xmm1, %%xmm0 \n\t" // xmm0 = -1 -3 4 6 + "shufps $216,%%xmm1, %%xmm2 \n\t" // xmm2 = -0 -2 5 7 + "shufps $156,%%xmm0, %%xmm0 \n\t" // xmm0 = -1 6 -3 4 ! + "shufps $156,%%xmm2, %%xmm2 \n\t" // xmm2 = -0 7 -2 5 ! + "movaps %%xmm0, (%1,%0) \n\t" // output[2*k] + "movaps %%xmm2, (%2,%0) \n\t" // output[n2+2*k] + "neg %0 \n\t" + "shufps $27, %%xmm0, %%xmm0 \n\t" // xmm0 = 4 -3 6 -1 + "xorps %%xmm7, %%xmm0 \n\t" // xmm0 = -4 3 -6 1 ! + "shufps $27, %%xmm2, %%xmm2 \n\t" // xmm2 = 5 -2 7 -0 ! 
+ "movaps %%xmm0, -16(%2,%0) \n\t" // output[n2-4-2*k] + "movaps %%xmm2, -16(%3,%0) \n\t" // output[n-4-2*k] + "add $16, %0 \n\t" + "jle 1b \n\t" + :"+r"(k) + :"r"(output), "r"(output+n2), "r"(output+n), "r"(z+n8) + :"memory" + ); +} + diff --git a/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c b/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c index ac4ad6401..40baf199b 100644 --- a/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c +++ b/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c @@ -1,18 +1,20 @@ /* * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt * - * This library is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * version 2.1 of the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ @@ -174,7 +176,7 @@ static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) block[0] += 32; for(i=0; i<2; i++){ - uint64_t tmp; + DECLARE_ALIGNED_8(uint64_t, tmp); h264_idct8_1d(block+4*i); @@ -315,6 +317,17 @@ static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) "por "#t", "#o" \n\t"\ "psubusb "#a", "#o" \n\t" +// out: o = |x-y|>a +// clobbers: t +#define DIFF_GT2_MMX(x,y,a,o,t)\ + "movq "#y", "#t" \n\t"\ + "movq "#x", "#o" \n\t"\ + "psubusb "#x", "#t" \n\t"\ + "psubusb "#y", "#o" \n\t"\ + "psubusb "#a", "#t" \n\t"\ + "psubusb "#a", "#o" \n\t"\ + "pcmpeqb "#t", "#o" \n\t"\ + // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 // out: mm5=beta-1, mm7=mask // clobbers: mm4,mm6 @@ -335,46 +348,26 @@ static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) // out: mm1=p0' mm2=q0' // clobbers: mm0,3-6 #define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\ - /* a = q0^p0^((p1-q1)>>2) */\ - "movq %%mm0, %%mm4 \n\t"\ - "psubb %%mm3, %%mm4 \n\t"\ - "psrlw $2, %%mm4 \n\t"\ - "pxor %%mm1, %%mm4 \n\t"\ - "pxor %%mm2, %%mm4 \n\t"\ - /* b = p0^(q1>>2) */\ - "psrlw $2, %%mm3 \n\t"\ - "pand "#pb_3f", %%mm3 \n\t"\ - "movq %%mm1, %%mm5 \n\t"\ - "pxor %%mm3, %%mm5 \n\t"\ - /* c = q0^(p1>>2) */\ - "psrlw $2, %%mm0 \n\t"\ - "pand "#pb_3f", %%mm0 \n\t"\ - "movq %%mm2, %%mm6 \n\t"\ - "pxor %%mm0, %%mm6 \n\t"\ - /* d = (c^b) & ~(b^a) & 1 */\ - "pxor %%mm5, %%mm6 \n\t"\ - "pxor %%mm4, %%mm5 \n\t"\ - "pandn %%mm6, %%mm5 \n\t"\ - "pand "#pb_01", %%mm5 \n\t"\ - /* delta = (avg(q0, p1>>2) + (d&a)) - * - (avg(p0, q1>>2) + (d&~a)) */\ - "pavgb %%mm2, %%mm0 \n\t"\ - "pand %%mm5, %%mm4 \n\t"\ - "paddusb %%mm4, %%mm0 \n\t"\ - "pavgb %%mm1, %%mm3 \n\t"\ - "pxor %%mm5, %%mm4 \n\t"\ - "paddusb %%mm4, %%mm3 \n\t"\ - /* p0 += clip(delta, -tc0, tc0) - * q0 -= clip(delta, -tc0, tc0) */\ - "movq %%mm0, %%mm4 \n\t"\ - "psubusb %%mm3, %%mm0 \n\t"\ - "psubusb %%mm4, %%mm3 \n\t"\ - "pminub %%mm7, %%mm0 \n\t"\ - "pminub %%mm7, %%mm3 \n\t"\ 
- "paddusb %%mm0, %%mm1 \n\t"\ - "paddusb %%mm3, %%mm2 \n\t"\ - "psubusb %%mm3, %%mm1 \n\t"\ - "psubusb %%mm0, %%mm2 \n\t" + "movq %%mm1 , %%mm5 \n\t"\ + "pxor %%mm2 , %%mm5 \n\t" /* p0^q0*/\ + "pand "#pb_01" , %%mm5 \n\t" /* (p0^q0)&1*/\ + "pcmpeqb %%mm4 , %%mm4 \n\t"\ + "pxor %%mm4 , %%mm3 \n\t"\ + "pavgb %%mm0 , %%mm3 \n\t" /* (p1 - q1 + 256)>>1*/\ + "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" /*(((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2*/\ + "pxor %%mm1 , %%mm4 \n\t"\ + "pavgb %%mm2 , %%mm4 \n\t" /* (q0 - p0 + 256)>>1*/\ + "pavgb %%mm5 , %%mm3 \n\t"\ + "paddusb %%mm4 , %%mm3 \n\t" /* d+128+33*/\ + "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\ + "psubusb %%mm3 , %%mm6 \n\t"\ + "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\ + "pminub %%mm7 , %%mm6 \n\t"\ + "pminub %%mm7 , %%mm3 \n\t"\ + "psubusb %%mm6 , %%mm1 \n\t"\ + "psubusb %%mm3 , %%mm2 \n\t"\ + "paddusb %%mm3 , %%mm1 \n\t"\ + "paddusb %%mm6 , %%mm2 \n\t" // in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=mm_bone // out: (q1addr) = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) @@ -395,10 +388,7 @@ static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride) static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0) { - uint64_t tmp0; - uint64_t tc = (uint8_t)tc0[1]*0x01010000 | (uint8_t)tc0[0]*0x0101; - // with luma, tc0=0 doesn't mean no filtering, so we need a separate input mask - uint32_t mask[2] = { (tc0[0]>=0)*0xffffffff, (tc0[1]>=0)*0xffffffff }; + DECLARE_ALIGNED_8(uint64_t, tmp0[2]); asm volatile( "movq (%1,%3), %%mm0 \n\t" //p1 @@ -406,45 +396,46 @@ static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alph "movq (%2), %%mm2 \n\t" //q0 "movq (%2,%3), %%mm3 \n\t" //q1 H264_DEBLOCK_MASK(%6, %7) - "pand %5, %%mm7 \n\t" - "movq %%mm7, %0 \n\t" + + "movd %5, %%mm4 \n\t" + "punpcklbw %%mm4, %%mm4 \n\t" + "punpcklwd %%mm4, %%mm4 \n\t" + "pcmpeqb %%mm3, %%mm3 \n\t" + "movq %%mm4, %%mm6 \n\t" + "pcmpgtb %%mm3, %%mm4 \n\t" + "movq %%mm6, 8+%0 \n\t" + "pand %%mm4, %%mm7 \n\t" + "movq %%mm7, %0 \n\t" /* filter p1 */ "movq (%1), %%mm3 \n\t" //p2 - DIFF_GT_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 - "pandn %%mm7, %%mm6 \n\t" - "pcmpeqb %%mm7, %%mm6 \n\t" + DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4) // |p2-p0|>beta-1 "pand %%mm7, %%mm6 \n\t" // mask & |p2-p0|<beta - "pshufw $80, %4, %%mm4 \n\t" - "pand %%mm7, %%mm4 \n\t" // mask & tc0 - "movq %8, %%mm7 \n\t" - "pand %%mm6, %%mm7 \n\t" // mask & |p2-p0|<beta & 1 + "pand 8+%0, %%mm7 \n\t" // mask & tc0 + "movq %%mm7, %%mm4 \n\t" + "psubb %%mm6, %%mm7 \n\t" "pand %%mm4, %%mm6 \n\t" // mask & |p2-p0|<beta & tc0 - "paddb %%mm4, %%mm7 \n\t" // tc++ H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4) /* filter q1 */ "movq (%2,%3,2), %%mm4 \n\t" //q2 - DIFF_GT_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 - "pandn %0, %%mm6 \n\t" - "pcmpeqb %0, %%mm6 \n\t" + DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3) // |q2-q0|>beta-1 "pand %0, %%mm6 \n\t" - "pshufw $80, %4, %%mm5 \n\t" + "movq 8+%0, %%mm5 \n\t" // can be merged with the and below but is slower then "pand %%mm6, %%mm5 \n\t" - "pand %8, %%mm6 \n\t" - "paddb %%mm6, %%mm7 \n\t" + "psubb %%mm6, %%mm7 \n\t" "movq (%2,%3), %%mm3 \n\t" H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6) /* filter p0, q0 */ - H264_DEBLOCK_P0_Q0(%8, %9) + H264_DEBLOCK_P0_Q0(%8, unused) "movq %%mm1, (%1,%3,2) \n\t" "movq %%mm2, (%2) \n\t" - : "=m"(tmp0) + : "=m"(*tmp0) : "r"(pix-3*stride), "r"(pix), "r"((long)stride), - "m"(tc), 
"m"(*(uint64_t*)mask), "m"(alpha1), "m"(beta1), - "m"(mm_bone), "m"(ff_pb_3F) + "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1), + "m"(mm_bone) ); } @@ -459,7 +450,7 @@ static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, in { //FIXME: could cut some load/stores by merging transpose with filter // also, it only needs to transpose 6x8 - uint8_t trans[8*8]; + DECLARE_ALIGNED_8(uint8_t, trans[8*8]); int i; for(i=0; i<2; i++, pix+=8*stride, tc0+=2) { if((tc0[0] & tc0[1]) < 0) @@ -503,7 +494,7 @@ static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { //FIXME: could cut some load/stores by merging transpose with filter - uint8_t trans[8*4]; + DECLARE_ALIGNED_8(uint8_t, trans[8*4]); transpose4x4(trans, pix-2, 8, stride); transpose4x4(trans+4, pix-2+4*stride, 8, stride); h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0); @@ -553,7 +544,7 @@ static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int a static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta) { //FIXME: could cut some load/stores by merging transpose with filter - uint8_t trans[8*4]; + DECLARE_ALIGNED_8(uint8_t, trans[8*4]); transpose4x4(trans, pix-2, 8, stride); transpose4x4(trans+4, pix-2+4*stride, 8, stride); h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1); @@ -561,6 +552,101 @@ static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int a transpose4x4(pix-2+4*stride, trans+4, stride, 8); } +static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2], + int bidir, int edges, int step, int mask_mv0, int mask_mv1 ) { + int dir; + asm volatile( + "pxor %%mm7, %%mm7 \n\t" + "movq %0, %%mm6 \n\t" + "movq %1, %%mm5 \n\t" + "movq %2, %%mm4 \n\t" + ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7) + ); + // could do a special case for dir==0 && edges==1, but it only reduces the + // average filter time by 1.2% + for( dir=1; dir>=0; dir-- ) { + const int d_idx = dir ? -8 : -1; + const int mask_mv = dir ? mask_mv1 : mask_mv0; + DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 
0 : 0xffffffffffffffffULL; + int b_idx, edge, l; + for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) { + asm volatile( + "pand %0, %%mm0 \n\t" + ::"m"(mask_dir) + ); + if(!(mask_mv & edge)) { + asm volatile("pxor %%mm0, %%mm0 \n\t":); + for( l = bidir; l >= 0; l-- ) { + asm volatile( + "movd %0, %%mm1 \n\t" + "punpckldq %1, %%mm1 \n\t" + "movq %%mm1, %%mm2 \n\t" + "psrlw $7, %%mm2 \n\t" + "pand %%mm6, %%mm2 \n\t" + "por %%mm2, %%mm1 \n\t" // ref_cache with -2 mapped to -1 + "punpckldq %%mm1, %%mm2 \n\t" + "pcmpeqb %%mm2, %%mm1 \n\t" + "paddb %%mm6, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] + "por %%mm1, %%mm0 \n\t" + + "movq %2, %%mm1 \n\t" + "movq %3, %%mm2 \n\t" + "psubw %4, %%mm1 \n\t" + "psubw %5, %%mm2 \n\t" + "packsswb %%mm2, %%mm1 \n\t" + "paddb %%mm5, %%mm1 \n\t" + "pminub %%mm4, %%mm1 \n\t" + "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit + "por %%mm1, %%mm0 \n\t" + ::"m"(ref[l][b_idx]), + "m"(ref[l][b_idx+d_idx]), + "m"(mv[l][b_idx][0]), + "m"(mv[l][b_idx+2][0]), + "m"(mv[l][b_idx+d_idx][0]), + "m"(mv[l][b_idx+d_idx+2][0]) + ); + } + } + asm volatile( + "movd %0, %%mm1 \n\t" + "por %1, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "pcmpgtw %%mm7, %%mm1 \n\t" // nnz[b] || nnz[bn] + ::"m"(nnz[b_idx]), + "m"(nnz[b_idx+d_idx]) + ); + asm volatile( + "pcmpeqw %%mm7, %%mm0 \n\t" + "pcmpeqw %%mm7, %%mm0 \n\t" + "psrlw $15, %%mm0 \n\t" // nonzero -> 1 + "psrlw $14, %%mm1 \n\t" + "movq %%mm0, %%mm2 \n\t" + "por %%mm1, %%mm2 \n\t" + "psrlw $1, %%mm1 \n\t" + "pandn %%mm2, %%mm1 \n\t" + "movq %%mm1, %0 \n\t" + :"=m"(*bS[dir][edge]) + ::"memory" + ); + } + edges = 4; + step = 1; + } + asm volatile( + "movq (%0), %%mm0 \n\t" + "movq 8(%0), %%mm1 \n\t" + "movq 16(%0), %%mm2 \n\t" + "movq 24(%0), %%mm3 \n\t" + TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4) + "movq %%mm0, (%0) \n\t" + "movq %%mm3, 8(%0) \n\t" + "movq %%mm4, 16(%0) \n\t" + "movq %%mm2, 24(%0) \n\t" + ::"r"(bS[0]) + :"memory" + ); +} /***********************************/ /* motion compensation */ diff --git a/src/libffmpeg/libavcodec/i386/idct_mmx.c b/src/libffmpeg/libavcodec/i386/idct_mmx.c index 1c8632fb7..ba595845a 100644 --- a/src/libffmpeg/libavcodec/i386/idct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/idct_mmx.c @@ -1,7 +1,4 @@ /* - * Note: For libavcodec, this code can also be used under the LGPL license - */ -/* * idct_mmx.c * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca> * diff --git a/src/libffmpeg/libavcodec/i386/idct_mmx_xvid.c b/src/libffmpeg/libavcodec/i386/idct_mmx_xvid.c index a55d4ea07..59b255943 100644 --- a/src/libffmpeg/libavcodec/i386/idct_mmx_xvid.c +++ b/src/libffmpeg/libavcodec/i386/idct_mmx_xvid.c @@ -5,22 +5,23 @@ // * // * Copyright(C) 2001 Peter Ross <pross@xvid.org> // * -// * This program is free software; you can redistribute it and/or modify it -// * under the terms of the GNU General Public License as published by -// * the Free Software Foundation; either version 2 of the License, or -// * (at your option) any later version. +// * This file is part of FFmpeg. // * -// * This program is distributed in the hope that it will be useful, -// * but WITHOUT ANY WARRANTY; without even the implied warranty of -// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// * GNU General Public License for more details. 
+// * FFmpeg is free software; you can redistribute it and/or +// * modify it under the terms of the GNU Lesser General Public +// * License as published by the Free Software Foundation; either +// * version 2.1 of the License, or (at your option) any later version. // * -// * You should have received a copy of the GNU General Public License -// * along with this program; if not, write to the Free Software Foundation, -// * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -// +// * FFmpeg is distributed in the hope that it will be useful, +// * but WITHOUT ANY WARRANTY; without even the implied warranty of +// * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// * Lesser General Public License for more details. +// * +// * You should have received a copy of the GNU Lesser General Public License +// * along with FFmpeg; if not, write to the Free Software Foundation, +// * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA // * -// * $Id: idct_mmx_xvid.c,v 1.3 2006/08/02 07:02:41 tmmm Exp $ +// * $Id: idct_mmx_xvid.c,v 1.4 2006/12/04 22:25:26 miguelfreitas Exp $ // * // ***************************************************************************/ @@ -295,17 +296,17 @@ static const int16_t tab_i_04_xmm[32*4] attribute_used __attribute__ ((aligned(8 "movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\ "movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\ "movq " #A3 ",%%mm3 \n\t"/* 3 ; w05 w04 w01 w00*/\ - "pshufw $0b10001000,%%mm0,%%mm0 \n\t"/* x2 x0 x2 x0*/\ + "pshufw $0x88,%%mm0,%%mm0 \n\t"/* x2 x0 x2 x0*/\ "movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w06 w03 w02*/\ "movq %%mm1,%%mm5 \n\t"/* 5 ; x7 x6 x5 x4*/\ "pmaddwd %%mm0,%%mm3 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\ "movq 32+" #A3 ",%%mm6 \n\t"/* 6 ; w21 w20 w17 w16*/\ - "pshufw $0b10001000,%%mm1,%%mm1 \n\t"/* x6 x4 x6 x4*/\ + "pshufw $0x88,%%mm1,%%mm1 \n\t"/* x6 x4 x6 x4*/\ "pmaddwd %%mm1,%%mm4 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\ "movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w22 w19 w18*/\ - "pshufw $0b11011101,%%mm2,%%mm2 \n\t"/* x3 x1 x3 x1*/\ + "pshufw $0xdd,%%mm2,%%mm2 \n\t"/* x3 x1 x3 x1*/\ "pmaddwd %%mm2,%%mm6 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\ - "pshufw $0b11011101,%%mm5,%%mm5 \n\t"/* x7 x5 x7 x5*/\ + "pshufw $0xdd,%%mm5,%%mm5 \n\t"/* x7 x5 x7 x5*/\ "pmaddwd %%mm5,%%mm7 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\ "paddd " #A4 ",%%mm3 \n\t"/* +%4*/\ "pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\ @@ -330,7 +331,7 @@ static const int16_t tab_i_04_xmm[32*4] attribute_used __attribute__ ((aligned(8 "packssdw %%mm0,%%mm3 \n\t"/* 0 ; y3 y2 y1 y0*/\ "packssdw %%mm4,%%mm7 \n\t"/* 4 ; y6 y7 y4 y5*/\ "movq %%mm3, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\ - "pshufw $0b10110001,%%mm7,%%mm7 \n\t"/* y7 y6 y5 y4*/\ + "pshufw $0xb1,%%mm7,%%mm7 \n\t"/* y7 y6 y5 y4*/\ "movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\ diff --git a/src/libffmpeg/libavcodec/i386/mmx.h b/src/libffmpeg/libavcodec/i386/mmx.h index eab051341..41aae6c21 100644 --- a/src/libffmpeg/libavcodec/i386/mmx.h +++ b/src/libffmpeg/libavcodec/i386/mmx.h @@ -1,6 +1,22 @@ /* * mmx.h * Copyright (C) 1997-2001 H. Dietz and R. Fisher + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef AVCODEC_I386MMX_H #define AVCODEC_I386MMX_H @@ -184,16 +200,16 @@ typedef union { #define mmx_m2ri(op,mem,reg,imm) \ __asm__ __volatile__ (#op " %1, %0, %%" #reg \ : /* nothing */ \ - : "X" (mem), "X" (imm)) + : "m" (mem), "i" (imm)) #define mmx_r2ri(op,regs,regd,imm) \ __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \ : /* nothing */ \ - : "X" (imm) ) + : "i" (imm) ) #define mmx_fetch(mem,hint) \ __asm__ __volatile__ ("prefetch" #hint " %0" \ : /* nothing */ \ - : "X" (mem)) + : "m" (mem)) #define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg) diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c index edcabcf38..e33870e0f 100644 --- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c +++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c @@ -3,18 +3,20 @@ * Copyright (c) 2001 Fabrice Bellard. * Copyright (c) 2002-2004 Michael Niedermayer * - * This library is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * version 2.1 of the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * mostly by Michael Niedermayer <michaelni@gmx.at> @@ -34,7 +36,7 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) { long len= -(stride*h); asm volatile( - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t" "movq (%2, %%"REG_a"), %%mm2 \n\t" @@ -70,7 +72,7 @@ static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) { long len= -(stride*h); asm volatile( - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t" "movq (%2, %%"REG_a"), %%mm2 \n\t" @@ -92,7 +94,7 @@ static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, in { long len= -(stride*h); asm volatile( - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t" "movq (%2, %%"REG_a"), %%mm2 \n\t" @@ -118,7 +120,7 @@ static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) { //FIXME reuse src long len= -(stride*h); asm volatile( - ".balign 16 \n\t" + ASMALIGN(4) "movq "MANGLE(bone)", %%mm5 \n\t" "1: \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t" @@ -155,7 +157,7 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int { long len= -(stride*h); asm volatile( - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t" "movq (%2, %%"REG_a"), %%mm1 \n\t" @@ -193,7 +195,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) { long len= -(stride*h); asm volatile( - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t" "movq (%2, %%"REG_a"), %%mm1 \n\t" diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c index c00a602bd..1b7b1c19f 100644 --- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c @@ -2,18 +2,20 @@ * The simplest mpeg encoder (well, it was the simplest!) * Copyright (c) 2000,2001 Fabrice Bellard. * - * This library is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * version 2.1 of the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru> @@ -25,7 +27,6 @@ #include "../avcodec.h" #include "x86_cpu.h" -extern uint8_t zigzag_direct_noperm[64]; extern uint16_t inv_zigzag_direct16[64]; static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; @@ -66,7 +67,7 @@ asm volatile( "packssdw %%mm5, %%mm5 \n\t" "psubw %%mm5, %%mm7 \n\t" "pxor %%mm4, %%mm4 \n\t" - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "movq (%0, %3), %%mm0 \n\t" "movq 8(%0, %3), %%mm1 \n\t" @@ -129,7 +130,7 @@ asm volatile( "packssdw %%mm5, %%mm5 \n\t" "psubw %%mm5, %%mm7 \n\t" "pxor %%mm4, %%mm4 \n\t" - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "movq (%0, %3), %%mm0 \n\t" "movq 8(%0, %3), %%mm1 \n\t" @@ -222,7 +223,7 @@ asm volatile( "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "mov %3, %%"REG_a" \n\t" - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "movq (%0, %%"REG_a"), %%mm0 \n\t" "movq 8(%0, %%"REG_a"), %%mm1 \n\t" @@ -285,7 +286,7 @@ asm volatile( "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "mov %3, %%"REG_a" \n\t" - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "movq (%0, %%"REG_a"), %%mm0 \n\t" "movq 8(%0, %%"REG_a"), %%mm1 \n\t" @@ -357,7 +358,7 @@ asm volatile( "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "mov %3, %%"REG_a" \n\t" - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "movq (%0, %%"REG_a"), %%mm0 \n\t" "movq 8(%0, %%"REG_a"), %%mm1 \n\t" @@ -418,7 +419,7 @@ asm volatile( "packssdw %%mm6, %%mm6 \n\t" "packssdw %%mm6, %%mm6 \n\t" "mov %3, %%"REG_a" \n\t" - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "movq (%0, %%"REG_a"), %%mm0 \n\t" "movq 8(%0, %%"REG_a"), %%mm1 \n\t" diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c index de2ef08e5..d59b6efd9 100644 --- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c @@ -3,18 +3,20 @@ * * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> * - * This library is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * version 2.1 of the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #undef SPREADW @@ -74,7 +76,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, asm volatile ( "mul %%ecx \n\t" : "=d" (level), "=a"(dummy) - : "a" ((block[0]>>2) + q), "c" (inverse[q<<1]) + : "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1]) ); #else asm volatile ( @@ -112,7 +114,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "pxor %%mm6, %%mm6 \n\t" "psubw (%3), %%mm6 \n\t" // -bias[0] "mov $-128, %%"REG_a" \n\t" - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "pxor %%mm1, %%mm1 \n\t" // 0 "movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i] @@ -156,7 +158,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "pxor %%mm7, %%mm7 \n\t" // 0 "pxor %%mm4, %%mm4 \n\t" // 0 "mov $-128, %%"REG_a" \n\t" - ".balign 16 \n\t" + ASMALIGN(4) "1: \n\t" "pxor %%mm1, %%mm1 \n\t" // 0 "movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i] diff --git a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c index b033a12b8..525ef34f7 100644 --- a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c @@ -3,18 +3,20 @@ * * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> * - * This library is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * version 2.1 of the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. 
* * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "../dsputil.h" @@ -281,7 +283,7 @@ static inline void idct(int16_t *block) "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ "movq %%mm4, 16+" #dst " \n\t"\ -#define COL_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ +#define COL_IDCT(src0, src4, src1, src5, dst, shift) \ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ @@ -294,10 +296,8 @@ static inline void idct(int16_t *block) "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - #rounder ", %%mm4 \n\t"\ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - #rounder ", %%mm0 \n\t"\ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ @@ -458,11 +458,11 @@ DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) -//IDCT( src0, src4, src1, src5, dst, rounder, shift) -COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) +//IDCT( src0, src4, src1, src5, dst, shift) +COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) #else @@ -705,7 +705,7 @@ Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) #undef IDCT -#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ +#define IDCT(src0, src4, src1, src5, dst, shift) \ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ @@ -718,10 +718,8 @@ Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - #rounder ", %%mm4 \n\t"\ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - #rounder ", %%mm0 \n\t"\ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ @@ -782,20 +780,20 @@ Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) "movd %%mm5, 80+" #dst " \n\t" -//IDCT( src0, src4, src1, src5, dst, rounder, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +IDCT( 24(%1), 88(%1), 56(%1), 
120(%1), 12(%0), 20) "jmp 9f \n\t" - "#.balign 16 \n\t"\ + "#" ASMALIGN(4) \ "4: \n\t" Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) #undef IDCT -#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ +#define IDCT(src0, src4, src1, src5, dst, shift) \ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ @@ -807,9 +805,7 @@ Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - #rounder ", %%mm4 \n\t"\ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - #rounder ", %%mm0 \n\t"\ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ @@ -859,28 +855,26 @@ Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) "movd %%mm1, 64+" #dst " \n\t"\ "movd %%mm5, 80+" #dst " \n\t" -//IDCT( src0, src4, src1, src5, dst, rounder, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) "jmp 9f \n\t" - "#.balign 16 \n\t"\ + "#" ASMALIGN(4) \ "6: \n\t" Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) #undef IDCT -#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ +#define IDCT(src0, src4, src1, src5, dst, shift) \ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - #rounder ", %%mm4 \n\t"\ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ - #rounder ", %%mm0 \n\t"\ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ @@ -927,19 +921,19 @@ Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) "movd %%mm5, 80+" #dst " \n\t" -//IDCT( src0, src4, src1, src5, dst, rounder, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) "jmp 9f \n\t" - "#.balign 16 \n\t"\ + "#" ASMALIGN(4) \ "2: \n\t" Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) #undef IDCT -#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ +#define IDCT(src0, src4, src1, src5, dst, shift) \ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ @@ -947,10 
+941,8 @@ Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - #rounder ", %%mm4 \n\t"\ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - #rounder ", %%mm0 \n\t"\ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ @@ -1006,27 +998,25 @@ Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) "movd %%mm4, 64+" #dst " \n\t"\ "movd %%mm5, 80+" #dst " \n\t" -//IDCT( src0, src4, src1, src5, dst, rounder, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) "jmp 9f \n\t" - "#.balign 16 \n\t"\ + "#" ASMALIGN(4) \ "3: \n\t" #undef IDCT -#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ +#define IDCT(src0, src4, src1, src5, dst, shift) \ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - #rounder ", %%mm4 \n\t"\ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - #rounder ", %%mm0 \n\t"\ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ "movq 64(%2), %%mm3 \n\t"\ @@ -1072,17 +1062,17 @@ IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) "movd %%mm5, 80+" #dst " \n\t" -//IDCT( src0, src4, src1, src5, dst, rounder, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) "jmp 9f \n\t" - "#.balign 16 \n\t"\ + "#" ASMALIGN(4) \ "5: \n\t" #undef IDCT -#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ +#define IDCT(src0, src4, src1, src5, dst, shift) \ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ @@ -1093,10 +1083,8 @@ IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - #rounder ", %%mm4 \n\t"\ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ - #rounder ", %%mm0 \n\t"\ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ @@ -1110,10 +1098,8 @@ IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 
12(%0),/nop, 20) "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - #rounder ", %%mm1 \n\t"\ "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ - #rounder ", %%mm2 \n\t"\ "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ @@ -1140,18 +1126,18 @@ IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) "movq %%mm5, 80+" #dst " \n\t" -//IDCT( src0, src4, src1, src5, dst, rounder, shift) -IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) "jmp 9f \n\t" - "#.balign 16 \n\t"\ + "#" ASMALIGN(4) \ "1: \n\t" #undef IDCT -#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ +#define IDCT(src0, src4, src1, src5, dst, shift) \ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ @@ -1163,10 +1149,8 @@ IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ - #rounder ", %%mm4 \n\t"\ "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ - #rounder ", %%mm0 \n\t"\ "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ @@ -1216,25 +1200,23 @@ IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) "movd %%mm5, 80+" #dst " \n\t" -//IDCT( src0, src4, src1, src5, dst, rounder, shift) -IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) "jmp 9f \n\t" - "#.balign 16 \n\t" + "#" ASMALIGN(4) "7: \n\t" #undef IDCT -#define IDCT(src0, src4, src1, src5, dst, rounder, shift) \ +#define IDCT(src0, src4, src1, src5, dst, shift) \ "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ - #rounder ", %%mm4 \n\t"\ - #rounder ", %%mm0 \n\t"\ "psrad $" #shift ", %%mm4 \n\t"\ "psrad $" #shift ", %%mm0 \n\t"\ "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ @@ -1243,8 +1225,6 @@ IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ - #rounder ", %%mm1 \n\t"\ - #rounder ", %%mm2 \n\t"\ "psrad $" #shift ", %%mm1 \n\t"\ "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ "movq %%mm4, " #dst " \n\t"\ @@ 
-1258,11 +1238,11 @@ IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) "movq %%mm4, 64+" #dst " \n\t"\ "movq %%mm0, 80+" #dst " \n\t" -//IDCT( src0, src4, src1, src5, dst, rounder, shift) -IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0),/nop, 20) -//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0),/nop, 20) -IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0),/nop, 20) -//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0),/nop, 20) +//IDCT( src0, src4, src1, src5, dst, shift) +IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) +//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) +IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) +//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) #endif diff --git a/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c b/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c index 0684531ae..f715dc803 100644 --- a/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c +++ b/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c @@ -1,18 +1,20 @@ /* * Copyright (C) 2004 the ffmpeg project * - * This library is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * version 2.1 of the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ diff --git a/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c b/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c index cf822f7d4..bd2911d59 100644 --- a/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c +++ b/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c @@ -1,18 +1,20 @@ /* * Copyright (C) 2004 the ffmpeg project * - * This library is free software; you can redistribute it and/or + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. + * version 2.1 of the License, or (at your option) any later version. * - * This library is distributed in the hope that it will be useful, + * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software + * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ |
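
The new SSE code in fft_sse.c and the ff_imdct_calc_sse pre-rotation loop each handle two complex products per iteration, as the comment in the patch notes. A plain-C sketch of what one pre-rotation step computes may help when reading the shuffles: the reversed input pair is multiplied by the tcos/tsin twiddle and scattered through revtab. The FFTComplexRef type and the in1/in2 walking below are assumptions made for illustration; they mirror the operands fed to the asm block, not the committed code itself.

typedef struct { float re, im; } FFTComplexRef;

/* Scalar sketch of the IMDCT pre-rotation: in2 walks backward over the
 * "real" inputs, in1 walks forward over the "imaginary" ones, and each
 * pair is rotated by (tcos[k], tsin[k]) before being bit-reversed into z. */
static void imdct_prerotate_ref(FFTComplexRef *z, const float *input,
                                const float *tcos, const float *tsin,
                                const unsigned short *revtab, int n)
{
    int n2 = n >> 1, n4 = n >> 2, k;
    const float *in1 = input;           /* steps forward by 2 */
    const float *in2 = input + n2 - 1;  /* steps backward by 2 */

    for (k = 0; k < n4; k++) {
        float re = *in2, im = *in1;
        /* complex multiply (re + i*im) * (tcos[k] + i*tsin[k]) */
        z[revtab[k]].re = re * tcos[k] - im * tsin[k];
        z[revtab[k]].im = re * tsin[k] + im * tcos[k];
        in1 += 2;
        in2 -= 2;
    }
}

The SSE version does the same arithmetic four floats at a time: the shufps/unpcklps sequence builds (R I R I) and (R -I R -I) vectors from tcos/tsin, and the xorps with p1m1p1m1 supplies the sign flip that the scalar subtraction expresses directly.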
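
In h264dsp_mmx.c, the rewritten H264_DEBLOCK_P0_Q0 macro replaces the old word-widening arithmetic with pavgb averages so the whole p0/q0 update stays in 8 bits. For reference, the per-pixel operation it performs is the usual H.264 luma edge update; the sketch below spells it out in scalar C (the clip helper is an assumption added for the example, not part of the patch).

#include <stdint.h>

static inline int clip3(int x, int lo, int hi)
{
    return x < lo ? lo : (x > hi ? hi : x);
}

/* Scalar form of the p0/q0 update: delta is clipped to [-tc0, tc0] and
 * applied with opposite signs to the two pixels adjacent to the edge. */
static void deblock_p0_q0_ref(uint8_t *p0, uint8_t *q0, int p1, int q1, int tc0)
{
    int delta = clip3((((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3, -tc0, tc0);
    *p0 = (uint8_t)clip3(*p0 + delta, 0, 255);
    *q0 = (uint8_t)clip3(*q0 - delta, 0, 255);
}

The MMX version reaches the same delta without sign extension: pavgb computes (a + b + 1) >> 1, so chaining averages rebuilds the ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3 expression with a constant bias (0xA1) that the psubusb pair against ff_pb_A1 removes before clipping against tc0.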
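
Elsewhere the patch replaces hard-coded ".balign 16" directives inside inline asm with ASMALIGN(4). The point is portability: .balign is not understood by every assembler, and on some targets .align takes a power-of-two exponent rather than a byte count, so the directive has to be chosen at configure time. A hypothetical definition, shown only to illustrate the idea (the real macro comes from the build system's config headers), could look like:

/* Hypothetical ASMALIGN: align the next instruction to 1 << ZEROBITS bytes.
 * GNU as accepts .p2align; other assemblers may need .align or .balign
 * spelled differently, which is why the project probes for it at build time. */
#ifdef __GNUC__
#  define ASMALIGN(ZEROBITS) ".p2align " #ZEROBITS "\n\t"
#else
#  define ASMALIGN(ZEROBITS) ".align " #ZEROBITS "\n\t"
#endif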