Diffstat (limited to 'src/libffmpeg/libavcodec/i386/dsputil_mmx.c')
-rw-r--r-- | src/libffmpeg/libavcodec/i386/dsputil_mmx.c | 432
1 files changed, 390 insertions, 42 deletions
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index a2cbab8ce..673e749c4 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -3,18 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard.
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This library is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
- * This library is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
@@ -29,7 +31,6 @@
 //#undef NDEBUG
 //#include <assert.h>
 
-extern const uint8_t ff_h263_loop_filter_strength[32];
 extern void ff_idct_xvid_mmx(short *block);
 extern void ff_idct_xvid_mmx2(short *block);
 
@@ -40,6 +41,9 @@ static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x01
 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
 
+static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) =
+{0x8000000080000000ULL, 0x8000000080000000ULL};
+
 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
 static const uint64_t ff_pw_3  attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
 static const uint64_t ff_pw_4  attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
@@ -50,10 +54,15 @@ static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0
 static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
 
+static const uint64_t ff_pb_1  attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
+static const uint64_t ff_pb_3  attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
+static const uint64_t ff_pb_7  attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
 static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
+static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL;
+static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
 
-#define JUMPALIGN() __asm __volatile (".balign 8"::)
+#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
 #define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
 
 #define MOVQ_WONE(regd) \
@@ -201,7 +210,7 @@ static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
     asm volatile(
         "mov $-128, %%"REG_a"           \n\t"
         "pxor %%mm7, %%mm7              \n\t"
-        ".balign 16                     \n\t"
+        ASMALIGN(4)
         "1:                             \n\t"
         "movq (%0), %%mm0               \n\t"
         "movq (%0, %2), %%mm2           \n\t"
@@ -229,7 +238,7 @@ static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint
     asm volatile(
         "pxor %%mm7, %%mm7              \n\t"
         "mov $-128, %%"REG_a"           \n\t"
-        ".balign 16                     \n\t"
+        ASMALIGN(4)
         "1:                             \n\t"
         "movq (%0), %%mm0               \n\t"
         "movq (%1), %%mm2               \n\t"
@@ -372,7 +381,7 @@ static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size
 {
     __asm __volatile(
         "lea (%3, %3), %%"REG_a"        \n\t"
-        ".balign 8                      \n\t"
+        ASMALIGN(3)
         "1:                             \n\t"
         "movd (%1), %%mm0               \n\t"
         "movd (%1, %3), %%mm1           \n\t"
@@ -398,7 +407,7 @@ static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size
 {
     __asm __volatile(
         "lea (%3, %3), %%"REG_a"        \n\t"
-        ".balign 8                      \n\t"
+        ASMALIGN(3)
         "1:                             \n\t"
         "movq (%1), %%mm0               \n\t"
         "movq (%1, %3), %%mm1           \n\t"
@@ -424,7 +433,7 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_siz
 {
     __asm __volatile(
         "lea (%3, %3), %%"REG_a"        \n\t"
-        ".balign 8                      \n\t"
+        ASMALIGN(3)
         "1:                             \n\t"
         "movq (%1), %%mm0               \n\t"
         "movq 8(%1), %%mm4              \n\t"
@@ -625,22 +634,10 @@ static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
 
 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
     asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
-        "movd  %0, %%mm0                \n\t"
-        "movd  %1, %%mm1                \n\t"
-        "movd  %2, %%mm2                \n\t"
-
-        :
-        : "m" (*(uint32_t*)(src + 0*src_stride)),
-          "m" (*(uint32_t*)(src + 1*src_stride)),
-          "m" (*(uint32_t*)(src + 2*src_stride))
-    );
-    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
-        "movd  %0, %%mm3                \n\t"
-
-        :
-        : "m" (*(uint32_t*)(src + 3*src_stride))
-    );
-    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
+        "movd  %4, %%mm0                \n\t"
+        "movd  %5, %%mm1                \n\t"
+        "movd  %6, %%mm2                \n\t"
+        "movd  %7, %%mm3                \n\t"
         "punpcklbw %%mm1, %%mm0         \n\t"
         "punpcklbw %%mm3, %%mm2         \n\t"
         "movq %%mm0, %%mm1              \n\t"
@@ -657,6 +654,10 @@ static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int
           "=m" (*(uint32_t*)(dst + 1*dst_stride)),
           "=m" (*(uint32_t*)(dst + 2*dst_stride)),
           "=m" (*(uint32_t*)(dst + 3*dst_stride))
+        : "m" (*(uint32_t*)(src + 0*src_stride)),
+          "m" (*(uint32_t*)(src + 1*src_stride)),
+          "m" (*(uint32_t*)(src + 2*src_stride)),
+          "m" (*(uint32_t*)(src + 3*src_stride))
     );
 }
 
@@ -1185,8 +1186,8 @@ static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, in
     else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
     score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
 
-    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
-    else  return score1 + ABS(score2)*8;
+    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
+    else  return score1 + FFABS(score2)*8;
 }
 
 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
@@ -1194,8 +1195,8 @@ static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int
     int score1= sse8_mmx(c, pix1, pix2, line_size, h);
     int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
 
-    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
-    else  return score1 + ABS(score2)*8;
+    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
+    else  return score1 + FFABS(score2)*8;
 }
 
 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
@@ -2403,6 +2404,53 @@ QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
 QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
 
+/***********************************/
+/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
+
+#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
+}
+#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
+}
+
+#define QPEL_2TAP(OPNAME, SIZE, MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
+    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
+    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
+    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
+}\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
+}\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,         1,       0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,        -1,       0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,         stride,  0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,   -stride,  0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,         stride,  1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,         stride, -1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,   -stride,  1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
+
+QPEL_2TAP(put_, 16, mmx2)
+QPEL_2TAP(avg_, 16, mmx2)
+QPEL_2TAP(put_,  8, mmx2)
+QPEL_2TAP(avg_,  8, mmx2)
+QPEL_2TAP(put_, 16, 3dnow)
+QPEL_2TAP(avg_, 16, 3dnow)
+QPEL_2TAP(put_,  8, 3dnow)
+QPEL_2TAP(avg_,  8, 3dnow)
+
+
 #if 0
 static void just_return() { return; }
 #endif
@@ -2523,9 +2571,10 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o
                     "m"(src[stride]), "m"(src[stride+1]),
                     "m"(*r4), "m"(shift2)
                 );
-
+
                 asm volatile(
                     "movd %%mm0, %0 \n\t"
+
                     : "=m"(dst[x+y*stride])
                     :
                 );
@@ -2535,10 +2584,11 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o
     }
 }
 
+#ifdef CONFIG_ENCODERS
 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
     long i=0;
 
-    assert(ABS(scale) < 256);
+    assert(FFABS(scale) < 256);
     scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
 
     asm volatile(
@@ -2586,7 +2636,7 @@ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[6
 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
     long i=0;
 
-    if(ABS(scale) < 256){
+    if(FFABS(scale) < 256){
         scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
         asm volatile(
                 "pcmpeqw %%mm6, %%mm6           \n\t" // -1w
@@ -2620,9 +2670,10 @@ static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
         }
     }
 }
+#endif /* CONFIG_ENCODERS */
 
 #define PREFETCH(name, op) \
-void name(void *mem, int stride, int h){\
+static void name(void *mem, int stride, int h){\
     const uint8_t *p= mem;\
     do{\
         asm volatile(#op" %0" :: "m"(*p));\
@@ -2661,6 +2712,7 @@ void ff_vp3_dsp_init_mmx(void);
 
 /* XXX: those functions should be suppressed ASAP when all IDCTs are
    converted */
+#ifdef CONFIG_GPL
 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
 {
     ff_mmx_idct (block);
@@ -2681,6 +2733,7 @@ static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *bloc
     ff_mmxext_idct (block);
     add_pixels_clamped_mmx(block, dest, line_size);
 }
+#endif
 static void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
 {
     ff_vp3_idct_sse2(block);
@@ -2701,7 +2754,6 @@ static void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
     ff_vp3_idct_mmx(block);
     add_pixels_clamped_mmx(block, dest, line_size);
 }
-#ifdef CONFIG_GPL
 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
 {
     ff_idct_xvid_mmx (block);
@@ -2722,7 +2774,274 @@ static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
     ff_idct_xvid_mmx2 (block);
     add_pixels_clamped_mmx(block, dest, line_size);
 }
-#endif
+
+static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
+{
+    int i;
+    asm volatile("pxor %%mm7, %%mm7":);
+    for(i=0; i<blocksize; i+=2) {
+        asm volatile(
+            "movq    %0,     %%mm0 \n\t"
+            "movq    %1,     %%mm1 \n\t"
+            "movq    %%mm0,  %%mm2 \n\t"
+            "movq    %%mm1,  %%mm3 \n\t"
+            "pfcmpge %%mm7,  %%mm2 \n\t" // m <= 0.0
+            "pfcmpge %%mm7,  %%mm3 \n\t" // a <= 0.0
+            "pslld   $31,    %%mm2 \n\t" // keep only the sign bit
+            "pxor    %%mm2,  %%mm1 \n\t"
+            "movq    %%mm3,  %%mm4 \n\t"
+            "pand    %%mm1,  %%mm3 \n\t"
+            "pandn   %%mm1,  %%mm4 \n\t"
+            "pfadd   %%mm0,  %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
+            "pfsub   %%mm4,  %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
+            "movq    %%mm3,  %1    \n\t"
+            "movq    %%mm0,  %0    \n\t"
+            :"+m"(mag[i]), "+m"(ang[i])
+            ::"memory"
+        );
+    }
+    asm volatile("femms");
+}
+static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
+{
+    int i;
+
+    asm volatile(
+            "movaps  %0,     %%xmm5 \n\t"
+        ::"m"(ff_pdw_80000000[0])
+    );
+    for(i=0; i<blocksize; i+=4) {
+        asm volatile(
+            "movaps  %0,     %%xmm0 \n\t"
+            "movaps  %1,     %%xmm1 \n\t"
+            "xorps   %%xmm2, %%xmm2 \n\t"
+            "xorps   %%xmm3, %%xmm3 \n\t"
+            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
+            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
+            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
+            "xorps   %%xmm2, %%xmm1 \n\t"
+            "movaps  %%xmm3, %%xmm4 \n\t"
+            "andps   %%xmm1, %%xmm3 \n\t"
+            "andnps  %%xmm1, %%xmm4 \n\t"
+            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
+            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
+            "movaps  %%xmm3, %1     \n\t"
+            "movaps  %%xmm0, %0     \n\t"
+            :"+m"(mag[i]), "+m"(ang[i])
+            ::"memory"
+        );
+    }
+}
+
+static void vector_fmul_3dnow(float *dst, const float *src, int len){
+    long i = (len-4)*4;
+    asm volatile(
+        "1: \n\t"
+        "movq    (%1,%0), %%mm0 \n\t"
+        "movq   8(%1,%0), %%mm1 \n\t"
+        "pfmul   (%2,%0), %%mm0 \n\t"
+        "pfmul  8(%2,%0), %%mm1 \n\t"
+        "movq   %%mm0,  (%1,%0) \n\t"
+        "movq   %%mm1, 8(%1,%0) \n\t"
+        "sub  $16, %0 \n\t"
+        "jge  1b \n\t"
+        "femms \n\t"
+        :"+r"(i)
+        :"r"(dst), "r"(src)
+        :"memory"
+    );
+}
+static void vector_fmul_sse(float *dst, const float *src, int len){
+    long i = (len-8)*4;
+    asm volatile(
+        "1: \n\t"
+        "movaps   (%1,%0), %%xmm0 \n\t"
+        "movaps 16(%1,%0), %%xmm1 \n\t"
+        "mulps    (%2,%0), %%xmm0 \n\t"
+        "mulps  16(%2,%0), %%xmm1 \n\t"
+        "movaps %%xmm0,   (%1,%0) \n\t"
+        "movaps %%xmm1, 16(%1,%0) \n\t"
+        "sub  $32, %0 \n\t"
+        "jge  1b \n\t"
+        :"+r"(i)
+        :"r"(dst), "r"(src)
+        :"memory"
+    );
+}
+
+static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
+    long i = len*4-16;
+    asm volatile(
+        "1: \n\t"
+        "pswapd  8(%1), %%mm0 \n\t"
+        "pswapd   (%1), %%mm1 \n\t"
+        "pfmul  (%3,%0), %%mm0 \n\t"
+        "pfmul 8(%3,%0), %%mm1 \n\t"
+        "movq  %%mm0,  (%2,%0) \n\t"
+        "movq  %%mm1, 8(%2,%0) \n\t"
+        "add   $16, %1 \n\t"
+        "sub   $16, %0 \n\t"
+        "jge   1b \n\t"
+        :"+r"(i), "+r"(src1)
+        :"r"(dst), "r"(src0)
+    );
+    asm volatile("femms");
+}
+static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
+    long i = len*4-32;
+    asm volatile(
+        "1: \n\t"
+        "movaps        16(%1), %%xmm0 \n\t"
+        "movaps          (%1), %%xmm1 \n\t"
+        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
+        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
+        "mulps      (%3,%0), %%xmm0 \n\t"
+        "mulps    16(%3,%0), %%xmm1 \n\t"
+        "movaps %%xmm0,   (%2,%0) \n\t"
+        "movaps %%xmm1, 16(%2,%0) \n\t"
+        "add    $32, %1 \n\t"
+        "sub    $32, %0 \n\t"
+        "jge    1b \n\t"
+        :"+r"(i), "+r"(src1)
+        :"r"(dst), "r"(src0)
+    );
+}
+
+static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
+                                      const float *src2, int src3, int len, int step){
+    long i = (len-4)*4;
+    if(step == 2 && src3 == 0){
+        dst += (len-4)*2;
+        asm volatile(
+            "1: \n\t"
+            "movq   (%2,%0),  %%mm0 \n\t"
+            "movq  8(%2,%0),  %%mm1 \n\t"
+            "pfmul  (%3,%0),  %%mm0 \n\t"
+            "pfmul 8(%3,%0),  %%mm1 \n\t"
+            "pfadd  (%4,%0),  %%mm0 \n\t"
+            "pfadd 8(%4,%0),  %%mm1 \n\t"
+            "movd     %%mm0,   (%1) \n\t"
+            "movd     %%mm1, 16(%1) \n\t"
+            "psrlq      $32,  %%mm0 \n\t"
+            "psrlq      $32,  %%mm1 \n\t"
+            "movd     %%mm0,  8(%1) \n\t"
+            "movd     %%mm1, 24(%1) \n\t"
+            "sub  $32, %1 \n\t"
+            "sub  $16, %0 \n\t"
+            "jge  1b \n\t"
+            :"+r"(i), "+r"(dst)
+            :"r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else if(step == 1 && src3 == 0){
+        asm volatile(
+            "1: \n\t"
+            "movq    (%2,%0), %%mm0 \n\t"
+            "movq   8(%2,%0), %%mm1 \n\t"
+            "pfmul   (%3,%0), %%mm0 \n\t"
+            "pfmul  8(%3,%0), %%mm1 \n\t"
+            "pfadd   (%4,%0), %%mm0 \n\t"
+            "pfadd  8(%4,%0), %%mm1 \n\t"
+            "movq  %%mm0,   (%1,%0) \n\t"
+            "movq  %%mm1,  8(%1,%0) \n\t"
+            "sub  $16, %0 \n\t"
+            "jge  1b \n\t"
+            :"+r"(i)
+            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else
+        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+    asm volatile("femms");
+}
+static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
+                                    const float *src2, int src3, int len, int step){
+    long i = (len-8)*4;
+    if(step == 2 && src3 == 0){
+        dst += (len-8)*2;
+        asm volatile(
+            "1: \n\t"
+            "movaps   (%2,%0), %%xmm0 \n\t"
+            "movaps 16(%2,%0), %%xmm1 \n\t"
+            "mulps    (%3,%0), %%xmm0 \n\t"
+            "mulps  16(%3,%0), %%xmm1 \n\t"
+            "addps    (%4,%0), %%xmm0 \n\t"
+            "addps  16(%4,%0), %%xmm1 \n\t"
+            "movss     %%xmm0,   (%1) \n\t"
+            "movss     %%xmm1, 32(%1) \n\t"
+            "movhlps   %%xmm0, %%xmm2 \n\t"
+            "movhlps   %%xmm1, %%xmm3 \n\t"
+            "movss     %%xmm2, 16(%1) \n\t"
+            "movss     %%xmm3, 48(%1) \n\t"
+            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
+            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
+            "movss     %%xmm0,  8(%1) \n\t"
+            "movss     %%xmm1, 40(%1) \n\t"
+            "movhlps   %%xmm0, %%xmm2 \n\t"
+            "movhlps   %%xmm1, %%xmm3 \n\t"
+            "movss     %%xmm2, 24(%1) \n\t"
+            "movss     %%xmm3, 56(%1) \n\t"
+            "sub  $64, %1 \n\t"
+            "sub  $32, %0 \n\t"
+            "jge  1b \n\t"
+            :"+r"(i), "+r"(dst)
+            :"r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else if(step == 1 && src3 == 0){
+        asm volatile(
+            "1: \n\t"
+            "movaps   (%2,%0), %%xmm0 \n\t"
+            "movaps 16(%2,%0), %%xmm1 \n\t"
+            "mulps    (%3,%0), %%xmm0 \n\t"
+            "mulps  16(%3,%0), %%xmm1 \n\t"
+            "addps    (%4,%0), %%xmm0 \n\t"
+            "addps  16(%4,%0), %%xmm1 \n\t"
+            "movaps  %%xmm0,   (%1,%0) \n\t"
+            "movaps  %%xmm1, 16(%1,%0) \n\t"
+            "sub  $32, %0 \n\t"
+            "jge  1b \n\t"
+            :"+r"(i)
+            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else
+        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+}
+
+static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
+    // not bit-exact: pf2id uses different rounding than C and SSE
+    int i;
+    for(i=0; i<len; i+=4) {
+        asm volatile(
+            "pf2id       %1, %%mm0 \n\t"
+            "pf2id       %2, %%mm1 \n\t"
+            "packssdw %%mm1, %%mm0 \n\t"
+            "movq     %%mm0,    %0 \n\t"
+            :"=m"(dst[i])
+            :"m"(src[i]), "m"(src[i+2])
+        );
+    }
+    asm volatile("femms");
+}
+static void float_to_int16_sse(int16_t *dst, const float *src, int len){
+    int i;
+    for(i=0; i<len; i+=4) {
+        asm volatile(
+            "cvtps2pi    %1, %%mm0 \n\t"
+            "cvtps2pi    %2, %%mm1 \n\t"
+            "packssdw %%mm1, %%mm0 \n\t"
+            "movq     %%mm0,    %0 \n\t"
+            :"=m"(dst[i])
+            :"m"(src[i]), "m"(src[i+2])
+        );
+    }
+    asm volatile("emms");
+}
 
 #ifdef CONFIG_SNOW_ENCODER
 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
@@ -2782,6 +3101,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                 c->idct_add= ff_simple_idct_add_mmx;
                 c->idct    = ff_simple_idct_mmx;
                 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
+#ifdef CONFIG_GPL
             }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
                 if(mm_flags & MM_MMXEXT){
                     c->idct_put= ff_libmpeg2mmx2_idct_put;
@@ -2793,8 +3113,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                     c->idct    = ff_mmx_idct;
                 }
                 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
-#if 0
-            }else if(idct_algo==FF_IDCT_VP3){
+#endif
+            }else if(idct_algo==FF_IDCT_VP3 &&
+                     avctx->codec->id!=CODEC_ID_THEORA &&
+                     !(avctx->flags & CODEC_FLAG_BITEXACT)){
                 if(mm_flags & MM_SSE2){
                     c->idct_put= ff_vp3_idct_put_sse2;
                     c->idct_add= ff_vp3_idct_add_sse2;
@@ -2807,10 +3129,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                     c->idct    = ff_vp3_idct_mmx;
                     c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
                 }
-#endif
            }else if(idct_algo==FF_IDCT_CAVS){
                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
-#ifdef CONFIG_GPL
            }else if(idct_algo==FF_IDCT_XVIDMMX){
                 if(mm_flags & MM_MMXEXT){
                     c->idct_put= ff_idct_xvid_mmx2_put;
@@ -2821,7 +3141,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                     c->idct_add= ff_idct_xvid_mmx_add;
                     c->idct    = ff_idct_xvid_mmx;
                 }
-#endif
             }
         }
 
@@ -3012,6 +3331,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         dspfunc(avg_h264_qpel, 0, 16);
         dspfunc(avg_h264_qpel, 1, 8);
         dspfunc(avg_h264_qpel, 2, 4);
+
+        dspfunc(put_2tap_qpel, 0, 16);
+        dspfunc(put_2tap_qpel, 1, 8);
+        dspfunc(avg_2tap_qpel, 0, 16);
+        dspfunc(avg_2tap_qpel, 1, 8);
 #undef dspfunc
 
         c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
@@ -3024,6 +3348,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
            c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
            c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
+           c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
 
            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
@@ -3134,6 +3459,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        dspfunc(avg_h264_qpel, 1, 8);
        dspfunc(avg_h264_qpel, 2, 4);
 
+        dspfunc(put_2tap_qpel, 0, 16);
+        dspfunc(put_2tap_qpel, 1, 8);
+        dspfunc(avg_2tap_qpel, 0, 16);
+        dspfunc(avg_2tap_qpel, 1, 8);
+
        c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
        c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
     }
@@ -3150,6 +3480,24 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
        }
 #endif
+
+    if(mm_flags & MM_3DNOW){
+        c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+        c->vector_fmul = vector_fmul_3dnow;
+        if(!(avctx->flags & CODEC_FLAG_BITEXACT))
+            c->float_to_int16 = float_to_int16_3dnow;
+    }
+    if(mm_flags & MM_3DNOWEXT)
+        c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
+    if(mm_flags & MM_SSE){
+        c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
+        c->vector_fmul = vector_fmul_sse;
+        c->float_to_int16 = float_to_int16_sse;
+        c->vector_fmul_reverse = vector_fmul_reverse_sse;
+        c->vector_fmul_add_add = vector_fmul_add_add_sse;
+    }
+    if(mm_flags & MM_3DNOW)
+        c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
 }
 
 #ifdef CONFIG_ENCODERS
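Of the additions above, the vorbis_inverse_coupling_3dnow/_sse pair is the largest single block: it vectorizes Vorbis square-polar channel decoupling, two floats per iteration with 3DNow! and four with SSE, replacing per-element branches with sign-mask arithmetic (the pxor/pand/pandn sequence annotated in the asm comments). As a reference point, here is a scalar sketch of the operation being vectorized, written from the Vorbis coupling rules rather than copied from this patch:

static void vorbis_inverse_coupling_scalar(float *mag, float *ang, int blocksize)
{
    int i;
    for (i = 0; i < blocksize; i++) {
        /* square-polar decoupling: mag holds the magnitude channel,
           ang the signed difference; both are rewritten in place */
        if (mag[i] > 0.0) {
            if (ang[i] > 0.0) {
                ang[i] = mag[i] - ang[i];
            } else {
                float temp = ang[i];
                ang[i]  = mag[i];
                mag[i] += temp;
            }
        } else {
            if (ang[i] > 0.0) {
                ang[i] += mag[i];
            } else {
                float temp = ang[i];
                ang[i]  = mag[i];
                mag[i] -= temp;
            }
        }
    }
}

The SIMD versions reach the same result branch-free: the compare against zero builds an all-ones mask per element, pslld $31 (or andps with ff_pdw_80000000) reduces it to a sign bit used to conditionally flip ang, and the pand/pandn pair routes that flipped value into either the ang update or the mag update.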
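The other new float routines, registered at the end of dsputil_init_mmx() behind the MM_3DNOW / MM_3DNOWEXT / MM_SSE checks, are simple elementwise loops; the sketch below spells out the contracts the SIMD loops optimize. It is inferred from the behaviour visible in the asm rather than copied from libavcodec's C fallbacks, so the names, the src3 bias term and the lrintf-based conversion should be read as illustrative. In particular the patch itself notes that pf2id rounds differently from C and SSE, which is why float_to_int16_3dnow is only installed when CODEC_FLAG_BITEXACT is not set.

#include <math.h>
#include <stdint.h>

/* what vector_fmul_3dnow/_sse compute, 2 or 4 floats per step: dst[i] *= src[i] */
static void vector_fmul_scalar(float *dst, const float *src, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] *= src[i];
}

/* vector_fmul_reverse_*: src1 is read backwards (pswapd / shufps $0x1b) */
static void vector_fmul_reverse_scalar(float *dst, const float *src0,
                                       const float *src1, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - 1 - i];
}

/* vector_fmul_add_add_*: the asm only handles src3 == 0 with step 1 or 2 and
   calls ff_vector_fmul_add_add_c otherwise; src3 is assumed here to be a
   constant bias added to every output */
static void vector_fmul_add_add_scalar(float *dst, const float *src0, const float *src1,
                                       const float *src2, int src3, int len, int step)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i * step] = src0[i] * src1[i] + src2[i] + src3;
}

/* float_to_int16_*: convert and saturate; packssdw supplies the clamp in the asm */
static void float_to_int16_scalar(int16_t *dst, const float *src, int len)
{
    int i;
    for (i = 0; i < len; i++) {
        long v = lrintf(src[i]);
        if (v >  32767) v =  32767;
        if (v < -32768) v = -32768;
        dst[i] = (int16_t)v;
    }
}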