From 6f1c8d4eafabd914b87e9171bf4d04f4ef9160ea Mon Sep 17 00:00:00 2001 From: Miguel Freitas Date: Wed, 8 Jan 2003 13:18:42 +0000 Subject: syncing ffmpeg (with some compilation fixes) - fixes wma bugs - mace, huffyuv and mp3 decoders imported (but not enabled) tested: wma (v1 and v2), mpeg4, msmpeg4 v1, v2 and v3, divx3, divx4, divx5, xvid and dv decoders. everything looks fine. CVS patchset: 3828 CVS date: 2003/01/08 13:18:42 --- src/libffmpeg/libavcodec/i386/Makefile.am | 2 +- src/libffmpeg/libavcodec/i386/dsputil_mmx.c | 934 +++++++++++++++++++++++- src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h | 60 ++ src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h | 371 +++++++++- src/libffmpeg/libavcodec/i386/fft_sse.c | 12 + src/libffmpeg/libavcodec/i386/motion_est_mmx.c | 19 + src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c | 1 - 7 files changed, 1393 insertions(+), 6 deletions(-) (limited to 'src/libffmpeg/libavcodec/i386') diff --git a/src/libffmpeg/libavcodec/i386/Makefile.am b/src/libffmpeg/libavcodec/i386/Makefile.am index 0ef6bb0eb..6386800f6 100644 --- a/src/libffmpeg/libavcodec/i386/Makefile.am +++ b/src/libffmpeg/libavcodec/i386/Makefile.am @@ -16,11 +16,11 @@ libavcodec_mmx_src = \ cputest.c \ dsputil_mmx.c \ fdct_mmx.c \ + fft_sse.c \ idct_mmx.c \ motion_est_mmx.c \ mpegvideo_mmx.c \ simple_idct_mmx.c -# fft_sse.c - needs new header from gcc 3.1 libavcodec_mmx_dummy = libavcodec_mmx_dummy.c diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c index 12a360154..5fce7f914 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c @@ -43,11 +43,21 @@ int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); +int sad16x16_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx); +int sad8x8_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx); +int sad16x16_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx); +int sad8x8_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx); + /* pixel operations */ static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002ULL; +static const uint64_t ff_pw_20 __attribute__ ((aligned(8))) = 0x0014001400140014ULL; +static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003ULL; +static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL; +static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; + #define JUMPALIGN() __asm __volatile (".balign 8"::) #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::) @@ -213,7 +223,7 @@ static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) ); } -static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) +static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) { asm volatile( "pxor %%mm7, %%mm7 \n\t" @@ -496,12 +506,853 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ for(; iput_ ## postfix1 = put_ ## postfix2;\ + c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\ + c->avg_ ## postfix1 = avg_ ## postfix2; + void dsputil_init_mmx(DSPContext* c, unsigned mask) { mm_flags = mm_support(); @@ -576,10 +1427,16 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask) c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx; c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx; c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx; - + c->add_bytes= add_bytes_mmx; c->diff_bytes= diff_bytes_mmx; - + + c->hadamard8_diff[0]= hadamard8_diff16_mmx; + c->hadamard8_diff[1]= hadamard8_diff_mmx; + + c->sad[0]= sad16x16_mmx; + c->sad[1]= sad8x8_mmx; + if (mm_flags & MM_MMXEXT) { c->pix_abs16x16 = pix_abs16x16_mmx2; c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; @@ -591,6 +1448,9 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask) c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2; + c->sad[0]= sad16x16_mmx2; + c->sad[1]= sad8x8_mmx2; + c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; @@ -610,6 +1470,41 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask) c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; + +#if 1 + SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2) + SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2) +#endif } else if (mm_flags & MM_3DNOW) { c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; @@ -630,6 +1525,39 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask) c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; + + SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow) + SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow) } } diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h index 6873432ce..4a8841156 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h @@ -53,6 +53,38 @@ static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size :"%eax", "memory"); } +static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm __volatile( + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "addl %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "addl %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "addl %5, %3 \n\t" + "movq %%mm1, (%3) \n\t" + "addl %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "addl %4, %1 \n\t" + "movq (%1), %%mm1 \n\t" + "addl %4, %1 \n\t" + PAVGB" 16(%2), %%mm0 \n\t" + PAVGB" 24(%2), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "addl %5, %3 \n\t" + "movq %%mm1, (%3) \n\t" + "addl %5, %3 \n\t" + "addl $32, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory"); +} + static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { __asm __volatile( @@ -92,6 +124,34 @@ static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_siz :"r" (line_size) :"%eax", "memory"); } + +static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + __asm __volatile( + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "addl %4, %1 \n\t" + PAVGB" (%2), %%mm0 \n\t" + PAVGB" 8(%2), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "addl %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm1 \n\t" + "addl %4, %1 \n\t" + PAVGB" 16(%2), %%mm0 \n\t" + PAVGB" 24(%2), %%mm1 \n\t" + "movq %%mm0, (%3) \n\t" + "movq %%mm1, 8(%3) \n\t" + "addl %5, %3 \n\t" + "addl $32, %2 \n\t" + "subl $2, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst) + :"r"(src1Stride), "r"(dstStride) + :"memory"); +} /* GL: this function does incorrect rounding if overflow */ static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h index 3605e03f9..0ae1cd99d 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h @@ -54,6 +54,46 @@ static void DEF(put, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_siz :"eax", "memory"); } +static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + MOVQ_BFE(mm6); + __asm __volatile( + ".balign 8 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "addl %4, %1 \n\t" + "movq (%1), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "addl %4, %1 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "addl %5, %3 \n\t" + "movq %%mm5, (%3) \n\t" + "addl %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 16(%2), %%mm1 \n\t" + "addl %4, %1 \n\t" + "movq (%1), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "addl %4, %1 \n\t" + "addl $32, %2 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "addl %5, %3 \n\t" + "movq %%mm5, (%3) \n\t" + "addl %5, %3 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"(src1Stride), "D"(dstStride) + :"memory"); +} + static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BFE(mm6); @@ -90,7 +130,7 @@ static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_si "movq 9(%1, %3), %%mm3 \n\t" PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) "movq %%mm4, 8(%2) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" "addl %%eax, %1 \n\t" "addl %%eax, %2 \n\t" "subl $4, %0 \n\t" @@ -100,6 +140,42 @@ static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_si :"eax", "memory"); } +static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + MOVQ_BFE(mm6); + __asm __volatile( + ".balign 8 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 8(%2), %%mm3 \n\t" + "addl %4, %1 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "movq %%mm5, 8(%3) \n\t" + "addl %5, %3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 16(%2), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 24(%2), %%mm3 \n\t" + "addl %4, %1 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%3) \n\t" + "movq %%mm5, 8(%3) \n\t" + "addl %5, %3 \n\t" + "addl $32, %2 \n\t" + "subl $2, %0 \n\t" + "jnz 1b \n\t" +#ifdef PIC //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cant be used + :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#else + :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst) +#endif + :"S"(src1Stride), "D"(dstStride) + :"memory"); +} + static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BFE(mm6); @@ -195,6 +271,124 @@ static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si :"eax", "memory"); } +static void DEF(put, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h) +{ + MOVQ_ZERO(mm7); + SET_RND(mm6); // =2 for rnd and =1 for no_rnd version + __asm __volatile( + ".balign 8 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 64(%2), %%mm2 \n\t" + "movq 136(%2), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm0, %%mm1 \n\t" + "paddusw %%mm2, %%mm3 \n\t" + "paddusw %%mm1, %%mm3 \n\t" + "psrlw $2, %%mm3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 64(%2), %%mm2 \n\t" + "movq 136(%2), %%mm4 \n\t" + "punpckhbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm4 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm0, %%mm1 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm1, %%mm4 \n\t" + "psrlw $2, %%mm4 \n\t" + "packuswb %%mm4, %%mm3 \n\t" + "movq %%mm3, (%0) \n\t" + "addl %4, %0 \n\t" + "addl %4, %1 \n\t" + "addl $8, %2 \n\t" + "decl %3 \n\t" + "jnz 1b \n\t" + :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h) + :"r"(stride) + :"memory"); +} + +static void DEF(put, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h) +{ + MOVQ_ZERO(mm7); + SET_RND(mm6); // =2 for rnd and =1 for no_rnd version + __asm __volatile( + ".balign 8 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 256(%2), %%mm2 \n\t" + "movq 528(%2), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm0, %%mm1 \n\t" + "paddusw %%mm2, %%mm3 \n\t" + "paddusw %%mm1, %%mm3 \n\t" + "psrlw $2, %%mm3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 256(%2), %%mm2 \n\t" + "movq 528(%2), %%mm4 \n\t" + "punpckhbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm4 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm0, %%mm1 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm1, %%mm4 \n\t" + "psrlw $2, %%mm4 \n\t" + "packuswb %%mm4, %%mm3 \n\t" + "movq %%mm3, (%0) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 8(%2), %%mm1 \n\t" + "movq 264(%2), %%mm2 \n\t" + "movq 536(%2), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm0, %%mm1 \n\t" + "paddusw %%mm2, %%mm3 \n\t" + "paddusw %%mm1, %%mm3 \n\t" + "psrlw $2, %%mm3 \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 8(%2), %%mm1 \n\t" + "movq 264(%2), %%mm2 \n\t" + "movq 536(%2), %%mm4 \n\t" + "punpckhbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm4 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm0, %%mm1 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm1, %%mm4 \n\t" + "psrlw $2, %%mm4 \n\t" + "packuswb %%mm4, %%mm3 \n\t" + "movq %%mm3, 8(%0) \n\t" + "addl %4, %0 \n\t" + "addl %4, %1 \n\t" + "addl $16, %2 \n\t" + "decl %3 \n\t" + "jnz 1b \n\t" + :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h) + :"r"(stride) + :"memory"); +} + // avg_pixels // in case more speed is needed - unroling would certainly help static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h) @@ -259,6 +453,27 @@ static void DEF(avg, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_siz } while (--h); } +static void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm __volatile( + "movq %1, %%mm0 \n\t" + "movq %2, %%mm1 \n\t" + "movq %0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, %0 \n\t" + :"+m"(*dst) + :"m"(*src1), "m"(*src2) + :"memory"); + dst += dstStride; + src1 += src1Stride; + src2 += 8; + } while (--h); +} + static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BFE(mm6); @@ -285,6 +500,33 @@ static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_si } while (--h); } +static void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm __volatile( + "movq %1, %%mm0 \n\t" + "movq %2, %%mm1 \n\t" + "movq %0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, %0 \n\t" + "movq 8%1, %%mm0 \n\t" + "movq 8%2, %%mm1 \n\t" + "movq 8%0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, 8%0 \n\t" + :"+m"(*dst) + :"m"(*src1), "m"(*src2) + :"memory"); + dst += dstStride; + src1 += src1Stride; + src2 += 16; + } while (--h); +} + static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BFE(mm6); @@ -399,6 +641,133 @@ static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si :"eax", "memory"); } +static void DEF(avg, pixels8_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h) +{ + MOVQ_ZERO(mm7); + SET_RND(mm6); // =2 for rnd and =1 for no_rnd version + MOVQ_BFE(mm5); + __asm __volatile( + ".balign 8 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 64(%2), %%mm2 \n\t" + "movq 136(%2), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm0, %%mm1 \n\t" + "paddusw %%mm2, %%mm3 \n\t" + "paddusw %%mm1, %%mm3 \n\t" + "psrlw $2, %%mm3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 64(%2), %%mm2 \n\t" + "movq 136(%2), %%mm4 \n\t" + "punpckhbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm4 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm0, %%mm1 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm1, %%mm4 \n\t" + "psrlw $2, %%mm4 \n\t" + "packuswb %%mm4, %%mm3 \n\t" + "movq (%0), %%mm4 \n\t" + PAVGB(%%mm3, %%mm4, %%mm0, %%mm5) + "movq %%mm0, (%0) \n\t" + "addl %4, %0 \n\t" + "addl %4, %1 \n\t" + "addl $8, %2 \n\t" + "decl %3 \n\t" + "jnz 1b \n\t" + :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h) + :"r"(stride) + :"memory"); +} + +static void DEF(avg, pixels16_l4)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int stride, int h) +{ + MOVQ_ZERO(mm7); + SET_RND(mm6); // =2 for rnd and =1 for no_rnd version + MOVQ_BFE(mm5); + __asm __volatile( + ".balign 8 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 256(%2), %%mm2 \n\t" + "movq 528(%2), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm0, %%mm1 \n\t" + "paddusw %%mm2, %%mm3 \n\t" + "paddusw %%mm1, %%mm3 \n\t" + "psrlw $2, %%mm3 \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%2), %%mm1 \n\t" + "movq 256(%2), %%mm2 \n\t" + "movq 528(%2), %%mm4 \n\t" + "punpckhbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm4 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm0, %%mm1 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm1, %%mm4 \n\t" + "psrlw $2, %%mm4 \n\t" + "packuswb %%mm4, %%mm3 \n\t" + "movq (%0), %%mm4 \n\t" + PAVGB(%%mm3, %%mm4, %%mm0, %%mm5) + "movq %%mm0, (%0) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 8(%2), %%mm1 \n\t" + "movq 264(%2), %%mm2 \n\t" + "movq 536(%2), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm3 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm0, %%mm1 \n\t" + "paddusw %%mm2, %%mm3 \n\t" + "paddusw %%mm1, %%mm3 \n\t" + "psrlw $2, %%mm3 \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 8(%2), %%mm1 \n\t" + "movq 264(%2), %%mm2 \n\t" + "movq 536(%2), %%mm4 \n\t" + "punpckhbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm4 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm0, %%mm1 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm1, %%mm4 \n\t" + "psrlw $2, %%mm4 \n\t" + "packuswb %%mm4, %%mm3 \n\t" + "movq 8(%0), %%mm4 \n\t" + PAVGB(%%mm3, %%mm4, %%mm0, %%mm5) + "movq %%mm0, 8(%0) \n\t" + "addl %4, %0 \n\t" + "addl %4, %1 \n\t" + "addl $16, %2 \n\t" + "decl %3 \n\t" + "jnz 1b \n\t" + :"+r"(dst), "+r"(src1), "+r"(src2), "+r"(h) + :"r"(stride) + :"memory"); +} + + //FIXME optimize static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ DEF(put, pixels8_y2)(block , pixels , line_size, h); diff --git a/src/libffmpeg/libavcodec/i386/fft_sse.c b/src/libffmpeg/libavcodec/i386/fft_sse.c index 8e8e36b0f..175cea506 100644 --- a/src/libffmpeg/libavcodec/i386/fft_sse.c +++ b/src/libffmpeg/libavcodec/i386/fft_sse.c @@ -19,11 +19,16 @@ #include "../dsputil.h" #include +#ifdef HAVE_BUILTIN_VECTOR + #include static const float p1p1p1m1[4] __attribute__((aligned(16))) = { 1.0, 1.0, 1.0, -1.0 }; +static const float p1p1m1p1[4] __attribute__((aligned(16))) = + { 1.0, 1.0, -1.0, 1.0 }; + static const float p1p1m1m1[4] __attribute__((aligned(16))) = { 1.0, 1.0, -1.0, -1.0 }; @@ -54,6 +59,11 @@ void fft_calc_sse(FFTContext *s, FFTComplex *z) r = (__m128 *)&z[0]; c1 = *(__m128 *)p1p1m1m1; c2 = *(__m128 *)p1p1p1m1; + if (s->inverse) + c2 = *(__m128 *)p1p1m1p1; + else + c2 = *(__m128 *)p1p1p1m1; + j = (np >> 2); do { a = r[0]; @@ -126,3 +136,5 @@ void fft_calc_sse(FFTContext *s, FFTComplex *z) nloops = nloops << 1; } while (nblocks != 0); } + +#endif diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c index 3368e7333..fa85db67b 100644 --- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c +++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c @@ -274,6 +274,15 @@ int pix_abs8x8_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ \ return sum_ ## suf();\ }\ +int sad8x8_ ## suf(void *s, UINT8 *blk2, UINT8 *blk1, int stride)\ +{\ + asm volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t":);\ +\ + sad8_ ## suf(blk1, blk2, stride, 3);\ +\ + return sum_ ## suf();\ +}\ \ int pix_abs8x8_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ {\ @@ -324,6 +333,16 @@ int pix_abs16x16_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ \ return sum_ ## suf();\ }\ +int sad16x16_ ## suf(void *s, UINT8 *blk2, UINT8 *blk1, int stride)\ +{\ + asm volatile("pxor %%mm7, %%mm7 \n\t"\ + "pxor %%mm6, %%mm6 \n\t":);\ +\ + sad8_ ## suf(blk1 , blk2 , stride, 4);\ + sad8_ ## suf(blk1+8, blk2+8, stride, 4);\ +\ + return sum_ ## suf();\ +}\ int pix_abs16x16_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c index cb7af3e62..d936abfd5 100644 --- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c @@ -529,7 +529,6 @@ static void ff_libmpeg2mmx2_idct_add(UINT8 *dest, int line_size, DCTELEM *block) void MPV_common_init_mmx(MpegEncContext *s) { if (mm_flags & MM_MMX) { - int i; const int dct_algo = s->avctx->dct_algo; const int idct_algo= s->avctx->idct_algo; -- cgit v1.2.3