From e69ac0f8052424e3ce344365b6be0c95f0e68db6 Mon Sep 17 00:00:00 2001 From: Miguel Freitas Date: Tue, 29 Oct 2002 16:29:16 +0000 Subject: sync to ffmpeg cvs CVS patchset: 3090 CVS date: 2002/10/29 16:29:16 --- src/libffmpeg/libavcodec/i386/Makefile.am | 3 +- src/libffmpeg/libavcodec/i386/dsputil_mmx.c | 115 ++-- src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h | 8 +- src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h | 18 +- src/libffmpeg/libavcodec/i386/fdct_mmx.c | 192 +++--- src/libffmpeg/libavcodec/i386/fft_sse.c | 128 ++++ src/libffmpeg/libavcodec/i386/idct_mmx.c | 413 ++++++------ src/libffmpeg/libavcodec/i386/motion_est_mmx.c | 42 +- src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c | 748 ++++++++++----------- .../libavcodec/i386/mpegvideo_mmx_template.c | 279 +++++--- src/libffmpeg/libavcodec/i386/simple_idct_mmx.c | 139 ++-- 11 files changed, 1167 insertions(+), 918 deletions(-) create mode 100644 src/libffmpeg/libavcodec/i386/fft_sse.c (limited to 'src/libffmpeg/libavcodec/i386') diff --git a/src/libffmpeg/libavcodec/i386/Makefile.am b/src/libffmpeg/libavcodec/i386/Makefile.am index 15bec161e..53f1f6528 100644 --- a/src/libffmpeg/libavcodec/i386/Makefile.am +++ b/src/libffmpeg/libavcodec/i386/Makefile.am @@ -17,8 +17,9 @@ libavcodec_mmx_src = \ fdct_mmx.c \ idct_mmx.c \ motion_est_mmx.c \ - mpegvideo_mmx.c \ + mpegvideo_mmx.c \ simple_idct_mmx.c +# fft_sse.c - needs new header from gcc 3.1 libavcodec_mmx_dummy = libavcodec_mmx_dummy.c diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c index a524f96c8..708d0b091 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c @@ -20,7 +20,6 @@ */ #include "../dsputil.h" -#include "../simple_idct.h" int mm_flags; /* multimedia extension flags */ @@ -44,10 +43,6 @@ int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); -/* external functions, from idct_mmx.c */ -void ff_mmx_idct(DCTELEM *block); -void ff_mmxext_idct(DCTELEM *block); - /* pixel operations */ static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; static const uint64_t mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; @@ -70,8 +65,8 @@ static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002U #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone)) #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) #else -/* for shared library it's better to use this way for accessing constants */ -/* pcmpeqd -> -1 */ +// for shared library it's better to use this way for accessing constants +// pcmpeqd -> -1 #define MOVQ_BONE(regd) \ __asm __volatile ( \ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ @@ -86,9 +81,9 @@ static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002U #endif -/* using regr as temporary and for the output result */ -/* first argument is unmodifed and second is trashed */ -/* regfe is supposed to contain 0xfefefefefefefefe */ +// using regr as temporary and for the output result +// first argument is unmodifed and second is trashed +// regfe is supposed to contain 0xfefefefefefefefe #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ "movq " #rega ", " #regr " \n\t"\ "pand " #regb ", " #regr " \n\t"\ @@ -105,7 +100,7 @@ static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002U "psrlq $1, " #regb " \n\t"\ "psubb " #regb ", " #regr " \n\t" -/* mm6 is supposed to contain 0xfefefefefefefefe */ +// mm6 is supposed to contain 0xfefefefefefefefe #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ "movq " #rega ", " #regr " \n\t"\ "movq " #regc ", " #regp " \n\t"\ @@ -192,7 +187,7 @@ static const uint64_t mm_wtwo __attribute__ ((aligned(8))) = 0x0002000200020002U static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) { - __asm__ volatile( + asm volatile( "movl $-128, %%eax \n\t" "pxor %%mm7, %%mm7 \n\t" ".balign 16 \n\t" @@ -220,7 +215,7 @@ static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) static void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) { - __asm__ volatile( + asm volatile( "pxor %%mm7, %%mm7 \n\t" "movl $-128, %%eax \n\t" ".balign 16 \n\t" @@ -278,9 +273,9 @@ static void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line pix += line_size*4; p += 32; - /* if here would be an exact copy of the code above */ - /* compiler would generate some very strange code */ - /* thus using "r" */ + // if here would be an exact copy of the code above + // compiler would generate some very strange code + // thus using "r" __asm __volatile( "movq (%3), %%mm0\n\t" "movq 8(%3), %%mm1\n\t" @@ -420,6 +415,44 @@ static void clear_blocks_mmx(DCTELEM *blocks) ); } +static int pix_sum16_mmx(UINT8 * pix, int line_size){ + const int h=16; + int sum; + int index= -line_size*h; + + __asm __volatile( + "pxor %%mm7, %%mm7 \n\t" + "pxor %%mm6, %%mm6 \n\t" + "1: \n\t" + "movq (%2, %1), %%mm0 \n\t" + "movq (%2, %1), %%mm1 \n\t" + "movq 8(%2, %1), %%mm2 \n\t" + "movq 8(%2, %1), %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddw %%mm0, %%mm1 \n\t" + "paddw %%mm2, %%mm3 \n\t" + "paddw %%mm1, %%mm3 \n\t" + "paddw %%mm3, %%mm6 \n\t" + "addl %3, %1 \n\t" + " js 1b \n\t" + "movq %%mm6, %%mm5 \n\t" + "psrlq $32, %%mm6 \n\t" + "paddw %%mm5, %%mm6 \n\t" + "movq %%mm6, %%mm5 \n\t" + "psrlq $16, %%mm6 \n\t" + "paddw %%mm5, %%mm6 \n\t" + "movd %%mm6, %0 \n\t" + "andl $0xFFFF, %0 \n\t" + : "=&r" (sum), "+r" (index) + : "r" (pix - index), "r" (line_size) + ); + + return sum; +} + #if 0 static void just_return() { return; } #endif @@ -448,6 +481,7 @@ void dsputil_init_mmx(void) put_pixels_clamped = put_pixels_clamped_mmx; add_pixels_clamped = add_pixels_clamped_mmx; clear_blocks= clear_blocks_mmx; + pix_sum= pix_sum16_mmx; pix_abs16x16 = pix_abs16x16_mmx; pix_abs16x16_x2 = pix_abs16x16_x2_mmx; @@ -477,7 +511,7 @@ void dsputil_init_mmx(void) avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx; avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx; avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx; - + put_pixels_tab[1][0] = put_pixels8_mmx; put_pixels_tab[1][1] = put_pixels8_x2_mmx; put_pixels_tab[1][2] = put_pixels8_y2_mmx; @@ -538,7 +572,7 @@ void dsputil_init_mmx(void) avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; - + put_pixels_tab[1][1] = put_pixels8_x2_3dnow; put_pixels_tab[1][2] = put_pixels8_y2_3dnow; put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; @@ -549,21 +583,10 @@ void dsputil_init_mmx(void) avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; } - - /* idct */ - if (mm_flags & MM_MMXEXT) { - ff_idct = ff_mmxext_idct; - } else { - ff_idct = ff_mmx_idct; - } -#ifdef SIMPLE_IDCT -/* ff_idct = simple_idct; */ - ff_idct = simple_idct_mmx; -#endif } #if 0 - /* for speed testing */ + // for speed testing get_pixels = just_return; put_pixels_clamped = just_return; add_pixels_clamped = just_return; @@ -593,40 +616,18 @@ void dsputil_init_mmx(void) avg_no_rnd_pixels_tab[2] = just_return; avg_no_rnd_pixels_tab[3] = just_return; - /* av_fdct = just_return; */ - /* ff_idct = just_return; */ + //av_fdct = just_return; + //ff_idct = just_return; #endif } -void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block); - -/** - * this will send coeff matrixes which would have different results for the 16383 type MMX vs C IDCTs to the C IDCT - */ -void bit_exact_idct_put(UINT8 *dest, int line_size, INT16 *block){ - if( block[0]>1022 && block[1]==0 && block[4 ]==0 && block[5 ]==0 - && block[8]==0 && block[9]==0 && block[12]==0 && block[13]==0){ - int16_t tmp[64]; - int i; - - for(i=0; i<64; i++) - tmp[i]= block[i]; - for(i=0; i<64; i++) - block[i]= tmp[block_permute_op(i)]; - - simple_idct_put(dest, line_size, block); - } - else - gen_idct_put(dest, line_size, block); -} - /* remove any non bit exact operation (testing purpose). NOTE that this function should be kept as small as possible because it is always difficult to test automatically non bit exact cases. */ void dsputil_set_bit_exact_mmx(void) { if (mm_flags & MM_MMX) { - + /* MMX2 & 3DNOW */ put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; @@ -643,9 +644,5 @@ void dsputil_set_bit_exact_mmx(void) pix_abs8x8_y2 = pix_abs8x8_y2_mmx; pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; } -#ifdef SIMPLE_IDCT - if(ff_idct_put==gen_idct_put && ff_idct == simple_idct_mmx) - ff_idct_put= bit_exact_idct_put; -#endif } } diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h index 818ec0e6d..6873432ce 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h @@ -21,7 +21,7 @@ * mostly rewritten by Michael Niedermayer * and improved by Zdenek Kabelac */ - + /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm clobber bug - now it will work with 2.95.2 and also with -fPIC */ @@ -92,7 +92,7 @@ static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_siz :"r" (line_size) :"%eax", "memory"); } - + /* GL: this function does incorrect rounding if overflow */ static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { @@ -293,7 +293,7 @@ static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size :"%eax", "memory"); } -/* Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter */ +// Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BONE(mm6); @@ -335,7 +335,7 @@ static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_siz :"%eax", "memory"); } -/* FIXME the following could be optimized too ... */ +//FIXME the following could be optimized too ... static void DEF(put_no_rnd_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h index a6e84c199..3605e03f9 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h @@ -21,7 +21,7 @@ * and improved by Zdenek Kabelac */ -/* put_pixels */ +// put_pixels static void DEF(put, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BFE(mm6); @@ -132,7 +132,7 @@ static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_siz static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_ZERO(mm7); - SET_RND(mm6); /* =2 for rnd and =1 for no_rnd version */ + SET_RND(mm6); // =2 for rnd and =1 for no_rnd version __asm __volatile( "movq (%1), %%mm0 \n\t" "movq 1(%1), %%mm4 \n\t" @@ -168,7 +168,7 @@ static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si "movq %%mm4, (%2, %%eax) \n\t" "addl %3, %%eax \n\t" - "movq (%1, %%eax), %%mm2 \n\t" /* 0 <-> 2 1 <-> 3 */ + "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 "movq 1(%1, %%eax), %%mm4 \n\t" "movq %%mm2, %%mm3 \n\t" "movq %%mm4, %%mm5 \n\t" @@ -195,8 +195,8 @@ static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si :"eax", "memory"); } -/* avg_pixels */ -/* in case more speed is needed - unroling would certainly help */ +// avg_pixels +// in case more speed is needed - unroling would certainly help static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BFE(mm6); @@ -324,11 +324,11 @@ static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_siz :"eax", "memory"); } -/* this routine is 'slightly' suboptimal but mostly unused */ +// this routine is 'slightly' suboptimal but mostly unused static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_ZERO(mm7); - SET_RND(mm6); /* =2 for rnd and =1 for no_rnd version */ + SET_RND(mm6); // =2 for rnd and =1 for no_rnd version __asm __volatile( "movq (%1), %%mm0 \n\t" "movq 1(%1), %%mm4 \n\t" @@ -368,7 +368,7 @@ static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si "movq %%mm5, (%2, %%eax) \n\t" "addl %3, %%eax \n\t" - "movq (%1, %%eax), %%mm2 \n\t" /* 0 <-> 2 1 <-> 3 */ + "movq (%1, %%eax), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 "movq 1(%1, %%eax), %%mm4 \n\t" "movq %%mm2, %%mm3 \n\t" "movq %%mm4, %%mm5 \n\t" @@ -399,7 +399,7 @@ static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si :"eax", "memory"); } -/* FIXME optimize */ +//FIXME optimize static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ DEF(put, pixels8_y2)(block , pixels , line_size, h); DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); diff --git a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c index 14d93370d..19f656afd 100644 --- a/src/libffmpeg/libavcodec/i386/fdct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c @@ -3,133 +3,131 @@ * The gcc porting is Copyright (c) 2001 Fabrice Bellard. * * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT - * + * * Intel Application Note AP-922 - fast, precise implementation of DCT * http://developer.intel.com/vtune/cbts/appnotes.htm */ #include "../common.h" #include "mmx.h" -/* #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) */ +#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) -/* -*********************************** -* -* constants for the forward DCT -* ----------------------------- -* -* Be sure to check that your compiler is aligning all constants to QWORD -* (8-byte) memory boundaries! Otherwise the unaligned memory access will -* severely stall MMX execution. -* -*********************************** -*/ +////////////////////////////////////////////////////////////////////// +// +// constants for the forward DCT +// ----------------------------- +// +// Be sure to check that your compiler is aligning all constants to QWORD +// (8-byte) memory boundaries! Otherwise the unaligned memory access will +// severely stall MMX execution. +// +////////////////////////////////////////////////////////////////////// -#define BITS_FRW_ACC 3 /*; 2 or 3 for accuracy */ +#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy #define SHIFT_FRW_COL BITS_FRW_ACC #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) -/* #define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1) */ +//#define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1) #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) -/* #define RND_FRW_COL (2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1) */ +//#define RND_FRW_COL (2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1) #define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) -/* concatenated table, for forward DCT transformation */ +//concatenated table, for forward DCT transformation const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = { - 13036, 13036, 13036, 13036, /* tg * (2<<16) + 0.5 */ - 27146, 27146, 27146, 27146, /* tg * (2<<16) + 0.5 */ - -21746, -21746, -21746, -21746, /* tg * (2<<16) + 0.5 */ + 13036, 13036, 13036, 13036, // tg * (2<<16) + 0.5 + 27146, 27146, 27146, 27146, // tg * (2<<16) + 0.5 + -21746, -21746, -21746, -21746, // tg * (2<<16) + 0.5 }; const int16_t cos_4_16[4] = { - -19195, -19195, -19195, -19195, /* cos * (2<<16) + 0.5 */ + -19195, -19195, -19195, -19195, //cos * (2<<16) + 0.5 }; const int16_t ocos_4_16[4] = { - 23170, 23170, 23170, 23170, /* cos * (2<<15) + 0.5 */ + 23170, 23170, 23170, 23170, //cos * (2<<15) + 0.5 }; static const mmx_t fdct_one_corr = {0x0001000100010001LL}; static volatile mmx_t fdct_r_row = { d:{RND_FRW_ROW, RND_FRW_ROW} }; -const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { /* forward_dct coeff table */ - /* row0 */ - 16384, 16384, 21407, -8867, /* w09 w01 w08 w00 */ - 16384, 16384, 8867, -21407, /* w13 w05 w12 w04 */ - 16384, -16384, 8867, 21407, /* w11 w03 w10 w02 */ - -16384, 16384, -21407, -8867, /* w15 w07 w14 w06 */ - 22725, 12873, 19266, -22725, /* w22 w20 w18 w16 */ - 19266, 4520, -4520, -12873, /* w23 w21 w19 w17 */ - 12873, 4520, 4520, 19266, /* w30 w28 w26 w24 */ - -22725, 19266, -12873, -22725, /* w31 w29 w27 w25 */ +const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table + //row0 + 16384, 16384, 21407, -8867, // w09 w01 w08 w00 + 16384, 16384, 8867, -21407, // w13 w05 w12 w04 + 16384, -16384, 8867, 21407, // w11 w03 w10 w02 + -16384, 16384, -21407, -8867, // w15 w07 w14 w06 + 22725, 12873, 19266, -22725, // w22 w20 w18 w16 + 19266, 4520, -4520, -12873, // w23 w21 w19 w17 + 12873, 4520, 4520, 19266, // w30 w28 w26 w24 + -22725, 19266, -12873, -22725, // w31 w29 w27 w25 - /* row1 */ - 22725, 22725, 29692, -12299, /* w09 w01 w08 w00 */ - 22725, 22725, 12299, -29692, /* w13 w05 w12 w04 */ - 22725, -22725, 12299, 29692, /* w11 w03 w10 w02 */ - -22725, 22725, -29692, -12299, /* w15 w07 w14 w06 */ - 31521, 17855, 26722, -31521, /* w22 w20 w18 w16 */ - 26722, 6270, -6270, -17855, /* w23 w21 w19 w17 */ - 17855, 6270, 6270, 26722, /* w30 w28 w26 w24 */ - -31521, 26722, -17855, -31521, /* w31 w29 w27 w25 */ + //row1 + 22725, 22725, 29692, -12299, // w09 w01 w08 w00 + 22725, 22725, 12299, -29692, // w13 w05 w12 w04 + 22725, -22725, 12299, 29692, // w11 w03 w10 w02 + -22725, 22725, -29692, -12299, // w15 w07 w14 w06 + 31521, 17855, 26722, -31521, // w22 w20 w18 w16 + 26722, 6270, -6270, -17855, // w23 w21 w19 w17 + 17855, 6270, 6270, 26722, // w30 w28 w26 w24 + -31521, 26722, -17855, -31521, // w31 w29 w27 w25 - /* row2 */ - 21407, 21407, 27969, -11585, /* w09 w01 w08 w00 */ - 21407, 21407, 11585, -27969, /* w13 w05 w12 w04 */ - 21407, -21407, 11585, 27969, /* w11 w03 w10 w02 */ - -21407, 21407, -27969, -11585, /* w15 w07 w14 w06 */ - 29692, 16819, 25172, -29692, /* w22 w20 w18 w16 */ - 25172, 5906, -5906, -16819, /* w23 w21 w19 w17 */ - 16819, 5906, 5906, 25172, /* w30 w28 w26 w24 */ - -29692, 25172, -16819, -29692, /* w31 w29 w27 w25 */ + //row2 + 21407, 21407, 27969, -11585, // w09 w01 w08 w00 + 21407, 21407, 11585, -27969, // w13 w05 w12 w04 + 21407, -21407, 11585, 27969, // w11 w03 w10 w02 + -21407, 21407, -27969, -11585, // w15 w07 w14 w06 + 29692, 16819, 25172, -29692, // w22 w20 w18 w16 + 25172, 5906, -5906, -16819, // w23 w21 w19 w17 + 16819, 5906, 5906, 25172, // w30 w28 w26 w24 + -29692, 25172, -16819, -29692, // w31 w29 w27 w25 - /* row3 */ - 19266, 19266, 25172, -10426, /* w09 w01 w08 w00 */ - 19266, 19266, 10426, -25172, /* w13 w05 w12 w04 */ - 19266, -19266, 10426, 25172, /* w11 w03 w10 w02 */ - -19266, 19266, -25172, -10426, /* w15 w07 w14 w06, */ - 26722, 15137, 22654, -26722, /* w22 w20 w18 w16 */ - 22654, 5315, -5315, -15137, /* w23 w21 w19 w17 */ - 15137, 5315, 5315, 22654, /* w30 w28 w26 w24 */ - -26722, 22654, -15137, -26722, /* w31 w29 w27 w25, */ + //row3 + 19266, 19266, 25172, -10426, // w09 w01 w08 w00 + 19266, 19266, 10426, -25172, // w13 w05 w12 w04 + 19266, -19266, 10426, 25172, // w11 w03 w10 w02 + -19266, 19266, -25172, -10426, // w15 w07 w14 w06, + 26722, 15137, 22654, -26722, // w22 w20 w18 w16 + 22654, 5315, -5315, -15137, // w23 w21 w19 w17 + 15137, 5315, 5315, 22654, // w30 w28 w26 w24 + -26722, 22654, -15137, -26722, // w31 w29 w27 w25, - /* row4 */ - 16384, 16384, 21407, -8867, /* w09 w01 w08 w00 */ - 16384, 16384, 8867, -21407, /* w13 w05 w12 w04 */ - 16384, -16384, 8867, 21407, /* w11 w03 w10 w02 */ - -16384, 16384, -21407, -8867, /* w15 w07 w14 w06 */ - 22725, 12873, 19266, -22725, /* w22 w20 w18 w16 */ - 19266, 4520, -4520, -12873, /* w23 w21 w19 w17 */ - 12873, 4520, 4520, 19266, /* w30 w28 w26 w24 */ - -22725, 19266, -12873, -22725, /* w31 w29 w27 w25 */ + //row4 + 16384, 16384, 21407, -8867, // w09 w01 w08 w00 + 16384, 16384, 8867, -21407, // w13 w05 w12 w04 + 16384, -16384, 8867, 21407, // w11 w03 w10 w02 + -16384, 16384, -21407, -8867, // w15 w07 w14 w06 + 22725, 12873, 19266, -22725, // w22 w20 w18 w16 + 19266, 4520, -4520, -12873, // w23 w21 w19 w17 + 12873, 4520, 4520, 19266, // w30 w28 w26 w24 + -22725, 19266, -12873, -22725, // w31 w29 w27 w25 - /* row5 */ - 19266, 19266, 25172, -10426, /* w09 w01 w08 w00 */ - 19266, 19266, 10426, -25172, /* w13 w05 w12 w04 */ - 19266, -19266, 10426, 25172, /* w11 w03 w10 w02 */ - -19266, 19266, -25172, -10426, /* w15 w07 w14 w06 */ - 26722, 15137, 22654, -26722, /* w22 w20 w18 w16 */ - 22654, 5315, -5315, -15137, /* w23 w21 w19 w17 */ - 15137, 5315, 5315, 22654, /* w30 w28 w26 w24 */ - -26722, 22654, -15137, -26722, /* w31 w29 w27 w25 */ + //row5 + 19266, 19266, 25172, -10426, // w09 w01 w08 w00 + 19266, 19266, 10426, -25172, // w13 w05 w12 w04 + 19266, -19266, 10426, 25172, // w11 w03 w10 w02 + -19266, 19266, -25172, -10426, // w15 w07 w14 w06 + 26722, 15137, 22654, -26722, // w22 w20 w18 w16 + 22654, 5315, -5315, -15137, // w23 w21 w19 w17 + 15137, 5315, 5315, 22654, // w30 w28 w26 w24 + -26722, 22654, -15137, -26722, // w31 w29 w27 w25 - /* row6 */ - 21407, 21407, 27969, -11585, /* w09 w01 w08 w00 */ - 21407, 21407, 11585, -27969, /* w13 w05 w12 w04 */ - 21407, -21407, 11585, 27969, /* w11 w03 w10 w02 */ - -21407, 21407, -27969, -11585, /* w15 w07 w14 w06, */ - 29692, 16819, 25172, -29692, /* w22 w20 w18 w16 */ - 25172, 5906, -5906, -16819, /* w23 w21 w19 w17 */ - 16819, 5906, 5906, 25172, /* w30 w28 w26 w24 */ - -29692, 25172, -16819, -29692, /* w31 w29 w27 w25, */ + //row6 + 21407, 21407, 27969, -11585, // w09 w01 w08 w00 + 21407, 21407, 11585, -27969, // w13 w05 w12 w04 + 21407, -21407, 11585, 27969, // w11 w03 w10 w02 + -21407, 21407, -27969, -11585, // w15 w07 w14 w06, + 29692, 16819, 25172, -29692, // w22 w20 w18 w16 + 25172, 5906, -5906, -16819, // w23 w21 w19 w17 + 16819, 5906, 5906, 25172, // w30 w28 w26 w24 + -29692, 25172, -16819, -29692, // w31 w29 w27 w25, - /* row7 */ - 22725, 22725, 29692, -12299, /* w09 w01 w08 w00 */ - 22725, 22725, 12299, -29692, /* w13 w05 w12 w04 */ - 22725, -22725, 12299, 29692, /* w11 w03 w10 w02 */ - -22725, 22725, -29692, -12299, /* w15 w07 w14 w06, */ - 31521, 17855, 26722, -31521, /* w22 w20 w18 w16 */ - 26722, 6270, -6270, -17855, /* w23 w21 w19 w17 */ - 17855, 6270, 6270, 26722, /* w30 w28 w26 w24 */ - -31521, 26722, -17855, -31521 /* w31 w29 w27 w25 */ + //row7 + 22725, 22725, 29692, -12299, // w09 w01 w08 w00 + 22725, 22725, 12299, -29692, // w13 w05 w12 w04 + 22725, -22725, 12299, 29692, // w11 w03 w10 w02 + -22725, 22725, -29692, -12299, // w15 w07 w14 w06, + 31521, 17855, 26722, -31521, // w22 w20 w18 w16 + 26722, 6270, -6270, -17855, // w23 w21 w19 w17 + 17855, 6270, 6270, 26722, // w30 w28 w26 w24 + -31521, 26722, -17855, -31521 // w31 w29 w27 w25 }; @@ -269,7 +267,7 @@ static inline void fdct_row(const int16_t *in, int16_t *out, const int16_t *tabl movq_r2m(mm6, *(out + 4)); } -void fdct_mmx(int16_t *block) +void ff_fdct_mmx(int16_t *block) { /* XXX: not thread safe */ static int16_t block_tmp[64] ATTR_ALIGN(8); diff --git a/src/libffmpeg/libavcodec/i386/fft_sse.c b/src/libffmpeg/libavcodec/i386/fft_sse.c new file mode 100644 index 000000000..8e8e36b0f --- /dev/null +++ b/src/libffmpeg/libavcodec/i386/fft_sse.c @@ -0,0 +1,128 @@ +/* + * FFT/MDCT transform with SSE optimizations + * Copyright (c) 2002 Fabrice Bellard. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include "../dsputil.h" +#include + +#include + +static const float p1p1p1m1[4] __attribute__((aligned(16))) = + { 1.0, 1.0, 1.0, -1.0 }; + +static const float p1p1m1m1[4] __attribute__((aligned(16))) = + { 1.0, 1.0, -1.0, -1.0 }; + +#if 0 +static void print_v4sf(const char *str, __m128 a) +{ + float *p = (float *)&a; + printf("%s: %f %f %f %f\n", + str, p[0], p[1], p[2], p[3]); +} +#endif + +/* XXX: handle reverse case */ +void fft_calc_sse(FFTContext *s, FFTComplex *z) +{ + int ln = s->nbits; + int j, np, np2; + int nblocks, nloops; + register FFTComplex *p, *q; + FFTComplex *cptr, *cptr1; + int k; + + np = 1 << ln; + + { + __m128 *r, a, b, a1, c1, c2; + + r = (__m128 *)&z[0]; + c1 = *(__m128 *)p1p1m1m1; + c2 = *(__m128 *)p1p1p1m1; + j = (np >> 2); + do { + a = r[0]; + b = _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)); + a = _mm_mul_ps(a, c1); + /* do the pass 0 butterfly */ + a = _mm_add_ps(a, b); + + a1 = r[1]; + b = _mm_shuffle_ps(a1, a1, _MM_SHUFFLE(1, 0, 3, 2)); + a1 = _mm_mul_ps(a1, c1); + /* do the pass 0 butterfly */ + b = _mm_add_ps(a1, b); + + /* multiply third by -i */ + b = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 1, 0)); + b = _mm_mul_ps(b, c2); + + /* do the pass 1 butterfly */ + r[0] = _mm_add_ps(a, b); + r[1] = _mm_sub_ps(a, b); + r += 2; + } while (--j != 0); + } + /* pass 2 .. ln-1 */ + + nblocks = np >> 3; + nloops = 1 << 2; + np2 = np >> 1; + + cptr1 = s->exptab1; + do { + p = z; + q = z + nloops; + j = nblocks; + do { + cptr = cptr1; + k = nloops >> 1; + do { + __m128 a, b, c, t1, t2; + + a = *(__m128 *)p; + b = *(__m128 *)q; + + /* complex mul */ + c = *(__m128 *)cptr; + /* cre*re cim*re */ + t1 = _mm_mul_ps(c, + _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 0, 0))); + c = *(__m128 *)(cptr + 2); + /* -cim*im cre*im */ + t2 = _mm_mul_ps(c, + _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 1, 1))); + b = _mm_add_ps(t1, t2); + + /* butterfly */ + *(__m128 *)p = _mm_add_ps(a, b); + *(__m128 *)q = _mm_sub_ps(a, b); + + p += 2; + q += 2; + cptr += 4; + } while (--k); + + p += nloops; + q += nloops; + } while (--j); + cptr1 += nloops * 2; + nblocks = nblocks >> 1; + nloops = nloops << 1; + } while (nblocks != 0); +} diff --git a/src/libffmpeg/libavcodec/i386/idct_mmx.c b/src/libffmpeg/libavcodec/i386/idct_mmx.c index 1225de5d2..298c8a8b0 100644 --- a/src/libffmpeg/libavcodec/i386/idct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/idct_mmx.c @@ -87,99 +87,102 @@ static inline void idct_row (int16_t * row, int offset, static inline void mmxext_row_head (int16_t * row, int offset, int16_t * table) { - movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ - movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ + movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 - movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ - movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ + movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 + movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 - movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ - movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ + movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 + movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 - pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ - pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ + movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 + pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 + + pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 } static inline void mmxext_row (int16_t * table, int32_t * rounder) { - movq_m2r (*(table+8), mm1); /* mm1 = -C5 -C1 C3 C1 */ - pmaddwd_r2r (mm2, mm4); /* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */ + movq_m2r (*(table+8), mm1); // mm1 = -C5 -C1 C3 C1 + pmaddwd_r2r (mm2, mm4); // mm4 = C4*x0+C6*x2 C4*x4+C6*x6 - pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */ - pshufw_r2r (mm6, mm6, 0x4e); /* mm6 = x3 x1 x7 x5 */ + pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x4-C6*x6 C4*x0-C6*x2 + pshufw_r2r (mm6, mm6, 0x4e); // mm6 = x3 x1 x7 x5 - movq_m2r (*(table+12), mm7); /* mm7 = -C7 C3 C7 C5 */ - pmaddwd_r2r (mm5, mm1); /* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */ + movq_m2r (*(table+12), mm7); // mm7 = -C7 C3 C7 C5 + pmaddwd_r2r (mm5, mm1); // mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 - paddd_m2r (*rounder, mm3); /* mm3 += rounder */ - pmaddwd_r2r (mm6, mm7); /* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */ + paddd_m2r (*rounder, mm3); // mm3 += rounder + pmaddwd_r2r (mm6, mm7); // mm7 = C3*x1-C7*x3 C5*x5+C7*x7 - pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */ - paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ + pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 + paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder - pmaddwd_m2r (*(table+24), mm5); /* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */ - movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ + pmaddwd_m2r (*(table+24), mm5); // mm5 = C3*x5-C1*x7 C5*x1-C1*x3 + movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder - pmaddwd_m2r (*(table+28), mm6); /* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */ - paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ + pmaddwd_m2r (*(table+28), mm6); // mm6 = C7*x1-C5*x3 C7*x5+C3*x7 + paddd_r2r (mm7, mm1); // mm1 = b1 b0 - paddd_m2r (*rounder, mm0); /* mm0 += rounder */ - psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ + paddd_m2r (*rounder, mm0); // mm0 += rounder + psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder - psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ - paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ + psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 + paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder - paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ - psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ + paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder + psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 - paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ - movq_r2r (mm0, mm4); /* mm4 = a3 a2 + rounder */ + paddd_r2r (mm6, mm5); // mm5 = b3 b2 + movq_r2r (mm0, mm4); // mm4 = a3 a2 + rounder - paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ - psubd_r2r (mm5, mm4); /* mm4 = a3-b3 a2-b2 + rounder */ + paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder + psubd_r2r (mm5, mm4); // mm4 = a3-b3 a2-b2 + rounder } static inline void mmxext_row_tail (int16_t * row, int store) { - psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ - psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ + psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 + + psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5 + + packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 - packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ - packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ + packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 - movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ - pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ + movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 + pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 /* slot */ - movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ + movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 } static inline void mmxext_row_mid (int16_t * row, int store, int offset, int16_t * table) { - movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ - psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ + movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 + psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 - movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ - psrad_i2r (ROW_SHIFT, mm4); /* mm4 = y4 y5 */ + movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 + psrad_i2r (ROW_SHIFT, mm4); // mm4 = y4 y5 - packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ - movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ + packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 + movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 - packssdw_r2r (mm3, mm4); /* mm4 = y6 y7 y4 y5 */ - movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ + packssdw_r2r (mm3, mm4); // mm4 = y6 y7 y4 y5 + movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 - movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ - pshufw_r2r (mm4, mm4, 0xb1); /* mm4 = y7 y6 y5 y4 */ + movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 + pshufw_r2r (mm4, mm4, 0xb1); // mm4 = y7 y6 y5 y4 - movq_m2r (*table, mm3); /* mm3 = -C2 -C4 C2 C4 */ - movq_r2m (mm4, *(row+store+4)); /* save y7 y6 y5 y4 */ + movq_m2r (*table, mm3); // mm3 = -C2 -C4 C2 C4 + movq_r2m (mm4, *(row+store+4)); // save y7 y6 y5 y4 - pmaddwd_r2r (mm0, mm3); /* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */ - movq_m2r (*(table+4), mm4); /* mm4 = C6 C4 C6 C4 */ + pmaddwd_r2r (mm0, mm3); // mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 - pshufw_r2r (mm2, mm2, 0x4e); /* mm2 = x2 x0 x6 x4 */ + movq_m2r (*(table+4), mm4); // mm4 = C6 C4 C6 C4 + pshufw_r2r (mm2, mm2, 0x4e); // mm2 = x2 x0 x6 x4 } @@ -196,120 +199,123 @@ static inline void mmxext_row_mid (int16_t * row, int store, static inline void mmx_row_head (int16_t * row, int offset, int16_t * table) { - movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ - movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ + movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 - movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ - movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ + movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 + movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 - movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ - punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ + movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 + movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 - movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ - pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ + punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 - movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ - punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ + movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 + pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 + + movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 + punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 } static inline void mmx_row (int16_t * table, int32_t * rounder) { - pmaddwd_r2r (mm2, mm4); /* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */ - punpckldq_r2r (mm5, mm5); /* mm5 = x3 x1 x3 x1 */ - - pmaddwd_m2r (*(table+16), mm0); /* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */ - punpckhdq_r2r (mm6, mm6); /* mm6 = x7 x5 x7 x5 */ + pmaddwd_r2r (mm2, mm4); // mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 + punpckldq_r2r (mm5, mm5); // mm5 = x3 x1 x3 x1 - movq_m2r (*(table+12), mm7); /* mm7 = -C5 -C1 C7 C5 */ - pmaddwd_r2r (mm5, mm1); /* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */ + pmaddwd_m2r (*(table+16), mm0); // mm0 = C4*x0-C2*x2 C4*x0-C6*x2 + punpckhdq_r2r (mm6, mm6); // mm6 = x7 x5 x7 x5 - paddd_m2r (*rounder, mm3); /* mm3 += rounder */ - pmaddwd_r2r (mm6, mm7); /* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */ + movq_m2r (*(table+12), mm7); // mm7 = -C5 -C1 C7 C5 + pmaddwd_r2r (mm5, mm1); // mm1 = C3*x1-C7*x3 C1*x1+C3*x3 - pmaddwd_m2r (*(table+20), mm2); /* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */ - paddd_r2r (mm4, mm3); /* mm3 = a1 a0 + rounder */ + paddd_m2r (*rounder, mm3); // mm3 += rounder + pmaddwd_r2r (mm6, mm7); // mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 - pmaddwd_m2r (*(table+24), mm5); /* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */ - movq_r2r (mm3, mm4); /* mm4 = a1 a0 + rounder */ + pmaddwd_m2r (*(table+20), mm2); // mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 + paddd_r2r (mm4, mm3); // mm3 = a1 a0 + rounder - pmaddwd_m2r (*(table+28), mm6); /* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */ - paddd_r2r (mm7, mm1); /* mm1 = b1 b0 */ + pmaddwd_m2r (*(table+24), mm5); // mm5 = C7*x1-C5*x3 C5*x1-C1*x3 + movq_r2r (mm3, mm4); // mm4 = a1 a0 + rounder - paddd_m2r (*rounder, mm0); /* mm0 += rounder */ - psubd_r2r (mm1, mm3); /* mm3 = a1-b1 a0-b0 + rounder */ + pmaddwd_m2r (*(table+28), mm6); // mm6 = C3*x5-C1*x7 C7*x5+C3*x7 + paddd_r2r (mm7, mm1); // mm1 = b1 b0 - psrad_i2r (ROW_SHIFT, mm3); /* mm3 = y6 y7 */ - paddd_r2r (mm4, mm1); /* mm1 = a1+b1 a0+b0 + rounder */ + paddd_m2r (*rounder, mm0); // mm0 += rounder + psubd_r2r (mm1, mm3); // mm3 = a1-b1 a0-b0 + rounder - paddd_r2r (mm2, mm0); /* mm0 = a3 a2 + rounder */ - psrad_i2r (ROW_SHIFT, mm1); /* mm1 = y1 y0 */ + psrad_i2r (ROW_SHIFT, mm3); // mm3 = y6 y7 + paddd_r2r (mm4, mm1); // mm1 = a1+b1 a0+b0 + rounder - paddd_r2r (mm6, mm5); /* mm5 = b3 b2 */ - movq_r2r (mm0, mm7); /* mm7 = a3 a2 + rounder */ + paddd_r2r (mm2, mm0); // mm0 = a3 a2 + rounder + psrad_i2r (ROW_SHIFT, mm1); // mm1 = y1 y0 - paddd_r2r (mm5, mm0); /* mm0 = a3+b3 a2+b2 + rounder */ - psubd_r2r (mm5, mm7); /* mm7 = a3-b3 a2-b2 + rounder */ + paddd_r2r (mm6, mm5); // mm5 = b3 b2 + movq_r2r (mm0, mm7); // mm7 = a3 a2 + rounder + paddd_r2r (mm5, mm0); // mm0 = a3+b3 a2+b2 + rounder + psubd_r2r (mm5, mm7); // mm7 = a3-b3 a2-b2 + rounder } static inline void mmx_row_tail (int16_t * row, int store) { - psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ - psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ + psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 - packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ - packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ + psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 - movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ - movq_r2r (mm7, mm4); /* mm4 = y6 y7 y4 y5 */ + packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 - pslld_i2r (16, mm7); /* mm7 = y7 0 y5 0 */ - psrld_i2r (16, mm4); /* mm4 = 0 y6 0 y4 */ + packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 - por_r2r (mm4, mm7); /* mm7 = y7 y6 y5 y4 */ + movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 + movq_r2r (mm7, mm4); // mm4 = y6 y7 y4 y5 + + pslld_i2r (16, mm7); // mm7 = y7 0 y5 0 + + psrld_i2r (16, mm4); // mm4 = 0 y6 0 y4 + + por_r2r (mm4, mm7); // mm7 = y7 y6 y5 y4 /* slot */ - movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ + movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 } static inline void mmx_row_mid (int16_t * row, int store, int offset, int16_t * table) { - movq_m2r (*(row+offset), mm2); /* mm2 = x6 x4 x2 x0 */ - psrad_i2r (ROW_SHIFT, mm0); /* mm0 = y3 y2 */ + movq_m2r (*(row+offset), mm2); // mm2 = x6 x4 x2 x0 + psrad_i2r (ROW_SHIFT, mm0); // mm0 = y3 y2 - movq_m2r (*(row+offset+4), mm5); /* mm5 = x7 x5 x3 x1 */ - psrad_i2r (ROW_SHIFT, mm7); /* mm7 = y4 y5 */ + movq_m2r (*(row+offset+4), mm5); // mm5 = x7 x5 x3 x1 + psrad_i2r (ROW_SHIFT, mm7); // mm7 = y4 y5 - packssdw_r2r (mm0, mm1); /* mm1 = y3 y2 y1 y0 */ - movq_r2r (mm5, mm6); /* mm6 = x7 x5 x3 x1 */ + packssdw_r2r (mm0, mm1); // mm1 = y3 y2 y1 y0 + movq_r2r (mm5, mm6); // mm6 = x7 x5 x3 x1 - packssdw_r2r (mm3, mm7); /* mm7 = y6 y7 y4 y5 */ - movq_r2r (mm2, mm0); /* mm0 = x6 x4 x2 x0 */ + packssdw_r2r (mm3, mm7); // mm7 = y6 y7 y4 y5 + movq_r2r (mm2, mm0); // mm0 = x6 x4 x2 x0 - movq_r2m (mm1, *(row+store)); /* save y3 y2 y1 y0 */ - movq_r2r (mm7, mm1); /* mm1 = y6 y7 y4 y5 */ + movq_r2m (mm1, *(row+store)); // save y3 y2 y1 y0 + movq_r2r (mm7, mm1); // mm1 = y6 y7 y4 y5 - punpckldq_r2r (mm0, mm0); /* mm0 = x2 x0 x2 x0 */ - psrld_i2r (16, mm7); /* mm7 = 0 y6 0 y4 */ + punpckldq_r2r (mm0, mm0); // mm0 = x2 x0 x2 x0 + psrld_i2r (16, mm7); // mm7 = 0 y6 0 y4 - movq_m2r (*table, mm3); /* mm3 = C6 C4 C2 C4 */ - pslld_i2r (16, mm1); /* mm1 = y7 0 y5 0 */ + movq_m2r (*table, mm3); // mm3 = C6 C4 C2 C4 + pslld_i2r (16, mm1); // mm1 = y7 0 y5 0 - movq_m2r (*(table+4), mm4); /* mm4 = -C2 -C4 C6 C4 */ - por_r2r (mm1, mm7); /* mm7 = y7 y6 y5 y4 */ + movq_m2r (*(table+4), mm4); // mm4 = -C2 -C4 C6 C4 + por_r2r (mm1, mm7); // mm7 = y7 y6 y5 y4 - movq_m2r (*(table+8), mm1); /* mm1 = -C7 C3 C3 C1 */ - punpckhdq_r2r (mm2, mm2); /* mm2 = x6 x4 x6 x4 */ + movq_m2r (*(table+8), mm1); // mm1 = -C7 C3 C3 C1 + punpckhdq_r2r (mm2, mm2); // mm2 = x6 x4 x6 x4 - movq_r2m (mm7, *(row+store+4)); /* save y7 y6 y5 y4 */ - pmaddwd_r2r (mm0, mm3); /* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */ + movq_r2m (mm7, *(row+store+4)); // save y7 y6 y5 y4 + pmaddwd_r2r (mm0, mm3); // mm3 = C4*x0+C6*x2 C4*x0+C2*x2 } #if 0 -/* C column IDCT - its just here to document the MMXEXT and MMX versions */ +// C column IDCT - its just here to document the MMXEXT and MMX versions static inline void idct_col (int16_t * col, int offset) { /* multiplication - as implemented on mmx */ @@ -380,7 +386,7 @@ static inline void idct_col (int16_t * col, int offset) #endif -/* MMX column IDCT */ +// MMX column IDCT static inline void idct_col (int16_t * col, int offset) { #define T1 13036 @@ -396,131 +402,132 @@ static inline void idct_col (int16_t * col, int offset) /* column code adapted from peter gubanov */ /* http://www.elecard.com/peter/idct.shtml */ - movq_m2r (*_T1, mm0); /* mm0 = T1 */ - movq_m2r (*(col+offset+1*8), mm1); /* mm1 = x1 */ + movq_m2r (*_T1, mm0); // mm0 = T1 - movq_r2r (mm0, mm2); /* mm2 = T1 */ - movq_m2r (*(col+offset+7*8), mm4); /* mm4 = x7 */ + movq_m2r (*(col+offset+1*8), mm1); // mm1 = x1 + movq_r2r (mm0, mm2); // mm2 = T1 - pmulhw_r2r (mm1, mm0); /* mm0 = T1*x1 */ - movq_m2r (*_T3, mm5); /* mm5 = T3 */ + movq_m2r (*(col+offset+7*8), mm4); // mm4 = x7 + pmulhw_r2r (mm1, mm0); // mm0 = T1*x1 - pmulhw_r2r (mm4, mm2); /* mm2 = T1*x7 */ - movq_m2r (*(col+offset+5*8), mm6); /* mm6 = x5 */ + movq_m2r (*_T3, mm5); // mm5 = T3 + pmulhw_r2r (mm4, mm2); // mm2 = T1*x7 - movq_r2r (mm5, mm7); /* mm7 = T3-1 */ - movq_m2r (*(col+offset+3*8), mm3); /* mm3 = x3 */ + movq_m2r (*(col+offset+5*8), mm6); // mm6 = x5 + movq_r2r (mm5, mm7); // mm7 = T3-1 - psubsw_r2r (mm4, mm0); /* mm0 = v17 */ - movq_m2r (*_T2, mm4); /* mm4 = T2 */ + movq_m2r (*(col+offset+3*8), mm3); // mm3 = x3 + psubsw_r2r (mm4, mm0); // mm0 = v17 - pmulhw_r2r (mm3, mm5); /* mm5 = (T3-1)*x3 */ - paddsw_r2r (mm2, mm1); /* mm1 = u17 */ + movq_m2r (*_T2, mm4); // mm4 = T2 + pmulhw_r2r (mm3, mm5); // mm5 = (T3-1)*x3 - pmulhw_r2r (mm6, mm7); /* mm7 = (T3-1)*x5 */ + paddsw_r2r (mm2, mm1); // mm1 = u17 + pmulhw_r2r (mm6, mm7); // mm7 = (T3-1)*x5 /* slot */ - movq_r2r (mm4, mm2); /* mm2 = T2 */ - paddsw_r2r (mm3, mm5); /* mm5 = T3*x3 */ + movq_r2r (mm4, mm2); // mm2 = T2 + paddsw_r2r (mm3, mm5); // mm5 = T3*x3 + + pmulhw_m2r (*(col+offset+2*8), mm4);// mm4 = T2*x2 + paddsw_r2r (mm6, mm7); // mm7 = T3*x5 - pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */ - paddsw_r2r (mm6, mm7); /* mm7 = T3*x5 */ + psubsw_r2r (mm6, mm5); // mm5 = v35 + paddsw_r2r (mm3, mm7); // mm7 = u35 - psubsw_r2r (mm6, mm5); /* mm5 = v35 */ - paddsw_r2r (mm3, mm7); /* mm7 = u35 */ + movq_m2r (*(col+offset+6*8), mm3); // mm3 = x6 + movq_r2r (mm0, mm6); // mm6 = v17 - movq_m2r (*(col+offset+6*8), mm3); /* mm3 = x6 */ - movq_r2r (mm0, mm6); /* mm6 = v17 */ + pmulhw_r2r (mm3, mm2); // mm2 = T2*x6 + psubsw_r2r (mm5, mm0); // mm0 = b3 - pmulhw_r2r (mm3, mm2); /* mm2 = T2*x6 */ - psubsw_r2r (mm5, mm0); /* mm0 = b3 */ + psubsw_r2r (mm3, mm4); // mm4 = v26 + paddsw_r2r (mm6, mm5); // mm5 = v12 - psubsw_r2r (mm3, mm4); /* mm4 = v26 */ - paddsw_r2r (mm6, mm5); /* mm5 = v12 */ + movq_r2m (mm0, *(col+offset+3*8)); // save b3 in scratch0 + movq_r2r (mm1, mm6); // mm6 = u17 - movq_r2m (mm0, *(col+offset+3*8)); /* save b3 in scratch0 */ - movq_r2r (mm1, mm6); /* mm6 = u17 */ + paddsw_m2r (*(col+offset+2*8), mm2);// mm2 = u26 + paddsw_r2r (mm7, mm6); // mm6 = b0 - paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */ - paddsw_r2r (mm7, mm6); /* mm6 = b0 */ + psubsw_r2r (mm7, mm1); // mm1 = u12 + movq_r2r (mm1, mm7); // mm7 = u12 - psubsw_r2r (mm7, mm1); /* mm1 = u12 */ - movq_r2r (mm1, mm7); /* mm7 = u12 */ + movq_m2r (*(col+offset+0*8), mm3); // mm3 = x0 + paddsw_r2r (mm5, mm1); // mm1 = u12+v12 - movq_m2r (*(col+offset+0*8), mm3); /* mm3 = x0 */ - paddsw_r2r (mm5, mm1); /* mm1 = u12+v12 */ + movq_m2r (*_C4, mm0); // mm0 = C4/2 + psubsw_r2r (mm5, mm7); // mm7 = u12-v12 - movq_m2r (*_C4, mm0); /* mm0 = C4/2 */ - psubsw_r2r (mm5, mm7); /* mm7 = u12-v12 */ + movq_r2m (mm6, *(col+offset+5*8)); // save b0 in scratch1 + pmulhw_r2r (mm0, mm1); // mm1 = b1/2 - movq_r2m (mm6, *(col+offset+5*8)); /* save b0 in scratch1 */ - pmulhw_r2r (mm0, mm1); /* mm1 = b1/2 */ + movq_r2r (mm4, mm6); // mm6 = v26 + pmulhw_r2r (mm0, mm7); // mm7 = b2/2 - movq_r2r (mm4, mm6); /* mm6 = v26 */ - pmulhw_r2r (mm0, mm7); /* mm7 = b2/2 */ + movq_m2r (*(col+offset+4*8), mm5); // mm5 = x4 + movq_r2r (mm3, mm0); // mm0 = x0 - movq_m2r (*(col+offset+4*8), mm5); /* mm5 = x4 */ - movq_r2r (mm3, mm0); /* mm0 = x0 */ + psubsw_r2r (mm5, mm3); // mm3 = v04 + paddsw_r2r (mm5, mm0); // mm0 = u04 - psubsw_r2r (mm5, mm3); /* mm3 = v04 */ - paddsw_r2r (mm5, mm0); /* mm0 = u04 */ + paddsw_r2r (mm3, mm4); // mm4 = a1 + movq_r2r (mm0, mm5); // mm5 = u04 - paddsw_r2r (mm3, mm4); /* mm4 = a1 */ - movq_r2r (mm0, mm5); /* mm5 = u04 */ + psubsw_r2r (mm6, mm3); // mm3 = a2 + paddsw_r2r (mm2, mm5); // mm5 = a0 - psubsw_r2r (mm6, mm3); /* mm3 = a2 */ - paddsw_r2r (mm2, mm5); /* mm5 = a0 */ + paddsw_r2r (mm1, mm1); // mm1 = b1 + psubsw_r2r (mm2, mm0); // mm0 = a3 - paddsw_r2r (mm1, mm1); /* mm1 = b1 */ - psubsw_r2r (mm2, mm0); /* mm0 = a3 */ + paddsw_r2r (mm7, mm7); // mm7 = b2 + movq_r2r (mm3, mm2); // mm2 = a2 - paddsw_r2r (mm7, mm7); /* mm7 = b2 */ - movq_r2r (mm3, mm2); /* mm2 = a2 */ + movq_r2r (mm4, mm6); // mm6 = a1 + paddsw_r2r (mm7, mm3); // mm3 = a2+b2 - movq_r2r (mm4, mm6); /* mm6 = a1 */ - paddsw_r2r (mm7, mm3); /* mm3 = a2+b2 */ + psraw_i2r (COL_SHIFT, mm3); // mm3 = y2 + paddsw_r2r (mm1, mm4); // mm4 = a1+b1 - psraw_i2r (COL_SHIFT, mm3); /* mm3 = y2 */ - paddsw_r2r (mm1, mm4); /* mm4 = a1+b1 */ + psraw_i2r (COL_SHIFT, mm4); // mm4 = y1 + psubsw_r2r (mm1, mm6); // mm6 = a1-b1 - psraw_i2r (COL_SHIFT, mm4); /* mm4 = y1 */ - psubsw_r2r (mm1, mm6); /* mm6 = a1-b1 */ + movq_m2r (*(col+offset+5*8), mm1); // mm1 = b0 + psubsw_r2r (mm7, mm2); // mm2 = a2-b2 - movq_m2r (*(col+offset+5*8), mm1); /* mm1 = b0 */ - psubsw_r2r (mm7, mm2); /* mm2 = a2-b2 */ + psraw_i2r (COL_SHIFT, mm6); // mm6 = y6 + movq_r2r (mm5, mm7); // mm7 = a0 - psraw_i2r (COL_SHIFT, mm6); /* mm6 = y6 */ - movq_r2r (mm5, mm7); /* mm7 = a0 */ + movq_r2m (mm4, *(col+offset+1*8)); // save y1 + psraw_i2r (COL_SHIFT, mm2); // mm2 = y5 - movq_r2m (mm4, *(col+offset+1*8)); /* save y1 */ - psraw_i2r (COL_SHIFT, mm2); /* mm2 = y5 */ + movq_r2m (mm3, *(col+offset+2*8)); // save y2 + paddsw_r2r (mm1, mm5); // mm5 = a0+b0 - movq_r2m (mm3, *(col+offset+2*8)); /* save y2 */ - paddsw_r2r (mm1, mm5); /* mm5 = a0+b0 */ + movq_m2r (*(col+offset+3*8), mm4); // mm4 = b3 + psubsw_r2r (mm1, mm7); // mm7 = a0-b0 - movq_m2r (*(col+offset+3*8), mm4); /* mm4 = b3 */ - psubsw_r2r (mm1, mm7); /* mm7 = a0-b0 */ + psraw_i2r (COL_SHIFT, mm5); // mm5 = y0 + movq_r2r (mm0, mm3); // mm3 = a3 - psraw_i2r (COL_SHIFT, mm5); /* mm5 = y0 */ - movq_r2r (mm0, mm3); /* mm3 = a3 */ + movq_r2m (mm2, *(col+offset+5*8)); // save y5 + psubsw_r2r (mm4, mm3); // mm3 = a3-b3 - movq_r2m (mm2, *(col+offset+5*8)); /* save y5 */ - psubsw_r2r (mm4, mm3); /* mm3 = a3-b3 */ + psraw_i2r (COL_SHIFT, mm7); // mm7 = y7 + paddsw_r2r (mm0, mm4); // mm4 = a3+b3 - psraw_i2r (COL_SHIFT, mm7); /* mm7 = y7 */ - paddsw_r2r (mm0, mm4); /* mm4 = a3+b3 */ + movq_r2m (mm5, *(col+offset+0*8)); // save y0 + psraw_i2r (COL_SHIFT, mm3); // mm3 = y4 - movq_r2m (mm5, *(col+offset+0*8)); /* save y0 */ - psraw_i2r (COL_SHIFT, mm3); /* mm3 = y4 */ + movq_r2m (mm6, *(col+offset+6*8)); // save y6 + psraw_i2r (COL_SHIFT, mm4); // mm4 = y3 - movq_r2m (mm6, *(col+offset+6*8)); /* save y6 */ - psraw_i2r (COL_SHIFT, mm4); /* mm4 = y3 */ + movq_r2m (mm7, *(col+offset+7*8)); // save y7 - movq_r2m (mm7, *(col+offset+7*8)); /* save y7 */ - movq_r2m (mm3, *(col+offset+4*8)); /* save y4 */ + movq_r2m (mm3, *(col+offset+4*8)); // save y4 - movq_r2m (mm4, *(col+offset+3*8)); /* save y3 */ + movq_r2m (mm4, *(col+offset+3*8)); // save y3 #undef T1 #undef T2 diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c index 9a29df49a..9b76cdb07 100644 --- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c +++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c @@ -31,7 +31,7 @@ static __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL; static inline void sad8_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h) { int len= -(stride<pict_type == B_TYPE) { - ____asm____ volatile( - "movl (%1), %%eax\n\t" - "movl 4(%1), %%edx\n\t" - "movl 8(%1), %%ecx\n\t" - "movl %%eax, (%0)\n\t" - "movl %%edx, 4(%0)\n\t" - "movl %%ecx, 8(%0)\n\t" - : - :"r"(s->current_picture), "r"(s->aux_picture) - :"eax","edx","ecx","memory"); - } else { - /* swap next and last */ - ____asm____ volatile( - "movl (%1), %%eax\n\t" - "movl 4(%1), %%edx\n\t" - "movl 8(%1), %%ecx\n\t" - "xchgl (%0), %%eax\n\t" - "xchgl 4(%0), %%edx\n\t" - "xchgl 8(%0), %%ecx\n\t" - "movl %%eax, (%1)\n\t" - "movl %%edx, 4(%1)\n\t" - "movl %%ecx, 8(%1)\n\t" - "movl %%eax, (%2)\n\t" - "movl %%edx, 4(%2)\n\t" - "movl %%ecx, 8(%2)\n\t" - : - :"r"(s->last_picture), "r"(s->next_picture), "r"(s->current_picture) - :"eax","edx","ecx","memory"); - } -} -#endif - static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; @@ -77,85 +36,77 @@ static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x000 static void dct_unquantize_h263_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { - int i, level, qmul, qadd, nCoeffs; + int level, qmul, qadd, nCoeffs; - qmul = s->qscale << 1; - if (s->h263_aic && s->mb_intra) - qadd = 0; - else - qadd = (s->qscale - 1) | 1; + qmul = qscale << 1; + qadd = (qscale - 1) | 1; + assert(s->block_last_index[n]>=0); + if (s->mb_intra) { if (!s->h263_aic) { if (n < 4) - block[0] = block[0] * s->y_dc_scale; + level = block[0] * s->y_dc_scale; else - block[0] = block[0] * s->c_dc_scale; - } - for(i=1; i<8; i++) { - level = block[i]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; - } - block[i] = level; - } + level = block[0] * s->c_dc_scale; + }else{ + qadd = 0; + level= block[0]; } - nCoeffs=64; + nCoeffs=63; } else { - i = 0; - nCoeffs= zigzag_end[ s->block_last_index[n] ]; + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; } -/* printf("%d %d ", qmul, qadd); */ -__asm__ volatile( - "movd %1, %%mm6 \n\t" /* qmul */ - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "movd %2, %%mm5 \n\t" /* qadd */ - "pxor %%mm7, %%mm7 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - "packssdw %%mm5, %%mm5 \n\t" - "psubw %%mm5, %%mm7 \n\t" - "pxor %%mm4, %%mm4 \n\t" - ".balign 16\n\t" - "1: \n\t" - "movq (%0, %3), %%mm0 \n\t" - "movq 8(%0, %3), %%mm1 \n\t" - - "pmullw %%mm6, %%mm0 \n\t" - "pmullw %%mm6, %%mm1 \n\t" - - "movq (%0, %3), %%mm2 \n\t" - "movq 8(%0, %3), %%mm3 \n\t" - - "pcmpgtw %%mm4, %%mm2 \n\t" /* block[i] < 0 ? -1 : 0 */ - "pcmpgtw %%mm4, %%mm3 \n\t" /* block[i] < 0 ? -1 : 0 */ - - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - - "paddw %%mm7, %%mm0 \n\t" - "paddw %%mm7, %%mm1 \n\t" - - "pxor %%mm0, %%mm2 \n\t" - "pxor %%mm1, %%mm3 \n\t" - - "pcmpeqw %%mm7, %%mm0 \n\t" /* block[i] == 0 ? -1 : 0 */ - "pcmpeqw %%mm7, %%mm1 \n\t" /* block[i] == 0 ? -1 : 0 */ - - "pandn %%mm2, %%mm0 \n\t" - "pandn %%mm3, %%mm1 \n\t" - - "movq %%mm0, (%0, %3) \n\t" - "movq %%mm1, 8(%0, %3) \n\t" - - "addl $16, %3 \n\t" - "js 1b \n\t" - ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(i-nCoeffs)) - : "memory" +//printf("%d %d ", qmul, qadd); +asm volatile( + "movd %1, %%mm6 \n\t" //qmul + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "movd %2, %%mm5 \n\t" //qadd + "pxor %%mm7, %%mm7 \n\t" + "packssdw %%mm5, %%mm5 \n\t" + "packssdw %%mm5, %%mm5 \n\t" + "psubw %%mm5, %%mm7 \n\t" + "pxor %%mm4, %%mm4 \n\t" + ".balign 16\n\t" + "1: \n\t" + "movq (%0, %3), %%mm0 \n\t" + "movq 8(%0, %3), %%mm1 \n\t" + + "pmullw %%mm6, %%mm0 \n\t" + "pmullw %%mm6, %%mm1 \n\t" + + "movq (%0, %3), %%mm2 \n\t" + "movq 8(%0, %3), %%mm3 \n\t" + + "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + + "paddw %%mm7, %%mm0 \n\t" + "paddw %%mm7, %%mm1 \n\t" + + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + + "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 + + "pandn %%mm2, %%mm0 \n\t" + "pandn %%mm3, %%mm1 \n\t" + + "movq %%mm0, (%0, %3) \n\t" + "movq %%mm1, 8(%0, %3) \n\t" + + "addl $16, %3 \n\t" + "jng 1b \n\t" + ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) + : "memory" ); + if(s->mb_intra) + block[0]= level; } @@ -194,124 +145,126 @@ static void dct_unquantize_mpeg1_mmx(MpegEncContext *s, int nCoeffs; const UINT16 *quant_matrix; - if(s->alternate_scan) nCoeffs= 64; - else nCoeffs= zigzag_end[ s->block_last_index[n] ]; + assert(s->block_last_index[n]>=0); + + nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; if (s->mb_intra) { int block0; - if (n < 4) + if (n < 4) block0 = block[0] * s->y_dc_scale; else block0 = block[0] * s->c_dc_scale; /* XXX: only mpeg1 */ quant_matrix = s->intra_matrix; -__asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $15, %%mm7 \n\t" - "movd %2, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" +asm volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $15, %%mm7 \n\t" + "movd %2, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" "movl %3, %%eax \n\t" - ".balign 16\n\t" - "1: \n\t" - "movq (%0, %%eax), %%mm0 \n\t" - "movq 8(%0, %%eax), %%mm1 \n\t" - "movq (%1, %%eax), %%mm4 \n\t" - "movq 8(%1, %%eax), %%mm5 \n\t" - "pmullw %%mm6, %%mm4 \n\t" /* q=qscale*quant_matrix[i] */ - "pmullw %%mm6, %%mm5 \n\t" /* q=qscale*quant_matrix[i] */ - "pxor %%mm2, %%mm2 \n\t" - "pxor %%mm3, %%mm3 \n\t" - "pcmpgtw %%mm0, %%mm2 \n\t" /* block[i] < 0 ? -1 : 0 */ - "pcmpgtw %%mm1, %%mm3 \n\t" /* block[i] < 0 ? -1 : 0 */ - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" /* abs(block[i]) */ - "psubw %%mm3, %%mm1 \n\t" /* abs(block[i]) */ - "pmullw %%mm4, %%mm0 \n\t" /* abs(block[i])*q */ - "pmullw %%mm5, %%mm1 \n\t" /* abs(block[i])*q */ - "pxor %%mm4, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" /* FIXME slow */ - "pcmpeqw (%0, %%eax), %%mm4 \n\t" /* block[i] == 0 ? -1 : 0 */ - "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" /* block[i] == 0 ? -1 : 0 */ - "psraw $3, %%mm0 \n\t" - "psraw $3, %%mm1 \n\t" - "psubw %%mm7, %%mm0 \n\t" - "psubw %%mm7, %%mm1 \n\t" - "por %%mm7, %%mm0 \n\t" - "por %%mm7, %%mm1 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "pandn %%mm0, %%mm4 \n\t" - "pandn %%mm1, %%mm5 \n\t" - "movq %%mm4, (%0, %%eax) \n\t" - "movq %%mm5, 8(%0, %%eax) \n\t" - - "addl $16, %%eax \n\t" - "js 1b \n\t" - ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) - : "%eax", "memory" - ); + ".balign 16\n\t" + "1: \n\t" + "movq (%0, %%eax), %%mm0 \n\t" + "movq 8(%0, %%eax), %%mm1 \n\t" + "movq (%1, %%eax), %%mm4 \n\t" + "movq 8(%1, %%eax), %%mm5 \n\t" + "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] + "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] + "pxor %%mm2, %%mm2 \n\t" + "pxor %%mm3, %%mm3 \n\t" + "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) + "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) + "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q + "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" // FIXME slow + "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 + "psraw $3, %%mm0 \n\t" + "psraw $3, %%mm1 \n\t" + "psubw %%mm7, %%mm0 \n\t" + "psubw %%mm7, %%mm1 \n\t" + "por %%mm7, %%mm0 \n\t" + "por %%mm7, %%mm1 \n\t" + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "pandn %%mm0, %%mm4 \n\t" + "pandn %%mm1, %%mm5 \n\t" + "movq %%mm4, (%0, %%eax) \n\t" + "movq %%mm5, 8(%0, %%eax) \n\t" + + "addl $16, %%eax \n\t" + "js 1b \n\t" + ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) + : "%eax", "memory" + ); block[0]= block0; } else { quant_matrix = s->inter_matrix; -__asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $15, %%mm7 \n\t" - "movd %2, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" +asm volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $15, %%mm7 \n\t" + "movd %2, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" "movl %3, %%eax \n\t" - ".balign 16\n\t" - "1: \n\t" - "movq (%0, %%eax), %%mm0 \n\t" - "movq 8(%0, %%eax), %%mm1 \n\t" - "movq (%1, %%eax), %%mm4 \n\t" - "movq 8(%1, %%eax), %%mm5 \n\t" - "pmullw %%mm6, %%mm4 \n\t" /* q=qscale*quant_matrix[i] */ - "pmullw %%mm6, %%mm5 \n\t" /* q=qscale*quant_matrix[i] */ - "pxor %%mm2, %%mm2 \n\t" - "pxor %%mm3, %%mm3 \n\t" - "pcmpgtw %%mm0, %%mm2 \n\t" /* block[i] < 0 ? -1 : 0 */ - "pcmpgtw %%mm1, %%mm3 \n\t" /* block[i] < 0 ? -1 : 0 */ - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" /* abs(block[i]) */ - "psubw %%mm3, %%mm1 \n\t" /* abs(block[i]) */ - "paddw %%mm0, %%mm0 \n\t" /* abs(block[i])*2 */ - "paddw %%mm1, %%mm1 \n\t" /* abs(block[i])*2 */ - "paddw %%mm7, %%mm0 \n\t" /* abs(block[i])*2 + 1 */ - "paddw %%mm7, %%mm1 \n\t" /* abs(block[i])*2 + 1 */ - "pmullw %%mm4, %%mm0 \n\t" /* (abs(block[i])*2 + 1)*q */ - "pmullw %%mm5, %%mm1 \n\t" /* (abs(block[i])*2 + 1)*q */ - "pxor %%mm4, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" /* FIXME slow */ - "pcmpeqw (%0, %%eax), %%mm4 \n\t" /* block[i] == 0 ? -1 : 0 */ - "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" /* block[i] == 0 ? -1 : 0 */ - "psraw $4, %%mm0 \n\t" - "psraw $4, %%mm1 \n\t" - "psubw %%mm7, %%mm0 \n\t" - "psubw %%mm7, %%mm1 \n\t" - "por %%mm7, %%mm0 \n\t" - "por %%mm7, %%mm1 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "pandn %%mm0, %%mm4 \n\t" - "pandn %%mm1, %%mm5 \n\t" - "movq %%mm4, (%0, %%eax) \n\t" - "movq %%mm5, 8(%0, %%eax) \n\t" - - "addl $16, %%eax \n\t" - "js 1b \n\t" - ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) - : "%eax", "memory" + ".balign 16\n\t" + "1: \n\t" + "movq (%0, %%eax), %%mm0 \n\t" + "movq 8(%0, %%eax), %%mm1 \n\t" + "movq (%1, %%eax), %%mm4 \n\t" + "movq 8(%1, %%eax), %%mm5 \n\t" + "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] + "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] + "pxor %%mm2, %%mm2 \n\t" + "pxor %%mm3, %%mm3 \n\t" + "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) + "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) + "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 + "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 + "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 + "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 + "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q + "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" // FIXME slow + "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 + "psraw $4, %%mm0 \n\t" + "psraw $4, %%mm1 \n\t" + "psubw %%mm7, %%mm0 \n\t" + "psubw %%mm7, %%mm1 \n\t" + "por %%mm7, %%mm0 \n\t" + "por %%mm7, %%mm1 \n\t" + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "pandn %%mm0, %%mm4 \n\t" + "pandn %%mm1, %%mm5 \n\t" + "movq %%mm4, (%0, %%eax) \n\t" + "movq %%mm5, 8(%0, %%eax) \n\t" + + "addl $16, %%eax \n\t" + "js 1b \n\t" + ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) + : "%eax", "memory" ); } + } static void dct_unquantize_mpeg2_mmx(MpegEncContext *s, @@ -319,115 +272,117 @@ static void dct_unquantize_mpeg2_mmx(MpegEncContext *s, { int nCoeffs; const UINT16 *quant_matrix; + + assert(s->block_last_index[n]>=0); - if(s->alternate_scan) nCoeffs= 64; - else nCoeffs= zigzag_end[ s->block_last_index[n] ]; + if(s->alternate_scan) nCoeffs= 63; //FIXME + else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; if (s->mb_intra) { int block0; - if (n < 4) + if (n < 4) block0 = block[0] * s->y_dc_scale; else block0 = block[0] * s->c_dc_scale; quant_matrix = s->intra_matrix; -__asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" - "psrlw $15, %%mm7 \n\t" - "movd %2, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" +asm volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" + "psrlw $15, %%mm7 \n\t" + "movd %2, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" "movl %3, %%eax \n\t" - ".balign 16\n\t" - "1: \n\t" - "movq (%0, %%eax), %%mm0 \n\t" - "movq 8(%0, %%eax), %%mm1 \n\t" - "movq (%1, %%eax), %%mm4 \n\t" - "movq 8(%1, %%eax), %%mm5 \n\t" - "pmullw %%mm6, %%mm4 \n\t" /* q=qscale*quant_matrix[i] */ - "pmullw %%mm6, %%mm5 \n\t" /* q=qscale*quant_matrix[i] */ - "pxor %%mm2, %%mm2 \n\t" - "pxor %%mm3, %%mm3 \n\t" - "pcmpgtw %%mm0, %%mm2 \n\t" /* block[i] < 0 ? -1 : 0 */ - "pcmpgtw %%mm1, %%mm3 \n\t" /* block[i] < 0 ? -1 : 0 */ - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" /* abs(block[i]) */ - "psubw %%mm3, %%mm1 \n\t" /* abs(block[i]) */ - "pmullw %%mm4, %%mm0 \n\t" /* abs(block[i])*q */ - "pmullw %%mm5, %%mm1 \n\t" /* abs(block[i])*q */ - "pxor %%mm4, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" /* FIXME slow */ - "pcmpeqw (%0, %%eax), %%mm4 \n\t" /* block[i] == 0 ? -1 : 0 */ - "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" /* block[i] == 0 ? -1 : 0 */ - "psraw $3, %%mm0 \n\t" - "psraw $3, %%mm1 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "pandn %%mm0, %%mm4 \n\t" - "pandn %%mm1, %%mm5 \n\t" - "movq %%mm4, (%0, %%eax) \n\t" - "movq %%mm5, 8(%0, %%eax) \n\t" - - "addl $16, %%eax \n\t" - "js 1b \n\t" - ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) - : "%eax", "memory" - ); + ".balign 16\n\t" + "1: \n\t" + "movq (%0, %%eax), %%mm0 \n\t" + "movq 8(%0, %%eax), %%mm1 \n\t" + "movq (%1, %%eax), %%mm4 \n\t" + "movq 8(%1, %%eax), %%mm5 \n\t" + "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] + "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] + "pxor %%mm2, %%mm2 \n\t" + "pxor %%mm3, %%mm3 \n\t" + "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) + "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) + "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q + "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" // FIXME slow + "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 + "psraw $3, %%mm0 \n\t" + "psraw $3, %%mm1 \n\t" + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "pandn %%mm0, %%mm4 \n\t" + "pandn %%mm1, %%mm5 \n\t" + "movq %%mm4, (%0, %%eax) \n\t" + "movq %%mm5, 8(%0, %%eax) \n\t" + + "addl $16, %%eax \n\t" + "jng 1b \n\t" + ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) + : "%eax", "memory" + ); block[0]= block0; - /* Note, we dont do mismatch control for intra as errors cannot accumulate */ + //Note, we dont do mismatch control for intra as errors cannot accumulate } else { quant_matrix = s->inter_matrix; -__asm__ volatile( - "pcmpeqw %%mm7, %%mm7 \n\t" +asm volatile( + "pcmpeqw %%mm7, %%mm7 \n\t" "psrlq $48, %%mm7 \n\t" - "movd %2, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" - "packssdw %%mm6, %%mm6 \n\t" + "movd %2, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" "movl %3, %%eax \n\t" - ".balign 16\n\t" - "1: \n\t" - "movq (%0, %%eax), %%mm0 \n\t" - "movq 8(%0, %%eax), %%mm1 \n\t" - "movq (%1, %%eax), %%mm4 \n\t" - "movq 8(%1, %%eax), %%mm5 \n\t" - "pmullw %%mm6, %%mm4 \n\t" /* q=qscale*quant_matrix[i] */ - "pmullw %%mm6, %%mm5 \n\t" /* q=qscale*quant_matrix[i] */ - "pxor %%mm2, %%mm2 \n\t" - "pxor %%mm3, %%mm3 \n\t" - "pcmpgtw %%mm0, %%mm2 \n\t" /* block[i] < 0 ? -1 : 0 */ - "pcmpgtw %%mm1, %%mm3 \n\t" /* block[i] < 0 ? -1 : 0 */ - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" /* abs(block[i]) */ - "psubw %%mm3, %%mm1 \n\t" /* abs(block[i]) */ - "paddw %%mm0, %%mm0 \n\t" /* abs(block[i])*2 */ - "paddw %%mm1, %%mm1 \n\t" /* abs(block[i])*2 */ - "pmullw %%mm4, %%mm0 \n\t" /* abs(block[i])*2*q */ - "pmullw %%mm5, %%mm1 \n\t" /* abs(block[i])*2*q */ - "paddw %%mm4, %%mm0 \n\t" /* (abs(block[i])*2 + 1)*q */ - "paddw %%mm5, %%mm1 \n\t" /* (abs(block[i])*2 + 1)*q */ - "pxor %%mm4, %%mm4 \n\t" - "pxor %%mm5, %%mm5 \n\t" /* FIXME slow */ - "pcmpeqw (%0, %%eax), %%mm4 \n\t" /* block[i] == 0 ? -1 : 0 */ - "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" /* block[i] == 0 ? -1 : 0 */ - "psrlw $4, %%mm0 \n\t" - "psrlw $4, %%mm1 \n\t" - "pxor %%mm2, %%mm0 \n\t" - "pxor %%mm3, %%mm1 \n\t" - "psubw %%mm2, %%mm0 \n\t" - "psubw %%mm3, %%mm1 \n\t" - "pandn %%mm0, %%mm4 \n\t" - "pandn %%mm1, %%mm5 \n\t" + ".balign 16\n\t" + "1: \n\t" + "movq (%0, %%eax), %%mm0 \n\t" + "movq 8(%0, %%eax), %%mm1 \n\t" + "movq (%1, %%eax), %%mm4 \n\t" + "movq 8(%1, %%eax), %%mm5 \n\t" + "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] + "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] + "pxor %%mm2, %%mm2 \n\t" + "pxor %%mm3, %%mm3 \n\t" + "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) + "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) + "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 + "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 + "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q + "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q + "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q + "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q + "pxor %%mm4, %%mm4 \n\t" + "pxor %%mm5, %%mm5 \n\t" // FIXME slow + "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0 + "psrlw $4, %%mm0 \n\t" + "psrlw $4, %%mm1 \n\t" + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + "psubw %%mm2, %%mm0 \n\t" + "psubw %%mm3, %%mm1 \n\t" + "pandn %%mm0, %%mm4 \n\t" + "pandn %%mm1, %%mm5 \n\t" "pxor %%mm4, %%mm7 \n\t" "pxor %%mm5, %%mm7 \n\t" - "movq %%mm4, (%0, %%eax) \n\t" - "movq %%mm5, 8(%0, %%eax) \n\t" + "movq %%mm4, (%0, %%eax) \n\t" + "movq %%mm5, 8(%0, %%eax) \n\t" - "addl $16, %%eax \n\t" - "js 1b \n\t" + "addl $16, %%eax \n\t" + "jng 1b \n\t" "movd 124(%0, %3), %%mm0 \n\t" "movq %%mm7, %%mm6 \n\t" "psrlq $32, %%mm7 \n\t" @@ -439,14 +394,14 @@ __asm__ volatile( "psrlq $15, %%mm7 \n\t" "pxor %%mm7, %%mm0 \n\t" "movd %%mm0, 124(%0, %3) \n\t" - - ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) - : "%eax", "memory" + + ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) + : "%eax", "memory" ); } } -/* draw the edges of width 'w' of an image of size width, height +/* draw the edges of width 'w' of an image of size width, height this mmx version can only handle w==8 || w==16 */ static void draw_edges_mmx(UINT8 *buf, int wrap, int width, int height, int w) { @@ -458,88 +413,82 @@ static void draw_edges_mmx(UINT8 *buf, int wrap, int width, int height, int w) ptr = buf; if(w==8) { - __asm__ volatile( - "1: \n\t" - "movd (%0), %%mm0 \n\t" - "punpcklbw %%mm0, %%mm0 \n\t" - "punpcklwd %%mm0, %%mm0 \n\t" - "punpckldq %%mm0, %%mm0 \n\t" - "movq %%mm0, -8(%0) \n\t" - "movq -8(%0, %2), %%mm1 \n\t" - "punpckhbw %%mm1, %%mm1 \n\t" - "punpckhwd %%mm1, %%mm1 \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movq %%mm1, (%0, %2) \n\t" - "addl %1, %0 \n\t" - "cmpl %3, %0 \n\t" - " jb 1b \n\t" - : "+r" (ptr) - : "r" (wrap), "r" (width), "r" (ptr + wrap*height) + asm volatile( + "1: \n\t" + "movd (%0), %%mm0 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" + "punpcklwd %%mm0, %%mm0 \n\t" + "punpckldq %%mm0, %%mm0 \n\t" + "movq %%mm0, -8(%0) \n\t" + "movq -8(%0, %2), %%mm1 \n\t" + "punpckhbw %%mm1, %%mm1 \n\t" + "punpckhwd %%mm1, %%mm1 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movq %%mm1, (%0, %2) \n\t" + "addl %1, %0 \n\t" + "cmpl %3, %0 \n\t" + " jb 1b \n\t" + : "+r" (ptr) + : "r" (wrap), "r" (width), "r" (ptr + wrap*height) ); } else { - __asm__ volatile( - "1: \n\t" - "movd (%0), %%mm0 \n\t" - "punpcklbw %%mm0, %%mm0 \n\t" - "punpcklwd %%mm0, %%mm0 \n\t" - "punpckldq %%mm0, %%mm0 \n\t" - "movq %%mm0, -8(%0) \n\t" - "movq %%mm0, -16(%0) \n\t" - "movq -8(%0, %2), %%mm1 \n\t" - "punpckhbw %%mm1, %%mm1 \n\t" - "punpckhwd %%mm1, %%mm1 \n\t" - "punpckhdq %%mm1, %%mm1 \n\t" - "movq %%mm1, (%0, %2) \n\t" - "movq %%mm1, 8(%0, %2) \n\t" - "addl %1, %0 \n\t" - "cmpl %3, %0 \n\t" - " jb 1b \n\t" - : "+r" (ptr) - : "r" (wrap), "r" (width), "r" (ptr + wrap*height) + asm volatile( + "1: \n\t" + "movd (%0), %%mm0 \n\t" + "punpcklbw %%mm0, %%mm0 \n\t" + "punpcklwd %%mm0, %%mm0 \n\t" + "punpckldq %%mm0, %%mm0 \n\t" + "movq %%mm0, -8(%0) \n\t" + "movq %%mm0, -16(%0) \n\t" + "movq -8(%0, %2), %%mm1 \n\t" + "punpckhbw %%mm1, %%mm1 \n\t" + "punpckhwd %%mm1, %%mm1 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movq %%mm1, (%0, %2) \n\t" + "movq %%mm1, 8(%0, %2) \n\t" + "addl %1, %0 \n\t" + "cmpl %3, %0 \n\t" + " jb 1b \n\t" + : "+r" (ptr) + : "r" (wrap), "r" (width), "r" (ptr + wrap*height) ); } - + for(i=0;iavctx->dct_algo; + int i; + const int dct_algo = s->avctx->dct_algo; + const int idct_algo= s->avctx->idct_algo; + s->dct_unquantize_h263 = dct_unquantize_h263_mmx; s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx; s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx; @@ -560,7 +539,7 @@ void MPV_common_init_mmx(MpegEncContext *s) draw_edges = draw_edges_mmx; if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ - s->fdct = fdct_mmx; + s->fdct = ff_fdct_mmx; if(mm_flags & MM_MMXEXT){ s->dct_quantize= dct_quantize_MMX2; @@ -568,5 +547,20 @@ void MPV_common_init_mmx(MpegEncContext *s) s->dct_quantize= dct_quantize_MMX; } } + + if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ + s->idct_put= ff_simple_idct_put_mmx; + s->idct_add= ff_simple_idct_add_mmx; + s->idct_permutation_type= FF_SIMPLE_IDCT_PERM; + }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ + if(mm_flags & MM_MMXEXT){ + s->idct_put= ff_libmpeg2mmx2_idct_put; + s->idct_add= ff_libmpeg2mmx2_idct_add; + }else{ + s->idct_put= ff_libmpeg2mmx_idct_put; + s->idct_add= ff_libmpeg2mmx_idct_add; + } + s->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; + } } } diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c index f10014837..799ff1666 100644 --- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c @@ -36,12 +36,12 @@ static int RENAME(dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow) { - int level=0, last_non_zero_p1, q; /* =0 is cuz gcc says uninitalized ... */ + int level=0, last_non_zero_p1, q; //=0 is cuz gcc says uninitalized ... const UINT16 *qmat, *bias; static __align8 INT16 temp_block[64]; - /* s->fdct (block); */ - fdct_mmx (block); /* cant be anything else ... */ + //s->fdct (block); + ff_fdct_mmx (block); //cant be anything else ... if (s->mb_intra) { int dummy; @@ -52,14 +52,14 @@ static int RENAME(dct_quantize)(MpegEncContext *s, /* note: block[0] is assumed to be positive */ if (!s->h263_aic) { #if 1 - __asm__ volatile ( + asm volatile ( "xorl %%edx, %%edx \n\t" "mul %%ecx \n\t" : "=d" (level), "=a"(dummy) : "a" ((block[0]>>2) + q), "c" (inverse[q<<1]) ); #else - __asm__ volatile ( + asm volatile ( "xorl %%edx, %%edx \n\t" "divw %%cx \n\t" "movzwl %%ax, %%eax \n\t" @@ -71,9 +71,9 @@ static int RENAME(dct_quantize)(MpegEncContext *s, } else /* For AIC we skip quant/dequant of INTRADC */ level = (block[0] + 4)>>3; - - block[0]=0; /* avoid fake overflow */ -/* temp_block[0] = (block[0] + (q >> 1)) / q; */ + + block[0]=0; //avoid fake overflow +// temp_block[0] = (block[0] + (q >> 1)) / q; last_non_zero_p1 = 1; bias = s->q_intra_matrix16_bias[qscale]; qmat = s->q_intra_matrix16[qscale]; @@ -89,36 +89,36 @@ static int RENAME(dct_quantize)(MpegEncContext *s, to enable -fpic compilation. this patch has not been accepted on main ffmpeg cvs. */ - __asm__ volatile( - "movd %%eax, %%mm3 \n\t" /* last_non_zero_p1 */ + asm volatile( + "movd %%eax, %%mm3 \n\t" // last_non_zero_p1 SPREADW(%%mm3) - "pxor %%mm7, %%mm7 \n\t" /* 0 */ - "pxor %%mm4, %%mm4 \n\t" /* 0 */ - "movq (%1), %%mm5 \n\t" /* qmat[0] */ + "pxor %%mm7, %%mm7 \n\t" // 0 + "pxor %%mm4, %%mm4 \n\t" // 0 + "movq (%1), %%mm5 \n\t" // qmat[0] "pxor %%mm6, %%mm6 \n\t" - "psubw (%2), %%mm6 \n\t" /* -bias[0] */ + "psubw (%2), %%mm6 \n\t" // -bias[0] "movl $-128, %%eax \n\t" : "+a" (last_non_zero_p1) : "r" (qmat), "r" (bias) ); /* CORE */ - __asm__ volatile( + asm volatile( ".balign 16 \n\t" "1: \n\t" - "pxor %%mm1, %%mm1 \n\t" /* 0 */ - "movq (%1, %%eax), %%mm0 \n\t" /* block[i] */ - "pcmpgtw %%mm0, %%mm1 \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */ - "pxor %%mm1, %%mm0 \n\t" - "psubw %%mm1, %%mm0 \n\t" /* ABS(block[i]) */ - "psubusw %%mm6, %%mm0 \n\t" /* ABS(block[i]) + bias[0] */ - "pmulhw %%mm5, %%mm0 \n\t" /* (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 */ - "por %%mm0, %%mm4 \n\t" - "pxor %%mm1, %%mm0 \n\t" - "psubw %%mm1, %%mm0 \n\t" /* out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) */ + "pxor %%mm1, %%mm1 \n\t" // 0 + "movq (%1, %%eax), %%mm0 \n\t" // block[i] + "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 + "pxor %%mm1, %%mm0 \n\t" + "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) + "psubusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] + "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 + "por %%mm0, %%mm4 \n\t" + "pxor %%mm1, %%mm0 \n\t" + "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) "movq %%mm0, (%3, %%eax) \n\t" - "pcmpeqw %%mm7, %%mm0 \n\t" /* out==0 ? 0xFF : 0x00 */ - "movq (%4, %%eax), %%mm1 \n\t" - "movq %%mm7, (%1, %%eax) \n\t" /* 0 */ + "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 + "movq (%4, %%eax), %%mm1 \n\t" + "movq %%mm7, (%1, %%eax) \n\t" // 0 "pandn %%mm1, %%mm0 \n\t" PMAXW(%%mm0, %%mm3) "addl $8, %%eax \n\t" @@ -127,7 +127,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, : "r" (block+64), "r" (inv_zigzag_direct16+64), "r" (temp_block+64) ); /* EPILOGUE */ - __asm__ volatile( + asm volatile( "movq %%mm3, %%mm0 \n\t" "psrlq $32, %%mm3 \n\t" PMAXW(%%mm0, %%mm3) @@ -135,44 +135,44 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "psrlq $16, %%mm3 \n\t" PMAXW(%%mm0, %%mm3) "movd %%mm3, %%eax \n\t" - "movzbl %%al, %%eax \n\t" /* last_non_zero_p1 */ - "movd %2, %%mm1 \n\t" /* max_qcoeff */ + "movzbl %%al, %%eax \n\t" // last_non_zero_p1 + "movd %2, %%mm1 \n\t" // max_qcoeff SPREADW(%%mm1) - "psubusw %%mm1, %%mm4 \n\t" + "psubusw %%mm1, %%mm4 \n\t" "packuswb %%mm4, %%mm4 \n\t" - "movd %%mm4, %1 \n\t" /* *overflow */ + "movd %%mm4, %1 \n\t" // *overflow : "+a" (last_non_zero_p1), "=r" (*overflow) : "r" (s->max_qcoeff) ); - }else{ /* FMT_H263 */ - __asm__ volatile( + }else{ // FMT_H263 + asm volatile( "pushl %%ebp \n\t" "pushl %%ebx \n\t" "movl %0, %%ebp \n\t" "movl (%%ebp), %%ebx \n\t" - "movd %%ebx, %%mm3 \n\t" /* last_non_zero_p1 */ + "movd %%ebx, %%mm3 \n\t" // last_non_zero_p1 SPREADW(%%mm3) - "pxor %%mm7, %%mm7 \n\t" /* 0 */ - "pxor %%mm4, %%mm4 \n\t" /* 0 */ + "pxor %%mm7, %%mm7 \n\t" // 0 + "pxor %%mm4, %%mm4 \n\t" // 0 "movl $-128, %%ebx \n\t" ".balign 16 \n\t" "1: \n\t" - "pxor %%mm1, %%mm1 \n\t" /* 0 */ - "movq (%1, %%ebx), %%mm0 \n\t" /* block[i] */ - "pcmpgtw %%mm0, %%mm1 \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */ - "pxor %%mm1, %%mm0 \n\t" - "psubw %%mm1, %%mm0 \n\t" /* ABS(block[i]) */ - "movq (%3, %%ebx), %%mm6 \n\t" /* bias[0] */ - "paddusw %%mm6, %%mm0 \n\t" /* ABS(block[i]) + bias[0] */ - "movq (%2, %%ebx), %%mm5 \n\t" /* qmat[i] */ - "pmulhw %%mm5, %%mm0 \n\t" /* (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 */ - "por %%mm0, %%mm4 \n\t" - "pxor %%mm1, %%mm0 \n\t" - "psubw %%mm1, %%mm0 \n\t" /* out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) */ + "pxor %%mm1, %%mm1 \n\t" // 0 + "movq (%1, %%ebx), %%mm0 \n\t" // block[i] + "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 + "pxor %%mm1, %%mm0 \n\t" + "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) + "movq (%3, %%ebx), %%mm6 \n\t" // bias[0] + "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0] + "movq (%2, %%ebx), %%mm5 \n\t" // qmat[i] + "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 + "por %%mm0, %%mm4 \n\t" + "pxor %%mm1, %%mm0 \n\t" + "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) "movq %%mm0, (%5, %%ebx) \n\t" - "pcmpeqw %%mm7, %%mm0 \n\t" /* out==0 ? 0xFF : 0x00 */ - "movq (%4, %%ebx), %%mm1 \n\t" - "movq %%mm7, (%1, %%ebx) \n\t" /* 0 */ + "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 + "movq (%4, %%ebx), %%mm1 \n\t" + "movq %%mm7, (%1, %%ebx) \n\t" // 0 "pandn %%mm1, %%mm0 \n\t" PMAXW(%%mm0, %%mm3) "addl $8, %%ebx \n\t" @@ -184,7 +184,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "psrlq $16, %%mm3 \n\t" PMAXW(%%mm0, %%mm3) "movd %%mm3, %%ebx \n\t" - "movzbl %%bl, %%ebx \n\t" /* last_non_zero_p1 */ + "movzbl %%bl, %%ebx \n\t" // last_non_zero_p1 "movl %%ebx, (%%ebp) \n\t" "popl %%ebx \n\t" "popl %%ebp \n\t" @@ -193,43 +193,155 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "r" (block+64), "r" (qmat+64), "r" (bias+64), "r" (inv_zigzag_direct16+64), "r" (temp_block+64) ); - /* note the __asm__ is split cuz gcc doesnt like that many operands ... */ - __asm__ volatile( - "movd %1, %%mm1 \n\t" /* max_qcoeff */ + // note the asm is split cuz gcc doesnt like that many operands ... + asm volatile( + "movd %1, %%mm1 \n\t" // max_qcoeff SPREADW(%%mm1) - "psubusw %%mm1, %%mm4 \n\t" + "psubusw %%mm1, %%mm4 \n\t" "packuswb %%mm4, %%mm4 \n\t" - "movd %%mm4, %0 \n\t" /* *overflow */ + "movd %%mm4, %0 \n\t" // *overflow : "=r" (*overflow) : "r" (s->max_qcoeff) ); } - if(s->mb_intra) temp_block[0]= level; /* FIXME move after permute */ + if(s->mb_intra) block[0]= level; + else block[0]= temp_block[0]; -/* last_non_zero_p1=64; */ - /* permute for IDCT */ - __asm__ volatile( - "movl %0, %%eax \n\t" - "pushl %%ebp \n\t" - "movl %%esp, " MANGLE(esp_temp) "\n\t" - "1: \n\t" - "movzbl (%1, %%eax), %%ebx \n\t" - "movzbl 1(%1, %%eax), %%ebp \n\t" - "movw (%2, %%ebx, 2), %%cx \n\t" - "movw (%2, %%ebp, 2), %%sp \n\t" - "movzbl " MANGLE(permutation) "(%%ebx), %%ebx\n\t" - "movzbl " MANGLE(permutation) "(%%ebp), %%ebp\n\t" - "movw %%cx, (%3, %%ebx, 2) \n\t" - "movw %%sp, (%3, %%ebp, 2) \n\t" - "addl $2, %%eax \n\t" - " js 1b \n\t" - "movl " MANGLE(esp_temp) ", %%esp\n\t" - "popl %%ebp \n\t" - : - : "g" (-last_non_zero_p1), "d" (zigzag_direct_noperm+last_non_zero_p1), "S" (temp_block), "D" (block) - : "%eax", "%ebx", "%ecx" - ); + if(s->idct_permutation[1]==8){ + if(last_non_zero_p1 <= 1) goto end; + block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08]; + block[0x20] = temp_block[0x10]; + if(last_non_zero_p1 <= 4) goto end; + block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02]; + block[0x09] = temp_block[0x03]; + if(last_non_zero_p1 <= 7) goto end; + block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11]; + block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20]; + if(last_non_zero_p1 <= 11) goto end; + block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12]; + block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04]; + block[0x0C] = temp_block[0x05]; + if(last_non_zero_p1 <= 16) goto end; + block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13]; + block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21]; + block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30]; + block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22]; + if(last_non_zero_p1 <= 24) goto end; + block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14]; + block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06]; + block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E]; + block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C]; + if(last_non_zero_p1 <= 32) goto end; + block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A]; + block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38]; + block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32]; + block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24]; + if(last_non_zero_p1 <= 40) goto end; + block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16]; + block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17]; + block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25]; + block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33]; + if(last_non_zero_p1 <= 48) goto end; + block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; + block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D]; + block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; + block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E]; + if(last_non_zero_p1 <= 56) goto end; + block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C]; + block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36]; + block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37]; + block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; + }else if(s->idct_permutation[1]==4){ + if(last_non_zero_p1 <= 1) goto end; + block[0x04] = temp_block[0x01]; + block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; + if(last_non_zero_p1 <= 4) goto end; + block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02]; + block[0x05] = temp_block[0x03]; + if(last_non_zero_p1 <= 7) goto end; + block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11]; + block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; + if(last_non_zero_p1 <= 11) goto end; + block[0x1C] = temp_block[0x19]; + block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B]; + block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05]; + if(last_non_zero_p1 <= 16) goto end; + block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13]; + block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21]; + block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; + block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22]; + if(last_non_zero_p1 <= 24) goto end; + block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14]; + block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06]; + block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E]; + block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C]; + if(last_non_zero_p1 <= 32) goto end; + block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A]; + block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38]; + block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32]; + block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24]; + if(last_non_zero_p1 <= 40) goto end; + block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16]; + block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; + block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25]; + block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33]; + if(last_non_zero_p1 <= 48) goto end; + block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B]; + block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D]; + block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; + block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E]; + if(last_non_zero_p1 <= 56) goto end; + block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C]; + block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36]; + block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; + block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; + }else{ + if(last_non_zero_p1 <= 1) goto end; + block[0x01] = temp_block[0x01]; + block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; + if(last_non_zero_p1 <= 4) goto end; + block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02]; + block[0x03] = temp_block[0x03]; + if(last_non_zero_p1 <= 7) goto end; + block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11]; + block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; + if(last_non_zero_p1 <= 11) goto end; + block[0x19] = temp_block[0x19]; + block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B]; + block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05]; + if(last_non_zero_p1 <= 16) goto end; + block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13]; + block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21]; + block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; + block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22]; + if(last_non_zero_p1 <= 24) goto end; + block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14]; + block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06]; + block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E]; + block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C]; + if(last_non_zero_p1 <= 32) goto end; + block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A]; + block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38]; + block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32]; + block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24]; + if(last_non_zero_p1 <= 40) goto end; + block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16]; + block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; + block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25]; + block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33]; + if(last_non_zero_p1 <= 48) goto end; + block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; + block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D]; + block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; + block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E]; + if(last_non_zero_p1 <= 56) goto end; + block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C]; + block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36]; + block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; + block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; + } + end: /* for(i=0; i