author     Miguel Freitas <miguelfreitas@users.sourceforge.net>   2002-09-16 21:49:34 +0000
committer  Miguel Freitas <miguelfreitas@users.sourceforge.net>   2002-09-16 21:49:34 +0000
commit     d2db0f3bbdc81aae2c316751daf1d53b42a3e6a0 (patch)
tree       e9f4a94547ada41ea13445185a20305378d6c0bd /src/libffmpeg/libavcodec/i386
parent     d35f836384d9fa164350612ead96362d9cd7b457 (diff)
download   xine-lib-d2db0f3bbdc81aae2c316751daf1d53b42a3e6a0.tar.gz
           xine-lib-d2db0f3bbdc81aae2c316751daf1d53b42a3e6a0.tar.bz2
- sync ffmpeg to cvs (sorry Mike, it still doesn't decode your test stream --
  something must be broken in ffmpeg; the same happens with mplayer)
- added priority-sorted lists, so autoprobing should work again (see the sketch below)
- fixed an infinite loop in the plugin loader

Note: the latest ffmpeg contains PPC optimizations, but someone will have to
enable them.
CVS patchset: 2676
CVS date: 2002/09/16 21:49:34
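
The priority-sorted lists mentioned above are not part of this diff (it is limited to src/libffmpeg/libavcodec/i386), so the following is only a minimal sketch of the general idea, with made-up names rather than the actual xine-lib plugin-loader API: entries are kept ordered by priority, so autoprobing can simply walk the list front to back and use the first plugin that accepts the stream.

/* Hypothetical illustration only -- not the real xine-lib loader code.
 * Keeps the list sorted by descending priority at insertion time. */
#include <stdio.h>

typedef struct plugin_node_s {
    const char           *id;       /* plugin identifier (made up for the example) */
    int                   priority; /* larger value probes earlier */
    struct plugin_node_s *next;
} plugin_node_t;

static void plugin_list_insert(plugin_node_t **head, plugin_node_t *node)
{
    plugin_node_t **p = head;

    /* walk past all entries with equal or higher priority */
    while (*p && (*p)->priority >= node->priority)
        p = &(*p)->next;

    node->next = *p;
    *p = node;
}

int main(void)
{
    plugin_node_t a = { "demux_mpeg", 10, NULL };
    plugin_node_t b = { "demux_avi",  20, NULL };
    plugin_node_t c = { "demux_elem",  1, NULL };
    plugin_node_t *head = NULL;

    plugin_list_insert(&head, &a);
    plugin_list_insert(&head, &b);
    plugin_list_insert(&head, &c);

    /* autoprobing order: demux_avi, demux_mpeg, demux_elem */
    for (plugin_node_t *n = head; n; n = n->next)
        printf("%s (prio %d)\n", n->id, n->priority);

    return 0;
}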
Diffstat (limited to 'src/libffmpeg/libavcodec/i386')
-rw-r--r--   src/libffmpeg/libavcodec/i386/dsputil_mmx.c             | 211
-rw-r--r--   src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h         |  87
-rw-r--r--   src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h         | 136
-rw-r--r--   src/libffmpeg/libavcodec/i386/fdct_mmx.c                |   2
-rw-r--r--   src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c           |  17
-rw-r--r--   src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c  |  29
6 files changed, 391 insertions(+), 91 deletions(-)
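
The bulk of the dsputil_mmx.c change below renames the put/avg pixel primitives so the block width is explicit (put_pixels8_*, put_pixels16_*) and turns the dispatch tables into two-dimensional arrays indexed by block size and half-pel interpolation mode. The sketch below shows that table layout in plain C under assumed names; the functions the diff actually installs are MMX/MMX2/3DNow! inline assembly, so the C bodies here are only stand-ins for the dispatch idea, not the real implementations.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

typedef void (*op_pixels_func)(uint8_t *block, const uint8_t *pixels,
                               int line_size, int h);

/* straight copy of an 8-pixel-wide block, h rows high */
static void put_pixels8_c(uint8_t *block, const uint8_t *pixels,
                          int line_size, int h)
{
    for (int i = 0; i < h; i++) {
        memcpy(block, pixels, 8);
        block  += line_size;
        pixels += line_size;
    }
}

/* the 16-wide case is two 8-wide copies per row -- the same trick the diff
 * uses for most of its 16-pixel helpers (block / block+8, pixels / pixels+8) */
static void put_pixels16_c(uint8_t *block, const uint8_t *pixels,
                           int line_size, int h)
{
    put_pixels8_c(block,     pixels,     line_size, h);
    put_pixels8_c(block + 8, pixels + 8, line_size, h);
}

/* [size][mode]: size 0 = 16x16, size 1 = 8x8;
 * mode 0 = copy, 1 = x2, 2 = y2, 3 = xy2 half-pel interpolation */
static op_pixels_func put_pixels_tab[2][4];

int main(void)
{
    uint8_t src[16 * 16], dst[16 * 16];
    memset(src, 0x80, sizeof(src));

    put_pixels_tab[0][0] = put_pixels16_c;  /* the diff installs put_pixels16_mmx here */
    put_pixels_tab[1][0] = put_pixels8_c;   /* ... and put_pixels8_mmx here            */

    put_pixels_tab[0][0](dst, src, 16, 16); /* copy a full 16x16 block */
    printf("%d\n", dst[0]);                 /* prints 128 */
    return 0;
}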
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c index 02558604b..4336e4bde 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c @@ -343,7 +343,7 @@ static void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line } while (--i); } -static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -369,6 +369,40 @@ static void put_pixels_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int ); } +static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + __asm __volatile( + "lea (%3, %3), %%eax \n\t" + ".balign 8 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm4 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq 8(%1, %3), %%mm5 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm4, 8(%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 8(%1), %%mm4 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq 8(%1, %3), %%mm5 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm4, 8(%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + : "+g"(h), "+r" (pixels), "+r" (block) + : "r"(line_size) + : "%eax", "memory" + ); +} + static void clear_blocks_mmx(DCTELEM *blocks) { __asm __volatile( @@ -393,19 +427,19 @@ static void just_return() { return; } void dsputil_init_mmx(void) { mm_flags = mm_support(); -#if 1 - printf("libavcodec: CPU flags:"); +#if 0 + fprintf(stderr, "libavcodec: CPU flags:"); if (mm_flags & MM_MMX) - printf(" mmx"); + fprintf(stderr, " mmx"); if (mm_flags & MM_MMXEXT) - printf(" mmxext"); + fprintf(stderr, " mmxext"); if (mm_flags & MM_3DNOW) - printf(" 3dnow"); + fprintf(stderr, " 3dnow"); if (mm_flags & MM_SSE) - printf(" sse"); + fprintf(stderr, " sse"); if (mm_flags & MM_SSE2) - printf(" sse2"); - printf("\n"); + fprintf(stderr, " sse2"); + fprintf(stderr, "\n"); #endif if (mm_flags & MM_MMX) { @@ -424,27 +458,45 @@ void dsputil_init_mmx(void) pix_abs8x8_y2 = pix_abs8x8_y2_mmx; pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; - av_fdct = fdct_mmx; - - put_pixels_tab[0] = put_pixels_mmx; - put_pixels_tab[1] = put_pixels_x2_mmx; - put_pixels_tab[2] = put_pixels_y2_mmx; - put_pixels_tab[3] = put_pixels_xy2_mmx; - - put_no_rnd_pixels_tab[0] = put_pixels_mmx; - put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; - put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; - put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_mmx; - - avg_pixels_tab[0] = avg_pixels_mmx; - avg_pixels_tab[1] = avg_pixels_x2_mmx; - avg_pixels_tab[2] = avg_pixels_y2_mmx; - avg_pixels_tab[3] = avg_pixels_xy2_mmx; - - avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_mmx; - avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_mmx; - avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_mmx; - avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_mmx; + put_pixels_tab[0][0] = put_pixels16_mmx; + put_pixels_tab[0][1] = put_pixels16_x2_mmx; + put_pixels_tab[0][2] = put_pixels16_y2_mmx; + put_pixels_tab[0][3] = put_pixels16_xy2_mmx; + + put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx; + put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; + put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; + put_no_rnd_pixels_tab[0][3] = 
put_no_rnd_pixels16_xy2_mmx; + + avg_pixels_tab[0][0] = avg_pixels16_mmx; + avg_pixels_tab[0][1] = avg_pixels16_x2_mmx; + avg_pixels_tab[0][2] = avg_pixels16_y2_mmx; + avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx; + + avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx; + avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx; + avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx; + avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx; + + put_pixels_tab[1][0] = put_pixels8_mmx; + put_pixels_tab[1][1] = put_pixels8_x2_mmx; + put_pixels_tab[1][2] = put_pixels8_y2_mmx; + put_pixels_tab[1][3] = put_pixels8_xy2_mmx; + + put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx; + put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx; + put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx; + put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx; + + avg_pixels_tab[1][0] = avg_pixels8_mmx; + avg_pixels_tab[1][1] = avg_pixels8_x2_mmx; + avg_pixels_tab[1][2] = avg_pixels8_y2_mmx; + avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx; + + avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx; + avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx; + avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx; + avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx; if (mm_flags & MM_MMXEXT) { pix_abs16x16 = pix_abs16x16_mmx2; @@ -457,25 +509,45 @@ void dsputil_init_mmx(void) pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; pix_abs8x8_xy2= pix_abs8x8_xy2_mmx2; - put_pixels_tab[1] = put_pixels_x2_mmx2; - put_pixels_tab[2] = put_pixels_y2_mmx2; - put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx2; - put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx2; - - avg_pixels_tab[0] = avg_pixels_mmx2; - avg_pixels_tab[1] = avg_pixels_x2_mmx2; - avg_pixels_tab[2] = avg_pixels_y2_mmx2; - avg_pixels_tab[3] = avg_pixels_xy2_mmx2; + put_pixels_tab[0][1] = put_pixels16_x2_mmx2; + put_pixels_tab[0][2] = put_pixels16_y2_mmx2; + put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; + put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; + + avg_pixels_tab[0][0] = avg_pixels16_mmx2; + avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; + avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; + avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; + + put_pixels_tab[1][1] = put_pixels8_x2_mmx2; + put_pixels_tab[1][2] = put_pixels8_y2_mmx2; + put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; + put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; + + avg_pixels_tab[1][0] = avg_pixels8_mmx2; + avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; + avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; + avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; } else if (mm_flags & MM_3DNOW) { - put_pixels_tab[1] = put_pixels_x2_3dnow; - put_pixels_tab[2] = put_pixels_y2_3dnow; - put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_3dnow; - put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_3dnow; - - avg_pixels_tab[0] = avg_pixels_3dnow; - avg_pixels_tab[1] = avg_pixels_x2_3dnow; - avg_pixels_tab[2] = avg_pixels_y2_3dnow; - avg_pixels_tab[3] = avg_pixels_xy2_3dnow; + put_pixels_tab[0][1] = put_pixels16_x2_3dnow; + put_pixels_tab[0][2] = put_pixels16_y2_3dnow; + put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; + put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; + + avg_pixels_tab[0][0] = avg_pixels16_3dnow; + avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; + avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; + avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; + + put_pixels_tab[1][1] = put_pixels8_x2_3dnow; + put_pixels_tab[1][2] = put_pixels8_y2_3dnow; + 
put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; + put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; + + avg_pixels_tab[1][0] = avg_pixels8_3dnow; + avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; + avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; + avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; } /* idct */ @@ -526,27 +598,54 @@ void dsputil_init_mmx(void) #endif } +void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block); + +/** + * this will send coeff matrixes which would have different results for the 16383 type MMX vs C IDCTs to the C IDCT + */ +void bit_exact_idct_put(UINT8 *dest, int line_size, INT16 *block){ + if( block[0]>1022 && block[1]==0 && block[4 ]==0 && block[5 ]==0 + && block[8]==0 && block[9]==0 && block[12]==0 && block[13]==0){ + int16_t tmp[64]; + int i; + + for(i=0; i<64; i++) + tmp[i]= block[i]; + for(i=0; i<64; i++) + block[i]= tmp[block_permute_op(i)]; + + simple_idct_put(dest, line_size, block); + } + else + gen_idct_put(dest, line_size, block); +} + /* remove any non bit exact operation (testing purpose). NOTE that this function should be kept as small as possible because it is always difficult to test automatically non bit exact cases. */ void dsputil_set_bit_exact_mmx(void) { if (mm_flags & MM_MMX) { - if (mm_flags & MM_MMXEXT) { - put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; - put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; - avg_pixels_tab[3] = avg_pixels_xy2_mmx; + + /* MMX2 & 3DNOW */ + put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; + put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; + avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx; + put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx; + put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx; + avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx; + if (mm_flags & MM_MMXEXT) { pix_abs16x16_x2 = pix_abs16x16_x2_mmx; pix_abs16x16_y2 = pix_abs16x16_y2_mmx; pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; pix_abs8x8_x2 = pix_abs8x8_x2_mmx; pix_abs8x8_y2 = pix_abs8x8_y2_mmx; pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; - } else if (mm_flags & MM_3DNOW) { - put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_mmx; - put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_mmx; - avg_pixels_tab[3] = avg_pixels_xy2_mmx; } +#ifdef SIMPLE_IDCT + if(ff_idct_put==gen_idct_put && ff_idct == simple_idct_mmx) + ff_idct_put= bit_exact_idct_put; +#endif } } diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h index a16ccc88b..6873432ce 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h @@ -25,7 +25,7 @@ /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm clobber bug - now it will work with 2.95.2 and also with -fPIC */ -static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -52,9 +52,49 @@ static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, :"r" (line_size) :"%eax", "memory"); } + +static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + __asm __volatile( + "lea (%3, %3), %%eax \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 8(%1, %3), %%mm3 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + PAVGB" 9(%1), %%mm2 \n\t" + PAVGB" 9(%1, %3), %%mm3 \n\t" + "movq %%mm0, 
(%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq %%mm2, 8(%2) \n\t" + "movq %%mm3, 8(%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq 8(%1), %%mm2 \n\t" + "movq 8(%1, %3), %%mm3 \n\t" + PAVGB" 1(%1), %%mm0 \n\t" + PAVGB" 1(%1, %3), %%mm1 \n\t" + PAVGB" 9(%1), %%mm2 \n\t" + PAVGB" 9(%1, %3), %%mm3 \n\t" + "addl %%eax, %1 \n\t" + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "movq %%mm2, 8(%2) \n\t" + "movq %%mm3, 8(%2, %3) \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r" (line_size) + :"%eax", "memory"); +} /* GL: this function does incorrect rounding if overflow */ -static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BONE(mm6); __asm __volatile( @@ -91,7 +131,7 @@ static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int lin :"%eax", "memory"); } -static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -122,7 +162,7 @@ static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, } /* GL: this function does incorrect rounding if overflow */ -static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put_no_rnd_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BONE(mm6); __asm __volatile( @@ -155,7 +195,7 @@ static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int lin :"%eax", "memory"); } -static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg_pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -183,7 +223,7 @@ static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, in :"%eax", "memory"); } -static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -215,7 +255,7 @@ static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, :"%eax", "memory"); } -static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -254,7 +294,7 @@ static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, } // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter -static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BONE(mm6); __asm __volatile( @@ -294,3 +334,34 @@ static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size :"r" (line_size) :"%eax", "memory"); } + +//FIXME the following could be optimized too ... 
+static void DEF(put_no_rnd_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ + DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); + DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); +} +static void DEF(put_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ + DEF(put_pixels8_y2)(block , pixels , line_size, h); + DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); +} +static void DEF(put_no_rnd_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ + DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); + DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); +} +static void DEF(avg_pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ + DEF(avg_pixels8)(block , pixels , line_size, h); + DEF(avg_pixels8)(block+8, pixels+8, line_size, h); +} +static void DEF(avg_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ + DEF(avg_pixels8_x2)(block , pixels , line_size, h); + DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); +} +static void DEF(avg_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ + DEF(avg_pixels8_y2)(block , pixels , line_size, h); + DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); +} +static void DEF(avg_pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ + DEF(avg_pixels8_xy2)(block , pixels , line_size, h); + DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); +} + diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h index dc70c9c8e..3605e03f9 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h @@ -22,7 +22,7 @@ */ // put_pixels -static void DEF(put, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BFE(mm6); __asm __volatile( @@ -54,7 +54,53 @@ static void DEF(put, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size :"eax", "memory"); } -static void DEF(put, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + __asm __volatile( + "lea (%3, %3), %%eax \n\t" + ".balign 8 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 9(%1), %%mm1 \n\t" + "movq 8(%1, %3), %%mm2 \n\t" + "movq 9(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, 8(%2) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 9(%1), %%mm1 \n\t" + "movq 8(%1, %3), %%mm2 \n\t" + "movq 9(%1, %3), %%mm3 \n\t" + PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, 8(%2) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "addl %%eax, %1 \n\t" + "addl %%eax, %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"(line_size) + :"eax", "memory"); +} + +static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 
*pixels, int line_size, int h) { MOVQ_BFE(mm6); __asm __volatile( @@ -83,7 +129,7 @@ static void DEF(put, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size :"eax", "memory"); } -static void DEF(put, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_ZERO(mm7); SET_RND(mm6); // =2 for rnd and =1 for no_rnd version @@ -151,7 +197,7 @@ static void DEF(put, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_siz // avg_pixels // in case more speed is needed - unroling would certainly help -static void DEF(avg, pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BFE(mm6); JUMPALIGN(); @@ -170,7 +216,50 @@ static void DEF(avg, pixels)(UINT8 *block, const UINT8 *pixels, int line_size, i while (--h); } -static void DEF(avg, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg, pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm __volatile( + "movq %0, %%mm0 \n\t" + "movq %1, %%mm1 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + "movq %%mm2, %0 \n\t" + "movq 8%0, %%mm0 \n\t" + "movq 8%1, %%mm1 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + "movq %%mm2, 8%0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } + while (--h); +} + +static void DEF(avg, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +{ + MOVQ_BFE(mm6); + JUMPALIGN(); + do { + __asm __volatile( + "movq %1, %%mm0 \n\t" + "movq 1%1, %%mm1 \n\t" + "movq %0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, %0 \n\t" + :"+m"(*block) + :"m"(*pixels) + :"memory"); + pixels += line_size; + block += line_size; + } while (--h); +} + +static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BFE(mm6); JUMPALIGN(); @@ -182,6 +271,12 @@ static void DEF(avg, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) "movq %%mm0, %0 \n\t" + "movq 8%1, %%mm0 \n\t" + "movq 9%1, %%mm1 \n\t" + "movq 8%0, %%mm3 \n\t" + PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, 8%0 \n\t" :"+m"(*block) :"m"(*pixels) :"memory"); @@ -190,7 +285,7 @@ static void DEF(avg, pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size } while (--h); } -static void DEF(avg, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_BFE(mm6); __asm __volatile( @@ -214,10 +309,10 @@ static void DEF(avg, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size "movq (%1, %%eax), %%mm0 \n\t" PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) "movq (%2), %%mm3 \n\t" - PAVGB(%%mm3, %%mm4, %%mm0, %%mm6) + PAVGB(%%mm3, %%mm4, %%mm2, %%mm6) "movq (%2, %3), %%mm3 \n\t" PAVGB(%%mm3, %%mm5, %%mm1, %%mm6) - "movq %%mm0, (%2) \n\t" + "movq %%mm2, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "addl %%eax, %1 \n\t" "addl %%eax, %2 \n\t" @@ -230,7 +325,7 @@ static void DEF(avg, pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size } // this routine is 'slightly' suboptimal but mostly unused -static void DEF(avg, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg, 
pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) { MOVQ_ZERO(mm7); SET_RND(mm6); // =2 for rnd and =1 for no_rnd version @@ -303,3 +398,26 @@ static void DEF(avg, pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_siz :"D"(block), "r"(line_size) :"eax", "memory"); } + +//FIXME optimize +static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ + DEF(put, pixels8_y2)(block , pixels , line_size, h); + DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); +} + +static void DEF(put, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ + DEF(put, pixels8_xy2)(block , pixels , line_size, h); + DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); +} + +static void DEF(avg, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ + DEF(avg, pixels8_y2)(block , pixels , line_size, h); + DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); +} + +static void DEF(avg, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ + DEF(avg, pixels8_xy2)(block , pixels , line_size, h); + DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); +} + + diff --git a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c index 7135beb21..73b63ac63 100644 --- a/src/libffmpeg/libavcodec/i386/fdct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c @@ -25,7 +25,7 @@ #define BITS_FRW_ACC 3 //; 2 or 3 for accuracy #define SHIFT_FRW_COL BITS_FRW_ACC -#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) +#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) //#define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1) #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) //#define RND_FRW_COL (2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1) diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c index 390aa554c..10efc173f 100644 --- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c @@ -195,7 +195,7 @@ static void dct_unquantize_mpeg1_mmx(MpegEncContext *s, const UINT16 *quant_matrix; if(s->alternate_scan) nCoeffs= 64; - else nCoeffs= nCoeffs= zigzag_end[ s->block_last_index[n] ]; + else nCoeffs= zigzag_end[ s->block_last_index[n] ]; if (s->mb_intra) { int block0; @@ -321,7 +321,7 @@ static void dct_unquantize_mpeg2_mmx(MpegEncContext *s, const UINT16 *quant_matrix; if(s->alternate_scan) nCoeffs= 64; - else nCoeffs= nCoeffs= zigzag_end[ s->block_last_index[n] ]; + else nCoeffs= zigzag_end[ s->block_last_index[n] ]; if (s->mb_intra) { int block0; @@ -552,16 +552,21 @@ void unused_var_warning_killer(){ void MPV_common_init_mmx(MpegEncContext *s) { if (mm_flags & MM_MMX) { + const int dct_algo= s->avctx->dct_algo; s->dct_unquantize_h263 = dct_unquantize_h263_mmx; s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx; s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx; draw_edges = draw_edges_mmx; - if(mm_flags & MM_MMXEXT){ - dct_quantize= dct_quantize_MMX2; - } else { - dct_quantize= dct_quantize_MMX; + if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ + s->fdct = fdct_mmx; + + if(mm_flags & MM_MMXEXT){ + s->dct_quantize= dct_quantize_MMX2; + } else { + s->dct_quantize= dct_quantize_MMX; + } } } } diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c index 1119313cc..94a6711db 100644 --- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c @@ -40,7 +40,8 @@ 
static int RENAME(dct_quantize)(MpegEncContext *s, const UINT16 *qmat, *bias; static __align8 INT16 temp_block[64]; - av_fdct (block); + //s->fdct (block); + fdct_mmx (block); //cant be anything else ... if (s->mb_intra) { int dummy; @@ -55,7 +56,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "xorl %%edx, %%edx \n\t" "mul %%ecx \n\t" : "=d" (level), "=a"(dummy) - : "a" (block[0] + (q >> 1)), "c" (inverse[q]) + : "a" ((block[0]>>2) + q), "c" (inverse[q<<1]) ); #else asm volatile ( @@ -63,13 +64,13 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "divw %%cx \n\t" "movzwl %%ax, %%eax \n\t" : "=a" (level) - : "a" (block[0] + (q >> 1)), "c" (q) + : "a" ((block[0]>>2) + q), "c" (q<<1) : "%edx" ); #endif } else /* For AIC we skip quant/dequant of INTRADC */ - level = block[0]; + level = (block[0] + 4)>>3; block[0]=0; //avoid fake overflow // temp_block[0] = (block[0] + (q >> 1)) / q; @@ -83,7 +84,11 @@ static int RENAME(dct_quantize)(MpegEncContext *s, } if(s->out_format == FMT_H263 && s->mpeg_quant==0){ - + + /* the following code is patched using avifile's modifications + to enable -fpic compilation. this patch has not been accepted on + main ffmpeg cvs. */ + asm volatile( "movd %%eax, %%mm3 \n\t" // last_non_zero_p1 SPREADW(%%mm3) @@ -112,7 +117,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) "movq %%mm0, (%3, %%eax) \n\t" "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00 - "movq (%2, %%eax), %%mm1 \n\t" + "movq (%4, %%eax), %%mm1 \n\t" "movq %%mm7, (%1, %%eax) \n\t" // 0 "pandn %%mm1, %%mm0 \n\t" PMAXW(%%mm0, %%mm3) @@ -201,10 +206,12 @@ static int RENAME(dct_quantize)(MpegEncContext *s, } if(s->mb_intra) temp_block[0]= level; //FIXME move afer permute + // last_non_zero_p1=64; /* permute for IDCT */ asm volatile( - "pushl %%ebp \n\t" + "movl %0, %%eax \n\t" + "pushl %%ebp \n\t" "movl %%esp, " MANGLE(esp_temp) "\n\t" "1: \n\t" "movzbl (%1, %%eax), %%ebx \n\t" @@ -219,10 +226,10 @@ static int RENAME(dct_quantize)(MpegEncContext *s, " js 1b \n\t" "movl " MANGLE(esp_temp) ", %%esp\n\t" "popl %%ebp \n\t" - : - : "a" (-last_non_zero_p1), "d" (zigzag_direct_noperm+last_non_zero_p1), "S" (temp_block), "D" (block) - : "%ebx", "%ecx" - ); + : + : "g" (-last_non_zero_p1), "d" (zigzag_direct_noperm+last_non_zero_p1), "S" (temp_block), "D" (block) + : "%eax", "%ebx", "%ecx" + ); /* for(i=0; i<last_non_zero_p1; i++) { |
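
Two related hunks above deserve a closer look: fdct_mmx.c lowers SHIFT_FRW_ROW by 3, and the dct_quantize template changes the intra-DC quantization from (block[0] + q/2) / q to ((block[0]>>2) + q) / (2*q) (and to (block[0] + 4) >> 3 for the AIC path). Assuming the shift change means the MMX FDCT output is now scaled up by 2^3 = 8 -- an inference from these hunks, not something the commit message states -- the old and new expressions agree up to rounding, as this small stand-alone check with arbitrary example values illustrates.

#include <stdio.h>

int main(void)
{
    int dc  = 1000, q = 8;   /* arbitrary example DC coefficient and quantizer */
    int dc8 = dc * 8;        /* what the rescaled FDCT would now produce       */

    int old_level = (dc + (q >> 1)) / q;           /* old formula, unscaled dc */
    int new_level = ((dc8 >> 2) + q) / (q << 1);   /* new formula, scaled dc   */

    printf("old=%d new=%d\n", old_level, new_level);  /* both print 125 */

    /* the AIC path simply undoes the 8x scaling with rounding */
    printf("aic=%d\n", (dc8 + 4) >> 3);               /* prints 1000 */
    return 0;
}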