diff options
Diffstat (limited to 'src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c')
-rw-r--r-- | src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c | 297 |
1 files changed, 250 insertions, 47 deletions
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c index d2f477b7b..1c0e9f5ae 100644 --- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c @@ -31,31 +31,92 @@ static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xfff static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; -static void dct_unquantize_h263_mmx(MpegEncContext *s, +static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { int level, qmul, qadd, nCoeffs; qmul = qscale << 1; - qadd = (qscale - 1) | 1; - assert(s->block_last_index[n]>=0); + assert(s->block_last_index[n]>=0 || s->h263_aic); - if (s->mb_intra) { - if (!s->h263_aic) { - if (n < 4) - level = block[0] * s->y_dc_scale; - else - level = block[0] * s->c_dc_scale; - }else{ - qadd = 0; - level= block[0]; - } - nCoeffs=63; - } else { - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - level = 0;/* keep gcc quiet */ + if (!s->h263_aic) { + if (n < 4) + level = block[0] * s->y_dc_scale; + else + level = block[0] * s->c_dc_scale; + qadd = (qscale - 1) | 1; + }else{ + qadd = 0; + level= block[0]; } + if(s->ac_pred) + nCoeffs=63; + else + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; +//printf("%d %d ", qmul, qadd); +asm volatile( + "movd %1, %%mm6 \n\t" //qmul + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "movd %2, %%mm5 \n\t" //qadd + "pxor %%mm7, %%mm7 \n\t" + "packssdw %%mm5, %%mm5 \n\t" + "packssdw %%mm5, %%mm5 \n\t" + "psubw %%mm5, %%mm7 \n\t" + "pxor %%mm4, %%mm4 \n\t" + ".balign 16\n\t" + "1: \n\t" + "movq (%0, %3), %%mm0 \n\t" + "movq 8(%0, %3), %%mm1 \n\t" + + "pmullw %%mm6, %%mm0 \n\t" + "pmullw %%mm6, %%mm1 \n\t" + + "movq (%0, %3), %%mm2 \n\t" + "movq 8(%0, %3), %%mm3 \n\t" + + "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + + "paddw %%mm7, %%mm0 \n\t" + "paddw %%mm7, %%mm1 \n\t" + + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + + "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 + + "pandn %%mm2, %%mm0 \n\t" + "pandn %%mm3, %%mm1 \n\t" + + "movq %%mm0, (%0, %3) \n\t" + "movq %%mm1, 8(%0, %3) \n\t" + + "addl $16, %3 \n\t" + "jng 1b \n\t" + ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) + : "memory" + ); + block[0]= level; +} + + +static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int level, qmul, qadd, nCoeffs; + + qmul = qscale << 1; + qadd = (qscale - 1) | 1; + + assert(s->block_last_index[n]>=0 || s->h263_aic); + + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; //printf("%d %d ", qmul, qadd); asm volatile( "movd %1, %%mm6 \n\t" //qmul @@ -104,8 +165,6 @@ asm volatile( ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) : "memory" ); - if(s->mb_intra) - block[0]= level; } @@ -138,24 +197,23 @@ asm volatile( high3:low3 = low1*low2 high3 += tlow1 */ -static void dct_unquantize_mpeg1_mmx(MpegEncContext *s, +static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { int nCoeffs; const uint16_t *quant_matrix; + int block0; assert(s->block_last_index[n]>=0); nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; - if (s->mb_intra) { - int block0; - if (n < 4) - block0 = block[0] * s->y_dc_scale; - else - block0 = block[0] * s->c_dc_scale; - /* XXX: only mpeg1 */ - quant_matrix = s->intra_matrix; + if (n < 4) + block0 = block[0] * s->y_dc_scale; + else + block0 = block[0] * s->c_dc_scale; + /* XXX: only mpeg1 */ + quant_matrix = s->intra_matrix; asm volatile( "pcmpeqw %%mm7, %%mm7 \n\t" "psrlw $15, %%mm7 \n\t" @@ -205,9 +263,19 @@ asm volatile( ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) : "%eax", "memory" ); - block[0]= block0; + block[0]= block0; +} + +static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int nCoeffs; + const uint16_t *quant_matrix; + + assert(s->block_last_index[n]>=0); + + nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; - } else { quant_matrix = s->inter_matrix; asm volatile( "pcmpeqw %%mm7, %%mm7 \n\t" @@ -262,28 +330,25 @@ asm volatile( ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) : "%eax", "memory" ); - } - } -static void dct_unquantize_mpeg2_mmx(MpegEncContext *s, +static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { int nCoeffs; const uint16_t *quant_matrix; + int block0; assert(s->block_last_index[n]>=0); if(s->alternate_scan) nCoeffs= 63; //FIXME else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; - if (s->mb_intra) { - int block0; - if (n < 4) - block0 = block[0] * s->y_dc_scale; - else - block0 = block[0] * s->c_dc_scale; - quant_matrix = s->intra_matrix; + if (n < 4) + block0 = block[0] * s->y_dc_scale; + else + block0 = block[0] * s->c_dc_scale; + quant_matrix = s->intra_matrix; asm volatile( "pcmpeqw %%mm7, %%mm7 \n\t" "psrlw $15, %%mm7 \n\t" @@ -329,10 +394,21 @@ asm volatile( ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) : "%eax", "memory" ); - block[0]= block0; + block[0]= block0; //Note, we dont do mismatch control for intra as errors cannot accumulate +} + +static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int nCoeffs; + const uint16_t *quant_matrix; + + assert(s->block_last_index[n]>=0); + + if(s->alternate_scan) nCoeffs= 63; //FIXME + else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; - } else { quant_matrix = s->inter_matrix; asm volatile( "pcmpeqw %%mm7, %%mm7 \n\t" @@ -397,7 +473,6 @@ asm volatile( ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) : "%eax", "memory" ); - } } /* draw the edges of width 'w' of an image of size width, height @@ -488,13 +563,130 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) } } +static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ + const int intra= s->mb_intra; + int *sum= s->dct_error_sum[intra]; + uint16_t *offset= s->dct_offset[intra]; + + s->dct_count[intra]++; + + asm volatile( + "pxor %%mm7, %%mm7 \n\t" + "1: \n\t" + "pxor %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "movq (%0), %%mm2 \n\t" + "movq 8(%0), %%mm3 \n\t" + "pcmpgtw %%mm2, %%mm0 \n\t" + "pcmpgtw %%mm3, %%mm1 \n\t" + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + "psubw %%mm0, %%mm2 \n\t" + "psubw %%mm1, %%mm3 \n\t" + "movq %%mm2, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psubusw (%2), %%mm2 \n\t" + "psubusw 8(%2), %%mm3 \n\t" + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + "psubw %%mm0, %%mm2 \n\t" + "psubw %%mm1, %%mm3 \n\t" + "movq %%mm2, (%0) \n\t" + "movq %%mm3, 8(%0) \n\t" + "movq %%mm4, %%mm2 \n\t" + "movq %%mm5, %%mm3 \n\t" + "punpcklwd %%mm7, %%mm4 \n\t" + "punpckhwd %%mm7, %%mm2 \n\t" + "punpcklwd %%mm7, %%mm5 \n\t" + "punpckhwd %%mm7, %%mm3 \n\t" + "paddd (%1), %%mm4 \n\t" + "paddd 8(%1), %%mm2 \n\t" + "paddd 16(%1), %%mm5 \n\t" + "paddd 24(%1), %%mm3 \n\t" + "movq %%mm4, (%1) \n\t" + "movq %%mm2, 8(%1) \n\t" + "movq %%mm5, 16(%1) \n\t" + "movq %%mm3, 24(%1) \n\t" + "addl $16, %0 \n\t" + "addl $32, %1 \n\t" + "addl $16, %2 \n\t" + "cmpl %3, %0 \n\t" + " jb 1b \n\t" + : "+r" (block), "+r" (sum), "+r" (offset) + : "r"(block+64) + ); +} + +static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ + const int intra= s->mb_intra; + int *sum= s->dct_error_sum[intra]; + uint16_t *offset= s->dct_offset[intra]; + + s->dct_count[intra]++; + + asm volatile( + "pxor %%xmm7, %%xmm7 \n\t" + "1: \n\t" + "pxor %%xmm0, %%xmm0 \n\t" + "pxor %%xmm1, %%xmm1 \n\t" + "movdqa (%0), %%xmm2 \n\t" + "movdqa 16(%0), %%xmm3 \n\t" + "pcmpgtw %%xmm2, %%xmm0 \n\t" + "pcmpgtw %%xmm3, %%xmm1 \n\t" + "pxor %%xmm0, %%xmm2 \n\t" + "pxor %%xmm1, %%xmm3 \n\t" + "psubw %%xmm0, %%xmm2 \n\t" + "psubw %%xmm1, %%xmm3 \n\t" + "movdqa %%xmm2, %%xmm4 \n\t" + "movdqa %%xmm3, %%xmm5 \n\t" + "psubusw (%2), %%xmm2 \n\t" + "psubusw 16(%2), %%xmm3 \n\t" + "pxor %%xmm0, %%xmm2 \n\t" + "pxor %%xmm1, %%xmm3 \n\t" + "psubw %%xmm0, %%xmm2 \n\t" + "psubw %%xmm1, %%xmm3 \n\t" + "movdqa %%xmm2, (%0) \n\t" + "movdqa %%xmm3, 16(%0) \n\t" + "movdqa %%xmm4, %%xmm6 \n\t" + "movdqa %%xmm5, %%xmm0 \n\t" + "punpcklwd %%xmm7, %%xmm4 \n\t" + "punpckhwd %%xmm7, %%xmm6 \n\t" + "punpcklwd %%xmm7, %%xmm5 \n\t" + "punpckhwd %%xmm7, %%xmm0 \n\t" + "paddd (%1), %%xmm4 \n\t" + "paddd 16(%1), %%xmm6 \n\t" + "paddd 32(%1), %%xmm5 \n\t" + "paddd 48(%1), %%xmm0 \n\t" + "movdqa %%xmm4, (%1) \n\t" + "movdqa %%xmm6, 16(%1) \n\t" + "movdqa %%xmm5, 32(%1) \n\t" + "movdqa %%xmm0, 48(%1) \n\t" + "addl $32, %0 \n\t" + "addl $64, %1 \n\t" + "addl $32, %2 \n\t" + "cmpl %3, %0 \n\t" + " jb 1b \n\t" + : "+r" (block), "+r" (sum), "+r" (offset) + : "r"(block+64) + ); +} + #undef HAVE_MMX2 #define RENAME(a) a ## _MMX +#define RENAMEl(a) a ## _mmx #include "mpegvideo_mmx_template.c" #define HAVE_MMX2 #undef RENAME +#undef RENAMEl #define RENAME(a) a ## _MMX2 +#define RENAMEl(a) a ## _mmx2 +#include "mpegvideo_mmx_template.c" + +#undef RENAME +#undef RENAMEl +#define RENAME(a) a ## _SSE2 +#define RENAMEl(a) a ## _sse2 #include "mpegvideo_mmx_template.c" void MPV_common_init_mmx(MpegEncContext *s) @@ -502,14 +694,25 @@ void MPV_common_init_mmx(MpegEncContext *s) if (mm_flags & MM_MMX) { const int dct_algo = s->avctx->dct_algo; - s->dct_unquantize_h263 = dct_unquantize_h263_mmx; - s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx; - s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx; + s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; + s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; + s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; + s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; + s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; + s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; draw_edges = draw_edges_mmx; + + if (mm_flags & MM_SSE2) { + s->denoise_dct= denoise_dct_sse2; + } else { + s->denoise_dct= denoise_dct_mmx; + } if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ - if(mm_flags & MM_MMXEXT){ + if(mm_flags & MM_SSE2){ + s->dct_quantize= dct_quantize_SSE2; + } else if(mm_flags & MM_MMXEXT){ s->dct_quantize= dct_quantize_MMX2; } else { s->dct_quantize= dct_quantize_MMX; |