diff options
Diffstat (limited to 'src/libffmpeg/libavcodec/i386/h264dsp_mmx.c')
-rw-r--r-- | src/libffmpeg/libavcodec/i386/h264dsp_mmx.c | 104 |
1 files changed, 100 insertions, 4 deletions
diff --git a/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c b/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c index 47fcf938b..c278affc8 100644 --- a/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c +++ b/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c @@ -162,11 +162,10 @@ void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride) /* delta = (avg(q0, p1>>2) + (d&a)) * - (avg(p0, q1>>2) + (d&~a)) */\ "pavgb %%mm2, %%mm0 \n\t"\ - "movq %%mm5, %%mm6 \n\t"\ - "pand %%mm4, %%mm6 \n\t"\ - "paddusb %%mm6, %%mm0 \n\t"\ + "pand %%mm5, %%mm4 \n\t"\ + "paddusb %%mm4, %%mm0 \n\t"\ "pavgb %%mm1, %%mm3 \n\t"\ - "pandn %%mm5, %%mm4 \n\t"\ + "pxor %%mm5, %%mm4 \n\t"\ "paddusb %%mm4, %%mm3 \n\t"\ /* p0 += clip(delta, -tc0, tc0) * q0 -= clip(delta, -tc0, tc0) */\ @@ -910,3 +909,100 @@ H264_MC(avg_, 16,mmx2) #undef H264_CHROMA_OP #undef H264_CHROMA_MC8_TMPL +/***********************************/ +/* weighted prediction */ + +static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h) +{ + int x, y; + offset <<= log2_denom; + offset += (1 << log2_denom) >> 1; + asm volatile( + "movd %0, %%mm4 \n\t" + "movd %1, %%mm5 \n\t" + "movd %2, %%mm6 \n\t" + "pshufw $0, %%mm4, %%mm4 \n\t" + "pshufw $0, %%mm5, %%mm5 \n\t" + "pxor %%mm7, %%mm7 \n\t" + :: "g"(weight), "g"(offset), "g"(log2_denom) + ); + for(y=0; y<h; y+=2){ + for(x=0; x<w; x+=4){ + asm volatile( + "movd %0, %%mm0 \n\t" + "movd %1, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "pmullw %%mm4, %%mm0 \n\t" + "pmullw %%mm4, %%mm1 \n\t" + "paddw %%mm5, %%mm0 \n\t" + "paddw %%mm5, %%mm1 \n\t" + "psraw %%mm6, %%mm0 \n\t" + "psraw %%mm6, %%mm1 \n\t" + "packuswb %%mm7, %%mm0 \n\t" + "packuswb %%mm7, %%mm1 \n\t" + "movd %%mm0, %0 \n\t" + "movd %%mm1, %1 \n\t" + : "+m"(*(uint32_t*)(dst+x)), + "+m"(*(uint32_t*)(dst+x+stride)) + ); + } + dst += 2*stride; + } +} + +static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets, int w, int h) +{ + int x, y; + int offset = ((offsets + offsetd + 1) | 1) << log2_denom; + asm volatile( + "movd %0, %%mm3 \n\t" + "movd %1, %%mm4 \n\t" + "movd %2, %%mm5 \n\t" + "movd %3, %%mm6 \n\t" + "pshufw $0, %%mm3, %%mm3 \n\t" + "pshufw $0, %%mm4, %%mm4 \n\t" + "pshufw $0, %%mm5, %%mm5 \n\t" + "pxor %%mm7, %%mm7 \n\t" + :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1) + ); + for(y=0; y<h; y++){ + for(x=0; x<w; x+=4){ + asm volatile( + "movd %0, %%mm0 \n\t" + "movd %1, %%mm1 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm1 \n\t" + "pmullw %%mm3, %%mm0 \n\t" + "pmullw %%mm4, %%mm1 \n\t" + "paddw %%mm5, %%mm0 \n\t" + "paddw %%mm1, %%mm0 \n\t" + "psraw %%mm6, %%mm0 \n\t" + "packuswb %%mm0, %%mm0 \n\t" + "movd %%mm0, %0 \n\t" + : "+m"(*(uint32_t*)(dst+x)) + : "m"(*(uint32_t*)(src+x)) + ); + } + src += stride; + dst += stride; + } +} + +#define H264_WEIGHT(W,H) \ +static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \ + ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offsetd, offsets, W, H); \ +} \ +static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \ + ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \ +} + +H264_WEIGHT(16,16) +H264_WEIGHT(16, 8) +H264_WEIGHT( 8,16) +H264_WEIGHT( 8, 8) +H264_WEIGHT( 8, 4) +H264_WEIGHT( 4, 8) +H264_WEIGHT( 4, 4) +H264_WEIGHT( 4, 2) + |