Diffstat (limited to 'src/libffmpeg/libavcodec/i386/h264dsp_mmx.c')
 src/libffmpeg/libavcodec/i386/h264dsp_mmx.c | 104 ++++++++++++++++++++++++-
 1 file changed, 100 insertions(+), 4 deletions(-)
diff --git a/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c b/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c
index 47fcf938b..c278affc8 100644
--- a/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/h264dsp_mmx.c
@@ -162,11 +162,10 @@ void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride)
/* delta = (avg(q0, p1>>2) + (d&a))
* - (avg(p0, q1>>2) + (d&~a)) */\
"pavgb %%mm2, %%mm0 \n\t"\
- "movq %%mm5, %%mm6 \n\t"\
- "pand %%mm4, %%mm6 \n\t"\
- "paddusb %%mm6, %%mm0 \n\t"\
+ "pand %%mm5, %%mm4 \n\t"\
+ "paddusb %%mm4, %%mm0 \n\t"\
"pavgb %%mm1, %%mm3 \n\t"\
- "pandn %%mm5, %%mm4 \n\t"\
+ "pxor %%mm5, %%mm4 \n\t"\
"paddusb %%mm4, %%mm3 \n\t"\
/* p0 += clip(delta, -tc0, tc0)
* q0 -= clip(delta, -tc0, tc0) */\
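The hunk above drops one `movq` and frees `%%mm6`: with `d` in `%%mm5` and the mask `a` in `%%mm4`, the identity `(d & a) ^ d == d & ~a` lets a `pand`/`pxor` pair produce both masked terms from `%%mm4` alone, where the old sequence needed a spare register copy plus a destructive `pandn`. A minimal C check of that identity (standalone, not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* pand %%mm5, %%mm4  computes t = d & a;
     * pxor %%mm5, %%mm4  then yields t ^ d == d & ~a,
     * so neither the extra copy nor pandn is needed. */
    for (int d = 0; d < 256; d++)
        for (int a = 0; a < 256; a++) {
            uint8_t t = (uint8_t)(d & a);
            assert((uint8_t)(t ^ d) == (uint8_t)(d & ~a));
        }
    return 0;
}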
@@ -910,3 +909,100 @@ H264_MC(avg_, 16,mmx2)
#undef H264_CHROMA_OP
#undef H264_CHROMA_MC8_TMPL

+/***********************************/
+/* weighted prediction */
+
+static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
+{
+ int x, y;
+ offset <<= log2_denom;
+ offset += (1 << log2_denom) >> 1;
+ asm volatile(
+ "movd %0, %%mm4 \n\t"
+ "movd %1, %%mm5 \n\t"
+ "movd %2, %%mm6 \n\t"
+ "pshufw $0, %%mm4, %%mm4 \n\t"
+ "pshufw $0, %%mm5, %%mm5 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ :: "g"(weight), "g"(offset), "g"(log2_denom)
+ );
+ for(y=0; y<h; y+=2){
+ for(x=0; x<w; x+=4){
+ asm volatile(
+ "movd %0, %%mm0 \n\t"
+ "movd %1, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pmullw %%mm4, %%mm0 \n\t"
+ "pmullw %%mm4, %%mm1 \n\t"
+ "paddw %%mm5, %%mm0 \n\t"
+ "paddw %%mm5, %%mm1 \n\t"
+ "psraw %%mm6, %%mm0 \n\t"
+ "psraw %%mm6, %%mm1 \n\t"
+ "packuswb %%mm7, %%mm0 \n\t"
+ "packuswb %%mm7, %%mm1 \n\t"
+ "movd %%mm0, %0 \n\t"
+ "movd %%mm1, %1 \n\t"
+ : "+m"(*(uint32_t*)(dst+x)),
+ "+m"(*(uint32_t*)(dst+x+stride))
+ );
+ }
+ dst += 2*stride;
+ }
+}
+
+static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets, int w, int h)
+{
+ int x, y;
+ int offset = ((offsets + offsetd + 1) | 1) << log2_denom;
+ asm volatile(
+ "movd %0, %%mm3 \n\t"
+ "movd %1, %%mm4 \n\t"
+ "movd %2, %%mm5 \n\t"
+ "movd %3, %%mm6 \n\t"
+ "pshufw $0, %%mm3, %%mm3 \n\t"
+ "pshufw $0, %%mm4, %%mm4 \n\t"
+ "pshufw $0, %%mm5, %%mm5 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
+ );
+ for(y=0; y<h; y++){
+ for(x=0; x<w; x+=4){
+ asm volatile(
+ "movd %0, %%mm0 \n\t"
+ "movd %1, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pmullw %%mm3, %%mm0 \n\t"
+ "pmullw %%mm4, %%mm1 \n\t"
+ "paddw %%mm5, %%mm0 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "psraw %%mm6, %%mm0 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "movd %%mm0, %0 \n\t"
+ : "+m"(*(uint32_t*)(dst+x))
+ : "m"(*(uint32_t*)(src+x))
+ );
+ }
+ src += stride;
+ dst += stride;
+ }
+}
+
+#define H264_WEIGHT(W,H) \
+static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
+ ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offsetd, offsets, W, H); \
+} \
+static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
+ ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
+}
+
+H264_WEIGHT(16,16)
+H264_WEIGHT(16, 8)
+H264_WEIGHT( 8,16)
+H264_WEIGHT( 8, 8)
+H264_WEIGHT( 8, 4)
+H264_WEIGHT( 4, 8)
+H264_WEIGHT( 4, 4)
+H264_WEIGHT( 4, 2)
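For readers not fluent in MMX, the following scalar sketch restates what the two kernels above compute, assuming the same parameter semantics as the asm (the `clip_uint8` helper and the `_ref` names are hypothetical, not part of the patch). Note that the sketch works in 32-bit intermediates, so it sidesteps the 16-bit wraparound that `pmullw`/`paddw` could hit at the extreme ends of the spec's weight/offset ranges.

#include <stdint.h>

static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v;
}

/* Unidirectional weighting, as in ff_h264_weight_WxH_mmx2: the offset is
 * pre-scaled and the rounding term (1 << log2_denom) >> 1 folded in once,
 * exactly as the asm does before entering its pixel loop. */
static void weight_WxH_ref(uint8_t *dst, int stride, int log2_denom,
                           int weight, int offset, int w, int h)
{
    offset <<= log2_denom;
    offset += (1 << log2_denom) >> 1;
    for (int y = 0; y < h; y++, dst += stride)
        for (int x = 0; x < w; x++)
            dst[x] = clip_uint8((dst[x] * weight + offset) >> log2_denom);
}

/* Bidirectional weighting, as in ff_h264_biweight_WxH_mmx2: the
 * ((offsets + offsetd + 1) | 1) << log2_denom folding bakes the spec's
 * averaged offset (o0 + o1 + 1) >> 1 and the 2^log2_denom rounding term
 * into a single addend, so one arithmetic shift by log2_denom + 1
 * finishes the job. */
static void biweight_WxH_ref(uint8_t *dst, uint8_t *src, int stride,
                             int log2_denom, int weightd, int weights,
                             int offsetd, int offsets, int w, int h)
{
    int offset = ((offsets + offsetd + 1) | 1) << log2_denom;
    for (int y = 0; y < h; y++, dst += stride, src += stride)
        for (int x = 0; x < w; x++)
            dst[x] = clip_uint8((dst[x] * weightd + src[x] * weights
                                 + offset) >> (log2_denom + 1));
}

The eight H264_WEIGHT instantiations cover the partition sizes down to 4x2, the narrowest the 4-pixel `movd` path can serve; the 2-pixel-wide chroma blocks presumably stay with the C fallbacks, and the generated functions would be wired into DSPContext's weight_h264_pixels_tab/biweight_h264_pixels_tab elsewhere in dsputil_mmx.c, outside this diff.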
+