1 files changed, 163 insertions, 28 deletions
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index 6bd2b32b9..c1dd2176a 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -29,6 +29,8 @@
 //#include <assert.h>
 
 extern const uint8_t ff_h263_loop_filter_strength[32];
+extern void ff_idct_xvid_mmx(short *block);  /* not defined in this file; called by ff_idct_xvid_mmx_put/_add */
+extern void ff_idct_xvid_mmx2(short *block); /* not defined in this file; called by ff_idct_xvid_mmx2_put/_add */
 
 int mm_flags; /* multimedia extension flags */
 
@@ -615,31 +617,32 @@ static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
 }
 
 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
+    void *dst_reg = dst, *src_reg = src; /* scratch copies: the asm advances both pointers by two rows */
+    /* Transposes a 4x4 byte block: reads 4 bytes from each of 4 src rows, writes 4 bytes to each of 4 dst rows. */
     asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
-        "movd  %4, %%mm0		\n\t"
-        "movd  %5, %%mm1		\n\t"
-        "movd  %6, %%mm2		\n\t"
-        "movd  %7, %%mm3		\n\t"
+        "movd  (%1), %%mm0		\n\t" // src row 0
+        "movd  (%1,%5), %%mm1		\n\t" // src row 1
+        "lea (%1, %5, 2), %1		\n\t" // src_reg += 2*src_stride
+        "movd  (%1), %%mm2		\n\t" // src row 2
+        "movd  (%1,%5), %%mm3		\n\t" // src row 3
         "punpcklbw %%mm1, %%mm0		\n\t"
         "punpcklbw %%mm3, %%mm2		\n\t"
         "movq %%mm0, %%mm1		\n\t"
         "punpcklwd %%mm2, %%mm0		\n\t"
         "punpckhwd %%mm2, %%mm1		\n\t"
-        "movd  %%mm0, %0		\n\t"
+        "movd  %%mm0, (%0)		\n\t" // dst row 0
         "punpckhdq %%mm0, %%mm0		\n\t"
-        "movd  %%mm0, %1		\n\t"
-        "movd  %%mm1, %2		\n\t"
+        "movd  %%mm0, (%0,%4)		\n\t" // dst row 1
+        "lea (%0, %4, 2), %0		\n\t" // dst_reg += 2*dst_stride
+        "movd  %%mm1, (%0)		\n\t" // dst row 2
         "punpckhdq %%mm1, %%mm1		\n\t"
-        "movd  %%mm1, %3		\n\t"
-        
-        : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
-          "=m" (*(uint32_t*)(dst + 1*dst_stride)),
-          "=m" (*(uint32_t*)(dst + 2*dst_stride)),
-          "=m" (*(uint32_t*)(dst + 3*dst_stride))
-        :  "m" (*(uint32_t*)(src + 0*src_stride)),
-           "m" (*(uint32_t*)(src + 1*src_stride)),
-           "m" (*(uint32_t*)(src + 2*src_stride)),
-           "m" (*(uint32_t*)(src + 3*src_stride))
+        "movd  %%mm1, (%0,%4)		\n\t" // dst row 3
+        : "=&r" (dst_reg),
+          "=&r" (src_reg)
+        : "0"   (dst_reg),
+          "1"   (src_reg),
+          "r"   ((long)dst_stride), /* long: index register must match the pointer-width base register */
+          "r"   ((long)src_stride) : "memory" /* asm reads src and writes dst with no "m" operands */
     );
 }
 
@@ -742,31 +745,49 @@ static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
     int tmp;
   asm volatile (
       "movl %4,%%ecx\n"
+      "shr $1,%%ecx\n"
       "pxor %%mm0,%%mm0\n"	/* mm0 = 0 */
       "pxor %%mm7,%%mm7\n"	/* mm7 holds the sum */
       "1:\n"
-      "movq (%0),%%mm1\n"	/* mm1 = pix1[0-7] */
-      "movq (%1),%%mm2\n"	/* mm2 = pix2[0-7] */
+      "movq (%0),%%mm1\n"	/* mm1 = pix1[0][0-7] */
+      "movq (%1),%%mm2\n"	/* mm2 = pix2[0][0-7] */
+      "movq (%0,%3),%%mm3\n"	/* mm3 = pix1[1][0-7] */
+      "movq (%1,%3),%%mm4\n"	/* mm4 = pix2[1][0-7] */
 
+      /* todo: mm1-mm2, mm3-mm4 */
+      /* algo: subtract mm1 from mm2 with saturation and vice versa */
+      /*       OR the results to get absolute difference */
       "movq %%mm1,%%mm5\n"
+      "movq %%mm3,%%mm6\n"
       "psubusb %%mm2,%%mm1\n"
+      "psubusb %%mm4,%%mm3\n"
       "psubusb %%mm5,%%mm2\n"
+      "psubusb %%mm6,%%mm4\n"
 
       "por %%mm1,%%mm2\n"
+      "por %%mm3,%%mm4\n"
 
+      /* now convert to 16-bit vectors so we can square them */
       "movq %%mm2,%%mm1\n"
+      "movq %%mm4,%%mm3\n"
 
       "punpckhbw %%mm0,%%mm2\n"
+      "punpckhbw %%mm0,%%mm4\n"
       "punpcklbw %%mm0,%%mm1\n"	/* mm1 now spread over (mm1,mm2) */
+      "punpcklbw %%mm0,%%mm3\n"	/* mm4 now spread over (mm3,mm4) */
 
       "pmaddwd %%mm2,%%mm2\n"
+      "pmaddwd %%mm4,%%mm4\n"
       "pmaddwd %%mm1,%%mm1\n"
+      "pmaddwd %%mm3,%%mm3\n"
 
-      "add %3,%0\n"
-      "add %3,%1\n"
+      "lea (%0,%3,2), %0\n"	/* pix1 += 2*line_size */
+      "lea (%1,%3,2), %1\n"	/* pix2 += 2*line_size */
 
       "paddd %%mm2,%%mm1\n"
+      "paddd %%mm4,%%mm3\n"
       "paddd %%mm1,%%mm7\n"
+      "paddd %%mm3,%%mm7\n"
 
       "decl %%ecx\n"
       "jnz 1b\n"
@@ -841,6 +862,68 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
     return tmp;
 }
 
+static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+    int tmp; /* result: sum of squared byte differences between pix1 and pix2, 16 bytes per row */
+  asm volatile ( /* v is unused; two rows per pass. NOTE(review): assumes h is even and >= 2 -- confirm with callers */
+      "shr $1,%2\n"	/* h /= 2: loop counter, one pass handles two rows */
+      "pxor %%xmm0,%%xmm0\n"	/* xmm0 = 0 */
+      "pxor %%xmm7,%%xmm7\n"	/* xmm7 holds the sum */
+      "1:\n"
+      "movdqu (%0),%%xmm1\n"	/* xmm1 = pix1[0][0-15] */
+      "movdqu (%1),%%xmm2\n"	/* xmm2 = pix2[0][0-15] */
+      "movdqu (%0,%4),%%xmm3\n"	/* xmm3 = pix1[1][0-15] */
+      "movdqu (%1,%4),%%xmm4\n"	/* xmm4 = pix2[1][0-15] */
+
+      /* next: |xmm1-xmm2| and |xmm3-xmm4| */
+      /* algo: subtract xmm1 from xmm2 with saturation and vice versa */
+      /*       OR the results to get absolute difference */
+      "movdqa %%xmm1,%%xmm5\n"
+      "movdqa %%xmm3,%%xmm6\n"
+      "psubusb %%xmm2,%%xmm1\n"
+      "psubusb %%xmm4,%%xmm3\n"
+      "psubusb %%xmm5,%%xmm2\n"
+      "psubusb %%xmm6,%%xmm4\n"
+
+      "por %%xmm1,%%xmm2\n"
+      "por %%xmm3,%%xmm4\n"
+
+      /* now convert to 16-bit vectors so we can square them */
+      "movdqa %%xmm2,%%xmm1\n"
+      "movdqa %%xmm4,%%xmm3\n"
+
+      "punpckhbw %%xmm0,%%xmm2\n"
+      "punpckhbw %%xmm0,%%xmm4\n"
+      "punpcklbw %%xmm0,%%xmm1\n"	/* row 0 differences now spread over (xmm1,xmm2) */
+      "punpcklbw %%xmm0,%%xmm3\n"	/* row 1 differences now spread over (xmm3,xmm4) */
+
+      "pmaddwd %%xmm2,%%xmm2\n"	/* square each word and add adjacent pairs into dwords */
+      "pmaddwd %%xmm4,%%xmm4\n"
+      "pmaddwd %%xmm1,%%xmm1\n"
+      "pmaddwd %%xmm3,%%xmm3\n"
+
+      "lea (%0,%4,2), %0\n"	/* pix1 += 2*line_size */
+      "lea (%1,%4,2), %1\n"	/* pix2 += 2*line_size */
+
+      "paddd %%xmm2,%%xmm1\n"
+      "paddd %%xmm4,%%xmm3\n"
+      "paddd %%xmm1,%%xmm7\n"	/* accumulate row 0 into the running sum */
+      "paddd %%xmm3,%%xmm7\n"	/* accumulate row 1 into the running sum */
+
+      "decl %2\n"
+      "jnz 1b\n"
+
+      "movdqa %%xmm7,%%xmm1\n"
+      "psrldq $8, %%xmm7\n"	/* shift hi qword to lo */
+      "paddd %%xmm1,%%xmm7\n"
+      "movdqa %%xmm7,%%xmm1\n"
+      "psrldq $4, %%xmm7\n"	/* shift hi dword to lo */
+      "paddd %%xmm1,%%xmm7\n"
+      "movd %%xmm7,%3\n"	/* tmp = low dword, which now holds the total */
+      : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp) 
+      : "r" ((long)line_size)); /* NOTE(review): asm reads *pix1 / *pix2 but has no "m" inputs or "memory" clobber -- confirm */
+    return tmp;
+}
+
 static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
     int tmp;
   asm volatile (
@@ -1080,7 +1163,8 @@ static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
       return tmp + hf_noise8_mmx(pix+8, line_size, h);
 }
 
-static int nsse16_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+    MpegEncContext *c = p;
     int score1= sse16_mmx(c, pix1, pix2, line_size, h);
     int score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
 
@@ -1088,7 +1172,8 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int lin
     else  return score1 + ABS(score2)*8;
 }
 
-static int nsse8_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+    MpegEncContext *c = p;
     int score1= sse8_mmx(c, pix1, pix2, line_size, h);
     int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
 
@@ -1617,11 +1702,9 @@ static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride
         "movq 64(%1), %%mm1		\n\t"
         MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
         
-        "movq %%mm0, %%mm1		\n\t"
-        "psrlq $32, %%mm0		\n\t"
+        "pshufw $0x0E, %%mm0, %%mm1     \n\t"
         "paddusw %%mm1, %%mm0		\n\t"
-        "movq %%mm0, %%mm1		\n\t"
-        "psrlq $16, %%mm0		\n\t"
+        "pshufw $0x01, %%mm0, %%mm1     \n\t"
         "paddusw %%mm1, %%mm0		\n\t"
         "movd %%mm0, %0			\n\t"
                 
@@ -2455,6 +2538,28 @@ static void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
     ff_vp3_idct_mmx(block);
     add_pixels_clamped_mmx(block, dest, line_size);
 }
+#ifdef CONFIG_GPL
+static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
+{ /* Calls ff_idct_xvid_mmx on block, then passes the same block with dest and line_size to put_pixels_clamped_mmx. */
+    ff_idct_xvid_mmx (block);
+    put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
+{ /* Calls ff_idct_xvid_mmx on block, then passes the same block with dest and line_size to add_pixels_clamped_mmx. */
+    ff_idct_xvid_mmx (block);
+    add_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
+{ /* Calls ff_idct_xvid_mmx2 on block; the store step is the plain-MMX put_pixels_clamped_mmx. */
+    ff_idct_xvid_mmx2 (block);
+    put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
+{ /* Calls ff_idct_xvid_mmx2 on block; the add step is the plain-MMX add_pixels_clamped_mmx. */
+    ff_idct_xvid_mmx2 (block);
+    add_pixels_clamped_mmx(block, dest, line_size);
+}
+#endif
     
 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
@@ -2527,6 +2632,18 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                     c->idct    = ff_vp3_idct_mmx;
                     c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
                 }
+#ifdef CONFIG_GPL
+            }else if(idct_algo==FF_IDCT_XVIDMMX){
+                if(mm_flags & MM_MMXEXT){
+                    c->idct_put= ff_idct_xvid_mmx2_put;
+                    c->idct_add= ff_idct_xvid_mmx2_add;
+                    c->idct    = ff_idct_xvid_mmx2;
+                }else{
+                    c->idct_put= ff_idct_xvid_mmx_put;
+                    c->idct_add= ff_idct_xvid_mmx_add;
+                    c->idct    = ff_idct_xvid_mmx;
+                }
+#endif
             }
         }
 
@@ -2590,7 +2707,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         c->hadamard8_diff[1]= hadamard8_diff_mmx;
         
 	c->pix_norm1 = pix_norm1_mmx;
-	c->sse[0] = sse16_mmx;
+	c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
   	c->sse[1] = sse8_mmx;
         c->vsad[4]= vsad_intra16_mmx;
 
@@ -2716,6 +2833,24 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
             c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
 
+            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
+            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
+            c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
+            c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
+            c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
+            c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
+            c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
+            c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
+
+            c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
+            c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
+            c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
+            c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
+            c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
+            c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
+            c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
+            c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
+
 #ifdef CONFIG_ENCODERS
             c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
 #endif //CONFIG_ENCODERS