7 files changed, 340 insertions, 276 deletions
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index 857f1d398..d5a2d3734 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -20,33 +20,9 @@
  */
 
 #include "../dsputil.h"
+#include "../simple_idct.h"
 
 int mm_flags; /* multimedia extension flags */
-/* FIXME use them in static form */
-int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
-
-int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
-
-int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx);
-
-int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
-int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx);
-
-int sad16x16_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
-int sad8x8_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
-int sad16x16_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
-int sad8x8_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx);
 
 /* pixel operations */
 static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
@@ -195,7 +171,7 @@ static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000F
 /***********************************/
 /* standard MMX */
 
-static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
+static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
 {
     asm volatile(
         "movl $-128, %%eax	\n\t"
@@ -223,7 +199,7 @@ static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size)
     );
 }
 
-static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride)
+static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
 {
     asm volatile(
         "pxor %%mm7, %%mm7	\n\t"
@@ -252,10 +228,10 @@ static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8
     );
 }
 
-void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
+void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
 {
     const DCTELEM *p;
-    UINT8 *pix;
+    uint8_t *pix;
 
     /* read the pixels */
     p = block;
@@ -307,10 +283,10 @@ void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
 	    :"memory");
 }
 
-void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
+void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
 {
     const DCTELEM *p;
-    UINT8 *pix;
+    uint8_t *pix;
     int i;
 
     /* read the pixels */
@@ -348,7 +324,7 @@ void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size)
     } while (--i);
 }
 
-static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
 	 "lea (%3, %3), %%eax		\n\t"
@@ -374,7 +350,7 @@ static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, in
 	);
 }
 
-static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
 	 "lea (%3, %3), %%eax		\n\t"
@@ -425,7 +401,7 @@ static void clear_blocks_mmx(DCTELEM *blocks)
         );
 }
 
-static int pix_sum16_mmx(UINT8 * pix, int line_size){
+static int pix_sum16_mmx(uint8_t * pix, int line_size){
     const int h=16;
     int sum;
     int index= -line_size*h;
@@ -528,7 +504,7 @@ static int pix_norm1_mmx(uint8_t *pix, int line_size) {
     return tmp;
 }
 
-static int sse16_mmx(void *v, UINT8 * pix1, UINT8 * pix2, int line_size) {
+static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) {
     int tmp;
   asm volatile (
       "movl $16,%%ecx\n"
@@ -607,26 +583,21 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
     for(; i<w; i++)
         dst[i+0] = src1[i+0]-src2[i+0];
 }
-#define LBUTTERFLY(a,b)\
-    "paddw " #b ", " #a "		\n\t"\
-    "paddw " #b ", " #b "		\n\t"\
-    "psubw " #a ", " #b "		\n\t"
+#define LBUTTERFLY2(a1,b1,a2,b2)\
+    "paddw " #b1 ", " #a1 "		\n\t"\
+    "paddw " #b2 ", " #a2 "		\n\t"\
+    "paddw " #b1 ", " #b1 "		\n\t"\
+    "paddw " #b2 ", " #b2 "		\n\t"\
+    "psubw " #a1 ", " #b1 "		\n\t"\
+    "psubw " #a2 ", " #b1 "		\n\t"
 
 #define HADAMARD48\
-        LBUTTERFLY(%%mm0, %%mm1)\
-        LBUTTERFLY(%%mm2, %%mm3)\
-        LBUTTERFLY(%%mm4, %%mm5)\
-        LBUTTERFLY(%%mm6, %%mm7)\
-        \
-        LBUTTERFLY(%%mm0, %%mm2)\
-        LBUTTERFLY(%%mm1, %%mm3)\
-        LBUTTERFLY(%%mm4, %%mm6)\
-        LBUTTERFLY(%%mm5, %%mm7)\
-        \
-        LBUTTERFLY(%%mm0, %%mm4)\
-        LBUTTERFLY(%%mm1, %%mm5)\
-        LBUTTERFLY(%%mm2, %%mm6)\
-        LBUTTERFLY(%%mm3, %%mm7)
+        LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
+        LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
+        LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
+        LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
+        LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
+        LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\
 
 #define MMABS(a,z)\
     "pxor " #z ", " #z "		\n\t"\
@@ -641,12 +612,22 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
     "psubw " #z ", " #a "		\n\t"\
     "paddusw " #a ", " #sum "		\n\t"
 
-    
+#define MMABS_MMX2(a,z)\
+    "pxor " #z ", " #z "		\n\t"\
+    "psubw " #a ", " #z "		\n\t"\
+    "pmaxsw " #z ", " #a "		\n\t"
+
+#define MMABS_SUM_MMX2(a,z, sum)\
+    "pxor " #z ", " #z "		\n\t"\
+    "psubw " #a ", " #z "		\n\t"\
+    "pmaxsw " #z ", " #a "		\n\t"\
+    "paddusw " #a ", " #sum "		\n\t"
+        
 #define SBUTTERFLY(a,b,t,n)\
     "movq " #a ", " #t "		\n\t" /* abcd */\
     "punpckl" #n " " #b ", " #a "	\n\t" /* aebf */\
     "punpckh" #n " " #b ", " #t "	\n\t" /* cgdh */\
-    
+
 #define TRANSPOSE4(a,b,c,d,t)\
     SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
     SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
@@ -750,7 +731,94 @@ static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride)
     return sum&0xFFFF;
 }
 
+static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride){
+    uint64_t temp[16] __align8;
+    int sum=0;
+
+    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
+
+    asm volatile(
+        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
+        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
+        
+        HADAMARD48
+        
+        "movq %%mm7, 112(%1)		\n\t"
+        
+        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
+        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
+        
+        "movq 112(%1), %%mm7 		\n\t"
+        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
+        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
+
+        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
+        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
+        
+        HADAMARD48
+        
+        "movq %%mm7, 120(%1)		\n\t"
+        
+        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
+        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
+        
+        "movq 120(%1), %%mm7 		\n\t"
+        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
+        "movq %%mm7, %%mm5		\n\t"//FIXME remove
+        "movq %%mm6, %%mm7		\n\t"
+        "movq %%mm0, %%mm6		\n\t"
+//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
+        
+        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
+//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
+        
+        HADAMARD48
+        "movq %%mm7, 64(%1)		\n\t"
+        MMABS_MMX2(%%mm0, %%mm7)
+        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
+        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
+        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
+        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
+        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
+        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
+        "movq 64(%1), %%mm1		\n\t"
+        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
+        "movq %%mm0, 64(%1)		\n\t"
+        
+        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
+        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
+        
+        HADAMARD48
+        "movq %%mm7, (%1)		\n\t"
+        MMABS_MMX2(%%mm0, %%mm7)
+        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
+        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
+        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
+        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
+        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
+        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
+        "movq (%1), %%mm1		\n\t"
+        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
+        "movq 64(%1), %%mm1		\n\t"
+        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
+        
+        "movq %%mm0, %%mm1		\n\t"
+        "psrlq $32, %%mm0		\n\t"
+        "paddusw %%mm1, %%mm0		\n\t"
+        "movq %%mm0, %%mm1		\n\t"
+        "psrlq $16, %%mm0		\n\t"
+        "paddusw %%mm1, %%mm0		\n\t"
+        "movd %%mm0, %0			\n\t"
+                
+        : "=r" (sum)
+        : "r"(temp)
+    );
+    return sum&0xFFFF;
+}
+
+
 WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
+WARPER88_1616(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
 
 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
@@ -777,7 +845,7 @@ WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
         OP(%%mm5, out, %%mm7, d)
 
 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
-void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
     uint64_t temp;\
 \
     asm volatile(\
@@ -944,7 +1012,7 @@ static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, i
     }\
 }\
 \
-void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
     uint64_t temp;\
 \
     asm volatile(\
@@ -1121,7 +1189,7 @@ static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
     );\
 }\
 \
-void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
     uint64_t temp[9*4];\
     uint64_t *temp_ptr= temp;\
     int count= 9;\
@@ -1181,46 +1249,46 @@ void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dst
    );\
 }\
 \
-static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
     OPNAME ## pixels8_mmx(dst, src, stride, 8);\
 }\
 \
-static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t temp[8];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
     OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
 }\
 \
-static void OPNAME ## qpel8_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
 }\
 \
-static void OPNAME ## qpel8_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t temp[8];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
     OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\
 }\
 \
-static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t temp[8];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
     OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\
 }\
 \
-static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
 }\
 \
-static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t temp[8];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
     OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\
 }\
-static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half) + 64;\
     uint8_t * const halfHV= ((uint8_t*)half);\
@@ -1229,7 +1297,7 @@ static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
     OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
 }\
-static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half) + 64;\
     uint8_t * const halfHV= ((uint8_t*)half);\
@@ -1238,7 +1306,7 @@ static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
     OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
 }\
-static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half) + 64;\
     uint8_t * const halfHV= ((uint8_t*)half);\
@@ -1247,7 +1315,7 @@ static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
     OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
 }\
-static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half) + 64;\
     uint8_t * const halfHV= ((uint8_t*)half);\
@@ -1256,7 +1324,7 @@ static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
     OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
 }\
-static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half) + 64;\
     uint8_t * const halfHV= ((uint8_t*)half);\
@@ -1264,7 +1332,7 @@ static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
     OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\
 }\
-static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half) + 64;\
     uint8_t * const halfHV= ((uint8_t*)half);\
@@ -1272,66 +1340,66 @@ static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
     put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
     OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\
 }\
-static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
     put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\
     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
 }\
-static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[8 + 9];\
     uint8_t * const halfH= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
     put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\
     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
 }\
-static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[9];\
     uint8_t * const halfH= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
 }\
-static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
     OPNAME ## pixels16_mmx(dst, src, stride, 16);\
 }\
 \
-static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t temp[32];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
     OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
 }\
 \
-static void OPNAME ## qpel16_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
 }\
 \
-static void OPNAME ## qpel16_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t temp[32];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
     OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\
 }\
 \
-static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t temp[32];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
     OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\
 }\
 \
-static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
 }\
 \
-static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t temp[32];\
     uint8_t * const half= (uint8_t*)temp;\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
     OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\
 }\
-static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[16*2 + 17*2];\
     uint8_t * const halfH= ((uint8_t*)half) + 256;\
     uint8_t * const halfHV= ((uint8_t*)half);\
@@ -1340,7 +1408,7 @@ static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
 }\
-static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[16*2 + 17*2];\
     uint8_t * const halfH= ((uint8_t*)half) + 256;\
     uint8_t * const halfHV= ((uint8_t*)half);\
@@ -1349,7 +1417,7 @@ static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
 }\
-static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[16*2 + 17*2];\
     uint8_t * const halfH= ((uint8_t*)half) + 256;\
     uint8_t * const halfHV= ((uint8_t*)half);\
@@ -1358,7 +1426,7 @@ static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
 }\
-static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[16*2 + 17*2];\
     uint8_t * const halfH= ((uint8_t*)half) + 256;\
     uint8_t * const halfHV= ((uint8_t*)half);\
@@ -1367,7 +1435,7 @@ static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
 }\
-static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[16*2 + 17*2];\
     uint8_t * const halfH= ((uint8_t*)half) + 256;\
     uint8_t * const halfHV= ((uint8_t*)half);\
@@ -1375,7 +1443,7 @@ static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\
 }\
-static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[16*2 + 17*2];\
     uint8_t * const halfH= ((uint8_t*)half) + 256;\
     uint8_t * const halfHV= ((uint8_t*)half);\
@@ -1383,21 +1451,21 @@ static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
     put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
     OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\
 }\
-static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[17*2];\
     uint8_t * const halfH= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
     put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\
     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
 }\
-static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[17*2];\
     uint8_t * const halfH= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
     put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\
     OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
 }\
-static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\
+static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     uint64_t half[17*2];\
     uint8_t * const halfH= ((uint8_t*)half);\
     put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
@@ -1433,10 +1501,45 @@ static void just_return() { return; }
     c->put_ ## postfix1 = put_ ## postfix2;\
     c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
     c->avg_ ## postfix1 = avg_ ## postfix2;
+
+/* external functions, from idct_mmx.c */
+void ff_mmx_idct(DCTELEM *block);
+void ff_mmxext_idct(DCTELEM *block);
+
+/* XXX: those functions should be suppressed ASAP when all IDCTs are
+   converted */
+static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+    ff_mmx_idct (block);
+    put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+    ff_mmx_idct (block);
+    add_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+    ff_mmxext_idct (block);
+    put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+    ff_mmxext_idct (block);
+    add_pixels_clamped_mmx(block, dest, line_size);
+}
     
-void dsputil_init_mmx(DSPContext* c, unsigned mask)
+void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
 {
     mm_flags = mm_support();
+
+    if (avctx->dsp_mask) {
+	if (avctx->dsp_mask & FF_MM_FORCE)
+	    mm_flags |= (avctx->dsp_mask & 0xffff);
+	else
+	    mm_flags &= ~(avctx->dsp_mask & 0xffff);
+    }
+
 #if 0
     fprintf(stderr, "libavcodec: CPU flags:");
     if (mm_flags & MM_MMX)
@@ -1453,6 +1556,27 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask)
 #endif
 
     if (mm_flags & MM_MMX) {
+        const int dct_algo = avctx->dct_algo;
+        const int idct_algo= avctx->idct_algo;
+
+        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)
+            c->fdct = ff_fdct_mmx;
+
+        if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
+            c->idct_put= ff_simple_idct_put_mmx;
+            c->idct_add= ff_simple_idct_add_mmx;
+            c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
+        }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
+            if(mm_flags & MM_MMXEXT){
+                c->idct_put= ff_libmpeg2mmx2_idct_put;
+                c->idct_add= ff_libmpeg2mmx2_idct_add;
+            }else{
+                c->idct_put= ff_libmpeg2mmx_idct_put;
+                c->idct_add= ff_libmpeg2mmx_idct_add;
+            }
+            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
+        }
+        
         c->get_pixels = get_pixels_mmx;
         c->diff_pixels = diff_pixels_mmx;
         c->put_pixels_clamped = put_pixels_clamped_mmx;
@@ -1460,15 +1584,6 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask)
         c->clear_blocks = clear_blocks_mmx;
         c->pix_sum = pix_sum16_mmx;
 
-        c->pix_abs16x16     = pix_abs16x16_mmx;
-        c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
-        c->pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
-        c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
-        c->pix_abs8x8     = pix_abs8x8_mmx;
-        c->pix_abs8x8_x2  = pix_abs8x8_x2_mmx;
-        c->pix_abs8x8_y2  = pix_abs8x8_y2_mmx;
-        c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;
-
         c->put_pixels_tab[0][0] = put_pixels16_mmx;
         c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
         c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
@@ -1515,45 +1630,35 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask)
         c->hadamard8_diff[0]= hadamard8_diff16_mmx;
         c->hadamard8_diff[1]= hadamard8_diff_mmx;
         
-        c->sad[0]= sad16x16_mmx;
-        c->sad[1]= sad8x8_mmx;
-
 	c->pix_norm1 = pix_norm1_mmx;
 	c->sse[0] = sse16_mmx;
         
         if (mm_flags & MM_MMXEXT) {
-            c->pix_abs16x16     = pix_abs16x16_mmx2;
-            c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx2;
-            c->pix_abs16x16_y2  = pix_abs16x16_y2_mmx2;
-            c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2;
-
-            c->pix_abs8x8     = pix_abs8x8_mmx2;
-            c->pix_abs8x8_x2  = pix_abs8x8_x2_mmx2;
-            c->pix_abs8x8_y2  = pix_abs8x8_y2_mmx2;
-            c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;
-
-            c->sad[0]= sad16x16_mmx2;
-            c->sad[1]= sad8x8_mmx2;
-            
             c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
             c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
-            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
-            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
 
             c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
             c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
             c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
-            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
 
             c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
             c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
-            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
-            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
 
             c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
             c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
             c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
-            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
+
+            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
+            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
+
+            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
+                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
+                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
+                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
+                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
+                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
+            }
 
 #if 1
             SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
@@ -1592,23 +1697,26 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask)
         } else if (mm_flags & MM_3DNOW) {
             c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
             c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
-            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
-            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
 
             c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
             c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
             c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
-            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
 
             c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
             c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
-            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
-            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
 
             c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
             c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
             c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
-            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
+
+            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
+                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
+                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
+                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
+                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
+                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
+            }
 
             SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
             SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
@@ -1644,7 +1752,8 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask)
             SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
         }
     }
-
+        
+    dsputil_init_pix_mmx(c, avctx);
 #if 0
     // for speed testing
     get_pixels = just_return;
@@ -1680,28 +1789,3 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask)
     //ff_idct = just_return;
 #endif
 }
-
-/* remove any non bit exact operation (testing purpose). NOTE that
-   this function should be kept as small as possible because it is
-   always difficult to test automatically non bit exact cases. */
-void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask)
-{
-    if (mm_flags & MM_MMX) {
-        /* MMX2 & 3DNOW */
-        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
-        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
-        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
-        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
-        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
-        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
-
-        if (mm_flags & MM_MMXEXT) {
-            c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
-            c->pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
-            c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
-            c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
-            c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
-            c->pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
-        }
-    }
-}
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
index 4a8841156..8418123ac 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
@@ -25,7 +25,7 @@
 /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
    clobber bug - now it will work with 2.95.2 and also with -fPIC
  */
-static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -85,7 +85,7 @@ static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
 	:"memory");
 }
 
-static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -154,7 +154,7 @@ static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
 }
  
 /* GL: this function does incorrect rounding if overflow */
-static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_BONE(mm6);
     __asm __volatile(
@@ -191,7 +191,7 @@ static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int li
 	:"%eax", "memory");
 }
 
-static void DEF(put_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -222,7 +222,7 @@ static void DEF(put_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size
 }
 
 /* GL: this function does incorrect rounding if overflow */
-static void DEF(put_no_rnd_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_BONE(mm6);
     __asm __volatile(
@@ -255,7 +255,7 @@ static void DEF(put_no_rnd_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int li
 	:"%eax", "memory");
 }
 
-static void DEF(avg_pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -283,7 +283,7 @@ static void DEF(avg_pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, i
 	:"%eax", "memory");
 }
 
-static void DEF(avg_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -315,7 +315,7 @@ static void DEF(avg_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size
 	:"%eax", "memory");
 }
 
-static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     __asm __volatile(
 	"lea (%3, %3), %%eax		\n\t"
@@ -354,7 +354,7 @@ static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size
 }
 
 // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter 
-static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_BONE(mm6);
     __asm __volatile(
@@ -396,31 +396,31 @@ static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_siz
 }
 
 //FIXME the following could be optimized too ...
-static void DEF(put_no_rnd_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
     DEF(put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
     DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
 }
-static void DEF(put_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
     DEF(put_pixels8_y2)(block  , pixels  , line_size, h);
     DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
 }
-static void DEF(put_no_rnd_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
     DEF(put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
     DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
 }
-static void DEF(avg_pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
     DEF(avg_pixels8)(block  , pixels  , line_size, h);
     DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
 }
-static void DEF(avg_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
     DEF(avg_pixels8_x2)(block  , pixels  , line_size, h);
     DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
 }
-static void DEF(avg_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
     DEF(avg_pixels8_y2)(block  , pixels  , line_size, h);
     DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
 }
-static void DEF(avg_pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
     DEF(avg_pixels8_xy2)(block  , pixels  , line_size, h);
     DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
 }
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
index 956edf798..bbd5aec97 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
@@ -22,7 +22,7 @@
  */
 
 // put_pixels
-static void DEF(put, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm __volatile(
@@ -104,7 +104,7 @@ static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
 	:"memory");
 }
 
-static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm __volatile(
@@ -199,7 +199,7 @@ static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in
 	:"memory"); 
 }
 
-static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm __volatile(
@@ -228,7 +228,7 @@ static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_siz
 	:"eax", "memory");
 }
 
-static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_ZERO(mm7);
     SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
@@ -296,7 +296,7 @@ static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si
 
 // avg_pixels
 // in case more speed is needed - unroling would certainly help
-static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
     JUMPALIGN();
@@ -315,7 +315,7 @@ static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size,
     while (--h);
 }
 
-static void DEF(avg, pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
     JUMPALIGN();
@@ -338,7 +338,7 @@ static void DEF(avg, pixels16)(UINT8 *block, const UINT8 *pixels, int line_size,
     while (--h);
 }
 
-static void DEF(avg, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
     JUMPALIGN();
@@ -379,7 +379,7 @@ static void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
     } while (--h);
 }
 
-static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
     JUMPALIGN();
@@ -432,7 +432,7 @@ static void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in
     } while (--h);
 }
 
-static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_BFE(mm6);
     __asm __volatile(
@@ -472,7 +472,7 @@ static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_siz
 }
 
 // this routine is 'slightly' suboptimal but mostly unused
-static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
+static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
 {
     MOVQ_ZERO(mm7);
     SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
@@ -547,22 +547,22 @@ static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si
 }
 
 //FIXME optimize
-static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
     DEF(put, pixels8_y2)(block  , pixels  , line_size, h);
     DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h);
 }
 
-static void DEF(put, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
     DEF(put, pixels8_xy2)(block  , pixels  , line_size, h);
     DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
 }
 
-static void DEF(avg, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
     DEF(avg, pixels8_y2)(block  , pixels  , line_size, h);
     DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h);
 }
 
-static void DEF(avg, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){
+static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
     DEF(avg, pixels8_xy2)(block  , pixels  , line_size, h);
     DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
 }
diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
index fa85db67b..5c4b32dcd 100644
--- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
@@ -20,7 +20,7 @@
  */
 #include "../dsputil.h"
 
-static const __attribute__ ((aligned(8))) UINT64 round_tab[3]={
+static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
 0x0000000000000000,
 0x0001000100010001,
 0x0002000200020002,
@@ -28,7 +28,7 @@ static const __attribute__ ((aligned(8))) UINT64 round_tab[3]={
 
 static __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL;
 
-static inline void sad8_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h)
+static inline void sad8_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
     int len= -(stride<<h);
     asm volatile(
@@ -64,7 +64,7 @@ static inline void sad8_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h)
     );
 }
 
-static inline void sad8_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h)
+static inline void sad8_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
     int len= -(stride<<h);
     asm volatile(
@@ -86,7 +86,7 @@ static inline void sad8_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h)
     );
 }
 
-static inline void sad8_2_mmx2(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h)
+static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
 {
     int len= -(stride<<h);
     asm volatile(
@@ -112,13 +112,13 @@ static inline void sad8_2_mmx2(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stri
     );
 }
 
-static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h)
+static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 { //FIXME reuse src
     int len= -(stride<<h);
     asm volatile(
         ".balign 16			\n\t"
         "movq "MANGLE(bone)", %%mm5	\n\t"
-        "1:				\n\t" 
+        "1:				\n\t"
         "movq (%1, %%eax), %%mm0	\n\t"
         "movq (%2, %%eax), %%mm2	\n\t"
         "movq 1(%1, %%eax), %%mm1	\n\t"
@@ -149,7 +149,7 @@ static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h)
     );
 }
 
-static inline void sad8_2_mmx(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h)
+static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
 {
     int len= -(stride<<h);
     asm volatile(
@@ -165,7 +165,7 @@ static inline void sad8_2_mmx(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int strid
         "punpckhbw %%mm7, %%mm3		\n\t"
         "paddw %%mm0, %%mm1		\n\t"
         "paddw %%mm2, %%mm3		\n\t"
-        "movq (%3, %%eax), %%mm4	\n\t" 
+        "movq (%3, %%eax), %%mm4	\n\t"
         "movq (%3, %%eax), %%mm2	\n\t"
         "paddw %%mm5, %%mm1		\n\t"
         "paddw %%mm5, %%mm3		\n\t"
@@ -187,7 +187,7 @@ static inline void sad8_2_mmx(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int strid
     );
 }
 
-static inline void sad8_4_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h)
+static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
     int len= -(stride<<h);
     asm volatile(
@@ -215,8 +215,8 @@ static inline void sad8_4_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h)
         "punpckhbw %%mm7, %%mm4		\n\t"
         "paddw %%mm3, %%mm2		\n\t"
         "paddw %%mm4, %%mm1		\n\t"
-        "movq (%3, %%eax), %%mm3	\n\t" 
-        "movq (%3, %%eax), %%mm4	\n\t" 
+        "movq (%3, %%eax), %%mm3	\n\t"
+        "movq (%3, %%eax), %%mm4	\n\t"
         "paddw %%mm5, %%mm2		\n\t"
         "paddw %%mm5, %%mm1		\n\t"
         "psrlw $2, %%mm2		\n\t"
@@ -237,7 +237,7 @@ static inline void sad8_4_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h)
     );
 }
 
-static inline int sum_mmx()
+static inline int sum_mmx(void)
 {
     int ret;
     asm volatile(
@@ -253,7 +253,7 @@ static inline int sum_mmx()
     return ret&0xFFFF;
 }
 
-static inline int sum_mmx2()
+static inline int sum_mmx2(void)
 {
     int ret;
     asm volatile(
@@ -265,7 +265,7 @@ static inline int sum_mmx2()
 
 
 #define PIX_SAD(suf)\
-int pix_abs8x8_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+static int pix_abs8x8_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
 {\
     asm volatile("pxor %%mm7, %%mm7		\n\t"\
                  "pxor %%mm6, %%mm6		\n\t":);\
@@ -274,7 +274,7 @@ int pix_abs8x8_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
 \
     return sum_ ## suf();\
 }\
-int sad8x8_ ## suf(void *s, UINT8 *blk2, UINT8 *blk1, int stride)\
+static int sad8x8_ ## suf(void *s, uint8_t *blk2, uint8_t *blk1, int stride)\
 {\
     asm volatile("pxor %%mm7, %%mm7		\n\t"\
                  "pxor %%mm6, %%mm6		\n\t":);\
@@ -284,7 +284,7 @@ int sad8x8_ ## suf(void *s, UINT8 *blk2, UINT8 *blk1, int stride)\
     return sum_ ## suf();\
 }\
 \
-int pix_abs8x8_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+static int pix_abs8x8_x2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
 {\
     asm volatile("pxor %%mm7, %%mm7		\n\t"\
                  "pxor %%mm6, %%mm6		\n\t"\
@@ -297,7 +297,7 @@ int pix_abs8x8_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
     return sum_ ## suf();\
 }\
 \
-int pix_abs8x8_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+static int pix_abs8x8_y2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
 {\
     asm volatile("pxor %%mm7, %%mm7		\n\t"\
                  "pxor %%mm6, %%mm6		\n\t"\
@@ -310,7 +310,7 @@ int pix_abs8x8_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
     return sum_ ## suf();\
 }\
 \
-int pix_abs8x8_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+static int pix_abs8x8_xy2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
 {\
     asm volatile("pxor %%mm7, %%mm7		\n\t"\
                  "pxor %%mm6, %%mm6		\n\t"\
@@ -323,7 +323,7 @@ int pix_abs8x8_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
     return sum_ ## suf();\
 }\
 \
-int pix_abs16x16_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+static int pix_abs16x16_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
 {\
     asm volatile("pxor %%mm7, %%mm7		\n\t"\
                  "pxor %%mm6, %%mm6		\n\t":);\
@@ -333,7 +333,7 @@ int pix_abs16x16_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
 \
     return sum_ ## suf();\
 }\
-int sad16x16_ ## suf(void *s, UINT8 *blk2, UINT8 *blk1, int stride)\
+static int sad16x16_ ## suf(void *s, uint8_t *blk2, uint8_t *blk1, int stride)\
 {\
     asm volatile("pxor %%mm7, %%mm7		\n\t"\
                  "pxor %%mm6, %%mm6		\n\t":);\
@@ -343,7 +343,7 @@ int sad16x16_ ## suf(void *s, UINT8 *blk2, UINT8 *blk1, int stride)\
 \
     return sum_ ## suf();\
 }\
-int pix_abs16x16_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+static int pix_abs16x16_x2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
 {\
     asm volatile("pxor %%mm7, %%mm7		\n\t"\
                  "pxor %%mm6, %%mm6		\n\t"\
@@ -356,7 +356,7 @@ int pix_abs16x16_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
 \
     return sum_ ## suf();\
 }\
-int pix_abs16x16_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+static int pix_abs16x16_y2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
 {\
     asm volatile("pxor %%mm7, %%mm7		\n\t"\
                  "pxor %%mm6, %%mm6		\n\t"\
@@ -369,7 +369,7 @@ int pix_abs16x16_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
 \
     return sum_ ## suf();\
 }\
-int pix_abs16x16_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
+static int pix_abs16x16_xy2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
 {\
     asm volatile("pxor %%mm7, %%mm7		\n\t"\
                  "pxor %%mm6, %%mm6		\n\t"\
@@ -385,3 +385,36 @@ int pix_abs16x16_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\
 
 PIX_SAD(mmx)
 PIX_SAD(mmx2)
+
+void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
+{
+    if (mm_flags & MM_MMX) {
+        c->pix_abs16x16     = pix_abs16x16_mmx;
+        c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx;
+        c->pix_abs16x16_y2  = pix_abs16x16_y2_mmx;
+        c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
+        c->pix_abs8x8     = pix_abs8x8_mmx;
+        c->pix_abs8x8_x2  = pix_abs8x8_x2_mmx;
+        c->pix_abs8x8_y2  = pix_abs8x8_y2_mmx;
+        c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;
+
+	c->sad[0]= sad16x16_mmx;
+        c->sad[1]= sad8x8_mmx;
+    }
+    if (mm_flags & MM_MMXEXT) {
+	c->pix_abs16x16     = pix_abs16x16_mmx2;
+	c->pix_abs8x8     = pix_abs8x8_mmx2;
+
+	c->sad[0]= sad16x16_mmx2;
+	c->sad[1]= sad8x8_mmx2;
+        
+        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+            c->pix_abs16x16_x2  = pix_abs16x16_x2_mmx2;
+            c->pix_abs16x16_y2  = pix_abs16x16_y2_mmx2;
+            c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2;
+            c->pix_abs8x8_x2  = pix_abs8x8_x2_mmx2;
+            c->pix_abs8x8_y2  = pix_abs8x8_y2_mmx2;
+            c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;
+        }
+    }
+}
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
index be8015dd3..8e452b499 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
@@ -23,12 +23,10 @@
 #include "../dsputil.h"
 #include "../mpegvideo.h"
 #include "../avcodec.h"
-#include "../simple_idct.h"
-#include "xineutils.h"
 
-extern UINT8 zigzag_direct_noperm[64];
-extern UINT16 inv_zigzag_direct16[64];
-extern UINT32 inverse[256];
+extern uint8_t zigzag_direct_noperm[64];
+extern uint16_t inv_zigzag_direct16[64];
+extern uint32_t inverse[256];
 
 static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
 static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
@@ -42,9 +40,7 @@ static void dct_unquantize_h263_mmx(MpegEncContext *s,
     qmul = qscale << 1;
     qadd = (qscale - 1) | 1;
 
-    XINE_ASSERT(s->block_last_index[n]>=0,
-		"value 's->block_last_index[%d] is < 0: %d", 
-		n, s->block_last_index[n]);
+    assert(s->block_last_index[n]>=0);
         
     if (s->mb_intra) {
         if (!s->h263_aic) {
@@ -147,11 +143,9 @@ static void dct_unquantize_mpeg1_mmx(MpegEncContext *s,
                                      DCTELEM *block, int n, int qscale)
 {
     int nCoeffs;
-    const UINT16 *quant_matrix;
+    const uint16_t *quant_matrix;
 
-    XINE_ASSERT(s->block_last_index[n]>=0,
-		"value 's->block_last_index[%d] is < 0: %d", 
-		n, s->block_last_index[n]);
+    assert(s->block_last_index[n]>=0);
 
     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
@@ -277,11 +271,9 @@ static void dct_unquantize_mpeg2_mmx(MpegEncContext *s,
                                      DCTELEM *block, int n, int qscale)
 {
     int nCoeffs;
-    const UINT16 *quant_matrix;
+    const uint16_t *quant_matrix;
     
-    XINE_ASSERT(s->block_last_index[n]>=0,
-		"value 's->block_last_index[%d] is < 0: %d", 
-		n, s->block_last_index[n]);
+    assert(s->block_last_index[n]>=0);
 
     if(s->alternate_scan) nCoeffs= 63; //FIXME
     else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
@@ -411,9 +403,9 @@ asm volatile(
 
 /* draw the edges of width 'w' of an image of size width, height 
    this mmx version can only handle w==8 || w==16 */
-static void draw_edges_mmx(UINT8 *buf, int wrap, int width, int height, int w)
+static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
 {
-    UINT8 *ptr, *last_line;
+    uint8_t *ptr, *last_line;
     int i;
 
     last_line = buf + (height - 1) * wrap;
@@ -506,38 +498,10 @@ static void draw_edges_mmx(UINT8 *buf, int wrap, int width, int height, int w)
 #define RENAME(a) a ## _MMX2
 #include "mpegvideo_mmx_template.c"
 
-/* external functions, from idct_mmx.c */
-void ff_mmx_idct(DCTELEM *block);
-void ff_mmxext_idct(DCTELEM *block);
-
-/* XXX: those functions should be suppressed ASAP when all IDCTs are
-   converted */
-static void ff_libmpeg2mmx_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
-{
-    ff_mmx_idct (block);
-    put_pixels_clamped_mmx(block, dest, line_size);
-}
-static void ff_libmpeg2mmx_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
-{
-    ff_mmx_idct (block);
-    add_pixels_clamped_mmx(block, dest, line_size);
-}
-static void ff_libmpeg2mmx2_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
-{
-    ff_mmxext_idct (block);
-    put_pixels_clamped_mmx(block, dest, line_size);
-}
-static void ff_libmpeg2mmx2_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
-{
-    ff_mmxext_idct (block);
-    add_pixels_clamped_mmx(block, dest, line_size);
-}
-
 void MPV_common_init_mmx(MpegEncContext *s)
 {
     if (mm_flags & MM_MMX) {
         const int dct_algo = s->avctx->dct_algo;
-        const int idct_algo= s->avctx->idct_algo;
         
         s->dct_unquantize_h263 = dct_unquantize_h263_mmx;
         s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx;
@@ -546,28 +510,11 @@ void MPV_common_init_mmx(MpegEncContext *s)
         draw_edges = draw_edges_mmx;
 
         if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
-            s->fdct = ff_fdct_mmx;
-
             if(mm_flags & MM_MMXEXT){
                 s->dct_quantize= dct_quantize_MMX2;
             } else {
                 s->dct_quantize= dct_quantize_MMX;
             }
         }
-
-        if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
-            s->idct_put= ff_simple_idct_put_mmx;
-            s->idct_add= ff_simple_idct_add_mmx;
-            s->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
-        }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
-            if(mm_flags & MM_MMXEXT){
-                s->idct_put= ff_libmpeg2mmx2_idct_put;
-                s->idct_add= ff_libmpeg2mmx2_idct_add;
-            }else{
-                s->idct_put= ff_libmpeg2mmx_idct_put;
-                s->idct_add= ff_libmpeg2mmx_idct_add;
-            }
-            s->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
-        }
     }
 }
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
index ead30ed31..8cd91024b 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
@@ -37,8 +37,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
                             int qscale, int *overflow)
 {
     int level=0, last_non_zero_p1, q; //=0 is cuz gcc says uninitalized ...
-    const UINT16 *qmat, *bias;
-    static __align8 INT16 temp_block[64];
+    const uint16_t *qmat, *bias;
+    static __align8 int16_t temp_block[64];
 
     //s->fdct (block);
     ff_fdct_mmx (block); //cant be anything else ...
@@ -207,7 +207,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
     if(s->mb_intra) block[0]= level;
     else            block[0]= temp_block[0];
 
-    if(s->idct_permutation[1]==8){
+    if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){
         if(last_non_zero_p1 <= 1) goto end;
         block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08]; 
         block[0x20] = temp_block[0x10]; 
@@ -251,7 +251,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
         block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36]; 
         block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37]; 
         block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
-    }else if(s->idct_permutation[1]==4){
+    }else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){
         if(last_non_zero_p1 <= 1) goto end;
         block[0x04] = temp_block[0x01]; 
         block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; 
diff --git a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c
index 9dfd5f149..836403ca5 100644
--- a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c
@@ -1298,12 +1298,12 @@ void ff_simple_idct_mmx(int16_t *block)
 
 //FIXME merge add/put into the idct
 
-void ff_simple_idct_put_mmx(UINT8 *dest, int line_size, DCTELEM *block)
+void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
 {
     idct(block);
     put_pixels_clamped_mmx(block, dest, line_size);
 }
-void ff_simple_idct_add_mmx(UINT8 *dest, int line_size, DCTELEM *block)
+void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
 {
     idct(block);
     add_pixels_clamped_mmx(block, dest, line_size);