4 files changed, 67 insertions, 4 deletions
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index d58ebcc4e..c523be74a 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -171,6 +171,7 @@ static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000F
 /***********************************/
 /* standard MMX */
 
+#ifdef CONFIG_ENCODERS
 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
 {
     asm volatile(
@@ -227,6 +228,7 @@ static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint
         : "%eax"
     );
 }
+#endif //CONFIG_ENCODERS
 
 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
 {
@@ -401,6 +403,7 @@ static void clear_blocks_mmx(DCTELEM *blocks)
         );
 }
 
+#ifdef CONFIG_ENCODERS
 static int pix_sum16_mmx(uint8_t * pix, int line_size){
     const int h=16;
     int sum;
@@ -438,6 +441,7 @@ static int pix_sum16_mmx(uint8_t * pix, int line_size){
 
         return sum;
 }
+#endif //CONFIG_ENCODERS
 
 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
     int i=0;
@@ -461,6 +465,7 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
         dst[i+0] += src[i+0];
 }
 
+#ifdef CONFIG_ENCODERS
 static int pix_norm1_mmx(uint8_t *pix, int line_size) {
     int tmp;
   asm volatile (
@@ -583,6 +588,43 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
     for(; i<w; i++)
         dst[i+0] = src1[i+0]-src2[i+0];
 }
+
+static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
+    int i=0;
+    uint8_t l, lt;
+
+    asm volatile(
+        "1:				\n\t"
+        "movq  -1(%1, %0), %%mm0	\n\t" // LT
+        "movq  (%1, %0), %%mm1		\n\t" // T
+        "movq  -1(%2, %0), %%mm2	\n\t" // L
+        "movq  (%2, %0), %%mm3		\n\t" // X
+        "movq %%mm2, %%mm4		\n\t" // L
+        "psubb %%mm0, %%mm2		\n\t"
+        "paddb %%mm1, %%mm2		\n\t" // L + T - LT
+        "movq %%mm4, %%mm5		\n\t" // L
+        "pmaxub %%mm1, %%mm4		\n\t" // max(T, L)
+        "pminub %%mm5, %%mm1		\n\t" // min(T, L)
+        "pminub %%mm2, %%mm4		\n\t" 
+        "pmaxub %%mm1, %%mm4		\n\t"
+        "psubb %%mm4, %%mm3		\n\t" // dst - pred
+        "movq %%mm3, (%3, %0)		\n\t"
+        "addl $8, %0			\n\t"
+        "cmpl %4, %0			\n\t"
+        " jb 1b				\n\t"
+        : "+r" (i)
+        : "r"(src1), "r"(src2), "r"(dst), "r"(w)
+    );
+
+    l= *left;
+    lt= *left_top;
+    
+    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
+    
+    *left_top= src1[w-1];
+    *left    = src2[w-1];
+}
+
 #define LBUTTERFLY2(a1,b1,a2,b2)\
     "paddw " #b1 ", " #a1 "		\n\t"\
     "paddw " #b2 ", " #a2 "		\n\t"\
@@ -819,6 +861,7 @@ static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride
 
 WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
 WARPER88_1616(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
+#endif //CONFIG_ENCODERS
 
 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
@@ -1567,24 +1610,31 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
             c->idct_put= ff_simple_idct_put_mmx;
             c->idct_add= ff_simple_idct_add_mmx;
+            c->idct    = ff_simple_idct_mmx;
             c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
         }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
             if(mm_flags & MM_MMXEXT){
                 c->idct_put= ff_libmpeg2mmx2_idct_put;
                 c->idct_add= ff_libmpeg2mmx2_idct_add;
+                c->idct    = ff_mmxext_idct;
             }else{
                 c->idct_put= ff_libmpeg2mmx_idct_put;
                 c->idct_add= ff_libmpeg2mmx_idct_add;
+                c->idct    = ff_mmx_idct;
             }
             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
         }
         
+#ifdef CONFIG_ENCODERS
         c->get_pixels = get_pixels_mmx;
         c->diff_pixels = diff_pixels_mmx;
+#endif //CONFIG_ENCODERS
         c->put_pixels_clamped = put_pixels_clamped_mmx;
         c->add_pixels_clamped = add_pixels_clamped_mmx;
         c->clear_blocks = clear_blocks_mmx;
+#ifdef CONFIG_ENCODERS
         c->pix_sum = pix_sum16_mmx;
+#endif //CONFIG_ENCODERS
 
         c->put_pixels_tab[0][0] = put_pixels16_mmx;
         c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
@@ -1627,6 +1677,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
                 
         c->add_bytes= add_bytes_mmx;
+#ifdef CONFIG_ENCODERS
         c->diff_bytes= diff_bytes_mmx;
         
         c->hadamard8_diff[0]= hadamard8_diff16_mmx;
@@ -1634,6 +1685,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         
 	c->pix_norm1 = pix_norm1_mmx;
 	c->sse[0] = sse16_mmx;
+#endif //CONFIG_ENCODERS
         
         if (mm_flags & MM_MMXEXT) {
             c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
@@ -1650,8 +1702,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
             c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
 
+#ifdef CONFIG_ENCODERS
             c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
             c->hadamard8_diff[1]= hadamard8_diff_mmx2;
+#endif //CONFIG_ENCODERS
 
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
@@ -1696,6 +1750,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
             SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
             SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
 #endif
+
+            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
         } else if (mm_flags & MM_3DNOW) {
             c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
             c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
@@ -1755,7 +1811,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         }
     }
         
+#ifdef CONFIG_ENCODERS
     dsputil_init_pix_mmx(c, avctx);
+#endif //CONFIG_ENCODERS
 #if 0
     // for speed testing
     get_pixels = just_return;
diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
index 1bc5dfeb2..aa22f0649 100644
--- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
@@ -26,7 +26,7 @@ static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
 0x0002000200020002,
 };
 
-static const __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL;
+static const __attribute__ ((aligned(8), unused)) uint64_t bone= 0x0101010101010101LL;
 
 static inline void sad8_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 {
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
index fa1ab579d..706211eec 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
@@ -38,7 +38,9 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
 {
     int level=0, last_non_zero_p1, q; //=0 is cuz gcc says uninitalized ...
     const uint16_t *qmat, *bias;
-    static __align8 int16_t temp_block[64];
+    __align8 int16_t temp_block[64];
+    
+    assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
 
     //s->fdct (block);
     ff_fdct_mmx (block); //cant be anything else ...
diff --git a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c
index 836403ca5..626c1f565 100644
--- a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c
@@ -47,8 +47,8 @@
 
 static const uint64_t __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000ULL;
 static const uint64_t __attribute__((aligned(8))) d40000= 0x0000000000040000ULL;
-static int16_t __attribute__((aligned(8))) temp[64];
-static int16_t __attribute__((aligned(8))) coeffs[]= {
+
+static const int16_t __attribute__((aligned(8))) coeffs[]= {
 	1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
 //	1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
 //	0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
@@ -206,6 +206,9 @@ row[7] = input[13];
 
 static inline void idct(int16_t *block)
 {
+	int64_t __attribute__((aligned(8))) align_tmp[16];
+	int16_t * const temp= (int16_t*)align_tmp;
+
 	asm volatile(
 #if 0 //Alternative, simpler variant