4 files changed, 162 insertions, 279 deletions
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index 772c9c1f0..c8db22e64 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -22,6 +22,7 @@
 
 #include "../dsputil.h"
 #include "../simple_idct.h"
+#include "mmx.h"
 
 //#undef NDEBUG
 //#include <assert.h>
@@ -293,6 +294,24 @@ void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size
 	    :"memory");
 }
 
+static unsigned char __align8 vector128[8] =
+  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; /* 0x80 (128) in each of 8 bytes: the per-pixel bias that put_signed_pixels_clamped_mmx adds with paddb */
+
+void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) /* Stores 8 rows of 8 coefficients as bytes: each value is saturated to a signed byte, then biased by 0x80. */
+{
+    int i;
+
+    movq_m2r(*vector128, mm1);           /* mm1 = eight 0x80 bytes (the bias), loaded once for all rows */
+    for (i = 0; i < 8; i++) {            /* one iteration per output row */
+        movq_m2r(*(block), mm0);         /* mm0 = 8 bytes at block, i.e. coefficients 0-3 of the row (assumes 16-bit DCTELEM -- TODO confirm) */
+        packsswb_m2r(*(block + 4), mm0); /* signed-saturating pack: mm0 = coefficients 0-3 and 4-7 as 8 signed bytes */
+        block += 8;                      /* advance to the next row of 8 coefficients */
+        paddb_r2r(mm1, mm0);             /* wrapping byte add of 0x80: signed -128..127 becomes unsigned 0..255 */
+        movq_r2m(mm0, *pixels);          /* movq stores all 8 result bytes starting at pixels */
+        pixels += line_size;             /* step to the next destination row */
+    }
+} /* NOTE(review): no emms() is issued here; presumably the caller restores FPU state -- confirm */
+
 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
 {
     const DCTELEM *p;
@@ -2149,19 +2168,18 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         /* VP3 optimized DSP functions */
         if (mm_flags & MM_SSE2) {
             c->vp3_dsp_init = vp3_dsp_init_sse2;
-            c->vp3_idct_put = vp3_idct_put_sse2;
-            c->vp3_idct_add = vp3_idct_add_sse2;
+            c->vp3_idct = vp3_idct_sse2;
         } else {
             c->vp3_dsp_init = vp3_dsp_init_mmx;
-            c->vp3_idct_put = vp3_idct_put_mmx;
-            c->vp3_idct_add = vp3_idct_add_mmx;
+            c->vp3_idct = vp3_idct_mmx;
         }
-        
+
 #ifdef CONFIG_ENCODERS
         c->get_pixels = get_pixels_mmx;
         c->diff_pixels = diff_pixels_mmx;
 #endif //CONFIG_ENCODERS
         c->put_pixels_clamped = put_pixels_clamped_mmx;
+        c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
         c->add_pixels_clamped = add_pixels_clamped_mmx;
         c->clear_blocks = clear_blocks_mmx;
 #ifdef CONFIG_ENCODERS
diff --git a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
index 7af576971..68f788a23 100644
--- a/src/libffmpeg/libavcodec/i386/fdct_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
@@ -60,77 +60,77 @@ struct
 //static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
 
 static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = {  // forward_dct coeff table
-  16384,   16384,   -8867,  -21407, 
-  16384,   16384,   21407,    8867, 
-  16384,  -16384,   21407,   -8867, 
- -16384,   16384,    8867,  -21407, 
-  22725,   19266,  -22725,  -12873, 
-  12873,    4520,   19266,   -4520, 
-  12873,  -22725,   19266,  -22725, 
-   4520,   19266,    4520,  -12873, 
-
-  22725,   22725,  -12299,  -29692, 
-  22725,   22725,   29692,   12299, 
-  22725,  -22725,   29692,  -12299, 
- -22725,   22725,   12299,  -29692, 
-  31521,   26722,  -31521,  -17855, 
-  17855,    6270,   26722,   -6270, 
-  17855,  -31521,   26722,  -31521, 
-   6270,   26722,    6270,  -17855, 
-
-  21407,   21407,  -11585,  -27969, 
-  21407,   21407,   27969,   11585, 
-  21407,  -21407,   27969,  -11585, 
- -21407,   21407,   11585,  -27969, 
-  29692,   25172,  -29692,  -16819, 
-  16819,    5906,   25172,   -5906, 
-  16819,  -29692,   25172,  -29692, 
-   5906,   25172,    5906,  -16819, 
-
-  19266,   19266,  -10426,  -25172, 
-  19266,   19266,   25172,   10426, 
-  19266,  -19266,   25172,  -10426, 
- -19266,   19266,   10426,  -25172, 
-  26722,   22654,  -26722,  -15137, 
-  15137,    5315,   22654,   -5315, 
-  15137,  -26722,   22654,  -26722, 
-   5315,   22654,    5315,  -15137, 
-
-  16384,   16384,   -8867,  -21407, 
-  16384,   16384,   21407,    8867, 
-  16384,  -16384,   21407,   -8867, 
- -16384,   16384,    8867,  -21407, 
-  22725,   19266,  -22725,  -12873, 
-  12873,    4520,   19266,   -4520, 
-  12873,  -22725,   19266,  -22725, 
-   4520,   19266,    4520,  -12873, 
-
-  19266,   19266,  -10426,  -25172, 
-  19266,   19266,   25172,   10426, 
-  19266,  -19266,   25172,  -10426, 
- -19266,   19266,   10426,  -25172, 
-  26722,   22654,  -26722,  -15137, 
-  15137,    5315,   22654,   -5315, 
-  15137,  -26722,   22654,  -26722, 
-   5315,   22654,    5315,  -15137, 
-
-  21407,   21407,  -11585,  -27969, 
-  21407,   21407,   27969,   11585, 
-  21407,  -21407,   27969,  -11585, 
- -21407,   21407,   11585,  -27969, 
-  29692,   25172,  -29692,  -16819, 
-  16819,    5906,   25172,   -5906, 
-  16819,  -29692,   25172,  -29692, 
-   5906,   25172,    5906,  -16819, 
-
-  22725,   22725,  -12299,  -29692, 
-  22725,   22725,   29692,   12299, 
-  22725,  -22725,   29692,  -12299, 
- -22725,   22725,   12299,  -29692, 
-  31521,   26722,  -31521,  -17855, 
-  17855,    6270,   26722,   -6270, 
-  17855,  -31521,   26722,  -31521, 
-   6270,   26722,    6270,  -17855, 
+  16384,   16384,   22725,   19266, // group 1 (same values as group 5). out0: s0,s1 | out1: d0,d1, where s[i]=in[i]+in[7-i], d[i]=in[i]-in[7-i]
+  16384,   16384,   12873,    4520, // out0: s2,s3 | out1: d2,d3   (layout read by fdct_row_mmx/fdct_row_mmx2 as table+0 .. table+28)
+  21407,    8867,   19266,   -4520, // out2: s0,s1 | out3: d0,d1
+  -8867,  -21407,  -22725,  -12873, // out2: s2,s3 | out3: d2,d3
+  16384,  -16384,   12873,  -22725, // out4: s0,s1 | out5: d0,d1
+ -16384,   16384,    4520,   19266, // out4: s2,s3 | out5: d2,d3
+   8867,  -21407,    4520,  -12873, // out6: s0,s1 | out7: d0,d1
+  21407,   -8867,   19266,  -22725, // out6: s2,s3 | out7: d2,d3
+
+  22725,   22725,   31521,   26722, // group 2 (same values as group 8); same 8-line layout as group 1
+  22725,   22725,   17855,    6270, 
+  29692,   12299,   26722,   -6270, 
+ -12299,  -29692,  -31521,  -17855, 
+  22725,  -22725,   17855,  -31521, 
+ -22725,   22725,    6270,   26722, 
+  12299,  -29692,    6270,  -17855, 
+  29692,  -12299,   26722,  -31521, 
+
+  21407,   21407,   29692,   25172, // group 3 (same values as group 7)
+  21407,   21407,   16819,    5906, 
+  27969,   11585,   25172,   -5906, 
+ -11585,  -27969,  -29692,  -16819, 
+  21407,  -21407,   16819,  -29692, 
+ -21407,   21407,    5906,   25172, 
+  11585,  -27969,    5906,  -16819, 
+  27969,  -11585,   25172,  -29692, 
+
+  19266,   19266,   26722,   22654, // group 4 (same values as group 6)
+  19266,   19266,   15137,    5315, 
+  25172,   10426,   22654,   -5315, 
+ -10426,  -25172,  -26722,  -15137, 
+  19266,  -19266,   15137,  -26722, 
+ -19266,   19266,    5315,   22654, 
+  10426,  -25172,    5315,  -15137, 
+  25172,  -10426,   22654,  -26722, 
+
+  16384,   16384,   22725,   19266, // group 5 (same values as group 1)
+  16384,   16384,   12873,    4520, 
+  21407,    8867,   19266,   -4520, 
+  -8867,  -21407,  -22725,  -12873, 
+  16384,  -16384,   12873,  -22725, 
+ -16384,   16384,    4520,   19266, 
+   8867,  -21407,    4520,  -12873, 
+  21407,   -8867,   19266,  -22725, 
+
+  19266,   19266,   26722,   22654, // group 6 (same values as group 4)
+  19266,   19266,   15137,    5315, 
+  25172,   10426,   22654,   -5315, 
+ -10426,  -25172,  -26722,  -15137, 
+  19266,  -19266,   15137,  -26722, 
+ -19266,   19266,    5315,   22654, 
+  10426,  -25172,    5315,  -15137, 
+  25172,  -10426,   22654,  -26722, 
+
+  21407,   21407,   29692,   25172, // group 7 (same values as group 3)
+  21407,   21407,   16819,    5906, 
+  27969,   11585,   25172,   -5906, 
+ -11585,  -27969,  -29692,  -16819, 
+  21407,  -21407,   16819,  -29692, 
+ -21407,   21407,    5906,   25172, 
+  11585,  -27969,    5906,  -16819, 
+  27969,  -11585,   25172,  -29692, 
+
+  22725,   22725,   31521,   26722, // group 8 (same values as group 2)
+  22725,   22725,   17855,    6270, 
+  29692,   12299,   26722,   -6270, 
+ -12299,  -29692,  -31521,  -17855, 
+  22725,  -22725,   17855,  -31521, 
+ -22725,   22725,    6270,   26722, 
+  12299,  -29692,    6270,  -17855, 
+  29692,  -12299,   26722,  -31521, 
 };
 
 struct 
@@ -413,93 +413,91 @@ static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const i
 { 
     pshufw_m2r(*(in + 4), mm5, 0x1B);
     movq_m2r(*(in + 0), mm0);
-    movq_r2r(mm0, mm1);	
+    movq_r2r(mm0, mm1);                 // here mm5 = in[7],in[6],in[5],in[4] (pshufw 0x1B reverses the four words), mm0 = in[0..3]; copy mm0 for the subtraction
     paddsw_r2r(mm5, mm0);
     psubsw_r2r(mm5, mm1);
-    pshufw_r2r(mm0, mm2, 0x4E);
-    pshufw_r2r(mm1, mm3, 0x4E);
-    movq_m2r(*(table + 0), mm4);
-    movq_m2r(*(table + 4), mm6);
-    movq_m2r(*(table + 16), mm5);
+    movq_r2r(mm0, mm2);                 // mm0 = saturating sums s[i]=in[i]+in[7-i], mm1 = saturating differences d[i]=in[i]-in[7-i]; copy the sums
+    punpckldq_r2r(mm1, mm0);            // mm0 = s0 s1 d0 d1
+    punpckhdq_r2r(mm1, mm2);            // mm2 = s2 s3 d2 d3
+    movq_m2r(*(table + 0), mm1);        // coefficients for s0,s1,d0,d1 of outputs 0/1
+    movq_m2r(*(table + 4), mm3);        // coefficients for s2,s3,d2,d3 of outputs 0/1
+    movq_m2r(*(table + 8), mm4);        // same pair of quads for outputs 2/3
+    movq_m2r(*(table + 12), mm5);
+    movq_m2r(*(table + 16), mm6);       // and for outputs 4/5 (second quad is loaded into mm7 just below)
     movq_m2r(*(table + 20), mm7);
+    pmaddwd_r2r(mm0, mm1);              // mm1 = (s0*t0 + s1*t1, d0*t2 + d1*t3) as two 32-bit sums
+    pmaddwd_r2r(mm2, mm3);              // mm3 = (s2*t4 + s3*t5, d2*t6 + d3*t7)
     pmaddwd_r2r(mm0, mm4);
-    pmaddwd_r2r(mm1, mm5);
-    pmaddwd_r2r(mm2, mm6);
-    pmaddwd_r2r(mm3, mm7);
-    pmaddwd_m2r(*(table + 8), mm0);
-    pmaddwd_m2r(*(table + 12), mm2);
-    pmaddwd_m2r(*(table + 24), mm1);
-    pmaddwd_m2r(*(table + 28), mm3);
-    paddd_r2r(mm6, mm4);
-    paddd_r2r(mm7, mm5);
-    paddd_r2r(mm2, mm0);
-    paddd_r2r(mm3, mm1);
-    movq_m2r(*fdct_r_row, mm7);
-    paddd_r2r(mm7, mm4);
-    paddd_r2r(mm7, mm5);
-    paddd_r2r(mm7, mm0);
-    paddd_r2r(mm7, mm1);
-    psrad_i2r(SHIFT_FRW_ROW, mm4);
+    pmaddwd_r2r(mm2, mm5);              // partial products for outputs 2/3
+    pmaddwd_r2r(mm0, mm6);              // partial products for outputs 4/5
+    pmaddwd_r2r(mm2, mm7);
+    pmaddwd_m2r(*(table + 24), mm0);    // outputs 6/7: multiply straight from memory, overwriting the inputs (their last use)
+    pmaddwd_m2r(*(table + 28), mm2);
+    paddd_r2r(mm1, mm3);                // mm3 = 32-bit outputs 0,1
+    paddd_r2r(mm4, mm5);                // mm5 = 32-bit outputs 2,3
+    paddd_r2r(mm6, mm7);                // mm7 = 32-bit outputs 4,5
+    paddd_r2r(mm0, mm2);                // mm2 = 32-bit outputs 6,7
+    movq_m2r(*fdct_r_row, mm0);         // presumably the rounding bias for the shift below (fdct_r_row is defined outside this hunk -- confirm)
+    paddd_r2r(mm0, mm3);
+    paddd_r2r(mm0, mm5);
+    paddd_r2r(mm0, mm7);
+    paddd_r2r(mm0, mm2);
+    psrad_i2r(SHIFT_FRW_ROW, mm3);      // arithmetic right shift of each 32-bit result by SHIFT_FRW_ROW
     psrad_i2r(SHIFT_FRW_ROW, mm5);
-    psrad_i2r(SHIFT_FRW_ROW, mm0);
-    psrad_i2r(SHIFT_FRW_ROW, mm1);
-    packssdw_r2r(mm0, mm4);
-    packssdw_r2r(mm1, mm5);
-    movq_r2r(mm4, mm2);
-    punpcklwd_r2r(mm5, mm4);
-    punpckhwd_r2r(mm5, mm2);
-    movq_r2m(mm4, *(out + 0));
-    movq_r2m(mm2, *(out + 4));
+    psrad_i2r(SHIFT_FRW_ROW, mm7);
+    psrad_i2r(SHIFT_FRW_ROW, mm2);
+    packssdw_r2r(mm5, mm3);             // mm3 = outputs 0..3 as signed-saturated 16-bit words, already in order
+    packssdw_r2r(mm2, mm7);             // mm7 = outputs 4..7
+    movq_r2m(mm3, *(out + 0));          // store out[0..3]
+    movq_r2m(mm7, *(out + 4));          // store out[4..7]
 }
 
 static always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
 { 
+//FIXME reorder instructions (I don't have an old MMX-only CPU here to benchmark ...)
     movd_m2r(*(in + 6), mm1);
     punpcklwd_m2r(*(in + 4), mm1);
     movq_r2r(mm1, mm2);
     psrlq_i2r(0x20, mm1);
     movq_m2r(*(in + 0), mm0);
     punpcklwd_r2r(mm2, mm1);
-    movq_r2r(mm0, mm5);	
+    movq_r2r(mm0, mm5);                 // here mm1 = in[7],in[6],in[5],in[4] (built with movd/punpcklwd/psrlq), mm0 = in[0..3]; copy mm0 for the subtraction
     paddsw_r2r(mm1, mm0);
     psubsw_r2r(mm1, mm5);
-    movq_r2r(mm0, mm1);	    
-    movq_r2r(mm5, mm6);	
-    punpckldq_r2r(mm5, mm3);
-    punpckhdq_r2r(mm3, mm6);
-    movq_m2r(*(table + 0), mm3);
-    movq_m2r(*(table + 4), mm4);
-    punpckldq_r2r(mm0, mm2);
-    pmaddwd_r2r(mm0, mm3);
-    punpckhdq_r2r(mm2, mm1);
-    movq_m2r(*(table + 16), mm2);
-    pmaddwd_r2r(mm1, mm4);
-    pmaddwd_m2r(*(table + 8), mm0);
+    movq_r2r(mm0, mm2);                 // mm0 = saturating sums s[i]=in[i]+in[7-i], mm5 = saturating differences d[i]=in[i]-in[7-i]; copy the sums
+    punpckldq_r2r(mm5, mm0);            // mm0 = s0 s1 d0 d1
+    punpckhdq_r2r(mm5, mm2);            // mm2 = s2 s3 d2 d3
+    movq_m2r(*(table + 0), mm1);        // coefficients for s0,s1,d0,d1 of outputs 0/1
+    movq_m2r(*(table + 4), mm3);        // coefficients for s2,s3,d2,d3 of outputs 0/1
+    movq_m2r(*(table + 8), mm4);        // same pair of quads for outputs 2/3
+    movq_m2r(*(table + 12), mm5);
+    movq_m2r(*(table + 16), mm6);       // and for outputs 4/5 (second quad is loaded into mm7 just below)
     movq_m2r(*(table + 20), mm7);
-    pmaddwd_r2r(mm5, mm2);
-    paddd_m2r(*fdct_r_row, mm3);
-    pmaddwd_r2r(mm6, mm7);
-    pmaddwd_m2r(*(table + 12), mm1);
-    paddd_r2r(mm4, mm3);
-    pmaddwd_m2r(*(table + 24), mm5);
-    pmaddwd_m2r(*(table + 28), mm6);
-    paddd_r2r(mm7, mm2);
-    paddd_m2r(*fdct_r_row, mm0);
+    pmaddwd_r2r(mm0, mm1);              // mm1 = (s0*t0 + s1*t1, d0*t2 + d1*t3) as two 32-bit sums
+    pmaddwd_r2r(mm2, mm3);              // mm3 = (s2*t4 + s3*t5, d2*t6 + d3*t7)
+    pmaddwd_r2r(mm0, mm4);              // partial products for outputs 2/3
+    pmaddwd_r2r(mm2, mm5);
+    pmaddwd_r2r(mm0, mm6);              // partial products for outputs 4/5
+    pmaddwd_r2r(mm2, mm7);
+    pmaddwd_m2r(*(table + 24), mm0);    // outputs 6/7: multiply straight from memory, overwriting the inputs (their last use)
+    pmaddwd_m2r(*(table + 28), mm2);
+    paddd_r2r(mm1, mm3);                // mm3 = 32-bit outputs 0,1
+    paddd_r2r(mm4, mm5);                // mm5 = 32-bit outputs 2,3
+    paddd_r2r(mm6, mm7);                // mm7 = 32-bit outputs 4,5
+    paddd_r2r(mm0, mm2);                // mm2 = 32-bit outputs 6,7
+    movq_m2r(*fdct_r_row, mm0);         // presumably the rounding bias for the shift below (fdct_r_row is defined outside this hunk -- confirm)
+    paddd_r2r(mm0, mm3);
+    paddd_r2r(mm0, mm5);
+    paddd_r2r(mm0, mm7);
+    paddd_r2r(mm0, mm2);
     psrad_i2r(SHIFT_FRW_ROW, mm3);
-    paddd_m2r(*fdct_r_row, mm2);
-    paddd_r2r(mm1, mm0);
-    paddd_m2r(*fdct_r_row, mm5);
-    psrad_i2r(SHIFT_FRW_ROW, mm2);
-    paddd_r2r(mm6, mm5);
-    psrad_i2r(SHIFT_FRW_ROW, mm0);
     psrad_i2r(SHIFT_FRW_ROW, mm5);
-    packssdw_r2r(mm0, mm3);
-    packssdw_r2r(mm5, mm2);
-    movq_r2r(mm3, mm6);
-    punpcklwd_r2r(mm2, mm3);
-    punpckhwd_r2r(mm2, mm6);
+    psrad_i2r(SHIFT_FRW_ROW, mm7);      // arithmetic right shift of each 32-bit result by SHIFT_FRW_ROW (mm3/mm5 handled just above)
+    psrad_i2r(SHIFT_FRW_ROW, mm2);
+    packssdw_r2r(mm5, mm3);             // mm3 = outputs 0..3 as signed-saturated 16-bit words, already in order
+    packssdw_r2r(mm2, mm7);             // mm7 = outputs 4..7
     movq_r2m(mm3, *(out + 0));
-    movq_r2m(mm6, *(out + 4));
+    movq_r2m(mm7, *(out + 4));          // store out[4..7]; out[0..3] was stored from mm3 on the line above
 }
 
 void ff_fdct_mmx(int16_t *block)
@@ -553,7 +551,6 @@ void ff_fdct_sse2(int16_t *block)
     int64_t align_tmp[16] ATTR_ALIGN(8);
     int16_t * const block_tmp= (int16_t*)align_tmp;
     int16_t *block1;
-    int i;
 
     block1 = block_tmp;
     fdct_col(block, block1, 0);
diff --git a/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c b/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c
index 76007a1d1..319e57f1b 100644
--- a/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/vp3dsp_mmx.c
@@ -279,8 +279,8 @@ void vp3_dsp_init_mmx(void)
     idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
 }
 
-static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
-     int16_t *output_data)
+void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, int16_t *output_data)
 {
     /* eax = quantized input
      * ebx = dequantizer matrix
@@ -563,79 +563,3 @@ static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
 #undef J
 
 }
-
-void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride)
-{
-    int16_t transformed_data[64];
-    int16_t *op;
-    int i, j;
-    uint8_t vector128[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
-
-    vp3_idct_mmx(input_data, dequant_matrix, transformed_data);
-
-    /* place in final output */
-    op = transformed_data;
-    movq_m2r(*vector128, mm0);
-    for (i = 0; i < 8; i++) {
-#if 1
-        for (j = 0; j < 8; j++) {
-            if (*op < -128)
-                *dest = 0;
-            else if (*op > 127)
-                *dest = 255;
-            else
-                *dest = (uint8_t)(*op + 128);
-            op++;
-            dest++;
-        }
-        dest += (stride - 8);
-#else
-/* prototype optimization */
-        pxor_r2r(mm1, mm1);
-        packsswb_m2r(*(op + 4), mm1);
-        movq_r2r(mm1, mm2);
-        psrlq_i2r(32, mm2);
-        packsswb_m2r(*(op + 0), mm1);
-        op += 8;
-        por_r2r(mm2, mm1);
-        paddb_r2r(mm0, mm1);
-        movq_r2m(mm1, *dest);
-        dest += stride;
-#endif
-    }
-
-    /* be a good MMX citizen */
-    emms();
-}
-
-void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride)
-{
-    int16_t transformed_data[64];
-    int16_t *op;
-    int i, j;
-    int16_t sample;
-
-    vp3_idct_mmx(input_data, dequant_matrix, transformed_data);
-
-    /* place in final output */
-    op = transformed_data;
-    for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-            sample = *dest + *op;
-            if (sample < 0)
-                *dest = 0;
-            else if (sample > 255)
-                *dest = 255;
-            else
-                *dest = (uint8_t)(sample & 0xFF);
-            op++;
-            dest++;
-        }
-        dest += (stride - 8);
-    }
-
-    /* be a good MMX citizen */
-    emms();
-}
diff --git a/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c b/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c
index c8f9158af..60c6bf80e 100644
--- a/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c
+++ b/src/libffmpeg/libavcodec/i386/vp3dsp_sse2.c
@@ -802,8 +802,8 @@ void vp3_dsp_init_sse2(void)
 }
 
 
-static void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
-     int16_t *output_data)
+void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, int16_t *output_data)
 {
     unsigned char *input_bytes = (unsigned char *)input_data;
     unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix;
@@ -832,59 +832,3 @@ static void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
         
     SSE2_Column_IDCT();
 }
-
-
-void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride)
-{
-    int16_t transformed_data[64];
-    int16_t *op;
-    int i, j;
-
-    vp3_idct_sse2(input_data, dequant_matrix, transformed_data);
-
-    /* place in final output */
-    op = transformed_data;
-    for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-            if (*op < -128)
-                *dest = 0;
-            else if (*op > 127)
-                *dest = 255;
-            else
-                *dest = (uint8_t)(*op + 128);
-            op++;
-            dest++;
-        }
-        dest += (stride - 8);
-    }
-}
-
-
-void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride)
-{
-    int16_t transformed_data[64];
-    int16_t *op;
-    int i, j;
-    int16_t sample;
-
-    vp3_idct_sse2(input_data, dequant_matrix, transformed_data);
-
-    /* place in final output */
-    op = transformed_data;
-    for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-            sample = *dest + *op;
-            if (sample < 0)
-                *dest = 0;
-            else if (sample > 255)
-                *dest = 255;
-            else
-                *dest = (uint8_t)(sample & 0xFF);
-            op++;
-            dest++;
-        }
-        dest += (stride - 8);
-    }
-}