Diffstat (limited to 'src/libffmpeg/libavcodec/i386/dsputil_mmx.c')
-rw-r--r-- | src/libffmpeg/libavcodec/i386/dsputil_mmx.c | 432
1 files changed, 390 insertions, 42 deletions
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index a2cbab8ce..673e749c4 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -3,18 +3,20 @@
  * Copyright (c) 2000, 2001 Fabrice Bellard.
  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  *
- * This library is free software; you can redistribute it and/or
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
- * This library is distributed in the hope that it will be useful,
+ * FFmpeg is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
+ * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  *
  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
@@ -29,7 +31,6 @@
 //#undef NDEBUG
 //#include <assert.h>
 
-extern const uint8_t ff_h263_loop_filter_strength[32];
 extern void ff_idct_xvid_mmx(short *block);
 extern void ff_idct_xvid_mmx2(short *block);
 
@@ -40,6 +41,9 @@ static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x01
 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
 
+static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) =
+{0x8000000080000000ULL, 0x8000000080000000ULL};
+
 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
 static const uint64_t ff_pw_3  attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
 static const uint64_t ff_pw_4  attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
@@ -50,10 +54,15 @@ static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0
 static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
 
+static const uint64_t ff_pb_1  attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
+static const uint64_t ff_pb_3  attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL;
+static const uint64_t ff_pb_7  attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL;
 static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
+static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL;
+static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL;
 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
 
-#define JUMPALIGN() __asm __volatile (".balign 8"::)
+#define JUMPALIGN() __asm __volatile (ASMALIGN(3)::)
 #define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
 
 #define MOVQ_WONE(regd) \
@@ -201,7 +210,7 @@ static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
     asm volatile(
         "mov $-128, %%"REG_a"           \n\t"
         "pxor %%mm7, %%mm7              \n\t"
-        ".balign 16                     \n\t"
+        ASMALIGN(4)
         "1:                             \n\t"
         "movq (%0), %%mm0               \n\t"
         "movq (%0, %2), %%mm2           \n\t"
@@ -229,7 +238,7 @@ static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint
     asm volatile(
         "pxor %%mm7, %%mm7              \n\t"
         "mov $-128, %%"REG_a"           \n\t"
-        ".balign 16                     \n\t"
+        ASMALIGN(4)
         "1:                             \n\t"
         "movq (%0), %%mm0               \n\t"
         "movq (%1), %%mm2               \n\t"
@@ -372,7 +381,7 @@ static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size
 {
     __asm __volatile(
         "lea (%3, %3), %%"REG_a"        \n\t"
-        ".balign 8                      \n\t"
+        ASMALIGN(3)
         "1:                             \n\t"
         "movd (%1), %%mm0               \n\t"
         "movd (%1, %3), %%mm1           \n\t"
@@ -398,7 +407,7 @@ static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size
 {
     __asm __volatile(
         "lea (%3, %3), %%"REG_a"        \n\t"
-        ".balign 8                      \n\t"
+        ASMALIGN(3)
         "1:                             \n\t"
         "movq (%1), %%mm0               \n\t"
         "movq (%1, %3), %%mm1           \n\t"
@@ -424,7 +433,7 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_siz
 {
     __asm __volatile(
         "lea (%3, %3), %%"REG_a"        \n\t"
-        ".balign 8                      \n\t"
+        ASMALIGN(3)
         "1:                             \n\t"
         "movq (%1), %%mm0               \n\t"
         "movq 8(%1), %%mm4              \n\t"
@@ -625,22 +634,10 @@ static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
 
 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
     asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
-        "movd  %0, %%mm0                \n\t"
-        "movd  %1, %%mm1                \n\t"
-        "movd  %2, %%mm2                \n\t"
-
-        :
-        : "m" (*(uint32_t*)(src + 0*src_stride)),
-          "m" (*(uint32_t*)(src + 1*src_stride)),
-          "m" (*(uint32_t*)(src + 2*src_stride))
-    );
-    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
-        "movd  %0, %%mm3                \n\t"
-
-        :
-        : "m" (*(uint32_t*)(src + 3*src_stride))
-    );
-    asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
+        "movd  %4, %%mm0                \n\t"
+        "movd  %5, %%mm1                \n\t"
+        "movd  %6, %%mm2                \n\t"
+        "movd  %7, %%mm3                \n\t"
         "punpcklbw %%mm1, %%mm0         \n\t"
         "punpcklbw %%mm3, %%mm2         \n\t"
         "movq %%mm0, %%mm1              \n\t"
@@ -657,6 +654,10 @@ static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int
           "=m" (*(uint32_t*)(dst + 1*dst_stride)),
           "=m" (*(uint32_t*)(dst + 2*dst_stride)),
           "=m" (*(uint32_t*)(dst + 3*dst_stride))
+        : "m" (*(uint32_t*)(src + 0*src_stride)),
+          "m" (*(uint32_t*)(src + 1*src_stride)),
+          "m" (*(uint32_t*)(src + 2*src_stride)),
+          "m" (*(uint32_t*)(src + 3*src_stride))
     );
 }
 
@@ -1185,8 +1186,8 @@ static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, in
     else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
     score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
 
-    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
-    else  return score1 + ABS(score2)*8;
+    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
+    else  return score1 + FFABS(score2)*8;
 }
 
 static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
@@ -1194,8 +1195,8 @@ static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int
     int score1= sse8_mmx(c, pix1, pix2, line_size, h);
     int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
 
-    if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
-    else  return score1 + ABS(score2)*8;
+    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
+    else  return score1 + FFABS(score2)*8;
 }
 
 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
@@ -2403,6 +2404,53 @@ QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
 QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
 
+/***********************************/
+/* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
+
+#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
+}
+#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
+}
+
+#define QPEL_2TAP(OPNAME, SIZE, MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
+QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
+    OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
+    OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
+static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
+    OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
+}\
+static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
+}\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,         1,       0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,        -1,       0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,         stride,  0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,   -stride,  0)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,         stride,  1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,         stride, -1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,   -stride,  1)\
+QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
+
+QPEL_2TAP(put_, 16, mmx2)
+QPEL_2TAP(avg_, 16, mmx2)
+QPEL_2TAP(put_,  8, mmx2)
+QPEL_2TAP(avg_,  8, mmx2)
+QPEL_2TAP(put_, 16, 3dnow)
+QPEL_2TAP(avg_, 16, 3dnow)
+QPEL_2TAP(put_,  8, 3dnow)
+QPEL_2TAP(avg_,  8, 3dnow)
+
+
 #if 0
 static void just_return() { return; }
 #endif
@@ -2523,9 +2571,10 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o
                     "m"(src[stride]), "m"(src[stride+1]),
                     "m"(*r4), "m"(shift2)
                 );
-
+
                 asm volatile(
                     "movd %%mm0, %0 \n\t"
+
                     : "=m"(dst[x+y*stride])
                     :
                 );
@@ -2535,10 +2584,11 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o
     }
 }
 
+#ifdef CONFIG_ENCODERS
 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
     long i=0;
 
-    assert(ABS(scale) < 256);
+    assert(FFABS(scale) < 256);
     scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
 
     asm volatile(
@@ -2586,7 +2636,7 @@ static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[6
 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
     long i=0;
 
-    if(ABS(scale) < 256){
+    if(FFABS(scale) < 256){
         scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
         asm volatile(
                 "pcmpeqw %%mm6, %%mm6           \n\t" // -1w
@@ -2620,9 +2670,10 @@ static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
         }
     }
 }
+#endif /* CONFIG_ENCODERS */
 
 #define PREFETCH(name, op) \
-void name(void *mem, int stride, int h){\
+static void name(void *mem, int stride, int h){\
     const uint8_t *p= mem;\
     do{\
         asm volatile(#op" %0" :: "m"(*p));\
@@ -2661,6 +2712,7 @@ void ff_vp3_dsp_init_mmx(void);
 
 /* XXX: those functions should be suppressed ASAP when all IDCTs are
    converted */
+#ifdef CONFIG_GPL
 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
 {
     ff_mmx_idct (block);
@@ -2681,6 +2733,7 @@ static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *bloc
     ff_mmxext_idct (block);
     add_pixels_clamped_mmx(block, dest, line_size);
 }
+#endif
 static void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
 {
     ff_vp3_idct_sse2(block);
@@ -2701,7 +2754,6 @@ static void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
     ff_vp3_idct_mmx(block);
     add_pixels_clamped_mmx(block, dest, line_size);
 }
-#ifdef CONFIG_GPL
 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
 {
     ff_idct_xvid_mmx (block);
@@ -2722,7 +2774,274 @@ static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
     ff_idct_xvid_mmx2 (block);
     add_pixels_clamped_mmx(block, dest, line_size);
 }
-#endif
+
+static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
+{
+    int i;
+    asm volatile("pxor %%mm7, %%mm7":);
+    for(i=0; i<blocksize; i+=2) {
+        asm volatile(
+            "movq    %0,     %%mm0 \n\t"
+            "movq    %1,     %%mm1 \n\t"
+            "movq    %%mm0,  %%mm2 \n\t"
+            "movq    %%mm1,  %%mm3 \n\t"
+            "pfcmpge %%mm7,  %%mm2 \n\t" // m <= 0.0
+            "pfcmpge %%mm7,  %%mm3 \n\t" // a <= 0.0
+            "pslld   $31,    %%mm2 \n\t" // keep only the sign bit
+            "pxor    %%mm2,  %%mm1 \n\t"
+            "movq    %%mm3,  %%mm4 \n\t"
+            "pand    %%mm1,  %%mm3 \n\t"
+            "pandn   %%mm1,  %%mm4 \n\t"
+            "pfadd   %%mm0,  %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
+            "pfsub   %%mm4,  %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
+            "movq    %%mm3,  %1    \n\t"
+            "movq    %%mm0,  %0    \n\t"
+            :"+m"(mag[i]), "+m"(ang[i])
+            ::"memory"
+        );
+    }
+    asm volatile("femms");
+}
+static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
+{
+    int i;
+
+    asm volatile(
+            "movaps  %0,     %%xmm5 \n\t"
+        ::"m"(ff_pdw_80000000[0])
+    );
+    for(i=0; i<blocksize; i+=4) {
+        asm volatile(
+            "movaps  %0,     %%xmm0 \n\t"
+            "movaps  %1,     %%xmm1 \n\t"
+            "xorps   %%xmm2, %%xmm2 \n\t"
+            "xorps   %%xmm3, %%xmm3 \n\t"
+            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
+            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
+            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
+            "xorps   %%xmm2, %%xmm1 \n\t"
+            "movaps  %%xmm3, %%xmm4 \n\t"
+            "andps   %%xmm1, %%xmm3 \n\t"
+            "andnps  %%xmm1, %%xmm4 \n\t"
+            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
+            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
+            "movaps  %%xmm3, %1     \n\t"
+            "movaps  %%xmm0, %0     \n\t"
+            :"+m"(mag[i]), "+m"(ang[i])
+            ::"memory"
+        );
+    }
+}
+
+static void vector_fmul_3dnow(float *dst, const float *src, int len){
+    long i = (len-4)*4;
+    asm volatile(
+        "1: \n\t"
+        "movq    (%1,%0), %%mm0 \n\t"
+        "movq   8(%1,%0), %%mm1 \n\t"
+        "pfmul   (%2,%0), %%mm0 \n\t"
+        "pfmul  8(%2,%0), %%mm1 \n\t"
+        "movq   %%mm0,  (%1,%0) \n\t"
+        "movq   %%mm1, 8(%1,%0) \n\t"
+        "sub  $16, %0 \n\t"
+        "jge  1b \n\t"
+        "femms \n\t"
+        :"+r"(i)
+        :"r"(dst), "r"(src)
+        :"memory"
+    );
+}
+static void vector_fmul_sse(float *dst, const float *src, int len){
+    long i = (len-8)*4;
+    asm volatile(
+        "1: \n\t"
+        "movaps   (%1,%0), %%xmm0 \n\t"
+        "movaps 16(%1,%0), %%xmm1 \n\t"
+        "mulps    (%2,%0), %%xmm0 \n\t"
+        "mulps  16(%2,%0), %%xmm1 \n\t"
+        "movaps %%xmm0,   (%1,%0) \n\t"
+        "movaps %%xmm1, 16(%1,%0) \n\t"
+        "sub  $32, %0 \n\t"
+        "jge  1b \n\t"
+        :"+r"(i)
+        :"r"(dst), "r"(src)
+        :"memory"
+    );
+}
+
+static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
+    long i = len*4-16;
+    asm volatile(
+        "1: \n\t"
+        "pswapd  8(%1), %%mm0 \n\t"
+        "pswapd   (%1), %%mm1 \n\t"
+        "pfmul  (%3,%0), %%mm0 \n\t"
+        "pfmul 8(%3,%0), %%mm1 \n\t"
+        "movq  %%mm0,  (%2,%0) \n\t"
+        "movq  %%mm1, 8(%2,%0) \n\t"
+        "add   $16, %1 \n\t"
+        "sub   $16, %0 \n\t"
+        "jge   1b \n\t"
+        :"+r"(i), "+r"(src1)
+        :"r"(dst), "r"(src0)
+    );
+    asm volatile("femms");
+}
+static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
+    long i = len*4-32;
+    asm volatile(
+        "1: \n\t"
+        "movaps        16(%1), %%xmm0 \n\t"
+        "movaps          (%1), %%xmm1 \n\t"
+        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
+        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
+        "mulps      (%3,%0), %%xmm0 \n\t"
+        "mulps    16(%3,%0), %%xmm1 \n\t"
+        "movaps %%xmm0,   (%2,%0) \n\t"
+        "movaps %%xmm1, 16(%2,%0) \n\t"
+        "add    $32, %1 \n\t"
+        "sub    $32, %0 \n\t"
+        "jge    1b \n\t"
+        :"+r"(i), "+r"(src1)
+        :"r"(dst), "r"(src0)
+    );
+}
+
+static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
+                                      const float *src2, int src3, int len, int step){
+    long i = (len-4)*4;
+    if(step == 2 && src3 == 0){
+        dst += (len-4)*2;
+        asm volatile(
+            "1: \n\t"
+            "movq   (%2,%0),  %%mm0 \n\t"
+            "movq  8(%2,%0),  %%mm1 \n\t"
+            "pfmul  (%3,%0),  %%mm0 \n\t"
+            "pfmul 8(%3,%0),  %%mm1 \n\t"
+            "pfadd  (%4,%0),  %%mm0 \n\t"
+            "pfadd 8(%4,%0),  %%mm1 \n\t"
+            "movd     %%mm0,   (%1) \n\t"
+            "movd     %%mm1, 16(%1) \n\t"
+            "psrlq      $32,  %%mm0 \n\t"
+            "psrlq      $32,  %%mm1 \n\t"
+            "movd     %%mm0,  8(%1) \n\t"
+            "movd     %%mm1, 24(%1) \n\t"
+            "sub  $32, %1 \n\t"
+            "sub  $16, %0 \n\t"
+            "jge  1b \n\t"
+            :"+r"(i), "+r"(dst)
+            :"r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else if(step == 1 && src3 == 0){
+        asm volatile(
+            "1: \n\t"
+            "movq    (%2,%0), %%mm0 \n\t"
+            "movq   8(%2,%0), %%mm1 \n\t"
+            "pfmul   (%3,%0), %%mm0 \n\t"
+            "pfmul  8(%3,%0), %%mm1 \n\t"
+            "pfadd   (%4,%0), %%mm0 \n\t"
+            "pfadd  8(%4,%0), %%mm1 \n\t"
+            "movq  %%mm0,   (%1,%0) \n\t"
+            "movq  %%mm1,  8(%1,%0) \n\t"
+            "sub  $16, %0 \n\t"
+            "jge  1b \n\t"
+            :"+r"(i)
+            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else
+        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+    asm volatile("femms");
+}
+static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
+                                    const float *src2, int src3, int len, int step){
+    long i = (len-8)*4;
+    if(step == 2 && src3 == 0){
+        dst += (len-8)*2;
+        asm volatile(
+            "1: \n\t"
+            "movaps   (%2,%0), %%xmm0 \n\t"
+            "movaps 16(%2,%0), %%xmm1 \n\t"
+            "mulps    (%3,%0), %%xmm0 \n\t"
+            "mulps  16(%3,%0), %%xmm1 \n\t"
+            "addps    (%4,%0), %%xmm0 \n\t"
+            "addps  16(%4,%0), %%xmm1 \n\t"
+            "movss     %%xmm0,   (%1) \n\t"
+            "movss     %%xmm1, 32(%1) \n\t"
+            "movhlps   %%xmm0, %%xmm2 \n\t"
+            "movhlps   %%xmm1, %%xmm3 \n\t"
+            "movss     %%xmm2, 16(%1) \n\t"
+            "movss     %%xmm3, 48(%1) \n\t"
+            "shufps $0xb1, %%xmm0, %%xmm0 \n\t"
+            "shufps $0xb1, %%xmm1, %%xmm1 \n\t"
+            "movss     %%xmm0,  8(%1) \n\t"
+            "movss     %%xmm1, 40(%1) \n\t"
+            "movhlps   %%xmm0, %%xmm2 \n\t"
+            "movhlps   %%xmm1, %%xmm3 \n\t"
+            "movss     %%xmm2, 24(%1) \n\t"
+            "movss     %%xmm3, 56(%1) \n\t"
+            "sub  $64, %1 \n\t"
+            "sub  $32, %0 \n\t"
+            "jge  1b \n\t"
+            :"+r"(i), "+r"(dst)
+            :"r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else if(step == 1 && src3 == 0){
+        asm volatile(
+            "1: \n\t"
+            "movaps   (%2,%0), %%xmm0 \n\t"
+            "movaps 16(%2,%0), %%xmm1 \n\t"
+            "mulps    (%3,%0), %%xmm0 \n\t"
+            "mulps  16(%3,%0), %%xmm1 \n\t"
+            "addps    (%4,%0), %%xmm0 \n\t"
+            "addps  16(%4,%0), %%xmm1 \n\t"
+            "movaps  %%xmm0,   (%1,%0) \n\t"
+            "movaps  %%xmm1, 16(%1,%0) \n\t"
+            "sub  $32, %0 \n\t"
+            "jge  1b \n\t"
+            :"+r"(i)
+            :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
+            :"memory"
+        );
+    }
+    else
+        ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step);
+}
+
+static void float_to_int16_3dnow(int16_t *dst, const float *src, int len){
+    // not bit-exact: pf2id uses different rounding than C and SSE
+    int i;
+    for(i=0; i<len; i+=4) {
+        asm volatile(
+            "pf2id       %1, %%mm0 \n\t"
+            "pf2id       %2, %%mm1 \n\t"
+            "packssdw %%mm1, %%mm0 \n\t"
+            "movq     %%mm0,    %0 \n\t"
+            :"=m"(dst[i])
+            :"m"(src[i]), "m"(src[i+2])
+        );
+    }
+    asm volatile("femms");
+}
+static void float_to_int16_sse(int16_t *dst, const float *src, int len){
+    int i;
+    for(i=0; i<len; i+=4) {
+        asm volatile(
+            "cvtps2pi    %1, %%mm0 \n\t"
+            "cvtps2pi    %2, %%mm1 \n\t"
+            "packssdw %%mm1, %%mm0 \n\t"
+            "movq     %%mm0,    %0 \n\t"
+            :"=m"(dst[i])
+            :"m"(src[i]), "m"(src[i+2])
+        );
+    }
+    asm volatile("emms");
+}
 
 #ifdef CONFIG_SNOW_ENCODER
 extern void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width);
@@ -2782,6 +3101,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                 c->idct_add= ff_simple_idct_add_mmx;
                 c->idct    = ff_simple_idct_mmx;
                 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
+#ifdef CONFIG_GPL
             }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
                 if(mm_flags & MM_MMXEXT){
                     c->idct_put= ff_libmpeg2mmx2_idct_put;
@@ -2793,8 +3113,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                     c->idct    = ff_mmx_idct;
                 }
                 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
-#if 0
-            }else if(idct_algo==FF_IDCT_VP3){
+#endif
+            }else if(idct_algo==FF_IDCT_VP3 &&
+                     avctx->codec->id!=CODEC_ID_THEORA &&
+                     !(avctx->flags & CODEC_FLAG_BITEXACT)){
                 if(mm_flags & MM_SSE2){
                     c->idct_put= ff_vp3_idct_put_sse2;
                     c->idct_add= ff_vp3_idct_add_sse2;
@@ -2807,10 +3129,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                     c->idct    = ff_vp3_idct_mmx;
                     c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
                 }
-#endif
            }else if(idct_algo==FF_IDCT_CAVS){
                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
-#ifdef CONFIG_GPL
            }else if(idct_algo==FF_IDCT_XVIDMMX){
                 if(mm_flags & MM_MMXEXT){
                     c->idct_put= ff_idct_xvid_mmx2_put;
@@ -2821,7 +3141,6 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                     c->idct_add= ff_idct_xvid_mmx_add;
                     c->idct    = ff_idct_xvid_mmx;
                 }
-#endif
             }
         }
 
@@ -3012,6 +3331,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         dspfunc(avg_h264_qpel, 0, 16);
         dspfunc(avg_h264_qpel, 1, 8);
         dspfunc(avg_h264_qpel, 2, 4);
+
+        dspfunc(put_2tap_qpel, 0, 16);
+        dspfunc(put_2tap_qpel, 1, 8);
+        dspfunc(avg_2tap_qpel, 0, 16);
+        dspfunc(avg_2tap_qpel, 1, 8);
 #undef dspfunc
 
         c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
@@ -3024,6 +3348,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
            c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
            c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
+           c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
 
            c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
            c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
@@ -3134,6 +3459,11 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        dspfunc(avg_h264_qpel, 1, 8);
        dspfunc(avg_h264_qpel, 2, 4);
 
+        dspfunc(put_2tap_qpel, 0, 16);
+        dspfunc(put_2tap_qpel, 1, 8);
+        dspfunc(avg_2tap_qpel, 0, 16);
+        dspfunc(avg_2tap_qpel, 1, 8);
+
        c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
        c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
     }
@@ -3150,6 +3480,24 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
        }
 #endif
+
+    if(mm_flags & MM_3DNOW){
+        c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
+        c->vector_fmul = vector_fmul_3dnow;
+        if(!(avctx->flags & CODEC_FLAG_BITEXACT))
+            c->float_to_int16 = float_to_int16_3dnow;
+    }
+    if(mm_flags & MM_3DNOWEXT)
+        c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
+    if(mm_flags & MM_SSE){
+        c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
+        c->vector_fmul = vector_fmul_sse;
+        c->float_to_int16 = float_to_int16_sse;
+        c->vector_fmul_reverse = vector_fmul_reverse_sse;
+        c->vector_fmul_add_add = vector_fmul_add_add_sse;
+    }
+    if(mm_flags & MM_3DNOW)
+        c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse
 }
 
 #ifdef CONFIG_ENCODERS
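Of the additions above, the vorbis_inverse_coupling_3dnow/_sse pair is the largest single block: it vectorizes Vorbis square-polar channel decoupling, two floats per iteration with 3DNow! and four with SSE, replacing per-element branches with sign-mask arithmetic (the pxor/pand/pandn sequence annotated in the asm comments). As a reference point, here is a scalar sketch of the operation being vectorized, written from the Vorbis coupling rules rather than copied from this patch:

static void vorbis_inverse_coupling_scalar(float *mag, float *ang, int blocksize)
{
    int i;
    for (i = 0; i < blocksize; i++) {
        /* square-polar decoupling: mag holds the magnitude channel,
           ang the signed difference; both are rewritten in place */
        if (mag[i] > 0.0) {
            if (ang[i] > 0.0) {
                ang[i] = mag[i] - ang[i];
            } else {
                float temp = ang[i];
                ang[i]  = mag[i];
                mag[i] += temp;
            }
        } else {
            if (ang[i] > 0.0) {
                ang[i] += mag[i];
            } else {
                float temp = ang[i];
                ang[i]  = mag[i];
                mag[i] -= temp;
            }
        }
    }
}

The SIMD versions reach the same result branch-free: the compare against zero builds an all-ones mask per element, pslld $31 (or andps with ff_pdw_80000000) reduces it to a sign bit used to conditionally flip ang, and the pand/pandn pair routes that flipped value into either the ang update or the mag update.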
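The other new float routines, registered at the end of dsputil_init_mmx() behind the MM_3DNOW / MM_3DNOWEXT / MM_SSE checks, are simple elementwise loops; the sketch below spells out the contracts the SIMD loops optimize. It is inferred from the behaviour visible in the asm rather than copied from libavcodec's C fallbacks, so the names, the src3 bias term and the lrintf-based conversion should be read as illustrative. In particular the patch itself notes that pf2id rounds differently from C and SSE, which is why float_to_int16_3dnow is only installed when CODEC_FLAG_BITEXACT is not set.

#include <math.h>
#include <stdint.h>

/* what vector_fmul_3dnow/_sse compute, 2 or 4 floats per step: dst[i] *= src[i] */
static void vector_fmul_scalar(float *dst, const float *src, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] *= src[i];
}

/* vector_fmul_reverse_*: src1 is read backwards (pswapd / shufps $0x1b) */
static void vector_fmul_reverse_scalar(float *dst, const float *src0,
                                       const float *src1, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - 1 - i];
}

/* vector_fmul_add_add_*: the asm only handles src3 == 0 with step 1 or 2 and
   calls ff_vector_fmul_add_add_c otherwise; src3 is assumed here to be a
   constant bias added to every output */
static void vector_fmul_add_add_scalar(float *dst, const float *src0, const float *src1,
                                       const float *src2, int src3, int len, int step)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i * step] = src0[i] * src1[i] + src2[i] + src3;
}

/* float_to_int16_*: convert and saturate; packssdw supplies the clamp in the asm */
static void float_to_int16_scalar(int16_t *dst, const float *src, int len)
{
    int i;
    for (i = 0; i < len; i++) {
        long v = lrintf(src[i]);
        if (v >  32767) v =  32767;
        if (v < -32768) v = -32768;
        dst[i] = (int16_t)v;
    }
}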