1 files changed, 723 insertions, 173 deletions
diff --git a/contrib/ffmpeg/libavcodec/i386/h264dsp_mmx.c b/contrib/ffmpeg/libavcodec/i386/h264dsp_mmx.c
index 56004b674..7f5c5a95b 100644
--- a/contrib/ffmpeg/libavcodec/i386/h264dsp_mmx.c
+++ b/contrib/ffmpeg/libavcodec/i386/h264dsp_mmx.c
@@ -18,16 +18,11 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include "dsputil_mmx.h"
 
 /***********************************/
 /* IDCT */
 
-/* in/out: mma=mma+mmb, mmb=mmb-mma */
-#define SUMSUB_BA( a, b ) \
-    "paddw "#b", "#a" \n\t"\
-    "paddw "#b", "#b" \n\t"\
-    "psubw "#a", "#b" \n\t"
-
 #define SUMSUB_BADC( a, b, c, d ) \
     "paddw "#b", "#a" \n\t"\
     "paddw "#d", "#c" \n\t"\
@@ -48,12 +43,6 @@
     SUMSUBD2_AB( s13, d13, t )\
     SUMSUB_BADC( d13, s02, s13, d02 )
 
-#define TRANSPOSE4(a,b,c,d,t)\
-    SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
-    SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
-    SBUTTERFLY(a,c,d,dq) /* a=aeim d=bfjn */\
-    SBUTTERFLY(t,b,c,dq) /* t=cgko c=dhlp */
-
 #define STORE_DIFF_4P( p, t, z ) \
     "psraw      $6,     "#p" \n\t"\
     "movd       (%0),   "#t" \n\t"\
@@ -105,44 +94,38 @@ static inline void h264_idct8_1d(int16_t *block)
 {
     asm volatile(
         "movq 112(%0), %%mm7  \n\t"
-        "movq  80(%0), %%mm5  \n\t"
+        "movq  80(%0), %%mm0  \n\t"
         "movq  48(%0), %%mm3  \n\t"
-        "movq  16(%0), %%mm1  \n\t"
+        "movq  16(%0), %%mm5  \n\t"
 
-        "movq   %%mm7, %%mm4  \n\t"
-        "movq   %%mm3, %%mm6  \n\t"
-        "movq   %%mm5, %%mm0  \n\t"
-        "movq   %%mm7, %%mm2  \n\t"
+        "movq   %%mm0, %%mm4  \n\t"
+        "movq   %%mm5, %%mm1  \n\t"
         "psraw  $1,    %%mm4  \n\t"
-        "psraw  $1,    %%mm6  \n\t"
-        "psubw  %%mm7, %%mm0  \n\t"
-        "psubw  %%mm6, %%mm2  \n\t"
-        "psubw  %%mm4, %%mm0  \n\t"
-        "psubw  %%mm3, %%mm2  \n\t"
-        "psubw  %%mm3, %%mm0  \n\t"
-        "paddw  %%mm1, %%mm2  \n\t"
-
-        "movq   %%mm5, %%mm4  \n\t"
-        "movq   %%mm1, %%mm6  \n\t"
-        "psraw  $1,    %%mm4  \n\t"
-        "psraw  $1,    %%mm6  \n\t"
-        "paddw  %%mm5, %%mm4  \n\t"
-        "paddw  %%mm1, %%mm6  \n\t"
+        "psraw  $1,    %%mm1  \n\t"
+        "paddw  %%mm0, %%mm4  \n\t"
+        "paddw  %%mm5, %%mm1  \n\t"
         "paddw  %%mm7, %%mm4  \n\t"
-        "paddw  %%mm5, %%mm6  \n\t"
-        "psubw  %%mm1, %%mm4  \n\t"
-        "paddw  %%mm3, %%mm6  \n\t"
+        "paddw  %%mm0, %%mm1  \n\t"
+        "psubw  %%mm5, %%mm4  \n\t"
+        "paddw  %%mm3, %%mm1  \n\t"
+
+        "psubw  %%mm3, %%mm5  \n\t"
+        "psubw  %%mm3, %%mm0  \n\t"
+        "paddw  %%mm7, %%mm5  \n\t"
+        "psubw  %%mm7, %%mm0  \n\t"
+        "psraw  $1,    %%mm3  \n\t"
+        "psraw  $1,    %%mm7  \n\t"
+        "psubw  %%mm3, %%mm5  \n\t"
+        "psubw  %%mm7, %%mm0  \n\t"
 
-        "movq   %%mm0, %%mm1  \n\t"
         "movq   %%mm4, %%mm3  \n\t"
-        "movq   %%mm2, %%mm5  \n\t"
-        "movq   %%mm6, %%mm7  \n\t"
-        "psraw  $2,    %%mm6  \n\t"
+        "movq   %%mm1, %%mm7  \n\t"
+        "psraw  $2,    %%mm1  \n\t"
         "psraw  $2,    %%mm3  \n\t"
+        "paddw  %%mm5, %%mm3  \n\t"
         "psraw  $2,    %%mm5  \n\t"
+        "paddw  %%mm0, %%mm1  \n\t"
         "psraw  $2,    %%mm0  \n\t"
-        "paddw  %%mm6, %%mm1  \n\t"
-        "paddw  %%mm2, %%mm3  \n\t"
         "psubw  %%mm4, %%mm5  \n\t"
         "psubw  %%mm0, %%mm7  \n\t"
 
@@ -228,6 +211,93 @@ static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
     add_pixels_clamped_mmx(b2, dst, stride);
 }
 
+#define STORE_DIFF_8P( p, d, t, z )\
+        "movq       "#d", "#t" \n"\
+        "psraw       $6,  "#p" \n"\
+        "punpcklbw  "#z", "#t" \n"\
+        "paddsw     "#t", "#p" \n"\
+        "packuswb   "#p", "#p" \n"\
+        "movq       "#p", "#d" \n"
+
+#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
+        "movdqa     "#c", "#a" \n"\
+        "movdqa     "#g", "#e" \n"\
+        "psraw       $1,  "#c" \n"\
+        "psraw       $1,  "#g" \n"\
+        "psubw      "#e", "#c" \n"\
+        "paddw      "#a", "#g" \n"\
+        "movdqa     "#b", "#e" \n"\
+        "psraw       $1,  "#e" \n"\
+        "paddw      "#b", "#e" \n"\
+        "paddw      "#d", "#e" \n"\
+        "paddw      "#f", "#e" \n"\
+        "movdqa     "#f", "#a" \n"\
+        "psraw       $1,  "#a" \n"\
+        "paddw      "#f", "#a" \n"\
+        "paddw      "#h", "#a" \n"\
+        "psubw      "#b", "#a" \n"\
+        "psubw      "#d", "#b" \n"\
+        "psubw      "#d", "#f" \n"\
+        "paddw      "#h", "#b" \n"\
+        "psubw      "#h", "#f" \n"\
+        "psraw       $1,  "#d" \n"\
+        "psraw       $1,  "#h" \n"\
+        "psubw      "#d", "#b" \n"\
+        "psubw      "#h", "#f" \n"\
+        "movdqa     "#e", "#d" \n"\
+        "movdqa     "#a", "#h" \n"\
+        "psraw       $2,  "#d" \n"\
+        "psraw       $2,  "#h" \n"\
+        "paddw      "#f", "#d" \n"\
+        "paddw      "#b", "#h" \n"\
+        "psraw       $2,  "#f" \n"\
+        "psraw       $2,  "#b" \n"\
+        "psubw      "#f", "#e" \n"\
+        "psubw      "#a", "#b" \n"\
+        "movdqa 0x00(%1), "#a" \n"\
+        "movdqa 0x40(%1), "#f" \n"\
+        SUMSUB_BA(f, a)\
+        SUMSUB_BA(g, f)\
+        SUMSUB_BA(c, a)\
+        SUMSUB_BA(e, g)\
+        SUMSUB_BA(b, c)\
+        SUMSUB_BA(h, a)\
+        SUMSUB_BA(d, f)
+
+static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
+{
+    asm volatile(
+        "movdqa   0x10(%1), %%xmm1 \n"
+        "movdqa   0x20(%1), %%xmm2 \n"
+        "movdqa   0x30(%1), %%xmm3 \n"
+        "movdqa   0x50(%1), %%xmm5 \n"
+        "movdqa   0x60(%1), %%xmm6 \n"
+        "movdqa   0x70(%1), %%xmm7 \n"
+        H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
+        TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
+        "paddw          %4, %%xmm4 \n"
+        "movdqa     %%xmm4, 0x00(%1) \n"
+        "movdqa     %%xmm2, 0x40(%1) \n"
+        H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
+        "movdqa     %%xmm6, 0x60(%1) \n"
+        "movdqa     %%xmm7, 0x70(%1) \n"
+        "pxor       %%xmm7, %%xmm7 \n"
+        STORE_DIFF_8P(%%xmm2, (%0),      %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm0, (%0,%2),   %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm3, (%0,%3),   %%xmm6, %%xmm7)
+        "lea     (%0,%2,4), %0 \n"
+        STORE_DIFF_8P(%%xmm5, (%0),      %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm4, (%0,%2),   %%xmm6, %%xmm7)
+        "movdqa   0x60(%1), %%xmm0 \n"
+        "movdqa   0x70(%1), %%xmm1 \n"
+        STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
+        STORE_DIFF_8P(%%xmm1, (%0,%3),   %%xmm6, %%xmm7)
+        :"+r"(dst)
+        :"r"(block), "r"((long)stride), "r"(3L*stride), "m"(ff_pw_32)
+    );
+}
+
 static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
 {
     int dc = (block[0] + 32) >> 6;
@@ -369,8 +439,8 @@ static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
         "paddusb %%mm3              , %%mm1 \n\t"\
         "paddusb %%mm6              , %%mm2 \n\t"
 
-// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=mm_bone
-// out: (q1addr) = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
+// in: mm0=p1 mm1=p0 mm2=q0 mm3=q1 mm7=(tc&mask) %8=ff_bone
+// out: (q1addr) = av_clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
 // clobbers: q2, tmp, tc0
 #define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
         "movq     %%mm1,  "#tmp"   \n\t"\
@@ -435,7 +505,7 @@ static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alph
         : "=m"(*tmp0)
         : "r"(pix-3*stride), "r"(pix), "r"((long)stride),
           "m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
-          "m"(mm_bone)
+          "m"(ff_bone)
     );
 }
 
@@ -482,7 +552,7 @@ static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int al
 
         :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
            "r"(*(uint32_t*)tc0),
-           "m"(alpha1), "m"(beta1), "m"(mm_bone), "m"(ff_pb_3F)
+           "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
     );
 }
 
@@ -532,7 +602,7 @@ static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride,
         "movq    %%mm1,   (%0,%2)   \n\t"
         "movq    %%mm2,   (%1)      \n\t"
         :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
-           "m"(alpha1), "m"(beta1), "m"(mm_bone)
+           "m"(alpha1), "m"(beta1), "m"(ff_bone)
     );
 }
 
@@ -651,37 +721,44 @@ static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40]
 /***********************************/
 /* motion compensation */
 
-#define QPEL_H264V(A,B,C,D,E,F,OP)\
-        "movd (%0), "#F"            \n\t"\
-        "movq "#C", %%mm6           \n\t"\
-        "paddw "#D", %%mm6          \n\t"\
-        "psllw $2, %%mm6            \n\t"\
-        "psubw "#B", %%mm6          \n\t"\
-        "psubw "#E", %%mm6          \n\t"\
-        "pmullw %4, %%mm6           \n\t"\
-        "add %2, %0                 \n\t"\
-        "punpcklbw %%mm7, "#F"      \n\t"\
+#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
+        "mov"#q" "#C", "#T"         \n\t"\
+        "mov"#d" (%0), "#F"         \n\t"\
+        "paddw "#D", "#T"           \n\t"\
+        "psllw $2, "#T"             \n\t"\
+        "psubw "#B", "#T"           \n\t"\
+        "psubw "#E", "#T"           \n\t"\
+        "punpcklbw "#Z", "#F"       \n\t"\
+        "pmullw %4, "#T"            \n\t"\
         "paddw %5, "#A"             \n\t"\
+        "add %2, %0                 \n\t"\
         "paddw "#F", "#A"           \n\t"\
-        "paddw "#A", %%mm6          \n\t"\
-        "psraw $5, %%mm6            \n\t"\
-        "packuswb %%mm6, %%mm6      \n\t"\
-        OP(%%mm6, (%1), A, d)\
+        "paddw "#A", "#T"           \n\t"\
+        "psraw $5, "#T"             \n\t"\
+        "packuswb "#T", "#T"        \n\t"\
+        OP(T, (%1), A, d)\
         "add %3, %1                 \n\t"
 
-#define QPEL_H264HV(A,B,C,D,E,F,OF)\
-        "movd (%0), "#F"            \n\t"\
-        "movq "#C", %%mm6           \n\t"\
-        "paddw "#D", %%mm6          \n\t"\
-        "psllw $2, %%mm6            \n\t"\
-        "psubw "#B", %%mm6          \n\t"\
-        "psubw "#E", %%mm6          \n\t"\
-        "pmullw %3, %%mm6           \n\t"\
-        "add %2, %0                 \n\t"\
-        "punpcklbw %%mm7, "#F"      \n\t"\
+#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
+        "mov"#q" "#C", "#T"         \n\t"\
+        "mov"#d" (%0), "#F"         \n\t"\
+        "paddw "#D", "#T"           \n\t"\
+        "psllw $2, "#T"             \n\t"\
+        "paddw %4, "#A"             \n\t"\
+        "psubw "#B", "#T"           \n\t"\
+        "psubw "#E", "#T"           \n\t"\
+        "punpcklbw "#Z", "#F"       \n\t"\
+        "pmullw %3, "#T"            \n\t"\
         "paddw "#F", "#A"           \n\t"\
-        "paddw "#A", %%mm6          \n\t"\
-        "movq %%mm6, "#OF"(%1)      \n\t"
+        "add %2, %0                 \n\t"\
+        "paddw "#A", "#T"           \n\t"\
+        "mov"#q" "#T", "#OF"(%1)    \n\t"
+
+#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
+#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
+#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
+#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
+
 
 #define QPEL_H264(OPNAME, OP, MMX)\
 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
@@ -719,7 +796,7 @@ static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uin
         "add %4, %1                 \n\t"\
         "decl %2                    \n\t"\
         " jnz 1b                    \n\t"\
-        : "+a"(src), "+c"(dst), "+m"(h)\
+        : "+a"(src), "+c"(dst), "+g"(h)\
         : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
         : "memory"\
     );\
@@ -825,7 +902,7 @@ static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, in
             QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
              \
             : "+a"(src)\
-            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
+            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
             : "memory"\
         );\
         tmp += 4;\
@@ -833,7 +910,6 @@ static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, in
     }\
     tmp -= 3*4;\
     asm volatile(\
-        "movq %4, %%mm6             \n\t"\
         "1:                         \n\t"\
         "movq     (%0), %%mm0       \n\t"\
         "paddw  10(%0), %%mm0       \n\t"\
@@ -846,8 +922,7 @@ static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, in
         "psubw %%mm1, %%mm0         \n\t"/*(a-b)/4-b */\
         "paddsw %%mm2, %%mm0        \n\t"\
         "psraw $2, %%mm0            \n\t"/*((a-b)/4-b+c)/4 */\
-        "paddw %%mm6, %%mm2         \n\t"\
-        "paddw %%mm2, %%mm0         \n\t"/*(a-5*b+20*c)/16 +32 */\
+        "paddw %%mm2, %%mm0         \n\t"/*(a-5*b+20*c)/16 */\
         "psraw $6, %%mm0            \n\t"\
         "packuswb %%mm0, %%mm0      \n\t"\
         OP(%%mm0, (%1),%%mm7, d)\
@@ -855,8 +930,8 @@ static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, in
         "add %3, %1                 \n\t"\
         "decl %2                    \n\t"\
         " jnz 1b                    \n\t"\
-        : "+a"(tmp), "+c"(dst), "+m"(h)\
-        : "S"((long)dstStride), "m"(ff_pw_32)\
+        : "+a"(tmp), "+c"(dst), "+g"(h)\
+        : "S"((long)dstStride)\
         : "memory"\
     );\
 }\
@@ -912,7 +987,7 @@ static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uin
         "add %4, %1                 \n\t"\
         "decl %2                    \n\t"\
         " jnz 1b                    \n\t"\
-        : "+a"(src), "+c"(dst), "+m"(h)\
+        : "+a"(src), "+c"(dst), "+g"(h)\
         : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
         : "memory"\
     );\
@@ -1036,8 +1111,7 @@ static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst,
      dst += 4-h*dstStride;\
    }\
 }\
-static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
-    int h = size;\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
     int w = (size+8)>>2;\
     src -= 2*srcStride+2;\
     while(w--){\
@@ -1067,7 +1141,7 @@ static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst
             QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
             QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
             : "+a"(src)\
-            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
+            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
             : "memory"\
         );\
         if(size==16){\
@@ -1081,19 +1155,19 @@ static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst
                 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
                 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
                 : "+a"(src)\
-                : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
+                : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
                 : "memory"\
             );\
         }\
         tmp += 4;\
         src += 4 - (size+5)*srcStride;\
     }\
-    tmp -= size+8;\
-    w = size>>4;\
+}\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
+    int w = size>>4;\
     do{\
-    h = size;\
+    int h = size;\
     asm volatile(\
-        "movq %4, %%mm6             \n\t"\
         "1:                         \n\t"\
         "movq     (%0), %%mm0       \n\t"\
         "movq    8(%0), %%mm3       \n\t"\
@@ -1117,8 +1191,6 @@ static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst
         "paddsw %%mm5, %%mm3        \n\t"\
         "psraw $2, %%mm0            \n\t"\
         "psraw $2, %%mm3            \n\t"\
-        "paddw %%mm6, %%mm2         \n\t"\
-        "paddw %%mm6, %%mm5         \n\t"\
         "paddw %%mm2, %%mm0         \n\t"\
         "paddw %%mm5, %%mm3         \n\t"\
         "psraw $6, %%mm0            \n\t"\
@@ -1129,8 +1201,8 @@ static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst
         "add %3, %1                 \n\t"\
         "decl %2                    \n\t"\
         " jnz 1b                    \n\t"\
-        : "+a"(tmp), "+c"(dst), "+m"(h)\
-        : "S"((long)dstStride), "m"(ff_pw_32)\
+        : "+a"(tmp), "+c"(dst), "+g"(h)\
+        : "S"((long)dstStride)\
         : "memory"\
     );\
     tmp += 8 - size*24;\
@@ -1146,7 +1218,7 @@ static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, ui
     OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
 }\
 \
-static av_noinline void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
     OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
     src += 8*srcStride;\
@@ -1165,6 +1237,10 @@ static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst,
     OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
 }\
 \
+static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+          put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
+    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
+}\
 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
     OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst  , tmp  , src  , dstStride, tmpStride, srcStride, 8);\
 }\
@@ -1176,11 +1252,8 @@ static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp,
 static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
 {\
     asm volatile(\
-        "movq       %5,  %%mm6          \n\t"\
         "movq      (%1), %%mm0          \n\t"\
         "movq    24(%1), %%mm1          \n\t"\
-        "paddw    %%mm6, %%mm0          \n\t"\
-        "paddw    %%mm6, %%mm1          \n\t"\
         "psraw      $5,  %%mm0          \n\t"\
         "psraw      $5,  %%mm1          \n\t"\
         "packuswb %%mm0, %%mm0          \n\t"\
@@ -1193,8 +1266,6 @@ static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_
         "lea  (%2,%4,2), %2             \n\t"\
         "movq    48(%1), %%mm0          \n\t"\
         "movq    72(%1), %%mm1          \n\t"\
-        "paddw    %%mm6, %%mm0          \n\t"\
-        "paddw    %%mm6, %%mm1          \n\t"\
         "psraw      $5,  %%mm0          \n\t"\
         "psraw      $5,  %%mm1          \n\t"\
         "packuswb %%mm0, %%mm0          \n\t"\
@@ -1204,32 +1275,34 @@ static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_
         OP(%%mm0, (%2),    %%mm4, d)\
         OP(%%mm1, (%2,%4), %%mm5, d)\
         :"+a"(src8), "+c"(src16), "+d"(dst)\
-        :"S"((long)src8Stride), "D"((long)dstStride), "m"(ff_pw_16)\
+        :"S"((long)src8Stride), "D"((long)dstStride)\
         :"memory");\
 }\
 static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
 {\
-    asm volatile(\
-        "movq       %0,  %%mm6          \n\t"\
-        ::"m"(ff_pw_16)\
-        );\
-    while(h--){\
+    do{\
     asm volatile(\
         "movq      (%1), %%mm0          \n\t"\
         "movq     8(%1), %%mm1          \n\t"\
-        "paddw    %%mm6, %%mm0          \n\t"\
-        "paddw    %%mm6, %%mm1          \n\t"\
+        "movq    48(%1), %%mm2          \n\t"\
+        "movq  8+48(%1), %%mm3          \n\t"\
         "psraw      $5,  %%mm0          \n\t"\
         "psraw      $5,  %%mm1          \n\t"\
+        "psraw      $5,  %%mm2          \n\t"\
+        "psraw      $5,  %%mm3          \n\t"\
         "packuswb %%mm1, %%mm0          \n\t"\
+        "packuswb %%mm3, %%mm2          \n\t"\
         PAVGB"     (%0), %%mm0          \n\t"\
+        PAVGB"  (%0,%3), %%mm2          \n\t"\
         OP(%%mm0, (%2), %%mm5, q)\
-        ::"a"(src8), "c"(src16), "d"(dst)\
+        OP(%%mm2, (%2,%4), %%mm5, q)\
+        ::"a"(src8), "c"(src16), "d"(dst),\
+          "r"((long)src8Stride), "r"((long)dstStride)\
         :"memory");\
-        src8 += src8Stride;\
-        src16 += 24;\
-        dst += dstStride;\
-    }\
+        src8 += 2L*src8Stride;\
+        src16 += 48;\
+        dst += 2L*dstStride;\
+    }while(h-=2);\
 }\
 static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
 {\
@@ -1238,11 +1311,453 @@ static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, u
 }\
 
 
-#define H264_MC(OPNAME, SIZE, MMX) \
-static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
-    OPNAME ## pixels ## SIZE ## _mmx(dst, src, stride, SIZE);\
+#ifdef ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+    int h=16;\
+    asm volatile(\
+        "pxor %%xmm15, %%xmm15      \n\t"\
+        "movdqa %6, %%xmm14         \n\t"\
+        "movdqa %7, %%xmm13         \n\t"\
+        "1:                         \n\t"\
+        "lddqu    3(%0), %%xmm1     \n\t"\
+        "lddqu   -5(%0), %%xmm7     \n\t"\
+        "movdqa  %%xmm1, %%xmm0     \n\t"\
+        "punpckhbw %%xmm15, %%xmm1  \n\t"\
+        "punpcklbw %%xmm15, %%xmm0  \n\t"\
+        "punpcklbw %%xmm15, %%xmm7  \n\t"\
+        "movdqa  %%xmm1, %%xmm2     \n\t"\
+        "movdqa  %%xmm0, %%xmm6     \n\t"\
+        "movdqa  %%xmm1, %%xmm3     \n\t"\
+        "movdqa  %%xmm0, %%xmm8     \n\t"\
+        "movdqa  %%xmm1, %%xmm4     \n\t"\
+        "movdqa  %%xmm0, %%xmm9     \n\t"\
+        "movdqa  %%xmm1, %%xmm5     \n\t"\
+        "movdqa  %%xmm0, %%xmm10    \n\t"\
+        "palignr $6, %%xmm0, %%xmm5 \n\t"\
+        "palignr $6, %%xmm7, %%xmm10\n\t"\
+        "palignr $8, %%xmm0, %%xmm4 \n\t"\
+        "palignr $8, %%xmm7, %%xmm9 \n\t"\
+        "palignr $10,%%xmm0, %%xmm3 \n\t"\
+        "palignr $10,%%xmm7, %%xmm8 \n\t"\
+        "paddw   %%xmm1, %%xmm5     \n\t"\
+        "paddw   %%xmm0, %%xmm10    \n\t"\
+        "palignr $12,%%xmm0, %%xmm2 \n\t"\
+        "palignr $12,%%xmm7, %%xmm6 \n\t"\
+        "palignr $14,%%xmm0, %%xmm1 \n\t"\
+        "palignr $14,%%xmm7, %%xmm0 \n\t"\
+        "paddw   %%xmm3, %%xmm2     \n\t"\
+        "paddw   %%xmm8, %%xmm6     \n\t"\
+        "paddw   %%xmm4, %%xmm1     \n\t"\
+        "paddw   %%xmm9, %%xmm0     \n\t"\
+        "psllw   $2,     %%xmm2     \n\t"\
+        "psllw   $2,     %%xmm6     \n\t"\
+        "psubw   %%xmm1, %%xmm2     \n\t"\
+        "psubw   %%xmm0, %%xmm6     \n\t"\
+        "paddw   %%xmm13,%%xmm5     \n\t"\
+        "paddw   %%xmm13,%%xmm10    \n\t"\
+        "pmullw  %%xmm14,%%xmm2     \n\t"\
+        "pmullw  %%xmm14,%%xmm6     \n\t"\
+        "lddqu   (%2),   %%xmm3     \n\t"\
+        "paddw   %%xmm5, %%xmm2     \n\t"\
+        "paddw   %%xmm10,%%xmm6     \n\t"\
+        "psraw   $5,     %%xmm2     \n\t"\
+        "psraw   $5,     %%xmm6     \n\t"\
+        "packuswb %%xmm2,%%xmm6     \n\t"\
+        "pavgb   %%xmm3, %%xmm6     \n\t"\
+        OP(%%xmm6, (%1), %%xmm4, dqa)\
+        "add %5, %0                 \n\t"\
+        "add %5, %1                 \n\t"\
+        "add %4, %2                 \n\t"\
+        "decl %3                    \n\t"\
+        "jg 1b                      \n\t"\
+        : "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
+        : "D"((long)src2Stride), "S"((long)dstStride),\
+          "m"(ff_pw_5), "m"(ff_pw_16)\
+        : "memory"\
+    );\
+}
+#else // ARCH_X86_64
+#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+    src += 8*dstStride;\
+    dst += 8*dstStride;\
+    src2 += 8*src2Stride;\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
+    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
+}
+#endif // ARCH_X86_64
+
+#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
+    int h=8;\
+    asm volatile(\
+        "pxor %%xmm7, %%xmm7        \n\t"\
+        "movdqa %0, %%xmm6          \n\t"\
+        :: "m"(ff_pw_5)\
+    );\
+    do{\
+    asm volatile(\
+        "lddqu   -5(%0), %%xmm1     \n\t"\
+        "movdqa  %%xmm1, %%xmm0     \n\t"\
+        "punpckhbw %%xmm7, %%xmm1   \n\t"\
+        "punpcklbw %%xmm7, %%xmm0   \n\t"\
+        "movdqa  %%xmm1, %%xmm2     \n\t"\
+        "movdqa  %%xmm1, %%xmm3     \n\t"\
+        "movdqa  %%xmm1, %%xmm4     \n\t"\
+        "movdqa  %%xmm1, %%xmm5     \n\t"\
+        "palignr $6, %%xmm0, %%xmm5 \n\t"\
+        "palignr $8, %%xmm0, %%xmm4 \n\t"\
+        "palignr $10,%%xmm0, %%xmm3 \n\t"\
+        "paddw   %%xmm1, %%xmm5     \n\t"\
+        "palignr $12,%%xmm0, %%xmm2 \n\t"\
+        "palignr $14,%%xmm0, %%xmm1 \n\t"\
+        "paddw   %%xmm3, %%xmm2     \n\t"\
+        "paddw   %%xmm4, %%xmm1     \n\t"\
+        "psllw   $2,     %%xmm2     \n\t"\
+        "movq    (%2),   %%xmm3     \n\t"\
+        "psubw   %%xmm1, %%xmm2     \n\t"\
+        "paddw   %5,     %%xmm5     \n\t"\
+        "pmullw  %%xmm6, %%xmm2     \n\t"\
+        "paddw   %%xmm5, %%xmm2     \n\t"\
+        "psraw   $5,     %%xmm2     \n\t"\
+        "packuswb %%xmm2, %%xmm2    \n\t"\
+        "pavgb   %%xmm3, %%xmm2     \n\t"\
+        OP(%%xmm2, (%1), %%xmm4, q)\
+        "add %4, %0                 \n\t"\
+        "add %4, %1                 \n\t"\
+        "add %3, %2                 \n\t"\
+        : "+a"(src), "+c"(dst), "+d"(src2)\
+        : "D"((long)src2Stride), "S"((long)dstStride),\
+          "m"(ff_pw_16)\
+        : "memory"\
+    );\
+    }while(--h);\
 }\
+QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
 \
+static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+    int h=8;\
+    asm volatile(\
+        "pxor %%xmm7, %%xmm7        \n\t"\
+        "movdqa %5, %%xmm6          \n\t"\
+        "1:                         \n\t"\
+        "lddqu   -5(%0), %%xmm1     \n\t"\
+        "movdqa  %%xmm1, %%xmm0     \n\t"\
+        "punpckhbw %%xmm7, %%xmm1   \n\t"\
+        "punpcklbw %%xmm7, %%xmm0   \n\t"\
+        "movdqa  %%xmm1, %%xmm2     \n\t"\
+        "movdqa  %%xmm1, %%xmm3     \n\t"\
+        "movdqa  %%xmm1, %%xmm4     \n\t"\
+        "movdqa  %%xmm1, %%xmm5     \n\t"\
+        "palignr $6, %%xmm0, %%xmm5 \n\t"\
+        "palignr $8, %%xmm0, %%xmm4 \n\t"\
+        "palignr $10,%%xmm0, %%xmm3 \n\t"\
+        "paddw   %%xmm1, %%xmm5     \n\t"\
+        "palignr $12,%%xmm0, %%xmm2 \n\t"\
+        "palignr $14,%%xmm0, %%xmm1 \n\t"\
+        "paddw   %%xmm3, %%xmm2     \n\t"\
+        "paddw   %%xmm4, %%xmm1     \n\t"\
+        "psllw   $2,     %%xmm2     \n\t"\
+        "psubw   %%xmm1, %%xmm2     \n\t"\
+        "paddw   %6,     %%xmm5     \n\t"\
+        "pmullw  %%xmm6, %%xmm2     \n\t"\
+        "paddw   %%xmm5, %%xmm2     \n\t"\
+        "psraw   $5,     %%xmm2     \n\t"\
+        "packuswb %%xmm2, %%xmm2    \n\t"\
+        OP(%%xmm2, (%1), %%xmm4, q)\
+        "add %3, %0                 \n\t"\
+        "add %4, %1                 \n\t"\
+        "decl %2                    \n\t"\
+        " jnz 1b                    \n\t"\
+        : "+a"(src), "+c"(dst), "+g"(h)\
+        : "D"((long)srcStride), "S"((long)dstStride),\
+          "m"(ff_pw_5), "m"(ff_pw_16)\
+        : "memory"\
+    );\
+}\
+static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
+    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+    src += 8*srcStride;\
+    dst += 8*dstStride;\
+    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
+    OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
+}\
+
+#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
+    src -= 2*srcStride;\
+    \
+    asm volatile(\
+        "pxor %%xmm7, %%xmm7        \n\t"\
+        "movq (%0), %%xmm0          \n\t"\
+        "add %2, %0                 \n\t"\
+        "movq (%0), %%xmm1          \n\t"\
+        "add %2, %0                 \n\t"\
+        "movq (%0), %%xmm2          \n\t"\
+        "add %2, %0                 \n\t"\
+        "movq (%0), %%xmm3          \n\t"\
+        "add %2, %0                 \n\t"\
+        "movq (%0), %%xmm4          \n\t"\
+        "add %2, %0                 \n\t"\
+        "punpcklbw %%xmm7, %%xmm0   \n\t"\
+        "punpcklbw %%xmm7, %%xmm1   \n\t"\
+        "punpcklbw %%xmm7, %%xmm2   \n\t"\
+        "punpcklbw %%xmm7, %%xmm3   \n\t"\
+        "punpcklbw %%xmm7, %%xmm4   \n\t"\
+        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+        QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+        QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+        QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
+        QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
+        QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+        QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+         \
+        : "+a"(src), "+c"(dst)\
+        : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+        : "memory"\
+    );\
+    if(h==16){\
+        asm volatile(\
+            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+            QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
+            QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
+            QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
+            QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
+            QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
+            QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
+            \
+            : "+a"(src), "+c"(dst)\
+            : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
+            : "memory"\
+        );\
+    }\
+}\
+static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
+}\
+static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
+    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
+    OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
+}
+
+static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
+    int w = (size+8)>>3;
+    src -= 2*srcStride+2;
+    while(w--){
+        asm volatile(
+            "pxor %%xmm7, %%xmm7        \n\t"
+            "movq (%0), %%xmm0          \n\t"
+            "add %2, %0                 \n\t"
+            "movq (%0), %%xmm1          \n\t"
+            "add %2, %0                 \n\t"
+            "movq (%0), %%xmm2          \n\t"
+            "add %2, %0                 \n\t"
+            "movq (%0), %%xmm3          \n\t"
+            "add %2, %0                 \n\t"
+            "movq (%0), %%xmm4          \n\t"
+            "add %2, %0                 \n\t"
+            "punpcklbw %%xmm7, %%xmm0   \n\t"
+            "punpcklbw %%xmm7, %%xmm1   \n\t"
+            "punpcklbw %%xmm7, %%xmm2   \n\t"
+            "punpcklbw %%xmm7, %%xmm3   \n\t"
+            "punpcklbw %%xmm7, %%xmm4   \n\t"
+            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
+            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
+            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
+            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
+            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
+            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
+            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
+            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
+            : "+a"(src)
+            : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
+            : "memory"
+        );
+        if(size==16){
+            asm volatile(
+                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
+                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
+                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
+                QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
+                QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
+                QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
+                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
+                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
+                : "+a"(src)
+                : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
+                : "memory"
+            );
+        }
+        tmp += 8;
+        src += 8 - (size+5)*srcStride;
+    }
+}
+
+#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
+static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
+    int h = size;\
+    if(size == 16){\
+        asm volatile(\
+            "1:                         \n\t"\
+            "movdqa 32(%0), %%xmm4      \n\t"\
+            "movdqa 16(%0), %%xmm5      \n\t"\
+            "movdqa   (%0), %%xmm7      \n\t"\
+            "movdqa %%xmm4, %%xmm3      \n\t"\
+            "movdqa %%xmm4, %%xmm2      \n\t"\
+            "movdqa %%xmm4, %%xmm1      \n\t"\
+            "movdqa %%xmm4, %%xmm0      \n\t"\
+            "palignr $10, %%xmm5, %%xmm0 \n\t"\
+            "palignr  $8, %%xmm5, %%xmm1 \n\t"\
+            "palignr  $6, %%xmm5, %%xmm2 \n\t"\
+            "palignr  $4, %%xmm5, %%xmm3 \n\t"\
+            "palignr  $2, %%xmm5, %%xmm4 \n\t"\
+            "paddw  %%xmm5, %%xmm0      \n\t"\
+            "paddw  %%xmm4, %%xmm1      \n\t"\
+            "paddw  %%xmm3, %%xmm2      \n\t"\
+            "movdqa %%xmm5, %%xmm6      \n\t"\
+            "movdqa %%xmm5, %%xmm4      \n\t"\
+            "movdqa %%xmm5, %%xmm3      \n\t"\
+            "palignr  $8, %%xmm7, %%xmm4 \n\t"\
+            "palignr  $2, %%xmm7, %%xmm6 \n\t"\
+            "palignr $10, %%xmm7, %%xmm3 \n\t"\
+            "paddw  %%xmm6, %%xmm4      \n\t"\
+            "movdqa %%xmm5, %%xmm6      \n\t"\
+            "palignr  $6, %%xmm7, %%xmm5 \n\t"\
+            "palignr  $4, %%xmm7, %%xmm6 \n\t"\
+            "paddw  %%xmm7, %%xmm3      \n\t"\
+            "paddw  %%xmm6, %%xmm5      \n\t"\
+            \
+            "psubw  %%xmm1, %%xmm0      \n\t"\
+            "psubw  %%xmm4, %%xmm3      \n\t"\
+            "psraw      $2, %%xmm0      \n\t"\
+            "psraw      $2, %%xmm3      \n\t"\
+            "psubw  %%xmm1, %%xmm0      \n\t"\
+            "psubw  %%xmm4, %%xmm3      \n\t"\
+            "paddw  %%xmm2, %%xmm0      \n\t"\
+            "paddw  %%xmm5, %%xmm3      \n\t"\
+            "psraw      $2, %%xmm0      \n\t"\
+            "psraw      $2, %%xmm3      \n\t"\
+            "paddw  %%xmm2, %%xmm0      \n\t"\
+            "paddw  %%xmm5, %%xmm3      \n\t"\
+            "psraw      $6, %%xmm0      \n\t"\
+            "psraw      $6, %%xmm3      \n\t"\
+            "packuswb %%xmm0, %%xmm3    \n\t"\
+            OP(%%xmm3, (%1), %%xmm7, dqa)\
+            "add $48, %0                \n\t"\
+            "add %3, %1                 \n\t"\
+            "decl %2                    \n\t"\
+            " jnz 1b                    \n\t"\
+            : "+a"(tmp), "+c"(dst), "+g"(h)\
+            : "S"((long)dstStride)\
+            : "memory"\
+        );\
+    }else{\
+        asm volatile(\
+            "1:                         \n\t"\
+            "movdqa 16(%0), %%xmm1      \n\t"\
+            "movdqa   (%0), %%xmm0      \n\t"\
+            "movdqa %%xmm1, %%xmm2      \n\t"\
+            "movdqa %%xmm1, %%xmm3      \n\t"\
+            "movdqa %%xmm1, %%xmm4      \n\t"\
+            "movdqa %%xmm1, %%xmm5      \n\t"\
+            "palignr $10, %%xmm0, %%xmm5 \n\t"\
+            "palignr  $8, %%xmm0, %%xmm4 \n\t"\
+            "palignr  $6, %%xmm0, %%xmm3 \n\t"\
+            "palignr  $4, %%xmm0, %%xmm2 \n\t"\
+            "palignr  $2, %%xmm0, %%xmm1 \n\t"\
+            "paddw  %%xmm5, %%xmm0      \n\t"\
+            "paddw  %%xmm4, %%xmm1      \n\t"\
+            "paddw  %%xmm3, %%xmm2      \n\t"\
+            "psubw  %%xmm1, %%xmm0      \n\t"\
+            "psraw      $2, %%xmm0      \n\t"\
+            "psubw  %%xmm1, %%xmm0      \n\t"\
+            "paddw  %%xmm2, %%xmm0      \n\t"\
+            "psraw      $2, %%xmm0      \n\t"\
+            "paddw  %%xmm2, %%xmm0      \n\t"\
+            "psraw      $6, %%xmm0      \n\t"\
+            "packuswb %%xmm0, %%xmm0    \n\t"\
+            OP(%%xmm0, (%1), %%xmm7, q)\
+            "add $48, %0                \n\t"\
+            "add %3, %1                 \n\t"\
+            "decl %2                    \n\t"\
+            " jnz 1b                    \n\t"\
+            : "+a"(tmp), "+c"(dst), "+g"(h)\
+            : "S"((long)dstStride)\
+            : "memory"\
+        );\
+    }\
+}
+
+#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
+static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
+          put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
+    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
+}\
+static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
+}\
+static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
+    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
+}\
+
+#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
+#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
+#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
+#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
+#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
+#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
+#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
+#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2
+
+#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
+#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
+#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
+#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
+#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
+#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
+#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
+#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2
+
+#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
+#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
+#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
+#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2
+
+#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
+#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
+#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
+#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2
+
+#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
+#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
+
+#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
+H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
+H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
+
+static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
+    put_pixels16_sse2(dst, src, stride, 16);
+}
+static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
+    avg_pixels16_sse2(dst, src, stride, 16);
+}
+#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
+#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
+
+#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
+static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
+    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
+}\
+
+#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
 }\
@@ -1254,12 +1769,12 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *
 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
 }\
-\
+
+#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*SIZE/8];\
-    uint8_t * const half= (uint8_t*)temp;\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
@@ -1267,82 +1782,91 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*SIZE/8];\
-    uint8_t * const half= (uint8_t*)temp;\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
-    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, half, stride, stride, SIZE);\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
 }\
-\
+
+#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*SIZE/8];\
-    uint8_t * const halfV= (uint8_t*)temp;\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfV, stride, SIZE);\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*SIZE/8];\
-    uint8_t * const halfV= (uint8_t*)temp;\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfV, stride, SIZE);\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*SIZE/8];\
-    uint8_t * const halfV= (uint8_t*)temp;\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfV, stride, SIZE);\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*SIZE/8];\
-    uint8_t * const halfV= (uint8_t*)temp;\
-    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
-    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfV, stride, SIZE);\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
+    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
+    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*(SIZE<8?12:24)/4];\
-    int16_t * const tmp= (int16_t*)temp;\
-    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\
+    DECLARE_ALIGNED(ALIGN, uint16_t, temp[SIZE*(SIZE<8?12:24)]);\
+    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
-    uint8_t * const halfHV= (uint8_t*)temp;\
-    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE/2;\
-    assert((int)temp & 7 == 0);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+    uint8_t * const halfHV= temp;\
+    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+    assert(((int)temp & 7) == 0);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
-    uint8_t * const halfHV= (uint8_t*)temp;\
-    int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE/2;\
-    assert((int)temp & 7 == 0);\
-    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+    uint8_t * const halfHV= temp;\
+    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+    assert(((int)temp & 7) == 0);\
+    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
-    int16_t * const halfV= ((int16_t*)temp) + SIZE*SIZE/2;\
-    uint8_t * const halfHV= ((uint8_t*)temp);\
-    assert((int)temp & 7 == 0);\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+    uint8_t * const halfHV= temp;\
+    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+    assert(((int)temp & 7) == 0);\
     put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
 }\
 \
 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
-    uint64_t temp[SIZE*(SIZE<8?12:24)/4 + SIZE*SIZE/8];\
-    int16_t * const halfV= ((int16_t*)temp) + SIZE*SIZE/2;\
-    uint8_t * const halfHV= ((uint8_t*)temp);\
-    assert((int)temp & 7 == 0);\
+    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
+    uint8_t * const halfHV= temp;\
+    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
+    assert(((int)temp & 7) == 0);\
     put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
     OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
 }\
 
+#define H264_MC_4816(MMX)\
+H264_MC(put_, 4, MMX, 8)\
+H264_MC(put_, 8, MMX, 8)\
+H264_MC(put_, 16,MMX, 8)\
+H264_MC(avg_, 4, MMX, 8)\
+H264_MC(avg_, 8, MMX, 8)\
+H264_MC(avg_, 16,MMX, 8)\
+
+#define H264_MC_816(QPEL, XMM)\
+QPEL(put_, 8, XMM, 16)\
+QPEL(put_, 16,XMM, 16)\
+QPEL(avg_, 8, XMM, 16)\
+QPEL(avg_, 16,XMM, 16)\
+
 
 #define AVG_3DNOW_OP(a,b,temp, size) \
 "mov" #size " " #b ", " #temp "   \n\t"\
@@ -1360,20 +1884,28 @@ QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
 #define PAVGB "pavgb"
 QPEL_H264(put_,       PUT_OP, mmx2)
 QPEL_H264(avg_,  AVG_MMX2_OP, mmx2)
+QPEL_H264_V_XMM(put_,       PUT_OP, sse2)
+QPEL_H264_V_XMM(avg_,  AVG_MMX2_OP, sse2)
+QPEL_H264_HV_XMM(put_,       PUT_OP, sse2)
+QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, sse2)
+#ifdef HAVE_SSSE3
+QPEL_H264_H_XMM(put_,       PUT_OP, ssse3)
+QPEL_H264_H_XMM(avg_,  AVG_MMX2_OP, ssse3)
+QPEL_H264_HV2_XMM(put_,       PUT_OP, ssse3)
+QPEL_H264_HV2_XMM(avg_,  AVG_MMX2_OP, ssse3)
+QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
+QPEL_H264_HV_XMM(avg_,  AVG_MMX2_OP, ssse3)
+#endif
 #undef PAVGB
 
-H264_MC(put_, 4, 3dnow)
-H264_MC(put_, 8, 3dnow)
-H264_MC(put_, 16,3dnow)
-H264_MC(avg_, 4, 3dnow)
-H264_MC(avg_, 8, 3dnow)
-H264_MC(avg_, 16,3dnow)
-H264_MC(put_, 4, mmx2)
-H264_MC(put_, 8, mmx2)
-H264_MC(put_, 16,mmx2)
-H264_MC(avg_, 4, mmx2)
-H264_MC(avg_, 8, mmx2)
-H264_MC(avg_, 16,mmx2)
+H264_MC_4816(3dnow)
+H264_MC_4816(mmx2)
+H264_MC_816(H264_MC_V, sse2)
+H264_MC_816(H264_MC_HV, sse2)
+#ifdef HAVE_SSSE3
+H264_MC_816(H264_MC_H, ssse3)
+H264_MC_816(H264_MC_HV, ssse3)
+#endif
 
 
 #define H264_CHROMA_OP(S,D)
@@ -1383,6 +1915,16 @@ H264_MC(avg_, 16,mmx2)
 #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
 #define H264_CHROMA_MC8_MV0 put_pixels8_mmx
 #include "dsputil_h264_template_mmx.c"
+
+static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 1);
+}
+static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 0);
+}
+
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
 #undef H264_CHROMA_MC8_TMPL
@@ -1398,6 +1940,10 @@ H264_MC(avg_, 16,mmx2)
 #define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
 #define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
 #include "dsputil_h264_template_mmx.c"
+static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_mc8_mmx2(dst, src, stride, h, x, y, 1);
+}
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
 #undef H264_CHROMA_MC8_TMPL
@@ -1412,6 +1958,10 @@ H264_MC(avg_, 16,mmx2)
 #define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_3dnow
 #define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
 #include "dsputil_h264_template_mmx.c"
+static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_mc8_3dnow(dst, src, stride, h, x, y, 1);
+}
 #undef H264_CHROMA_OP
 #undef H264_CHROMA_OP4
 #undef H264_CHROMA_MC8_TMPL