Diffstat (limited to 'src/libffmpeg/libavcodec/i386/dsputil_mmx.c')
 -rw-r--r--  src/libffmpeg/libavcodec/i386/dsputil_mmx.c | 474
 1 file changed, 460 insertions(+), 14 deletions(-)
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index a1e1642d5..efa022557 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -1,6 +1,7 @@
/*
* MMX optimized DSP utils
* Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -22,6 +23,11 @@
#include "../dsputil.h"
#include "../simple_idct.h"
+//#undef NDEBUG
+//#include <assert.h>
+
+extern const uint8_t ff_h263_loop_filter_strength[32];
+
int mm_flags; /* multimedia extension flags */
/* pixel operations */
@@ -34,6 +40,8 @@ static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003
static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
+static const uint64_t ff_pb_FC __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
+
#define JUMPALIGN() __asm __volatile (".balign 8"::)
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
@@ -465,6 +473,180 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
dst[i+0] += src[i+0];
}
+#define H263_LOOP_FILTER \
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %0, %%mm0 \n\t"\
+ "movq %0, %%mm1 \n\t"\
+ "movq %3, %%mm2 \n\t"\
+ "movq %3, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "psubw %%mm2, %%mm0 \n\t"\
+ "psubw %%mm3, %%mm1 \n\t"\
+ "movq %1, %%mm2 \n\t"\
+ "movq %1, %%mm3 \n\t"\
+ "movq %2, %%mm4 \n\t"\
+ "movq %2, %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm7, %%mm5 \n\t"\
+ "psubw %%mm2, %%mm4 \n\t"\
+ "psubw %%mm3, %%mm5 \n\t"\
+ "psllw $2, %%mm4 \n\t"\
+ "psllw $2, %%mm5 \n\t"\
+ "paddw %%mm0, %%mm4 \n\t"\
+ "paddw %%mm1, %%mm5 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "pcmpgtw %%mm4, %%mm6 \n\t"\
+ "pcmpgtw %%mm5, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm4 \n\t"\
+ "pxor %%mm7, %%mm5 \n\t"\
+ "psubw %%mm6, %%mm4 \n\t"\
+ "psubw %%mm7, %%mm5 \n\t"\
+ "psrlw $3, %%mm4 \n\t"\
+ "psrlw $3, %%mm5 \n\t"\
+ "packuswb %%mm5, %%mm4 \n\t"\
+ "packsswb %%mm7, %%mm6 \n\t"\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd %4, %%mm2 \n\t"\
+ "punpcklbw %%mm2, %%mm2 \n\t"\
+ "punpcklbw %%mm2, %%mm2 \n\t"\
+ "punpcklbw %%mm2, %%mm2 \n\t"\
+ "psubusb %%mm4, %%mm2 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "psubusb %%mm4, %%mm3 \n\t"\
+ "psubb %%mm3, %%mm2 \n\t"\
+ "movq %1, %%mm3 \n\t"\
+ "movq %2, %%mm4 \n\t"\
+ "pxor %%mm6, %%mm3 \n\t"\
+ "pxor %%mm6, %%mm4 \n\t"\
+ "paddusb %%mm2, %%mm3 \n\t"\
+ "psubusb %%mm2, %%mm4 \n\t"\
+ "pxor %%mm6, %%mm3 \n\t"\
+ "pxor %%mm6, %%mm4 \n\t"\
+ "paddusb %%mm2, %%mm2 \n\t"\
+ "packsswb %%mm1, %%mm0 \n\t"\
+ "pcmpgtb %%mm0, %%mm7 \n\t"\
+ "pxor %%mm7, %%mm0 \n\t"\
+ "psubb %%mm7, %%mm0 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "psubusb %%mm2, %%mm0 \n\t"\
+ "psubb %%mm0, %%mm1 \n\t"\
+ "pand %5, %%mm1 \n\t"\
+ "psrlw $2, %%mm1 \n\t"\
+ "pxor %%mm7, %%mm1 \n\t"\
+ "psubb %%mm7, %%mm1 \n\t"\
+ "movq %0, %%mm5 \n\t"\
+ "movq %3, %%mm6 \n\t"\
+ "psubb %%mm1, %%mm5 \n\t"\
+ "paddb %%mm1, %%mm6 \n\t"
+
+static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
+ const int strength= ff_h263_loop_filter_strength[qscale];
+
+ asm volatile(
+
+ H263_LOOP_FILTER
+
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %0 \n\t"
+ "movq %%mm6, %3 \n\t"
+ : "+m" (*(uint64_t*)(src - 2*stride)),
+ "+m" (*(uint64_t*)(src - 1*stride)),
+ "+m" (*(uint64_t*)(src + 0*stride)),
+ "+m" (*(uint64_t*)(src + 1*stride))
+ : "g" (2*strength), "m"(ff_pb_FC)
+ );
+}
+
+static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
+ asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
+ "movd %4, %%mm0 \n\t"
+ "movd %5, %%mm1 \n\t"
+ "movd %6, %%mm2 \n\t"
+ "movd %7, %%mm3 \n\t"
+ "punpcklbw %%mm1, %%mm0 \n\t"
+ "punpcklbw %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "punpcklwd %%mm2, %%mm0 \n\t"
+ "punpckhwd %%mm2, %%mm1 \n\t"
+ "movd %%mm0, %0 \n\t"
+ "punpckhdq %%mm0, %%mm0 \n\t"
+ "movd %%mm0, %1 \n\t"
+ "movd %%mm1, %2 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movd %%mm1, %3 \n\t"
+
+ : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
+ "=m" (*(uint32_t*)(dst + 1*dst_stride)),
+ "=m" (*(uint32_t*)(dst + 2*dst_stride)),
+ "=m" (*(uint32_t*)(dst + 3*dst_stride))
+ : "m" (*(uint32_t*)(src + 0*src_stride)),
+ "m" (*(uint32_t*)(src + 1*src_stride)),
+ "m" (*(uint32_t*)(src + 2*src_stride)),
+ "m" (*(uint32_t*)(src + 3*src_stride))
+ );
+}
+
+static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
+ const int strength= ff_h263_loop_filter_strength[qscale];
+ uint64_t temp[4] __attribute__ ((aligned(8)));
+ uint8_t *btemp= (uint8_t*)temp;
+
+ src -= 2;
+
+ transpose4x4(btemp , src , 8, stride);
+ transpose4x4(btemp+4, src + 4*stride, 8, stride);
+ asm volatile(
+ H263_LOOP_FILTER // 5 3 4 6
+
+ : "+m" (temp[0]),
+ "+m" (temp[1]),
+ "+m" (temp[2]),
+ "+m" (temp[3])
+ : "g" (2*strength), "m"(ff_pb_FC)
+ );
+
+ asm volatile(
+ "movq %%mm5, %%mm1 \n\t"
+ "movq %%mm4, %%mm0 \n\t"
+ "punpcklbw %%mm3, %%mm5 \n\t"
+ "punpcklbw %%mm6, %%mm4 \n\t"
+ "punpckhbw %%mm3, %%mm1 \n\t"
+ "punpckhbw %%mm6, %%mm0 \n\t"
+ "movq %%mm5, %%mm3 \n\t"
+ "movq %%mm1, %%mm6 \n\t"
+ "punpcklwd %%mm4, %%mm5 \n\t"
+ "punpcklwd %%mm0, %%mm1 \n\t"
+ "punpckhwd %%mm4, %%mm3 \n\t"
+ "punpckhwd %%mm0, %%mm6 \n\t"
+ "movd %%mm5, %0 \n\t"
+ "punpckhdq %%mm5, %%mm5 \n\t"
+ "movd %%mm5, %1 \n\t"
+ "movd %%mm3, %2 \n\t"
+ "punpckhdq %%mm3, %%mm3 \n\t"
+ "movd %%mm3, %3 \n\t"
+ "movd %%mm1, %4 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movd %%mm1, %5 \n\t"
+ "movd %%mm6, %6 \n\t"
+ "punpckhdq %%mm6, %%mm6 \n\t"
+ "movd %%mm6, %7 \n\t"
+ : "=m" (*(uint32_t*)(src + 0*stride)),
+ "=m" (*(uint32_t*)(src + 1*stride)),
+ "=m" (*(uint32_t*)(src + 2*stride)),
+ "=m" (*(uint32_t*)(src + 3*stride)),
+ "=m" (*(uint32_t*)(src + 4*stride)),
+ "=m" (*(uint32_t*)(src + 5*stride)),
+ "=m" (*(uint32_t*)(src + 6*stride)),
+ "=m" (*(uint32_t*)(src + 7*stride))
+ );
+}
+
#ifdef CONFIG_ENCODERS
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
int tmp;
@@ -509,10 +691,10 @@ static int pix_norm1_mmx(uint8_t *pix, int line_size) {
return tmp;
}
-static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) {
+static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
int tmp;
asm volatile (
- "movl $16,%%ecx\n"
+ "movl %4,%%ecx\n"
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
"1:\n"
@@ -563,10 +745,252 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) {
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
"paddd %%mm7,%%mm1\n"
"movd %%mm1,%2\n"
- : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "ecx");
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" (line_size) , "m" (h)
+ : "%ecx");
return tmp;
}
+static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
+ int tmp;
+
+ assert( (((int)pix) & 7) == 0);
+ assert((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0), %%mm2\n"\
+ "movq 8(%0), %%mm3\n"\
+ "addl %2,%0\n"\
+ "movq %%mm2, " #out0 "\n"\
+ "movq %%mm3, " #out1 "\n"\
+ "psubusb " #in0 ", %%mm2\n"\
+ "psubusb " #in1 ", %%mm3\n"\
+ "psubusb " #out0 ", " #in0 "\n"\
+ "psubusb " #out1 ", " #in1 "\n"\
+ "por %%mm2, " #in0 "\n"\
+ "por %%mm3, " #in1 "\n"\
+ "movq " #in0 ", %%mm2\n"\
+ "movq " #in1 ", %%mm3\n"\
+ "punpcklbw %%mm7, " #in0 "\n"\
+ "punpcklbw %%mm7, " #in1 "\n"\
+ "punpckhbw %%mm7, %%mm2\n"\
+ "punpckhbw %%mm7, %%mm3\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw %%mm3, %%mm2\n"\
+ "paddw %%mm2, " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+
+ asm volatile (
+ "movl %3,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pxor %%mm7,%%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq 8(%0),%%mm1\n"
+ "addl %2,%0\n"
+ "subl $2, %%ecx\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movq %%mm6,%%mm0\n"
+ "psrlq $32, %%mm6\n"
+ "paddw %%mm6,%%mm0\n"
+ "movq %%mm0,%%mm6\n"
+ "psrlq $16, %%mm0\n"
+ "paddw %%mm6,%%mm0\n"
+ "movd %%mm0,%1\n"
+ : "+r" (pix), "=r"(tmp)
+ : "r" (line_size) , "m" (h)
+ : "%ecx");
+ return tmp & 0xFFFF;
+}
+#undef SUM
+
+static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
+ int tmp;
+
+ assert( (((int)pix) & 7) == 0);
+ assert((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0), " #out0 "\n"\
+ "movq 8(%0), " #out1 "\n"\
+ "addl %2,%0\n"\
+ "psadbw " #out0 ", " #in0 "\n"\
+ "psadbw " #out1 ", " #in1 "\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+ asm volatile (
+ "movl %3,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pxor %%mm7,%%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq 8(%0),%%mm1\n"
+ "addl %2,%0\n"
+ "subl $2, %%ecx\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movd %%mm6,%1\n"
+ : "+r" (pix), "=r"(tmp)
+ : "r" (line_size) , "m" (h)
+ : "%ecx");
+ return tmp;
+}
+#undef SUM
+
+static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ int tmp;
+
+ assert( (((int)pix1) & 7) == 0);
+ assert( (((int)pix2) & 7) == 0);
+ assert((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0),%%mm2\n"\
+ "movq (%1)," #out0 "\n"\
+ "movq 8(%0),%%mm3\n"\
+ "movq 8(%1)," #out1 "\n"\
+ "addl %3,%0\n"\
+ "addl %3,%1\n"\
+ "psubb " #out0 ", %%mm2\n"\
+ "psubb " #out1 ", %%mm3\n"\
+ "pxor %%mm7, %%mm2\n"\
+ "pxor %%mm7, %%mm3\n"\
+ "movq %%mm2, " #out0 "\n"\
+ "movq %%mm3, " #out1 "\n"\
+ "psubusb " #in0 ", %%mm2\n"\
+ "psubusb " #in1 ", %%mm3\n"\
+ "psubusb " #out0 ", " #in0 "\n"\
+ "psubusb " #out1 ", " #in1 "\n"\
+ "por %%mm2, " #in0 "\n"\
+ "por %%mm3, " #in1 "\n"\
+ "movq " #in0 ", %%mm2\n"\
+ "movq " #in1 ", %%mm3\n"\
+ "punpcklbw %%mm7, " #in0 "\n"\
+ "punpcklbw %%mm7, " #in1 "\n"\
+ "punpckhbw %%mm7, %%mm2\n"\
+ "punpckhbw %%mm7, %%mm3\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw %%mm3, %%mm2\n"\
+ "paddw %%mm2, " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+
+ asm volatile (
+ "movl %4,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pcmpeqw %%mm7,%%mm7\n"
+ "psllw $15, %%mm7\n"
+ "packsswb %%mm7, %%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq (%1),%%mm2\n"
+ "movq 8(%0),%%mm1\n"
+ "movq 8(%1),%%mm3\n"
+ "addl %3,%0\n"
+ "addl %3,%1\n"
+ "subl $2, %%ecx\n"
+ "psubb %%mm2, %%mm0\n"
+ "psubb %%mm3, %%mm1\n"
+ "pxor %%mm7, %%mm0\n"
+ "pxor %%mm7, %%mm1\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movq %%mm6,%%mm0\n"
+ "psrlq $32, %%mm6\n"
+ "paddw %%mm6,%%mm0\n"
+ "movq %%mm0,%%mm6\n"
+ "psrlq $16, %%mm0\n"
+ "paddw %%mm6,%%mm0\n"
+ "movd %%mm0,%2\n"
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" (line_size) , "m" (h)
+ : "%ecx");
+ return tmp & 0x7FFF;
+}
+#undef SUM
+
+static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ int tmp;
+
+ assert( (((int)pix1) & 7) == 0);
+ assert( (((int)pix2) & 7) == 0);
+ assert((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0)," #out0 "\n"\
+ "movq (%1),%%mm2\n"\
+ "movq 8(%0)," #out1 "\n"\
+ "movq 8(%1),%%mm3\n"\
+ "addl %3,%0\n"\
+ "addl %3,%1\n"\
+ "psubb %%mm2, " #out0 "\n"\
+ "psubb %%mm3, " #out1 "\n"\
+ "pxor %%mm7, " #out0 "\n"\
+ "pxor %%mm7, " #out1 "\n"\
+ "psadbw " #out0 ", " #in0 "\n"\
+ "psadbw " #out1 ", " #in1 "\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+ asm volatile (
+ "movl %4,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pcmpeqw %%mm7,%%mm7\n"
+ "psllw $15, %%mm7\n"
+ "packsswb %%mm7, %%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq (%1),%%mm2\n"
+ "movq 8(%0),%%mm1\n"
+ "movq 8(%1),%%mm3\n"
+ "addl %3,%0\n"
+ "addl %3,%1\n"
+ "subl $2, %%ecx\n"
+ "psubb %%mm2, %%mm0\n"
+ "psubb %%mm3, %%mm1\n"
+ "pxor %%mm7, %%mm0\n"
+ "pxor %%mm7, %%mm1\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movd %%mm6,%2\n"
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" (line_size) , "m" (h)
+ : "%ecx");
+ return tmp;
+}
+#undef SUM
+
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
int i=0;
asm volatile(
@@ -588,7 +1012,6 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
for(; i<w; i++)
dst[i+0] = src1[i+0]-src2[i+0];
}
-#endif
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
int i=0;
@@ -626,8 +1049,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
*left = src2[w-1];
}
-#ifdef CONFIG_ENCODERS
-
#define LBUTTERFLY2(a1,b1,a2,b2)\
"paddw " #b1 ", " #a1 " \n\t"\
"paddw " #b2 ", " #a2 " \n\t"\
@@ -691,9 +1112,11 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
"movq "#c", "#o"+32(%1) \n\t"\
"movq "#d", "#o"+48(%1) \n\t"\
-static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
+static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
uint64_t temp[16] __align8;
int sum=0;
+
+ assert(h==8);
diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
@@ -776,9 +1199,11 @@ static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride)
return sum&0xFFFF;
}
-static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride){
+static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
uint64_t temp[16] __align8;
int sum=0;
+
+ assert(h==8);
diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
@@ -862,8 +1287,8 @@ static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride
}
-WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
-WARPER88_1616(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
+WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
+WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
#endif //CONFIG_ENCODERS
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
@@ -1602,12 +2027,19 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#endif
if (mm_flags & MM_MMX) {
- const int idct_algo= avctx->idct_algo;
-#ifdef CONFIG_ENCODERS
const int dct_algo = avctx->dct_algo;
+ const int idct_algo= avctx->idct_algo;
- if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)
- c->fdct = ff_fdct_mmx;
+#ifdef CONFIG_ENCODERS
+ if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
+ if(mm_flags & MM_SSE2){
+ c->fdct = ff_fdct_sse2;
+ }else if(mm_flags & MM_MMXEXT){
+ c->fdct = ff_fdct_mmx2;
+ }else{
+ c->fdct = ff_fdct_mmx;
+ }
+ }
#endif //CONFIG_ENCODERS
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
@@ -1688,7 +2120,15 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->pix_norm1 = pix_norm1_mmx;
c->sse[0] = sse16_mmx;
+ c->vsad[4]= vsad_intra16_mmx;
+
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->vsad[0] = vsad16_mmx;
+ }
#endif //CONFIG_ENCODERS
+
+ c->h263_v_loop_filter= h263_v_loop_filter_mmx;
+ c->h263_h_loop_filter= h263_h_loop_filter_mmx;
if (mm_flags & MM_MMXEXT) {
c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
@@ -1708,6 +2148,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#ifdef CONFIG_ENCODERS
c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
c->hadamard8_diff[1]= hadamard8_diff_mmx2;
+ c->vsad[4]= vsad_intra16_mmx2;
#endif //CONFIG_ENCODERS
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
@@ -1717,6 +2158,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
+#ifdef CONFIG_ENCODERS
+ c->vsad[0] = vsad16_mmx2;
+#endif //CONFIG_ENCODERS
}
#if 1
@@ -1754,7 +2198,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
#endif
+#ifdef CONFIG_ENCODERS
c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
+#endif //CONFIG_ENCODERS
} else if (mm_flags & MM_3DNOW) {
c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;