path: root/src/libffmpeg/libavcodec/i386
author    Mike Melanson <mike@multimedia.cx>    2004-02-01 05:31:16 +0000
committer Mike Melanson <mike@multimedia.cx>    2004-02-01 05:31:16 +0000
commit    61d793ef13ac2ef8f9c2b41b71430f21fac80337 (patch)
tree      69a1b20b01993b4d61fe1c8c2be33dd2e362a40c /src/libffmpeg/libavcodec/i386
parent    f707774ac5d48c02c6a36327304d88629b0e38f9 (diff)
sync to ffmpeg build 4699
CVS patchset: 6090
CVS date: 2004/02/01 05:31:16
Diffstat (limited to 'src/libffmpeg/libavcodec/i386')
-rw-r--r--  src/libffmpeg/libavcodec/i386/cputest.c                | 115
-rw-r--r--  src/libffmpeg/libavcodec/i386/dsputil_mmx.c            | 474
-rw-r--r--  src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h        |   2
-rw-r--r--  src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h        |   1
-rw-r--r--  src/libffmpeg/libavcodec/i386/fdct_mmx.c               | 501
-rw-r--r--  src/libffmpeg/libavcodec/i386/idct_mmx.c               |   7
-rw-r--r--  src/libffmpeg/libavcodec/i386/mmx.h                    | 244
-rw-r--r--  src/libffmpeg/libavcodec/i386/motion_est_mmx.c         | 123
-rw-r--r--  src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c          | 297
-rw-r--r--  src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c |  13
-rw-r--r--  src/libffmpeg/libavcodec/i386/simple_idct_mmx.c        |   1
11 files changed, 1510 insertions, 268 deletions
diff --git a/src/libffmpeg/libavcodec/i386/cputest.c b/src/libffmpeg/libavcodec/i386/cputest.c
index b885548ee..b50d653c4 100644
--- a/src/libffmpeg/libavcodec/i386/cputest.c
+++ b/src/libffmpeg/libavcodec/i386/cputest.c
@@ -1,13 +1,122 @@
-/* dummy file to use xine mm_support function */
+/* Cpu detection code, extracted from mmx.h ((c)1997-99 by H. Dietz
+ and R. Fisher). Converted to C and improved by Fabrice Bellard */
-#include "xineutils.h"
+#include <stdlib.h>
#include "../dsputil.h"
+/* ebx saving is necessary for PIC. gcc seems unable to see it alone */
+#define cpuid(index,eax,ebx,ecx,edx)\
+ __asm __volatile\
+ ("movl %%ebx, %%esi\n\t"\
+ "cpuid\n\t"\
+ "xchgl %%ebx, %%esi"\
+ : "=a" (eax), "=S" (ebx),\
+ "=c" (ecx), "=d" (edx)\
+ : "0" (index));
/* Function to test if multimedia instructions are supported... */
int mm_support(void)
{
- return xine_mm_accel();
+ int rval;
+ int eax, ebx, ecx, edx;
+
+ __asm__ __volatile__ (
+ /* See if CPUID instruction is supported ... */
+ /* ... Get copies of EFLAGS into eax and ecx */
+ "pushf\n\t"
+ "popl %0\n\t"
+ "movl %0, %1\n\t"
+
+ /* ... Toggle the ID bit in one copy and store */
+ /* to the EFLAGS reg */
+ "xorl $0x200000, %0\n\t"
+ "push %0\n\t"
+ "popf\n\t"
+
+ /* ... Get the (hopefully modified) EFLAGS */
+ "pushf\n\t"
+ "popl %0\n\t"
+ : "=a" (eax), "=c" (ecx)
+ :
+ : "cc"
+ );
+
+ if (eax == ecx)
+ return 0; /* CPUID not supported */
+
+ cpuid(0, eax, ebx, ecx, edx);
+
+ if (ebx == 0x756e6547 &&
+ edx == 0x49656e69 &&
+ ecx == 0x6c65746e) {
+
+ /* intel */
+ inteltest:
+ cpuid(1, eax, ebx, ecx, edx);
+ if ((edx & 0x00800000) == 0)
+ return 0;
+ rval = MM_MMX;
+ if (edx & 0x02000000)
+ rval |= MM_MMXEXT | MM_SSE;
+ if (edx & 0x04000000)
+ rval |= MM_SSE2;
+ return rval;
+ } else if (ebx == 0x68747541 &&
+ edx == 0x69746e65 &&
+ ecx == 0x444d4163) {
+ /* AMD */
+ cpuid(0x80000000, eax, ebx, ecx, edx);
+ if ((unsigned)eax < 0x80000001)
+ goto inteltest;
+ cpuid(0x80000001, eax, ebx, ecx, edx);
+ if ((edx & 0x00800000) == 0)
+ return 0;
+ rval = MM_MMX;
+ if (edx & 0x80000000)
+ rval |= MM_3DNOW;
+ if (edx & 0x00400000)
+ rval |= MM_MMXEXT;
+ return rval;
+ } else if (ebx == 0x746e6543 &&
+ edx == 0x48727561 &&
+ ecx == 0x736c7561) { /* "CentaurHauls" */
+ /* VIA C3 */
+ cpuid(0x80000000, eax, ebx, ecx, edx);
+ if ((unsigned)eax < 0x80000001)
+ goto inteltest;
+ cpuid(0x80000001, eax, ebx, ecx, edx);
+ rval = 0;
+ if( edx & ( 1 << 31) )
+ rval |= MM_3DNOW;
+ if( edx & ( 1 << 23) )
+ rval |= MM_MMX;
+ if( edx & ( 1 << 24) )
+ rval |= MM_MMXEXT;
+ return rval;
+ } else if (ebx == 0x69727943 &&
+ edx == 0x736e4978 &&
+ ecx == 0x64616574) {
+ /* Cyrix Section */
+ /* See if extended CPUID level 80000001 is supported */
+ /* The value of CPUID/80000001 for the 6x86MX is undefined
+ according to the Cyrix CPU Detection Guide (Preliminary
+ Rev. 1.01 table 1), so we'll check the value of eax for
+ CPUID/0 to see if standard CPUID level 2 is supported.
+ According to the table, the only CPU which supports level
+ 2 is also the only one which supports extended CPUID levels.
+ */
+ if (eax != 2)
+ goto inteltest;
+ cpuid(0x80000001, eax, ebx, ecx, edx);
+ if ((eax & 0x00800000) == 0)
+ return 0;
+ rval = MM_MMX;
+ if (eax & 0x01000000)
+ rval |= MM_MMXEXT;
+ return rval;
+ } else {
+ return 0;
+ }
}
#ifdef __TEST__
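
For reference, the magic numbers in the vendor check above are the raw CPUID leaf-0 register words: read in EBX, EDX, ECX order on a little-endian x86 they spell the 12-byte vendor id, and the masks tested afterwards are the usual feature bits (leaf 1 EDX bit 23 = MMX, bit 25 = SSE, bit 26 = SSE2; AMD leaf 0x80000001 EDX bit 31 = 3DNow!, bit 22 = extended MMX). A minimal decoding sketch, not part of the patch:

#include <stdio.h>
#include <string.h>

/* Sketch only: turn the three CPUID leaf-0 words back into the vendor string. */
static void print_vendor(unsigned ebx, unsigned edx, unsigned ecx)
{
    char id[13];
    memcpy(id + 0, &ebx, 4);   /* 0x756e6547 -> "Genu" */
    memcpy(id + 4, &edx, 4);   /* 0x49656e69 -> "ineI" */
    memcpy(id + 8, &ecx, 4);   /* 0x6c65746e -> "ntel" */
    id[12] = '\0';
    printf("%s\n", id);        /* GenuineIntel, AuthenticAMD, CentaurHauls, CyrixInstead */
}
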
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index a1e1642d5..efa022557 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -1,6 +1,7 @@
/*
* MMX optimized DSP utils
* Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -22,6 +23,11 @@
#include "../dsputil.h"
#include "../simple_idct.h"
+//#undef NDEBUG
+//#include <assert.h>
+
+extern const uint8_t ff_h263_loop_filter_strength[32];
+
int mm_flags; /* multimedia extension flags */
/* pixel operations */
@@ -34,6 +40,8 @@ static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003
static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
+static const uint64_t ff_pb_FC __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
+
#define JUMPALIGN() __asm __volatile (".balign 8"::)
#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
@@ -465,6 +473,180 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
dst[i+0] += src[i+0];
}
+#define H263_LOOP_FILTER \
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movq %0, %%mm0 \n\t"\
+ "movq %0, %%mm1 \n\t"\
+ "movq %3, %%mm2 \n\t"\
+ "movq %3, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm0 \n\t"\
+ "punpckhbw %%mm7, %%mm1 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "psubw %%mm2, %%mm0 \n\t"\
+ "psubw %%mm3, %%mm1 \n\t"\
+ "movq %1, %%mm2 \n\t"\
+ "movq %1, %%mm3 \n\t"\
+ "movq %2, %%mm4 \n\t"\
+ "movq %2, %%mm5 \n\t"\
+ "punpcklbw %%mm7, %%mm2 \n\t"\
+ "punpckhbw %%mm7, %%mm3 \n\t"\
+ "punpcklbw %%mm7, %%mm4 \n\t"\
+ "punpckhbw %%mm7, %%mm5 \n\t"\
+ "psubw %%mm2, %%mm4 \n\t"\
+ "psubw %%mm3, %%mm5 \n\t"\
+ "psllw $2, %%mm4 \n\t"\
+ "psllw $2, %%mm5 \n\t"\
+ "paddw %%mm0, %%mm4 \n\t"\
+ "paddw %%mm1, %%mm5 \n\t"\
+ "pxor %%mm6, %%mm6 \n\t"\
+ "pcmpgtw %%mm4, %%mm6 \n\t"\
+ "pcmpgtw %%mm5, %%mm7 \n\t"\
+ "pxor %%mm6, %%mm4 \n\t"\
+ "pxor %%mm7, %%mm5 \n\t"\
+ "psubw %%mm6, %%mm4 \n\t"\
+ "psubw %%mm7, %%mm5 \n\t"\
+ "psrlw $3, %%mm4 \n\t"\
+ "psrlw $3, %%mm5 \n\t"\
+ "packuswb %%mm5, %%mm4 \n\t"\
+ "packsswb %%mm7, %%mm6 \n\t"\
+ "pxor %%mm7, %%mm7 \n\t"\
+ "movd %4, %%mm2 \n\t"\
+ "punpcklbw %%mm2, %%mm2 \n\t"\
+ "punpcklbw %%mm2, %%mm2 \n\t"\
+ "punpcklbw %%mm2, %%mm2 \n\t"\
+ "psubusb %%mm4, %%mm2 \n\t"\
+ "movq %%mm2, %%mm3 \n\t"\
+ "psubusb %%mm4, %%mm3 \n\t"\
+ "psubb %%mm3, %%mm2 \n\t"\
+ "movq %1, %%mm3 \n\t"\
+ "movq %2, %%mm4 \n\t"\
+ "pxor %%mm6, %%mm3 \n\t"\
+ "pxor %%mm6, %%mm4 \n\t"\
+ "paddusb %%mm2, %%mm3 \n\t"\
+ "psubusb %%mm2, %%mm4 \n\t"\
+ "pxor %%mm6, %%mm3 \n\t"\
+ "pxor %%mm6, %%mm4 \n\t"\
+ "paddusb %%mm2, %%mm2 \n\t"\
+ "packsswb %%mm1, %%mm0 \n\t"\
+ "pcmpgtb %%mm0, %%mm7 \n\t"\
+ "pxor %%mm7, %%mm0 \n\t"\
+ "psubb %%mm7, %%mm0 \n\t"\
+ "movq %%mm0, %%mm1 \n\t"\
+ "psubusb %%mm2, %%mm0 \n\t"\
+ "psubb %%mm0, %%mm1 \n\t"\
+ "pand %5, %%mm1 \n\t"\
+ "psrlw $2, %%mm1 \n\t"\
+ "pxor %%mm7, %%mm1 \n\t"\
+ "psubb %%mm7, %%mm1 \n\t"\
+ "movq %0, %%mm5 \n\t"\
+ "movq %3, %%mm6 \n\t"\
+ "psubb %%mm1, %%mm5 \n\t"\
+ "paddb %%mm1, %%mm6 \n\t"
+
+static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
+ const int strength= ff_h263_loop_filter_strength[qscale];
+
+ asm volatile(
+
+ H263_LOOP_FILTER
+
+ "movq %%mm3, %1 \n\t"
+ "movq %%mm4, %2 \n\t"
+ "movq %%mm5, %0 \n\t"
+ "movq %%mm6, %3 \n\t"
+ : "+m" (*(uint64_t*)(src - 2*stride)),
+ "+m" (*(uint64_t*)(src - 1*stride)),
+ "+m" (*(uint64_t*)(src + 0*stride)),
+ "+m" (*(uint64_t*)(src + 1*stride))
+ : "g" (2*strength), "m"(ff_pb_FC)
+ );
+}
+
+static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
+ asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
+ "movd %4, %%mm0 \n\t"
+ "movd %5, %%mm1 \n\t"
+ "movd %6, %%mm2 \n\t"
+ "movd %7, %%mm3 \n\t"
+ "punpcklbw %%mm1, %%mm0 \n\t"
+ "punpcklbw %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "punpcklwd %%mm2, %%mm0 \n\t"
+ "punpckhwd %%mm2, %%mm1 \n\t"
+ "movd %%mm0, %0 \n\t"
+ "punpckhdq %%mm0, %%mm0 \n\t"
+ "movd %%mm0, %1 \n\t"
+ "movd %%mm1, %2 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movd %%mm1, %3 \n\t"
+
+ : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
+ "=m" (*(uint32_t*)(dst + 1*dst_stride)),
+ "=m" (*(uint32_t*)(dst + 2*dst_stride)),
+ "=m" (*(uint32_t*)(dst + 3*dst_stride))
+ : "m" (*(uint32_t*)(src + 0*src_stride)),
+ "m" (*(uint32_t*)(src + 1*src_stride)),
+ "m" (*(uint32_t*)(src + 2*src_stride)),
+ "m" (*(uint32_t*)(src + 3*src_stride))
+ );
+}
+
+static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
+ const int strength= ff_h263_loop_filter_strength[qscale];
+ uint64_t temp[4] __attribute__ ((aligned(8)));
+ uint8_t *btemp= (uint8_t*)temp;
+
+ src -= 2;
+
+ transpose4x4(btemp , src , 8, stride);
+ transpose4x4(btemp+4, src + 4*stride, 8, stride);
+ asm volatile(
+ H263_LOOP_FILTER // 5 3 4 6
+
+ : "+m" (temp[0]),
+ "+m" (temp[1]),
+ "+m" (temp[2]),
+ "+m" (temp[3])
+ : "g" (2*strength), "m"(ff_pb_FC)
+ );
+
+ asm volatile(
+ "movq %%mm5, %%mm1 \n\t"
+ "movq %%mm4, %%mm0 \n\t"
+ "punpcklbw %%mm3, %%mm5 \n\t"
+ "punpcklbw %%mm6, %%mm4 \n\t"
+ "punpckhbw %%mm3, %%mm1 \n\t"
+ "punpckhbw %%mm6, %%mm0 \n\t"
+ "movq %%mm5, %%mm3 \n\t"
+ "movq %%mm1, %%mm6 \n\t"
+ "punpcklwd %%mm4, %%mm5 \n\t"
+ "punpcklwd %%mm0, %%mm1 \n\t"
+ "punpckhwd %%mm4, %%mm3 \n\t"
+ "punpckhwd %%mm0, %%mm6 \n\t"
+ "movd %%mm5, %0 \n\t"
+ "punpckhdq %%mm5, %%mm5 \n\t"
+ "movd %%mm5, %1 \n\t"
+ "movd %%mm3, %2 \n\t"
+ "punpckhdq %%mm3, %%mm3 \n\t"
+ "movd %%mm3, %3 \n\t"
+ "movd %%mm1, %4 \n\t"
+ "punpckhdq %%mm1, %%mm1 \n\t"
+ "movd %%mm1, %5 \n\t"
+ "movd %%mm6, %6 \n\t"
+ "punpckhdq %%mm6, %%mm6 \n\t"
+ "movd %%mm6, %7 \n\t"
+ : "=m" (*(uint32_t*)(src + 0*stride)),
+ "=m" (*(uint32_t*)(src + 1*stride)),
+ "=m" (*(uint32_t*)(src + 2*stride)),
+ "=m" (*(uint32_t*)(src + 3*stride)),
+ "=m" (*(uint32_t*)(src + 4*stride)),
+ "=m" (*(uint32_t*)(src + 5*stride)),
+ "=m" (*(uint32_t*)(src + 6*stride)),
+ "=m" (*(uint32_t*)(src + 7*stride))
+ );
+}
+
#ifdef CONFIG_ENCODERS
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
int tmp;
@@ -509,10 +691,10 @@ static int pix_norm1_mmx(uint8_t *pix, int line_size) {
return tmp;
}
-static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) {
+static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
int tmp;
asm volatile (
- "movl $16,%%ecx\n"
+ "movl %4,%%ecx\n"
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
"1:\n"
@@ -563,10 +745,252 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) {
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
"paddd %%mm7,%%mm1\n"
"movd %%mm1,%2\n"
- : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "ecx");
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" (line_size) , "m" (h)
+ : "%ecx");
return tmp;
}
+static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
+ int tmp;
+
+ assert( (((int)pix) & 7) == 0);
+ assert((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0), %%mm2\n"\
+ "movq 8(%0), %%mm3\n"\
+ "addl %2,%0\n"\
+ "movq %%mm2, " #out0 "\n"\
+ "movq %%mm3, " #out1 "\n"\
+ "psubusb " #in0 ", %%mm2\n"\
+ "psubusb " #in1 ", %%mm3\n"\
+ "psubusb " #out0 ", " #in0 "\n"\
+ "psubusb " #out1 ", " #in1 "\n"\
+ "por %%mm2, " #in0 "\n"\
+ "por %%mm3, " #in1 "\n"\
+ "movq " #in0 ", %%mm2\n"\
+ "movq " #in1 ", %%mm3\n"\
+ "punpcklbw %%mm7, " #in0 "\n"\
+ "punpcklbw %%mm7, " #in1 "\n"\
+ "punpckhbw %%mm7, %%mm2\n"\
+ "punpckhbw %%mm7, %%mm3\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw %%mm3, %%mm2\n"\
+ "paddw %%mm2, " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+
+ asm volatile (
+ "movl %3,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pxor %%mm7,%%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq 8(%0),%%mm1\n"
+ "addl %2,%0\n"
+ "subl $2, %%ecx\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movq %%mm6,%%mm0\n"
+ "psrlq $32, %%mm6\n"
+ "paddw %%mm6,%%mm0\n"
+ "movq %%mm0,%%mm6\n"
+ "psrlq $16, %%mm0\n"
+ "paddw %%mm6,%%mm0\n"
+ "movd %%mm0,%1\n"
+ : "+r" (pix), "=r"(tmp)
+ : "r" (line_size) , "m" (h)
+ : "%ecx");
+ return tmp & 0xFFFF;
+}
+#undef SUM
+
+static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
+ int tmp;
+
+ assert( (((int)pix) & 7) == 0);
+ assert((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0), " #out0 "\n"\
+ "movq 8(%0), " #out1 "\n"\
+ "addl %2,%0\n"\
+ "psadbw " #out0 ", " #in0 "\n"\
+ "psadbw " #out1 ", " #in1 "\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+ asm volatile (
+ "movl %3,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pxor %%mm7,%%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq 8(%0),%%mm1\n"
+ "addl %2,%0\n"
+ "subl $2, %%ecx\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movd %%mm6,%1\n"
+ : "+r" (pix), "=r"(tmp)
+ : "r" (line_size) , "m" (h)
+ : "%ecx");
+ return tmp;
+}
+#undef SUM
+
+static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ int tmp;
+
+ assert( (((int)pix1) & 7) == 0);
+ assert( (((int)pix2) & 7) == 0);
+ assert((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0),%%mm2\n"\
+ "movq (%1)," #out0 "\n"\
+ "movq 8(%0),%%mm3\n"\
+ "movq 8(%1)," #out1 "\n"\
+ "addl %3,%0\n"\
+ "addl %3,%1\n"\
+ "psubb " #out0 ", %%mm2\n"\
+ "psubb " #out1 ", %%mm3\n"\
+ "pxor %%mm7, %%mm2\n"\
+ "pxor %%mm7, %%mm3\n"\
+ "movq %%mm2, " #out0 "\n"\
+ "movq %%mm3, " #out1 "\n"\
+ "psubusb " #in0 ", %%mm2\n"\
+ "psubusb " #in1 ", %%mm3\n"\
+ "psubusb " #out0 ", " #in0 "\n"\
+ "psubusb " #out1 ", " #in1 "\n"\
+ "por %%mm2, " #in0 "\n"\
+ "por %%mm3, " #in1 "\n"\
+ "movq " #in0 ", %%mm2\n"\
+ "movq " #in1 ", %%mm3\n"\
+ "punpcklbw %%mm7, " #in0 "\n"\
+ "punpcklbw %%mm7, " #in1 "\n"\
+ "punpckhbw %%mm7, %%mm2\n"\
+ "punpckhbw %%mm7, %%mm3\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw %%mm3, %%mm2\n"\
+ "paddw %%mm2, " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+
+ asm volatile (
+ "movl %4,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pcmpeqw %%mm7,%%mm7\n"
+ "psllw $15, %%mm7\n"
+ "packsswb %%mm7, %%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq (%1),%%mm2\n"
+ "movq 8(%0),%%mm1\n"
+ "movq 8(%1),%%mm3\n"
+ "addl %3,%0\n"
+ "addl %3,%1\n"
+ "subl $2, %%ecx\n"
+ "psubb %%mm2, %%mm0\n"
+ "psubb %%mm3, %%mm1\n"
+ "pxor %%mm7, %%mm0\n"
+ "pxor %%mm7, %%mm1\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movq %%mm6,%%mm0\n"
+ "psrlq $32, %%mm6\n"
+ "paddw %%mm6,%%mm0\n"
+ "movq %%mm0,%%mm6\n"
+ "psrlq $16, %%mm0\n"
+ "paddw %%mm6,%%mm0\n"
+ "movd %%mm0,%2\n"
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" (line_size) , "m" (h)
+ : "%ecx");
+ return tmp & 0x7FFF;
+}
+#undef SUM
+
+static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ int tmp;
+
+ assert( (((int)pix1) & 7) == 0);
+ assert( (((int)pix2) & 7) == 0);
+ assert((line_size &7) ==0);
+
+#define SUM(in0, in1, out0, out1) \
+ "movq (%0)," #out0 "\n"\
+ "movq (%1),%%mm2\n"\
+ "movq 8(%0)," #out1 "\n"\
+ "movq 8(%1),%%mm3\n"\
+ "addl %3,%0\n"\
+ "addl %3,%1\n"\
+ "psubb %%mm2, " #out0 "\n"\
+ "psubb %%mm3, " #out1 "\n"\
+ "pxor %%mm7, " #out0 "\n"\
+ "pxor %%mm7, " #out1 "\n"\
+ "psadbw " #out0 ", " #in0 "\n"\
+ "psadbw " #out1 ", " #in1 "\n"\
+ "paddw " #in1 ", " #in0 "\n"\
+ "paddw " #in0 ", %%mm6\n"
+
+ asm volatile (
+ "movl %4,%%ecx\n"
+ "pxor %%mm6,%%mm6\n"
+ "pcmpeqw %%mm7,%%mm7\n"
+ "psllw $15, %%mm7\n"
+ "packsswb %%mm7, %%mm7\n"
+ "movq (%0),%%mm0\n"
+ "movq (%1),%%mm2\n"
+ "movq 8(%0),%%mm1\n"
+ "movq 8(%1),%%mm3\n"
+ "addl %3,%0\n"
+ "addl %3,%1\n"
+ "subl $2, %%ecx\n"
+ "psubb %%mm2, %%mm0\n"
+ "psubb %%mm3, %%mm1\n"
+ "pxor %%mm7, %%mm0\n"
+ "pxor %%mm7, %%mm1\n"
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+ "1:\n"
+
+ SUM(%%mm4, %%mm5, %%mm0, %%mm1)
+
+ SUM(%%mm0, %%mm1, %%mm4, %%mm5)
+
+ "subl $2, %%ecx\n"
+ "jnz 1b\n"
+
+ "movd %%mm6,%2\n"
+ : "+r" (pix1), "+r" (pix2), "=r"(tmp)
+ : "r" (line_size) , "m" (h)
+ : "%ecx");
+ return tmp;
+}
+#undef SUM
+
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
int i=0;
asm volatile(
@@ -588,7 +1012,6 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
for(; i<w; i++)
dst[i+0] = src1[i+0]-src2[i+0];
}
-#endif
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
int i=0;
@@ -626,8 +1049,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
*left = src2[w-1];
}
-#ifdef CONFIG_ENCODERS
-
#define LBUTTERFLY2(a1,b1,a2,b2)\
"paddw " #b1 ", " #a1 " \n\t"\
"paddw " #b2 ", " #a2 " \n\t"\
@@ -691,9 +1112,11 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
"movq "#c", "#o"+32(%1) \n\t"\
"movq "#d", "#o"+48(%1) \n\t"\
-static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){
+static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
uint64_t temp[16] __align8;
int sum=0;
+
+ assert(h==8);
diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
@@ -776,9 +1199,11 @@ static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride)
return sum&0xFFFF;
}
-static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride){
+static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
uint64_t temp[16] __align8;
int sum=0;
+
+ assert(h==8);
diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
@@ -862,8 +1287,8 @@ static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride
}
-WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx)
-WARPER88_1616(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
+WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
+WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
#endif //CONFIG_ENCODERS
#define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
@@ -1602,12 +2027,19 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#endif
if (mm_flags & MM_MMX) {
- const int idct_algo= avctx->idct_algo;
-#ifdef CONFIG_ENCODERS
const int dct_algo = avctx->dct_algo;
+ const int idct_algo= avctx->idct_algo;
- if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)
- c->fdct = ff_fdct_mmx;
+#ifdef CONFIG_ENCODERS
+ if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
+ if(mm_flags & MM_SSE2){
+ c->fdct = ff_fdct_sse2;
+ }else if(mm_flags & MM_MMXEXT){
+ c->fdct = ff_fdct_mmx2;
+ }else{
+ c->fdct = ff_fdct_mmx;
+ }
+ }
#endif //CONFIG_ENCODERS
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
@@ -1688,7 +2120,15 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->pix_norm1 = pix_norm1_mmx;
c->sse[0] = sse16_mmx;
+ c->vsad[4]= vsad_intra16_mmx;
+
+ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
+ c->vsad[0] = vsad16_mmx;
+ }
#endif //CONFIG_ENCODERS
+
+ c->h263_v_loop_filter= h263_v_loop_filter_mmx;
+ c->h263_h_loop_filter= h263_h_loop_filter_mmx;
if (mm_flags & MM_MMXEXT) {
c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
@@ -1708,6 +2148,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#ifdef CONFIG_ENCODERS
c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
c->hadamard8_diff[1]= hadamard8_diff_mmx2;
+ c->vsad[4]= vsad_intra16_mmx2;
#endif //CONFIG_ENCODERS
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
@@ -1717,6 +2158,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
+#ifdef CONFIG_ENCODERS
+ c->vsad[0] = vsad16_mmx2;
+#endif //CONFIG_ENCODERS
}
#if 1
@@ -1754,7 +2198,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
#endif
+#ifdef CONFIG_ENCODERS
c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
+#endif //CONFIG_ENCODERS
} else if (mm_flags & MM_3DNOW) {
c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
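
A note on the vsad_* metrics added above (a reading of the MMX code, not text from the patch): vsad_intra16 sums the absolute differences between vertically adjacent pixels of a 16-pixel-wide block, and vsad16 applies the same measure to the difference block pix1 - pix2. The plain MMX versions accumulate in 16-bit lanes, hence the final "& 0xFFFF" / "& 0x7FFF", while the MMX2 versions use psadbw and return the sum directly. A scalar sketch of the intra variant:

#include <stdint.h>
#include <stdlib.h>

/* Sketch of the metric vsad_intra16_mmx/mmx2 appear to compute: the sum of
   |p(x,y) - p(x,y-1)| over a 16-pixel-wide block of height h. */
static int vsad_intra16_ref(const uint8_t *pix, int line_size, int h)
{
    int x, y, sum = 0;
    for (y = 1; y < h; y++)
        for (x = 0; x < 16; x++)
            sum += abs(pix[y * line_size + x] - pix[(y - 1) * line_size + x]);
    return sum;
}
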
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
index 8418123ac..c8494f51a 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h
@@ -1,7 +1,7 @@
/*
* DSP utils : average functions are compiled twice for 3dnow/mmx2
* Copyright (c) 2000, 2001 Fabrice Bellard.
- * Copyright (c) 2002 Michael Niedermayer
+ * Copyright (c) 2002-2004 Michael Niedermayer
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
index bbd5aec97..21f0bfd84 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h
@@ -1,6 +1,7 @@
/*
* DSP utils mmx functions are compiled twice for rnd/no_rnd
* Copyright (c) 2000, 2001 Fabrice Bellard.
+ * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
diff --git a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
index a2402c95d..877160773 100644
--- a/src/libffmpeg/libavcodec/i386/fdct_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c
@@ -1,16 +1,21 @@
/*
* MMX optimized forward DCT
* The gcc porting is Copyright (c) 2001 Fabrice Bellard.
+ * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
*
* from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
*
* Intel Application Note AP-922 - fast, precise implementation of DCT
* http://developer.intel.com/vtune/cbts/appnotes.htm
+ *
+ * Also of inspiration:
+ * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
+ * Skal's fdct at http://skal.planet-d.net/coding/dct.html
*/
-#include "../dsputil.h"
+#include "../common.h"
#include "mmx.h"
-#undef ATTR_ALIGN
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
//////////////////////////////////////////////////////////////////////
@@ -27,10 +32,8 @@
#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
#define SHIFT_FRW_COL BITS_FRW_ACC
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
-//#define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1)
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
-//#define RND_FRW_COL (2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1)
-#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
+//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
//concatenated table, for forward DCT transformation
static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = {
@@ -38,101 +41,220 @@ static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = {
27146, 27146, 27146, 27146, // tg * (2<<16) + 0.5
-21746, -21746, -21746, -21746, // tg * (2<<16) + 0.5
};
-static const int16_t cos_4_16[4] ATTR_ALIGN(8) = {
- -19195, -19195, -19195, -19195, //cos * (2<<16) + 0.5
-};
static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = {
23170, 23170, 23170, 23170, //cos * (2<<15) + 0.5
};
-static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
+static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL;
+
static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW };
+static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
+
static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table
- //row0
- 16384, 16384, 21407, -8867, // w09 w01 w08 w00
- 16384, 16384, 8867, -21407, // w13 w05 w12 w04
- 16384, -16384, 8867, 21407, // w11 w03 w10 w02
- -16384, 16384, -21407, -8867, // w15 w07 w14 w06
- 22725, 12873, 19266, -22725, // w22 w20 w18 w16
- 19266, 4520, -4520, -12873, // w23 w21 w19 w17
- 12873, 4520, 4520, 19266, // w30 w28 w26 w24
- -22725, 19266, -12873, -22725, // w31 w29 w27 w25
-
- //row1
- 22725, 22725, 29692, -12299, // w09 w01 w08 w00
- 22725, 22725, 12299, -29692, // w13 w05 w12 w04
- 22725, -22725, 12299, 29692, // w11 w03 w10 w02
- -22725, 22725, -29692, -12299, // w15 w07 w14 w06
- 31521, 17855, 26722, -31521, // w22 w20 w18 w16
- 26722, 6270, -6270, -17855, // w23 w21 w19 w17
- 17855, 6270, 6270, 26722, // w30 w28 w26 w24
- -31521, 26722, -17855, -31521, // w31 w29 w27 w25
-
- //row2
- 21407, 21407, 27969, -11585, // w09 w01 w08 w00
- 21407, 21407, 11585, -27969, // w13 w05 w12 w04
- 21407, -21407, 11585, 27969, // w11 w03 w10 w02
- -21407, 21407, -27969, -11585, // w15 w07 w14 w06
- 29692, 16819, 25172, -29692, // w22 w20 w18 w16
- 25172, 5906, -5906, -16819, // w23 w21 w19 w17
- 16819, 5906, 5906, 25172, // w30 w28 w26 w24
- -29692, 25172, -16819, -29692, // w31 w29 w27 w25
-
- //row3
- 19266, 19266, 25172, -10426, // w09 w01 w08 w00
- 19266, 19266, 10426, -25172, // w13 w05 w12 w04
- 19266, -19266, 10426, 25172, // w11 w03 w10 w02
- -19266, 19266, -25172, -10426, // w15 w07 w14 w06,
- 26722, 15137, 22654, -26722, // w22 w20 w18 w16
- 22654, 5315, -5315, -15137, // w23 w21 w19 w17
- 15137, 5315, 5315, 22654, // w30 w28 w26 w24
- -26722, 22654, -15137, -26722, // w31 w29 w27 w25,
-
- //row4
- 16384, 16384, 21407, -8867, // w09 w01 w08 w00
- 16384, 16384, 8867, -21407, // w13 w05 w12 w04
- 16384, -16384, 8867, 21407, // w11 w03 w10 w02
- -16384, 16384, -21407, -8867, // w15 w07 w14 w06
- 22725, 12873, 19266, -22725, // w22 w20 w18 w16
- 19266, 4520, -4520, -12873, // w23 w21 w19 w17
- 12873, 4520, 4520, 19266, // w30 w28 w26 w24
- -22725, 19266, -12873, -22725, // w31 w29 w27 w25
-
- //row5
- 19266, 19266, 25172, -10426, // w09 w01 w08 w00
- 19266, 19266, 10426, -25172, // w13 w05 w12 w04
- 19266, -19266, 10426, 25172, // w11 w03 w10 w02
- -19266, 19266, -25172, -10426, // w15 w07 w14 w06
- 26722, 15137, 22654, -26722, // w22 w20 w18 w16
- 22654, 5315, -5315, -15137, // w23 w21 w19 w17
- 15137, 5315, 5315, 22654, // w30 w28 w26 w24
- -26722, 22654, -15137, -26722, // w31 w29 w27 w25
-
- //row6
- 21407, 21407, 27969, -11585, // w09 w01 w08 w00
- 21407, 21407, 11585, -27969, // w13 w05 w12 w04
- 21407, -21407, 11585, 27969, // w11 w03 w10 w02
- -21407, 21407, -27969, -11585, // w15 w07 w14 w06,
- 29692, 16819, 25172, -29692, // w22 w20 w18 w16
- 25172, 5906, -5906, -16819, // w23 w21 w19 w17
- 16819, 5906, 5906, 25172, // w30 w28 w26 w24
- -29692, 25172, -16819, -29692, // w31 w29 w27 w25,
-
- //row7
- 22725, 22725, 29692, -12299, // w09 w01 w08 w00
- 22725, 22725, 12299, -29692, // w13 w05 w12 w04
- 22725, -22725, 12299, 29692, // w11 w03 w10 w02
- -22725, 22725, -29692, -12299, // w15 w07 w14 w06,
- 31521, 17855, 26722, -31521, // w22 w20 w18 w16
- 26722, 6270, -6270, -17855, // w23 w21 w19 w17
- 17855, 6270, 6270, 26722, // w30 w28 w26 w24
- -31521, 26722, -17855, -31521 // w31 w29 w27 w25
+ 16384, 16384, -8867, -21407,
+ 16384, 16384, 21407, 8867,
+ 16384, -16384, 21407, -8867,
+ -16384, 16384, 8867, -21407,
+ 22725, 19266, -22725, -12873,
+ 12873, 4520, 19266, -4520,
+ 12873, -22725, 19266, -22725,
+ 4520, 19266, 4520, -12873,
+
+ 22725, 22725, -12299, -29692,
+ 22725, 22725, 29692, 12299,
+ 22725, -22725, 29692, -12299,
+ -22725, 22725, 12299, -29692,
+ 31521, 26722, -31521, -17855,
+ 17855, 6270, 26722, -6270,
+ 17855, -31521, 26722, -31521,
+ 6270, 26722, 6270, -17855,
+
+ 21407, 21407, -11585, -27969,
+ 21407, 21407, 27969, 11585,
+ 21407, -21407, 27969, -11585,
+ -21407, 21407, 11585, -27969,
+ 29692, 25172, -29692, -16819,
+ 16819, 5906, 25172, -5906,
+ 16819, -29692, 25172, -29692,
+ 5906, 25172, 5906, -16819,
+
+ 19266, 19266, -10426, -25172,
+ 19266, 19266, 25172, 10426,
+ 19266, -19266, 25172, -10426,
+ -19266, 19266, 10426, -25172,
+ 26722, 22654, -26722, -15137,
+ 15137, 5315, 22654, -5315,
+ 15137, -26722, 22654, -26722,
+ 5315, 22654, 5315, -15137,
+
+ 16384, 16384, -8867, -21407,
+ 16384, 16384, 21407, 8867,
+ 16384, -16384, 21407, -8867,
+ -16384, 16384, 8867, -21407,
+ 22725, 19266, -22725, -12873,
+ 12873, 4520, 19266, -4520,
+ 12873, -22725, 19266, -22725,
+ 4520, 19266, 4520, -12873,
+
+ 19266, 19266, -10426, -25172,
+ 19266, 19266, 25172, 10426,
+ 19266, -19266, 25172, -10426,
+ -19266, 19266, 10426, -25172,
+ 26722, 22654, -26722, -15137,
+ 15137, 5315, 22654, -5315,
+ 15137, -26722, 22654, -26722,
+ 5315, 22654, 5315, -15137,
+
+ 21407, 21407, -11585, -27969,
+ 21407, 21407, 27969, 11585,
+ 21407, -21407, 27969, -11585,
+ -21407, 21407, 11585, -27969,
+ 29692, 25172, -29692, -16819,
+ 16819, 5906, 25172, -5906,
+ 16819, -29692, 25172, -29692,
+ 5906, 25172, 5906, -16819,
+
+ 22725, 22725, -12299, -29692,
+ 22725, 22725, 29692, 12299,
+ 22725, -22725, 29692, -12299,
+ -22725, 22725, 12299, -29692,
+ 31521, 26722, -31521, -17855,
+ 17855, 6270, 26722, -6270,
+ 17855, -31521, 26722, -31521,
+ 6270, 26722, 6270, -17855,
};
+static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table
+#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
+ C4, C4, C5, C7, C2, C6, C3, -C7, \
+ -C4, C4, C7, C3, C6, -C2, C7, -C5, \
+ C4, -C4, C5, -C1, C2, -C6, C3, -C1,
+// c1..c7 * cos(pi/4) * 2^15
+#define C1 22725
+#define C2 21407
+#define C3 19266
+#define C4 16384
+#define C5 12873
+#define C6 8867
+#define C7 4520
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 31521
+#define C2 29692
+#define C3 26722
+#define C4 22725
+#define C5 17855
+#define C6 12299
+#define C7 6270
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 29692
+#define C2 27969
+#define C3 25172
+#define C4 21407
+#define C5 16819
+#define C6 11585
+#define C7 5906
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 26722
+#define C2 25172
+#define C3 22654
+#define C4 19266
+#define C5 15137
+#define C6 10426
+#define C7 5315
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 22725
+#define C2 21407
+#define C3 19266
+#define C4 16384
+#define C5 12873
+#define C6 8867
+#define C7 4520
+TABLE_SSE2
-static inline void fdct_col(const int16_t *in, int16_t *out, int offset)
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 26722
+#define C2 25172
+#define C3 22654
+#define C4 19266
+#define C5 15137
+#define C6 10426
+#define C7 5315
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 29692
+#define C2 27969
+#define C3 25172
+#define C4 21407
+#define C5 16819
+#define C6 11585
+#define C7 5906
+TABLE_SSE2
+
+#undef C1
+#undef C2
+#undef C3
+#undef C4
+#undef C5
+#undef C6
+#undef C7
+#define C1 31521
+#define C2 29692
+#define C3 26722
+#define C4 22725
+#define C5 17855
+#define C6 12299
+#define C7 6270
+TABLE_SSE2
+};
+
+static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset)
{
movq_m2r(*(in + offset + 1 * 8), mm0);
movq_m2r(*(in + offset + 6 * 8), mm1);
@@ -211,59 +333,158 @@ static inline void fdct_col(const int16_t *in, int16_t *out, int offset)
movq_r2m(mm3, *(out + offset + 7 * 8));
}
-static inline void fdct_row(const int16_t *in, int16_t *out, const int16_t *table)
+
+static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
{
- movd_m2r(*(in + 6), mm5);
- punpcklwd_m2r(*(in + 4), mm5);
- movq_r2r(mm5, mm2);
- psrlq_i2r(0x20, mm5);
+ asm volatile(
+ ".macro FDCT_ROW_SSE2_H1 i t \n\t"
+ "movq \\i(%0), %%xmm2 \n\t"
+ "movq \\i+8(%0), %%xmm0 \n\t"
+ "movdqa \\t+32(%1), %%xmm3 \n\t"
+ "movdqa \\t+48(%1), %%xmm7 \n\t"
+ "movdqa \\t(%1), %%xmm4 \n\t"
+ "movdqa \\t+16(%1), %%xmm5 \n\t"
+ ".endm \n\t"
+ ".macro FDCT_ROW_SSE2_H2 i t \n\t"
+ "movq \\i(%0), %%xmm2 \n\t"
+ "movq \\i+8(%0), %%xmm0 \n\t"
+ "movdqa \\t+32(%1), %%xmm3 \n\t"
+ "movdqa \\t+48(%1), %%xmm7 \n\t"
+ ".endm \n\t"
+ ".macro FDCT_ROW_SSE2 i \n\t"
+ "movq %%xmm2, %%xmm1 \n\t"
+ "pshuflw $27, %%xmm0, %%xmm0 \n\t"
+ "paddsw %%xmm0, %%xmm1 \n\t"
+ "psubsw %%xmm0, %%xmm2 \n\t"
+ "punpckldq %%xmm2, %%xmm1 \n\t"
+ "pshufd $78, %%xmm1, %%xmm2 \n\t"
+ "pmaddwd %%xmm2, %%xmm3 \n\t"
+ "pmaddwd %%xmm1, %%xmm7 \n\t"
+ "pmaddwd %%xmm5, %%xmm2 \n\t"
+ "pmaddwd %%xmm4, %%xmm1 \n\t"
+ "paddd %%xmm7, %%xmm3 \n\t"
+ "paddd %%xmm2, %%xmm1 \n\t"
+ "paddd %%xmm6, %%xmm3 \n\t"
+ "paddd %%xmm6, %%xmm1 \n\t"
+ "psrad %3, %%xmm3 \n\t"
+ "psrad %3, %%xmm1 \n\t"
+ "packssdw %%xmm3, %%xmm1 \n\t"
+ "movdqa %%xmm1, \\i(%4) \n\t"
+ ".endm \n\t"
+ "movdqa (%2), %%xmm6 \n\t"
+ "FDCT_ROW_SSE2_H1 0 0 \n\t"
+ "FDCT_ROW_SSE2 0 \n\t"
+ "FDCT_ROW_SSE2_H2 64 0 \n\t"
+ "FDCT_ROW_SSE2 64 \n\t"
+
+ "FDCT_ROW_SSE2_H1 16 64 \n\t"
+ "FDCT_ROW_SSE2 16 \n\t"
+ "FDCT_ROW_SSE2_H2 112 64 \n\t"
+ "FDCT_ROW_SSE2 112 \n\t"
+
+ "FDCT_ROW_SSE2_H1 32 128 \n\t"
+ "FDCT_ROW_SSE2 32 \n\t"
+ "FDCT_ROW_SSE2_H2 96 128 \n\t"
+ "FDCT_ROW_SSE2 96 \n\t"
+
+ "FDCT_ROW_SSE2_H1 48 192 \n\t"
+ "FDCT_ROW_SSE2 48 \n\t"
+ "FDCT_ROW_SSE2_H2 80 192 \n\t"
+ "FDCT_ROW_SSE2 80 \n\t"
+ :
+ : "r" (in), "r" (tab_frw_01234567_sse2), "r" (fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
+ );
+}
+
+static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table)
+{
+ pshufw_m2r(*(in + 4), mm5, 0x1B);
movq_m2r(*(in + 0), mm0);
- punpcklwd_r2r(mm2, mm5);
- movq_r2r(mm0, mm1);
+ movq_r2r(mm0, mm1);
paddsw_r2r(mm5, mm0);
psubsw_r2r(mm5, mm1);
- movq_r2r(mm0, mm2);
- punpcklwd_r2r(mm1, mm0);
- punpckhwd_r2r(mm1, mm2);
- movq_r2r(mm2, mm1);
- movq_r2r(mm0, mm2);
+ pshufw_r2r(mm0, mm2, 0x4E);
+ pshufw_r2r(mm1, mm3, 0x4E);
+ movq_m2r(*(table + 0), mm4);
+ movq_m2r(*(table + 4), mm6);
+ movq_m2r(*(table + 16), mm5);
+ movq_m2r(*(table + 20), mm7);
+ pmaddwd_r2r(mm0, mm4);
+ pmaddwd_r2r(mm1, mm5);
+ pmaddwd_r2r(mm2, mm6);
+ pmaddwd_r2r(mm3, mm7);
+ pmaddwd_m2r(*(table + 8), mm0);
+ pmaddwd_m2r(*(table + 12), mm2);
+ pmaddwd_m2r(*(table + 24), mm1);
+ pmaddwd_m2r(*(table + 28), mm3);
+ paddd_r2r(mm6, mm4);
+ paddd_r2r(mm7, mm5);
+ paddd_r2r(mm2, mm0);
+ paddd_r2r(mm3, mm1);
+ movq_m2r(*fdct_r_row, mm7);
+ paddd_r2r(mm7, mm4);
+ paddd_r2r(mm7, mm5);
+ paddd_r2r(mm7, mm0);
+ paddd_r2r(mm7, mm1);
+ psrad_i2r(SHIFT_FRW_ROW, mm4);
+ psrad_i2r(SHIFT_FRW_ROW, mm5);
+ psrad_i2r(SHIFT_FRW_ROW, mm0);
+ psrad_i2r(SHIFT_FRW_ROW, mm1);
+ packssdw_r2r(mm0, mm4);
+ packssdw_r2r(mm1, mm5);
+ movq_r2r(mm4, mm2);
+ punpcklwd_r2r(mm5, mm4);
+ punpckhwd_r2r(mm5, mm2);
+ movq_r2m(mm4, *(out + 0));
+ movq_r2m(mm2, *(out + 4));
+}
+
+static always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
+{
+ movd_m2r(*(in + 6), mm1);
+ punpcklwd_m2r(*(in + 4), mm1);
+ movq_r2r(mm1, mm2);
+ psrlq_i2r(0x20, mm1);
+ movq_m2r(*(in + 0), mm0);
+ punpcklwd_r2r(mm2, mm1);
+ movq_r2r(mm0, mm5);
+ paddsw_r2r(mm1, mm0);
+ psubsw_r2r(mm1, mm5);
+ movq_r2r(mm0, mm1);
+ movq_r2r(mm5, mm6);
+ punpckldq_r2r(mm5, mm3);
+ punpckhdq_r2r(mm3, mm6);
movq_m2r(*(table + 0), mm3);
- punpcklwd_r2r(mm1, mm0);
- movq_r2r(mm0, mm5);
- punpckldq_r2r(mm0, mm0);
movq_m2r(*(table + 4), mm4);
- punpckhwd_r2r(mm1, mm2);
+ punpckldq_r2r(mm0, mm2);
pmaddwd_r2r(mm0, mm3);
- movq_r2r(mm2, mm6);
- movq_m2r(*(table + 16), mm1);
- punpckldq_r2r(mm2, mm2);
- pmaddwd_r2r(mm2, mm4);
- punpckhdq_r2r(mm5, mm5);
+ punpckhdq_r2r(mm2, mm1);
+ movq_m2r(*(table + 16), mm2);
+ pmaddwd_r2r(mm1, mm4);
pmaddwd_m2r(*(table + 8), mm0);
- punpckhdq_r2r(mm6, mm6);
movq_m2r(*(table + 20), mm7);
- pmaddwd_r2r(mm5, mm1);
+ pmaddwd_r2r(mm5, mm2);
paddd_m2r(*fdct_r_row, mm3);
pmaddwd_r2r(mm6, mm7);
- pmaddwd_m2r(*(table + 12), mm2);
+ pmaddwd_m2r(*(table + 12), mm1);
paddd_r2r(mm4, mm3);
pmaddwd_m2r(*(table + 24), mm5);
pmaddwd_m2r(*(table + 28), mm6);
- paddd_r2r(mm7, mm1);
+ paddd_r2r(mm7, mm2);
paddd_m2r(*fdct_r_row, mm0);
psrad_i2r(SHIFT_FRW_ROW, mm3);
- paddd_m2r(*fdct_r_row, mm1);
- paddd_r2r(mm2, mm0);
+ paddd_m2r(*fdct_r_row, mm2);
+ paddd_r2r(mm1, mm0);
paddd_m2r(*fdct_r_row, mm5);
- psrad_i2r(SHIFT_FRW_ROW, mm1);
+ psrad_i2r(SHIFT_FRW_ROW, mm2);
paddd_r2r(mm6, mm5);
psrad_i2r(SHIFT_FRW_ROW, mm0);
psrad_i2r(SHIFT_FRW_ROW, mm5);
packssdw_r2r(mm0, mm3);
- packssdw_r2r(mm5, mm1);
+ packssdw_r2r(mm5, mm2);
movq_r2r(mm3, mm6);
- punpcklwd_r2r(mm1, mm3);
- punpckhwd_r2r(mm1, mm6);
+ punpcklwd_r2r(mm2, mm3);
+ punpckhwd_r2r(mm2, mm6);
movq_r2m(mm3, *(out + 0));
movq_r2m(mm6, *(out + 4));
}
@@ -284,9 +505,47 @@ void ff_fdct_mmx(int16_t *block)
table = tab_frw_01234567;
out = block;
for(i=8;i>0;i--) {
- fdct_row(block1, out, table);
+ fdct_row_mmx(block1, out, table);
+ block1 += 8;
+ table += 32;
+ out += 8;
+ }
+}
+
+void ff_fdct_mmx2(int16_t *block)
+{
+ int64_t align_tmp[16] ATTR_ALIGN(8);
+ int16_t * const block_tmp= (int16_t*)align_tmp;
+ int16_t *block1, *out;
+ const int16_t *table;
+ int i;
+
+ block1 = block_tmp;
+ fdct_col(block, block1, 0);
+ fdct_col(block, block1, 4);
+
+ block1 = block_tmp;
+ table = tab_frw_01234567;
+ out = block;
+ for(i=8;i>0;i--) {
+ fdct_row_mmx2(block1, out, table);
block1 += 8;
table += 32;
out += 8;
}
}
+
+void ff_fdct_sse2(int16_t *block)
+{
+ int64_t align_tmp[16] ATTR_ALIGN(8);
+ int16_t * const block_tmp= (int16_t*)align_tmp;
+ int16_t *block1;
+ int i;
+
+ block1 = block_tmp;
+ fdct_col(block, block1, 0);
+ fdct_col(block, block1, 4);
+
+ fdct_row_sse2(block1, block);
+}
+
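
All three forward-DCT entry points above share one structure: fdct_col runs the column pass over two 4-column halves, then the row pass runs once per row (fdct_row_mmx / fdct_row_mmx2) or fused over all eight rows (fdct_row_sse2). The row pass adds RND_FRW_ROW to each pmaddwd accumulator before shifting, so the psrad becomes a round-to-nearest rather than a truncating shift. A scalar sketch of that rounding step, using the same constants (SHIFT_FRW_ROW = 3 + 17 - 3 = 17, so RND_FRW_ROW = 1 << 16):

/* Sketch: rounded right shift used by the fdct row pass. */
#define SHIFT_FRW_ROW_REF 17
#define RND_FRW_ROW_REF   (1 << (SHIFT_FRW_ROW_REF - 1))

static inline int fdct_round_shift_ref(int acc)
{
    return (acc + RND_FRW_ROW_REF) >> SHIFT_FRW_ROW_REF;
}
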
diff --git a/src/libffmpeg/libavcodec/i386/idct_mmx.c b/src/libffmpeg/libavcodec/i386/idct_mmx.c
index 654792e5e..298c8a8b0 100644
--- a/src/libffmpeg/libavcodec/i386/idct_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/idct_mmx.c
@@ -22,11 +22,10 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include "../dsputil.h"
+#include "../common.h"
#include "mmx.h"
-#undef ATTR_ALIGN
#define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align)))
#define ROW_SHIFT 11
@@ -555,10 +554,6 @@ static int32_t rounder5[] ATTR_ALIGN(8) =
#undef COL_SHIFT
#undef ROW_SHIFT
-/* the macro below will generate these */
-void ff_mmx_idct(DCTELEM *block);
-void ff_mmxext_idct(DCTELEM *block);
-
#define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \
void idct (int16_t * block) \
{ \
diff --git a/src/libffmpeg/libavcodec/i386/mmx.h b/src/libffmpeg/libavcodec/i386/mmx.h
index 2ba28898d..7e94cfd9b 100644
--- a/src/libffmpeg/libavcodec/i386/mmx.h
+++ b/src/libffmpeg/libavcodec/i386/mmx.h
@@ -1 +1,243 @@
-#include "xineutils.h"
+/*
+ * mmx.h
+ * Copyright (C) 1997-2001 H. Dietz and R. Fisher
+ */
+#ifndef AVCODEC_I386MMX_H
+#define AVCODEC_I386MMX_H
+
+/*
+ * The type of an value that fits in an MMX register (note that long
+ * long constant values MUST be suffixed by LL and unsigned long long
+ * values by ULL, lest they be truncated by the compiler)
+ */
+
+typedef union {
+ long long q; /* Quadword (64-bit) value */
+ unsigned long long uq; /* Unsigned Quadword */
+ int d[2]; /* 2 Doubleword (32-bit) values */
+ unsigned int ud[2]; /* 2 Unsigned Doubleword */
+ short w[4]; /* 4 Word (16-bit) values */
+ unsigned short uw[4]; /* 4 Unsigned Word */
+ char b[8]; /* 8 Byte (8-bit) values */
+ unsigned char ub[8]; /* 8 Unsigned Byte */
+ float s[2]; /* Single-precision (32-bit) value */
+} mmx_t; /* On an 8-byte (64-bit) boundary */
+
+
+#define mmx_i2r(op,imm,reg) \
+ __asm__ __volatile__ (#op " %0, %%" #reg \
+ : /* nothing */ \
+ : "i" (imm) )
+
+#define mmx_m2r(op,mem,reg) \
+ __asm__ __volatile__ (#op " %0, %%" #reg \
+ : /* nothing */ \
+ : "m" (mem))
+
+#define mmx_r2m(op,reg,mem) \
+ __asm__ __volatile__ (#op " %%" #reg ", %0" \
+ : "=m" (mem) \
+ : /* nothing */ )
+
+#define mmx_r2r(op,regs,regd) \
+ __asm__ __volatile__ (#op " %" #regs ", %" #regd)
+
+
+#define emms() __asm__ __volatile__ ("emms")
+
+#define movd_m2r(var,reg) mmx_m2r (movd, var, reg)
+#define movd_r2m(reg,var) mmx_r2m (movd, reg, var)
+#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd)
+
+#define movq_m2r(var,reg) mmx_m2r (movq, var, reg)
+#define movq_r2m(reg,var) mmx_r2m (movq, reg, var)
+#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd)
+
+#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg)
+#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
+#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg)
+#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
+
+#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg)
+#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
+
+#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg)
+#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd)
+#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg)
+#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd)
+#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg)
+#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd)
+
+#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg)
+#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd)
+#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg)
+#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd)
+
+#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg)
+#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd)
+#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg)
+#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd)
+
+#define pand_m2r(var,reg) mmx_m2r (pand, var, reg)
+#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd)
+
+#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg)
+#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd)
+
+#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg)
+#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd)
+#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg)
+#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd)
+#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg)
+#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd)
+
+#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg)
+#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd)
+#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg)
+#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd)
+#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg)
+#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd)
+
+#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg)
+#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd)
+
+#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg)
+#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd)
+
+#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg)
+#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd)
+
+#define por_m2r(var,reg) mmx_m2r (por, var, reg)
+#define por_r2r(regs,regd) mmx_r2r (por, regs, regd)
+
+#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg)
+#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg)
+#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd)
+#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg)
+#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg)
+#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd)
+#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg)
+#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg)
+#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd)
+
+#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg)
+#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg)
+#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd)
+#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg)
+#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg)
+#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd)
+
+#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg)
+#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg)
+#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd)
+#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg)
+#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg)
+#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd)
+#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg)
+#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg)
+#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd)
+
+#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg)
+#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd)
+#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg)
+#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd)
+#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg)
+#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd)
+
+#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg)
+#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd)
+#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg)
+#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd)
+
+#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg)
+#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd)
+#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg)
+#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd)
+
+#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg)
+#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd)
+#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg)
+#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd)
+#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg)
+#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd)
+
+#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg)
+#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd)
+#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg)
+#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd)
+#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg)
+#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd)
+
+#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg)
+#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd)
+
+
+/* 3DNOW extensions */
+
+#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg)
+#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd)
+
+
+/* AMD MMX extensions - also available in intel SSE */
+
+
+#define mmx_m2ri(op,mem,reg,imm) \
+ __asm__ __volatile__ (#op " %1, %0, %%" #reg \
+ : /* nothing */ \
+ : "X" (mem), "X" (imm))
+#define mmx_r2ri(op,regs,regd,imm) \
+ __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
+ : /* nothing */ \
+ : "X" (imm) )
+
+#define mmx_fetch(mem,hint) \
+ __asm__ __volatile__ ("prefetch" #hint " %0" \
+ : /* nothing */ \
+ : "X" (mem))
+
+
+#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg)
+
+#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var)
+
+#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg)
+#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd)
+#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg)
+#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd)
+
+#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm)
+
+#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm)
+
+#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg)
+#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd)
+
+#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg)
+#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd)
+
+#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg)
+#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd)
+
+#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg)
+#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd)
+
+#define pmovmskb(mmreg,reg) \
+ __asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg)
+
+#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg)
+#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd)
+
+#define prefetcht0(mem) mmx_fetch (mem, t0)
+#define prefetcht1(mem) mmx_fetch (mem, t1)
+#define prefetcht2(mem) mmx_fetch (mem, t2)
+#define prefetchnta(mem) mmx_fetch (mem, nta)
+
+#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg)
+#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd)
+
+#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm)
+#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm)
+
+#define sfence() __asm__ __volatile__ ("sfence\n\t")
+
+#endif /* AVCODEC_I386MMX_H */
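
Each wrapper in mmx.h expands to a single __asm__ __volatile__ statement; registers are passed as bare names (mm0..mm7) and memory operands as C lvalues. A minimal usage sketch, not taken from the patch, adding four packed 16-bit words:

/* Sketch: add four 16-bit words with the mmx.h macro wrappers. */
static void paddw4_ref(mmx_t *dst, const mmx_t *a, const mmx_t *b)
{
    movq_m2r(*a, mm0);     /* mm0 = *a             */
    paddw_m2r(*b, mm0);    /* mm0 += *b, 4x16 bit  */
    movq_r2m(mm0, *dst);   /* *dst = mm0           */
    emms();                /* leave MMX state      */
}
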
diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
index 950100e63..f32afae0b 100644
--- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c
@@ -1,6 +1,7 @@
/*
* MMX optimized motion estimation
* Copyright (c) 2001 Fabrice Bellard.
+ * Copyright (c) 2002-2004 Michael Niedermayer
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@@ -26,11 +27,11 @@ static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
0x0002000200020002ULL,
};
-static const __attribute__ ((aligned(8), unused)) uint64_t bone= 0x0101010101010101LL;
+static __attribute__ ((aligned(8), unused)) uint64_t bone= 0x0101010101010101LL;
-static inline void sad8_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
- int len= -(stride<<h);
+ int len= -(stride*h);
asm volatile(
".balign 16 \n\t"
"1: \n\t"
@@ -64,9 +65,9 @@ static inline void sad8_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
);
}
-static inline void sad8_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
+static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
- int len= -(stride<<h);
+ int len= -(stride*h);
asm volatile(
".balign 16 \n\t"
"1: \n\t"
@@ -88,7 +89,7 @@ static inline void sad8_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
- int len= -(stride<<h);
+ int len= -(stride*h);
asm volatile(
".balign 16 \n\t"
"1: \n\t"
@@ -114,7 +115,7 @@ static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, in
static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{ //FIXME reuse src
- int len= -(stride<<h);
+ int len= -(stride*h);
asm volatile(
".balign 16 \n\t"
"movq "MANGLE(bone)", %%mm5 \n\t"
@@ -151,7 +152,7 @@ static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
- int len= -(stride<<h);
+ int len= -(stride*h);
asm volatile(
".balign 16 \n\t"
"1: \n\t"
@@ -189,7 +190,7 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
- int len= -(stride<<h);
+ int len= -(stride*h);
asm volatile(
".balign 16 \n\t"
"1: \n\t"
@@ -265,85 +266,69 @@ static inline int sum_mmx2(void)
#define PIX_SAD(suf)\
-static int pix_abs8x8_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
+static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
+ assert(h==8);\
asm volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t":);\
\
- sad8_ ## suf(blk1, blk2, stride, 3);\
+ sad8_1_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
-static int sad8x8_ ## suf(void *s, uint8_t *blk2, uint8_t *blk1, int stride)\
-{\
- asm volatile("pxor %%mm7, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t":);\
-\
- sad8_ ## suf(blk1, blk2, stride, 3);\
-\
- return sum_ ## suf();\
-}\
-\
-static int pix_abs8x8_x2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
+static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
+ assert(h==8);\
asm volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
- sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 3);\
+ sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
\
-static int pix_abs8x8_y2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
+static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
+ assert(h==8);\
asm volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
- sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 3);\
+ sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
\
-static int pix_abs8x8_xy2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
+static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
+ assert(h==8);\
asm volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[2]) \
);\
\
- sad8_4_ ## suf(blk1, blk2, stride, 3);\
+ sad8_4_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
\
-static int pix_abs16x16_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
-{\
- asm volatile("pxor %%mm7, %%mm7 \n\t"\
- "pxor %%mm6, %%mm6 \n\t":);\
-\
- sad8_ ## suf(blk1 , blk2 , stride, 4);\
- sad8_ ## suf(blk1+8, blk2+8, stride, 4);\
-\
- return sum_ ## suf();\
-}\
-static int sad16x16_ ## suf(void *s, uint8_t *blk2, uint8_t *blk1, int stride)\
+static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
asm volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t":);\
\
- sad8_ ## suf(blk1 , blk2 , stride, 4);\
- sad8_ ## suf(blk1+8, blk2+8, stride, 4);\
+ sad8_1_ ## suf(blk1 , blk2 , stride, h);\
+ sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
-static int pix_abs16x16_x2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
+static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
asm volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
@@ -351,12 +336,12 @@ static int pix_abs16x16_x2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
:: "m"(round_tab[1]) \
);\
\
- sad8_2_ ## suf(blk1 , blk1+1, blk2 , stride, 4);\
- sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, 4);\
+ sad8_2_ ## suf(blk1 , blk1+1, blk2 , stride, h);\
+ sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
-static int pix_abs16x16_y2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
+static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
asm volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
@@ -364,12 +349,12 @@ static int pix_abs16x16_y2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
:: "m"(round_tab[1]) \
);\
\
- sad8_2_ ## suf(blk1 , blk1+stride, blk2 , stride, 4);\
- sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, 4);\
+ sad8_2_ ## suf(blk1 , blk1+stride, blk2 , stride, h);\
+ sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
-static int pix_abs16x16_xy2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
+static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
asm volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
@@ -377,8 +362,8 @@ static int pix_abs16x16_xy2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\
:: "m"(round_tab[2]) \
);\
\
- sad8_4_ ## suf(blk1 , blk2 , stride, 4);\
- sad8_4_ ## suf(blk1+8, blk2+8, stride, 4);\
+ sad8_4_ ## suf(blk1 , blk2 , stride, h);\
+ sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
@@ -389,32 +374,32 @@ PIX_SAD(mmx2)
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
{
if (mm_flags & MM_MMX) {
- c->pix_abs16x16 = pix_abs16x16_mmx;
- c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx;
- c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx;
- c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx;
- c->pix_abs8x8 = pix_abs8x8_mmx;
- c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx;
- c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
- c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx;
+ c->pix_abs[0][0] = sad16_mmx;
+ c->pix_abs[0][1] = sad16_x2_mmx;
+ c->pix_abs[0][2] = sad16_y2_mmx;
+ c->pix_abs[0][3] = sad16_xy2_mmx;
+ c->pix_abs[1][0] = sad8_mmx;
+ c->pix_abs[1][1] = sad8_x2_mmx;
+ c->pix_abs[1][2] = sad8_y2_mmx;
+ c->pix_abs[1][3] = sad8_xy2_mmx;
- c->sad[0]= sad16x16_mmx;
- c->sad[1]= sad8x8_mmx;
+ c->sad[0]= sad16_mmx;
+ c->sad[1]= sad8_mmx;
}
if (mm_flags & MM_MMXEXT) {
- c->pix_abs16x16 = pix_abs16x16_mmx2;
- c->pix_abs8x8 = pix_abs8x8_mmx2;
+ c->pix_abs[0][0] = sad16_mmx2;
+ c->pix_abs[1][0] = sad8_mmx2;
- c->sad[0]= sad16x16_mmx2;
- c->sad[1]= sad8x8_mmx2;
+ c->sad[0]= sad16_mmx2;
+ c->sad[1]= sad8_mmx2;
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
- c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2;
- c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2;
- c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2;
- c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx2;
- c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx2;
- c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2;
+ c->pix_abs[0][1] = sad16_x2_mmx2;
+ c->pix_abs[0][2] = sad16_y2_mmx2;
+ c->pix_abs[0][3] = sad16_xy2_mmx2;
+ c->pix_abs[1][1] = sad8_x2_mmx2;
+ c->pix_abs[1][2] = sad8_y2_mmx2;
+ c->pix_abs[1][3] = sad8_xy2_mmx2;
}
}
}
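
The eight named pix_abs* fields collapse into a 2-D table: the first index selects the block width (0 = 16-wide, 1 = 8-wide) and the second the half-pel interpolation of the reference (0 = none, 1 = horizontal, 2 = vertical, 3 = both), mirroring the _x2/_y2/_xy2 suffixes. A hedged sketch of how a caller would index it (compare_block and the dxy packing are illustrative; only the table shape and the function prototype come from the patch):

    #include <stdint.h>
    #include "../dsputil.h"   /* DSPContext, as included by this file */

    static int compare_block(DSPContext *c, void *ctx,
                             uint8_t *cur, uint8_t *ref,
                             int stride, int h, int use_8x8, int dxy)
    {
        /* dxy: bit 0 = half-pel in x, bit 1 = half-pel in y, giving 0..3
           and matching pix_abs[.][0..3] above */
        return c->pix_abs[use_8x8][dxy](ctx, cur, ref, stride, h);
    }
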
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
index d2f477b7b..1c0e9f5ae 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c
@@ -31,31 +31,92 @@ static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xfff
static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
-static void dct_unquantize_h263_mmx(MpegEncContext *s,
+static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
int level, qmul, qadd, nCoeffs;
qmul = qscale << 1;
- qadd = (qscale - 1) | 1;
- assert(s->block_last_index[n]>=0);
+ assert(s->block_last_index[n]>=0 || s->h263_aic);
- if (s->mb_intra) {
- if (!s->h263_aic) {
- if (n < 4)
- level = block[0] * s->y_dc_scale;
- else
- level = block[0] * s->c_dc_scale;
- }else{
- qadd = 0;
- level= block[0];
- }
- nCoeffs=63;
- } else {
- nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
- level = 0;/* keep gcc quiet */
+ if (!s->h263_aic) {
+ if (n < 4)
+ level = block[0] * s->y_dc_scale;
+ else
+ level = block[0] * s->c_dc_scale;
+ qadd = (qscale - 1) | 1;
+ }else{
+ qadd = 0;
+ level= block[0];
}
+ if(s->ac_pred)
+ nCoeffs=63;
+ else
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+//printf("%d %d ", qmul, qadd);
+asm volatile(
+ "movd %1, %%mm6 \n\t" //qmul
+ "packssdw %%mm6, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm6 \n\t"
+ "movd %2, %%mm5 \n\t" //qadd
+ "pxor %%mm7, %%mm7 \n\t"
+ "packssdw %%mm5, %%mm5 \n\t"
+ "packssdw %%mm5, %%mm5 \n\t"
+ "psubw %%mm5, %%mm7 \n\t"
+ "pxor %%mm4, %%mm4 \n\t"
+ ".balign 16\n\t"
+ "1: \n\t"
+ "movq (%0, %3), %%mm0 \n\t"
+ "movq 8(%0, %3), %%mm1 \n\t"
+
+ "pmullw %%mm6, %%mm0 \n\t"
+ "pmullw %%mm6, %%mm1 \n\t"
+
+ "movq (%0, %3), %%mm2 \n\t"
+ "movq 8(%0, %3), %%mm3 \n\t"
+
+ "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
+ "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
+
+ "pxor %%mm2, %%mm0 \n\t"
+ "pxor %%mm3, %%mm1 \n\t"
+
+ "paddw %%mm7, %%mm0 \n\t"
+ "paddw %%mm7, %%mm1 \n\t"
+
+ "pxor %%mm0, %%mm2 \n\t"
+ "pxor %%mm1, %%mm3 \n\t"
+
+ "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
+ "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
+
+ "pandn %%mm2, %%mm0 \n\t"
+ "pandn %%mm3, %%mm1 \n\t"
+
+ "movq %%mm0, (%0, %3) \n\t"
+ "movq %%mm1, 8(%0, %3) \n\t"
+
+ "addl $16, %3 \n\t"
+ "jng 1b \n\t"
+ ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
+ : "memory"
+ );
+ block[0]= level;
+}
+
+
+static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ int level, qmul, qadd, nCoeffs;
+
+ qmul = qscale << 1;
+ qadd = (qscale - 1) | 1;
+
+ assert(s->block_last_index[n]>=0 || s->h263_aic);
+
+ nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
//printf("%d %d ", qmul, qadd);
asm volatile(
"movd %1, %%mm6 \n\t" //qmul
@@ -104,8 +165,6 @@ asm volatile(
::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
: "memory"
);
- if(s->mb_intra)
- block[0]= level;
}
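
Both new H.263 variants run the same MMX loop; splitting intra from inter removes the per-call branch on s->mb_intra and leaves the DC term to the intra version alone (block[0] scaled by y_dc_scale/c_dc_scale and written back after the loop). Over indices 0..nCoeffs the loop is equivalent to the following scalar code (sketch; h263_dequant_ref is not part of the patch):

    /* level != 0:  level' = sign(level) * (|level| * qmul + qadd); zeros stay zero */
    static void h263_dequant_ref(DCTELEM *block, int nCoeffs, int qmul, int qadd)
    {
        int i, level;
        for (i = 0; i <= nCoeffs; i++) {
            level = block[i];
            if (level > 0)
                block[i] = level * qmul + qadd;
            else if (level < 0)
                block[i] = level * qmul - qadd;
        }
    }
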
@@ -138,24 +197,23 @@ asm volatile(
high3:low3 = low1*low2
high3 += tlow1
*/
-static void dct_unquantize_mpeg1_mmx(MpegEncContext *s,
+static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
int nCoeffs;
const uint16_t *quant_matrix;
+ int block0;
assert(s->block_last_index[n]>=0);
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
- if (s->mb_intra) {
- int block0;
- if (n < 4)
- block0 = block[0] * s->y_dc_scale;
- else
- block0 = block[0] * s->c_dc_scale;
- /* XXX: only mpeg1 */
- quant_matrix = s->intra_matrix;
+ if (n < 4)
+ block0 = block[0] * s->y_dc_scale;
+ else
+ block0 = block[0] * s->c_dc_scale;
+ /* XXX: only mpeg1 */
+ quant_matrix = s->intra_matrix;
asm volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
@@ -205,9 +263,19 @@ asm volatile(
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
: "%eax", "memory"
);
- block[0]= block0;
+ block[0]= block0;
+}
+
+static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ int nCoeffs;
+ const uint16_t *quant_matrix;
+
+ assert(s->block_last_index[n]>=0);
+
+ nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
- } else {
quant_matrix = s->inter_matrix;
asm volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
@@ -262,28 +330,25 @@ asm volatile(
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
: "%eax", "memory"
);
- }
-
}
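
The MPEG-1 path gets the same treatment as H.263 above: the single function that branched on s->mb_intra becomes an _intra and an _inter entry, with the intra one keeping the DC handling (block0 from y_dc_scale/c_dc_scale) and the inter one keeping only the AC loop. The caller is then expected to pick the right one up front, roughly like this (sketch; the generic-layer pointer names dct_unquantize_intra/inter are assumptions, not taken from this patch):

    static void dct_unquantize_dispatch(MpegEncContext *s, DCTELEM *block,
                                        int n, int qscale)
    {
        if (s->mb_intra)
            s->dct_unquantize_intra(s, block, n, qscale);  /* e.g. dct_unquantize_mpeg1_intra_mmx */
        else
            s->dct_unquantize_inter(s, block, n, qscale);  /* e.g. dct_unquantize_mpeg1_inter_mmx */
    }
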
-static void dct_unquantize_mpeg2_mmx(MpegEncContext *s,
+static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
DCTELEM *block, int n, int qscale)
{
int nCoeffs;
const uint16_t *quant_matrix;
+ int block0;
assert(s->block_last_index[n]>=0);
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
- if (s->mb_intra) {
- int block0;
- if (n < 4)
- block0 = block[0] * s->y_dc_scale;
- else
- block0 = block[0] * s->c_dc_scale;
- quant_matrix = s->intra_matrix;
+ if (n < 4)
+ block0 = block[0] * s->y_dc_scale;
+ else
+ block0 = block[0] * s->c_dc_scale;
+ quant_matrix = s->intra_matrix;
asm volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
@@ -329,10 +394,21 @@ asm volatile(
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
: "%eax", "memory"
);
- block[0]= block0;
+ block[0]= block0;
//Note, we dont do mismatch control for intra as errors cannot accumulate
+}
+
+static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
+ DCTELEM *block, int n, int qscale)
+{
+ int nCoeffs;
+ const uint16_t *quant_matrix;
+
+ assert(s->block_last_index[n]>=0);
+
+ if(s->alternate_scan) nCoeffs= 63; //FIXME
+ else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
- } else {
quant_matrix = s->inter_matrix;
asm volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
@@ -397,7 +473,6 @@ asm volatile(
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs)
: "%eax", "memory"
);
- }
}
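
For context on the mismatch-control note above: MPEG-2 mismatch control forces the sum of all 64 dequantised coefficients to be odd by toggling the least significant bit of the last coefficient, so IDCT rounding errors cannot drift between encoder and decoder; the comment argues that skipping it for intra blocks is acceptable because such errors do not accumulate across predictions. A scalar sketch of the operation itself (mpeg2_mismatch_ref is hypothetical and is not a claim about what the MMX inter loop above does):

    static void mpeg2_mismatch_ref(DCTELEM block[64])
    {
        int i, sum = 0;
        for (i = 0; i < 64; i++)
            sum += block[i];
        if (!(sum & 1))
            block[63] ^= 1;     /* force the coefficient sum to be odd */
    }
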
/* draw the edges of width 'w' of an image of size width, height
@@ -488,13 +563,130 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
}
}
+static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){
+ const int intra= s->mb_intra;
+ int *sum= s->dct_error_sum[intra];
+ uint16_t *offset= s->dct_offset[intra];
+
+ s->dct_count[intra]++;
+
+ asm volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "1: \n\t"
+ "pxor %%mm0, %%mm0 \n\t"
+ "pxor %%mm1, %%mm1 \n\t"
+ "movq (%0), %%mm2 \n\t"
+ "movq 8(%0), %%mm3 \n\t"
+ "pcmpgtw %%mm2, %%mm0 \n\t"
+ "pcmpgtw %%mm3, %%mm1 \n\t"
+ "pxor %%mm0, %%mm2 \n\t"
+ "pxor %%mm1, %%mm3 \n\t"
+ "psubw %%mm0, %%mm2 \n\t"
+ "psubw %%mm1, %%mm3 \n\t"
+ "movq %%mm2, %%mm4 \n\t"
+ "movq %%mm3, %%mm5 \n\t"
+ "psubusw (%2), %%mm2 \n\t"
+ "psubusw 8(%2), %%mm3 \n\t"
+ "pxor %%mm0, %%mm2 \n\t"
+ "pxor %%mm1, %%mm3 \n\t"
+ "psubw %%mm0, %%mm2 \n\t"
+ "psubw %%mm1, %%mm3 \n\t"
+ "movq %%mm2, (%0) \n\t"
+ "movq %%mm3, 8(%0) \n\t"
+ "movq %%mm4, %%mm2 \n\t"
+ "movq %%mm5, %%mm3 \n\t"
+ "punpcklwd %%mm7, %%mm4 \n\t"
+ "punpckhwd %%mm7, %%mm2 \n\t"
+ "punpcklwd %%mm7, %%mm5 \n\t"
+ "punpckhwd %%mm7, %%mm3 \n\t"
+ "paddd (%1), %%mm4 \n\t"
+ "paddd 8(%1), %%mm2 \n\t"
+ "paddd 16(%1), %%mm5 \n\t"
+ "paddd 24(%1), %%mm3 \n\t"
+ "movq %%mm4, (%1) \n\t"
+ "movq %%mm2, 8(%1) \n\t"
+ "movq %%mm5, 16(%1) \n\t"
+ "movq %%mm3, 24(%1) \n\t"
+ "addl $16, %0 \n\t"
+ "addl $32, %1 \n\t"
+ "addl $16, %2 \n\t"
+ "cmpl %3, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (block), "+r" (sum), "+r" (offset)
+ : "r"(block+64)
+ );
+}
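
The SSE2 variant below is the same algorithm working on 16 coefficients per iteration instead of 8. Per coefficient, both reduce to: remember the absolute value in dct_error_sum (so the offsets can be adapted from observed statistics), shrink the coefficient towards zero by dct_offset with saturation at zero (the psubusw), and put the sign back. Scalar sketch (denoise_dct_ref is not part of the patch):

    static void denoise_dct_ref(MpegEncContext *s, DCTELEM *block)
    {
        const int intra  = s->mb_intra;
        int      *sum    = s->dct_error_sum[intra];
        uint16_t *offset = s->dct_offset[intra];
        int i;

        s->dct_count[intra]++;

        for (i = 0; i < 64; i++) {
            int level = block[i];
            int a     = level < 0 ? -level : level;   /* |level| */

            sum[i] += a;                /* accumulate pre-threshold magnitude */
            a      -= offset[i];
            if (a < 0)
                a = 0;                  /* saturate at zero */
            block[i] = level < 0 ? -a : a;
        }
    }
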
+
+static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){
+ const int intra= s->mb_intra;
+ int *sum= s->dct_error_sum[intra];
+ uint16_t *offset= s->dct_offset[intra];
+
+ s->dct_count[intra]++;
+
+ asm volatile(
+ "pxor %%xmm7, %%xmm7 \n\t"
+ "1: \n\t"
+ "pxor %%xmm0, %%xmm0 \n\t"
+ "pxor %%xmm1, %%xmm1 \n\t"
+ "movdqa (%0), %%xmm2 \n\t"
+ "movdqa 16(%0), %%xmm3 \n\t"
+ "pcmpgtw %%xmm2, %%xmm0 \n\t"
+ "pcmpgtw %%xmm3, %%xmm1 \n\t"
+ "pxor %%xmm0, %%xmm2 \n\t"
+ "pxor %%xmm1, %%xmm3 \n\t"
+ "psubw %%xmm0, %%xmm2 \n\t"
+ "psubw %%xmm1, %%xmm3 \n\t"
+ "movdqa %%xmm2, %%xmm4 \n\t"
+ "movdqa %%xmm3, %%xmm5 \n\t"
+ "psubusw (%2), %%xmm2 \n\t"
+ "psubusw 16(%2), %%xmm3 \n\t"
+ "pxor %%xmm0, %%xmm2 \n\t"
+ "pxor %%xmm1, %%xmm3 \n\t"
+ "psubw %%xmm0, %%xmm2 \n\t"
+ "psubw %%xmm1, %%xmm3 \n\t"
+ "movdqa %%xmm2, (%0) \n\t"
+ "movdqa %%xmm3, 16(%0) \n\t"
+ "movdqa %%xmm4, %%xmm6 \n\t"
+ "movdqa %%xmm5, %%xmm0 \n\t"
+ "punpcklwd %%xmm7, %%xmm4 \n\t"
+ "punpckhwd %%xmm7, %%xmm6 \n\t"
+ "punpcklwd %%xmm7, %%xmm5 \n\t"
+ "punpckhwd %%xmm7, %%xmm0 \n\t"
+ "paddd (%1), %%xmm4 \n\t"
+ "paddd 16(%1), %%xmm6 \n\t"
+ "paddd 32(%1), %%xmm5 \n\t"
+ "paddd 48(%1), %%xmm0 \n\t"
+ "movdqa %%xmm4, (%1) \n\t"
+ "movdqa %%xmm6, 16(%1) \n\t"
+ "movdqa %%xmm5, 32(%1) \n\t"
+ "movdqa %%xmm0, 48(%1) \n\t"
+ "addl $32, %0 \n\t"
+ "addl $64, %1 \n\t"
+ "addl $32, %2 \n\t"
+ "cmpl %3, %0 \n\t"
+ " jb 1b \n\t"
+ : "+r" (block), "+r" (sum), "+r" (offset)
+ : "r"(block+64)
+ );
+}
+
#undef HAVE_MMX2
#define RENAME(a) a ## _MMX
+#define RENAMEl(a) a ## _mmx
#include "mpegvideo_mmx_template.c"
#define HAVE_MMX2
#undef RENAME
+#undef RENAMEl
#define RENAME(a) a ## _MMX2
+#define RENAMEl(a) a ## _mmx2
+#include "mpegvideo_mmx_template.c"
+
+#undef RENAME
+#undef RENAMEl
+#define RENAME(a) a ## _SSE2
+#define RENAMEl(a) a ## _sse2
#include "mpegvideo_mmx_template.c"
void MPV_common_init_mmx(MpegEncContext *s)
@@ -502,14 +694,25 @@ void MPV_common_init_mmx(MpegEncContext *s)
if (mm_flags & MM_MMX) {
const int dct_algo = s->avctx->dct_algo;
- s->dct_unquantize_h263 = dct_unquantize_h263_mmx;
- s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx;
- s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx;
+ s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
+ s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
+ s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
+ s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
+ s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
+ s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
draw_edges = draw_edges_mmx;
+
+ if (mm_flags & MM_SSE2) {
+ s->denoise_dct= denoise_dct_sse2;
+ } else {
+ s->denoise_dct= denoise_dct_mmx;
+ }
if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
- if(mm_flags & MM_MMXEXT){
+ if(mm_flags & MM_SSE2){
+ s->dct_quantize= dct_quantize_SSE2;
+ } else if(mm_flags & MM_MMXEXT){
s->dct_quantize= dct_quantize_MMX2;
} else {
s->dct_quantize= dct_quantize_MMX;
diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
index 706211eec..d4ed61ecb 100644
--- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
+++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c
@@ -43,7 +43,10 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
//s->fdct (block);
- ff_fdct_mmx (block); //cant be anything else ...
+ RENAMEl(ff_fdct) (block); //cant be anything else ...
+
+ if(s->dct_error_sum)
+ s->denoise_dct(s, block);
if (s->mb_intra) {
int dummy;
@@ -76,12 +79,12 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
block[0]=0; //avoid fake overflow
// temp_block[0] = (block[0] + (q >> 1)) / q;
last_non_zero_p1 = 1;
- bias = s->q_intra_matrix16_bias[qscale];
- qmat = s->q_intra_matrix16[qscale];
+ bias = s->q_intra_matrix16[qscale][1];
+ qmat = s->q_intra_matrix16[qscale][0];
} else {
last_non_zero_p1 = 0;
- bias = s->q_inter_matrix16_bias[qscale];
- qmat = s->q_inter_matrix16[qscale];
+ bias = s->q_inter_matrix16[qscale][1];
+ qmat = s->q_inter_matrix16[qscale][0];
}
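
The separate *_bias tables disappear: per qscale, the 16-bit quantisation multipliers and their rounding-bias words now sit side by side in one array, presumably so a single base pointer per qscale reaches both. Old versus new layout, as far as it can be inferred from the two-index accesses above (array bounds and element type are assumptions):

    #include <stdint.h>

    /* before: two parallel per-qscale tables */
    uint16_t q_intra_matrix16_old [32][64];       /* multipliers              */
    uint16_t q_intra_matrix16_bias[32][64];       /* rounding bias            */

    /* after: one table, multipliers and bias interleaved per qscale */
    uint16_t q_intra_matrix16_new [32][2][64];    /* [q][0]=mult, [q][1]=bias */
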
if(s->out_format == FMT_H263 && s->mpeg_quant==0){
diff --git a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c
index 1ee88b634..626c1f565 100644
--- a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c
@@ -18,7 +18,6 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include "../dsputil.h"
-#include "../simple_idct.h"
/*
23170.475006