author | Mike Melanson <mike@multimedia.cx> | 2004-02-01 05:31:16 +0000
---|---|---
committer | Mike Melanson <mike@multimedia.cx> | 2004-02-01 05:31:16 +0000
commit | 61d793ef13ac2ef8f9c2b41b71430f21fac80337 (patch) |
tree | 69a1b20b01993b4d61fe1c8c2be33dd2e362a40c /src/libffmpeg/libavcodec/i386 |
parent | f707774ac5d48c02c6a36327304d88629b0e38f9 (diff) |
sync to ffmpeg build 4699
CVS patchset: 6090
CVS date: 2004/02/01 05:31:16
Diffstat (limited to 'src/libffmpeg/libavcodec/i386')
-rw-r--r-- | src/libffmpeg/libavcodec/i386/cputest.c | 115
-rw-r--r-- | src/libffmpeg/libavcodec/i386/dsputil_mmx.c | 474
-rw-r--r-- | src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h | 2
-rw-r--r-- | src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h | 1
-rw-r--r-- | src/libffmpeg/libavcodec/i386/fdct_mmx.c | 501
-rw-r--r-- | src/libffmpeg/libavcodec/i386/idct_mmx.c | 7
-rw-r--r-- | src/libffmpeg/libavcodec/i386/mmx.h | 244
-rw-r--r-- | src/libffmpeg/libavcodec/i386/motion_est_mmx.c | 123
-rw-r--r-- | src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c | 297
-rw-r--r-- | src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c | 13
-rw-r--r-- | src/libffmpeg/libavcodec/i386/simple_idct_mmx.c | 1
11 files changed, 1510 insertions, 268 deletions
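For orientation before the full diff: the new cputest.c drops xine's xine_mm_accel() wrapper and detects CPU features directly via CPUID. The sketch below is illustrative only, not the patch's code — it uses GCC's `<cpuid.h>` helper and local flag names instead of the hand-written inline asm and the MM_* constants from dsputil.h — but it tests the same CPUID.1:EDX feature bits (23 = MMX, 25 = SSE, 26 = SSE2) that the patched mm_support() checks.

```c
/* Minimal CPU-feature probe, same spirit as the new mm_support() in cputest.c.
 * Hypothetical names; real flags live in libavcodec's dsputil.h. */
#include <cpuid.h>
#include <stdio.h>

enum { MM_MMX = 1, MM_MMXEXT = 2, MM_SSE = 4, MM_SSE2 = 8 };

static int mm_support_sketch(void)
{
    unsigned int eax, ebx, ecx, edx;
    int rval = 0;

    /* __get_cpuid() returns 0 if CPUID leaf 1 is unavailable. */
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 0;

    if (edx & (1u << 23)) rval |= MM_MMX;               /* MMX           */
    if (edx & (1u << 25)) rval |= MM_MMXEXT | MM_SSE;   /* SSE => MMXEXT */
    if (edx & (1u << 26)) rval |= MM_SSE2;              /* SSE2          */
    return rval;
}

int main(void)
{
    printf("accel flags: 0x%x\n", mm_support_sketch());
    return 0;
}
```

The real patch additionally queries extended CPUID leaves on AMD, VIA C3, and Cyrix parts to pick up 3DNow! and vendor-specific MMXEXT reporting, as shown in the cputest.c hunk below.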
diff --git a/src/libffmpeg/libavcodec/i386/cputest.c b/src/libffmpeg/libavcodec/i386/cputest.c index b885548ee..b50d653c4 100644 --- a/src/libffmpeg/libavcodec/i386/cputest.c +++ b/src/libffmpeg/libavcodec/i386/cputest.c @@ -1,13 +1,122 @@ -/* dummy file to use xine mm_support function */ +/* Cpu detection code, extracted from mmx.h ((c)1997-99 by H. Dietz + and R. Fisher). Converted to C and improved by Fabrice Bellard */ -#include "xineutils.h" +#include <stdlib.h> #include "../dsputil.h" +/* ebx saving is necessary for PIC. gcc seems unable to see it alone */ +#define cpuid(index,eax,ebx,ecx,edx)\ + __asm __volatile\ + ("movl %%ebx, %%esi\n\t"\ + "cpuid\n\t"\ + "xchgl %%ebx, %%esi"\ + : "=a" (eax), "=S" (ebx),\ + "=c" (ecx), "=d" (edx)\ + : "0" (index)); /* Function to test if multimedia instructions are supported... */ int mm_support(void) { - return xine_mm_accel(); + int rval; + int eax, ebx, ecx, edx; + + __asm__ __volatile__ ( + /* See if CPUID instruction is supported ... */ + /* ... Get copies of EFLAGS into eax and ecx */ + "pushf\n\t" + "popl %0\n\t" + "movl %0, %1\n\t" + + /* ... Toggle the ID bit in one copy and store */ + /* to the EFLAGS reg */ + "xorl $0x200000, %0\n\t" + "push %0\n\t" + "popf\n\t" + + /* ... Get the (hopefully modified) EFLAGS */ + "pushf\n\t" + "popl %0\n\t" + : "=a" (eax), "=c" (ecx) + : + : "cc" + ); + + if (eax == ecx) + return 0; /* CPUID not supported */ + + cpuid(0, eax, ebx, ecx, edx); + + if (ebx == 0x756e6547 && + edx == 0x49656e69 && + ecx == 0x6c65746e) { + + /* intel */ + inteltest: + cpuid(1, eax, ebx, ecx, edx); + if ((edx & 0x00800000) == 0) + return 0; + rval = MM_MMX; + if (edx & 0x02000000) + rval |= MM_MMXEXT | MM_SSE; + if (edx & 0x04000000) + rval |= MM_SSE2; + return rval; + } else if (ebx == 0x68747541 && + edx == 0x69746e65 && + ecx == 0x444d4163) { + /* AMD */ + cpuid(0x80000000, eax, ebx, ecx, edx); + if ((unsigned)eax < 0x80000001) + goto inteltest; + cpuid(0x80000001, eax, ebx, ecx, edx); + if ((edx & 0x00800000) == 0) + return 0; + rval = MM_MMX; + if (edx & 0x80000000) + rval |= MM_3DNOW; + if (edx & 0x00400000) + rval |= MM_MMXEXT; + return rval; + } else if (ebx == 0x746e6543 && + edx == 0x48727561 && + ecx == 0x736c7561) { /* "CentaurHauls" */ + /* VIA C3 */ + cpuid(0x80000000, eax, ebx, ecx, edx); + if ((unsigned)eax < 0x80000001) + goto inteltest; + cpuid(0x80000001, eax, ebx, ecx, edx); + rval = 0; + if( edx & ( 1 << 31) ) + rval |= MM_3DNOW; + if( edx & ( 1 << 23) ) + rval |= MM_MMX; + if( edx & ( 1 << 24) ) + rval |= MM_MMXEXT; + return rval; + } else if (ebx == 0x69727943 && + edx == 0x736e4978 && + ecx == 0x64616574) { + /* Cyrix Section */ + /* See if extended CPUID level 80000001 is supported */ + /* The value of CPUID/80000001 for the 6x86MX is undefined + according to the Cyrix CPU Detection Guide (Preliminary + Rev. 1.01 table 1), so we'll check the value of eax for + CPUID/0 to see if standard CPUID level 2 is supported. + According to the table, the only CPU which supports level + 2 is also the only one which supports extended CPUID levels. 
+ */ + if (eax != 2) + goto inteltest; + cpuid(0x80000001, eax, ebx, ecx, edx); + if ((eax & 0x00800000) == 0) + return 0; + rval = MM_MMX; + if (eax & 0x01000000) + rval |= MM_MMXEXT; + return rval; + } else { + return 0; + } } #ifdef __TEST__ diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c index a1e1642d5..efa022557 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c @@ -1,6 +1,7 @@ /* * MMX optimized DSP utils * Copyright (c) 2000, 2001 Fabrice Bellard. + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -22,6 +23,11 @@ #include "../dsputil.h" #include "../simple_idct.h" +//#undef NDEBUG +//#include <assert.h> + +extern const uint8_t ff_h263_loop_filter_strength[32]; + int mm_flags; /* multimedia extension flags */ /* pixel operations */ @@ -34,6 +40,8 @@ static const uint64_t ff_pw_3 __attribute__ ((aligned(8))) = 0x0003000300030003 static const uint64_t ff_pw_16 __attribute__ ((aligned(8))) = 0x0010001000100010ULL; static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; +static const uint64_t ff_pb_FC __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL; + #define JUMPALIGN() __asm __volatile (".balign 8"::) #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::) @@ -465,6 +473,180 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ dst[i+0] += src[i+0]; } +#define H263_LOOP_FILTER \ + "pxor %%mm7, %%mm7 \n\t"\ + "movq %0, %%mm0 \n\t"\ + "movq %0, %%mm1 \n\t"\ + "movq %3, %%mm2 \n\t"\ + "movq %3, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm0 \n\t"\ + "punpckhbw %%mm7, %%mm1 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "psubw %%mm2, %%mm0 \n\t"\ + "psubw %%mm3, %%mm1 \n\t"\ + "movq %1, %%mm2 \n\t"\ + "movq %1, %%mm3 \n\t"\ + "movq %2, %%mm4 \n\t"\ + "movq %2, %%mm5 \n\t"\ + "punpcklbw %%mm7, %%mm2 \n\t"\ + "punpckhbw %%mm7, %%mm3 \n\t"\ + "punpcklbw %%mm7, %%mm4 \n\t"\ + "punpckhbw %%mm7, %%mm5 \n\t"\ + "psubw %%mm2, %%mm4 \n\t"\ + "psubw %%mm3, %%mm5 \n\t"\ + "psllw $2, %%mm4 \n\t"\ + "psllw $2, %%mm5 \n\t"\ + "paddw %%mm0, %%mm4 \n\t"\ + "paddw %%mm1, %%mm5 \n\t"\ + "pxor %%mm6, %%mm6 \n\t"\ + "pcmpgtw %%mm4, %%mm6 \n\t"\ + "pcmpgtw %%mm5, %%mm7 \n\t"\ + "pxor %%mm6, %%mm4 \n\t"\ + "pxor %%mm7, %%mm5 \n\t"\ + "psubw %%mm6, %%mm4 \n\t"\ + "psubw %%mm7, %%mm5 \n\t"\ + "psrlw $3, %%mm4 \n\t"\ + "psrlw $3, %%mm5 \n\t"\ + "packuswb %%mm5, %%mm4 \n\t"\ + "packsswb %%mm7, %%mm6 \n\t"\ + "pxor %%mm7, %%mm7 \n\t"\ + "movd %4, %%mm2 \n\t"\ + "punpcklbw %%mm2, %%mm2 \n\t"\ + "punpcklbw %%mm2, %%mm2 \n\t"\ + "punpcklbw %%mm2, %%mm2 \n\t"\ + "psubusb %%mm4, %%mm2 \n\t"\ + "movq %%mm2, %%mm3 \n\t"\ + "psubusb %%mm4, %%mm3 \n\t"\ + "psubb %%mm3, %%mm2 \n\t"\ + "movq %1, %%mm3 \n\t"\ + "movq %2, %%mm4 \n\t"\ + "pxor %%mm6, %%mm3 \n\t"\ + "pxor %%mm6, %%mm4 \n\t"\ + "paddusb %%mm2, %%mm3 \n\t"\ + "psubusb %%mm2, %%mm4 \n\t"\ + "pxor %%mm6, %%mm3 \n\t"\ + "pxor %%mm6, %%mm4 \n\t"\ + "paddusb %%mm2, %%mm2 \n\t"\ + "packsswb %%mm1, %%mm0 \n\t"\ + "pcmpgtb %%mm0, %%mm7 \n\t"\ + "pxor %%mm7, %%mm0 \n\t"\ + "psubb %%mm7, %%mm0 \n\t"\ + "movq %%mm0, %%mm1 \n\t"\ + "psubusb %%mm2, %%mm0 \n\t"\ + "psubb %%mm0, %%mm1 \n\t"\ + "pand %5, %%mm1 \n\t"\ + "psrlw $2, %%mm1 \n\t"\ + "pxor %%mm7, %%mm1 \n\t"\ + "psubb %%mm7, %%mm1 \n\t"\ + "movq %0, %%mm5 \n\t"\ + "movq %3, %%mm6 
\n\t"\ + "psubb %%mm1, %%mm5 \n\t"\ + "paddb %%mm1, %%mm6 \n\t" + +static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){ + const int strength= ff_h263_loop_filter_strength[qscale]; + + asm volatile( + + H263_LOOP_FILTER + + "movq %%mm3, %1 \n\t" + "movq %%mm4, %2 \n\t" + "movq %%mm5, %0 \n\t" + "movq %%mm6, %3 \n\t" + : "+m" (*(uint64_t*)(src - 2*stride)), + "+m" (*(uint64_t*)(src - 1*stride)), + "+m" (*(uint64_t*)(src + 0*stride)), + "+m" (*(uint64_t*)(src + 1*stride)) + : "g" (2*strength), "m"(ff_pb_FC) + ); +} + +static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ + asm volatile( //FIXME could save 1 instruction if done as 8x4 ... + "movd %4, %%mm0 \n\t" + "movd %5, %%mm1 \n\t" + "movd %6, %%mm2 \n\t" + "movd %7, %%mm3 \n\t" + "punpcklbw %%mm1, %%mm0 \n\t" + "punpcklbw %%mm3, %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "punpcklwd %%mm2, %%mm0 \n\t" + "punpckhwd %%mm2, %%mm1 \n\t" + "movd %%mm0, %0 \n\t" + "punpckhdq %%mm0, %%mm0 \n\t" + "movd %%mm0, %1 \n\t" + "movd %%mm1, %2 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movd %%mm1, %3 \n\t" + + : "=m" (*(uint32_t*)(dst + 0*dst_stride)), + "=m" (*(uint32_t*)(dst + 1*dst_stride)), + "=m" (*(uint32_t*)(dst + 2*dst_stride)), + "=m" (*(uint32_t*)(dst + 3*dst_stride)) + : "m" (*(uint32_t*)(src + 0*src_stride)), + "m" (*(uint32_t*)(src + 1*src_stride)), + "m" (*(uint32_t*)(src + 2*src_stride)), + "m" (*(uint32_t*)(src + 3*src_stride)) + ); +} + +static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){ + const int strength= ff_h263_loop_filter_strength[qscale]; + uint64_t temp[4] __attribute__ ((aligned(8))); + uint8_t *btemp= (uint8_t*)temp; + + src -= 2; + + transpose4x4(btemp , src , 8, stride); + transpose4x4(btemp+4, src + 4*stride, 8, stride); + asm volatile( + H263_LOOP_FILTER // 5 3 4 6 + + : "+m" (temp[0]), + "+m" (temp[1]), + "+m" (temp[2]), + "+m" (temp[3]) + : "g" (2*strength), "m"(ff_pb_FC) + ); + + asm volatile( + "movq %%mm5, %%mm1 \n\t" + "movq %%mm4, %%mm0 \n\t" + "punpcklbw %%mm3, %%mm5 \n\t" + "punpcklbw %%mm6, %%mm4 \n\t" + "punpckhbw %%mm3, %%mm1 \n\t" + "punpckhbw %%mm6, %%mm0 \n\t" + "movq %%mm5, %%mm3 \n\t" + "movq %%mm1, %%mm6 \n\t" + "punpcklwd %%mm4, %%mm5 \n\t" + "punpcklwd %%mm0, %%mm1 \n\t" + "punpckhwd %%mm4, %%mm3 \n\t" + "punpckhwd %%mm0, %%mm6 \n\t" + "movd %%mm5, %0 \n\t" + "punpckhdq %%mm5, %%mm5 \n\t" + "movd %%mm5, %1 \n\t" + "movd %%mm3, %2 \n\t" + "punpckhdq %%mm3, %%mm3 \n\t" + "movd %%mm3, %3 \n\t" + "movd %%mm1, %4 \n\t" + "punpckhdq %%mm1, %%mm1 \n\t" + "movd %%mm1, %5 \n\t" + "movd %%mm6, %6 \n\t" + "punpckhdq %%mm6, %%mm6 \n\t" + "movd %%mm6, %7 \n\t" + : "=m" (*(uint32_t*)(src + 0*stride)), + "=m" (*(uint32_t*)(src + 1*stride)), + "=m" (*(uint32_t*)(src + 2*stride)), + "=m" (*(uint32_t*)(src + 3*stride)), + "=m" (*(uint32_t*)(src + 4*stride)), + "=m" (*(uint32_t*)(src + 5*stride)), + "=m" (*(uint32_t*)(src + 6*stride)), + "=m" (*(uint32_t*)(src + 7*stride)) + ); +} + #ifdef CONFIG_ENCODERS static int pix_norm1_mmx(uint8_t *pix, int line_size) { int tmp; @@ -509,10 +691,10 @@ static int pix_norm1_mmx(uint8_t *pix, int line_size) { return tmp; } -static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) { +static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { int tmp; asm volatile ( - "movl $16,%%ecx\n" + "movl %4,%%ecx\n" "pxor %%mm0,%%mm0\n" /* mm0 = 0 */ "pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ "1:\n" @@ -563,10 +745,252 @@ static int sse16_mmx(void *v, uint8_t * pix1, 
uint8_t * pix2, int line_size) { "psrlq $32, %%mm7\n" /* shift hi dword to lo */ "paddd %%mm7,%%mm1\n" "movd %%mm1,%2\n" - : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" (line_size) : "ecx"); + : "+r" (pix1), "+r" (pix2), "=r"(tmp) + : "r" (line_size) , "m" (h) + : "%ecx"); return tmp; } +static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { + int tmp; + + assert( (((int)pix) & 7) == 0); + assert((line_size &7) ==0); + +#define SUM(in0, in1, out0, out1) \ + "movq (%0), %%mm2\n"\ + "movq 8(%0), %%mm3\n"\ + "addl %2,%0\n"\ + "movq %%mm2, " #out0 "\n"\ + "movq %%mm3, " #out1 "\n"\ + "psubusb " #in0 ", %%mm2\n"\ + "psubusb " #in1 ", %%mm3\n"\ + "psubusb " #out0 ", " #in0 "\n"\ + "psubusb " #out1 ", " #in1 "\n"\ + "por %%mm2, " #in0 "\n"\ + "por %%mm3, " #in1 "\n"\ + "movq " #in0 ", %%mm2\n"\ + "movq " #in1 ", %%mm3\n"\ + "punpcklbw %%mm7, " #in0 "\n"\ + "punpcklbw %%mm7, " #in1 "\n"\ + "punpckhbw %%mm7, %%mm2\n"\ + "punpckhbw %%mm7, %%mm3\n"\ + "paddw " #in1 ", " #in0 "\n"\ + "paddw %%mm3, %%mm2\n"\ + "paddw %%mm2, " #in0 "\n"\ + "paddw " #in0 ", %%mm6\n" + + + asm volatile ( + "movl %3,%%ecx\n" + "pxor %%mm6,%%mm6\n" + "pxor %%mm7,%%mm7\n" + "movq (%0),%%mm0\n" + "movq 8(%0),%%mm1\n" + "addl %2,%0\n" + "subl $2, %%ecx\n" + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + "1:\n" + + SUM(%%mm4, %%mm5, %%mm0, %%mm1) + + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + + "subl $2, %%ecx\n" + "jnz 1b\n" + + "movq %%mm6,%%mm0\n" + "psrlq $32, %%mm6\n" + "paddw %%mm6,%%mm0\n" + "movq %%mm0,%%mm6\n" + "psrlq $16, %%mm0\n" + "paddw %%mm6,%%mm0\n" + "movd %%mm0,%1\n" + : "+r" (pix), "=r"(tmp) + : "r" (line_size) , "m" (h) + : "%ecx"); + return tmp & 0xFFFF; +} +#undef SUM + +static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { + int tmp; + + assert( (((int)pix) & 7) == 0); + assert((line_size &7) ==0); + +#define SUM(in0, in1, out0, out1) \ + "movq (%0), " #out0 "\n"\ + "movq 8(%0), " #out1 "\n"\ + "addl %2,%0\n"\ + "psadbw " #out0 ", " #in0 "\n"\ + "psadbw " #out1 ", " #in1 "\n"\ + "paddw " #in1 ", " #in0 "\n"\ + "paddw " #in0 ", %%mm6\n" + + asm volatile ( + "movl %3,%%ecx\n" + "pxor %%mm6,%%mm6\n" + "pxor %%mm7,%%mm7\n" + "movq (%0),%%mm0\n" + "movq 8(%0),%%mm1\n" + "addl %2,%0\n" + "subl $2, %%ecx\n" + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + "1:\n" + + SUM(%%mm4, %%mm5, %%mm0, %%mm1) + + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + + "subl $2, %%ecx\n" + "jnz 1b\n" + + "movd %%mm6,%1\n" + : "+r" (pix), "=r"(tmp) + : "r" (line_size) , "m" (h) + : "%ecx"); + return tmp; +} +#undef SUM + +static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { + int tmp; + + assert( (((int)pix1) & 7) == 0); + assert( (((int)pix2) & 7) == 0); + assert((line_size &7) ==0); + +#define SUM(in0, in1, out0, out1) \ + "movq (%0),%%mm2\n"\ + "movq (%1)," #out0 "\n"\ + "movq 8(%0),%%mm3\n"\ + "movq 8(%1)," #out1 "\n"\ + "addl %3,%0\n"\ + "addl %3,%1\n"\ + "psubb " #out0 ", %%mm2\n"\ + "psubb " #out1 ", %%mm3\n"\ + "pxor %%mm7, %%mm2\n"\ + "pxor %%mm7, %%mm3\n"\ + "movq %%mm2, " #out0 "\n"\ + "movq %%mm3, " #out1 "\n"\ + "psubusb " #in0 ", %%mm2\n"\ + "psubusb " #in1 ", %%mm3\n"\ + "psubusb " #out0 ", " #in0 "\n"\ + "psubusb " #out1 ", " #in1 "\n"\ + "por %%mm2, " #in0 "\n"\ + "por %%mm3, " #in1 "\n"\ + "movq " #in0 ", %%mm2\n"\ + "movq " #in1 ", %%mm3\n"\ + "punpcklbw %%mm7, " #in0 "\n"\ + "punpcklbw %%mm7, " #in1 "\n"\ + "punpckhbw %%mm7, %%mm2\n"\ + "punpckhbw %%mm7, %%mm3\n"\ + "paddw " #in1 ", " #in0 "\n"\ + "paddw %%mm3, %%mm2\n"\ + "paddw %%mm2, " #in0 "\n"\ + 
"paddw " #in0 ", %%mm6\n" + + + asm volatile ( + "movl %4,%%ecx\n" + "pxor %%mm6,%%mm6\n" + "pcmpeqw %%mm7,%%mm7\n" + "psllw $15, %%mm7\n" + "packsswb %%mm7, %%mm7\n" + "movq (%0),%%mm0\n" + "movq (%1),%%mm2\n" + "movq 8(%0),%%mm1\n" + "movq 8(%1),%%mm3\n" + "addl %3,%0\n" + "addl %3,%1\n" + "subl $2, %%ecx\n" + "psubb %%mm2, %%mm0\n" + "psubb %%mm3, %%mm1\n" + "pxor %%mm7, %%mm0\n" + "pxor %%mm7, %%mm1\n" + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + "1:\n" + + SUM(%%mm4, %%mm5, %%mm0, %%mm1) + + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + + "subl $2, %%ecx\n" + "jnz 1b\n" + + "movq %%mm6,%%mm0\n" + "psrlq $32, %%mm6\n" + "paddw %%mm6,%%mm0\n" + "movq %%mm0,%%mm6\n" + "psrlq $16, %%mm0\n" + "paddw %%mm6,%%mm0\n" + "movd %%mm0,%2\n" + : "+r" (pix1), "+r" (pix2), "=r"(tmp) + : "r" (line_size) , "m" (h) + : "%ecx"); + return tmp & 0x7FFF; +} +#undef SUM + +static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { + int tmp; + + assert( (((int)pix1) & 7) == 0); + assert( (((int)pix2) & 7) == 0); + assert((line_size &7) ==0); + +#define SUM(in0, in1, out0, out1) \ + "movq (%0)," #out0 "\n"\ + "movq (%1),%%mm2\n"\ + "movq 8(%0)," #out1 "\n"\ + "movq 8(%1),%%mm3\n"\ + "addl %3,%0\n"\ + "addl %3,%1\n"\ + "psubb %%mm2, " #out0 "\n"\ + "psubb %%mm3, " #out1 "\n"\ + "pxor %%mm7, " #out0 "\n"\ + "pxor %%mm7, " #out1 "\n"\ + "psadbw " #out0 ", " #in0 "\n"\ + "psadbw " #out1 ", " #in1 "\n"\ + "paddw " #in1 ", " #in0 "\n"\ + "paddw " #in0 ", %%mm6\n" + + asm volatile ( + "movl %4,%%ecx\n" + "pxor %%mm6,%%mm6\n" + "pcmpeqw %%mm7,%%mm7\n" + "psllw $15, %%mm7\n" + "packsswb %%mm7, %%mm7\n" + "movq (%0),%%mm0\n" + "movq (%1),%%mm2\n" + "movq 8(%0),%%mm1\n" + "movq 8(%1),%%mm3\n" + "addl %3,%0\n" + "addl %3,%1\n" + "subl $2, %%ecx\n" + "psubb %%mm2, %%mm0\n" + "psubb %%mm3, %%mm1\n" + "pxor %%mm7, %%mm0\n" + "pxor %%mm7, %%mm1\n" + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + "1:\n" + + SUM(%%mm4, %%mm5, %%mm0, %%mm1) + + SUM(%%mm0, %%mm1, %%mm4, %%mm5) + + "subl $2, %%ecx\n" + "jnz 1b\n" + + "movd %%mm6,%2\n" + : "+r" (pix1), "+r" (pix2), "=r"(tmp) + : "r" (line_size) , "m" (h) + : "%ecx"); + return tmp; +} +#undef SUM + static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ int i=0; asm volatile( @@ -588,7 +1012,6 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ for(; i<w; i++) dst[i+0] = src1[i+0]-src2[i+0]; } -#endif static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){ int i=0; @@ -626,8 +1049,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *left = src2[w-1]; } -#ifdef CONFIG_ENCODERS - #define LBUTTERFLY2(a1,b1,a2,b2)\ "paddw " #b1 ", " #a1 " \n\t"\ "paddw " #b2 ", " #a2 " \n\t"\ @@ -691,9 +1112,11 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t "movq "#c", "#o"+32(%1) \n\t"\ "movq "#d", "#o"+48(%1) \n\t"\ -static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride){ +static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){ uint64_t temp[16] __align8; int sum=0; + + assert(h==8); diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride); @@ -776,9 +1199,11 @@ static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride) return sum&0xFFFF; } -static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride){ +static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){ uint64_t temp[16] __align8; int 
sum=0; + + assert(h==8); diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride); @@ -862,8 +1287,8 @@ static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride } -WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx) -WARPER88_1616(hadamard8_diff_mmx2, hadamard8_diff16_mmx2) +WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx) +WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2) #endif //CONFIG_ENCODERS #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d) @@ -1602,12 +2027,19 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) #endif if (mm_flags & MM_MMX) { - const int idct_algo= avctx->idct_algo; -#ifdef CONFIG_ENCODERS const int dct_algo = avctx->dct_algo; + const int idct_algo= avctx->idct_algo; - if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX) - c->fdct = ff_fdct_mmx; +#ifdef CONFIG_ENCODERS + if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ + if(mm_flags & MM_SSE2){ + c->fdct = ff_fdct_sse2; + }else if(mm_flags & MM_MMXEXT){ + c->fdct = ff_fdct_mmx2; + }else{ + c->fdct = ff_fdct_mmx; + } + } #endif //CONFIG_ENCODERS if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ @@ -1688,7 +2120,15 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->pix_norm1 = pix_norm1_mmx; c->sse[0] = sse16_mmx; + c->vsad[4]= vsad_intra16_mmx; + + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->vsad[0] = vsad16_mmx; + } #endif //CONFIG_ENCODERS + + c->h263_v_loop_filter= h263_v_loop_filter_mmx; + c->h263_h_loop_filter= h263_h_loop_filter_mmx; if (mm_flags & MM_MMXEXT) { c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; @@ -1708,6 +2148,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) #ifdef CONFIG_ENCODERS c->hadamard8_diff[0]= hadamard8_diff16_mmx2; c->hadamard8_diff[1]= hadamard8_diff_mmx2; + c->vsad[4]= vsad_intra16_mmx2; #endif //CONFIG_ENCODERS if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ @@ -1717,6 +2158,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; +#ifdef CONFIG_ENCODERS + c->vsad[0] = vsad16_mmx2; +#endif //CONFIG_ENCODERS } #if 1 @@ -1754,7 +2198,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2) #endif +#ifdef CONFIG_ENCODERS c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; +#endif //CONFIG_ENCODERS } else if (mm_flags & MM_3DNOW) { c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h index 8418123ac..c8494f51a 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h @@ -1,7 +1,7 @@ /* * DSP utils : average functions are compiled twice for 3dnow/mmx2 * Copyright (c) 2000, 2001 Fabrice Bellard. 
- * Copyright (c) 2002 Michael Niedermayer + * Copyright (c) 2002-2004 Michael Niedermayer * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h index bbd5aec97..21f0bfd84 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h @@ -1,6 +1,7 @@ /* * DSP utils mmx functions are compiled twice for rnd/no_rnd * Copyright (c) 2000, 2001 Fabrice Bellard. + * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public diff --git a/src/libffmpeg/libavcodec/i386/fdct_mmx.c b/src/libffmpeg/libavcodec/i386/fdct_mmx.c index a2402c95d..877160773 100644 --- a/src/libffmpeg/libavcodec/i386/fdct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/fdct_mmx.c @@ -1,16 +1,21 @@ /* * MMX optimized forward DCT * The gcc porting is Copyright (c) 2001 Fabrice Bellard. + * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * SSE2 optimization is Copyright (c) 2004 Denes Balatoni. * * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT * * Intel Application Note AP-922 - fast, precise implementation of DCT * http://developer.intel.com/vtune/cbts/appnotes.htm + * + * Also of inspiration: + * a page about fdct at http://www.geocities.com/ssavekar/dct.htm + * Skal's fdct at http://skal.planet-d.net/coding/dct.html */ -#include "../dsputil.h" +#include "../common.h" #include "mmx.h" -#undef ATTR_ALIGN #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) ////////////////////////////////////////////////////////////////////// @@ -27,10 +32,8 @@ #define BITS_FRW_ACC 3 //; 2 or 3 for accuracy #define SHIFT_FRW_COL BITS_FRW_ACC #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) -//#define RND_FRW_ROW (262144 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_ROW-1) #define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) -//#define RND_FRW_COL (2 * (BITS_FRW_ACC - 1)) //; 1 << (SHIFT_FRW_COL-1) -#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) +//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) //concatenated table, for forward DCT transformation static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = { @@ -38,101 +41,220 @@ static const int16_t fdct_tg_all_16[] ATTR_ALIGN(8) = { 27146, 27146, 27146, 27146, // tg * (2<<16) + 0.5 -21746, -21746, -21746, -21746, // tg * (2<<16) + 0.5 }; -static const int16_t cos_4_16[4] ATTR_ALIGN(8) = { - -19195, -19195, -19195, -19195, //cos * (2<<16) + 0.5 -}; static const int16_t ocos_4_16[4] ATTR_ALIGN(8) = { 23170, 23170, 23170, 23170, //cos * (2<<15) + 0.5 }; -static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL; +static const long long fdct_one_corr ATTR_ALIGN(8) = 0x0001000100010001LL; + static const long fdct_r_row[2] ATTR_ALIGN(8) = {RND_FRW_ROW, RND_FRW_ROW }; +static const long fdct_r_row_sse2[4] ATTR_ALIGN(16) = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; + static const int16_t tab_frw_01234567[] ATTR_ALIGN(8) = { // forward_dct coeff table - //row0 - 16384, 16384, 21407, -8867, // w09 w01 w08 w00 - 16384, 16384, 8867, -21407, // w13 w05 w12 w04 - 16384, -16384, 8867, 21407, // w11 w03 w10 w02 - -16384, 16384, -21407, -8867, // w15 w07 w14 w06 - 22725, 12873, 19266, -22725, // w22 w20 w18 w16 - 19266, 4520, -4520, -12873, // w23 w21 w19 w17 - 12873, 4520, 4520, 19266, // w30 w28 
w26 w24 - -22725, 19266, -12873, -22725, // w31 w29 w27 w25 - - //row1 - 22725, 22725, 29692, -12299, // w09 w01 w08 w00 - 22725, 22725, 12299, -29692, // w13 w05 w12 w04 - 22725, -22725, 12299, 29692, // w11 w03 w10 w02 - -22725, 22725, -29692, -12299, // w15 w07 w14 w06 - 31521, 17855, 26722, -31521, // w22 w20 w18 w16 - 26722, 6270, -6270, -17855, // w23 w21 w19 w17 - 17855, 6270, 6270, 26722, // w30 w28 w26 w24 - -31521, 26722, -17855, -31521, // w31 w29 w27 w25 - - //row2 - 21407, 21407, 27969, -11585, // w09 w01 w08 w00 - 21407, 21407, 11585, -27969, // w13 w05 w12 w04 - 21407, -21407, 11585, 27969, // w11 w03 w10 w02 - -21407, 21407, -27969, -11585, // w15 w07 w14 w06 - 29692, 16819, 25172, -29692, // w22 w20 w18 w16 - 25172, 5906, -5906, -16819, // w23 w21 w19 w17 - 16819, 5906, 5906, 25172, // w30 w28 w26 w24 - -29692, 25172, -16819, -29692, // w31 w29 w27 w25 - - //row3 - 19266, 19266, 25172, -10426, // w09 w01 w08 w00 - 19266, 19266, 10426, -25172, // w13 w05 w12 w04 - 19266, -19266, 10426, 25172, // w11 w03 w10 w02 - -19266, 19266, -25172, -10426, // w15 w07 w14 w06, - 26722, 15137, 22654, -26722, // w22 w20 w18 w16 - 22654, 5315, -5315, -15137, // w23 w21 w19 w17 - 15137, 5315, 5315, 22654, // w30 w28 w26 w24 - -26722, 22654, -15137, -26722, // w31 w29 w27 w25, - - //row4 - 16384, 16384, 21407, -8867, // w09 w01 w08 w00 - 16384, 16384, 8867, -21407, // w13 w05 w12 w04 - 16384, -16384, 8867, 21407, // w11 w03 w10 w02 - -16384, 16384, -21407, -8867, // w15 w07 w14 w06 - 22725, 12873, 19266, -22725, // w22 w20 w18 w16 - 19266, 4520, -4520, -12873, // w23 w21 w19 w17 - 12873, 4520, 4520, 19266, // w30 w28 w26 w24 - -22725, 19266, -12873, -22725, // w31 w29 w27 w25 - - //row5 - 19266, 19266, 25172, -10426, // w09 w01 w08 w00 - 19266, 19266, 10426, -25172, // w13 w05 w12 w04 - 19266, -19266, 10426, 25172, // w11 w03 w10 w02 - -19266, 19266, -25172, -10426, // w15 w07 w14 w06 - 26722, 15137, 22654, -26722, // w22 w20 w18 w16 - 22654, 5315, -5315, -15137, // w23 w21 w19 w17 - 15137, 5315, 5315, 22654, // w30 w28 w26 w24 - -26722, 22654, -15137, -26722, // w31 w29 w27 w25 - - //row6 - 21407, 21407, 27969, -11585, // w09 w01 w08 w00 - 21407, 21407, 11585, -27969, // w13 w05 w12 w04 - 21407, -21407, 11585, 27969, // w11 w03 w10 w02 - -21407, 21407, -27969, -11585, // w15 w07 w14 w06, - 29692, 16819, 25172, -29692, // w22 w20 w18 w16 - 25172, 5906, -5906, -16819, // w23 w21 w19 w17 - 16819, 5906, 5906, 25172, // w30 w28 w26 w24 - -29692, 25172, -16819, -29692, // w31 w29 w27 w25, - - //row7 - 22725, 22725, 29692, -12299, // w09 w01 w08 w00 - 22725, 22725, 12299, -29692, // w13 w05 w12 w04 - 22725, -22725, 12299, 29692, // w11 w03 w10 w02 - -22725, 22725, -29692, -12299, // w15 w07 w14 w06, - 31521, 17855, 26722, -31521, // w22 w20 w18 w16 - 26722, 6270, -6270, -17855, // w23 w21 w19 w17 - 17855, 6270, 6270, 26722, // w30 w28 w26 w24 - -31521, 26722, -17855, -31521 // w31 w29 w27 w25 + 16384, 16384, -8867, -21407, + 16384, 16384, 21407, 8867, + 16384, -16384, 21407, -8867, + -16384, 16384, 8867, -21407, + 22725, 19266, -22725, -12873, + 12873, 4520, 19266, -4520, + 12873, -22725, 19266, -22725, + 4520, 19266, 4520, -12873, + + 22725, 22725, -12299, -29692, + 22725, 22725, 29692, 12299, + 22725, -22725, 29692, -12299, + -22725, 22725, 12299, -29692, + 31521, 26722, -31521, -17855, + 17855, 6270, 26722, -6270, + 17855, -31521, 26722, -31521, + 6270, 26722, 6270, -17855, + + 21407, 21407, -11585, -27969, + 21407, 21407, 27969, 11585, + 21407, -21407, 27969, -11585, + -21407, 21407, 11585, 
-27969, + 29692, 25172, -29692, -16819, + 16819, 5906, 25172, -5906, + 16819, -29692, 25172, -29692, + 5906, 25172, 5906, -16819, + + 19266, 19266, -10426, -25172, + 19266, 19266, 25172, 10426, + 19266, -19266, 25172, -10426, + -19266, 19266, 10426, -25172, + 26722, 22654, -26722, -15137, + 15137, 5315, 22654, -5315, + 15137, -26722, 22654, -26722, + 5315, 22654, 5315, -15137, + + 16384, 16384, -8867, -21407, + 16384, 16384, 21407, 8867, + 16384, -16384, 21407, -8867, + -16384, 16384, 8867, -21407, + 22725, 19266, -22725, -12873, + 12873, 4520, 19266, -4520, + 12873, -22725, 19266, -22725, + 4520, 19266, 4520, -12873, + + 19266, 19266, -10426, -25172, + 19266, 19266, 25172, 10426, + 19266, -19266, 25172, -10426, + -19266, 19266, 10426, -25172, + 26722, 22654, -26722, -15137, + 15137, 5315, 22654, -5315, + 15137, -26722, 22654, -26722, + 5315, 22654, 5315, -15137, + + 21407, 21407, -11585, -27969, + 21407, 21407, 27969, 11585, + 21407, -21407, 27969, -11585, + -21407, 21407, 11585, -27969, + 29692, 25172, -29692, -16819, + 16819, 5906, 25172, -5906, + 16819, -29692, 25172, -29692, + 5906, 25172, 5906, -16819, + + 22725, 22725, -12299, -29692, + 22725, 22725, 29692, 12299, + 22725, -22725, 29692, -12299, + -22725, 22725, 12299, -29692, + 31521, 26722, -31521, -17855, + 17855, 6270, 26722, -6270, + 17855, -31521, 26722, -31521, + 6270, 26722, 6270, -17855, }; +static const int16_t tab_frw_01234567_sse2[] ATTR_ALIGN(16) = { // forward_dct coeff table +#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ + C4, C4, C5, C7, C2, C6, C3, -C7, \ + -C4, C4, C7, C3, C6, -C2, C7, -C5, \ + C4, -C4, C5, -C1, C2, -C6, C3, -C1, +// c1..c7 * cos(pi/4) * 2^15 +#define C1 22725 +#define C2 21407 +#define C3 19266 +#define C4 16384 +#define C5 12873 +#define C6 8867 +#define C7 4520 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 31521 +#define C2 29692 +#define C3 26722 +#define C4 22725 +#define C5 17855 +#define C6 12299 +#define C7 6270 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 29692 +#define C2 27969 +#define C3 25172 +#define C4 21407 +#define C5 16819 +#define C6 11585 +#define C7 5906 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 26722 +#define C2 25172 +#define C3 22654 +#define C4 19266 +#define C5 15137 +#define C6 10426 +#define C7 5315 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 22725 +#define C2 21407 +#define C3 19266 +#define C4 16384 +#define C5 12873 +#define C6 8867 +#define C7 4520 +TABLE_SSE2 -static inline void fdct_col(const int16_t *in, int16_t *out, int offset) +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 26722 +#define C2 25172 +#define C3 22654 +#define C4 19266 +#define C5 15137 +#define C6 10426 +#define C7 5315 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 29692 +#define C2 27969 +#define C3 25172 +#define C4 21407 +#define C5 16819 +#define C6 11585 +#define C7 5906 +TABLE_SSE2 + +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#define C1 31521 +#define C2 29692 +#define C3 26722 +#define C4 22725 +#define C5 17855 +#define C6 12299 +#define C7 6270 +TABLE_SSE2 +}; + +static always_inline void fdct_col(const int16_t *in, int16_t *out, int offset) { movq_m2r(*(in + offset + 1 * 8), mm0); movq_m2r(*(in + offset + 6 * 8), 
mm1); @@ -211,59 +333,158 @@ static inline void fdct_col(const int16_t *in, int16_t *out, int offset) movq_r2m(mm3, *(out + offset + 7 * 8)); } -static inline void fdct_row(const int16_t *in, int16_t *out, const int16_t *table) + +static always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) { - movd_m2r(*(in + 6), mm5); - punpcklwd_m2r(*(in + 4), mm5); - movq_r2r(mm5, mm2); - psrlq_i2r(0x20, mm5); + asm volatile( + ".macro FDCT_ROW_SSE2_H1 i t \n\t" + "movq \\i(%0), %%xmm2 \n\t" + "movq \\i+8(%0), %%xmm0 \n\t" + "movdqa \\t+32(%1), %%xmm3 \n\t" + "movdqa \\t+48(%1), %%xmm7 \n\t" + "movdqa \\t(%1), %%xmm4 \n\t" + "movdqa \\t+16(%1), %%xmm5 \n\t" + ".endm \n\t" + ".macro FDCT_ROW_SSE2_H2 i t \n\t" + "movq \\i(%0), %%xmm2 \n\t" + "movq \\i+8(%0), %%xmm0 \n\t" + "movdqa \\t+32(%1), %%xmm3 \n\t" + "movdqa \\t+48(%1), %%xmm7 \n\t" + ".endm \n\t" + ".macro FDCT_ROW_SSE2 i \n\t" + "movq %%xmm2, %%xmm1 \n\t" + "pshuflw $27, %%xmm0, %%xmm0 \n\t" + "paddsw %%xmm0, %%xmm1 \n\t" + "psubsw %%xmm0, %%xmm2 \n\t" + "punpckldq %%xmm2, %%xmm1 \n\t" + "pshufd $78, %%xmm1, %%xmm2 \n\t" + "pmaddwd %%xmm2, %%xmm3 \n\t" + "pmaddwd %%xmm1, %%xmm7 \n\t" + "pmaddwd %%xmm5, %%xmm2 \n\t" + "pmaddwd %%xmm4, %%xmm1 \n\t" + "paddd %%xmm7, %%xmm3 \n\t" + "paddd %%xmm2, %%xmm1 \n\t" + "paddd %%xmm6, %%xmm3 \n\t" + "paddd %%xmm6, %%xmm1 \n\t" + "psrad %3, %%xmm3 \n\t" + "psrad %3, %%xmm1 \n\t" + "packssdw %%xmm3, %%xmm1 \n\t" + "movdqa %%xmm1, \\i(%4) \n\t" + ".endm \n\t" + "movdqa (%2), %%xmm6 \n\t" + "FDCT_ROW_SSE2_H1 0 0 \n\t" + "FDCT_ROW_SSE2 0 \n\t" + "FDCT_ROW_SSE2_H2 64 0 \n\t" + "FDCT_ROW_SSE2 64 \n\t" + + "FDCT_ROW_SSE2_H1 16 64 \n\t" + "FDCT_ROW_SSE2 16 \n\t" + "FDCT_ROW_SSE2_H2 112 64 \n\t" + "FDCT_ROW_SSE2 112 \n\t" + + "FDCT_ROW_SSE2_H1 32 128 \n\t" + "FDCT_ROW_SSE2 32 \n\t" + "FDCT_ROW_SSE2_H2 96 128 \n\t" + "FDCT_ROW_SSE2 96 \n\t" + + "FDCT_ROW_SSE2_H1 48 192 \n\t" + "FDCT_ROW_SSE2 48 \n\t" + "FDCT_ROW_SSE2_H2 80 192 \n\t" + "FDCT_ROW_SSE2 80 \n\t" + : + : "r" (in), "r" (tab_frw_01234567_sse2), "r" (fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) + ); +} + +static always_inline void fdct_row_mmx2(const int16_t *in, int16_t *out, const int16_t *table) +{ + pshufw_m2r(*(in + 4), mm5, 0x1B); movq_m2r(*(in + 0), mm0); - punpcklwd_r2r(mm2, mm5); - movq_r2r(mm0, mm1); + movq_r2r(mm0, mm1); paddsw_r2r(mm5, mm0); psubsw_r2r(mm5, mm1); - movq_r2r(mm0, mm2); - punpcklwd_r2r(mm1, mm0); - punpckhwd_r2r(mm1, mm2); - movq_r2r(mm2, mm1); - movq_r2r(mm0, mm2); + pshufw_r2r(mm0, mm2, 0x4E); + pshufw_r2r(mm1, mm3, 0x4E); + movq_m2r(*(table + 0), mm4); + movq_m2r(*(table + 4), mm6); + movq_m2r(*(table + 16), mm5); + movq_m2r(*(table + 20), mm7); + pmaddwd_r2r(mm0, mm4); + pmaddwd_r2r(mm1, mm5); + pmaddwd_r2r(mm2, mm6); + pmaddwd_r2r(mm3, mm7); + pmaddwd_m2r(*(table + 8), mm0); + pmaddwd_m2r(*(table + 12), mm2); + pmaddwd_m2r(*(table + 24), mm1); + pmaddwd_m2r(*(table + 28), mm3); + paddd_r2r(mm6, mm4); + paddd_r2r(mm7, mm5); + paddd_r2r(mm2, mm0); + paddd_r2r(mm3, mm1); + movq_m2r(*fdct_r_row, mm7); + paddd_r2r(mm7, mm4); + paddd_r2r(mm7, mm5); + paddd_r2r(mm7, mm0); + paddd_r2r(mm7, mm1); + psrad_i2r(SHIFT_FRW_ROW, mm4); + psrad_i2r(SHIFT_FRW_ROW, mm5); + psrad_i2r(SHIFT_FRW_ROW, mm0); + psrad_i2r(SHIFT_FRW_ROW, mm1); + packssdw_r2r(mm0, mm4); + packssdw_r2r(mm1, mm5); + movq_r2r(mm4, mm2); + punpcklwd_r2r(mm5, mm4); + punpckhwd_r2r(mm5, mm2); + movq_r2m(mm4, *(out + 0)); + movq_r2m(mm2, *(out + 4)); +} + +static always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table) +{ + 
movd_m2r(*(in + 6), mm1); + punpcklwd_m2r(*(in + 4), mm1); + movq_r2r(mm1, mm2); + psrlq_i2r(0x20, mm1); + movq_m2r(*(in + 0), mm0); + punpcklwd_r2r(mm2, mm1); + movq_r2r(mm0, mm5); + paddsw_r2r(mm1, mm0); + psubsw_r2r(mm1, mm5); + movq_r2r(mm0, mm1); + movq_r2r(mm5, mm6); + punpckldq_r2r(mm5, mm3); + punpckhdq_r2r(mm3, mm6); movq_m2r(*(table + 0), mm3); - punpcklwd_r2r(mm1, mm0); - movq_r2r(mm0, mm5); - punpckldq_r2r(mm0, mm0); movq_m2r(*(table + 4), mm4); - punpckhwd_r2r(mm1, mm2); + punpckldq_r2r(mm0, mm2); pmaddwd_r2r(mm0, mm3); - movq_r2r(mm2, mm6); - movq_m2r(*(table + 16), mm1); - punpckldq_r2r(mm2, mm2); - pmaddwd_r2r(mm2, mm4); - punpckhdq_r2r(mm5, mm5); + punpckhdq_r2r(mm2, mm1); + movq_m2r(*(table + 16), mm2); + pmaddwd_r2r(mm1, mm4); pmaddwd_m2r(*(table + 8), mm0); - punpckhdq_r2r(mm6, mm6); movq_m2r(*(table + 20), mm7); - pmaddwd_r2r(mm5, mm1); + pmaddwd_r2r(mm5, mm2); paddd_m2r(*fdct_r_row, mm3); pmaddwd_r2r(mm6, mm7); - pmaddwd_m2r(*(table + 12), mm2); + pmaddwd_m2r(*(table + 12), mm1); paddd_r2r(mm4, mm3); pmaddwd_m2r(*(table + 24), mm5); pmaddwd_m2r(*(table + 28), mm6); - paddd_r2r(mm7, mm1); + paddd_r2r(mm7, mm2); paddd_m2r(*fdct_r_row, mm0); psrad_i2r(SHIFT_FRW_ROW, mm3); - paddd_m2r(*fdct_r_row, mm1); - paddd_r2r(mm2, mm0); + paddd_m2r(*fdct_r_row, mm2); + paddd_r2r(mm1, mm0); paddd_m2r(*fdct_r_row, mm5); - psrad_i2r(SHIFT_FRW_ROW, mm1); + psrad_i2r(SHIFT_FRW_ROW, mm2); paddd_r2r(mm6, mm5); psrad_i2r(SHIFT_FRW_ROW, mm0); psrad_i2r(SHIFT_FRW_ROW, mm5); packssdw_r2r(mm0, mm3); - packssdw_r2r(mm5, mm1); + packssdw_r2r(mm5, mm2); movq_r2r(mm3, mm6); - punpcklwd_r2r(mm1, mm3); - punpckhwd_r2r(mm1, mm6); + punpcklwd_r2r(mm2, mm3); + punpckhwd_r2r(mm2, mm6); movq_r2m(mm3, *(out + 0)); movq_r2m(mm6, *(out + 4)); } @@ -284,9 +505,47 @@ void ff_fdct_mmx(int16_t *block) table = tab_frw_01234567; out = block; for(i=8;i>0;i--) { - fdct_row(block1, out, table); + fdct_row_mmx(block1, out, table); + block1 += 8; + table += 32; + out += 8; + } +} + +void ff_fdct_mmx2(int16_t *block) +{ + int64_t align_tmp[16] ATTR_ALIGN(8); + int16_t * const block_tmp= (int16_t*)align_tmp; + int16_t *block1, *out; + const int16_t *table; + int i; + + block1 = block_tmp; + fdct_col(block, block1, 0); + fdct_col(block, block1, 4); + + block1 = block_tmp; + table = tab_frw_01234567; + out = block; + for(i=8;i>0;i--) { + fdct_row_mmx2(block1, out, table); block1 += 8; table += 32; out += 8; } } + +void ff_fdct_sse2(int16_t *block) +{ + int64_t align_tmp[16] ATTR_ALIGN(8); + int16_t * const block_tmp= (int16_t*)align_tmp; + int16_t *block1; + int i; + + block1 = block_tmp; + fdct_col(block, block1, 0); + fdct_col(block, block1, 4); + + fdct_row_sse2(block1, block); +} + diff --git a/src/libffmpeg/libavcodec/i386/idct_mmx.c b/src/libffmpeg/libavcodec/i386/idct_mmx.c index 654792e5e..298c8a8b0 100644 --- a/src/libffmpeg/libavcodec/i386/idct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/idct_mmx.c @@ -22,11 +22,10 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include "../dsputil.h" +#include "../common.h" #include "mmx.h" -#undef ATTR_ALIGN #define ATTR_ALIGN(align) __attribute__ ((__aligned__ (align))) #define ROW_SHIFT 11 @@ -555,10 +554,6 @@ static int32_t rounder5[] ATTR_ALIGN(8) = #undef COL_SHIFT #undef ROW_SHIFT -/* the macro below will generate these */ -void ff_mmx_idct(DCTELEM *block); -void ff_mmxext_idct(DCTELEM *block); - #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid) \ void idct (int16_t * block) \ { \ diff --git 
a/src/libffmpeg/libavcodec/i386/mmx.h b/src/libffmpeg/libavcodec/i386/mmx.h index 2ba28898d..7e94cfd9b 100644 --- a/src/libffmpeg/libavcodec/i386/mmx.h +++ b/src/libffmpeg/libavcodec/i386/mmx.h @@ -1 +1,243 @@ -#include "xineutils.h" +/* + * mmx.h + * Copyright (C) 1997-2001 H. Dietz and R. Fisher + */ +#ifndef AVCODEC_I386MMX_H +#define AVCODEC_I386MMX_H + +/* + * The type of an value that fits in an MMX register (note that long + * long constant values MUST be suffixed by LL and unsigned long long + * values by ULL, lest they be truncated by the compiler) + */ + +typedef union { + long long q; /* Quadword (64-bit) value */ + unsigned long long uq; /* Unsigned Quadword */ + int d[2]; /* 2 Doubleword (32-bit) values */ + unsigned int ud[2]; /* 2 Unsigned Doubleword */ + short w[4]; /* 4 Word (16-bit) values */ + unsigned short uw[4]; /* 4 Unsigned Word */ + char b[8]; /* 8 Byte (8-bit) values */ + unsigned char ub[8]; /* 8 Unsigned Byte */ + float s[2]; /* Single-precision (32-bit) value */ +} mmx_t; /* On an 8-byte (64-bit) boundary */ + + +#define mmx_i2r(op,imm,reg) \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "i" (imm) ) + +#define mmx_m2r(op,mem,reg) \ + __asm__ __volatile__ (#op " %0, %%" #reg \ + : /* nothing */ \ + : "m" (mem)) + +#define mmx_r2m(op,reg,mem) \ + __asm__ __volatile__ (#op " %%" #reg ", %0" \ + : "=m" (mem) \ + : /* nothing */ ) + +#define mmx_r2r(op,regs,regd) \ + __asm__ __volatile__ (#op " %" #regs ", %" #regd) + + +#define emms() __asm__ __volatile__ ("emms") + +#define movd_m2r(var,reg) mmx_m2r (movd, var, reg) +#define movd_r2m(reg,var) mmx_r2m (movd, reg, var) +#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd) + +#define movq_m2r(var,reg) mmx_m2r (movq, var, reg) +#define movq_r2m(reg,var) mmx_r2m (movq, reg, var) +#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd) + +#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg) +#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd) +#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg) +#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd) + +#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg) +#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd) + +#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg) +#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd) +#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg) +#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd) +#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg) +#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd) + +#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg) +#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd) +#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg) +#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd) + +#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg) +#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd) +#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg) +#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd) + +#define pand_m2r(var,reg) mmx_m2r (pand, var, reg) +#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd) + +#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg) +#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd) + +#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg) +#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd) +#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg) +#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd) +#define pcmpeqw_m2r(var,reg) mmx_m2r 
(pcmpeqw, var, reg) +#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd) + +#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg) +#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd) +#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg) +#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd) +#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg) +#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd) + +#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg) +#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd) + +#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg) +#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd) + +#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg) +#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd) + +#define por_m2r(var,reg) mmx_m2r (por, var, reg) +#define por_r2r(regs,regd) mmx_r2r (por, regs, regd) + +#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg) +#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg) +#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd) +#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg) +#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg) +#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd) +#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg) +#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg) +#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd) + +#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg) +#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg) +#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd) +#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg) +#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg) +#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd) + +#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg) +#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg) +#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd) +#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg) +#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg) +#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd) +#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg) +#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg) +#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd) + +#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg) +#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd) +#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg) +#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd) +#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg) +#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd) + +#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg) +#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd) +#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg) +#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd) + +#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg) +#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd) +#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg) +#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd) + +#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg) +#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd) +#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg) +#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd) +#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg) +#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd) + +#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg) +#define punpcklbw_r2r(regs,regd) mmx_r2r 
(punpcklbw, regs, regd) +#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg) +#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd) +#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg) +#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd) + +#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg) +#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd) + + +/* 3DNOW extensions */ + +#define pavgusb_m2r(var,reg) mmx_m2r (pavgusb, var, reg) +#define pavgusb_r2r(regs,regd) mmx_r2r (pavgusb, regs, regd) + + +/* AMD MMX extensions - also available in intel SSE */ + + +#define mmx_m2ri(op,mem,reg,imm) \ + __asm__ __volatile__ (#op " %1, %0, %%" #reg \ + : /* nothing */ \ + : "X" (mem), "X" (imm)) +#define mmx_r2ri(op,regs,regd,imm) \ + __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \ + : /* nothing */ \ + : "X" (imm) ) + +#define mmx_fetch(mem,hint) \ + __asm__ __volatile__ ("prefetch" #hint " %0" \ + : /* nothing */ \ + : "X" (mem)) + + +#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg) + +#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var) + +#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg) +#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd) +#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg) +#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd) + +#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm) + +#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm) + +#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg) +#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd) + +#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg) +#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd) + +#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg) +#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd) + +#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg) +#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd) + +#define pmovmskb(mmreg,reg) \ + __asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg) + +#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg) +#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd) + +#define prefetcht0(mem) mmx_fetch (mem, t0) +#define prefetcht1(mem) mmx_fetch (mem, t1) +#define prefetcht2(mem) mmx_fetch (mem, t2) +#define prefetchnta(mem) mmx_fetch (mem, nta) + +#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg) +#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd) + +#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm) +#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm) + +#define sfence() __asm__ __volatile__ ("sfence\n\t") + +#endif /* AVCODEC_I386MMX_H */ diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c index 950100e63..f32afae0b 100644 --- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c +++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c @@ -1,6 +1,7 @@ /* * MMX optimized motion estimation * Copyright (c) 2001 Fabrice Bellard. 
+ * Copyright (c) 2002-2004 Michael Niedermayer * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -26,11 +27,11 @@ static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={ 0x0002000200020002ULL, }; -static const __attribute__ ((aligned(8), unused)) uint64_t bone= 0x0101010101010101LL; +static __attribute__ ((aligned(8), unused)) uint64_t bone= 0x0101010101010101LL; -static inline void sad8_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) +static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) { - int len= -(stride<<h); + int len= -(stride*h); asm volatile( ".balign 16 \n\t" "1: \n\t" @@ -64,9 +65,9 @@ static inline void sad8_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) ); } -static inline void sad8_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) +static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) { - int len= -(stride<<h); + int len= -(stride*h); asm volatile( ".balign 16 \n\t" "1: \n\t" @@ -88,7 +89,7 @@ static inline void sad8_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) { - int len= -(stride<<h); + int len= -(stride*h); asm volatile( ".balign 16 \n\t" "1: \n\t" @@ -114,7 +115,7 @@ static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, in static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) { //FIXME reuse src - int len= -(stride<<h); + int len= -(stride*h); asm volatile( ".balign 16 \n\t" "movq "MANGLE(bone)", %%mm5 \n\t" @@ -151,7 +152,7 @@ static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) { - int len= -(stride<<h); + int len= -(stride*h); asm volatile( ".balign 16 \n\t" "1: \n\t" @@ -189,7 +190,7 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) { - int len= -(stride<<h); + int len= -(stride*h); asm volatile( ".balign 16 \n\t" "1: \n\t" @@ -265,85 +266,69 @@ static inline int sum_mmx2(void) #define PIX_SAD(suf)\ -static int pix_abs8x8_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ +static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ {\ + assert(h==8);\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t":);\ \ - sad8_ ## suf(blk1, blk2, stride, 3);\ + sad8_1_ ## suf(blk1, blk2, stride, 8);\ \ return sum_ ## suf();\ }\ -static int sad8x8_ ## suf(void *s, uint8_t *blk2, uint8_t *blk1, int stride)\ -{\ - asm volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t":);\ -\ - sad8_ ## suf(blk1, blk2, stride, 3);\ -\ - return sum_ ## suf();\ -}\ -\ -static int pix_abs8x8_x2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ +static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ {\ + assert(h==8);\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ "movq %0, %%mm5 \n\t"\ :: "m"(round_tab[1]) \ );\ \ - sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 3);\ + sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 8);\ \ return sum_ ## suf();\ }\ \ -static int pix_abs8x8_y2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ +static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ {\ + assert(h==8);\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor 
%%mm6, %%mm6 \n\t"\ "movq %0, %%mm5 \n\t"\ :: "m"(round_tab[1]) \ );\ \ - sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 3);\ + sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 8);\ \ return sum_ ## suf();\ }\ \ -static int pix_abs8x8_xy2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ +static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ {\ + assert(h==8);\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ "movq %0, %%mm5 \n\t"\ :: "m"(round_tab[2]) \ );\ \ - sad8_4_ ## suf(blk1, blk2, stride, 3);\ + sad8_4_ ## suf(blk1, blk2, stride, 8);\ \ return sum_ ## suf();\ }\ \ -static int pix_abs16x16_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ -{\ - asm volatile("pxor %%mm7, %%mm7 \n\t"\ - "pxor %%mm6, %%mm6 \n\t":);\ -\ - sad8_ ## suf(blk1 , blk2 , stride, 4);\ - sad8_ ## suf(blk1+8, blk2+8, stride, 4);\ -\ - return sum_ ## suf();\ -}\ -static int sad16x16_ ## suf(void *s, uint8_t *blk2, uint8_t *blk1, int stride)\ +static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t":);\ \ - sad8_ ## suf(blk1 , blk2 , stride, 4);\ - sad8_ ## suf(blk1+8, blk2+8, stride, 4);\ + sad8_1_ ## suf(blk1 , blk2 , stride, h);\ + sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\ \ return sum_ ## suf();\ }\ -static int pix_abs16x16_x2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ +static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ @@ -351,12 +336,12 @@ static int pix_abs16x16_x2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ :: "m"(round_tab[1]) \ );\ \ - sad8_2_ ## suf(blk1 , blk1+1, blk2 , stride, 4);\ - sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, 4);\ + sad8_2_ ## suf(blk1 , blk1+1, blk2 , stride, h);\ + sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, h);\ \ return sum_ ## suf();\ }\ -static int pix_abs16x16_y2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ +static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ @@ -364,12 +349,12 @@ static int pix_abs16x16_y2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ :: "m"(round_tab[1]) \ );\ \ - sad8_2_ ## suf(blk1 , blk1+stride, blk2 , stride, 4);\ - sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, 4);\ + sad8_2_ ## suf(blk1 , blk1+stride, blk2 , stride, h);\ + sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, h);\ \ return sum_ ## suf();\ }\ -static int pix_abs16x16_xy2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ +static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ @@ -377,8 +362,8 @@ static int pix_abs16x16_xy2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ :: "m"(round_tab[2]) \ );\ \ - sad8_4_ ## suf(blk1 , blk2 , stride, 4);\ - sad8_4_ ## suf(blk1+8, blk2+8, stride, 4);\ + sad8_4_ ## suf(blk1 , blk2 , stride, h);\ + sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\ \ return sum_ ## suf();\ }\ @@ -389,32 +374,32 @@ PIX_SAD(mmx2) void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx) { if (mm_flags & MM_MMX) { - c->pix_abs16x16 = pix_abs16x16_mmx; - c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx; - c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx; - c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; - c->pix_abs8x8 = pix_abs8x8_mmx; - c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx; - c->pix_abs8x8_y2 = 
pix_abs8x8_y2_mmx; - c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx; + c->pix_abs[0][0] = sad16_mmx; + c->pix_abs[0][1] = sad16_x2_mmx; + c->pix_abs[0][2] = sad16_y2_mmx; + c->pix_abs[0][3] = sad16_xy2_mmx; + c->pix_abs[1][0] = sad8_mmx; + c->pix_abs[1][1] = sad8_x2_mmx; + c->pix_abs[1][2] = sad8_y2_mmx; + c->pix_abs[1][3] = sad8_xy2_mmx; - c->sad[0]= sad16x16_mmx; - c->sad[1]= sad8x8_mmx; + c->sad[0]= sad16_mmx; + c->sad[1]= sad8_mmx; } if (mm_flags & MM_MMXEXT) { - c->pix_abs16x16 = pix_abs16x16_mmx2; - c->pix_abs8x8 = pix_abs8x8_mmx2; + c->pix_abs[0][0] = sad16_mmx2; + c->pix_abs[1][0] = sad8_mmx2; - c->sad[0]= sad16x16_mmx2; - c->sad[1]= sad8x8_mmx2; + c->sad[0]= sad16_mmx2; + c->sad[1]= sad8_mmx2; if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ - c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; - c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; - c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2; - c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; - c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; - c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2; + c->pix_abs[0][1] = sad16_x2_mmx2; + c->pix_abs[0][2] = sad16_y2_mmx2; + c->pix_abs[0][3] = sad16_xy2_mmx2; + c->pix_abs[1][1] = sad8_x2_mmx2; + c->pix_abs[1][2] = sad8_y2_mmx2; + c->pix_abs[1][3] = sad8_xy2_mmx2; } } } diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c index d2f477b7b..1c0e9f5ae 100644 --- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c @@ -31,31 +31,92 @@ static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xfff static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; -static void dct_unquantize_h263_mmx(MpegEncContext *s, +static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { int level, qmul, qadd, nCoeffs; qmul = qscale << 1; - qadd = (qscale - 1) | 1; - assert(s->block_last_index[n]>=0); + assert(s->block_last_index[n]>=0 || s->h263_aic); - if (s->mb_intra) { - if (!s->h263_aic) { - if (n < 4) - level = block[0] * s->y_dc_scale; - else - level = block[0] * s->c_dc_scale; - }else{ - qadd = 0; - level= block[0]; - } - nCoeffs=63; - } else { - nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; - level = 0;/* keep gcc quiet */ + if (!s->h263_aic) { + if (n < 4) + level = block[0] * s->y_dc_scale; + else + level = block[0] * s->c_dc_scale; + qadd = (qscale - 1) | 1; + }else{ + qadd = 0; + level= block[0]; } + if(s->ac_pred) + nCoeffs=63; + else + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; +//printf("%d %d ", qmul, qadd); +asm volatile( + "movd %1, %%mm6 \n\t" //qmul + "packssdw %%mm6, %%mm6 \n\t" + "packssdw %%mm6, %%mm6 \n\t" + "movd %2, %%mm5 \n\t" //qadd + "pxor %%mm7, %%mm7 \n\t" + "packssdw %%mm5, %%mm5 \n\t" + "packssdw %%mm5, %%mm5 \n\t" + "psubw %%mm5, %%mm7 \n\t" + "pxor %%mm4, %%mm4 \n\t" + ".balign 16\n\t" + "1: \n\t" + "movq (%0, %3), %%mm0 \n\t" + "movq 8(%0, %3), %%mm1 \n\t" + + "pmullw %%mm6, %%mm0 \n\t" + "pmullw %%mm6, %%mm1 \n\t" + + "movq (%0, %3), %%mm2 \n\t" + "movq 8(%0, %3), %%mm3 \n\t" + + "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 + "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 + + "pxor %%mm2, %%mm0 \n\t" + "pxor %%mm3, %%mm1 \n\t" + + "paddw %%mm7, %%mm0 \n\t" + "paddw %%mm7, %%mm1 \n\t" + + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + + "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 + "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 
-1 : 0 + + "pandn %%mm2, %%mm0 \n\t" + "pandn %%mm3, %%mm1 \n\t" + + "movq %%mm0, (%0, %3) \n\t" + "movq %%mm1, 8(%0, %3) \n\t" + + "addl $16, %3 \n\t" + "jng 1b \n\t" + ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) + : "memory" + ); + block[0]= level; +} + + +static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int level, qmul, qadd, nCoeffs; + + qmul = qscale << 1; + qadd = (qscale - 1) | 1; + + assert(s->block_last_index[n]>=0 || s->h263_aic); + + nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; //printf("%d %d ", qmul, qadd); asm volatile( "movd %1, %%mm6 \n\t" //qmul @@ -104,8 +165,6 @@ asm volatile( ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs)) : "memory" ); - if(s->mb_intra) - block[0]= level; } @@ -138,24 +197,23 @@ asm volatile( high3:low3 = low1*low2 high3 += tlow1 */ -static void dct_unquantize_mpeg1_mmx(MpegEncContext *s, +static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { int nCoeffs; const uint16_t *quant_matrix; + int block0; assert(s->block_last_index[n]>=0); nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; - if (s->mb_intra) { - int block0; - if (n < 4) - block0 = block[0] * s->y_dc_scale; - else - block0 = block[0] * s->c_dc_scale; - /* XXX: only mpeg1 */ - quant_matrix = s->intra_matrix; + if (n < 4) + block0 = block[0] * s->y_dc_scale; + else + block0 = block[0] * s->c_dc_scale; + /* XXX: only mpeg1 */ + quant_matrix = s->intra_matrix; asm volatile( "pcmpeqw %%mm7, %%mm7 \n\t" "psrlw $15, %%mm7 \n\t" @@ -205,9 +263,19 @@ asm volatile( ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) : "%eax", "memory" ); - block[0]= block0; + block[0]= block0; +} + +static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int nCoeffs; + const uint16_t *quant_matrix; + + assert(s->block_last_index[n]>=0); + + nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; - } else { quant_matrix = s->inter_matrix; asm volatile( "pcmpeqw %%mm7, %%mm7 \n\t" @@ -262,28 +330,25 @@ asm volatile( ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) : "%eax", "memory" ); - } - } -static void dct_unquantize_mpeg2_mmx(MpegEncContext *s, +static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { int nCoeffs; const uint16_t *quant_matrix; + int block0; assert(s->block_last_index[n]>=0); if(s->alternate_scan) nCoeffs= 63; //FIXME else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; - if (s->mb_intra) { - int block0; - if (n < 4) - block0 = block[0] * s->y_dc_scale; - else - block0 = block[0] * s->c_dc_scale; - quant_matrix = s->intra_matrix; + if (n < 4) + block0 = block[0] * s->y_dc_scale; + else + block0 = block[0] * s->c_dc_scale; + quant_matrix = s->intra_matrix; asm volatile( "pcmpeqw %%mm7, %%mm7 \n\t" "psrlw $15, %%mm7 \n\t" @@ -329,10 +394,21 @@ asm volatile( ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs) : "%eax", "memory" ); - block[0]= block0; + block[0]= block0; //Note, we dont do mismatch control for intra as errors cannot accumulate +} + +static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, + DCTELEM *block, int n, int qscale) +{ + int nCoeffs; + const uint16_t *quant_matrix; + + assert(s->block_last_index[n]>=0); + + if(s->alternate_scan) nCoeffs= 63; //FIXME + else nCoeffs= s->intra_scantable.raster_end[ 
s->block_last_index[n] ]; - } else { quant_matrix = s->inter_matrix; asm volatile( "pcmpeqw %%mm7, %%mm7 \n\t" @@ -397,7 +473,6 @@ asm volatile( ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs) : "%eax", "memory" ); - } } /* draw the edges of width 'w' of an image of size width, height @@ -488,13 +563,130 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) } } +static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ + const int intra= s->mb_intra; + int *sum= s->dct_error_sum[intra]; + uint16_t *offset= s->dct_offset[intra]; + + s->dct_count[intra]++; + + asm volatile( + "pxor %%mm7, %%mm7 \n\t" + "1: \n\t" + "pxor %%mm0, %%mm0 \n\t" + "pxor %%mm1, %%mm1 \n\t" + "movq (%0), %%mm2 \n\t" + "movq 8(%0), %%mm3 \n\t" + "pcmpgtw %%mm2, %%mm0 \n\t" + "pcmpgtw %%mm3, %%mm1 \n\t" + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + "psubw %%mm0, %%mm2 \n\t" + "psubw %%mm1, %%mm3 \n\t" + "movq %%mm2, %%mm4 \n\t" + "movq %%mm3, %%mm5 \n\t" + "psubusw (%2), %%mm2 \n\t" + "psubusw 8(%2), %%mm3 \n\t" + "pxor %%mm0, %%mm2 \n\t" + "pxor %%mm1, %%mm3 \n\t" + "psubw %%mm0, %%mm2 \n\t" + "psubw %%mm1, %%mm3 \n\t" + "movq %%mm2, (%0) \n\t" + "movq %%mm3, 8(%0) \n\t" + "movq %%mm4, %%mm2 \n\t" + "movq %%mm5, %%mm3 \n\t" + "punpcklwd %%mm7, %%mm4 \n\t" + "punpckhwd %%mm7, %%mm2 \n\t" + "punpcklwd %%mm7, %%mm5 \n\t" + "punpckhwd %%mm7, %%mm3 \n\t" + "paddd (%1), %%mm4 \n\t" + "paddd 8(%1), %%mm2 \n\t" + "paddd 16(%1), %%mm5 \n\t" + "paddd 24(%1), %%mm3 \n\t" + "movq %%mm4, (%1) \n\t" + "movq %%mm2, 8(%1) \n\t" + "movq %%mm5, 16(%1) \n\t" + "movq %%mm3, 24(%1) \n\t" + "addl $16, %0 \n\t" + "addl $32, %1 \n\t" + "addl $16, %2 \n\t" + "cmpl %3, %0 \n\t" + " jb 1b \n\t" + : "+r" (block), "+r" (sum), "+r" (offset) + : "r"(block+64) + ); +} + +static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ + const int intra= s->mb_intra; + int *sum= s->dct_error_sum[intra]; + uint16_t *offset= s->dct_offset[intra]; + + s->dct_count[intra]++; + + asm volatile( + "pxor %%xmm7, %%xmm7 \n\t" + "1: \n\t" + "pxor %%xmm0, %%xmm0 \n\t" + "pxor %%xmm1, %%xmm1 \n\t" + "movdqa (%0), %%xmm2 \n\t" + "movdqa 16(%0), %%xmm3 \n\t" + "pcmpgtw %%xmm2, %%xmm0 \n\t" + "pcmpgtw %%xmm3, %%xmm1 \n\t" + "pxor %%xmm0, %%xmm2 \n\t" + "pxor %%xmm1, %%xmm3 \n\t" + "psubw %%xmm0, %%xmm2 \n\t" + "psubw %%xmm1, %%xmm3 \n\t" + "movdqa %%xmm2, %%xmm4 \n\t" + "movdqa %%xmm3, %%xmm5 \n\t" + "psubusw (%2), %%xmm2 \n\t" + "psubusw 16(%2), %%xmm3 \n\t" + "pxor %%xmm0, %%xmm2 \n\t" + "pxor %%xmm1, %%xmm3 \n\t" + "psubw %%xmm0, %%xmm2 \n\t" + "psubw %%xmm1, %%xmm3 \n\t" + "movdqa %%xmm2, (%0) \n\t" + "movdqa %%xmm3, 16(%0) \n\t" + "movdqa %%xmm4, %%xmm6 \n\t" + "movdqa %%xmm5, %%xmm0 \n\t" + "punpcklwd %%xmm7, %%xmm4 \n\t" + "punpckhwd %%xmm7, %%xmm6 \n\t" + "punpcklwd %%xmm7, %%xmm5 \n\t" + "punpckhwd %%xmm7, %%xmm0 \n\t" + "paddd (%1), %%xmm4 \n\t" + "paddd 16(%1), %%xmm6 \n\t" + "paddd 32(%1), %%xmm5 \n\t" + "paddd 48(%1), %%xmm0 \n\t" + "movdqa %%xmm4, (%1) \n\t" + "movdqa %%xmm6, 16(%1) \n\t" + "movdqa %%xmm5, 32(%1) \n\t" + "movdqa %%xmm0, 48(%1) \n\t" + "addl $32, %0 \n\t" + "addl $64, %1 \n\t" + "addl $32, %2 \n\t" + "cmpl %3, %0 \n\t" + " jb 1b \n\t" + : "+r" (block), "+r" (sum), "+r" (offset) + : "r"(block+64) + ); +} + #undef HAVE_MMX2 #define RENAME(a) a ## _MMX +#define RENAMEl(a) a ## _mmx #include "mpegvideo_mmx_template.c" #define HAVE_MMX2 #undef RENAME +#undef RENAMEl #define RENAME(a) a ## _MMX2 +#define RENAMEl(a) a ## _mmx2 +#include "mpegvideo_mmx_template.c" + 
+#undef RENAME +#undef RENAMEl +#define RENAME(a) a ## _SSE2 +#define RENAMEl(a) a ## _sse2 #include "mpegvideo_mmx_template.c" void MPV_common_init_mmx(MpegEncContext *s) @@ -502,14 +694,25 @@ void MPV_common_init_mmx(MpegEncContext *s) if (mm_flags & MM_MMX) { const int dct_algo = s->avctx->dct_algo; - s->dct_unquantize_h263 = dct_unquantize_h263_mmx; - s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx; - s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx; + s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; + s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; + s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; + s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; + s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; + s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; draw_edges = draw_edges_mmx; + + if (mm_flags & MM_SSE2) { + s->denoise_dct= denoise_dct_sse2; + } else { + s->denoise_dct= denoise_dct_mmx; + } if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ - if(mm_flags & MM_MMXEXT){ + if(mm_flags & MM_SSE2){ + s->dct_quantize= dct_quantize_SSE2; + } else if(mm_flags & MM_MMXEXT){ s->dct_quantize= dct_quantize_MMX2; } else { s->dct_quantize= dct_quantize_MMX; diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c index 706211eec..d4ed61ecb 100644 --- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c @@ -43,7 +43,10 @@ static int RENAME(dct_quantize)(MpegEncContext *s, assert((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? //s->fdct (block); - ff_fdct_mmx (block); //cant be anything else ... + RENAMEl(ff_fdct) (block); //cant be anything else ... + + if(s->dct_error_sum) + s->denoise_dct(s, block); if (s->mb_intra) { int dummy; @@ -76,12 +79,12 @@ static int RENAME(dct_quantize)(MpegEncContext *s, block[0]=0; //avoid fake overflow // temp_block[0] = (block[0] + (q >> 1)) / q; last_non_zero_p1 = 1; - bias = s->q_intra_matrix16_bias[qscale]; - qmat = s->q_intra_matrix16[qscale]; + bias = s->q_intra_matrix16[qscale][1]; + qmat = s->q_intra_matrix16[qscale][0]; } else { last_non_zero_p1 = 0; - bias = s->q_inter_matrix16_bias[qscale]; - qmat = s->q_inter_matrix16[qscale]; + bias = s->q_inter_matrix16[qscale][1]; + qmat = s->q_inter_matrix16[qscale][0]; } if(s->out_format == FMT_H263 && s->mpeg_quant==0){ diff --git a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c index 1ee88b634..626c1f565 100644 --- a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c @@ -18,7 +18,6 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include "../dsputil.h" -#include "../simple_idct.h" /* 23170.475006 |