diff options
Diffstat (limited to 'src/libffmpeg/libavcodec/i386')
-rw-r--r-- | src/libffmpeg/libavcodec/i386/dsputil_mmx.c | 392 | ||||
-rw-r--r-- | src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h | 32 | ||||
-rw-r--r-- | src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h | 28 | ||||
-rw-r--r-- | src/libffmpeg/libavcodec/i386/motion_est_mmx.c | 79 | ||||
-rw-r--r-- | src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c | 73 | ||||
-rw-r--r-- | src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c | 8 | ||||
-rw-r--r-- | src/libffmpeg/libavcodec/i386/simple_idct_mmx.c | 4 |
7 files changed, 340 insertions, 276 deletions
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c index 857f1d398..d5a2d3734 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c @@ -20,33 +20,9 @@ */ #include "../dsputil.h" +#include "../simple_idct.h" int mm_flags; /* multimedia extension flags */ -/* FIXME use them in static form */ -int pix_abs16x16_mmx(UINT8 *blk1, UINT8 *blk2, int lx); -int pix_abs16x16_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); -int pix_abs16x16_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); -int pix_abs16x16_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); - -int pix_abs16x16_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); -int pix_abs16x16_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); -int pix_abs16x16_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); -int pix_abs16x16_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); - -int pix_abs8x8_mmx(UINT8 *blk1, UINT8 *blk2, int lx); -int pix_abs8x8_x2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); -int pix_abs8x8_y2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); -int pix_abs8x8_xy2_mmx(UINT8 *blk1, UINT8 *blk2, int lx); - -int pix_abs8x8_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); -int pix_abs8x8_x2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); -int pix_abs8x8_y2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); -int pix_abs8x8_xy2_mmx2(UINT8 *blk1, UINT8 *blk2, int lx); - -int sad16x16_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx); -int sad8x8_mmx(void *s, UINT8 *blk1, UINT8 *blk2, int lx); -int sad16x16_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx); -int sad8x8_mmx2(void *s, UINT8 *blk1, UINT8 *blk2, int lx); /* pixel operations */ static const uint64_t mm_bone __attribute__ ((aligned(8))) = 0x0101010101010101ULL; @@ -195,7 +171,7 @@ static const uint64_t ff_pw_15 __attribute__ ((aligned(8))) = 0x000F000F000F000F /***********************************/ /* standard MMX */ -static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) +static void get_pixels_mmx(DCTELEM *block, const uint8_t 
*pixels, int line_size) { asm volatile( "movl $-128, %%eax \n\t" @@ -223,7 +199,7 @@ static void get_pixels_mmx(DCTELEM *block, const UINT8 *pixels, int line_size) ); } -static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride) +static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) { asm volatile( "pxor %%mm7, %%mm7 \n\t" @@ -252,10 +228,10 @@ static inline void diff_pixels_mmx(DCTELEM *block, const UINT8 *s1, const UINT8 ); } -void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) +void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) { const DCTELEM *p; - UINT8 *pix; + uint8_t *pix; /* read the pixels */ p = block; @@ -307,10 +283,10 @@ void put_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) :"memory"); } -void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) +void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) { const DCTELEM *p; - UINT8 *pix; + uint8_t *pix; int i; /* read the pixels */ @@ -348,7 +324,7 @@ void add_pixels_clamped_mmx(const DCTELEM *block, UINT8 *pixels, int line_size) } while (--i); } -static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -374,7 +350,7 @@ static void put_pixels8_mmx(UINT8 *block, const UINT8 *pixels, int line_size, in ); } -static void put_pixels16_mmx(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -425,7 +401,7 @@ static void clear_blocks_mmx(DCTELEM *blocks) ); } -static int pix_sum16_mmx(UINT8 * pix, int line_size){ +static int pix_sum16_mmx(uint8_t * pix, int line_size){ const int h=16; int 
sum; int index= -line_size*h; @@ -528,7 +504,7 @@ static int pix_norm1_mmx(uint8_t *pix, int line_size) { return tmp; } -static int sse16_mmx(void *v, UINT8 * pix1, UINT8 * pix2, int line_size) { +static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size) { int tmp; asm volatile ( "movl $16,%%ecx\n" @@ -607,26 +583,21 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ for(; i<w; i++) dst[i+0] = src1[i+0]-src2[i+0]; } -#define LBUTTERFLY(a,b)\ - "paddw " #b ", " #a " \n\t"\ - "paddw " #b ", " #b " \n\t"\ - "psubw " #a ", " #b " \n\t" +#define LBUTTERFLY2(a1,b1,a2,b2) /* two interleaved word butterflies: each pair (a,b) becomes (a+b, b-a) */\ + "paddw " #b1 ", " #a1 " \n\t"\ + "paddw " #b2 ", " #a2 " \n\t"\ + "paddw " #b1 ", " #b1 " \n\t"\ + "paddw " #b2 ", " #b2 " \n\t"\ + "psubw " #a1 ", " #b1 " \n\t"\ + "psubw " #a2 ", " #b2 " \n\t" /* second pair must subtract into b2, not b1 */ #define HADAMARD48\ - LBUTTERFLY(%%mm0, %%mm1)\ - LBUTTERFLY(%%mm2, %%mm3)\ - LBUTTERFLY(%%mm4, %%mm5)\ - LBUTTERFLY(%%mm6, %%mm7)\ - \ - LBUTTERFLY(%%mm0, %%mm2)\ - LBUTTERFLY(%%mm1, %%mm3)\ - LBUTTERFLY(%%mm4, %%mm6)\ - LBUTTERFLY(%%mm5, %%mm7)\ - \ - LBUTTERFLY(%%mm0, %%mm4)\ - LBUTTERFLY(%%mm1, %%mm5)\ - LBUTTERFLY(%%mm2, %%mm6)\ - LBUTTERFLY(%%mm3, %%mm7) + LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\ + LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\ + LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\ + LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\ + LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\ + LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7) #define MMABS(a,z)\ "pxor " #z ", " #z " \n\t"\ @@ -641,12 +612,22 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ "psubw " #z ", " #a " \n\t"\ "paddusw " #a ", " #sum " \n\t" - +#define MMABS_MMX2(a,z)\ + "pxor " #z ", " #z " \n\t"\ + "psubw " #a ", " #z " \n\t"\ + "pmaxsw " #z ", " #a " \n\t" + +#define MMABS_SUM_MMX2(a,z, sum)\ + "pxor " #z ", " #z " \n\t"\ + "psubw " #a ", " #z " \n\t"\ + "pmaxsw " #z ", " #a " \n\t"\ + "paddusw " #a ", " #sum " \n\t" + #define SBUTTERFLY(a,b,t,n)\ "movq " #a ", " #t " \n\t" /* 
abcd */\ "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ - + #define TRANSPOSE4(a,b,c,d,t)\ SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\ SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\ @@ -750,7 +731,94 @@ static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride) return sum&0xFFFF; } +static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride){ + uint64_t temp[16] __align8; + int sum=0; + + diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride); + + asm volatile( + LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) + LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7) + + HADAMARD48 + + "movq %%mm7, 112(%1) \n\t" + + TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) + STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2) + + "movq 112(%1), %%mm7 \n\t" + TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) + STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6) + + LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3) + LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) + + HADAMARD48 + + "movq %%mm7, 120(%1) \n\t" + + TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7) + STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2) + + "movq 120(%1), %%mm7 \n\t" + TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0) + "movq %%mm7, %%mm5 \n\t"//FIXME remove + "movq %%mm6, %%mm7 \n\t" + "movq %%mm0, %%mm6 \n\t" +// STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove + + LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3) +// LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7) + + HADAMARD48 + "movq %%mm7, 64(%1) \n\t" + MMABS_MMX2(%%mm0, %%mm7) + MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) + MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0) + MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0) + MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0) + MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0) + MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0) + "movq 64(%1), %%mm1 \n\t" + MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) + "movq %%mm0, 64(%1) \n\t" + + LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3) + LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7) + + HADAMARD48 + "movq %%mm7, (%1) \n\t" + MMABS_MMX2(%%mm0, %%mm7) + MMABS_SUM_MMX2(%%mm1, %%mm7, 
%%mm0) + MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0) + MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0) + MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0) + MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0) + MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0) + "movq (%1), %%mm1 \n\t" + MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) + "movq 64(%1), %%mm1 \n\t" + MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0) + + "movq %%mm0, %%mm1 \n\t" + "psrlq $32, %%mm0 \n\t" + "paddusw %%mm1, %%mm0 \n\t" + "movq %%mm0, %%mm1 \n\t" + "psrlq $16, %%mm0 \n\t" + "paddusw %%mm1, %%mm0 \n\t" + "movd %%mm0, %0 \n\t" + + : "=r" (sum) + : "r"(temp) + ); + return sum&0xFFFF; +} + + WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx) +WARPER88_1616(hadamard8_diff_mmx2, hadamard8_diff16_mmx2) #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d) #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d) @@ -777,7 +845,7 @@ WARPER88_1616(hadamard8_diff_mmx, hadamard8_diff16_mmx) OP(%%mm5, out, %%mm7, d) #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ -void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ +static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ uint64_t temp;\ \ asm volatile(\ @@ -944,7 +1012,7 @@ static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, i }\ }\ \ -void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ +static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ uint64_t temp;\ \ asm volatile(\ @@ -1121,7 +1189,7 @@ static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, );\ }\ \ -void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ +static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ uint64_t temp[9*4];\ uint64_t *temp_ptr= 
temp;\ int count= 9;\ @@ -1181,46 +1249,46 @@ void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dst );\ }\ \ -static void OPNAME ## qpel8_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ OPNAME ## pixels8_mmx(dst, src, stride, 8);\ }\ \ -static void OPNAME ## qpel8_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t temp[8];\ uint8_t * const half= (uint8_t*)temp;\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\ }\ \ -static void OPNAME ## qpel8_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\ }\ \ -static void OPNAME ## qpel8_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t temp[8];\ uint8_t * const half= (uint8_t*)temp;\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\ OPNAME ## pixels8_l2_mmx(dst, src+1, half, stride, stride, 8);\ }\ \ -static void OPNAME ## qpel8_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t temp[8];\ uint8_t * const half= (uint8_t*)temp;\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ OPNAME ## pixels8_l2_mmx(dst, src, half, stride, stride, 8);\ }\ \ -static void OPNAME ## qpel8_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\ }\ \ -static void OPNAME ## qpel8_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static 
void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t temp[8];\ uint8_t * const half= (uint8_t*)temp;\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\ OPNAME ## pixels8_l2_mmx(dst, src+stride, half, stride, stride, 8);\ }\ -static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[8 + 9];\ uint8_t * const halfH= ((uint8_t*)half) + 64;\ uint8_t * const halfHV= ((uint8_t*)half);\ @@ -1229,7 +1297,7 @@ static void OPNAME ## qpel8_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\ }\ -static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[8 + 9];\ uint8_t * const halfH= ((uint8_t*)half) + 64;\ uint8_t * const halfHV= ((uint8_t*)half);\ @@ -1238,7 +1306,7 @@ static void OPNAME ## qpel8_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\ }\ -static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[8 + 9];\ uint8_t * const halfH= ((uint8_t*)half) + 64;\ uint8_t * const halfHV= ((uint8_t*)half);\ @@ -1247,7 +1315,7 @@ static void OPNAME ## qpel8_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\ }\ -static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[8 + 9];\ uint8_t * 
const halfH= ((uint8_t*)half) + 64;\ uint8_t * const halfHV= ((uint8_t*)half);\ @@ -1256,7 +1324,7 @@ static void OPNAME ## qpel8_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\ }\ -static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[8 + 9];\ uint8_t * const halfH= ((uint8_t*)half) + 64;\ uint8_t * const halfHV= ((uint8_t*)half);\ @@ -1264,7 +1332,7 @@ static void OPNAME ## qpel8_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l2_mmx(dst, halfH, halfHV, stride, 8, 8);\ }\ -static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[8 + 9];\ uint8_t * const halfH= ((uint8_t*)half) + 64;\ uint8_t * const halfHV= ((uint8_t*)half);\ @@ -1272,66 +1340,66 @@ static void OPNAME ## qpel8_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ OPNAME ## pixels8_l2_mmx(dst, halfH+8, halfHV, stride, 8, 8);\ }\ -static void OPNAME ## qpel8_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[8 + 9];\ uint8_t * const halfH= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ put ## RND ## pixels8_l2_mmx(halfH, src, halfH, 8, stride, 9);\ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ }\ -static void OPNAME ## qpel8_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[8 + 9];\ uint8_t * const halfH= ((uint8_t*)half);\ 
put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ put ## RND ## pixels8_l2_mmx(halfH, src+1, halfH, 8, stride, 9);\ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ }\ -static void OPNAME ## qpel8_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[9];\ uint8_t * const halfH= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\ OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\ }\ -static void OPNAME ## qpel16_mc00_ ## MMX (UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\ OPNAME ## pixels16_mmx(dst, src, stride, 16);\ }\ \ -static void OPNAME ## qpel16_mc10_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t temp[32];\ uint8_t * const half= (uint8_t*)temp;\ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\ }\ \ -static void OPNAME ## qpel16_mc20_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\ }\ \ -static void OPNAME ## qpel16_mc30_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t temp[32];\ uint8_t * const half= (uint8_t*)temp;\ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\ OPNAME ## pixels16_l2_mmx(dst, src+1, half, stride, stride, 16);\ }\ \ -static void OPNAME ## qpel16_mc01_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t temp[32];\ uint8_t * const half= (uint8_t*)temp;\ 
put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ OPNAME ## pixels16_l2_mmx(dst, src, half, stride, stride, 16);\ }\ \ -static void OPNAME ## qpel16_mc02_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\ }\ \ -static void OPNAME ## qpel16_mc03_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t temp[32];\ uint8_t * const half= (uint8_t*)temp;\ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\ OPNAME ## pixels16_l2_mmx(dst, src+stride, half, stride, stride, 16);\ }\ -static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[16*2 + 17*2];\ uint8_t * const halfH= ((uint8_t*)half) + 256;\ uint8_t * const halfHV= ((uint8_t*)half);\ @@ -1340,7 +1408,7 @@ static void OPNAME ## qpel16_mc11_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\ }\ -static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[16*2 + 17*2];\ uint8_t * const halfH= ((uint8_t*)half) + 256;\ uint8_t * const halfHV= ((uint8_t*)half);\ @@ -1349,7 +1417,7 @@ static void OPNAME ## qpel16_mc31_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\ }\ -static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[16*2 + 17*2];\ 
uint8_t * const halfH= ((uint8_t*)half) + 256;\ uint8_t * const halfHV= ((uint8_t*)half);\ @@ -1358,7 +1426,7 @@ static void OPNAME ## qpel16_mc13_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\ }\ -static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[16*2 + 17*2];\ uint8_t * const halfH= ((uint8_t*)half) + 256;\ uint8_t * const halfHV= ((uint8_t*)half);\ @@ -1367,7 +1435,7 @@ static void OPNAME ## qpel16_mc33_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\ }\ -static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[16*2 + 17*2];\ uint8_t * const halfH= ((uint8_t*)half) + 256;\ uint8_t * const halfHV= ((uint8_t*)half);\ @@ -1375,7 +1443,7 @@ static void OPNAME ## qpel16_mc21_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ OPNAME ## pixels16_l2_mmx(dst, halfH, halfHV, stride, 16, 16);\ }\ -static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[16*2 + 17*2];\ uint8_t * const halfH= ((uint8_t*)half) + 256;\ uint8_t * const halfHV= ((uint8_t*)half);\ @@ -1383,21 +1451,21 @@ static void OPNAME ## qpel16_mc23_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\ OPNAME ## pixels16_l2_mmx(dst, halfH+16, halfHV, stride, 16, 16);\ }\ -static void OPNAME ## qpel16_mc12_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ 
+static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[17*2];\ uint8_t * const halfH= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ put ## RND ## pixels16_l2_mmx(halfH, src, halfH, 16, stride, 17);\ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ }\ -static void OPNAME ## qpel16_mc32_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[17*2];\ uint8_t * const halfH= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ put ## RND ## pixels16_l2_mmx(halfH, src+1, halfH, 16, stride, 17);\ OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\ }\ -static void OPNAME ## qpel16_mc22_ ## MMX(UINT8 *dst, UINT8 *src, int stride){\ +static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\ uint64_t half[17*2];\ uint8_t * const halfH= ((uint8_t*)half);\ put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\ @@ -1433,10 +1501,45 @@ static void just_return() { return; } c->put_ ## postfix1 = put_ ## postfix2;\ c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\ c->avg_ ## postfix1 = avg_ ## postfix2; + +/* external functions, from idct_mmx.c */ +void ff_mmx_idct(DCTELEM *block); +void ff_mmxext_idct(DCTELEM *block); + +/* XXX: those functions should be suppressed ASAP when all IDCTs are + converted */ +static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_mmx_idct (block); + put_pixels_clamped_mmx(block, dest, line_size); +} +static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_mmx_idct (block); + add_pixels_clamped_mmx(block, dest, line_size); +} +static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_mmxext_idct (block); + put_pixels_clamped_mmx(block, dest, line_size); 
+} +static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block) +{ + ff_mmxext_idct (block); + add_pixels_clamped_mmx(block, dest, line_size); +} -void dsputil_init_mmx(DSPContext* c, unsigned mask) +void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) { mm_flags = mm_support(); + + if (avctx->dsp_mask) { + if (avctx->dsp_mask & FF_MM_FORCE) + mm_flags |= (avctx->dsp_mask & 0xffff); + else + mm_flags &= ~(avctx->dsp_mask & 0xffff); + } + #if 0 fprintf(stderr, "libavcodec: CPU flags:"); if (mm_flags & MM_MMX) @@ -1453,6 +1556,27 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask) #endif if (mm_flags & MM_MMX) { + const int dct_algo = avctx->dct_algo; + const int idct_algo= avctx->idct_algo; + + if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX) + c->fdct = ff_fdct_mmx; + + if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ + c->idct_put= ff_simple_idct_put_mmx; + c->idct_add= ff_simple_idct_add_mmx; + c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; + }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ + if(mm_flags & MM_MMXEXT){ + c->idct_put= ff_libmpeg2mmx2_idct_put; + c->idct_add= ff_libmpeg2mmx2_idct_add; + }else{ + c->idct_put= ff_libmpeg2mmx_idct_put; + c->idct_add= ff_libmpeg2mmx_idct_add; + } + c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; + } + c->get_pixels = get_pixels_mmx; c->diff_pixels = diff_pixels_mmx; c->put_pixels_clamped = put_pixels_clamped_mmx; @@ -1460,15 +1584,6 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask) c->clear_blocks = clear_blocks_mmx; c->pix_sum = pix_sum16_mmx; - c->pix_abs16x16 = pix_abs16x16_mmx; - c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx; - c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx; - c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; - c->pix_abs8x8 = pix_abs8x8_mmx; - c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx; - c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx; - c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx; - c->put_pixels_tab[0][0] = put_pixels16_mmx; c->put_pixels_tab[0][1] = put_pixels16_x2_mmx; 
c->put_pixels_tab[0][2] = put_pixels16_y2_mmx; @@ -1515,45 +1630,35 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask) c->hadamard8_diff[0]= hadamard8_diff16_mmx; c->hadamard8_diff[1]= hadamard8_diff_mmx; - c->sad[0]= sad16x16_mmx; - c->sad[1]= sad8x8_mmx; - c->pix_norm1 = pix_norm1_mmx; c->sse[0] = sse16_mmx; if (mm_flags & MM_MMXEXT) { - c->pix_abs16x16 = pix_abs16x16_mmx2; - c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; - c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; - c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2; - - c->pix_abs8x8 = pix_abs8x8_mmx2; - c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; - c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; - c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2; - - c->sad[0]= sad16x16_mmx2; - c->sad[1]= sad8x8_mmx2; - c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; c->avg_pixels_tab[0][0] = avg_pixels16_mmx2; c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2; c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2; - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; + + c->hadamard8_diff[0]= hadamard8_diff16_mmx2; + c->hadamard8_diff[1]= hadamard8_diff_mmx2; + + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; + c->avg_pixels_tab[0][3] = 
avg_pixels16_xy2_mmx2; + c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; + } #if 1 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2) @@ -1592,23 +1697,26 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask) } else if (mm_flags & MM_3DNOW) { c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow; c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow; - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; c->avg_pixels_tab[1][0] = avg_pixels8_3dnow; c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; + + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; + c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; + c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; + c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; + c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; + c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; + } SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow) @@ -1644,7 +1752,8 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask) SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow) } } - + + dsputil_init_pix_mmx(c, avctx); #if 0 // for speed testing get_pixels = just_return; @@ -1680,28 +1789,3 @@ void dsputil_init_mmx(DSPContext* c, unsigned mask) //ff_idct = just_return; #endif } - -/* remove any non bit exact operation (testing 
purpose). NOTE that - this function should be kept as small as possible because it is - always difficult to test automatically non bit exact cases. */ -void dsputil_set_bit_exact_mmx(DSPContext* c, unsigned mask) -{ - if (mm_flags & MM_MMX) { - /* MMX2 & 3DNOW */ - c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; - c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; - c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx; - c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx; - c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx; - - if (mm_flags & MM_MMXEXT) { - c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx; - c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx; - c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; - c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx; - c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx; - c->pix_abs8x8_xy2= pix_abs8x8_xy2_mmx; - } - } -} diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h index 4a8841156..8418123ac 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_avg.h @@ -25,7 +25,7 @@ /* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm clobber bug - now it will work with 2.95.2 and also with -fPIC */ -static void DEF(put_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -85,7 +85,7 @@ static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int :"memory"); } -static void DEF(put_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -154,7 +154,7 @@ static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int } /* 
GL: this function does incorrect rounding if overflow */ -static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_BONE(mm6); __asm __volatile( @@ -191,7 +191,7 @@ static void DEF(put_no_rnd_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int li :"%eax", "memory"); } -static void DEF(put_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -222,7 +222,7 @@ static void DEF(put_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size } /* GL: this function does incorrect rounding if overflow */ -static void DEF(put_no_rnd_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_BONE(mm6); __asm __volatile( @@ -255,7 +255,7 @@ static void DEF(put_no_rnd_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int li :"%eax", "memory"); } -static void DEF(avg_pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -283,7 +283,7 @@ static void DEF(avg_pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, i :"%eax", "memory"); } -static void DEF(avg_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -315,7 +315,7 @@ static void DEF(avg_pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size :"%eax", "memory"); } -static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg_pixels8_y2)(uint8_t 
*block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%eax \n\t" @@ -354,7 +354,7 @@ static void DEF(avg_pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size } // Note this is not correctly rounded, but this function is only used for b frames so it doesnt matter -static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_BONE(mm6); __asm __volatile( @@ -396,31 +396,31 @@ static void DEF(avg_pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_siz } //FIXME the following could be optimized too ... -static void DEF(put_no_rnd_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ +static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h); DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h); } -static void DEF(put_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ +static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ DEF(put_pixels8_y2)(block , pixels , line_size, h); DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h); } -static void DEF(put_no_rnd_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ +static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h); DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h); } -static void DEF(avg_pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ +static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ DEF(avg_pixels8)(block , pixels , line_size, h); DEF(avg_pixels8)(block+8, pixels+8, line_size, h); } -static void DEF(avg_pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int 
h){ +static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ DEF(avg_pixels8_x2)(block , pixels , line_size, h); DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h); } -static void DEF(avg_pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ +static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ DEF(avg_pixels8_y2)(block , pixels , line_size, h); DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h); } -static void DEF(avg_pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ +static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ DEF(avg_pixels8_xy2)(block , pixels , line_size, h); DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h); } diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h index 956edf798..bbd5aec97 100644 --- a/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h +++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx_rnd.h @@ -22,7 +22,7 @@ */ // put_pixels -static void DEF(put, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_BFE(mm6); __asm __volatile( @@ -104,7 +104,7 @@ static void DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int :"memory"); } -static void DEF(put, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_BFE(mm6); __asm __volatile( @@ -199,7 +199,7 @@ static void DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in :"memory"); } -static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_BFE(mm6); __asm __volatile( @@ -228,7 
+228,7 @@ static void DEF(put, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_siz :"eax", "memory"); } -static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_ZERO(mm7); SET_RND(mm6); // =2 for rnd and =1 for no_rnd version @@ -296,7 +296,7 @@ static void DEF(put, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si // avg_pixels // in case more speed is needed - unroling would certainly help -static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_BFE(mm6); JUMPALIGN(); @@ -315,7 +315,7 @@ static void DEF(avg, pixels8)(UINT8 *block, const UINT8 *pixels, int line_size, while (--h); } -static void DEF(avg, pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_BFE(mm6); JUMPALIGN(); @@ -338,7 +338,7 @@ static void DEF(avg, pixels16)(UINT8 *block, const UINT8 *pixels, int line_size, while (--h); } -static void DEF(avg, pixels8_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_BFE(mm6); JUMPALIGN(); @@ -379,7 +379,7 @@ static void DEF(avg, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int } while (--h); } -static void DEF(avg, pixels16_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_BFE(mm6); JUMPALIGN(); @@ -432,7 +432,7 @@ static void DEF(avg, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, in } while (--h); } -static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg, 
pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_BFE(mm6); __asm __volatile( @@ -472,7 +472,7 @@ static void DEF(avg, pixels8_y2)(UINT8 *block, const UINT8 *pixels, int line_siz } // this routine is 'slightly' suboptimal but mostly unused -static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h) +static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h) { MOVQ_ZERO(mm7); SET_RND(mm6); // =2 for rnd and =1 for no_rnd version @@ -547,22 +547,22 @@ static void DEF(avg, pixels8_xy2)(UINT8 *block, const UINT8 *pixels, int line_si } //FIXME optimize -static void DEF(put, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ +static void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ DEF(put, pixels8_y2)(block , pixels , line_size, h); DEF(put, pixels8_y2)(block+8, pixels+8, line_size, h); } -static void DEF(put, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ +static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ DEF(put, pixels8_xy2)(block , pixels , line_size, h); DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h); } -static void DEF(avg, pixels16_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ +static void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ DEF(avg, pixels8_y2)(block , pixels , line_size, h); DEF(avg, pixels8_y2)(block+8, pixels+8, line_size, h); } -static void DEF(avg, pixels16_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h){ +static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){ DEF(avg, pixels8_xy2)(block , pixels , line_size, h); DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h); } diff --git a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c index fa85db67b..5c4b32dcd 
100644 --- a/src/libffmpeg/libavcodec/i386/motion_est_mmx.c +++ b/src/libffmpeg/libavcodec/i386/motion_est_mmx.c @@ -20,7 +20,7 @@ */ #include "../dsputil.h" -static const __attribute__ ((aligned(8))) UINT64 round_tab[3]={ +static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={ 0x0000000000000000, 0x0001000100010001, 0x0002000200020002, @@ -28,7 +28,7 @@ static const __attribute__ ((aligned(8))) UINT64 round_tab[3]={ static __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL; -static inline void sad8_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h) +static inline void sad8_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) { int len= -(stride<<h); asm volatile( @@ -64,7 +64,7 @@ static inline void sad8_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h) ); } -static inline void sad8_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h) +static inline void sad8_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) { int len= -(stride<<h); asm volatile( @@ -86,7 +86,7 @@ static inline void sad8_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h) ); } -static inline void sad8_2_mmx2(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h) +static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) { int len= -(stride<<h); asm volatile( @@ -112,13 +112,13 @@ static inline void sad8_2_mmx2(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stri ); } -static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h) +static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h) { //FIXME reuse src int len= -(stride<<h); asm volatile( ".balign 16 \n\t" "movq "MANGLE(bone)", %%mm5 \n\t" - "1: \n\t" + "1: \n\t" "movq (%1, %%eax), %%mm0 \n\t" "movq (%2, %%eax), %%mm2 \n\t" "movq 1(%1, %%eax), %%mm1 \n\t" @@ -149,7 +149,7 @@ static inline void sad8_4_mmx2(UINT8 *blk1, UINT8 *blk2, int stride, int h) ); } -static inline void sad8_2_mmx(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int stride, int h) 
+static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) { int len= -(stride<<h); asm volatile( @@ -165,7 +165,7 @@ static inline void sad8_2_mmx(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int strid "punpckhbw %%mm7, %%mm3 \n\t" "paddw %%mm0, %%mm1 \n\t" "paddw %%mm2, %%mm3 \n\t" - "movq (%3, %%eax), %%mm4 \n\t" + "movq (%3, %%eax), %%mm4 \n\t" "movq (%3, %%eax), %%mm2 \n\t" "paddw %%mm5, %%mm1 \n\t" "paddw %%mm5, %%mm3 \n\t" @@ -187,7 +187,7 @@ static inline void sad8_2_mmx(UINT8 *blk1a, UINT8 *blk1b, UINT8 *blk2, int strid ); } -static inline void sad8_4_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h) +static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) { int len= -(stride<<h); asm volatile( @@ -215,8 +215,8 @@ static inline void sad8_4_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h) "punpckhbw %%mm7, %%mm4 \n\t" "paddw %%mm3, %%mm2 \n\t" "paddw %%mm4, %%mm1 \n\t" - "movq (%3, %%eax), %%mm3 \n\t" - "movq (%3, %%eax), %%mm4 \n\t" + "movq (%3, %%eax), %%mm3 \n\t" + "movq (%3, %%eax), %%mm4 \n\t" "paddw %%mm5, %%mm2 \n\t" "paddw %%mm5, %%mm1 \n\t" "psrlw $2, %%mm2 \n\t" @@ -237,7 +237,7 @@ static inline void sad8_4_mmx(UINT8 *blk1, UINT8 *blk2, int stride, int h) ); } -static inline int sum_mmx() +static inline int sum_mmx(void) { int ret; asm volatile( @@ -253,7 +253,7 @@ static inline int sum_mmx() return ret&0xFFFF; } -static inline int sum_mmx2() +static inline int sum_mmx2(void) { int ret; asm volatile( @@ -265,7 +265,7 @@ static inline int sum_mmx2() #define PIX_SAD(suf)\ -int pix_abs8x8_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ +static int pix_abs8x8_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t":);\ @@ -274,7 +274,7 @@ int pix_abs8x8_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ \ return sum_ ## suf();\ }\ -int sad8x8_ ## suf(void *s, UINT8 *blk2, UINT8 *blk1, int stride)\ +static int sad8x8_ ## suf(void *s, uint8_t 
*blk2, uint8_t *blk1, int stride)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t":);\ @@ -284,7 +284,7 @@ int sad8x8_ ## suf(void *s, UINT8 *blk2, UINT8 *blk1, int stride)\ return sum_ ## suf();\ }\ \ -int pix_abs8x8_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ +static int pix_abs8x8_x2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ @@ -297,7 +297,7 @@ int pix_abs8x8_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ return sum_ ## suf();\ }\ \ -int pix_abs8x8_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ +static int pix_abs8x8_y2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ @@ -310,7 +310,7 @@ int pix_abs8x8_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ return sum_ ## suf();\ }\ \ -int pix_abs8x8_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ +static int pix_abs8x8_xy2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ @@ -323,7 +323,7 @@ int pix_abs8x8_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ return sum_ ## suf();\ }\ \ -int pix_abs16x16_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ +static int pix_abs16x16_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t":);\ @@ -333,7 +333,7 @@ int pix_abs16x16_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ \ return sum_ ## suf();\ }\ -int sad16x16_ ## suf(void *s, UINT8 *blk2, UINT8 *blk1, int stride)\ +static int sad16x16_ ## suf(void *s, uint8_t *blk2, uint8_t *blk1, int stride)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t":);\ @@ -343,7 +343,7 @@ int sad16x16_ ## suf(void *s, UINT8 *blk2, UINT8 *blk1, int stride)\ \ return sum_ ## suf();\ }\ -int pix_abs16x16_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ +static int pix_abs16x16_x2_ ## suf(uint8_t *blk2, uint8_t *blk1, int 
stride)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ @@ -356,7 +356,7 @@ int pix_abs16x16_x2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ \ return sum_ ## suf();\ }\ -int pix_abs16x16_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ +static int pix_abs16x16_y2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ @@ -369,7 +369,7 @@ int pix_abs16x16_y2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ \ return sum_ ## suf();\ }\ -int pix_abs16x16_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ +static int pix_abs16x16_xy2_ ## suf(uint8_t *blk2, uint8_t *blk1, int stride)\ {\ asm volatile("pxor %%mm7, %%mm7 \n\t"\ "pxor %%mm6, %%mm6 \n\t"\ @@ -385,3 +385,36 @@ int pix_abs16x16_xy2_ ## suf(UINT8 *blk2, UINT8 *blk1, int stride)\ PIX_SAD(mmx) PIX_SAD(mmx2) + +void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx) +{ + if (mm_flags & MM_MMX) { + c->pix_abs16x16 = pix_abs16x16_mmx; + c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx; + c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx; + c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx; + c->pix_abs8x8 = pix_abs8x8_mmx; + c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx; + c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx; + c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx; + + c->sad[0]= sad16x16_mmx; + c->sad[1]= sad8x8_mmx; + } + if (mm_flags & MM_MMXEXT) { + c->pix_abs16x16 = pix_abs16x16_mmx2; + c->pix_abs8x8 = pix_abs8x8_mmx2; + + c->sad[0]= sad16x16_mmx2; + c->sad[1]= sad8x8_mmx2; + + if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ + c->pix_abs16x16_x2 = pix_abs16x16_x2_mmx2; + c->pix_abs16x16_y2 = pix_abs16x16_y2_mmx2; + c->pix_abs16x16_xy2 = pix_abs16x16_xy2_mmx2; + c->pix_abs8x8_x2 = pix_abs8x8_x2_mmx2; + c->pix_abs8x8_y2 = pix_abs8x8_y2_mmx2; + c->pix_abs8x8_xy2 = pix_abs8x8_xy2_mmx2; + } + } +} diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c index be8015dd3..8e452b499 100644 --- 
a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx.c @@ -23,12 +23,10 @@ #include "../dsputil.h" #include "../mpegvideo.h" #include "../avcodec.h" -#include "../simple_idct.h" -#include "xineutils.h" -extern UINT8 zigzag_direct_noperm[64]; -extern UINT16 inv_zigzag_direct16[64]; -extern UINT32 inverse[256]; +extern uint8_t zigzag_direct_noperm[64]; +extern uint16_t inv_zigzag_direct16[64]; +extern uint32_t inverse[256]; static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL; static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL; @@ -42,9 +40,7 @@ static void dct_unquantize_h263_mmx(MpegEncContext *s, qmul = qscale << 1; qadd = (qscale - 1) | 1; - XINE_ASSERT(s->block_last_index[n]>=0, - "value 's->block_last_index[%d] is < 0: %d", - n, s->block_last_index[n]); + assert(s->block_last_index[n]>=0); if (s->mb_intra) { if (!s->h263_aic) { @@ -147,11 +143,9 @@ static void dct_unquantize_mpeg1_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { int nCoeffs; - const UINT16 *quant_matrix; + const uint16_t *quant_matrix; - XINE_ASSERT(s->block_last_index[n]>=0, - "value 's->block_last_index[%d] is < 0: %d", - n, s->block_last_index[n]); + assert(s->block_last_index[n]>=0); nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; @@ -277,11 +271,9 @@ static void dct_unquantize_mpeg2_mmx(MpegEncContext *s, DCTELEM *block, int n, int qscale) { int nCoeffs; - const UINT16 *quant_matrix; + const uint16_t *quant_matrix; - XINE_ASSERT(s->block_last_index[n]>=0, - "value 's->block_last_index[%d] is < 0: %d", - n, s->block_last_index[n]); + assert(s->block_last_index[n]>=0); if(s->alternate_scan) nCoeffs= 63; //FIXME else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; @@ -411,9 +403,9 @@ asm volatile( /* draw the edges of width 'w' of an image of size width, height this mmx version can only handle w==8 || w==16 */ 
-static void draw_edges_mmx(UINT8 *buf, int wrap, int width, int height, int w) +static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w) { - UINT8 *ptr, *last_line; + uint8_t *ptr, *last_line; int i; last_line = buf + (height - 1) * wrap; @@ -506,38 +498,10 @@ static void draw_edges_mmx(UINT8 *buf, int wrap, int width, int height, int w) #define RENAME(a) a ## _MMX2 #include "mpegvideo_mmx_template.c" -/* external functions, from idct_mmx.c */ -void ff_mmx_idct(DCTELEM *block); -void ff_mmxext_idct(DCTELEM *block); - -/* XXX: those functions should be suppressed ASAP when all IDCTs are - converted */ -static void ff_libmpeg2mmx_idct_put(UINT8 *dest, int line_size, DCTELEM *block) -{ - ff_mmx_idct (block); - put_pixels_clamped_mmx(block, dest, line_size); -} -static void ff_libmpeg2mmx_idct_add(UINT8 *dest, int line_size, DCTELEM *block) -{ - ff_mmx_idct (block); - add_pixels_clamped_mmx(block, dest, line_size); -} -static void ff_libmpeg2mmx2_idct_put(UINT8 *dest, int line_size, DCTELEM *block) -{ - ff_mmxext_idct (block); - put_pixels_clamped_mmx(block, dest, line_size); -} -static void ff_libmpeg2mmx2_idct_add(UINT8 *dest, int line_size, DCTELEM *block) -{ - ff_mmxext_idct (block); - add_pixels_clamped_mmx(block, dest, line_size); -} - void MPV_common_init_mmx(MpegEncContext *s) { if (mm_flags & MM_MMX) { const int dct_algo = s->avctx->dct_algo; - const int idct_algo= s->avctx->idct_algo; s->dct_unquantize_h263 = dct_unquantize_h263_mmx; s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx; @@ -546,28 +510,11 @@ void MPV_common_init_mmx(MpegEncContext *s) draw_edges = draw_edges_mmx; if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ - s->fdct = ff_fdct_mmx; - if(mm_flags & MM_MMXEXT){ s->dct_quantize= dct_quantize_MMX2; } else { s->dct_quantize= dct_quantize_MMX; } } - - if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ - s->idct_put= ff_simple_idct_put_mmx; - s->idct_add= ff_simple_idct_add_mmx; - s->idct_permutation_type= 
FF_SIMPLE_IDCT_PERM; - }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ - if(mm_flags & MM_MMXEXT){ - s->idct_put= ff_libmpeg2mmx2_idct_put; - s->idct_add= ff_libmpeg2mmx2_idct_add; - }else{ - s->idct_put= ff_libmpeg2mmx_idct_put; - s->idct_add= ff_libmpeg2mmx_idct_add; - } - s->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; - } } } diff --git a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c index ead30ed31..8cd91024b 100644 --- a/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c +++ b/src/libffmpeg/libavcodec/i386/mpegvideo_mmx_template.c @@ -37,8 +37,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s, int qscale, int *overflow) { int level=0, last_non_zero_p1, q; //=0 is cuz gcc says uninitalized ... - const UINT16 *qmat, *bias; - static __align8 INT16 temp_block[64]; + const uint16_t *qmat, *bias; + static __align8 int16_t temp_block[64]; //s->fdct (block); ff_fdct_mmx (block); //cant be anything else ... @@ -207,7 +207,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, if(s->mb_intra) block[0]= level; else block[0]= temp_block[0]; - if(s->idct_permutation[1]==8){ + if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){ if(last_non_zero_p1 <= 1) goto end; block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08]; block[0x20] = temp_block[0x10]; @@ -251,7 +251,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s, block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36]; block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37]; block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; - }else if(s->idct_permutation[1]==4){ + }else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){ if(last_non_zero_p1 <= 1) goto end; block[0x04] = temp_block[0x01]; block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; diff --git a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c index 9dfd5f149..836403ca5 100644 --- 
a/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c +++ b/src/libffmpeg/libavcodec/i386/simple_idct_mmx.c @@ -1298,12 +1298,12 @@ void ff_simple_idct_mmx(int16_t *block) //FIXME merge add/put into the idct -void ff_simple_idct_put_mmx(UINT8 *dest, int line_size, DCTELEM *block) +void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block) { idct(block); put_pixels_clamped_mmx(block, dest, line_size); } -void ff_simple_idct_add_mmx(UINT8 *dest, int line_size, DCTELEM *block) +void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block) { idct(block); add_pixels_clamped_mmx(block, dest, line_size); |