summaryrefslogtreecommitdiff
path: root/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libffmpeg/libavcodec/i386/dsputil_mmx.c')
-rw-r--r--src/libffmpeg/libavcodec/i386/dsputil_mmx.c191
1 files changed, 163 insertions, 28 deletions
diff --git a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
index 6bd2b32b9..c1dd2176a 100644
--- a/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
+++ b/src/libffmpeg/libavcodec/i386/dsputil_mmx.c
@@ -29,6 +29,8 @@
//#include <assert.h>
extern const uint8_t ff_h263_loop_filter_strength[32];
+extern void ff_idct_xvid_mmx(short *block);
+extern void ff_idct_xvid_mmx2(short *block);
int mm_flags; /* multimedia extension flags */
@@ -615,31 +617,32 @@ static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
}
static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
+ void *dst_reg = dst, *src_reg = src;
+
asm volatile( //FIXME could save 1 instruction if done as 8x4 ...
- "movd %4, %%mm0 \n\t"
- "movd %5, %%mm1 \n\t"
- "movd %6, %%mm2 \n\t"
- "movd %7, %%mm3 \n\t"
+ "movd (%1), %%mm0 \n\t"
+ "movd (%1,%5), %%mm1 \n\t"
+ "lea (%1, %5, 2), %1 \n\t"
+ "movd (%1), %%mm2 \n\t"
+ "movd (%1,%5), %%mm3 \n\t"
"punpcklbw %%mm1, %%mm0 \n\t"
"punpcklbw %%mm3, %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"punpcklwd %%mm2, %%mm0 \n\t"
"punpckhwd %%mm2, %%mm1 \n\t"
- "movd %%mm0, %0 \n\t"
+ "movd %%mm0, (%0) \n\t"
"punpckhdq %%mm0, %%mm0 \n\t"
- "movd %%mm0, %1 \n\t"
- "movd %%mm1, %2 \n\t"
+ "movd %%mm0, (%0,%4) \n\t"
+ "lea (%0, %4, 2), %0 \n\t"
+ "movd %%mm1, (%0) \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
- "movd %%mm1, %3 \n\t"
-
- : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
- "=m" (*(uint32_t*)(dst + 1*dst_stride)),
- "=m" (*(uint32_t*)(dst + 2*dst_stride)),
- "=m" (*(uint32_t*)(dst + 3*dst_stride))
- : "m" (*(uint32_t*)(src + 0*src_stride)),
- "m" (*(uint32_t*)(src + 1*src_stride)),
- "m" (*(uint32_t*)(src + 2*src_stride)),
- "m" (*(uint32_t*)(src + 3*src_stride))
+ "movd %%mm1, (%0,%4) \n\t"
+ : "=&r" (dst_reg),
+ "=&r" (src_reg)
+ : "0" (dst_reg),
+ "1" (src_reg),
+ "r" (dst_stride),
+ "r" (src_stride)
);
}
@@ -742,31 +745,49 @@ static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
int tmp;
asm volatile (
"movl %4,%%ecx\n"
+ "shr $1,%%ecx\n"
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
"1:\n"
- "movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
- "movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
+ "movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
+ "movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
+ "movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
+ "movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
+ /* todo: mm1-mm2, mm3-mm4 */
+ /* algo: substract mm1 from mm2 with saturation and vice versa */
+ /* OR the results to get absolute difference */
"movq %%mm1,%%mm5\n"
+ "movq %%mm3,%%mm6\n"
"psubusb %%mm2,%%mm1\n"
+ "psubusb %%mm4,%%mm3\n"
"psubusb %%mm5,%%mm2\n"
+ "psubusb %%mm6,%%mm4\n"
"por %%mm1,%%mm2\n"
+ "por %%mm3,%%mm4\n"
+ /* now convert to 16-bit vectors so we can square them */
"movq %%mm2,%%mm1\n"
+ "movq %%mm4,%%mm3\n"
"punpckhbw %%mm0,%%mm2\n"
+ "punpckhbw %%mm0,%%mm4\n"
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
+ "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
"pmaddwd %%mm2,%%mm2\n"
+ "pmaddwd %%mm4,%%mm4\n"
"pmaddwd %%mm1,%%mm1\n"
+ "pmaddwd %%mm3,%%mm3\n"
- "add %3,%0\n"
- "add %3,%1\n"
+ "lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
+ "lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
"paddd %%mm2,%%mm1\n"
+ "paddd %%mm4,%%mm3\n"
"paddd %%mm1,%%mm7\n"
+ "paddd %%mm3,%%mm7\n"
"decl %%ecx\n"
"jnz 1b\n"
@@ -841,6 +862,68 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
return tmp;
}
+static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ int tmp;
+ asm volatile (
+ "shr $1,%2\n"
+ "pxor %%xmm0,%%xmm0\n" /* mm0 = 0 */
+ "pxor %%xmm7,%%xmm7\n" /* mm7 holds the sum */
+ "1:\n"
+ "movdqu (%0),%%xmm1\n" /* mm1 = pix1[0][0-15] */
+ "movdqu (%1),%%xmm2\n" /* mm2 = pix2[0][0-15] */
+ "movdqu (%0,%4),%%xmm3\n" /* mm3 = pix1[1][0-15] */
+ "movdqu (%1,%4),%%xmm4\n" /* mm4 = pix2[1][0-15] */
+
+ /* todo: mm1-mm2, mm3-mm4 */
+ /* algo: substract mm1 from mm2 with saturation and vice versa */
+ /* OR the results to get absolute difference */
+ "movdqa %%xmm1,%%xmm5\n"
+ "movdqa %%xmm3,%%xmm6\n"
+ "psubusb %%xmm2,%%xmm1\n"
+ "psubusb %%xmm4,%%xmm3\n"
+ "psubusb %%xmm5,%%xmm2\n"
+ "psubusb %%xmm6,%%xmm4\n"
+
+ "por %%xmm1,%%xmm2\n"
+ "por %%xmm3,%%xmm4\n"
+
+ /* now convert to 16-bit vectors so we can square them */
+ "movdqa %%xmm2,%%xmm1\n"
+ "movdqa %%xmm4,%%xmm3\n"
+
+ "punpckhbw %%xmm0,%%xmm2\n"
+ "punpckhbw %%xmm0,%%xmm4\n"
+ "punpcklbw %%xmm0,%%xmm1\n" /* mm1 now spread over (mm1,mm2) */
+ "punpcklbw %%xmm0,%%xmm3\n" /* mm4 now spread over (mm3,mm4) */
+
+ "pmaddwd %%xmm2,%%xmm2\n"
+ "pmaddwd %%xmm4,%%xmm4\n"
+ "pmaddwd %%xmm1,%%xmm1\n"
+ "pmaddwd %%xmm3,%%xmm3\n"
+
+ "lea (%0,%4,2), %0\n" /* pix1 += 2*line_size */
+ "lea (%1,%4,2), %1\n" /* pix2 += 2*line_size */
+
+ "paddd %%xmm2,%%xmm1\n"
+ "paddd %%xmm4,%%xmm3\n"
+ "paddd %%xmm1,%%xmm7\n"
+ "paddd %%xmm3,%%xmm7\n"
+
+ "decl %2\n"
+ "jnz 1b\n"
+
+ "movdqa %%xmm7,%%xmm1\n"
+ "psrldq $8, %%xmm7\n" /* shift hi qword to lo */
+ "paddd %%xmm1,%%xmm7\n"
+ "movdqa %%xmm7,%%xmm1\n"
+ "psrldq $4, %%xmm7\n" /* shift hi dword to lo */
+ "paddd %%xmm1,%%xmm7\n"
+ "movd %%xmm7,%3\n"
+ : "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
+ : "r" ((long)line_size));
+ return tmp;
+}
+
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
int tmp;
asm volatile (
@@ -1080,7 +1163,8 @@ static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
-static int nsse16_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ MpegEncContext *c = p;
int score1= sse16_mmx(c, pix1, pix2, line_size, h);
int score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
@@ -1088,7 +1172,8 @@ static int nsse16_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int lin
else return score1 + ABS(score2)*8;
}
-static int nsse8_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
+ MpegEncContext *c = p;
int score1= sse8_mmx(c, pix1, pix2, line_size, h);
int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
@@ -1617,11 +1702,9 @@ static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride
"movq 64(%1), %%mm1 \n\t"
MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
- "movq %%mm0, %%mm1 \n\t"
- "psrlq $32, %%mm0 \n\t"
+ "pshufw $0x0E, %%mm0, %%mm1 \n\t"
"paddusw %%mm1, %%mm0 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "psrlq $16, %%mm0 \n\t"
+ "pshufw $0x01, %%mm0, %%mm1 \n\t"
"paddusw %%mm1, %%mm0 \n\t"
"movd %%mm0, %0 \n\t"
@@ -2455,6 +2538,28 @@ static void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
ff_vp3_idct_mmx(block);
add_pixels_clamped_mmx(block, dest, line_size);
}
+#ifdef CONFIG_GPL
+static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_idct_xvid_mmx (block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_idct_xvid_mmx (block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_idct_xvid_mmx2 (block);
+ put_pixels_clamped_mmx(block, dest, line_size);
+}
+static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
+{
+ ff_idct_xvid_mmx2 (block);
+ add_pixels_clamped_mmx(block, dest, line_size);
+}
+#endif
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
@@ -2527,6 +2632,18 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->idct = ff_vp3_idct_mmx;
c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
}
+#ifdef CONFIG_GPL
+ }else if(idct_algo==FF_IDCT_XVIDMMX){
+ if(mm_flags & MM_MMXEXT){
+ c->idct_put= ff_idct_xvid_mmx2_put;
+ c->idct_add= ff_idct_xvid_mmx2_add;
+ c->idct = ff_idct_xvid_mmx2;
+ }else{
+ c->idct_put= ff_idct_xvid_mmx_put;
+ c->idct_add= ff_idct_xvid_mmx_add;
+ c->idct = ff_idct_xvid_mmx;
+ }
+#endif
}
}
@@ -2590,7 +2707,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->hadamard8_diff[1]= hadamard8_diff_mmx;
c->pix_norm1 = pix_norm1_mmx;
- c->sse[0] = sse16_mmx;
+ c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx;
c->sse[1] = sse8_mmx;
c->vsad[4]= vsad_intra16_mmx;
@@ -2716,6 +2833,24 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
+ c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
+ c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
+ c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
+ c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
+ c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
+ c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
+ c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
+ c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
+
+ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
+ c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
+ c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
+ c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
+ c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
+ c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
+ c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
+ c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
+
#ifdef CONFIG_ENCODERS
c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
#endif //CONFIG_ENCODERS