From c5b6afab8b74e5cc938b8467d3808a877ded7d03 Mon Sep 17 00:00:00 2001 From: Mike Melanson Date: Mon, 27 Oct 2003 15:24:38 +0000 Subject: super mega ffmpeg tree sync CVS patchset: 5615 CVS date: 2003/10/27 15:24:38 --- src/libffmpeg/libavcodec/ppc/dsputil_altivec.c | 115 ++++++++++++++++--------- 1 file changed, 75 insertions(+), 40 deletions(-) (limited to 'src/libffmpeg/libavcodec/ppc/dsputil_altivec.c') diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c index 32e881b70..635480784 100644 --- a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c +++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c @@ -655,11 +655,11 @@ void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { /* next one assumes that ((line_size % 16) == 0) */ void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) { -POWERPC_TBL_DECLARE(altivec_put_pixels16_num, 1); +POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); #ifdef ALTIVEC_USE_REFERENCE_C_CODE int i; -POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1); +POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); for(i=0; il); @@ -670,15 +670,27 @@ POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1); block +=line_size; } -POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); #else /* ALTIVEC_USE_REFERENCE_C_CODE */ register vector unsigned char pixelsv1, pixelsv2; + register vector unsigned char pixelsv1B, pixelsv2B; + register vector unsigned char pixelsv1C, pixelsv2C; + register vector unsigned char pixelsv1D, pixelsv2D; + register vector unsigned char perm = vec_lvsl(0, pixels); int i; - -POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1); - + register int line_size_2 = line_size << 1; + register int line_size_3 = line_size + line_size_2; + register int line_size_4 = line_size << 2; + +POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); +// hand-unrolling the loop by 4 gains about 15% +// mininum execution time goes from 74 to 60 cycles +// it's faster than -funroll-loops, but using +// -funroll-loops w/ this is bad - 74 cycles again. +// all this is on a 7450, tuning for the 7450 +#if 0 for(i=0; i>1) ) void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) { -POWERPC_TBL_DECLARE(altivec_avg_pixels16_num, 1); +POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1); #ifdef ALTIVEC_USE_REFERENCE_C_CODE int i; -POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); +POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); for(i=0; il)); @@ -712,14 +745,14 @@ POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); block +=line_size; } -POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); +POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); #else /* ALTIVEC_USE_REFERENCE_C_CODE */ register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; register vector unsigned char perm = vec_lvsl(0, pixels); int i; -POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); +POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); for(i=0; il); @@ -842,7 +875,7 @@ POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1); block += 4 - line_size * h; } -POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); +POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); #else /* ALTIVEC_USE_REFERENCE_C_CODE */ register int i; @@ -873,7 +906,7 @@ POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); (vector unsigned short)pixelsv2); pixelssum1 = vec_add(pixelssum1, vctwo); -POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1); +POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); for (i = 0; i < h ; i++) { int rightside = ((unsigned long)block & 0x0000000F); blockv = vec_ld(0, block); @@ -914,17 +947,17 @@ POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1); pixels += line_size; } -POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); +POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } /* next one assumes that ((line_size % 8) == 0) */ void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) { -POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); +POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); #ifdef ALTIVEC_USE_REFERENCE_C_CODE int j; -POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); for (j = 0; j < 2; j++) { int i; const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); @@ -957,7 +990,7 @@ POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); block += 4 - line_size * h; } -POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); #else /* ALTIVEC_USE_REFERENCE_C_CODE */ register int i; @@ -989,7 +1022,7 @@ POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); (vector unsigned short)pixelsv2); pixelssum1 = vec_add(pixelssum1, vcone); -POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); for (i = 0; i < h ; i++) { int rightside = ((unsigned long)block & 0x0000000F); blockv = vec_ld(0, block); @@ -1030,17 +1063,17 @@ POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); pixels += line_size; } -POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } /* next one assumes that ((line_size % 16) == 0) */ void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) { -POWERPC_TBL_DECLARE(altivec_put_pixels16_xy2_num, 1); +POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); #ifdef ALTIVEC_USE_REFERENCE_C_CODE int j; -POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); +POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); for (j = 0; j < 4; j++) { int i; const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); @@ -1073,7 +1106,7 @@ POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); block += 4 - line_size * h; } -POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); #else /* ALTIVEC_USE_REFERENCE_C_CODE */ register int i; @@ -1086,7 +1119,9 @@ POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); pixelssum3, pixelssum4, temp4; register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - + +POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); + temp1 = vec_ld(0, pixels); temp2 = vec_ld(16, pixels); pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); @@ -1109,7 +1144,6 @@ POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); (vector unsigned short)pixelsv2); pixelssum1 = vec_add(pixelssum1, vctwo); -POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); for (i = 0; i < h ; i++) { blockv = vec_ld(0, block); @@ -1150,17 +1184,17 @@ POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); pixels += line_size; } -POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); +POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } /* next one assumes that ((line_size % 16) == 0) */ void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) { -POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); +POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); #ifdef ALTIVEC_USE_REFERENCE_C_CODE int j; -POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); for (j = 0; j < 4; j++) { int i; const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); @@ -1193,7 +1227,7 @@ POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); block += 4 - line_size * h; } -POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); #else /* ALTIVEC_USE_REFERENCE_C_CODE */ register int i; @@ -1207,7 +1241,9 @@ POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - + +POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); + temp1 = vec_ld(0, pixels); temp2 = vec_ld(16, pixels); pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); @@ -1230,7 +1266,6 @@ POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); (vector unsigned short)pixelsv2); pixelssum1 = vec_add(pixelssum1, vcone); -POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); for (i = 0; i < h ; i++) { blockv = vec_ld(0, block); @@ -1271,7 +1306,7 @@ POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); pixels += line_size; } -POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); +POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); #endif /* ALTIVEC_USE_REFERENCE_C_CODE */ } -- cgit v1.2.3