diff options
Diffstat (limited to 'contrib/ffmpeg/libavcodec/ppc')
-rw-r--r-- | contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c | 6 | ||||
-rw-r--r-- | contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c | 10 | ||||
-rw-r--r-- | contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h | 6 | ||||
-rw-r--r-- | contrib/ffmpeg/libavcodec/ppc/float_altivec.c | 3 | ||||
-rw-r--r-- | contrib/ffmpeg/libavcodec/ppc/h264_altivec.c | 129 | ||||
-rw-r--r-- | contrib/ffmpeg/libavcodec/ppc/idct_altivec.c | 4 |
6 files changed, 141 insertions, 17 deletions
diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c index 6f48893a4..bbc53d761 100644 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c @@ -1107,12 +1107,10 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); register vector signed short srcV, dstV; \ register vector signed short but0, but1, but2, op1, op2, op3; \ src1 = vec_ld(stride * i, src); \ - if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \ - src2 = vec_ld((stride * i) + 16, src); \ + src2 = vec_ld((stride * i) + 15, src); \ srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ dst1 = vec_ld(stride * i, dst); \ - if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \ - dst2 = vec_ld((stride * i) + 16, dst); \ + dst2 = vec_ld((stride * i) + 15, dst); \ dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ /* promote the unsigned chars to signed shorts */ \ /* we're in the 8x8 function, we only care for the first 8 */ \ diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c index 9169eaef0..117a7adf1 100644 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c +++ b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c @@ -55,7 +55,7 @@ int mm_support(void) return result; } -#ifdef POWERPC_PERFORMANCE_REPORT +#ifdef CONFIG_POWERPC_PERF unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; /* list below must match enum in dsputil_ppc.h */ static unsigned char* perfname[] = { @@ -90,7 +90,7 @@ static unsigned char* perfname[] = { #include <stdio.h> #endif -#ifdef POWERPC_PERFORMANCE_REPORT +#ifdef CONFIG_POWERPC_PERF void powerpc_display_perf_report(void) { int i, j; @@ -112,7 +112,7 @@ void powerpc_display_perf_report(void) } } } -#endif /* POWERPC_PERFORMANCE_REPORT */ +#endif /* CONFIG_POWERPC_PERF */ /* ***** WARNING ***** WARNING ***** WARNING ***** */ /* @@ -305,7 +305,7 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) } } -#ifdef POWERPC_PERFORMANCE_REPORT +#ifdef CONFIG_POWERPC_PERF { int i, j; for (i = 0 ; i < powerpc_perf_total ; i++) @@ -319,7 +319,7 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) } } } -#endif /* POWERPC_PERFORMANCE_REPORT */ +#endif /* CONFIG_POWERPC_PERF */ } #endif /* HAVE_ALTIVEC */ } diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h index ab2b05780..5b25732b2 100644 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h +++ b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h @@ -21,7 +21,7 @@ #ifndef _DSPUTIL_PPC_ #define _DSPUTIL_PPC_ -#ifdef POWERPC_PERFORMANCE_REPORT +#ifdef CONFIG_POWERPC_PERF void powerpc_display_perf_report(void); /* the 604* have 2, the G3* have 4, the G4s have 6, and the G5 are completely different (they MUST use @@ -145,11 +145,11 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][ } \ } \ } while (0) -#else /* POWERPC_PERFORMANCE_REPORT */ +#else /* CONFIG_POWERPC_PERF */ // those are needed to avoid empty statements. #define POWERPC_PERF_DECLARE(a, cond) int altivec_placeholder __attribute__ ((unused)) #define POWERPC_PERF_START_COUNT(a, cond) do {} while (0) #define POWERPC_PERF_STOP_COUNT(a, cond) do {} while (0) -#endif /* POWERPC_PERFORMANCE_REPORT */ +#endif /* CONFIG_POWERPC_PERF */ #endif /* _DSPUTIL_PPC_ */ diff --git a/contrib/ffmpeg/libavcodec/ppc/float_altivec.c b/contrib/ffmpeg/libavcodec/ppc/float_altivec.c index c6e43dec2..22c2de61a 100644 --- a/contrib/ffmpeg/libavcodec/ppc/float_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/float_altivec.c @@ -76,7 +76,6 @@ static void vector_fmul_add_add_altivec(float *dst, const float *src0, vector unsigned char align = vec_lvsr(0,dst), mask = vec_lvsl(0, dst); - t0 = vec_ld(0, dst); #if 0 //FIXME: there is still something wrong if (step == 2) { int y; @@ -134,6 +133,7 @@ static void vector_fmul_add_add_altivec(float *dst, const float *src0, #endif if (step == 1 && src3 == 0) for (i=0; i<len-3; i+=4) { + t0 = vec_ld(0, dst+i); t1 = vec_ld(15, dst+i); s0 = vec_ld(0, src0+i); s1 = vec_ld(0, src1+i); @@ -144,7 +144,6 @@ static void vector_fmul_add_add_altivec(float *dst, const float *src0, t0 = vec_perm(edges, d, align); vec_st(t1, 15, dst+i); vec_st(t0, 0, dst+i); - t0 = t1; } else ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); diff --git a/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c b/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c index 4aa366f97..bac620e82 100644 --- a/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c @@ -23,6 +23,7 @@ #include "gcc_fixes.h" #include "dsputil_altivec.h" +#include "types_altivec.h" #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s #define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) @@ -202,7 +203,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride vector signed short vsrc0ssH, vsrc1ssH; vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; vector signed short vsrc2ssH, vsrc3ssH, psum; - vector unsigned char vdst, ppsum, vfdst, fsum; + vector unsigned char vdst, ppsum, fsum; if (((unsigned long)dst) % 16 == 0) { fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, @@ -398,6 +399,131 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, H264_MC(put_, 16, altivec) H264_MC(avg_, 16, altivec) + +/**************************************************************************** + * IDCT transform: + ****************************************************************************/ + +#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\ + /* a0 = SRC(0) + SRC(4); */ \ + vec_s16_t a0v = vec_add(s0, s4); \ + /* a2 = SRC(0) - SRC(4); */ \ + vec_s16_t a2v = vec_sub(s0, s4); \ + /* a4 = (SRC(2)>>1) - SRC(6); */ \ + vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \ + /* a6 = (SRC(6)>>1) + SRC(2); */ \ + vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \ + /* b0 = a0 + a6; */ \ + vec_s16_t b0v = vec_add(a0v, a6v); \ + /* b2 = a2 + a4; */ \ + vec_s16_t b2v = vec_add(a2v, a4v); \ + /* b4 = a2 - a4; */ \ + vec_s16_t b4v = vec_sub(a2v, a4v); \ + /* b6 = a0 - a6; */ \ + vec_s16_t b6v = vec_sub(a0v, a6v); \ + /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \ + /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \ + vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \ + /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \ + /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \ + vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\ + /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \ + /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \ + vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\ + /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \ + vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\ + /* b1 = (a7>>2) + a1; */ \ + vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \ + /* b3 = a3 + (a5>>2); */ \ + vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \ + /* b5 = (a3>>2) - a5; */ \ + vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \ + /* b7 = a7 - (a1>>2); */ \ + vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \ + /* DST(0, b0 + b7); */ \ + d0 = vec_add(b0v, b7v); \ + /* DST(1, b2 + b5); */ \ + d1 = vec_add(b2v, b5v); \ + /* DST(2, b4 + b3); */ \ + d2 = vec_add(b4v, b3v); \ + /* DST(3, b6 + b1); */ \ + d3 = vec_add(b6v, b1v); \ + /* DST(4, b6 - b1); */ \ + d4 = vec_sub(b6v, b1v); \ + /* DST(5, b4 - b3); */ \ + d5 = vec_sub(b4v, b3v); \ + /* DST(6, b2 - b5); */ \ + d6 = vec_sub(b2v, b5v); \ + /* DST(7, b0 - b7); */ \ + d7 = vec_sub(b0v, b7v); \ +} + +#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \ + /* unaligned load */ \ + vec_u8_t hv = vec_ld( 0, dest ); \ + vec_u8_t lv = vec_ld( 7, dest ); \ + vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \ + vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \ + vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \ + vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \ + vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \ + vec_u8_t edgehv; \ + /* unaligned store */ \ + vec_u8_t bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\ + vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv ); \ + lv = vec_sel( lv, bodyv, edgelv ); \ + vec_st( lv, 7, dest ); \ + hv = vec_ld( 0, dest ); \ + edgehv = vec_perm( zero_u8v, sel, perm_stv ); \ + hv = vec_sel( hv, bodyv, edgehv ); \ + vec_st( hv, 0, dest ); \ + } + +void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { + vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7; + vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7; + vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7; + + vec_u8_t perm_ldv = vec_lvsl(0, dst); + vec_u8_t perm_stv = vec_lvsr(8, dst); + + const vec_u16_t onev = vec_splat_u16(1); + const vec_u16_t twov = vec_splat_u16(2); + const vec_u16_t sixv = vec_splat_u16(6); + + const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0, + -1,-1,-1,-1,-1,-1,-1,-1); + LOAD_ZERO; + + dct[0] += 32; // rounding for the >>6 at the end + + s0 = vec_ld(0x00, (int16_t*)dct); + s1 = vec_ld(0x10, (int16_t*)dct); + s2 = vec_ld(0x20, (int16_t*)dct); + s3 = vec_ld(0x30, (int16_t*)dct); + s4 = vec_ld(0x40, (int16_t*)dct); + s5 = vec_ld(0x50, (int16_t*)dct); + s6 = vec_ld(0x60, (int16_t*)dct); + s7 = vec_ld(0x70, (int16_t*)dct); + + IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, + d0, d1, d2, d3, d4, d5, d6, d7); + + TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 ); + + IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7, + idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7); + + ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel); + ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel); + ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel); + ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel); + ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel); + ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel); + ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel); + ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); +} + void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { #ifdef HAVE_ALTIVEC @@ -405,6 +531,7 @@ void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec; c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; + c->h264_idct8_add = ff_h264_idct8_add_altivec; #define dspfunc(PFX, IDX, NUM) \ c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \ diff --git a/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c b/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c index cee46fc25..66c8082f7 100644 --- a/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c @@ -171,7 +171,7 @@ void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block) POWERPC_PERF_DECLARE(altivec_idct_put_num, 1); vector_u8_t tmp; -#ifdef POWERPC_PERFORMANCE_REPORT +#ifdef CONFIG_POWERPC_PERF POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); #endif IDCT @@ -202,7 +202,7 @@ POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); vector_u8_t perm1; vector_u8_t p0, p1, p; -#ifdef POWERPC_PERFORMANCE_REPORT +#ifdef CONFIG_POWERPC_PERF POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); #endif |