Diffstat (limited to 'contrib/ffmpeg/libavcodec/ppc')
18 files changed, 0 insertions, 6703 deletions
diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c deleted file mode 100644 index bbc53d761..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c +++ /dev/null @@ -1,1589 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "../dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_altivec.h" - -#ifdef CONFIG_DARWIN -#include <sys/sysctl.h> -#else /* CONFIG_DARWIN */ -#ifdef __AMIGAOS4__ -#include <exec/exec.h> -#include <interfaces/exec.h> -#include <proto/exec.h> -#else /* __AMIGAOS4__ */ -#include <signal.h> -#include <setjmp.h> - -static sigjmp_buf jmpbuf; -static volatile sig_atomic_t canjump = 0; - -static void sigill_handler (int sig) -{ - if (!canjump) { - signal (sig, SIG_DFL); - raise (sig); - } - - canjump = 0; - siglongjmp (jmpbuf, 1); -} -#endif /* CONFIG_DARWIN */ -#endif /* __AMIGAOS4__ */ - -int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s __attribute__((aligned(16))); - const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); - vector unsigned char *tv; - vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; - vector unsigned int sad; - vector signed int sumdiffs; - - s = 0; - sad = (vector unsigned int)vec_splat_u32(0); - for(i=0;i<h;i++) { - /* - Read unaligned pixels into our vectors. 
The vectors are as follows: - pix1v: pix1[0]-pix1[15] - pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] - */ - tv = (vector unsigned char *) pix1; - pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); - - tv = (vector unsigned char *) &pix2[0]; - pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); - - tv = (vector unsigned char *) &pix2[1]; - pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); - - /* Calculate the average vector */ - avgv = vec_avg(pix2v, pix2iv); - - /* Calculate a sum of abs differences vector */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s __attribute__((aligned(16))); - const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); - vector unsigned char *tv; - vector unsigned char pix1v, pix2v, pix3v, avgv, t5; - vector unsigned int sad; - vector signed int sumdiffs; - uint8_t *pix3 = pix2 + line_size; - - s = 0; - sad = (vector unsigned int)vec_splat_u32(0); - - /* - Due to the fact that pix3 = pix2 + line_size, the pix3 of one - iteration becomes pix2 in the next iteration. We can use this - fact to avoid a potentially expensive unaligned read, each - time around the loop. - Read unaligned pixels into our vectors. The vectors are as follows: - pix2v: pix2[0]-pix2[15] - Split the pixel vectors into shorts - */ - tv = (vector unsigned char *) &pix2[0]; - pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); - - for(i=0;i<h;i++) { - /* - Read unaligned pixels into our vectors. 
The vectors are as follows: - pix1v: pix1[0]-pix1[15] - pix3v: pix3[0]-pix3[15] - */ - tv = (vector unsigned char *) pix1; - pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); - - tv = (vector unsigned char *) &pix3[0]; - pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); - - /* Calculate the average vector */ - avgv = vec_avg(pix2v, pix3v); - - /* Calculate a sum of abs differences vector */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2v = pix3v; - pix3 += line_size; - - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - return s; -} - -int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s __attribute__((aligned(16))); - uint8_t *pix3 = pix2 + line_size; - const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); - const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2); - vector unsigned char *tv, avgv, t5; - vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; - vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; - vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; - vector unsigned short avghv, avglv; - vector unsigned short t1, t2, t3, t4; - vector unsigned int sad; - vector signed int sumdiffs; - - sad = (vector unsigned int)vec_splat_u32(0); - - s = 0; - - /* - Due to the fact that pix3 = pix2 + line_size, the pix3 of one - iteration becomes pix2 in the next iteration. We can use this - fact to avoid a potentially expensive unaligned read, as well - as some splitting, and vector addition each time around the loop. - Read unaligned pixels into our vectors. The vectors are as follows: - pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] - Split the pixel vectors into shorts - */ - tv = (vector unsigned char *) &pix2[0]; - pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); - - tv = (vector unsigned char *) &pix2[1]; - pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); - - pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); - pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); - pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); - pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); - t1 = vec_add(pix2hv, pix2ihv); - t2 = vec_add(pix2lv, pix2ilv); - - for(i=0;i<h;i++) { - /* - Read unaligned pixels into our vectors. The vectors are as follows: - pix1v: pix1[0]-pix1[15] - pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] - */ - tv = (vector unsigned char *) pix1; - pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); - - tv = (vector unsigned char *) &pix3[0]; - pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); - - tv = (vector unsigned char *) &pix3[1]; - pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); - - /* - Note that Altivec does have vec_avg, but this works on vector pairs - and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding - would mean that, for example, avg(3,0,0,1) = 2, when it should be 1. - Instead, we have to split the pixel vectors into vectors of shorts, - and do the averaging by hand. 
- */ - - /* Split the pixel vectors into shorts */ - pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); - pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); - pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); - pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); - - /* Do the averaging on them */ - t3 = vec_add(pix3hv, pix3ihv); - t4 = vec_add(pix3lv, pix3ilv); - - avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); - avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); - - /* Pack the shorts back into a result */ - avgv = vec_pack(avghv, avglv); - - /* Calculate a sum of abs differences vector */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix3 += line_size; - /* Transfer the calculated values for pix3 into pix2 */ - t1 = t3; - t2 = t4; - } - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s __attribute__((aligned(16))); - const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); - vector unsigned char perm1, perm2, *pix1v, *pix2v; - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad; - vector signed int sumdiffs; - - sad = (vector unsigned int)vec_splat_u32(0); - - - for(i=0;i<h;i++) { - /* Read potentially unaligned pixels into t1 and t2 */ - perm1 = vec_lvsl(0, pix1); - pix1v = (vector unsigned char *) pix1; - perm2 = vec_lvsl(0, pix2); - pix2v = (vector unsigned char *) pix2; - t1 = vec_perm(pix1v[0], pix1v[1], perm1); - t2 = vec_perm(pix2v[0], pix2v[1], perm2); - - /* Calculate a sum of abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s __attribute__((aligned(16))); - const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); - vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad; - vector signed int sumdiffs; - - sad = (vector unsigned int)vec_splat_u32(0); - - permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); - - for(i=0;i<h;i++) { - /* Read potentially unaligned pixels into t1 and t2 - Since we're reading 16 pixels, and actually only want 8, - mask out the last 8 pixels. The 0s don't change the sum. 
*/ - perm1 = vec_lvsl(0, pix1); - pix1v = (vector unsigned char *) pix1; - perm2 = vec_lvsl(0, pix2); - pix2v = (vector unsigned char *) pix2; - t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); - t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); - - /* Calculate a sum of abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -int pix_norm1_altivec(uint8_t *pix, int line_size) -{ - int i; - int s __attribute__((aligned(16))); - const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); - vector unsigned char *tv; - vector unsigned char pixv; - vector unsigned int sv; - vector signed int sum; - - sv = (vector unsigned int)vec_splat_u32(0); - - s = 0; - for (i = 0; i < 16; i++) { - /* Read in the potentially unaligned pixels */ - tv = (vector unsigned char *) pix; - pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); - - /* Square the values, and add them to our sum */ - sv = vec_msum(pixv, pixv, sv); - - pix += line_size; - } - /* Sum up the four partial sums, and put the result into s */ - sum = vec_sums((vector signed int) sv, (vector signed int) zero); - sum = vec_splat(sum, 3); - vec_ste(sum, 0, &s); - - return s; -} - -/** - * Sum of Squared Errors for a 8x8 block. - * AltiVec-enhanced. - * It's the sad8_altivec code above w/ squaring added. - */ -int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s __attribute__((aligned(16))); - const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); - vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sum; - vector signed int sumsqr; - - sum = (vector unsigned int)vec_splat_u32(0); - - permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); - - - for(i=0;i<h;i++) { - /* Read potentially unaligned pixels into t1 and t2 - Since we're reading 16 pixels, and actually only want 8, - mask out the last 8 pixels. The 0s don't change the sum. */ - perm1 = vec_lvsl(0, pix1); - pix1v = (vector unsigned char *) pix1; - perm2 = vec_lvsl(0, pix2); - pix2v = (vector unsigned char *) pix2; - t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); - t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); - - /* - Since we want to use unsigned chars, we can take advantage - of the fact that abs(a-b)^2 = (a-b)^2. - */ - - /* Calculate abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Square the values and add them to our sum */ - sum = vec_msum(t5, t5, sum); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); - sumsqr = vec_splat(sumsqr, 3); - vec_ste(sumsqr, 0, &s); - - return s; -} - -/** - * Sum of Squared Errors for a 16x16 block. - * AltiVec-enhanced. - * It's the sad16_altivec code above w/ squaring added. 
- */ -int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - int s __attribute__((aligned(16))); - const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); - vector unsigned char perm1, perm2, *pix1v, *pix2v; - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sum; - vector signed int sumsqr; - - sum = (vector unsigned int)vec_splat_u32(0); - - for(i=0;i<h;i++) { - /* Read potentially unaligned pixels into t1 and t2 */ - perm1 = vec_lvsl(0, pix1); - pix1v = (vector unsigned char *) pix1; - perm2 = vec_lvsl(0, pix2); - pix2v = (vector unsigned char *) pix2; - t1 = vec_perm(pix1v[0], pix1v[1], perm1); - t2 = vec_perm(pix2v[0], pix2v[1], perm2); - - /* - Since we want to use unsigned chars, we can take advantage - of the fact that abs(a-b)^2 = (a-b)^2. - */ - - /* Calculate abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Square the values and add them to our sum */ - sum = vec_msum(t5, t5, sum); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); - sumsqr = vec_splat(sumsqr, 3); - vec_ste(sumsqr, 0, &s); - - return s; -} - -int pix_sum_altivec(uint8_t * pix, int line_size) -{ - const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); - vector unsigned char perm, *pixv; - vector unsigned char t1; - vector unsigned int sad; - vector signed int sumdiffs; - - int i; - int s __attribute__((aligned(16))); - - sad = (vector unsigned int)vec_splat_u32(0); - - for (i = 0; i < 16; i++) { - /* Read the potentially unaligned 16 pixels into t1 */ - perm = vec_lvsl(0, pix); - pixv = (vector unsigned char *) pix; - t1 = vec_perm(pixv[0], pixv[1], perm); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t1, sad); - - pix += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) -{ - int i; - vector unsigned char perm, bytes, *pixv; - const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); - vector signed short shorts; - - for(i=0;i<8;i++) - { - // Read potentially unaligned pixels. - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. - perm = vec_lvsl(0, pixels); - pixv = (vector unsigned char *) pixels; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts = (vector signed short)vec_mergeh(zero, bytes); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts, i*16, (vector signed short*)block); - - pixels += line_size; - } -} - -void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, - const uint8_t *s2, int stride) -{ - int i; - vector unsigned char perm, bytes, *pixv; - const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); - vector signed short shorts1, shorts2; - - for(i=0;i<4;i++) - { - // Read potentially unaligned pixels - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. 
- perm = vec_lvsl(0, s1); - pixv = (vector unsigned char *) s1; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts1 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the same for the second block of pixels - perm = vec_lvsl(0, s2); - pixv = (vector unsigned char *) s2; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts2 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the subtraction - shorts1 = vec_sub(shorts1, shorts2); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts1, 0, (vector signed short*)block); - - s1 += stride; - s2 += stride; - block += 8; - - - // The code below is a copy of the code above... This is a manual - // unroll. - - // Read potentially unaligned pixels - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. - perm = vec_lvsl(0, s1); - pixv = (vector unsigned char *) s1; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts1 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the same for the second block of pixels - perm = vec_lvsl(0, s2); - pixv = (vector unsigned char *) s2; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts2 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the subtraction - shorts1 = vec_sub(shorts1, shorts2); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts1, 0, (vector signed short*)block); - - s1 += stride; - s2 += stride; - block += 8; - } -} - -void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { - register int i; - register vector unsigned char vdst, vsrc; - - /* dst and src are 16 bytes-aligned (guaranteed) */ - for(i = 0 ; (i + 15) < w ; i+=16) - { - vdst = vec_ld(i, (unsigned char*)dst); - vsrc = vec_ld(i, (unsigned char*)src); - vdst = vec_add(vsrc, vdst); - vec_st(vdst, i, (unsigned char*)dst); - } - /* if w is not a multiple of 16 */ - for (; (i < w) ; i++) - { - dst[i] = src[i]; - } -} - -/* next one assumes that ((line_size % 16) == 0) */ -void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); - register vector unsigned char pixelsv1, pixelsv2; - register vector unsigned char pixelsv1B, pixelsv2B; - register vector unsigned char pixelsv1C, pixelsv2C; - register vector unsigned char pixelsv1D, pixelsv2D; - - register vector unsigned char perm = vec_lvsl(0, pixels); - int i; - register int line_size_2 = line_size << 1; - register int line_size_3 = line_size + line_size_2; - register int line_size_4 = line_size << 2; - -POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); -// hand-unrolling the loop by 4 gains about 15% -// mininum execution time goes from 74 to 60 cycles -// it's faster than -funroll-loops, but using -// -funroll-loops w/ this is bad - 74 cycles again. 
-// all this is on a 7450, tuning for the 7450 -#if 0 - for(i=0; i<h; i++) { - pixelsv1 = vec_ld(0, (unsigned char*)pixels); - pixelsv2 = vec_ld(16, (unsigned char*)pixels); - vec_st(vec_perm(pixelsv1, pixelsv2, perm), - 0, (unsigned char*)block); - pixels+=line_size; - block +=line_size; - } -#else - for(i=0; i<h; i+=4) { - pixelsv1 = vec_ld(0, (unsigned char*)pixels); - pixelsv2 = vec_ld(15, (unsigned char*)pixels); - pixelsv1B = vec_ld(line_size, (unsigned char*)pixels); - pixelsv2B = vec_ld(15 + line_size, (unsigned char*)pixels); - pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels); - pixelsv2C = vec_ld(15 + line_size_2, (unsigned char*)pixels); - pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels); - pixelsv2D = vec_ld(15 + line_size_3, (unsigned char*)pixels); - vec_st(vec_perm(pixelsv1, pixelsv2, perm), - 0, (unsigned char*)block); - vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), - line_size, (unsigned char*)block); - vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), - line_size_2, (unsigned char*)block); - vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), - line_size_3, (unsigned char*)block); - pixels+=line_size_4; - block +=line_size_4; - } -#endif -POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); -} - -/* next one assumes that ((line_size % 16) == 0) */ -#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) -void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1); - register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; - register vector unsigned char perm = vec_lvsl(0, pixels); - int i; - -POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); - - for(i=0; i<h; i++) { - pixelsv1 = vec_ld(0, (unsigned char*)pixels); - pixelsv2 = vec_ld(16, (unsigned char*)pixels); - blockv = vec_ld(0, block); - pixelsv = vec_perm(pixelsv1, pixelsv2, perm); - blockv = vec_avg(blockv,pixelsv); - vec_st(blockv, 0, (unsigned char*)block); - pixels+=line_size; - block +=line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); -} - -/* next one assumes that ((line_size % 8) == 0) */ -void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1); - register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; - int i; - -POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); - - for (i = 0; i < h; i++) { - /* - block is 8 bytes-aligned, so we're either in the - left block (16 bytes-aligned) or in the right block (not) - */ - int rightside = ((unsigned long)block & 0x0000000F); - - blockv = vec_ld(0, block); - pixelsv1 = vec_ld(0, (unsigned char*)pixels); - pixelsv2 = vec_ld(16, (unsigned char*)pixels); - pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); - - if (rightside) - { - pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); - } - else - { - pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); - } - - blockv = vec_avg(blockv, pixelsv); - - vec_st(blockv, 0, block); - - pixels += line_size; - block += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); -} - -/* next one assumes that ((line_size % 8) == 0) */ -void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); - register int i; - register vector unsigned char - pixelsv1, pixelsv2, - pixelsavg; - register vector unsigned char - blockv, temp1, temp2; - register vector unsigned short - pixelssum1, pixelssum2, 
temp3; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - -POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) - { - blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } - else - { - blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); -} - -/* next one assumes that ((line_size % 8) == 0) */ -void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); - register int i; - register vector unsigned char - pixelsv1, pixelsv2, - pixelsavg; - register vector unsigned char - blockv, temp1, temp2; - register vector unsigned short - pixelssum1, pixelssum2, temp3; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1); - register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vcone); - -POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, 
temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vcone); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) - { - blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } - else - { - blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); -} - -/* next one assumes that ((line_size % 16) == 0) */ -void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); - register int i; - register vector unsigned char - pixelsv1, pixelsv2, pixelsv3, pixelsv4; - register vector unsigned char - blockv, temp1, temp2; - register vector unsigned short - pixelssum1, pixelssum2, temp3, - pixelssum3, pixelssum4, temp4; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); - -POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum3 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum3 = vec_add(pixelssum3, vctwo); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - - for (i = 0; i < h ; i++) { - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - - pixelssum4 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp4 = vec_add(pixelssum3, pixelssum4); - temp4 = vec_sra(temp4, vctwo); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - - pixelssum3 = vec_add(pixelssum4, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - - blockv = vec_packsu(temp3, temp4); - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); -} - -/* next one assumes that ((line_size % 16) == 0) */ -void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); - 
register int i; - register vector unsigned char - pixelsv1, pixelsv2, pixelsv3, pixelsv4; - register vector unsigned char - blockv, temp1, temp2; - register vector unsigned short - pixelssum1, pixelssum2, temp3, - pixelssum3, pixelssum4, temp4; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1); - register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); - -POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum3 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum3 = vec_add(pixelssum3, vcone); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vcone); - - for (i = 0; i < h ; i++) { - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - - pixelssum4 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp4 = vec_add(pixelssum3, pixelssum4); - temp4 = vec_sra(temp4, vctwo); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - - pixelssum3 = vec_add(pixelssum4, vcone); - pixelssum1 = vec_add(pixelssum2, vcone); - - blockv = vec_packsu(temp3, temp4); - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); -} - -int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ -POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1); - int sum; - register const_vector unsigned char vzero = - (const_vector unsigned char)vec_splat_u8(0); - register vector signed short temp0, temp1, temp2, temp3, temp4, - temp5, temp6, temp7; -POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); - { - register const_vector signed short vprod1 =(const_vector signed short) - AVV( 1,-1, 1,-1, 1,-1, 1,-1); - register const_vector signed short vprod2 =(const_vector signed short) - AVV( 1, 1,-1,-1, 1, 1,-1,-1); - register const_vector signed short vprod3 =(const_vector signed short) - AVV( 1, 1, 1, 1,-1,-1,-1,-1); - register const_vector unsigned char perm1 = (const_vector unsigned char) - AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, - 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D); - register const_vector unsigned char perm2 = (const_vector unsigned char) - AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, - 
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B); - register const_vector unsigned char perm3 = (const_vector unsigned char) - AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - -#define ONEITERBUTTERFLY(i, res) \ - { \ - register vector unsigned char src1, src2, srcO; \ - register vector unsigned char dst1, dst2, dstO; \ - register vector signed short srcV, dstV; \ - register vector signed short but0, but1, but2, op1, op2, op3; \ - src1 = vec_ld(stride * i, src); \ - src2 = vec_ld((stride * i) + 15, src); \ - srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ - dst1 = vec_ld(stride * i, dst); \ - dst2 = vec_ld((stride * i) + 15, dst); \ - dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ - /* promote the unsigned chars to signed shorts */ \ - /* we're in the 8x8 function, we only care for the first 8 */ \ - srcV = \ - (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)srcO); \ - dstV = \ - (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)dstO); \ - /* substractions inside the first butterfly */ \ - but0 = vec_sub(srcV, dstV); \ - op1 = vec_perm(but0, but0, perm1); \ - but1 = vec_mladd(but0, vprod1, op1); \ - op2 = vec_perm(but1, but1, perm2); \ - but2 = vec_mladd(but1, vprod2, op2); \ - op3 = vec_perm(but2, but2, perm3); \ - res = vec_mladd(but2, vprod3, op3); \ - } - ONEITERBUTTERFLY(0, temp0); - ONEITERBUTTERFLY(1, temp1); - ONEITERBUTTERFLY(2, temp2); - ONEITERBUTTERFLY(3, temp3); - ONEITERBUTTERFLY(4, temp4); - ONEITERBUTTERFLY(5, temp5); - ONEITERBUTTERFLY(6, temp6); - ONEITERBUTTERFLY(7, temp7); - } -#undef ONEITERBUTTERFLY - { - register vector signed int vsum; - register vector signed short line0 = vec_add(temp0, temp1); - register vector signed short line1 = vec_sub(temp0, temp1); - register vector signed short line2 = vec_add(temp2, temp3); - register vector signed short line3 = vec_sub(temp2, temp3); - register vector signed short line4 = vec_add(temp4, temp5); - register vector signed short line5 = vec_sub(temp4, temp5); - register vector signed short line6 = vec_add(temp6, temp7); - register vector signed short line7 = vec_sub(temp6, temp7); - - register vector signed short line0B = vec_add(line0, line2); - register vector signed short line2B = vec_sub(line0, line2); - register vector signed short line1B = vec_add(line1, line3); - register vector signed short line3B = vec_sub(line1, line3); - register vector signed short line4B = vec_add(line4, line6); - register vector signed short line6B = vec_sub(line4, line6); - register vector signed short line5B = vec_add(line5, line7); - register vector signed short line7B = vec_sub(line5, line7); - - register vector signed short line0C = vec_add(line0B, line4B); - register vector signed short line4C = vec_sub(line0B, line4B); - register vector signed short line1C = vec_add(line1B, line5B); - register vector signed short line5C = vec_sub(line1B, line5B); - register vector signed short line2C = vec_add(line2B, line6B); - register vector signed short line6C = vec_sub(line2B, line6B); - register vector signed short line3C = vec_add(line3B, line7B); - register vector signed short line7C = vec_sub(line3B, line7B); - - vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); - vsum = vec_sum4s(vec_abs(line1C), vsum); - vsum = vec_sum4s(vec_abs(line2C), vsum); - vsum = vec_sum4s(vec_abs(line3C), vsum); - vsum = vec_sum4s(vec_abs(line4C), vsum); - vsum = vec_sum4s(vec_abs(line5C), vsum); - vsum = 
vec_sum4s(vec_abs(line6C), vsum); - vsum = vec_sum4s(vec_abs(line7C), vsum); - vsum = vec_sums(vsum, (vector signed int)vzero); - vsum = vec_splat(vsum, 3); - vec_ste(vsum, 0, &sum); - } -POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1); - return sum; -} - -/* - 16x8 works with 16 elements ; it allows to avoid replicating - loads, and give the compiler more rooms for scheduling. - It's only used from inside hadamard8_diff16_altivec. - - Unfortunately, it seems gcc-3.3 is a bit dumb, and - the compiled code has a LOT of spill code, it seems - gcc (unlike xlc) cannot keep everything in registers - by itself. The following code include hand-made - registers allocation. It's not clean, but on - a 7450 the resulting code is much faster (best case - fall from 700+ cycles to 550). - - xlc doesn't add spill code, but it doesn't know how to - schedule for the 7450, and its code isn't much faster than - gcc-3.3 on the 7450 (but uses 25% less instructions...) - - On the 970, the hand-made RA is still a win (arount 690 - vs. around 780), but xlc goes to around 660 on the - regular C code... -*/ - -static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) { - int sum; - register vector signed short - temp0 REG_v(v0), - temp1 REG_v(v1), - temp2 REG_v(v2), - temp3 REG_v(v3), - temp4 REG_v(v4), - temp5 REG_v(v5), - temp6 REG_v(v6), - temp7 REG_v(v7); - register vector signed short - temp0S REG_v(v8), - temp1S REG_v(v9), - temp2S REG_v(v10), - temp3S REG_v(v11), - temp4S REG_v(v12), - temp5S REG_v(v13), - temp6S REG_v(v14), - temp7S REG_v(v15); - register const_vector unsigned char vzero REG_v(v31)= - (const_vector unsigned char)vec_splat_u8(0); - { - register const_vector signed short vprod1 REG_v(v16)= - (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1); - register const_vector signed short vprod2 REG_v(v17)= - (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1); - register const_vector signed short vprod3 REG_v(v18)= - (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1); - register const_vector unsigned char perm1 REG_v(v19)= - (const_vector unsigned char) - AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, - 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D); - register const_vector unsigned char perm2 REG_v(v20)= - (const_vector unsigned char) - AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, - 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B); - register const_vector unsigned char perm3 REG_v(v21)= - (const_vector unsigned char) - AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - -#define ONEITERBUTTERFLY(i, res1, res2) \ - { \ - register vector unsigned char src1 REG_v(v22), \ - src2 REG_v(v23), \ - dst1 REG_v(v24), \ - dst2 REG_v(v25), \ - srcO REG_v(v22), \ - dstO REG_v(v23); \ - \ - register vector signed short srcV REG_v(v24), \ - dstV REG_v(v25), \ - srcW REG_v(v26), \ - dstW REG_v(v27), \ - but0 REG_v(v28), \ - but0S REG_v(v29), \ - op1 REG_v(v30), \ - but1 REG_v(v22), \ - op1S REG_v(v23), \ - but1S REG_v(v24), \ - op2 REG_v(v25), \ - but2 REG_v(v26), \ - op2S REG_v(v27), \ - but2S REG_v(v28), \ - op3 REG_v(v29), \ - op3S REG_v(v30); \ - \ - src1 = vec_ld(stride * i, src); \ - src2 = vec_ld((stride * i) + 16, src); \ - srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ - dst1 = vec_ld(stride * i, dst); \ - dst2 = vec_ld((stride * i) + 16, dst); \ - dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ - /* promote the unsigned chars to signed shorts 
*/ \ - srcV = \ - (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)srcO); \ - dstV = \ - (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)dstO); \ - srcW = \ - (vector signed short)vec_mergel((vector signed char)vzero, \ - (vector signed char)srcO); \ - dstW = \ - (vector signed short)vec_mergel((vector signed char)vzero, \ - (vector signed char)dstO); \ - /* substractions inside the first butterfly */ \ - but0 = vec_sub(srcV, dstV); \ - but0S = vec_sub(srcW, dstW); \ - op1 = vec_perm(but0, but0, perm1); \ - but1 = vec_mladd(but0, vprod1, op1); \ - op1S = vec_perm(but0S, but0S, perm1); \ - but1S = vec_mladd(but0S, vprod1, op1S); \ - op2 = vec_perm(but1, but1, perm2); \ - but2 = vec_mladd(but1, vprod2, op2); \ - op2S = vec_perm(but1S, but1S, perm2); \ - but2S = vec_mladd(but1S, vprod2, op2S); \ - op3 = vec_perm(but2, but2, perm3); \ - res1 = vec_mladd(but2, vprod3, op3); \ - op3S = vec_perm(but2S, but2S, perm3); \ - res2 = vec_mladd(but2S, vprod3, op3S); \ - } - ONEITERBUTTERFLY(0, temp0, temp0S); - ONEITERBUTTERFLY(1, temp1, temp1S); - ONEITERBUTTERFLY(2, temp2, temp2S); - ONEITERBUTTERFLY(3, temp3, temp3S); - ONEITERBUTTERFLY(4, temp4, temp4S); - ONEITERBUTTERFLY(5, temp5, temp5S); - ONEITERBUTTERFLY(6, temp6, temp6S); - ONEITERBUTTERFLY(7, temp7, temp7S); - } -#undef ONEITERBUTTERFLY - { - register vector signed int vsum; - register vector signed short line0S, line1S, line2S, line3S, line4S, - line5S, line6S, line7S, line0BS,line2BS, - line1BS,line3BS,line4BS,line6BS,line5BS, - line7BS,line0CS,line4CS,line1CS,line5CS, - line2CS,line6CS,line3CS,line7CS; - - register vector signed short line0 = vec_add(temp0, temp1); - register vector signed short line1 = vec_sub(temp0, temp1); - register vector signed short line2 = vec_add(temp2, temp3); - register vector signed short line3 = vec_sub(temp2, temp3); - register vector signed short line4 = vec_add(temp4, temp5); - register vector signed short line5 = vec_sub(temp4, temp5); - register vector signed short line6 = vec_add(temp6, temp7); - register vector signed short line7 = vec_sub(temp6, temp7); - - register vector signed short line0B = vec_add(line0, line2); - register vector signed short line2B = vec_sub(line0, line2); - register vector signed short line1B = vec_add(line1, line3); - register vector signed short line3B = vec_sub(line1, line3); - register vector signed short line4B = vec_add(line4, line6); - register vector signed short line6B = vec_sub(line4, line6); - register vector signed short line5B = vec_add(line5, line7); - register vector signed short line7B = vec_sub(line5, line7); - - register vector signed short line0C = vec_add(line0B, line4B); - register vector signed short line4C = vec_sub(line0B, line4B); - register vector signed short line1C = vec_add(line1B, line5B); - register vector signed short line5C = vec_sub(line1B, line5B); - register vector signed short line2C = vec_add(line2B, line6B); - register vector signed short line6C = vec_sub(line2B, line6B); - register vector signed short line3C = vec_add(line3B, line7B); - register vector signed short line7C = vec_sub(line3B, line7B); - - vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); - vsum = vec_sum4s(vec_abs(line1C), vsum); - vsum = vec_sum4s(vec_abs(line2C), vsum); - vsum = vec_sum4s(vec_abs(line3C), vsum); - vsum = vec_sum4s(vec_abs(line4C), vsum); - vsum = vec_sum4s(vec_abs(line5C), vsum); - vsum = vec_sum4s(vec_abs(line6C), vsum); - vsum = vec_sum4s(vec_abs(line7C), vsum); - - line0S = 
vec_add(temp0S, temp1S); - line1S = vec_sub(temp0S, temp1S); - line2S = vec_add(temp2S, temp3S); - line3S = vec_sub(temp2S, temp3S); - line4S = vec_add(temp4S, temp5S); - line5S = vec_sub(temp4S, temp5S); - line6S = vec_add(temp6S, temp7S); - line7S = vec_sub(temp6S, temp7S); - - line0BS = vec_add(line0S, line2S); - line2BS = vec_sub(line0S, line2S); - line1BS = vec_add(line1S, line3S); - line3BS = vec_sub(line1S, line3S); - line4BS = vec_add(line4S, line6S); - line6BS = vec_sub(line4S, line6S); - line5BS = vec_add(line5S, line7S); - line7BS = vec_sub(line5S, line7S); - - line0CS = vec_add(line0BS, line4BS); - line4CS = vec_sub(line0BS, line4BS); - line1CS = vec_add(line1BS, line5BS); - line5CS = vec_sub(line1BS, line5BS); - line2CS = vec_add(line2BS, line6BS); - line6CS = vec_sub(line2BS, line6BS); - line3CS = vec_add(line3BS, line7BS); - line7CS = vec_sub(line3BS, line7BS); - - vsum = vec_sum4s(vec_abs(line0CS), vsum); - vsum = vec_sum4s(vec_abs(line1CS), vsum); - vsum = vec_sum4s(vec_abs(line2CS), vsum); - vsum = vec_sum4s(vec_abs(line3CS), vsum); - vsum = vec_sum4s(vec_abs(line4CS), vsum); - vsum = vec_sum4s(vec_abs(line5CS), vsum); - vsum = vec_sum4s(vec_abs(line6CS), vsum); - vsum = vec_sum4s(vec_abs(line7CS), vsum); - vsum = vec_sums(vsum, (vector signed int)vzero); - vsum = vec_splat(vsum, 3); - vec_ste(vsum, 0, &sum); - } - return sum; -} - -int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ -POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1); - int score; -POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1); - score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); - if (h==16) { - dst += 8*stride; - src += 8*stride; - score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); - } -POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1); - return score; -} - -int has_altivec(void) -{ -#ifdef __AMIGAOS4__ - ULONG result = 0; - extern struct ExecIFace *IExec; - - IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE); - if (result == VECTORTYPE_ALTIVEC) return 1; - return 0; -#else /* __AMIGAOS4__ */ - -#ifdef CONFIG_DARWIN - int sels[2] = {CTL_HW, HW_VECTORUNIT}; - int has_vu = 0; - size_t len = sizeof(has_vu); - int err; - - err = sysctl(sels, 2, &has_vu, &len, NULL, 0); - - if (err == 0) return (has_vu != 0); -#else /* CONFIG_DARWIN */ -/* no Darwin, do it the brute-force way */ -/* this is borrowed from the libmpeg2 library */ - { - signal (SIGILL, sigill_handler); - if (sigsetjmp (jmpbuf, 1)) { - signal (SIGILL, SIG_DFL); - } else { - canjump = 1; - - asm volatile ("mtspr 256, %0\n\t" - "vand %%v0, %%v0, %%v0" - : - : "r" (-1)); - - signal (SIGILL, SIG_DFL); - return 1; - } - } -#endif /* CONFIG_DARWIN */ - return 0; -#endif /* __AMIGAOS4__ */ -} - -static void vorbis_inverse_coupling_altivec(float *mag, float *ang, - int blocksize) -{ - int i; - vector float m, a; - vector bool int t0, t1; - const vector unsigned int v_31 = //XXX - vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1)); - for(i=0; i<blocksize; i+=4) { - m = vec_ld(0, mag+i); - a = vec_ld(0, ang+i); - t0 = vec_cmple(m, (vector float)vec_splat_u32(0)); - t1 = vec_cmple(a, (vector float)vec_splat_u32(0)); - a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31)); - t0 = (vector bool int)vec_and(a, t1); - t1 = (vector bool int)vec_andc(a, t1); - a = vec_sub(m, (vector float)t1); - m = vec_add(m, (vector float)t0); - vec_stl(a, 0, ang+i); - vec_stl(m, 0, mag+i); - } -} - -/* next one assumes that ((line_size % 8) 
== 0) */ -void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1); - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsavg; - register vector unsigned char blockv, temp1, temp2, blocktemp; - register vector unsigned short pixelssum1, pixelssum2, temp3; - - register const_vector unsigned char vczero = (const_vector unsigned char) - vec_splat_u8(0); - register const_vector unsigned short vctwo = (const_vector unsigned short) - vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - -POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1); - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) { - blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } else { - blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - blockv = vec_avg(blocktemp, blockv); - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1); -} - -void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx) -{ - c->pix_abs[0][1] = sad16_x2_altivec; - c->pix_abs[0][2] = sad16_y2_altivec; - c->pix_abs[0][3] = sad16_xy2_altivec; - c->pix_abs[0][0] = sad16_altivec; - c->pix_abs[1][0] = sad8_altivec; - c->sad[0]= sad16_altivec; - c->sad[1]= sad8_altivec; - c->pix_norm1 = pix_norm1_altivec; - c->sse[1]= sse8_altivec; - c->sse[0]= sse16_altivec; - c->pix_sum = pix_sum_altivec; - c->diff_pixels = diff_pixels_altivec; - c->get_pixels = get_pixels_altivec; - c->add_bytes= add_bytes_altivec; - c->put_pixels_tab[0][0] = put_pixels16_altivec; - /* the two functions do the same thing, so use the same code */ - c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec; - c->avg_pixels_tab[0][0] = avg_pixels16_altivec; - c->avg_pixels_tab[1][0] = avg_pixels8_altivec; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec; - c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; - c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; - c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; - c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; - - c->hadamard8_diff[0] = hadamard8_diff16_altivec; - c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; -#ifdef CONFIG_VORBIS_DECODER - c->vorbis_inverse_coupling = 
vorbis_inverse_coupling_altivec; -#endif -} diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h deleted file mode 100644 index 560d778bb..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef _DSPUTIL_ALTIVEC_ -#define _DSPUTIL_ALTIVEC_ - -#include "dsputil_ppc.h" - -#ifdef HAVE_ALTIVEC - -extern int has_altivec(void); - -void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -// used to build registers permutation vectors (vcprm) -// the 's' are for words in the _s_econd vector -#define WORD_0 0x00,0x01,0x02,0x03 -#define WORD_1 0x04,0x05,0x06,0x07 -#define WORD_2 0x08,0x09,0x0a,0x0b -#define WORD_3 0x0c,0x0d,0x0e,0x0f -#define WORD_s0 0x10,0x11,0x12,0x13 -#define WORD_s1 0x14,0x15,0x16,0x17 -#define WORD_s2 0x18,0x19,0x1a,0x1b -#define WORD_s3 0x1c,0x1d,0x1e,0x1f - -#ifdef CONFIG_DARWIN -#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d) -#else -#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d} -#endif - -// vcprmle is used to keep the same index as in the SSE version. -// it's the same as vcprm, with the index inversed -// ('le' is Little Endian) -#define vcprmle(a,b,c,d) vcprm(d,c,b,a) - -// used to build inverse/identity vectors (vcii) -// n is _n_egative, p is _p_ositive -#define FLOAT_n -1. -#define FLOAT_p 1. 
- - -#ifdef CONFIG_DARWIN -#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) -#else -#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} -#endif - -// Transpose 8x8 matrix of 16-bit elements (in-place) -#define TRANSPOSE8(a,b,c,d,e,f,g,h) \ -do { \ - vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \ - vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \ - \ - A1 = vec_mergeh (a, e); \ - B1 = vec_mergel (a, e); \ - C1 = vec_mergeh (b, f); \ - D1 = vec_mergel (b, f); \ - E1 = vec_mergeh (c, g); \ - F1 = vec_mergel (c, g); \ - G1 = vec_mergeh (d, h); \ - H1 = vec_mergel (d, h); \ - \ - A2 = vec_mergeh (A1, E1); \ - B2 = vec_mergel (A1, E1); \ - C2 = vec_mergeh (B1, F1); \ - D2 = vec_mergel (B1, F1); \ - E2 = vec_mergeh (C1, G1); \ - F2 = vec_mergel (C1, G1); \ - G2 = vec_mergeh (D1, H1); \ - H2 = vec_mergel (D1, H1); \ - \ - a = vec_mergeh (A2, E2); \ - b = vec_mergel (A2, E2); \ - c = vec_mergeh (B2, F2); \ - d = vec_mergel (B2, F2); \ - e = vec_mergeh (C2, G2); \ - f = vec_mergel (C2, G2); \ - g = vec_mergeh (D2, H2); \ - h = vec_mergel (D2, H2); \ -} while (0) - -#endif /* HAVE_ALTIVEC */ - -#endif /* _DSPUTIL_ALTIVEC_ */ diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c deleted file mode 100644 index 117a7adf1..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c +++ /dev/null @@ -1,325 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "../dsputil.h" - -#include "dsputil_ppc.h" - -#ifdef HAVE_ALTIVEC -#include "dsputil_altivec.h" - -extern void fdct_altivec(int16_t *block); -extern void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, - int x16, int y16, int rounder); -extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); -extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); - -void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx); - -void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx); -void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx); -void snow_init_altivec(DSPContext* c, AVCodecContext *avctx); -void float_init_altivec(DSPContext* c, AVCodecContext *avctx); - -#endif - -int mm_flags = 0; - -int mm_support(void) -{ - int result = 0; -#ifdef HAVE_ALTIVEC - if (has_altivec()) { - result |= MM_ALTIVEC; - } -#endif /* result */ - return result; -} - -#ifdef CONFIG_POWERPC_PERF -unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; -/* list below must match enum in dsputil_ppc.h */ -static unsigned char* perfname[] = { - "ff_fft_calc_altivec", - "gmc1_altivec", - "dct_unquantize_h263_altivec", - "fdct_altivec", - "idct_add_altivec", - "idct_put_altivec", - "put_pixels16_altivec", - "avg_pixels16_altivec", - "avg_pixels8_altivec", - "put_pixels8_xy2_altivec", - "put_no_rnd_pixels8_xy2_altivec", - "put_pixels16_xy2_altivec", - "put_no_rnd_pixels16_xy2_altivec", - "hadamard8_diff8x8_altivec", - "hadamard8_diff16_altivec", - "avg_pixels8_xy2_altivec", - "clear_blocks_dcbz32_ppc", - "clear_blocks_dcbz128_ppc", - "put_h264_chroma_mc8_altivec", - "avg_h264_chroma_mc8_altivec", - "put_h264_qpel16_h_lowpass_altivec", - "avg_h264_qpel16_h_lowpass_altivec", - "put_h264_qpel16_v_lowpass_altivec", - "avg_h264_qpel16_v_lowpass_altivec", - "put_h264_qpel16_hv_lowpass_altivec", - "avg_h264_qpel16_hv_lowpass_altivec", - "" -}; -#include <stdio.h> -#endif - -#ifdef CONFIG_POWERPC_PERF -void powerpc_display_perf_report(void) -{ - int i, j; - av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n"); - for(i = 0 ; i < powerpc_perf_total ; i++) - { - for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) - { - if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) - av_log(NULL, AV_LOG_INFO, - " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n", - perfname[i], - j+1, - perfdata[j][i][powerpc_data_min], - perfdata[j][i][powerpc_data_max], - (double)perfdata[j][i][powerpc_data_sum] / - (double)perfdata[j][i][powerpc_data_num], - perfdata[j][i][powerpc_data_num]); - } - } -} -#endif /* CONFIG_POWERPC_PERF */ - -/* ***** WARNING ***** WARNING ***** WARNING ***** */ -/* - clear_blocks_dcbz32_ppc will not work properly - on PowerPC processors with a cache line size - not equal to 32 bytes. - Fortunately all processor used by Apple up to - at least the 7450 (aka second generation G4) - use 32 bytes cache line. - This is due to the use of the 'dcbz' instruction. - It simply clear to zero a single cache line, - so you need to know the cache line size to use it ! - It's absurd, but it's fast... 
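Illustrative note (not part of the original diff): the warning above boils down to dcbz zeroing exactly one hardware cache line per iteration, so the stride of the clearing loop must match the line size of the CPU it runs on. A minimal scalar sketch of the same clear, with memset standing in for dcbz and an assumed 32-byte line:

#include <string.h>
#include <stdint.h>

#define CACHE_LINE 32   /* assumed: 32-byte lines, as on the G3/G4 */

/* Clears the same 6*64 block of 16-bit coefficients the AltiVec code
 * targets, one cache-line-sized chunk per iteration. */
static void clear_blocks_ref(int16_t *blocks)
{
    const size_t total = sizeof(int16_t) * 6 * 64;   /* 768 bytes */
    for (size_t i = 0; i < total; i += CACHE_LINE)
        memset((uint8_t *)blocks + i, 0, CACHE_LINE);
}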
- - update 24/06/2003 : Apple released yesterday the G5, - with a PPC970. cache line size : 128 bytes. Oups. - The semantic of dcbz was changed, it always clear - 32 bytes. so the function below will work, but will - be slow. So I fixed check_dcbz_effect to use dcbzl, - which is defined to clear a cache line (as dcbz before). - So we still can distinguish, and use dcbz (32 bytes) - or dcbzl (one cache line) as required. - - see <http://developer.apple.com/technotes/tn/tn2087.html> - and <http://developer.apple.com/technotes/tn/tn2086.html> -*/ -void clear_blocks_dcbz32_ppc(DCTELEM *blocks) -{ -POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1); - register int misal = ((unsigned long)blocks & 0x00000010); - register int i = 0; -POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1); -#if 1 - if (misal) { - ((unsigned long*)blocks)[0] = 0L; - ((unsigned long*)blocks)[1] = 0L; - ((unsigned long*)blocks)[2] = 0L; - ((unsigned long*)blocks)[3] = 0L; - i += 16; - } - for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) { -#ifndef __MWERKS__ - asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); -#else - __dcbz( blocks, i ); -#endif - } - if (misal) { - ((unsigned long*)blocks)[188] = 0L; - ((unsigned long*)blocks)[189] = 0L; - ((unsigned long*)blocks)[190] = 0L; - ((unsigned long*)blocks)[191] = 0L; - i += 16; - } -#else - memset(blocks, 0, sizeof(DCTELEM)*6*64); -#endif -POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1); -} - -/* same as above, when dcbzl clear a whole 128B cache line - i.e. the PPC970 aka G5 */ -#ifdef HAVE_DCBZL -void clear_blocks_dcbz128_ppc(DCTELEM *blocks) -{ -POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1); - register int misal = ((unsigned long)blocks & 0x0000007f); - register int i = 0; -POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1); -#if 1 - if (misal) { - // we could probably also optimize this case, - // but there's not much point as the machines - // aren't available yet (2003-06-26) - memset(blocks, 0, sizeof(DCTELEM)*6*64); - } - else - for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) { - asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); - } -#else - memset(blocks, 0, sizeof(DCTELEM)*6*64); -#endif -POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1); -} -#else -void clear_blocks_dcbz128_ppc(DCTELEM *blocks) -{ - memset(blocks, 0, sizeof(DCTELEM)*6*64); -} -#endif - -#ifdef HAVE_DCBZL -/* check dcbz report how many bytes are set to 0 by dcbz */ -/* update 24/06/2003 : replace dcbz by dcbzl to get - the intended effect (Apple "fixed" dcbz) - unfortunately this cannot be used unless the assembler - knows about dcbzl ... */ -long check_dcbzl_effect(void) -{ - register char *fakedata = (char*)av_malloc(1024); - register char *fakedata_middle; - register long zero = 0; - register long i = 0; - long count = 0; - - if (!fakedata) - { - return 0L; - } - - fakedata_middle = (fakedata + 512); - - memset(fakedata, 0xFF, 1024); - - /* below the constraint "b" seems to mean "Address base register" - in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... 
*/ - asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); - - for (i = 0; i < 1024 ; i ++) - { - if (fakedata[i] == (char)0) - count++; - } - - av_free(fakedata); - - return count; -} -#else -long check_dcbzl_effect(void) -{ - return 0; -} -#endif - -static void prefetch_ppc(void *mem, int stride, int h) -{ - register const uint8_t *p = mem; - do { - asm volatile ("dcbt 0,%0" : : "r" (p)); - p+= stride; - } while(--h); -} - -void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) -{ - // Common optimizations whether Altivec is available or not - c->prefetch = prefetch_ppc; - switch (check_dcbzl_effect()) { - case 32: - c->clear_blocks = clear_blocks_dcbz32_ppc; - break; - case 128: - c->clear_blocks = clear_blocks_dcbz128_ppc; - break; - default: - break; - } - -#ifdef HAVE_ALTIVEC - if(ENABLE_H264_DECODER) dsputil_h264_init_ppc(c, avctx); - - if (has_altivec()) { - mm_flags |= MM_ALTIVEC; - - dsputil_init_altivec(c, avctx); - if(ENABLE_SNOW_DECODER) snow_init_altivec(c, avctx); - if(ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER) - vc1dsp_init_altivec(c, avctx); - float_init_altivec(c, avctx); - c->gmc1 = gmc1_altivec; - -#ifdef CONFIG_ENCODERS - if (avctx->dct_algo == FF_DCT_AUTO || - avctx->dct_algo == FF_DCT_ALTIVEC) - { - c->fdct = fdct_altivec; - } -#endif //CONFIG_ENCODERS - - if (avctx->lowres==0) - { - if ((avctx->idct_algo == FF_IDCT_AUTO) || - (avctx->idct_algo == FF_IDCT_ALTIVEC)) - { - c->idct_put = idct_put_altivec; - c->idct_add = idct_add_altivec; - c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - } - } - -#ifdef CONFIG_POWERPC_PERF - { - int i, j; - for (i = 0 ; i < powerpc_perf_total ; i++) - { - for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) - { - perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL; - perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL; - perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL; - perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL; - } - } - } -#endif /* CONFIG_POWERPC_PERF */ - } -#endif /* HAVE_ALTIVEC */ -} diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h deleted file mode 100644 index 5b25732b2..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef _DSPUTIL_PPC_ -#define _DSPUTIL_PPC_ - -#ifdef CONFIG_POWERPC_PERF -void powerpc_display_perf_report(void); -/* the 604* have 2, the G3* have 4, the G4s have 6, - and the G5 are completely different (they MUST use - POWERPC_MODE_64BITS, and let's hope all future 64 bis PPC - will use the same PMCs... 
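Illustrative note (not part of the original diff): dsputil_init_ppc() above probes the hardware once (check_dcbzl_effect(), has_altivec()) and then installs matching function pointers into the DSPContext. A small sketch of that dispatch pattern for the clear_blocks case, with hypothetical names:

/* Pick the clearing routine that is safe for the measured line size;
 * anything unexpected falls back to the portable implementation. */
typedef void (*clear_blocks_fn)(short *blocks);

static clear_blocks_fn pick_clear_blocks(long dcbzl_bytes,
                                         clear_blocks_fn generic,
                                         clear_blocks_fn dcbz32,
                                         clear_blocks_fn dcbz128)
{
    switch (dcbzl_bytes) {      /* result of a check_dcbzl_effect()-style probe */
    case 32:  return dcbz32;    /* 32-byte cache lines (G3/G4) */
    case 128: return dcbz128;   /* 128-byte cache lines (G5)   */
    default:  return generic;   /* unknown size: keep the C version */
    }
}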
*/ -#define POWERPC_NUM_PMC_ENABLED 6 -/* if you add to the enum below, also add to the perfname array - in dsputil_ppc.c */ -enum powerpc_perf_index { - altivec_fft_num = 0, - altivec_gmc1_num, - altivec_dct_unquantize_h263_num, - altivec_fdct, - altivec_idct_add_num, - altivec_idct_put_num, - altivec_put_pixels16_num, - altivec_avg_pixels16_num, - altivec_avg_pixels8_num, - altivec_put_pixels8_xy2_num, - altivec_put_no_rnd_pixels8_xy2_num, - altivec_put_pixels16_xy2_num, - altivec_put_no_rnd_pixels16_xy2_num, - altivec_hadamard8_diff8x8_num, - altivec_hadamard8_diff16_num, - altivec_avg_pixels8_xy2_num, - powerpc_clear_blocks_dcbz32, - powerpc_clear_blocks_dcbz128, - altivec_put_h264_chroma_mc8_num, - altivec_avg_h264_chroma_mc8_num, - altivec_put_h264_qpel16_h_lowpass_num, - altivec_avg_h264_qpel16_h_lowpass_num, - altivec_put_h264_qpel16_v_lowpass_num, - altivec_avg_h264_qpel16_v_lowpass_num, - altivec_put_h264_qpel16_hv_lowpass_num, - altivec_avg_h264_qpel16_hv_lowpass_num, - powerpc_perf_total -}; -enum powerpc_data_index { - powerpc_data_min = 0, - powerpc_data_max, - powerpc_data_sum, - powerpc_data_num, - powerpc_data_total -}; -extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; - -#ifndef POWERPC_MODE_64BITS -#define POWERP_PMC_DATATYPE unsigned long -#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 937" : "=r" (a)) -#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 938" : "=r" (a)) -#if (POWERPC_NUM_PMC_ENABLED > 2) -#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 941" : "=r" (a)) -#define POWERPC_GET_PMC4(a) asm volatile("mfspr %0, 942" : "=r" (a)) -#else -#define POWERPC_GET_PMC3(a) do {} while (0) -#define POWERPC_GET_PMC4(a) do {} while (0) -#endif -#if (POWERPC_NUM_PMC_ENABLED > 4) -#define POWERPC_GET_PMC5(a) asm volatile("mfspr %0, 929" : "=r" (a)) -#define POWERPC_GET_PMC6(a) asm volatile("mfspr %0, 930" : "=r" (a)) -#else -#define POWERPC_GET_PMC5(a) do {} while (0) -#define POWERPC_GET_PMC6(a) do {} while (0) -#endif -#else /* POWERPC_MODE_64BITS */ -#define POWERP_PMC_DATATYPE unsigned long long -#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 771" : "=r" (a)) -#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 772" : "=r" (a)) -#if (POWERPC_NUM_PMC_ENABLED > 2) -#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 773" : "=r" (a)) -#define POWERPC_GET_PMC4(a) asm volatile("mfspr %0, 774" : "=r" (a)) -#else -#define POWERPC_GET_PMC3(a) do {} while (0) -#define POWERPC_GET_PMC4(a) do {} while (0) -#endif -#if (POWERPC_NUM_PMC_ENABLED > 4) -#define POWERPC_GET_PMC5(a) asm volatile("mfspr %0, 775" : "=r" (a)) -#define POWERPC_GET_PMC6(a) asm volatile("mfspr %0, 776" : "=r" (a)) -#else -#define POWERPC_GET_PMC5(a) do {} while (0) -#define POWERPC_GET_PMC6(a) do {} while (0) -#endif -#endif /* POWERPC_MODE_64BITS */ -#define POWERPC_PERF_DECLARE(a, cond) \ - POWERP_PMC_DATATYPE \ - pmc_start[POWERPC_NUM_PMC_ENABLED], \ - pmc_stop[POWERPC_NUM_PMC_ENABLED], \ - pmc_loop_index; -#define POWERPC_PERF_START_COUNT(a, cond) do { \ - POWERPC_GET_PMC6(pmc_start[5]); \ - POWERPC_GET_PMC5(pmc_start[4]); \ - POWERPC_GET_PMC4(pmc_start[3]); \ - POWERPC_GET_PMC3(pmc_start[2]); \ - POWERPC_GET_PMC2(pmc_start[1]); \ - POWERPC_GET_PMC1(pmc_start[0]); \ - } while (0) -#define POWERPC_PERF_STOP_COUNT(a, cond) do { \ - POWERPC_GET_PMC1(pmc_stop[0]); \ - POWERPC_GET_PMC2(pmc_stop[1]); \ - POWERPC_GET_PMC3(pmc_stop[2]); \ - POWERPC_GET_PMC4(pmc_stop[3]); \ - POWERPC_GET_PMC5(pmc_stop[4]); \ - POWERPC_GET_PMC6(pmc_stop[5]); \ - if 
(cond) \ - { \ - for(pmc_loop_index = 0; \ - pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \ - pmc_loop_index++) \ - { \ - if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \ - { \ - POWERP_PMC_DATATYPE diff = \ - pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ - if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ - perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ - if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \ - perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \ - perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \ - perfdata[pmc_loop_index][a][powerpc_data_num] ++; \ - } \ - } \ - } \ -} while (0) -#else /* CONFIG_POWERPC_PERF */ -// those are needed to avoid empty statements. -#define POWERPC_PERF_DECLARE(a, cond) int altivec_placeholder __attribute__ ((unused)) -#define POWERPC_PERF_START_COUNT(a, cond) do {} while (0) -#define POWERPC_PERF_STOP_COUNT(a, cond) do {} while (0) -#endif /* CONFIG_POWERPC_PERF */ - -#endif /* _DSPUTIL_PPC_ */ diff --git a/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c b/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c deleted file mode 100644 index 2418c32bb..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c +++ /dev/null @@ -1,493 +0,0 @@ -/* ffmpeg/libavcodec/ppc/fdct_altivec.c, this file is part of the - * AltiVec optimized library for the FFMPEG Multimedia System - * Copyright (C) 2003 James Klicman <james@klicman.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
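Illustrative note (not part of the original diff): POWERPC_PERF_STOP_COUNT above reads the performance-monitor counters a second time and folds each per-PMC difference into the min/max/sum/count slots of perfdata. A scalar sketch of that accumulation step, with a hypothetical struct standing in for the perfdata array:

#include <stdint.h>

struct perf_slot {
    uint64_t min, max, sum, num;
};

/* Fold one measured counter delta into a slot, mirroring what the
 * STOP_COUNT macro does per enabled PMC; wrapped counters are skipped. */
static void perf_accumulate(struct perf_slot *s, uint64_t start, uint64_t stop)
{
    if (stop < start)
        return;
    uint64_t diff = stop - start;
    if (s->num == 0 || diff < s->min) s->min = diff;
    if (diff > s->max)                s->max = diff;
    s->sum += diff;
    s->num++;
}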
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - - -#include "common.h" -#include "../dsputil.h" -#include "dsputil_altivec.h" -#include "gcc_fixes.h" - - -#define vs16(v) ((vector signed short)(v)) -#define vs32(v) ((vector signed int)(v)) -#define vu8(v) ((vector unsigned char)(v)) -#define vu16(v) ((vector unsigned short)(v)) -#define vu32(v) ((vector unsigned int)(v)) - - -#define C1 0.98078525066375732421875000 /* cos(1*PI/16) */ -#define C2 0.92387950420379638671875000 /* cos(2*PI/16) */ -#define C3 0.83146959543228149414062500 /* cos(3*PI/16) */ -#define C4 0.70710676908493041992187500 /* cos(4*PI/16) */ -#define C5 0.55557024478912353515625000 /* cos(5*PI/16) */ -#define C6 0.38268342614173889160156250 /* cos(6*PI/16) */ -#define C7 0.19509032368659973144531250 /* cos(7*PI/16) */ -#define SQRT_2 1.41421353816986083984375000 /* sqrt(2) */ - - -#define W0 -(2 * C2) -#define W1 (2 * C6) -#define W2 (SQRT_2 * C6) -#define W3 (SQRT_2 * C3) -#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7)) -#define W5 (SQRT_2 * ( C1 + C3 - C5 + C7)) -#define W6 (SQRT_2 * ( C1 + C3 + C5 - C7)) -#define W7 (SQRT_2 * ( C1 + C3 - C5 - C7)) -#define W8 (SQRT_2 * ( C7 - C3)) -#define W9 (SQRT_2 * (-C1 - C3)) -#define WA (SQRT_2 * (-C3 - C5)) -#define WB (SQRT_2 * ( C5 - C3)) - - -static vector float fdctconsts[3] = { - (vector float)AVV( W0, W1, W2, W3 ), - (vector float)AVV( W4, W5, W6, W7 ), - (vector float)AVV( W8, W9, WA, WB ) -}; - -#define LD_W0 vec_splat(cnsts0, 0) -#define LD_W1 vec_splat(cnsts0, 1) -#define LD_W2 vec_splat(cnsts0, 2) -#define LD_W3 vec_splat(cnsts0, 3) -#define LD_W4 vec_splat(cnsts1, 0) -#define LD_W5 vec_splat(cnsts1, 1) -#define LD_W6 vec_splat(cnsts1, 2) -#define LD_W7 vec_splat(cnsts1, 3) -#define LD_W8 vec_splat(cnsts2, 0) -#define LD_W9 vec_splat(cnsts2, 1) -#define LD_WA vec_splat(cnsts2, 2) -#define LD_WB vec_splat(cnsts2, 3) - - -#define FDCTROW(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \ - x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \ - x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \ - x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \ - x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \ - x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \ - x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \ - x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \ - x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \ - \ - b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \ - b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \ - b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \ - b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \ - \ - b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \ - b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \ - b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \ - cnst = LD_W2; \ - b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \ - cnst = LD_W1; \ - b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \ - cnst = LD_W0; \ - b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \ - \ - x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \ - x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \ - x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \ - x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \ - x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \ - cnst = LD_W3; \ - x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \ - \ - cnst = LD_W8; \ - x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \ - cnst = LD_W9; \ - x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \ - cnst = LD_WA; \ - x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \ 
- cnst = LD_WB; \ - x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \ - \ - cnst = LD_W4; \ - b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \ - cnst = LD_W5; \ - b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \ - cnst = LD_W6; \ - b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \ - cnst = LD_W7; \ - b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \ - \ - b7 = vec_add(b7, x2); /* b7 = b7 + x2; */ \ - b5 = vec_add(b5, x3); /* b5 = b5 + x3; */ \ - b3 = vec_add(b3, x2); /* b3 = b3 + x2; */ \ - b1 = vec_add(b1, x3); /* b1 = b1 + x3; */ \ - /* }}} */ - -#define FDCTCOL(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \ - x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \ - x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \ - x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \ - x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \ - x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \ - x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \ - x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \ - x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \ - \ - b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \ - b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \ - b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \ - b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \ - \ - b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \ - b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \ - b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \ - cnst = LD_W2; \ - b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \ - cnst = LD_W1; \ - b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \ - cnst = LD_W0; \ - b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \ - \ - x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \ - x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \ - x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \ - x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \ - x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \ - cnst = LD_W3; \ - x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \ - \ - cnst = LD_W8; \ - x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \ - cnst = LD_W9; \ - x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \ - cnst = LD_WA; \ - x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \ - cnst = LD_WB; \ - x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \ - \ - cnst = LD_W4; \ - b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \ - cnst = LD_W5; \ - b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \ - cnst = LD_W6; \ - b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \ - cnst = LD_W7; \ - b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \ - \ - b7 = vec_add(b7, x2); /* b7 += x2; */ \ - b5 = vec_add(b5, x3); /* b5 += x3; */ \ - b3 = vec_add(b3, x2); /* b3 += x2; */ \ - b1 = vec_add(b1, x3); /* b1 += x3; */ \ - /* }}} */ - - - -/* two dimensional discrete cosine transform */ - -void fdct_altivec(int16_t *block) -{ -POWERPC_PERF_DECLARE(altivec_fdct, 1); - vector signed short *bp; - vector float *cp; - vector float b00, b10, b20, b30, b40, b50, b60, b70; - vector float b01, b11, b21, b31, b41, b51, b61, b71; - vector float mzero, cnst, cnsts0, cnsts1, cnsts2; - vector float x0, x1, x2, x3, x4, x5, x6, x7, x8; - - POWERPC_PERF_START_COUNT(altivec_fdct, 1); - - - /* setup constants {{{ */ - /* mzero = -0.0 */ - mzero = ((vector float)vec_splat_u32(-1)); - mzero = ((vector float)vec_sl(vu32(mzero), vu32(mzero))); - cp = fdctconsts; - cnsts0 = vec_ld(0, cp); cp++; - cnsts1 = vec_ld(0, cp); cp++; - cnsts2 = vec_ld(0, cp); - /* }}} */ - - - /* 8x8 matrix transpose (vector short[8]) {{{ */ -#define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b)) - - bp = (vector signed short*)block; - b00 = ((vector 
float)vec_ld(0, bp)); - b40 = ((vector float)vec_ld(16*4, bp)); - b01 = ((vector float)MERGE_S16(h, b00, b40)); - b11 = ((vector float)MERGE_S16(l, b00, b40)); - bp++; - b10 = ((vector float)vec_ld(0, bp)); - b50 = ((vector float)vec_ld(16*4, bp)); - b21 = ((vector float)MERGE_S16(h, b10, b50)); - b31 = ((vector float)MERGE_S16(l, b10, b50)); - bp++; - b20 = ((vector float)vec_ld(0, bp)); - b60 = ((vector float)vec_ld(16*4, bp)); - b41 = ((vector float)MERGE_S16(h, b20, b60)); - b51 = ((vector float)MERGE_S16(l, b20, b60)); - bp++; - b30 = ((vector float)vec_ld(0, bp)); - b70 = ((vector float)vec_ld(16*4, bp)); - b61 = ((vector float)MERGE_S16(h, b30, b70)); - b71 = ((vector float)MERGE_S16(l, b30, b70)); - - x0 = ((vector float)MERGE_S16(h, b01, b41)); - x1 = ((vector float)MERGE_S16(l, b01, b41)); - x2 = ((vector float)MERGE_S16(h, b11, b51)); - x3 = ((vector float)MERGE_S16(l, b11, b51)); - x4 = ((vector float)MERGE_S16(h, b21, b61)); - x5 = ((vector float)MERGE_S16(l, b21, b61)); - x6 = ((vector float)MERGE_S16(h, b31, b71)); - x7 = ((vector float)MERGE_S16(l, b31, b71)); - - b00 = ((vector float)MERGE_S16(h, x0, x4)); - b10 = ((vector float)MERGE_S16(l, x0, x4)); - b20 = ((vector float)MERGE_S16(h, x1, x5)); - b30 = ((vector float)MERGE_S16(l, x1, x5)); - b40 = ((vector float)MERGE_S16(h, x2, x6)); - b50 = ((vector float)MERGE_S16(l, x2, x6)); - b60 = ((vector float)MERGE_S16(h, x3, x7)); - b70 = ((vector float)MERGE_S16(l, x3, x7)); - -#undef MERGE_S16 - /* }}} */ - - -/* Some of the initial calculations can be done as vector short before - * conversion to vector float. The following code section takes advantage - * of this. - */ -#if 1 - /* fdct rows {{{ */ - x0 = ((vector float)vec_add(vs16(b00), vs16(b70))); - x7 = ((vector float)vec_sub(vs16(b00), vs16(b70))); - x1 = ((vector float)vec_add(vs16(b10), vs16(b60))); - x6 = ((vector float)vec_sub(vs16(b10), vs16(b60))); - x2 = ((vector float)vec_add(vs16(b20), vs16(b50))); - x5 = ((vector float)vec_sub(vs16(b20), vs16(b50))); - x3 = ((vector float)vec_add(vs16(b30), vs16(b40))); - x4 = ((vector float)vec_sub(vs16(b30), vs16(b40))); - - b70 = ((vector float)vec_add(vs16(x0), vs16(x3))); - b10 = ((vector float)vec_add(vs16(x1), vs16(x2))); - - b00 = ((vector float)vec_add(vs16(b70), vs16(b10))); - b40 = ((vector float)vec_sub(vs16(b70), vs16(b10))); - -#define CTF0(n) \ - b##n##1 = ((vector float)vec_unpackl(vs16(b##n##0))); \ - b##n##0 = ((vector float)vec_unpackh(vs16(b##n##0))); \ - b##n##1 = vec_ctf(vs32(b##n##1), 0); \ - b##n##0 = vec_ctf(vs32(b##n##0), 0); - - CTF0(0); - CTF0(4); - - b20 = ((vector float)vec_sub(vs16(x0), vs16(x3))); - b60 = ((vector float)vec_sub(vs16(x1), vs16(x2))); - - CTF0(2); - CTF0(6); - -#undef CTF0 - - x0 = vec_add(b60, b20); - x1 = vec_add(b61, b21); - - cnst = LD_W2; - x0 = vec_madd(cnst, x0, mzero); - x1 = vec_madd(cnst, x1, mzero); - cnst = LD_W1; - b20 = vec_madd(cnst, b20, x0); - b21 = vec_madd(cnst, b21, x1); - cnst = LD_W0; - b60 = vec_madd(cnst, b60, x0); - b61 = vec_madd(cnst, b61, x1); - -#define CTFX(x,b) \ - b##0 = ((vector float)vec_unpackh(vs16(x))); \ - b##1 = ((vector float)vec_unpackl(vs16(x))); \ - b##0 = vec_ctf(vs32(b##0), 0); \ - b##1 = vec_ctf(vs32(b##1), 0); \ - - CTFX(x4, b7); - CTFX(x5, b5); - CTFX(x6, b3); - CTFX(x7, b1); - -#undef CTFX - - - x0 = vec_add(b70, b10); - x1 = vec_add(b50, b30); - x2 = vec_add(b70, b30); - x3 = vec_add(b50, b10); - x8 = vec_add(x2, x3); - cnst = LD_W3; - x8 = vec_madd(cnst, x8, mzero); - - cnst = LD_W8; - x0 = vec_madd(cnst, x0, mzero); - cnst = 
LD_W9; - x1 = vec_madd(cnst, x1, mzero); - cnst = LD_WA; - x2 = vec_madd(cnst, x2, x8); - cnst = LD_WB; - x3 = vec_madd(cnst, x3, x8); - - cnst = LD_W4; - b70 = vec_madd(cnst, b70, x0); - cnst = LD_W5; - b50 = vec_madd(cnst, b50, x1); - cnst = LD_W6; - b30 = vec_madd(cnst, b30, x1); - cnst = LD_W7; - b10 = vec_madd(cnst, b10, x0); - - b70 = vec_add(b70, x2); - b50 = vec_add(b50, x3); - b30 = vec_add(b30, x2); - b10 = vec_add(b10, x3); - - - x0 = vec_add(b71, b11); - x1 = vec_add(b51, b31); - x2 = vec_add(b71, b31); - x3 = vec_add(b51, b11); - x8 = vec_add(x2, x3); - cnst = LD_W3; - x8 = vec_madd(cnst, x8, mzero); - - cnst = LD_W8; - x0 = vec_madd(cnst, x0, mzero); - cnst = LD_W9; - x1 = vec_madd(cnst, x1, mzero); - cnst = LD_WA; - x2 = vec_madd(cnst, x2, x8); - cnst = LD_WB; - x3 = vec_madd(cnst, x3, x8); - - cnst = LD_W4; - b71 = vec_madd(cnst, b71, x0); - cnst = LD_W5; - b51 = vec_madd(cnst, b51, x1); - cnst = LD_W6; - b31 = vec_madd(cnst, b31, x1); - cnst = LD_W7; - b11 = vec_madd(cnst, b11, x0); - - b71 = vec_add(b71, x2); - b51 = vec_add(b51, x3); - b31 = vec_add(b31, x2); - b11 = vec_add(b11, x3); - /* }}} */ -#else - /* convert to float {{{ */ -#define CTF(n) \ - vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \ - vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \ - b##n##1 = vec_ctf(vs32(b##n##1), 0); \ - b##n##0 = vec_ctf(vs32(b##n##0), 0); \ - - CTF(0); - CTF(1); - CTF(2); - CTF(3); - CTF(4); - CTF(5); - CTF(6); - CTF(7); - -#undef CTF - /* }}} */ - - FDCTROW(b00, b10, b20, b30, b40, b50, b60, b70); - FDCTROW(b01, b11, b21, b31, b41, b51, b61, b71); -#endif - - - /* 8x8 matrix transpose (vector float[8][2]) {{{ */ - x0 = vec_mergel(b00, b20); - x1 = vec_mergeh(b00, b20); - x2 = vec_mergel(b10, b30); - x3 = vec_mergeh(b10, b30); - - b00 = vec_mergeh(x1, x3); - b10 = vec_mergel(x1, x3); - b20 = vec_mergeh(x0, x2); - b30 = vec_mergel(x0, x2); - - x4 = vec_mergel(b41, b61); - x5 = vec_mergeh(b41, b61); - x6 = vec_mergel(b51, b71); - x7 = vec_mergeh(b51, b71); - - b41 = vec_mergeh(x5, x7); - b51 = vec_mergel(x5, x7); - b61 = vec_mergeh(x4, x6); - b71 = vec_mergel(x4, x6); - - x0 = vec_mergel(b01, b21); - x1 = vec_mergeh(b01, b21); - x2 = vec_mergel(b11, b31); - x3 = vec_mergeh(b11, b31); - - x4 = vec_mergel(b40, b60); - x5 = vec_mergeh(b40, b60); - x6 = vec_mergel(b50, b70); - x7 = vec_mergeh(b50, b70); - - b40 = vec_mergeh(x1, x3); - b50 = vec_mergel(x1, x3); - b60 = vec_mergeh(x0, x2); - b70 = vec_mergel(x0, x2); - - b01 = vec_mergeh(x5, x7); - b11 = vec_mergel(x5, x7); - b21 = vec_mergeh(x4, x6); - b31 = vec_mergel(x4, x6); - /* }}} */ - - - FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70); - FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71); - - - /* round, convert back to short {{{ */ -#define CTS(n) \ - b##n##0 = vec_round(b##n##0); \ - b##n##1 = vec_round(b##n##1); \ - b##n##0 = ((vector float)vec_cts(b##n##0, 0)); \ - b##n##1 = ((vector float)vec_cts(b##n##1, 0)); \ - b##n##0 = ((vector float)vec_pack(vs32(b##n##0), vs32(b##n##1))); \ - vec_st(vs16(b##n##0), 0, bp); - - bp = (vector signed short*)block; - CTS(0); bp++; - CTS(1); bp++; - CTS(2); bp++; - CTS(3); bp++; - CTS(4); bp++; - CTS(5); bp++; - CTS(6); bp++; - CTS(7); - -#undef CTS - /* }}} */ - -POWERPC_PERF_STOP_COUNT(altivec_fdct, 1); -} - -/* vim:set foldmethod=marker foldlevel=0: */ diff --git a/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c b/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c deleted file mode 100644 index 384a774ff..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * 
FFT/IFFT transforms - * AltiVec-enabled - * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> - * Based on code Copyright (c) 2002 Fabrice Bellard. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "../dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_altivec.h" - -/* - those three macros are from libavcodec/fft.c - and are required for the reference C code -*/ -/* butter fly op */ -#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ -{\ - FFTSample ax, ay, bx, by;\ - bx=pre1;\ - by=pim1;\ - ax=qre1;\ - ay=qim1;\ - pre = (bx + ax);\ - pim = (by + ay);\ - qre = (bx - ax);\ - qim = (by - ay);\ -} -#define MUL16(a,b) ((a) * (b)) -#define CMUL(pre, pim, are, aim, bre, bim) \ -{\ - pre = (MUL16(are, bre) - MUL16(aim, bim));\ - pim = (MUL16(are, bim) + MUL16(bre, aim));\ -} - - -/** - * Do a complex FFT with the parameters defined in ff_fft_init(). The - * input data must be permuted before with s->revtab table. No - * 1.0/sqrt(n) normalization is done. - * AltiVec-enabled - * This code assumes that the 'z' pointer is 16 bytes-aligned - * It also assumes all FFTComplex are 8 bytes-aligned pair of float - * The code is exactly the same as the SSE version, except - * that successive MUL + ADD/SUB have been merged into - * fused multiply-add ('vec_madd' in altivec) - */ -void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z) -{ -POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6); - register const vector float vczero = (const vector float)vec_splat_u32(0.); - - int ln = s->nbits; - int j, np, np2; - int nblocks, nloops; - register FFTComplex *p, *q; - FFTComplex *cptr, *cptr1; - int k; - -POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6); - - np = 1 << ln; - - { - vector float *r, a, b, a1, c1, c2; - - r = (vector float *)&z[0]; - - c1 = vcii(p,p,n,n); - - if (s->inverse) - { - c2 = vcii(p,p,n,p); - } - else - { - c2 = vcii(p,p,p,n); - } - - j = (np >> 2); - do { - a = vec_ld(0, r); - a1 = vec_ld(sizeof(vector float), r); - - b = vec_perm(a,a,vcprmle(1,0,3,2)); - a = vec_madd(a,c1,b); - /* do the pass 0 butterfly */ - - b = vec_perm(a1,a1,vcprmle(1,0,3,2)); - b = vec_madd(a1,c1,b); - /* do the pass 0 butterfly */ - - /* multiply third by -i */ - b = vec_perm(b,b,vcprmle(2,3,1,0)); - - /* do the pass 1 butterfly */ - vec_st(vec_madd(b,c2,a), 0, r); - vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r); - - r += 2; - } while (--j != 0); - } - /* pass 2 .. 
ln-1 */ - - nblocks = np >> 3; - nloops = 1 << 2; - np2 = np >> 1; - - cptr1 = s->exptab1; - do { - p = z; - q = z + nloops; - j = nblocks; - do { - cptr = cptr1; - k = nloops >> 1; - do { - vector float a,b,c,t1; - - a = vec_ld(0, (float*)p); - b = vec_ld(0, (float*)q); - - /* complex mul */ - c = vec_ld(0, (float*)cptr); - /* cre*re cim*re */ - t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero); - c = vec_ld(sizeof(vector float), (float*)cptr); - /* -cim*im cre*im */ - b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1); - - /* butterfly */ - vec_st(vec_add(a,b), 0, (float*)p); - vec_st(vec_sub(a,b), 0, (float*)q); - - p += 2; - q += 2; - cptr += 4; - } while (--k); - - p += nloops; - q += nloops; - } while (--j); - cptr1 += nloops * 2; - nblocks = nblocks >> 1; - nloops = nloops << 1; - } while (nblocks != 0); - -POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6); -} diff --git a/contrib/ffmpeg/libavcodec/ppc/float_altivec.c b/contrib/ffmpeg/libavcodec/ppc/float_altivec.c deleted file mode 100644 index 22c2de61a..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/float_altivec.c +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "../dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_altivec.h" - -static void vector_fmul_altivec(float *dst, const float *src, int len) -{ - int i; - vector float d0, d1, s, zero = (vector float)vec_splat_u32(0); - for(i=0; i<len-7; i+=8) { - d0 = vec_ld(0, dst+i); - s = vec_ld(0, src+i); - d1 = vec_ld(16, dst+i); - d0 = vec_madd(d0, s, zero); - d1 = vec_madd(d1, vec_ld(16,src+i), zero); - vec_st(d0, 0, dst+i); - vec_st(d1, 16, dst+i); - } -} - -static void vector_fmul_reverse_altivec(float *dst, const float *src0, - const float *src1, int len) -{ - int i; - vector float d, s0, s1, h0, l0, - s2, s3, zero = (vector float)vec_splat_u32(0); - src1 += len-4; - for(i=0; i<len-7; i+=8) { - s1 = vec_ld(0, src1-i); // [a,b,c,d] - s0 = vec_ld(0, src0+i); - l0 = vec_mergel(s1, s1); // [c,c,d,d] - s3 = vec_ld(-16, src1-i); - h0 = vec_mergeh(s1, s1); // [a,a,b,b] - s2 = vec_ld(16, src0+i); - s1 = vec_mergeh(vec_mergel(l0,h0), // [d,b,d,b] - vec_mergeh(l0,h0)); // [c,a,c,a] - // [d,c,b,a] - l0 = vec_mergel(s3, s3); - d = vec_madd(s0, s1, zero); - h0 = vec_mergeh(s3, s3); - vec_st(d, 0, dst+i); - s3 = vec_mergeh(vec_mergel(l0,h0), - vec_mergeh(l0,h0)); - d = vec_madd(s2, s3, zero); - vec_st(d, 16, dst+i); - } -} - -static void vector_fmul_add_add_altivec(float *dst, const float *src0, - const float *src1, const float *src2, - int src3, int len, int step) -{ - int i; - vector float d, s0, s1, s2, t0, t1, edges; - vector unsigned char align = vec_lvsr(0,dst), - mask = vec_lvsl(0, dst); - -#if 0 //FIXME: there is still something wrong - if (step 
== 2) { - int y; - vector float d0, d1, s3, t2; - vector unsigned int sel = - vec_mergeh(vec_splat_u32(-1), vec_splat_u32(0)); - t1 = vec_ld(16, dst); - for (i=0,y=0; i<len-3; i+=4,y+=8) { - - s0 = vec_ld(0,src0+i); - s1 = vec_ld(0,src1+i); - s2 = vec_ld(0,src2+i); - -// t0 = vec_ld(0, dst+y); //[x x x|a] -// t1 = vec_ld(16, dst+y); //[b c d|e] - t2 = vec_ld(31, dst+y); //[f g h|x] - - d = vec_madd(s0,s1,s2); // [A B C D] - - // [A A B B] - - // [C C D D] - - d0 = vec_perm(t0, t1, mask); // [a b c d] - - d0 = vec_sel(vec_mergeh(d, d), d0, sel); // [A b B d] - - edges = vec_perm(t1, t0, mask); - - t0 = vec_perm(edges, d0, align); // [x x x|A] - - t1 = vec_perm(d0, edges, align); // [b B d|e] - - vec_stl(t0, 0, dst+y); - - d1 = vec_perm(t1, t2, mask); // [e f g h] - - d1 = vec_sel(vec_mergel(d, d), d1, sel); // [C f D h] - - edges = vec_perm(t2, t1, mask); - - t1 = vec_perm(edges, d1, align); // [b B d|C] - - t2 = vec_perm(d1, edges, align); // [f D h|x] - - vec_stl(t1, 16, dst+y); - - t0 = t1; - - vec_stl(t2, 31, dst+y); - - t1 = t2; - } - } else - #endif - if (step == 1 && src3 == 0) - for (i=0; i<len-3; i+=4) { - t0 = vec_ld(0, dst+i); - t1 = vec_ld(15, dst+i); - s0 = vec_ld(0, src0+i); - s1 = vec_ld(0, src1+i); - s2 = vec_ld(0, src2+i); - edges = vec_perm(t1 ,t0, mask); - d = vec_madd(s0,s1,s2); - t1 = vec_perm(d, edges, align); - t0 = vec_perm(edges, d, align); - vec_st(t1, 15, dst+i); - vec_st(t0, 0, dst+i); - } - else - ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); -} - -void float_to_int16_altivec(int16_t *dst, const float *src, int len) -{ - int i; - vector float s0, s1; - vector signed int t0, t1; - vector signed short d0, d1, d; - vector unsigned char align; - if(((long)dst)&15) //FIXME - for(i=0; i<len-7; i+=8) { - s0 = vec_ld(0, src+i); - s1 = vec_ld(16, src+i); - t0 = vec_cts(s0, 0); - d0 = vec_ld(0, dst+i); - t1 = vec_cts(s1, 0); - d1 = vec_ld(15, dst+i); - d = vec_packs(t0,t1); - d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); - align = vec_lvsr(0, dst+i); - d0 = vec_perm(d1, d, align); - d1 = vec_perm(d, d1, align); - vec_st(d0, 0, dst+i); - vec_st(d1,15, dst+i); - } - else - for(i=0; i<len-7; i+=8) { - s0 = vec_ld(0, src+i); - s1 = vec_ld(16, src+i); - t0 = vec_cts(s0, 0); - t1 = vec_cts(s1, 0); - d = vec_packs(t0,t1); - vec_st(d, 0, dst+i); - } -} - -void float_init_altivec(DSPContext* c, AVCodecContext *avctx) -{ - c->vector_fmul = vector_fmul_altivec; - c->vector_fmul_reverse = vector_fmul_reverse_altivec; - c->vector_fmul_add_add = vector_fmul_add_add_altivec; - if(!(avctx->flags & CODEC_FLAG_BITEXACT)) - c->float_to_int16 = float_to_int16_altivec; -} diff --git a/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h b/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h deleted file mode 100644 index 5a4a55188..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * gcc fixes for altivec. - * Used to workaround broken gcc (FSF gcc-3 pre gcc-3.3) - * and to stay somewhat compatible with Darwin. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
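Illustrative note (not part of the original diff): float_to_int16_altivec above converts with vec_cts and then saturates while packing with vec_packs. A scalar reference sketch of the per-sample result (hypothetical name; truncation toward zero is assumed to match vec_cts):

#include <stdint.h>

/* Convert and saturate each float sample to the int16_t range. */
static void float_to_int16_ref(int16_t *dst, const float *src, int len)
{
    for (int i = 0; i < len; i++) {
        float v = src[i];
        if (v >  32767.0f) v =  32767.0f;
        if (v < -32768.0f) v = -32768.0f;
        dst[i] = (int16_t)v;   /* C cast truncates toward zero */
    }
}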
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef _GCC_FIXES_ -#define _GCC_FIXES_ - -#ifdef HAVE_ALTIVEC_H -#include <altivec.h> -#endif - -#ifdef CONFIG_DARWIN -# ifndef __MWERKS__ -# define AVV(x...) (x) -# else -# define AVV -# endif -#define REG_v(a) asm ( #a ) -#else - -#define AVV(x...) {x} - -#if (__GNUC__ < 4) -# define REG_v(a) -#else -# define REG_v(a) asm ( #a ) -#endif - -#if (__GNUC__ * 100 + __GNUC_MINOR__ < 303) - -/* This code was provided to me by Bartosch Pixa - * as a separate header file (broken_mergel.h). - * thanks to lu_zero for the workaround. - * - * See this mail for more information: - * http://gcc.gnu.org/ml/gcc/2003-04/msg00967.html - */ - -static inline vector signed char ff_vmrglb (vector signed char const A, - vector signed char const B) -{ - static const vector unsigned char lowbyte = { - 0x08, 0x18, 0x09, 0x19, 0x0a, 0x1a, 0x0b, 0x1b, - 0x0c, 0x1c, 0x0d, 0x1d, 0x0e, 0x1e, 0x0f, 0x1f - }; - return vec_perm (A, B, lowbyte); -} - -static inline vector signed short ff_vmrglh (vector signed short const A, - vector signed short const B) -{ - static const vector unsigned char lowhalf = { - 0x08, 0x09, 0x18, 0x19, 0x0a, 0x0b, 0x1a, 0x1b, - 0x0c, 0x0d, 0x1c, 0x1d, 0x0e, 0x0f, 0x1e, 0x1f - }; - return vec_perm (A, B, lowhalf); -} - -static inline vector signed int ff_vmrglw (vector signed int const A, - vector signed int const B) -{ - static const vector unsigned char lowword = { - 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, - 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f - }; - return vec_perm (A, B, lowword); -} -/*#define ff_vmrglb ff_vmrglb -#define ff_vmrglh ff_vmrglh -#define ff_vmrglw ff_vmrglw -*/ -#undef vec_mergel - -#define vec_mergel(a1, a2) \ -__ch (__bin_args_eq (vector signed char, (a1), vector signed char, (a2)), \ - ((vector signed char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \ -__ch (__bin_args_eq (vector unsigned char, (a1), vector unsigned char, (a2)), \ - ((vector unsigned char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \ -__ch (__bin_args_eq (vector signed short, (a1), vector signed short, (a2)), \ - ((vector signed short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \ -__ch (__bin_args_eq (vector unsigned short, (a1), vector unsigned short, (a2)), \ - ((vector unsigned short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \ -__ch (__bin_args_eq (vector float, (a1), vector float, (a2)), \ - ((vector float) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ -__ch (__bin_args_eq (vector signed int, (a1), vector signed int, (a2)), \ - ((vector signed int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ -__ch (__bin_args_eq (vector unsigned int, (a1), vector unsigned int, (a2)), \ - ((vector unsigned int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ - __altivec_link_error_invalid_argument ()))))))) - -#endif - -#endif /* CONFIG_DARWIN */ - -#ifndef __MWERKS__ -#define const_vector const vector -#else -#define const_vector vector -#endif - -#endif /* _GCC_FIXES_ */ diff --git a/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c b/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c deleted file mode 100644 index 42c936bb3..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * 
GMC (Global Motion Compensation) - * AltiVec-enabled - * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "../dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_altivec.h" - -/* - altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8, - to preserve proper dst alignement. -*/ -#define GMC1_PERF_COND (h==8) -void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) -{ -POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); - const unsigned short __attribute__ ((aligned(16))) rounder_a[8] = - {rounder, rounder, rounder, rounder, - rounder, rounder, rounder, rounder}; - const unsigned short __attribute__ ((aligned(16))) ABCD[8] = - { - (16-x16)*(16-y16), /* A */ - ( x16)*(16-y16), /* B */ - (16-x16)*( y16), /* C */ - ( x16)*( y16), /* D */ - 0, 0, 0, 0 /* padding */ - }; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vcsr8 = (const_vector unsigned short)vec_splat_u16(8); - register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD; - register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD; - int i; - unsigned long dst_odd = (unsigned long)dst & 0x0000000F; - unsigned long src_really_odd = (unsigned long)src & 0x0000000F; - - -POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); - - tempA = vec_ld(0, (unsigned short*)ABCD); - Av = vec_splat(tempA, 0); - Bv = vec_splat(tempA, 1); - Cv = vec_splat(tempA, 2); - Dv = vec_splat(tempA, 3); - - rounderV = vec_ld(0, (unsigned short*)rounder_a); - - // we'll be able to pick-up our 9 char elements - // at src from those 32 bytes - // we load the first batch here, as inside the loop - // we can re-use 'src+stride' from one iteration - // as the 'src' of the next. - src_0 = vec_ld(0, src); - src_1 = vec_ld(16, src); - srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src)); - - if (src_really_odd != 0x0000000F) - { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. 
- srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); - } - else - { - srcvB = src_1; - } - srcvA = vec_mergeh(vczero, srcvA); - srcvB = vec_mergeh(vczero, srcvB); - - for(i=0; i<h; i++) - { - dst_odd = (unsigned long)dst & 0x0000000F; - src_really_odd = (((unsigned long)src) + stride) & 0x0000000F; - - dstv = vec_ld(0, dst); - - // we we'll be able to pick-up our 9 char elements - // at src + stride from those 32 bytes - // then reuse the resulting 2 vectors srvcC and srcvD - // as the next srcvA and srcvB - src_0 = vec_ld(stride + 0, src); - src_1 = vec_ld(stride + 16, src); - srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src)); - - if (src_really_odd != 0x0000000F) - { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. - srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src)); - } - else - { - srcvD = src_1; - } - - srcvC = vec_mergeh(vczero, srcvC); - srcvD = vec_mergeh(vczero, srcvD); - - - // OK, now we (finally) do the math :-) - // those four instructions replaces 32 int muls & 32 int adds. - // isn't AltiVec nice ? - tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV); - tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA); - tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB); - tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC); - - srcvA = srcvC; - srcvB = srcvD; - - tempD = vec_sr(tempD, vcsr8); - - dstv2 = vec_pack(tempD, (vector unsigned short)vczero); - - if (dst_odd) - { - dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1)); - } - else - { - dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3)); - } - - vec_st(dstv2, 0, dst); - - dst += stride; - src += stride; - } - -POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND); -} diff --git a/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c b/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c deleted file mode 100644 index bac620e82..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c +++ /dev/null @@ -1,565 +0,0 @@ -/* - * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
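Illustrative note (not part of the original diff): the gmc1 kernel above is a four-tap bilinear blend whose integer weights A=(16-x16)*(16-y16), B=x16*(16-y16), C=(16-x16)*y16 and D=x16*y16 sum to 256, which is why the vec_mladd chain ends in a right shift by 8 after adding the rounder. A scalar sketch of the same arithmetic over the 8-pixel-wide rows that the vector loop handles (hypothetical name):

#include <stdint.h>

/* Reference for the per-pixel math of the AltiVec gmc1 loop: blend the
 * four neighbours with 1/16-pel weights, add the rounder, scale by 1/256. */
static void gmc1_ref(uint8_t *dst, const uint8_t *src, int stride, int h,
                     int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = x16        * (16 - y16);
    const int C = (16 - x16) * y16;
    const int D = x16        * y16;

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}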
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "../dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_altivec.h" -#include "types_altivec.h" - -#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s -#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) - -#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC -#define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec -#define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num -#define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec -#define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num -#define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec -#define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num -#define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec -#define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num -#include "h264_template_altivec.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_h264_chroma_mc8_altivec -#undef PREFIX_h264_chroma_mc8_num -#undef PREFIX_h264_qpel16_h_lowpass_altivec -#undef PREFIX_h264_qpel16_h_lowpass_num -#undef PREFIX_h264_qpel16_v_lowpass_altivec -#undef PREFIX_h264_qpel16_v_lowpass_num -#undef PREFIX_h264_qpel16_hv_lowpass_altivec -#undef PREFIX_h264_qpel16_hv_lowpass_num - -#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC -#define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec -#define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num -#define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec -#define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num -#define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec -#define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num -#define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec -#define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num -#include "h264_template_altivec.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_h264_chroma_mc8_altivec -#undef PREFIX_h264_chroma_mc8_num -#undef PREFIX_h264_qpel16_h_lowpass_altivec -#undef PREFIX_h264_qpel16_h_lowpass_num -#undef PREFIX_h264_qpel16_v_lowpass_altivec -#undef PREFIX_h264_qpel16_v_lowpass_num -#undef PREFIX_h264_qpel16_hv_lowpass_altivec -#undef PREFIX_h264_qpel16_hv_lowpass_num - -#define H264_MC(OPNAME, SIZE, CODETYPE) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \ - DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels 
## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\ - OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, 
SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ -}\ - -/* this code assume that stride % 16 == 0 */ -void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { - signed int ABCD[4] __attribute__((aligned(16))) = - {((8 - x) * (8 - y)), - ((x) * (8 - y)), - ((8 - x) * (y)), - ((x) * (y))}; - register int i; - vector unsigned char fperm; - const vector signed int vABCD = vec_ld(0, ABCD); - const vector signed short vA = vec_splat((vector signed short)vABCD, 1); - const vector signed short vB = vec_splat((vector signed short)vABCD, 3); - const vector signed short vC = vec_splat((vector signed short)vABCD, 5); - const vector signed short vD = vec_splat((vector signed short)vABCD, 7); - const vector signed int vzero = vec_splat_s32(0); - const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); - const vector unsigned short v6us = vec_splat_u16(6); - register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; - register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 
1 : 0; - - vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; - vector unsigned char vsrc0uc, vsrc1uc; - vector signed short vsrc0ssH, vsrc1ssH; - vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; - vector signed short vsrc2ssH, vsrc3ssH, psum; - vector unsigned char vdst, ppsum, fsum; - - if (((unsigned long)dst) % 16 == 0) { - fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, - 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F); - } else { - fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, - 0x18, 0x19, 0x1A, 0x1B, - 0x1C, 0x1D, 0x1E, 0x1F); - } - - vsrcAuc = vec_ld(0, src); - - if (loadSecond) - vsrcBuc = vec_ld(16, src); - vsrcperm0 = vec_lvsl(0, src); - vsrcperm1 = vec_lvsl(1, src); - - vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcBuc; - else - vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); - - vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc0uc); - vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc1uc); - - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - - - vsrcCuc = vec_ld(stride + 0, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc2uc); - vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc3uc); - - psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); - psum = vec_mladd(vB, vsrc1ssH, psum); - psum = vec_mladd(vC, vsrc2ssH, psum); - psum = vec_mladd(vD, vsrc3ssH, psum); - psum = vec_add(v28ss, psum); - psum = vec_sra(psum, v6us); - - vdst = vec_ld(0, dst); - ppsum = (vector unsigned char)vec_packsu(psum, psum); - fsum = vec_perm(vdst, ppsum, fperm); - - vec_st(fsum, 0, dst); - - vsrc0ssH = vsrc2ssH; - vsrc1ssH = vsrc3ssH; - - dst += stride; - src += stride; - } - } else { - vector unsigned char vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 16, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc3uc = vsrcDuc; - else - vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc2uc); - vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc3uc); - - psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); - psum = vec_mladd(vB, vsrc1ssH, psum); - psum = vec_mladd(vC, vsrc2ssH, psum); - psum = vec_mladd(vD, vsrc3ssH, psum); - psum = vec_add(v28ss, psum); - psum = vec_sr(psum, v6us); - - vdst = vec_ld(0, dst); - ppsum = (vector unsigned char)vec_pack(psum, psum); - fsum = vec_perm(vdst, ppsum, fperm); - - vec_st(fsum, 0, dst); - - vsrc0ssH = vsrc2ssH; - vsrc1ssH = vsrc3ssH; - - dst += stride; - src += stride; - } - } -} - -static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, - const uint8_t * src2, int dst_stride, - int src_stride1, int h) -{ - int i; - vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align; - - mask_ = vec_lvsl(0, src2); - - for (i = 0; i < h; i++) { - - tmp1 = vec_ld(i * src_stride1, src1); - mask = vec_lvsl(i * src_stride1, src1); - tmp2 = vec_ld(i * src_stride1 + 15, src1); - - a = vec_perm(tmp1, tmp2, mask); - - tmp1 = vec_ld(i * 16, src2); - tmp2 = 
vec_ld(i * 16 + 15, src2); - - b = vec_perm(tmp1, tmp2, mask_); - - tmp1 = vec_ld(0, dst); - mask = vec_lvsl(0, dst); - tmp2 = vec_ld(15, dst); - - d = vec_avg(a, b); - - edges = vec_perm(tmp2, tmp1, mask); - - align = vec_lvsr(0, dst); - - tmp2 = vec_perm(d, edges, align); - tmp1 = vec_perm(edges, d, align); - - vec_st(tmp2, 15, dst); - vec_st(tmp1, 0 , dst); - - dst += dst_stride; - } -} - -static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, - const uint8_t * src2, int dst_stride, - int src_stride1, int h) -{ - int i; - vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align; - - mask_ = vec_lvsl(0, src2); - - for (i = 0; i < h; i++) { - - tmp1 = vec_ld(i * src_stride1, src1); - mask = vec_lvsl(i * src_stride1, src1); - tmp2 = vec_ld(i * src_stride1 + 15, src1); - - a = vec_perm(tmp1, tmp2, mask); - - tmp1 = vec_ld(i * 16, src2); - tmp2 = vec_ld(i * 16 + 15, src2); - - b = vec_perm(tmp1, tmp2, mask_); - - tmp1 = vec_ld(0, dst); - mask = vec_lvsl(0, dst); - tmp2 = vec_ld(15, dst); - - d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b)); - - edges = vec_perm(tmp2, tmp1, mask); - - align = vec_lvsr(0, dst); - - tmp2 = vec_perm(d, edges, align); - tmp1 = vec_perm(edges, d, align); - - vec_st(tmp2, 15, dst); - vec_st(tmp1, 0 , dst); - - dst += dst_stride; - } -} - -/* Implemented but could be faster -#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h) -#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h) - */ - - H264_MC(put_, 16, altivec) - H264_MC(avg_, 16, altivec) - - -/**************************************************************************** - * IDCT transform: - ****************************************************************************/ - -#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\ - /* a0 = SRC(0) + SRC(4); */ \ - vec_s16_t a0v = vec_add(s0, s4); \ - /* a2 = SRC(0) - SRC(4); */ \ - vec_s16_t a2v = vec_sub(s0, s4); \ - /* a4 = (SRC(2)>>1) - SRC(6); */ \ - vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \ - /* a6 = (SRC(6)>>1) + SRC(2); */ \ - vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \ - /* b0 = a0 + a6; */ \ - vec_s16_t b0v = vec_add(a0v, a6v); \ - /* b2 = a2 + a4; */ \ - vec_s16_t b2v = vec_add(a2v, a4v); \ - /* b4 = a2 - a4; */ \ - vec_s16_t b4v = vec_sub(a2v, a4v); \ - /* b6 = a0 - a6; */ \ - vec_s16_t b6v = vec_sub(a0v, a6v); \ - /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \ - /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \ - vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \ - /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \ - /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \ - vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\ - /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \ - /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \ - vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\ - /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \ - vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\ - /* b1 = (a7>>2) + a1; */ \ - vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \ - /* b3 = a3 + (a5>>2); */ \ - vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \ - /* b5 = (a3>>2) - a5; */ \ - vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \ - /* b7 = a7 - (a1>>2); */ \ - vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \ - /* DST(0, b0 + b7); */ \ - d0 = vec_add(b0v, b7v); \ - /* DST(1, 
b2 + b5); */ \ - d1 = vec_add(b2v, b5v); \ - /* DST(2, b4 + b3); */ \ - d2 = vec_add(b4v, b3v); \ - /* DST(3, b6 + b1); */ \ - d3 = vec_add(b6v, b1v); \ - /* DST(4, b6 - b1); */ \ - d4 = vec_sub(b6v, b1v); \ - /* DST(5, b4 - b3); */ \ - d5 = vec_sub(b4v, b3v); \ - /* DST(6, b2 - b5); */ \ - d6 = vec_sub(b2v, b5v); \ - /* DST(7, b0 - b7); */ \ - d7 = vec_sub(b0v, b7v); \ -} - -#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \ - /* unaligned load */ \ - vec_u8_t hv = vec_ld( 0, dest ); \ - vec_u8_t lv = vec_ld( 7, dest ); \ - vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \ - vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \ - vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \ - vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \ - vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \ - vec_u8_t edgehv; \ - /* unaligned store */ \ - vec_u8_t bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\ - vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv ); \ - lv = vec_sel( lv, bodyv, edgelv ); \ - vec_st( lv, 7, dest ); \ - hv = vec_ld( 0, dest ); \ - edgehv = vec_perm( zero_u8v, sel, perm_stv ); \ - hv = vec_sel( hv, bodyv, edgehv ); \ - vec_st( hv, 0, dest ); \ - } - -void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { - vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7; - vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7; - vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7; - - vec_u8_t perm_ldv = vec_lvsl(0, dst); - vec_u8_t perm_stv = vec_lvsr(8, dst); - - const vec_u16_t onev = vec_splat_u16(1); - const vec_u16_t twov = vec_splat_u16(2); - const vec_u16_t sixv = vec_splat_u16(6); - - const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0, - -1,-1,-1,-1,-1,-1,-1,-1); - LOAD_ZERO; - - dct[0] += 32; // rounding for the >>6 at the end - - s0 = vec_ld(0x00, (int16_t*)dct); - s1 = vec_ld(0x10, (int16_t*)dct); - s2 = vec_ld(0x20, (int16_t*)dct); - s3 = vec_ld(0x30, (int16_t*)dct); - s4 = vec_ld(0x40, (int16_t*)dct); - s5 = vec_ld(0x50, (int16_t*)dct); - s6 = vec_ld(0x60, (int16_t*)dct); - s7 = vec_ld(0x70, (int16_t*)dct); - - IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, - d0, d1, d2, d3, d4, d5, d6, d7); - - TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 ); - - IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7, - idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7); - - ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); -} - -void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { - -#ifdef HAVE_ALTIVEC - if (has_altivec()) { - c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; - c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec; - c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; - c->h264_idct8_add = ff_h264_idct8_add_altivec; - -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \ - c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \ - c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## 
_mc20_altivec; \ - c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \ - c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \ - c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \ - c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \ - c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \ - c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \ - c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \ - c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \ - c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \ - c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \ - c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \ - c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \ - c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec - - dspfunc(put_h264_qpel, 0, 16); - dspfunc(avg_h264_qpel, 0, 16); -#undef dspfunc - - } else -#endif /* HAVE_ALTIVEC */ - { - // Non-AltiVec PPC optimisations - - // ... pending ... - } -} diff --git a/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c b/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c deleted file mode 100644 index e8ad67f2f..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c +++ /dev/null @@ -1,719 +0,0 @@ -/* - * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* this code assume that stride % 16 == 0 */ -void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { - POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); - signed int ABCD[4] __attribute__((aligned(16))) = - {((8 - x) * (8 - y)), - ((x) * (8 - y)), - ((8 - x) * (y)), - ((x) * (y))}; - register int i; - vector unsigned char fperm; - const vector signed int vABCD = vec_ld(0, ABCD); - const vector signed short vA = vec_splat((vector signed short)vABCD, 1); - const vector signed short vB = vec_splat((vector signed short)vABCD, 3); - const vector signed short vC = vec_splat((vector signed short)vABCD, 5); - const vector signed short vD = vec_splat((vector signed short)vABCD, 7); - const vector signed int vzero = vec_splat_s32(0); - const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); - const vector unsigned short v6us = vec_splat_u16(6); - register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; - register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 
1 : 0; - - vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; - vector unsigned char vsrc0uc, vsrc1uc; - vector signed short vsrc0ssH, vsrc1ssH; - vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; - vector signed short vsrc2ssH, vsrc3ssH, psum; - vector unsigned char vdst, ppsum, vfdst, fsum; - - POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); - - if (((unsigned long)dst) % 16 == 0) { - fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, - 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F); - } else { - fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, - 0x18, 0x19, 0x1A, 0x1B, - 0x1C, 0x1D, 0x1E, 0x1F); - } - - vsrcAuc = vec_ld(0, src); - - if (loadSecond) - vsrcBuc = vec_ld(16, src); - vsrcperm0 = vec_lvsl(0, src); - vsrcperm1 = vec_lvsl(1, src); - - vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcBuc; - else - vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); - - vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc0uc); - vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc1uc); - - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - - - vsrcCuc = vec_ld(stride + 0, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc2uc); - vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc3uc); - - psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); - psum = vec_mladd(vB, vsrc1ssH, psum); - psum = vec_mladd(vC, vsrc2ssH, psum); - psum = vec_mladd(vD, vsrc3ssH, psum); - psum = vec_add(v32ss, psum); - psum = vec_sra(psum, v6us); - - vdst = vec_ld(0, dst); - ppsum = (vector unsigned char)vec_packsu(psum, psum); - vfdst = vec_perm(vdst, ppsum, fperm); - - OP_U8_ALTIVEC(fsum, vfdst, vdst); - - vec_st(fsum, 0, dst); - - vsrc0ssH = vsrc2ssH; - vsrc1ssH = vsrc3ssH; - - dst += stride; - src += stride; - } - } else { - vector unsigned char vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 16, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc3uc = vsrcDuc; - else - vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc2uc); - vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc3uc); - - psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); - psum = vec_mladd(vB, vsrc1ssH, psum); - psum = vec_mladd(vC, vsrc2ssH, psum); - psum = vec_mladd(vD, vsrc3ssH, psum); - psum = vec_add(v32ss, psum); - psum = vec_sr(psum, v6us); - - vdst = vec_ld(0, dst); - ppsum = (vector unsigned char)vec_pack(psum, psum); - vfdst = vec_perm(vdst, ppsum, fperm); - - OP_U8_ALTIVEC(fsum, vfdst, vdst); - - vec_st(fsum, 0, dst); - - vsrc0ssH = vsrc2ssH; - vsrc1ssH = vsrc3ssH; - - dst += stride; - src += stride; - } - } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); -} - -/* this code assume stride % 16 == 0 */ -static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); - register int i; - - const vector signed int vzero = vec_splat_s32(0); - const vector 
unsigned char permM2 = vec_lvsl(-2, src); - const vector unsigned char permM1 = vec_lvsl(-1, src); - const vector unsigned char permP0 = vec_lvsl(+0, src); - const vector unsigned char permP1 = vec_lvsl(+1, src); - const vector unsigned char permP2 = vec_lvsl(+2, src); - const vector unsigned char permP3 = vec_lvsl(+3, src); - const vector signed short v5ss = vec_splat_s16(5); - const vector unsigned short v5us = vec_splat_u16(5); - const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); - const vector unsigned char dstperm = vec_lvsr(0, dst); - const vector unsigned char neg1 = - (const vector unsigned char) vec_splat_s8(-1); - - const vector unsigned char dstmask = - vec_perm((const vector unsigned char)vzero, - neg1, dstperm); - - vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - - register int align = ((((unsigned long)src) - 2) % 16); - - vector signed short srcP0A, srcP0B, srcP1A, srcP1B, - srcP2A, srcP2B, srcP3A, srcP3B, - srcM1A, srcM1B, srcM2A, srcM2B, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, - pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, - psumA, psumB, sumA, sumB; - - vector unsigned char sum, dst1, dst2, vdst, fsum, - rsum, fdst1, fdst2; - - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); - - for (i = 0 ; i < 16 ; i ++) { - vector unsigned char srcR1 = vec_ld(-2, src); - vector unsigned char srcR2 = vec_ld(14, src); - - switch (align) { - default: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = vec_perm(srcR1, srcR2, permP3); - } break; - case 11: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = srcR2; - } break; - case 12: { - vector unsigned char srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = srcR2; - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 13: { - vector unsigned char srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = srcR2; - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 14: { - vector unsigned char srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = srcR2; - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 15: { - vector unsigned char srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = srcR2; - srcP0 = vec_perm(srcR2, srcR3, permP0); - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - } - - srcP0A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP0); - srcP0B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP0); - srcP1A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP1); - srcP1B = (vector signed short) - vec_mergel((vector unsigned char)vzero, 
srcP1); - - srcP2A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP2); - srcP2B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP2); - srcP3A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP3); - srcP3B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP3); - - srcM1A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcM1); - srcM1B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcM1); - srcM2A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcM2); - srcM2B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcM2); - - sum1A = vec_adds(srcP0A, srcP1A); - sum1B = vec_adds(srcP0B, srcP1B); - sum2A = vec_adds(srcM1A, srcP2A); - sum2B = vec_adds(srcM1B, srcP2B); - sum3A = vec_adds(srcM2A, srcP3A); - sum3B = vec_adds(srcM2B, srcP3B); - - pp1A = vec_mladd(sum1A, v20ss, v16ss); - pp1B = vec_mladd(sum1B, v20ss, v16ss); - - pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); - pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); - - pp3A = vec_add(sum3A, pp1A); - pp3B = vec_add(sum3B, pp1B); - - psumA = vec_sub(pp3A, pp2A); - psumB = vec_sub(pp3B, pp2B); - - sumA = vec_sra(psumA, v5us); - sumB = vec_sra(psumB, v5us); - - sum = vec_packsu(sumA, sumB); - - dst1 = vec_ld(0, dst); - dst2 = vec_ld(16, dst); - vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); - - OP_U8_ALTIVEC(fsum, sum, vdst); - - rsum = vec_perm(fsum, fsum, dstperm); - fdst1 = vec_sel(dst1, rsum, dstmask); - fdst2 = vec_sel(rsum, dst2, dstmask); - - vec_st(fdst1, 0, dst); - vec_st(fdst2, 16, dst); - - src += srcStride; - dst += dstStride; - } -POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); -} - -/* this code assume stride % 16 == 0 */ -static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); - - register int i; - - const vector signed int vzero = vec_splat_s32(0); - const vector unsigned char perm = vec_lvsl(0, src); - const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vector unsigned short v5us = vec_splat_u16(5); - const vector signed short v5ss = vec_splat_s16(5); - const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); - const vector unsigned char dstperm = vec_lvsr(0, dst); - const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); - const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); - - uint8_t *srcbis = src - (srcStride * 2); - - const vector unsigned char srcM2a = vec_ld(0, srcbis); - const vector unsigned char srcM2b = vec_ld(16, srcbis); - const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm); -// srcbis += srcStride; - const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride); - const vector unsigned char srcM1b = vec_ld(16, srcbis); - const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm); -// srcbis += srcStride; - const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride); - const vector unsigned char srcP0b = vec_ld(16, srcbis); - const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm); -// srcbis += srcStride; - const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride); - const vector unsigned char srcP1b = vec_ld(16, srcbis); - const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm); -// srcbis += srcStride; - const vector unsigned char 
srcP2a = vec_ld(0, srcbis += srcStride); - const vector unsigned char srcP2b = vec_ld(16, srcbis); - const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm); -// srcbis += srcStride; - - vector signed short srcM2ssA = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcM2); - vector signed short srcM2ssB = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcM2); - vector signed short srcM1ssA = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcM1); - vector signed short srcM1ssB = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcM1); - vector signed short srcP0ssA = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP0); - vector signed short srcP0ssB = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP0); - vector signed short srcP1ssA = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP1); - vector signed short srcP1ssB = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP1); - vector signed short srcP2ssA = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP2); - vector signed short srcP2ssB = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP2); - - vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, - psumA, psumB, sumA, sumB, - srcP3ssA, srcP3ssB, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; - - vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, - srcP3a, srcP3b, srcP3; - - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); - - for (i = 0 ; i < 16 ; i++) { - srcP3a = vec_ld(0, srcbis += srcStride); - srcP3b = vec_ld(16, srcbis); - srcP3 = vec_perm(srcP3a, srcP3b, perm); - srcP3ssA = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP3); - srcP3ssB = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP3); -// srcbis += srcStride; - - sum1A = vec_adds(srcP0ssA, srcP1ssA); - sum1B = vec_adds(srcP0ssB, srcP1ssB); - sum2A = vec_adds(srcM1ssA, srcP2ssA); - sum2B = vec_adds(srcM1ssB, srcP2ssB); - sum3A = vec_adds(srcM2ssA, srcP3ssA); - sum3B = vec_adds(srcM2ssB, srcP3ssB); - - srcM2ssA = srcM1ssA; - srcM2ssB = srcM1ssB; - srcM1ssA = srcP0ssA; - srcM1ssB = srcP0ssB; - srcP0ssA = srcP1ssA; - srcP0ssB = srcP1ssB; - srcP1ssA = srcP2ssA; - srcP1ssB = srcP2ssB; - srcP2ssA = srcP3ssA; - srcP2ssB = srcP3ssB; - - pp1A = vec_mladd(sum1A, v20ss, v16ss); - pp1B = vec_mladd(sum1B, v20ss, v16ss); - - pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); - pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); - - pp3A = vec_add(sum3A, pp1A); - pp3B = vec_add(sum3B, pp1B); - - psumA = vec_sub(pp3A, pp2A); - psumB = vec_sub(pp3B, pp2B); - - sumA = vec_sra(psumA, v5us); - sumB = vec_sra(psumB, v5us); - - sum = vec_packsu(sumA, sumB); - - dst1 = vec_ld(0, dst); - dst2 = vec_ld(16, dst); - vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); - - OP_U8_ALTIVEC(fsum, sum, vdst); - - rsum = vec_perm(fsum, fsum, dstperm); - fdst1 = vec_sel(dst1, rsum, dstmask); - fdst2 = vec_sel(rsum, dst2, dstmask); - - vec_st(fdst1, 0, dst); - vec_st(fdst2, 16, dst); - - dst += dstStride; - } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); -} - -/* this code assume stride % 16 == 0 *and* tmp is properly aligned */ -static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); - register int i; - const vector signed int vzero = 
vec_splat_s32(0); - const vector unsigned char permM2 = vec_lvsl(-2, src); - const vector unsigned char permM1 = vec_lvsl(-1, src); - const vector unsigned char permP0 = vec_lvsl(+0, src); - const vector unsigned char permP1 = vec_lvsl(+1, src); - const vector unsigned char permP2 = vec_lvsl(+2, src); - const vector unsigned char permP3 = vec_lvsl(+3, src); - const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vector unsigned int v10ui = vec_splat_u32(10); - const vector signed short v5ss = vec_splat_s16(5); - const vector signed short v1ss = vec_splat_s16(1); - const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); - const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); - - register int align = ((((unsigned long)src) - 2) % 16); - - const vector unsigned char neg1 = (const vector unsigned char) - vec_splat_s8(-1); - - vector signed short srcP0A, srcP0B, srcP1A, srcP1B, - srcP2A, srcP2B, srcP3A, srcP3B, - srcM1A, srcM1B, srcM2A, srcM2B, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, - pp1A, pp1B, pp2A, pp2B, psumA, psumB; - - const vector unsigned char dstperm = vec_lvsr(0, dst); - - const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); - - const vector unsigned char mperm = (const vector unsigned char) - AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, - 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); - int16_t *tmpbis = tmp; - - vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, - tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, - tmpP2ssA, tmpP2ssB; - - vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, - pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, - pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, - ssumAe, ssumAo, ssumBe, ssumBo; - vector unsigned char fsum, sumv, sum, dst1, dst2, vdst, - rsum, fdst1, fdst2; - vector signed short ssume, ssumo; - - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); - src -= (2 * srcStride); - for (i = 0 ; i < 21 ; i ++) { - vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - vector unsigned char srcR1 = vec_ld(-2, src); - vector unsigned char srcR2 = vec_ld(14, src); - - switch (align) { - default: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = vec_perm(srcR1, srcR2, permP3); - } break; - case 11: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = srcR2; - } break; - case 12: { - vector unsigned char srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = srcR2; - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 13: { - vector unsigned char srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = srcR2; - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 14: { - vector unsigned char srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = srcR2; - srcP1 = 
vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 15: { - vector unsigned char srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = srcR2; - srcP0 = vec_perm(srcR2, srcR3, permP0); - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - } - - srcP0A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP0); - srcP0B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP0); - srcP1A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP1); - srcP1B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP1); - - srcP2A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP2); - srcP2B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP2); - srcP3A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP3); - srcP3B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP3); - - srcM1A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcM1); - srcM1B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcM1); - srcM2A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcM2); - srcM2B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcM2); - - sum1A = vec_adds(srcP0A, srcP1A); - sum1B = vec_adds(srcP0B, srcP1B); - sum2A = vec_adds(srcM1A, srcP2A); - sum2B = vec_adds(srcM1B, srcP2B); - sum3A = vec_adds(srcM2A, srcP3A); - sum3B = vec_adds(srcM2B, srcP3B); - - pp1A = vec_mladd(sum1A, v20ss, sum3A); - pp1B = vec_mladd(sum1B, v20ss, sum3B); - - pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); - pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); - - psumA = vec_sub(pp1A, pp2A); - psumB = vec_sub(pp1B, pp2B); - - vec_st(psumA, 0, tmp); - vec_st(psumB, 16, tmp); - - src += srcStride; - tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ - } - - tmpM2ssA = vec_ld(0, tmpbis); - tmpM2ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpM1ssA = vec_ld(0, tmpbis); - tmpM1ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP0ssA = vec_ld(0, tmpbis); - tmpP0ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP1ssA = vec_ld(0, tmpbis); - tmpP1ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP2ssA = vec_ld(0, tmpbis); - tmpP2ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - - for (i = 0 ; i < 16 ; i++) { - const vector signed short tmpP3ssA = vec_ld(0, tmpbis); - const vector signed short tmpP3ssB = vec_ld(16, tmpbis); - - const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA); - const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB); - const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA); - const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB); - const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA); - const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB); - - tmpbis += tmpStride; - - tmpM2ssA = tmpM1ssA; - tmpM2ssB = tmpM1ssB; - tmpM1ssA = tmpP0ssA; - tmpM1ssB = tmpP0ssB; - tmpP0ssA = tmpP1ssA; - tmpP0ssB = tmpP1ssB; - tmpP1ssA = tmpP2ssA; - tmpP1ssB = tmpP2ssB; - tmpP2ssA = tmpP3ssA; - tmpP2ssB = tmpP3ssB; - - pp1Ae = vec_mule(sum1A, v20ss); - pp1Ao = vec_mulo(sum1A, v20ss); - pp1Be = vec_mule(sum1B, v20ss); - pp1Bo = vec_mulo(sum1B, v20ss); - - pp2Ae = vec_mule(sum2A, v5ss); - pp2Ao = vec_mulo(sum2A, v5ss); - pp2Be = vec_mule(sum2B, 
v5ss); - pp2Bo = vec_mulo(sum2B, v5ss); - - pp3Ae = vec_sra((vector signed int)sum3A, v16ui); - pp3Ao = vec_mulo(sum3A, v1ss); - pp3Be = vec_sra((vector signed int)sum3B, v16ui); - pp3Bo = vec_mulo(sum3B, v1ss); - - pp1cAe = vec_add(pp1Ae, v512si); - pp1cAo = vec_add(pp1Ao, v512si); - pp1cBe = vec_add(pp1Be, v512si); - pp1cBo = vec_add(pp1Bo, v512si); - - pp32Ae = vec_sub(pp3Ae, pp2Ae); - pp32Ao = vec_sub(pp3Ao, pp2Ao); - pp32Be = vec_sub(pp3Be, pp2Be); - pp32Bo = vec_sub(pp3Bo, pp2Bo); - - sumAe = vec_add(pp1cAe, pp32Ae); - sumAo = vec_add(pp1cAo, pp32Ao); - sumBe = vec_add(pp1cBe, pp32Be); - sumBo = vec_add(pp1cBo, pp32Bo); - - ssumAe = vec_sra(sumAe, v10ui); - ssumAo = vec_sra(sumAo, v10ui); - ssumBe = vec_sra(sumBe, v10ui); - ssumBo = vec_sra(sumBo, v10ui); - - ssume = vec_packs(ssumAe, ssumBe); - ssumo = vec_packs(ssumAo, ssumBo); - - sumv = vec_packsu(ssume, ssumo); - sum = vec_perm(sumv, sumv, mperm); - - dst1 = vec_ld(0, dst); - dst2 = vec_ld(16, dst); - vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); - - OP_U8_ALTIVEC(fsum, sum, vdst); - - rsum = vec_perm(fsum, fsum, dstperm); - fdst1 = vec_sel(dst1, rsum, dstmask); - fdst2 = vec_sel(rsum, dst2, dstmask); - - vec_st(fdst1, 0, dst); - vec_st(fdst2, 16, dst); - - dst += dstStride; - } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); -} diff --git a/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c b/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c deleted file mode 100644 index 66c8082f7..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c +++ /dev/null @@ -1,237 +0,0 @@ -/* - * Copyright (c) 2001 Michel Lespinasse - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -/* - * NOTE: This code is based on GPL code from the libmpeg2 project. The - * author, Michel Lespinasses, has given explicit permission to release - * under LGPL as part of ffmpeg. - * - */ - -/* - * FFMpeg integration by Dieter Shirley - * - * This file is a direct copy of the altivec idct module from the libmpeg2 - * project. I've deleted all of the libmpeg2 specific code, renamed the functions and - * re-ordered the function parameters. The only change to the IDCT function - * itself was to factor out the partial transposition, and to perform a full - * transpose at the end of the function. 
- */ - - -#include <stdlib.h> /* malloc(), free() */ -#include <string.h> -#include "../dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_altivec.h" - -#define vector_s16_t vector signed short -#define const_vector_s16_t const_vector signed short -#define vector_u16_t vector unsigned short -#define vector_s8_t vector signed char -#define vector_u8_t vector unsigned char -#define vector_s32_t vector signed int -#define vector_u32_t vector unsigned int - -#define IDCT_HALF \ - /* 1st stage */ \ - t1 = vec_mradds (a1, vx7, vx1 ); \ - t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \ - t7 = vec_mradds (a2, vx5, vx3); \ - t3 = vec_mradds (ma2, vx3, vx5); \ - \ - /* 2nd stage */ \ - t5 = vec_adds (vx0, vx4); \ - t0 = vec_subs (vx0, vx4); \ - t2 = vec_mradds (a0, vx6, vx2); \ - t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \ - t6 = vec_adds (t8, t3); \ - t3 = vec_subs (t8, t3); \ - t8 = vec_subs (t1, t7); \ - t1 = vec_adds (t1, t7); \ - \ - /* 3rd stage */ \ - t7 = vec_adds (t5, t2); \ - t2 = vec_subs (t5, t2); \ - t5 = vec_adds (t0, t4); \ - t0 = vec_subs (t0, t4); \ - t4 = vec_subs (t8, t3); \ - t3 = vec_adds (t8, t3); \ - \ - /* 4th stage */ \ - vy0 = vec_adds (t7, t1); \ - vy7 = vec_subs (t7, t1); \ - vy1 = vec_mradds (c4, t3, t5); \ - vy6 = vec_mradds (mc4, t3, t5); \ - vy2 = vec_mradds (c4, t4, t0); \ - vy5 = vec_mradds (mc4, t4, t0); \ - vy3 = vec_adds (t2, t6); \ - vy4 = vec_subs (t2, t6); - - -#define IDCT \ - vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \ - vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ - vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias; \ - vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \ - vector_u16_t shift; \ - \ - c4 = vec_splat (constants[0], 0); \ - a0 = vec_splat (constants[0], 1); \ - a1 = vec_splat (constants[0], 2); \ - a2 = vec_splat (constants[0], 3); \ - mc4 = vec_splat (constants[0], 4); \ - ma2 = vec_splat (constants[0], 5); \ - bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \ - \ - zero = vec_splat_s16 (0); \ - shift = vec_splat_u16 (4); \ - \ - vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \ - vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \ - vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \ - vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \ - vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \ - vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \ - vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \ - vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \ - \ - IDCT_HALF \ - \ - vx0 = vec_mergeh (vy0, vy4); \ - vx1 = vec_mergel (vy0, vy4); \ - vx2 = vec_mergeh (vy1, vy5); \ - vx3 = vec_mergel (vy1, vy5); \ - vx4 = vec_mergeh (vy2, vy6); \ - vx5 = vec_mergel (vy2, vy6); \ - vx6 = vec_mergeh (vy3, vy7); \ - vx7 = vec_mergel (vy3, vy7); \ - \ - vy0 = vec_mergeh (vx0, vx4); \ - vy1 = vec_mergel (vx0, vx4); \ - vy2 = vec_mergeh (vx1, vx5); \ - vy3 = vec_mergel (vx1, vx5); \ - vy4 = vec_mergeh (vx2, vx6); \ - vy5 = vec_mergel (vx2, vx6); \ - vy6 = vec_mergeh (vx3, vx7); \ - vy7 = vec_mergel (vx3, vx7); \ - \ - vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \ - vx1 = vec_mergel (vy0, vy4); \ - vx2 = vec_mergeh (vy1, vy5); \ - vx3 = vec_mergel (vy1, vy5); \ - vx4 = vec_mergeh (vy2, vy6); \ - vx5 = vec_mergel (vy2, vy6); \ - vx6 = vec_mergeh (vy3, vy7); \ - vx7 = vec_mergel (vy3, vy7); \ - \ - IDCT_HALF \ - \ - shift = vec_splat_u16 (6); \ - vx0 = vec_sra (vy0, shift); \ - vx1 = vec_sra 
(vy1, shift); \ - vx2 = vec_sra (vy2, shift); \ - vx3 = vec_sra (vy3, shift); \ - vx4 = vec_sra (vy4, shift); \ - vx5 = vec_sra (vy5, shift); \ - vx6 = vec_sra (vy6, shift); \ - vx7 = vec_sra (vy7, shift); - - -static const_vector_s16_t constants[5] = { - (vector_s16_t) AVV(23170, 13573, 6518, 21895, -23170, -21895, 32, 31), - (vector_s16_t) AVV(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725), - (vector_s16_t) AVV(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521), - (vector_s16_t) AVV(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692), - (vector_s16_t) AVV(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722) -}; - -void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block) -{ -POWERPC_PERF_DECLARE(altivec_idct_put_num, 1); - vector_u8_t tmp; - -#ifdef CONFIG_POWERPC_PERF -POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); -#endif - IDCT - -#define COPY(dest,src) \ - tmp = vec_packsu (src, src); \ - vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \ - vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest); - - COPY (dest, vx0) dest += stride; - COPY (dest, vx1) dest += stride; - COPY (dest, vx2) dest += stride; - COPY (dest, vx3) dest += stride; - COPY (dest, vx4) dest += stride; - COPY (dest, vx5) dest += stride; - COPY (dest, vx6) dest += stride; - COPY (dest, vx7) - -POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1); -} - -void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block) -{ -POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); - vector_u8_t tmp; - vector_s16_t tmp2, tmp3; - vector_u8_t perm0; - vector_u8_t perm1; - vector_u8_t p0, p1, p; - -#ifdef CONFIG_POWERPC_PERF -POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); -#endif - - IDCT - - p0 = vec_lvsl (0, dest); - p1 = vec_lvsl (stride, dest); - p = vec_splat_u8 (-1); - perm0 = vec_mergeh (p, p0); - perm1 = vec_mergeh (p, p1); - -#define ADD(dest,src,perm) \ - /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ - tmp = vec_ld (0, dest); \ - tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm); \ - tmp3 = vec_adds (tmp2, src); \ - tmp = vec_packsu (tmp3, tmp3); \ - vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \ - vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest); - - ADD (dest, vx0, perm0) dest += stride; - ADD (dest, vx1, perm1) dest += stride; - ADD (dest, vx2, perm0) dest += stride; - ADD (dest, vx3, perm1) dest += stride; - ADD (dest, vx4, perm0) dest += stride; - ADD (dest, vx5, perm1) dest += stride; - ADD (dest, vx6, perm0) dest += stride; - ADD (dest, vx7, perm1) - -POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1); -} - diff --git a/contrib/ffmpeg/libavcodec/ppc/mathops.h b/contrib/ffmpeg/libavcodec/ppc/mathops.h deleted file mode 100644 index 6af23f246..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/mathops.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * simple math operations - * Copyright (c) 2001, 2002 Fabrice Bellard. - * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#if defined(ARCH_POWERPC_405) -/* signed 16x16 -> 32 multiply add accumulate */ -# define MAC16(rt, ra, rb) \ - asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); - -/* signed 16x16 -> 32 multiply */ -# define MUL16(ra, rb) \ - ({ int __rt; - asm ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); - __rt; }) -#endif diff --git a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c b/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c deleted file mode 100644 index 3822cb20e..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c +++ /dev/null @@ -1,603 +0,0 @@ -/* - * Copyright (c) 2002 Dieter Shirley - * - * dct_unquantize_h263_altivec: - * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdlib.h> -#include <stdio.h> -#include "../dsputil.h" -#include "../mpegvideo.h" - -#include "gcc_fixes.h" - -#include "dsputil_altivec.h" - -// Swaps two variables (used for altivec registers) -#define SWAP(a,b) \ -do { \ - __typeof__(a) swap_temp=a; \ - a=b; \ - b=swap_temp; \ -} while (0) - -// transposes a matrix consisting of four vectors with four elements each -#define TRANSPOSE4(a,b,c,d) \ -do { \ - __typeof__(a) _trans_ach = vec_mergeh(a, c); \ - __typeof__(a) _trans_acl = vec_mergel(a, c); \ - __typeof__(a) _trans_bdh = vec_mergeh(b, d); \ - __typeof__(a) _trans_bdl = vec_mergel(b, d); \ - \ - a = vec_mergeh(_trans_ach, _trans_bdh); \ - b = vec_mergel(_trans_ach, _trans_bdh); \ - c = vec_mergeh(_trans_acl, _trans_bdl); \ - d = vec_mergel(_trans_acl, _trans_bdl); \ -} while (0) - - -// Loads a four-byte value (int or float) from the target address -// into every element in the target vector. Only works if the -// target address is four-byte aligned (which should be always). 
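The LOAD4 macro defined just below broadcasts one aligned 32-bit value into every lane of an AltiVec register (load, align with vec_perm, then vec_splat element 0). A minimal plain-C sketch of the same broadcast, with broadcast4() as a purely illustrative helper name rather than anything from FFmpeg:

    #include <stdint.h>

    /* Illustrative sketch only: scalar equivalent of the LOAD4 broadcast. */
    static void broadcast4(int32_t lanes[4], const int32_t *address)
    {
        int i;
        for (i = 0; i < 4; i++)
            lanes[i] = *address;   /* every lane receives the same 32-bit value */
    }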
-#define LOAD4(vec, address) \ -{ \ - __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \ - vector unsigned char _perm_vec = vec_lvsl(0,(address)); \ - vec = vec_ld(0, _load_addr); \ - vec = vec_perm(vec, vec, _perm_vec); \ - vec = vec_splat(vec, 0); \ -} - - -#ifdef CONFIG_DARWIN -#define FOUROF(a) (a) -#else -// slower, for dumb non-apple GCC -#define FOUROF(a) {a,a,a,a} -#endif -int dct_quantize_altivec(MpegEncContext* s, - DCTELEM* data, int n, - int qscale, int* overflow) -{ - int lastNonZero; - vector float row0, row1, row2, row3, row4, row5, row6, row7; - vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7; - const_vector float zero = (const_vector float)FOUROF(0.); - // used after quantise step - int oldBaseValue = 0; - - // Load the data into the row/alt vectors - { - vector signed short data0, data1, data2, data3, data4, data5, data6, data7; - - data0 = vec_ld(0, data); - data1 = vec_ld(16, data); - data2 = vec_ld(32, data); - data3 = vec_ld(48, data); - data4 = vec_ld(64, data); - data5 = vec_ld(80, data); - data6 = vec_ld(96, data); - data7 = vec_ld(112, data); - - // Transpose the data before we start - TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); - - // load the data into floating point vectors. We load - // the high half of each row into the main row vectors - // and the low half into the alt vectors. - row0 = vec_ctf(vec_unpackh(data0), 0); - alt0 = vec_ctf(vec_unpackl(data0), 0); - row1 = vec_ctf(vec_unpackh(data1), 0); - alt1 = vec_ctf(vec_unpackl(data1), 0); - row2 = vec_ctf(vec_unpackh(data2), 0); - alt2 = vec_ctf(vec_unpackl(data2), 0); - row3 = vec_ctf(vec_unpackh(data3), 0); - alt3 = vec_ctf(vec_unpackl(data3), 0); - row4 = vec_ctf(vec_unpackh(data4), 0); - alt4 = vec_ctf(vec_unpackl(data4), 0); - row5 = vec_ctf(vec_unpackh(data5), 0); - alt5 = vec_ctf(vec_unpackl(data5), 0); - row6 = vec_ctf(vec_unpackh(data6), 0); - alt6 = vec_ctf(vec_unpackl(data6), 0); - row7 = vec_ctf(vec_unpackh(data7), 0); - alt7 = vec_ctf(vec_unpackl(data7), 0); - } - - // The following block could exist as a separate an altivec dct - // function. However, if we put it inline, the DCT data can remain - // in the vector local variables, as floats, which we'll use during the - // quantize step... 
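Each vec_add/vec_sub pair in the pass below performs one butterfly of the reference float DCT across four columns at once; the original scalar expressions are kept as comments beside the vector operations. As a plain-C sketch mirroring those comments, the first butterfly stage of one 8-sample row looks like this (illustrative only):

    /* Illustrative sketch: first DCT butterfly stage for one row of 8 floats. */
    static void dct_butterfly_stage1(const float row[8], float tmp[8])
    {
        tmp[0] = row[0] + row[7];
        tmp[7] = row[0] - row[7];
        tmp[1] = row[1] + row[6];
        tmp[6] = row[1] - row[6];
        tmp[2] = row[2] + row[5];
        tmp[5] = row[2] - row[5];
        tmp[3] = row[3] + row[4];
        tmp[4] = row[3] - row[4];
    }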
- { - const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f); - const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f); - const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f); - const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f); - const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f); - const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f); - const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f); - const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f); - const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f); - const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f); - const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f); - const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f); - - - int whichPass, whichHalf; - - for(whichPass = 1; whichPass<=2; whichPass++) - { - for(whichHalf = 1; whichHalf<=2; whichHalf++) - { - vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - vector float tmp10, tmp11, tmp12, tmp13; - vector float z1, z2, z3, z4, z5; - - tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7]; - tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7]; - tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4]; - tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4]; - tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6]; - tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6]; - tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5]; - tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5]; - - tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3; - tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3; - tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2; - tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2; - - - // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS); - row0 = vec_add(tmp10, tmp11); - - // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS); - row4 = vec_sub(tmp10, tmp11); - - - // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); - z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero); - - // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), - // CONST_BITS-PASS1_BITS); - row2 = vec_madd(tmp13, vec_0_765366865, z1); - - // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), - // CONST_BITS-PASS1_BITS); - row6 = vec_madd(tmp12, vec_1_847759065, z1); - - z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7; - z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6; - z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6; - z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7; - - // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero); - - // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ - z3 = vec_madd(z3, vec_1_961570560, z5); - - // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - z4 = vec_madd(z4, vec_0_390180644, z5); - - // The following adds are rolled into the multiplies above - // z3 = vec_add(z3, z5); // z3 += z5; - // z4 = vec_add(z4, z5); // z4 += z5; - - // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ - // Wow! It's actually more effecient to roll this multiply - // into the adds below, even thought the multiply gets done twice! 
- // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero); - - // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ - // Same with this one... - // z1 = vec_madd(z1, vec_0_899976223, (vector float)zero); - - // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ - // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); - row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3)); - - // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ - // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); - row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4)); - - // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ - // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); - row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3)); - - // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ - // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); - row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4)); - - // Swap the row values with the alts. If this is the first half, - // this sets up the low values to be acted on in the second half. - // If this is the second half, it puts the high values back in - // the row values where they are expected to be when we're done. - SWAP(row0, alt0); - SWAP(row1, alt1); - SWAP(row2, alt2); - SWAP(row3, alt3); - SWAP(row4, alt4); - SWAP(row5, alt5); - SWAP(row6, alt6); - SWAP(row7, alt7); - } - - if (whichPass == 1) - { - // transpose the data for the second pass - - // First, block transpose the upper right with lower left. - SWAP(row4, alt0); - SWAP(row5, alt1); - SWAP(row6, alt2); - SWAP(row7, alt3); - - // Now, transpose each block of four - TRANSPOSE4(row0, row1, row2, row3); - TRANSPOSE4(row4, row5, row6, row7); - TRANSPOSE4(alt0, alt1, alt2, alt3); - TRANSPOSE4(alt4, alt5, alt6, alt7); - } - } - } - - // perform the quantise step, using the floating point data - // still in the row/alt registers - { - const int* biasAddr; - const vector signed int* qmat; - vector float bias, negBias; - - if (s->mb_intra) - { - vector signed int baseVector; - - // We must cache element 0 in the intra case - // (it needs special handling). - baseVector = vec_cts(vec_splat(row0, 0), 0); - vec_ste(baseVector, 0, &oldBaseValue); - - qmat = (vector signed int*)s->q_intra_matrix[qscale]; - biasAddr = &(s->intra_quant_bias); - } - else - { - qmat = (vector signed int*)s->q_inter_matrix[qscale]; - biasAddr = &(s->inter_quant_bias); - } - - // Load the bias vector (We add 0.5 to the bias so that we're - // rounding when we convert to int, instead of flooring.) 
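The quantise step below multiplies each coefficient by its reciprocal quantiser and adds a sign-matched bias, so that the truncating vec_cts conversion later on behaves like round-to-nearest (the 0.5 mentioned above is folded into that bias). A scalar sketch of the same scheme, assuming q is the per-coefficient quantiser factor and b the already-scaled bias:

    /* Illustrative sketch: rounding via a sign-matched bias before truncation. */
    static int quantize_round(float coeff, float q, float b)
    {
        float scaled = coeff * q;
        /* truncation toward zero after the bias add acts as round-to-nearest */
        return (int)(scaled + (coeff > 0.0f ? b : -b));
    }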
- { - vector signed int biasInt; - const vector float negOneFloat = (vector float)FOUROF(-1.0f); - LOAD4(biasInt, biasAddr); - bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT); - negBias = vec_madd(bias, negOneFloat, zero); - } - - { - vector float q0, q1, q2, q3, q4, q5, q6, q7; - - q0 = vec_ctf(qmat[0], QMAT_SHIFT); - q1 = vec_ctf(qmat[2], QMAT_SHIFT); - q2 = vec_ctf(qmat[4], QMAT_SHIFT); - q3 = vec_ctf(qmat[6], QMAT_SHIFT); - q4 = vec_ctf(qmat[8], QMAT_SHIFT); - q5 = vec_ctf(qmat[10], QMAT_SHIFT); - q6 = vec_ctf(qmat[12], QMAT_SHIFT); - q7 = vec_ctf(qmat[14], QMAT_SHIFT); - - row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias), - vec_cmpgt(row0, zero)); - row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias), - vec_cmpgt(row1, zero)); - row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias), - vec_cmpgt(row2, zero)); - row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias), - vec_cmpgt(row3, zero)); - row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias), - vec_cmpgt(row4, zero)); - row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias), - vec_cmpgt(row5, zero)); - row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias), - vec_cmpgt(row6, zero)); - row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias), - vec_cmpgt(row7, zero)); - - q0 = vec_ctf(qmat[1], QMAT_SHIFT); - q1 = vec_ctf(qmat[3], QMAT_SHIFT); - q2 = vec_ctf(qmat[5], QMAT_SHIFT); - q3 = vec_ctf(qmat[7], QMAT_SHIFT); - q4 = vec_ctf(qmat[9], QMAT_SHIFT); - q5 = vec_ctf(qmat[11], QMAT_SHIFT); - q6 = vec_ctf(qmat[13], QMAT_SHIFT); - q7 = vec_ctf(qmat[15], QMAT_SHIFT); - - alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias), - vec_cmpgt(alt0, zero)); - alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias), - vec_cmpgt(alt1, zero)); - alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias), - vec_cmpgt(alt2, zero)); - alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias), - vec_cmpgt(alt3, zero)); - alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias), - vec_cmpgt(alt4, zero)); - alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias), - vec_cmpgt(alt5, zero)); - alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias), - vec_cmpgt(alt6, zero)); - alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias), - vec_cmpgt(alt7, zero)); - } - - - } - - // Store the data back into the original block - { - vector signed short data0, data1, data2, data3, data4, data5, data6, data7; - - data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0)); - data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0)); - data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0)); - data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0)); - data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0)); - data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0)); - data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0)); - data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0)); - - { - // Clamp for overflow - vector signed int max_q_int, min_q_int; - vector signed short max_q, min_q; - - LOAD4(max_q_int, &(s->max_qcoeff)); - LOAD4(min_q_int, &(s->min_qcoeff)); - - max_q = vec_pack(max_q_int, max_q_int); - min_q = vec_pack(min_q_int, min_q_int); - - data0 = vec_max(vec_min(data0, max_q), min_q); - data1 = vec_max(vec_min(data1, max_q), min_q); - data2 = vec_max(vec_min(data2, max_q), min_q); - data4 = vec_max(vec_min(data4, max_q), min_q); - data5 = vec_max(vec_min(data5, max_q), 
min_q); - data6 = vec_max(vec_min(data6, max_q), min_q); - data7 = vec_max(vec_min(data7, max_q), min_q); - } - - { - vector bool char zero_01, zero_23, zero_45, zero_67; - vector signed char scanIndices_01, scanIndices_23, scanIndices_45, scanIndices_67; - vector signed char negOne = vec_splat_s8(-1); - vector signed char* scanPtr = - (vector signed char*)(s->intra_scantable.inverse); - signed char lastNonZeroChar; - - // Determine the largest non-zero index. - zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero), - vec_cmpeq(data1, (vector signed short)zero)); - zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero), - vec_cmpeq(data3, (vector signed short)zero)); - zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero), - vec_cmpeq(data5, (vector signed short)zero)); - zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero), - vec_cmpeq(data7, (vector signed short)zero)); - - // 64 biggest values - scanIndices_01 = vec_sel(scanPtr[0], negOne, zero_01); - scanIndices_23 = vec_sel(scanPtr[1], negOne, zero_23); - scanIndices_45 = vec_sel(scanPtr[2], negOne, zero_45); - scanIndices_67 = vec_sel(scanPtr[3], negOne, zero_67); - - // 32 largest values - scanIndices_01 = vec_max(scanIndices_01, scanIndices_23); - scanIndices_45 = vec_max(scanIndices_45, scanIndices_67); - - // 16 largest values - scanIndices_01 = vec_max(scanIndices_01, scanIndices_45); - - // 8 largest values - scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), - vec_mergel(scanIndices_01, negOne)); - - // 4 largest values - scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), - vec_mergel(scanIndices_01, negOne)); - - // 2 largest values - scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), - vec_mergel(scanIndices_01, negOne)); - - // largest value - scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), - vec_mergel(scanIndices_01, negOne)); - - scanIndices_01 = vec_splat(scanIndices_01, 0); - - - vec_ste(scanIndices_01, 0, &lastNonZeroChar); - - lastNonZero = lastNonZeroChar; - - // While the data is still in vectors we check for the transpose IDCT permute - // and handle it using the vector unit if we can. This is the permute used - // by the altivec idct, so it is common when using the altivec dct. - - if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) - { - TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); - } - - vec_st(data0, 0, data); - vec_st(data1, 16, data); - vec_st(data2, 32, data); - vec_st(data3, 48, data); - vec_st(data4, 64, data); - vec_st(data5, 80, data); - vec_st(data6, 96, data); - vec_st(data7, 112, data); - } - } - - // special handling of block[0] - if (s->mb_intra) - { - if (!s->h263_aic) - { - if (n < 4) - oldBaseValue /= s->y_dc_scale; - else - oldBaseValue /= s->c_dc_scale; - } - - // Divide by 8, rounding the result - data[0] = (oldBaseValue + 4) >> 3; - } - - // We handled the tranpose permutation above and we don't - // need to permute the "no" permutation case. 
- if ((lastNonZero > 0) && - (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) && - (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) - { - ff_block_permute(data, s->dsp.idct_permutation, - s->intra_scantable.scantable, lastNonZero); - } - - return lastNonZero; -} -#undef FOUROF - -/* - AltiVec version of dct_unquantize_h263 - this code assumes `block' is 16 bytes-aligned -*/ -void dct_unquantize_h263_altivec(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ -POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1); - int i, level, qmul, qadd; - int nCoeffs; - - assert(s->block_last_index[n]>=0); - -POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1); - - qadd = (qscale - 1) | 1; - qmul = qscale << 1; - - if (s->mb_intra) { - if (!s->h263_aic) { - if (n < 4) - block[0] = block[0] * s->y_dc_scale; - else - block[0] = block[0] * s->c_dc_scale; - }else - qadd = 0; - i = 1; - nCoeffs= 63; //does not allways use zigzag table - } else { - i = 0; - nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; - } - - { - register const_vector signed short vczero = (const_vector signed short)vec_splat_s16(0); - short __attribute__ ((aligned(16))) qmul8[] = - { - qmul, qmul, qmul, qmul, - qmul, qmul, qmul, qmul - }; - short __attribute__ ((aligned(16))) qadd8[] = - { - qadd, qadd, qadd, qadd, - qadd, qadd, qadd, qadd - }; - short __attribute__ ((aligned(16))) nqadd8[] = - { - -qadd, -qadd, -qadd, -qadd, - -qadd, -qadd, -qadd, -qadd - }; - register vector signed short blockv, qmulv, qaddv, nqaddv, temp1; - register vector bool short blockv_null, blockv_neg; - register short backup_0 = block[0]; - register int j = 0; - - qmulv = vec_ld(0, qmul8); - qaddv = vec_ld(0, qadd8); - nqaddv = vec_ld(0, nqadd8); - -#if 0 // block *is* 16 bytes-aligned, it seems. - // first make sure block[j] is 16 bytes-aligned - for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { - level = block[j]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; - } - block[j] = level; - } - } -#endif - - // vectorize all the 16 bytes-aligned blocks - // of 8 elements - for(; (j + 7) <= nCoeffs ; j+=8) - { - blockv = vec_ld(j << 1, block); - blockv_neg = vec_cmplt(blockv, vczero); - blockv_null = vec_cmpeq(blockv, vczero); - // choose between +qadd or -qadd as the third operand - temp1 = vec_sel(qaddv, nqaddv, blockv_neg); - // multiply & add (block{i,i+7} * qmul [+-] qadd) - temp1 = vec_mladd(blockv, qmulv, temp1); - // put 0 where block[{i,i+7} used to have 0 - blockv = vec_sel(temp1, blockv, blockv_null); - vec_st(blockv, j << 1, block); - } - - // if nCoeffs isn't a multiple of 8, finish the job - // using good old scalar units. - // (we could do it using a truncated vector, - // but I'm not sure it's worth the hassle) - for(; j <= nCoeffs ; j++) { - level = block[j]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; - } - block[j] = level; - } - } - - if (i == 1) - { // cheat. this avoid special-casing the first iteration - block[0] = backup_0; - } - } -POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63); -} diff --git a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c b/contrib/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c deleted file mode 100644 index c5e822f77..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2002 Dieter Shirley - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "../dsputil.h" -#include "../mpegvideo.h" -#include <time.h> - -#ifdef HAVE_ALTIVEC -#include "dsputil_altivec.h" -#endif - -extern int dct_quantize_altivec(MpegEncContext *s, - DCTELEM *block, int n, - int qscale, int *overflow); -extern void dct_unquantize_h263_altivec(MpegEncContext *s, - DCTELEM *block, int n, int qscale); - -extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); -extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); - - -void MPV_common_init_ppc(MpegEncContext *s) -{ -#ifdef HAVE_ALTIVEC - if (has_altivec()) - { - if (s->avctx->lowres==0) - { - if ((s->avctx->idct_algo == FF_IDCT_AUTO) || - (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) - { - s->dsp.idct_put = idct_put_altivec; - s->dsp.idct_add = idct_add_altivec; - s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - } - } - - // Test to make sure that the dct required alignments are met. - if ((((long)(s->q_intra_matrix) & 0x0f) != 0) || - (((long)(s->q_inter_matrix) & 0x0f) != 0)) - { - av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned " - "to use Altivec DCT. Reverting to non-altivec version.\n"); - return; - } - - if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) - { - av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned " - "to use Altivec DCT. Reverting to non-altivec version.\n"); - return; - } - - - if ((s->avctx->dct_algo == FF_DCT_AUTO) || - (s->avctx->dct_algo == FF_DCT_ALTIVEC)) - { -#if 0 /* seems to cause trouble under some circumstances */ - s->dct_quantize = dct_quantize_altivec; -#endif - s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec; - s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec; - } - } else -#endif - { - /* Non-AltiVec PPC optimisations here */ - } -} - diff --git a/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c b/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c deleted file mode 100644 index b15672ffe..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c +++ /dev/null @@ -1,788 +0,0 @@ -/* - * Altivec optimized snow DSP utils - * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * - */ - -#include "../dsputil.h" - -#include "gcc_fixes.h" -#include "dsputil_altivec.h" -#include "../snow.h" - -#undef NDEBUG -#include <assert.h> - - - -//FIXME remove this replication -#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num))) - -static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line) -{ - int offset; - DWTELEM * buffer; - -// av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line); - - assert(buf->data_stack_top >= 0); -// assert(!buf->line[line]); - if (buf->line[line]) - return buf->line[line]; - - offset = buf->line_width * line; - buffer = buf->data_stack[buf->data_stack_top]; - buf->data_stack_top--; - buf->line[line] = buffer; - -// av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1); - - return buffer; -} - - -//altivec code - -void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width) -{ - const int w2= (width+1)>>1; - DECLARE_ALIGNED_16(DWTELEM, temp[(width>>1)]); - const int w_l= (width>>1); - const int w_r= w2 - 1; - int i; - vector signed int t1, t2, x, y, tmp1, tmp2; - vector signed int *vbuf, *vtmp; - vector unsigned char align; - - - - { // Lift 0 - DWTELEM * const ref = b + w2 - 1; - DWTELEM b_0 = b[0]; - vbuf = (vector signed int *)b; - - tmp1 = vec_ld (0, ref); - align = vec_lvsl (0, ref); - tmp2 = vec_ld (15, ref); - t1= vec_perm(tmp1, tmp2, align); - - i = 0; - - for (i=0; i<w_l-15; i+=16) { -#if 0 - b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3); - b[i+1] = b[i+1] - ((3 * (ref[i+1] + ref[i+2]) + 4) >> 3); - b[i+2] = b[i+2] - ((3 * (ref[i+2] + ref[i+3]) + 4) >> 3); - b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3); -#else - - tmp1 = vec_ld (0, ref+4+i); - tmp2 = vec_ld (15, ref+4+i); - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_add(vec_add(y,y),y); - - tmp1 = vec_ld (0, ref+8+i); - - y = vec_add(y, vec_splat_s32(4)); - y = vec_sra(y, vec_splat_u32(3)); - - tmp2 = vec_ld (15, ref+8+i); - - *vbuf = vec_sub(*vbuf, y); - - t1=t2; - - vbuf++; - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_add(vec_add(y,y),y); - - tmp1 = vec_ld (0, ref+12+i); - - y = vec_add(y, vec_splat_s32(4)); - y = vec_sra(y, vec_splat_u32(3)); - - tmp2 = vec_ld (15, ref+12+i); - - *vbuf = vec_sub(*vbuf, y); - - t1=t2; - - vbuf++; - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_add(vec_add(y,y),y); - - tmp1 = vec_ld (0, ref+16+i); - - y = vec_add(y, vec_splat_s32(4)); - y = vec_sra(y, vec_splat_u32(3)); - - tmp2 = vec_ld (15, ref+16+i); - - *vbuf = vec_sub(*vbuf, y); - - t1=t2; - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_add(vec_add(y,y),y); - - vbuf++; - - y = vec_add(y, vec_splat_s32(4)); - y = vec_sra(y, vec_splat_u32(3)); - *vbuf = vec_sub(*vbuf, y); - - t1=t2; - - vbuf++; - -#endif - } - - snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); - b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); - } - - { // Lift 1 - DWTELEM * const dst = b+w2; - - i = 0; - for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){ - dst[i] = dst[i] - (b[i] + b[i + 1]); - } - - align = vec_lvsl(0, b+i); - tmp1 
= vec_ld(0, b+i); - vbuf = (vector signed int*) (dst + i); - tmp2 = vec_ld(15, b+i); - - t1 = vec_perm(tmp1, tmp2, align); - - for (; i<w_r-3; i+=4) { - -#if 0 - dst[i] = dst[i] - (b[i] + b[i + 1]); - dst[i+1] = dst[i+1] - (b[i+1] + b[i + 2]); - dst[i+2] = dst[i+2] - (b[i+2] + b[i + 3]); - dst[i+3] = dst[i+3] - (b[i+3] + b[i + 4]); -#else - - tmp1 = vec_ld(0, b+4+i); - tmp2 = vec_ld(15, b+4+i); - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1, vec_sld(t1,t2,4)); - *vbuf = vec_sub (*vbuf, y); - - vbuf++; - - t1 = t2; - -#endif - - } - - snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); - } - - { // Lift 2 - DWTELEM * const ref = b+w2 - 1; - DWTELEM b_0 = b[0]; - vbuf= (vector signed int *) b; - - tmp1 = vec_ld (0, ref); - align = vec_lvsl (0, ref); - tmp2 = vec_ld (15, ref); - t1= vec_perm(tmp1, tmp2, align); - - i = 0; - for (; i<w_l-15; i+=16) { -#if 0 - b[i] = b[i] - (((8 -(ref[i] + ref[i+1])) - (b[i] <<2)) >> 4); - b[i+1] = b[i+1] - (((8 -(ref[i+1] + ref[i+2])) - (b[i+1]<<2)) >> 4); - b[i+2] = b[i+2] - (((8 -(ref[i+2] + ref[i+3])) - (b[i+2]<<2)) >> 4); - b[i+3] = b[i+3] - (((8 -(ref[i+3] + ref[i+4])) - (b[i+3]<<2)) >> 4); -#else - tmp1 = vec_ld (0, ref+4+i); - tmp2 = vec_ld (15, ref+4+i); - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_sub(vec_splat_s32(8),y); - - tmp1 = vec_ld (0, ref+8+i); - - x = vec_sl(*vbuf,vec_splat_u32(2)); - y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); - - tmp2 = vec_ld (15, ref+8+i); - - *vbuf = vec_sub( *vbuf, y); - - t1 = t2; - - vbuf++; - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_sub(vec_splat_s32(8),y); - - tmp1 = vec_ld (0, ref+12+i); - - x = vec_sl(*vbuf,vec_splat_u32(2)); - y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); - - tmp2 = vec_ld (15, ref+12+i); - - *vbuf = vec_sub( *vbuf, y); - - t1 = t2; - - vbuf++; - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_sub(vec_splat_s32(8),y); - - tmp1 = vec_ld (0, ref+16+i); - - x = vec_sl(*vbuf,vec_splat_u32(2)); - y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); - - tmp2 = vec_ld (15, ref+16+i); - - *vbuf = vec_sub( *vbuf, y); - - t1 = t2; - - vbuf++; - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_sub(vec_splat_s32(8),y); - - t1 = t2; - - x = vec_sl(*vbuf,vec_splat_u32(2)); - y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); - *vbuf = vec_sub( *vbuf, y); - - vbuf++; - -#endif - } - - snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); - b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS); - } - - { // Lift 3 - DWTELEM * const src = b+w2; - - vbuf = (vector signed int *)b; - vtmp = (vector signed int *)temp; - - i = 0; - align = vec_lvsl(0, src); - - for (; i<w_r-3; i+=4) { -#if 0 - temp[i] = src[i] - ((-3*(b[i] + b[i+1]))>>1); - temp[i+1] = src[i+1] - ((-3*(b[i+1] + b[i+2]))>>1); - temp[i+2] = src[i+2] - ((-3*(b[i+2] + b[i+3]))>>1); - temp[i+3] = src[i+3] - ((-3*(b[i+3] + b[i+4]))>>1); -#else - tmp1 = vec_ld(0,src+i); - t1 = vec_add(vbuf[0],vec_sld(vbuf[0],vbuf[1],4)); - tmp2 = vec_ld(15,src+i); - t1 = vec_sub(vec_splat_s32(0),t1); //bad! 
- t1 = vec_add(t1,vec_add(t1,t1)); - t2 = vec_perm(tmp1 ,tmp2 ,align); - t1 = vec_sra(t1,vec_splat_u32(1)); - vbuf++; - *vtmp = vec_sub(t2,t1); - vtmp++; - -#endif - - } - - snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -3, 0, 1); - } - - { - //Interleave - int a; - vector signed int *t = (vector signed int *)temp, - *v = (vector signed int *)b; - - snow_interleave_line_header(&i, width, b, temp); - - for (; (i & 0xE) != 0xE; i-=2){ - b[i+1] = temp[i>>1]; - b[i] = b[i>>1]; - } - for (i-=14; i>=0; i-=16){ - a=i/4; - - v[a+3]=vec_mergel(v[(a>>1)+1],t[(a>>1)+1]); - v[a+2]=vec_mergeh(v[(a>>1)+1],t[(a>>1)+1]); - v[a+1]=vec_mergel(v[a>>1],t[a>>1]); - v[a]=vec_mergeh(v[a>>1],t[a>>1]); - - } - - } -} - -void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width) -{ - int i, w4 = width/4; - vector signed int *v0, *v1,*v2,*v3,*v4,*v5; - vector signed int t1, t2; - - v0=(vector signed int *)b0; - v1=(vector signed int *)b1; - v2=(vector signed int *)b2; - v3=(vector signed int *)b3; - v4=(vector signed int *)b4; - v5=(vector signed int *)b5; - - for (i=0; i< w4;i++) - { - - #if 0 - b4[i] -= (3*(b3[i] + b5[i])+4)>>3; - b3[i] -= ((b2[i] + b4[i])); - b2[i] += ((b1[i] + b3[i])+4*b2[i]+8)>>4; - b1[i] += (3*(b0[i] + b2[i]))>>1; - #else - t1 = vec_add(v3[i], v5[i]); - t2 = vec_add(t1, vec_add(t1,t1)); - t1 = vec_add(t2, vec_splat_s32(4)); - v4[i] = vec_sub(v4[i], vec_sra(t1,vec_splat_u32(3))); - - v3[i] = vec_sub(v3[i], vec_add(v2[i], v4[i])); - - t1 = vec_add(vec_splat_s32(8), vec_add(v1[i], v3[i])); - t2 = vec_sl(v2[i], vec_splat_u32(2)); - v2[i] = vec_add(v2[i], vec_sra(vec_add(t1,t2),vec_splat_u32(4))); - t1 = vec_add(v0[i], v2[i]); - t2 = vec_add(t1, vec_add(t1,t1)); - v1[i] = vec_add(v1[i], vec_sra(t2,vec_splat_u32(1))); - - #endif - } - - for(i*=4; i < width; i++) - { - b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; - b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; - b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; - b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; - } -} - -#define LOAD_BLOCKS \ - tmp1 = vec_ld(0, &block[3][y*src_stride]);\ - align = vec_lvsl(0, &block[3][y*src_stride]);\ - tmp2 = vec_ld(15, &block[3][y*src_stride]);\ -\ - b3 = vec_perm(tmp1,tmp2,align);\ -\ - tmp1 = vec_ld(0, &block[2][y*src_stride]);\ - align = vec_lvsl(0, &block[2][y*src_stride]);\ - tmp2 = vec_ld(15, &block[2][y*src_stride]);\ -\ - b2 = vec_perm(tmp1,tmp2,align);\ -\ - tmp1 = vec_ld(0, &block[1][y*src_stride]);\ - align = vec_lvsl(0, &block[1][y*src_stride]);\ - tmp2 = vec_ld(15, &block[1][y*src_stride]);\ -\ - b1 = vec_perm(tmp1,tmp2,align);\ -\ - tmp1 = vec_ld(0, &block[0][y*src_stride]);\ - align = vec_lvsl(0, &block[0][y*src_stride]);\ - tmp2 = vec_ld(15, &block[0][y*src_stride]);\ -\ - b0 = vec_perm(tmp1,tmp2,align); - -#define LOAD_OBMCS \ - tmp1 = vec_ld(0, obmc1);\ - align = vec_lvsl(0, obmc1);\ - tmp2 = vec_ld(15, obmc1);\ -\ - ob1 = vec_perm(tmp1,tmp2,align);\ -\ - tmp1 = vec_ld(0, obmc2);\ - align = vec_lvsl(0, obmc2);\ - tmp2 = vec_ld(15, obmc2);\ -\ - ob2 = vec_perm(tmp1,tmp2,align);\ -\ - tmp1 = vec_ld(0, obmc3);\ - align = vec_lvsl(0, obmc3);\ - tmp2 = vec_ld(15, obmc3);\ -\ - ob3 = vec_perm(tmp1,tmp2,align);\ -\ - tmp1 = vec_ld(0, obmc4);\ - align = vec_lvsl(0, obmc4);\ - tmp2 = vec_ld(15, obmc4);\ -\ - ob4 = vec_perm(tmp1,tmp2,align); - -/* interleave logic - * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ] - * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ] - * h <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ] - */ - 
-#define STEPS_0_1\ - h1 = (vector unsigned short)\ - vec_mergeh(ob1, ob2);\ -\ - h2 = (vector unsigned short)\ - vec_mergeh(ob3, ob4);\ -\ - ih = (vector unsigned char)\ - vec_mergeh(h1,h2);\ -\ - l1 = (vector unsigned short) vec_mergeh(b3, b2);\ -\ - ih1 = (vector unsigned char) vec_mergel(h1, h2);\ -\ - l2 = (vector unsigned short) vec_mergeh(b1, b0);\ -\ - il = (vector unsigned char) vec_mergeh(l1, l2);\ -\ - v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\ -\ - il1 = (vector unsigned char) vec_mergel(l1, l2);\ -\ - v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0)); - -#define FINAL_STEP_SCALAR\ - for(x=0; x<b_w; x++)\ - if(add){\ - vbuf[x] += dst[x + src_x];\ - vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\ - if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\ - dst8[x + y*src_stride] = vbuf[x];\ - }else{\ - dst[x + src_x] -= vbuf[x];\ - } - -static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc, - const int obmc_stride, - uint8_t * * block, int b_w, - int b_h, int src_x, int src_y, - int src_stride, slice_buffer * sb, - int add, uint8_t * dst8) -{ - int y, x; - DWTELEM * dst; - vector unsigned short h1, h2, l1, l2; - vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; - vector unsigned char b0,b1,b2,b3; - vector unsigned char ob1,ob2,ob3,ob4; - - DECLARE_ALIGNED_16(int, vbuf[16]); - vector signed int *v = (vector signed int *)vbuf, *d; - - for(y=0; y<b_h; y++){ - //FIXME ugly missue of obmc_stride - - uint8_t *obmc1= obmc + y*obmc_stride; - uint8_t *obmc2= obmc1+ (obmc_stride>>1); - uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); - uint8_t *obmc4= obmc3+ (obmc_stride>>1); - - dst = slice_buffer_get_line(sb, src_y + y); - d = (vector signed int *)(dst + src_x); - -//FIXME i could avoid some loads! 
- - // load blocks - LOAD_BLOCKS - - // load obmcs - LOAD_OBMCS - - // steps 0 1 - STEPS_0_1 - - FINAL_STEP_SCALAR - - } - -} - -#define STEPS_2_3\ - h1 = (vector unsigned short) vec_mergel(ob1, ob2);\ -\ - h2 = (vector unsigned short) vec_mergel(ob3, ob4);\ -\ - ih = (vector unsigned char) vec_mergeh(h1,h2);\ -\ - l1 = (vector unsigned short) vec_mergel(b3, b2);\ -\ - l2 = (vector unsigned short) vec_mergel(b1, b0);\ -\ - ih1 = (vector unsigned char) vec_mergel(h1,h2);\ -\ - il = (vector unsigned char) vec_mergeh(l1,l2);\ -\ - v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\ -\ - il1 = (vector unsigned char) vec_mergel(l1,l2);\ -\ - v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0)); - - -static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc, - const int obmc_stride, - uint8_t * * block, int b_w, - int b_h, int src_x, int src_y, - int src_stride, slice_buffer * sb, - int add, uint8_t * dst8) -{ - int y, x; - DWTELEM * dst; - vector unsigned short h1, h2, l1, l2; - vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; - vector unsigned char b0,b1,b2,b3; - vector unsigned char ob1,ob2,ob3,ob4; - DECLARE_ALIGNED_16(int, vbuf[b_w]); - vector signed int *v = (vector signed int *)vbuf, *d; - - for(y=0; y<b_h; y++){ - //FIXME ugly missue of obmc_stride - - uint8_t *obmc1= obmc + y*obmc_stride; - uint8_t *obmc2= obmc1+ (obmc_stride>>1); - uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); - uint8_t *obmc4= obmc3+ (obmc_stride>>1); - - dst = slice_buffer_get_line(sb, src_y + y); - d = (vector signed int *)(dst + src_x); - - // load blocks - LOAD_BLOCKS - - // load obmcs - LOAD_OBMCS - - // steps 0 1 2 3 - STEPS_0_1 - - STEPS_2_3 - - FINAL_STEP_SCALAR - - } -} - -#define FINAL_STEP_VEC \ -\ - if(add)\ - {\ - for(x=0; x<b_w/4; x++)\ - {\ - v[x] = vec_add(v[x], d[x]);\ - v[x] = vec_sra(vec_add(v[x],\ - vec_sl( vec_splat_s32(1),\ - vec_splat_u32(7))),\ - vec_splat_u32(8));\ -\ - mask = (vector bool int) vec_sl((vector signed int)\ - vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\ - mask = (vector bool int) vec_and(v[x],vec_nor(mask,mask));\ -\ - mask = (vector bool int)\ - vec_cmpeq((vector signed int)mask,\ - (vector signed int)vec_splat_u32(0));\ -\ - vs = vec_sra(v[x],vec_splat_u32(8));\ - vs = vec_sra(v[x],vec_splat_u32(8));\ - vs = vec_sra(v[x],vec_splat_u32(15));\ -\ - vs = vec_nor(vs,vs);\ -\ - v[x]= vec_sel(v[x],vs,mask);\ - }\ -\ - for(x=0; x<b_w; x++)\ - dst8[x + y*src_stride] = vbuf[x];\ -\ - }\ - else\ - for(x=0; x<b_w/4; x++)\ - d[x] = vec_sub(d[x], v[x]); - -static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc, - const int obmc_stride, - uint8_t * * block, int b_w, - int b_h, int src_x, int src_y, - int src_stride, slice_buffer * sb, - int add, uint8_t * dst8) -{ - int y, x; - DWTELEM * dst; - vector bool int mask; - vector signed int vs; - vector unsigned short h1, h2, l1, l2; - vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; - vector unsigned char b0,b1,b2,b3; - vector unsigned char ob1,ob2,ob3,ob4; - - DECLARE_ALIGNED_16(int, vbuf[16]); - vector signed int *v = (vector signed int *)vbuf, *d; - - for(y=0; y<b_h; y++){ - //FIXME ugly missue of obmc_stride - - uint8_t *obmc1= obmc + y*obmc_stride; - uint8_t *obmc2= obmc1+ (obmc_stride>>1); - uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); - uint8_t *obmc4= obmc3+ (obmc_stride>>1); - - dst = slice_buffer_get_line(sb, src_y + y); - d = (vector signed int *)(dst + src_x); - -//FIXME i could avoid some loads! 
- - // load blocks - LOAD_BLOCKS - - // load obmcs - LOAD_OBMCS - - // steps 0 1 - STEPS_0_1 - - FINAL_STEP_VEC - - } - -} - -static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc, - const int obmc_stride, - uint8_t * * block, int b_w, - int b_h, int src_x, int src_y, - int src_stride, slice_buffer * sb, - int add, uint8_t * dst8) -{ - int y, x; - DWTELEM * dst; - vector bool int mask; - vector signed int vs; - vector unsigned short h1, h2, l1, l2; - vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; - vector unsigned char b0,b1,b2,b3; - vector unsigned char ob1,ob2,ob3,ob4; - DECLARE_ALIGNED_16(int, vbuf[b_w]); - vector signed int *v = (vector signed int *)vbuf, *d; - - for(y=0; y<b_h; y++){ - //FIXME ugly missue of obmc_stride - - uint8_t *obmc1= obmc + y*obmc_stride; - uint8_t *obmc2= obmc1+ (obmc_stride>>1); - uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); - uint8_t *obmc4= obmc3+ (obmc_stride>>1); - - dst = slice_buffer_get_line(sb, src_y + y); - d = (vector signed int *)(dst + src_x); - - // load blocks - LOAD_BLOCKS - - // load obmcs - LOAD_OBMCS - - // steps 0 1 2 3 - STEPS_0_1 - - STEPS_2_3 - - FINAL_STEP_VEC - - } -} - - -void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride, - uint8_t * * block, int b_w, int b_h, - int src_x, int src_y, int src_stride, - slice_buffer * sb, int add, - uint8_t * dst8) -{ - if (src_x&15) { - if (b_w == 16) - inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block, - b_w, b_h, src_x, src_y, - src_stride, sb, add, dst8); - else if (b_w == 8) - inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block, - b_w, b_h, src_x, src_y, - src_stride, sb, add, dst8); - else - ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, - src_y, src_stride, sb, add, dst8); - } else { - if (b_w == 16) - inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block, - b_w, b_h, src_x, src_y, - src_stride, sb, add, dst8); - else if (b_w == 8) - inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block, - b_w, b_h, src_x, src_y, - src_stride, sb, add, dst8); - else - ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, - src_y, src_stride, sb, add, dst8); - } -} - - -void snow_init_altivec(DSPContext* c, AVCodecContext *avctx) -{ - c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec; - c->vertical_compose97i = ff_snow_vertical_compose97i_altivec; - c->inner_add_yblock = ff_snow_inner_add_yblock_altivec; -} diff --git a/contrib/ffmpeg/libavcodec/ppc/types_altivec.h b/contrib/ffmpeg/libavcodec/ppc/types_altivec.h deleted file mode 100644 index f29026e04..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/types_altivec.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2006 Guillaume Poirier <gpoirier@mplayerhq.hu> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/*********************************************************************** - * Vector types - **********************************************************************/ -#define vec_u8_t vector unsigned char -#define vec_s8_t vector signed char -#define vec_u16_t vector unsigned short -#define vec_s16_t vector signed short -#define vec_u32_t vector unsigned int -#define vec_s32_t vector signed int - -/*********************************************************************** - * Null vector - **********************************************************************/ -#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 ) - -#define zero_u8v (vec_u8_t) zerov -#define zero_s8v (vec_s8_t) zerov -#define zero_u16v (vec_u16_t) zerov -#define zero_s16v (vec_s16_t) zerov -#define zero_u32v (vec_u32_t) zerov -#define zero_s32v (vec_s32_t) zerov diff --git a/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c b/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c deleted file mode 100644 index 114c9d41f..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c +++ /dev/null @@ -1,338 +0,0 @@ -/* - * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized - * Copyright (c) 2006 Konstantin Shishkov - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - */ - -#include "../dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_altivec.h" - -// main steps of 8x8 transform -#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \ -do { \ - t0 = vec_sl(vec_add(s0, s4), vec_2); \ - t0 = vec_add(vec_sl(t0, vec_1), t0); \ - t0 = vec_add(t0, vec_rnd); \ - t1 = vec_sl(vec_sub(s0, s4), vec_2); \ - t1 = vec_add(vec_sl(t1, vec_1), t1); \ - t1 = vec_add(t1, vec_rnd); \ - t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \ - t2 = vec_add(t2, vec_sl(s2, vec_4)); \ - t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \ - t3 = vec_sub(t3, vec_sl(s6, vec_4)); \ - t4 = vec_add(t0, t2); \ - t5 = vec_add(t1, t3); \ - t6 = vec_sub(t1, t3); \ - t7 = vec_sub(t0, t2); \ -\ - t0 = vec_sl(vec_add(s1, s3), vec_4); \ - t0 = vec_add(t0, vec_sl(s5, vec_3)); \ - t0 = vec_add(t0, vec_sl(s7, vec_2)); \ - t0 = vec_add(t0, vec_sub(s5, s3)); \ -\ - t1 = vec_sl(vec_sub(s1, s5), vec_4); \ - t1 = vec_sub(t1, vec_sl(s7, vec_3)); \ - t1 = vec_sub(t1, vec_sl(s3, vec_2)); \ - t1 = vec_sub(t1, vec_add(s1, s7)); \ -\ - t2 = vec_sl(vec_sub(s7, s3), vec_4); \ - t2 = vec_add(t2, vec_sl(s1, vec_3)); \ - t2 = vec_add(t2, vec_sl(s5, vec_2)); \ - t2 = vec_add(t2, vec_sub(s1, s7)); \ -\ - t3 = vec_sl(vec_sub(s5, s7), vec_4); \ - t3 = vec_sub(t3, vec_sl(s3, vec_3)); \ - t3 = vec_add(t3, vec_sl(s1, vec_2)); \ - t3 = vec_sub(t3, vec_add(s3, s5)); \ -\ - s0 = vec_add(t4, t0); \ - s1 = vec_add(t5, t1); \ - s2 = vec_add(t6, t2); \ - s3 = vec_add(t7, t3); \ - s4 = vec_sub(t7, t3); \ - s5 = vec_sub(t6, t2); \ - s6 = vec_sub(t5, t1); \ - s7 = vec_sub(t4, t0); \ -}while(0) - -#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \ -do { \ - s0 = vec_sra(s0, vec_3); \ - s1 = vec_sra(s1, vec_3); \ - s2 = vec_sra(s2, vec_3); \ - s3 = vec_sra(s3, vec_3); \ - s4 = vec_sra(s4, vec_3); \ - s5 = vec_sra(s5, vec_3); \ - s6 = vec_sra(s6, vec_3); \ - s7 = vec_sra(s7, vec_3); \ -}while(0) - -#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \ -do { \ - s0 = vec_sra(s0, vec_7); \ - s1 = vec_sra(s1, vec_7); \ - s2 = vec_sra(s2, vec_7); \ - s3 = vec_sra(s3, vec_7); \ - s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \ - s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \ - s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \ - s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \ -}while(0) - -/* main steps of 4x4 transform */ -#define STEP4(s0, s1, s2, s3, vec_rnd) \ -do { \ - t1 = vec_add(vec_sl(s0, vec_4), s0); \ - t1 = vec_add(t1, vec_rnd); \ - t2 = vec_add(vec_sl(s2, vec_4), s2); \ - t0 = vec_add(t1, t2); \ - t1 = vec_sub(t1, t2); \ - t3 = vec_sl(vec_sub(s3, s1), vec_1); \ - t3 = vec_add(t3, vec_sl(t3, vec_2)); \ - t2 = vec_add(t3, vec_sl(s1, vec_5)); \ - t3 = vec_add(t3, vec_sl(s3, vec_3)); \ - t3 = vec_add(t3, vec_sl(s3, vec_2)); \ - s0 = vec_add(t0, t2); \ - s1 = vec_sub(t1, t3); \ - s2 = vec_add(t1, t3); \ - s3 = vec_sub(t0, t2); \ -}while (0) - -#define SHIFT_HOR4(s0, s1, s2, s3) \ - s0 = vec_sra(s0, vec_3); \ - s1 = vec_sra(s1, vec_3); \ - s2 = vec_sra(s2, vec_3); \ - s3 = vec_sra(s3, vec_3); - -#define SHIFT_VERT4(s0, s1, s2, s3) \ - s0 = vec_sra(s0, vec_7); \ - s1 = vec_sra(s1, vec_7); \ - s2 = vec_sra(s2, vec_7); \ - s3 = vec_sra(s3, vec_7); - -/** Do inverse transform on 8x8 block -*/ -static void vc1_inv_trans_8x8_altivec(DCTELEM block[64]) -{ - vector signed short src0, src1, src2, src3, src4, src5, src6, 
src7; - vector signed int s0, s1, s2, s3, s4, s5, s6, s7; - vector signed int s8, s9, sA, sB, sC, sD, sE, sF; - vector signed int t0, t1, t2, t3, t4, t5, t6, t7; - const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4)); - const vector unsigned int vec_7 = vec_splat_u32(7); - const vector unsigned int vec_5 = vec_splat_u32(5); - const vector unsigned int vec_4 = vec_splat_u32(4); - const vector signed int vec_4s = vec_splat_s32(4); - const vector unsigned int vec_3 = vec_splat_u32(3); - const vector unsigned int vec_2 = vec_splat_u32(2); - const vector signed int vec_1s = vec_splat_s32(1); - const vector unsigned int vec_1 = vec_splat_u32(1); - - - src0 = vec_ld( 0, block); - src1 = vec_ld( 16, block); - src2 = vec_ld( 32, block); - src3 = vec_ld( 48, block); - src4 = vec_ld( 64, block); - src5 = vec_ld( 80, block); - src6 = vec_ld( 96, block); - src7 = vec_ld(112, block); - - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - s0 = vec_unpackl(src0); - s1 = vec_unpackl(src1); - s2 = vec_unpackl(src2); - s3 = vec_unpackl(src3); - s4 = vec_unpackl(src4); - s5 = vec_unpackl(src5); - s6 = vec_unpackl(src6); - s7 = vec_unpackl(src7); - s8 = vec_unpackh(src0); - s9 = vec_unpackh(src1); - sA = vec_unpackh(src2); - sB = vec_unpackh(src3); - sC = vec_unpackh(src4); - sD = vec_unpackh(src5); - sE = vec_unpackh(src6); - sF = vec_unpackh(src7); - STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s); - SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7); - STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s); - SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF); - src0 = vec_pack(s8, s0); - src1 = vec_pack(s9, s1); - src2 = vec_pack(sA, s2); - src3 = vec_pack(sB, s3); - src4 = vec_pack(sC, s4); - src5 = vec_pack(sD, s5); - src6 = vec_pack(sE, s6); - src7 = vec_pack(sF, s7); - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - - s0 = vec_unpackl(src0); - s1 = vec_unpackl(src1); - s2 = vec_unpackl(src2); - s3 = vec_unpackl(src3); - s4 = vec_unpackl(src4); - s5 = vec_unpackl(src5); - s6 = vec_unpackl(src6); - s7 = vec_unpackl(src7); - s8 = vec_unpackh(src0); - s9 = vec_unpackh(src1); - sA = vec_unpackh(src2); - sB = vec_unpackh(src3); - sC = vec_unpackh(src4); - sD = vec_unpackh(src5); - sE = vec_unpackh(src6); - sF = vec_unpackh(src7); - STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64); - SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7); - STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64); - SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF); - src0 = vec_pack(s8, s0); - src1 = vec_pack(s9, s1); - src2 = vec_pack(sA, s2); - src3 = vec_pack(sB, s3); - src4 = vec_pack(sC, s4); - src5 = vec_pack(sD, s5); - src6 = vec_pack(sE, s6); - src7 = vec_pack(sF, s7); - - vec_st(src0, 0, block); - vec_st(src1, 16, block); - vec_st(src2, 32, block); - vec_st(src3, 48, block); - vec_st(src4, 64, block); - vec_st(src5, 80, block); - vec_st(src6, 96, block); - vec_st(src7,112, block); -} - -/** Do inverse transform on 8x4 part of block -*/ -static void vc1_inv_trans_8x4_altivec(DCTELEM block[64], int n) -{ - vector signed short src0, src1, src2, src3, src4, src5, src6, src7; - vector signed int s0, s1, s2, s3, s4, s5, s6, s7; - vector signed int s8, s9, sA, sB, sC, sD, sE, sF; - vector signed int t0, t1, t2, t3, t4, t5, t6, t7; - const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4)); - const vector unsigned int vec_7 = vec_splat_u32(7); - const vector unsigned int vec_5 = vec_splat_u32(5); - const vector unsigned int vec_4 = vec_splat_u32(4); - const vector signed int vec_4s = vec_splat_s32(4); - const 
vector unsigned int vec_3 = vec_splat_u32(3); - const vector unsigned int vec_2 = vec_splat_u32(2); - const vector unsigned int vec_1 = vec_splat_u32(1); - - src0 = vec_ld( 0, block); - src1 = vec_ld( 16, block); - src2 = vec_ld( 32, block); - src3 = vec_ld( 48, block); - src4 = vec_ld( 64, block); - src5 = vec_ld( 80, block); - src6 = vec_ld( 96, block); - src7 = vec_ld(112, block); - - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - s0 = vec_unpackl(src0); - s1 = vec_unpackl(src1); - s2 = vec_unpackl(src2); - s3 = vec_unpackl(src3); - s4 = vec_unpackl(src4); - s5 = vec_unpackl(src5); - s6 = vec_unpackl(src6); - s7 = vec_unpackl(src7); - s8 = vec_unpackh(src0); - s9 = vec_unpackh(src1); - sA = vec_unpackh(src2); - sB = vec_unpackh(src3); - sC = vec_unpackh(src4); - sD = vec_unpackh(src5); - sE = vec_unpackh(src6); - sF = vec_unpackh(src7); - STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s); - SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7); - STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s); - SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF); - src0 = vec_pack(s8, s0); - src1 = vec_pack(s9, s1); - src2 = vec_pack(sA, s2); - src3 = vec_pack(sB, s3); - src4 = vec_pack(sC, s4); - src5 = vec_pack(sD, s5); - src6 = vec_pack(sE, s6); - src7 = vec_pack(sF, s7); - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - - if(!n){ // upper half of block - s0 = vec_unpackh(src0); - s1 = vec_unpackh(src1); - s2 = vec_unpackh(src2); - s3 = vec_unpackh(src3); - s8 = vec_unpackl(src0); - s9 = vec_unpackl(src1); - sA = vec_unpackl(src2); - sB = vec_unpackl(src3); - STEP4(s0, s1, s2, s3, vec_64); - SHIFT_VERT4(s0, s1, s2, s3); - STEP4(s8, s9, sA, sB, vec_64); - SHIFT_VERT4(s8, s9, sA, sB); - src0 = vec_pack(s0, s8); - src1 = vec_pack(s1, s9); - src2 = vec_pack(s2, sA); - src3 = vec_pack(s3, sB); - - vec_st(src0, 0, block); - vec_st(src1, 16, block); - vec_st(src2, 32, block); - vec_st(src3, 48, block); - } else { //lower half of block - s0 = vec_unpackh(src4); - s1 = vec_unpackh(src5); - s2 = vec_unpackh(src6); - s3 = vec_unpackh(src7); - s8 = vec_unpackl(src4); - s9 = vec_unpackl(src5); - sA = vec_unpackl(src6); - sB = vec_unpackl(src7); - STEP4(s0, s1, s2, s3, vec_64); - SHIFT_VERT4(s0, s1, s2, s3); - STEP4(s8, s9, sA, sB, vec_64); - SHIFT_VERT4(s8, s9, sA, sB); - src4 = vec_pack(s0, s8); - src5 = vec_pack(s1, s9); - src6 = vec_pack(s2, sA); - src7 = vec_pack(s3, sB); - - vec_st(src4, 64, block); - vec_st(src5, 80, block); - vec_st(src6, 96, block); - vec_st(src7,112, block); - } -} - - -void vc1dsp_init_altivec(DSPContext* dsp, AVCodecContext *avctx) { - dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec; - dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec; -} |
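
For reference, the vec_sel/vec_madd/vec_cmpgt pattern at the top of dct_quantize_altivec is the branch-free vector form of the usual sign-dependent quantizer bias. A minimal scalar sketch of one lane follows; the names quantize_coef, coef, q and bias are illustrative, not FFmpeg identifiers.

/* Scalar equivalent of one lane of the vec_sel/vec_madd pair above: scale the
 * coefficient by its reciprocal quantizer and add a bias whose sign follows
 * the coefficient, so the later truncation rounds away from zero. */
static inline int quantize_coef(float coef, float q, float bias)
{
    float biased = (coef > 0.0f) ? coef * q + bias
                                 : coef * q - bias;
    return (int)biased;  /* vec_cts(..., 0) performs the same truncating float->int step */
}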
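
The vec_max reduction over scanIndices_01..scanIndices_67 computes, entirely in vector registers, the largest inverse-scan index whose coefficient is non-zero. A scalar sketch of the same quantity, with illustrative names (inv_scan stands in for s->intra_scantable.inverse and the result corresponds to lastNonZero):

#include <stdint.h>

/* Position of the last non-zero coefficient, expressed as the largest
 * inverse-scan index whose coefficient survived quantization, or -1 for an
 * empty block. */
static int last_non_zero_index(const int16_t coeffs[64], const uint8_t inv_scan[64])
{
    int i, last = -1;
    for (i = 0; i < 64; i++)
        if (coeffs[i] != 0 && inv_scan[i] > last)
            last = inv_scan[i];
    return last;
}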
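
The vec_ld(0,p)/vec_ld(15,p)/vec_lvsl/vec_perm triple that recurs throughout these files (the SAD loops, the snow lifting passes, LOAD_BLOCKS/LOAD_OBMCS) is the classic pre-VSX idiom for reading 16 bytes from a possibly misaligned address. A self-contained sketch, assuming an AltiVec-capable compiler and <altivec.h>; the helper name is illustrative, as the deleted code open-codes the same three intrinsics at each use.

#include <altivec.h>

/* Load the two aligned quadwords that straddle the data and merge them with a
 * permute control derived from the low address bits. */
static vector unsigned char load_unaligned_16(const unsigned char *p)
{
    vector unsigned char lo    = vec_ld(0, p);    /* aligned quadword at or below p  */
    vector unsigned char hi    = vec_ld(15, p);   /* aligned quadword covering p+15  */
    vector unsigned char shift = vec_lvsl(0, p);  /* permute control from p & 15     */
    return vec_perm(lo, hi, shift);               /* the 16 bytes starting at p      */
}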
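
FINAL_STEP_SCALAR clamps the rounded result to 0..255 with "if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);". A hedged scalar sketch of why that works, with an illustrative helper name:

#include <stdint.h>

/* Any value outside 0..255 has bits set in ~255; for negative v, v >> 31 is
 * -1 so ~(v >> 31) is 0, while for positive overflow v >> 31 is 0, giving
 * ~0 == -1, whose low byte stores as 255 in the uint8_t destination.
 * Relies on arithmetic right shift of a negative int, as the original does. */
static inline uint8_t clamp_to_uint8(int v)
{
    if (v & ~255)
        v = ~(v >> 31);
    return (uint8_t)v;
}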