Diffstat (limited to 'contrib/ffmpeg/libavcodec/ppc')
22 files changed, 0 insertions, 7252 deletions
diff --git a/contrib/ffmpeg/libavcodec/ppc/check_altivec.c b/contrib/ffmpeg/libavcodec/ppc/check_altivec.c deleted file mode 100644 index cf55b9a1d..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/check_altivec.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - - -/** - * @file check_altivec.c - * Checks for AltiVec presence. - */ - -#ifdef __APPLE__ -#include <sys/sysctl.h> -#elif __AMIGAOS4__ -#include <exec/exec.h> -#include <interfaces/exec.h> -#include <proto/exec.h> -#endif /* __APPLE__ */ - -/** - * This function MAY rely on signal() or fork() in order to make sure altivec - * is present - */ - -int has_altivec(void) -{ -#ifdef __AMIGAOS4__ - ULONG result = 0; - extern struct ExecIFace *IExec; - - IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE); - if (result == VECTORTYPE_ALTIVEC) return 1; - return 0; -#elif __APPLE__ - int sels[2] = {CTL_HW, HW_VECTORUNIT}; - int has_vu = 0; - size_t len = sizeof(has_vu); - int err; - - err = sysctl(sels, 2, &has_vu, &len, NULL, 0); - - if (err == 0) return (has_vu != 0); - return 0; -#elif defined(RUNTIME_CPUDETECT) - int proc_ver; - // support of mfspr PVR emulation added in Linux 2.6.17 - asm volatile("mfspr %0, 287" : "=r" (proc_ver)); - proc_ver >>= 16; - if (proc_ver & 0x8000 || - proc_ver == 0x000c || - proc_ver == 0x0039 || proc_ver == 0x003c || - proc_ver == 0x0044 || proc_ver == 0x0045 || - proc_ver == 0x0070) - return 1; - return 0; -#else - // since we were compiled for altivec, just assume we have it - // until someone comes up with a proper way (not involving signal hacks). - return 1; -#endif /* __AMIGAOS4__ */ -} - diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c deleted file mode 100644 index 3d79c3ab5..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c +++ /dev/null @@ -1,1518 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_ppc.h" -#include "util_altivec.h" - -int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - DECLARE_ALIGNED_16(int, s); - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector unsigned char *tv; - vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; - vector unsigned int sad; - vector signed int sumdiffs; - - s = 0; - sad = (vector unsigned int)vec_splat_u32(0); - for(i=0;i<h;i++) { - /* - Read unaligned pixels into our vectors. The vectors are as follows: - pix1v: pix1[0]-pix1[15] - pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] - */ - tv = (vector unsigned char *) pix1; - pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); - - tv = (vector unsigned char *) &pix2[0]; - pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); - - tv = (vector unsigned char *) &pix2[1]; - pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); - - /* Calculate the average vector */ - avgv = vec_avg(pix2v, pix2iv); - - /* Calculate a sum of abs differences vector */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - DECLARE_ALIGNED_16(int, s); - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector unsigned char *tv; - vector unsigned char pix1v, pix2v, pix3v, avgv, t5; - vector unsigned int sad; - vector signed int sumdiffs; - uint8_t *pix3 = pix2 + line_size; - - s = 0; - sad = (vector unsigned int)vec_splat_u32(0); - - /* - Due to the fact that pix3 = pix2 + line_size, the pix3 of one - iteration becomes pix2 in the next iteration. We can use this - fact to avoid a potentially expensive unaligned read, each - time around the loop. - Read unaligned pixels into our vectors. The vectors are as follows: - pix2v: pix2[0]-pix2[15] - Split the pixel vectors into shorts - */ - tv = (vector unsigned char *) &pix2[0]; - pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); - - for(i=0;i<h;i++) { - /* - Read unaligned pixels into our vectors. 
The vectors are as follows: - pix1v: pix1[0]-pix1[15] - pix3v: pix3[0]-pix3[15] - */ - tv = (vector unsigned char *) pix1; - pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); - - tv = (vector unsigned char *) &pix3[0]; - pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); - - /* Calculate the average vector */ - avgv = vec_avg(pix2v, pix3v); - - /* Calculate a sum of abs differences vector */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2v = pix3v; - pix3 += line_size; - - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - return s; -} - -int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - DECLARE_ALIGNED_16(int, s); - uint8_t *pix3 = pix2 + line_size; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); - vector unsigned char *tv, avgv, t5; - vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; - vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; - vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; - vector unsigned short avghv, avglv; - vector unsigned short t1, t2, t3, t4; - vector unsigned int sad; - vector signed int sumdiffs; - - sad = (vector unsigned int)vec_splat_u32(0); - - s = 0; - - /* - Due to the fact that pix3 = pix2 + line_size, the pix3 of one - iteration becomes pix2 in the next iteration. We can use this - fact to avoid a potentially expensive unaligned read, as well - as some splitting, and vector addition each time around the loop. - Read unaligned pixels into our vectors. The vectors are as follows: - pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16] - Split the pixel vectors into shorts - */ - tv = (vector unsigned char *) &pix2[0]; - pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0])); - - tv = (vector unsigned char *) &pix2[1]; - pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1])); - - pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v); - pix2lv = (vector unsigned short) vec_mergel(zero, pix2v); - pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv); - pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv); - t1 = vec_add(pix2hv, pix2ihv); - t2 = vec_add(pix2lv, pix2ilv); - - for(i=0;i<h;i++) { - /* - Read unaligned pixels into our vectors. The vectors are as follows: - pix1v: pix1[0]-pix1[15] - pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16] - */ - tv = (vector unsigned char *) pix1; - pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1)); - - tv = (vector unsigned char *) &pix3[0]; - pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0])); - - tv = (vector unsigned char *) &pix3[1]; - pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); - - /* - Note that AltiVec does have vec_avg, but this works on vector pairs - and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding - would mean that, for example, avg(3,0,0,1) = 2, when it should be 1. - Instead, we have to split the pixel vectors into vectors of shorts, - and do the averaging by hand. 
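The comment above is worth pinning down with numbers. vec_avg on unsigned elements rounds up, i.e. computes (a+b+1)>>1, so nesting two of them overshoots the true 4-way average the half-pel filter needs. A minimal plain-C sketch (the helper name avg_round_up is made up for illustration) reproduces the 3,0,0,1 example from the comment:

#include <stdio.h>

/* rounds up, like the AltiVec vec_avg on unsigned chars */
static unsigned avg_round_up(unsigned a, unsigned b)
{
    return (a + b + 1) >> 1;
}

int main(void)
{
    unsigned a = 3, b = 0, c = 0, d = 1;
    unsigned nested = avg_round_up(avg_round_up(a, b), avg_round_up(c, d));
    unsigned wanted = (a + b + c + d + 2) >> 2;   /* what the shorts-based path computes */
    printf("nested=%u wanted=%u\n", nested, wanted);  /* prints nested=2 wanted=1 */
    return 0;
}

That off-by-one is why the function widens to shorts, adds the four pixels plus the rounding constant 2, and shifts right by two instead of chaining vec_avg.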
- */ - - /* Split the pixel vectors into shorts */ - pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v); - pix3lv = (vector unsigned short) vec_mergel(zero, pix3v); - pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv); - pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv); - - /* Do the averaging on them */ - t3 = vec_add(pix3hv, pix3ihv); - t4 = vec_add(pix3lv, pix3ilv); - - avghv = vec_sr(vec_add(vec_add(t1, t3), two), two); - avglv = vec_sr(vec_add(vec_add(t2, t4), two), two); - - /* Pack the shorts back into a result */ - avgv = vec_pack(avghv, avglv); - - /* Calculate a sum of abs differences vector */ - t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv)); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix3 += line_size; - /* Transfer the calculated values for pix3 into pix2 */ - t1 = t3; - t2 = t4; - } - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - DECLARE_ALIGNED_16(int, s); - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char perm1, perm2, *pix1v, *pix2v; - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad; - vector signed int sumdiffs; - - sad = (vector unsigned int)vec_splat_u32(0); - - - for(i=0;i<h;i++) { - /* Read potentially unaligned pixels into t1 and t2 */ - perm1 = vec_lvsl(0, pix1); - pix1v = (vector unsigned char *) pix1; - perm2 = vec_lvsl(0, pix2); - pix2v = (vector unsigned char *) pix2; - t1 = vec_perm(pix1v[0], pix1v[1], perm1); - t2 = vec_perm(pix2v[0], pix2v[1], perm2); - - /* Calculate a sum of abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - DECLARE_ALIGNED_16(int, s); - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad; - vector signed int sumdiffs; - - sad = (vector unsigned int)vec_splat_u32(0); - - permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); - - for(i=0;i<h;i++) { - /* Read potentially unaligned pixels into t1 and t2 - Since we're reading 16 pixels, and actually only want 8, - mask out the last 8 pixels. The 0s don't change the sum. 
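As a cross-check on the masking trick just described, here is a plain-C reference sketch (the name sad8_c is made up) of what sad8_altivec returns: only the first 8 columns of each row are summed, which is exactly what ANDing the 16-byte load with permclear (eight 0xFF bytes followed by eight zero bytes) enforces on the vector side.

#include <stdint.h>

static int sad8_c(const uint8_t *pix1, const uint8_t *pix2,
                  int line_size, int h)
{
    int s = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 8; x++) {
            int d = pix1[x] - pix2[x];
            s += d < 0 ? -d : d;   /* same as vec_sub(vec_max, vec_min) per byte */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}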
*/ - perm1 = vec_lvsl(0, pix1); - pix1v = (vector unsigned char *) pix1; - perm2 = vec_lvsl(0, pix2); - pix2v = (vector unsigned char *) pix2; - t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); - t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); - - /* Calculate a sum of abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t5, sad); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -int pix_norm1_altivec(uint8_t *pix, int line_size) -{ - int i; - DECLARE_ALIGNED_16(int, s); - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char *tv; - vector unsigned char pixv; - vector unsigned int sv; - vector signed int sum; - - sv = (vector unsigned int)vec_splat_u32(0); - - s = 0; - for (i = 0; i < 16; i++) { - /* Read in the potentially unaligned pixels */ - tv = (vector unsigned char *) pix; - pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix)); - - /* Square the values, and add them to our sum */ - sv = vec_msum(pixv, pixv, sv); - - pix += line_size; - } - /* Sum up the four partial sums, and put the result into s */ - sum = vec_sums((vector signed int) sv, (vector signed int) zero); - sum = vec_splat(sum, 3); - vec_ste(sum, 0, &s); - - return s; -} - -/** - * Sum of Squared Errors for a 8x8 block. - * AltiVec-enhanced. - * It's the sad8_altivec code above w/ squaring added. - */ -int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - DECLARE_ALIGNED_16(int, s); - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sum; - vector signed int sumsqr; - - sum = (vector unsigned int)vec_splat_u32(0); - - permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); - - - for(i=0;i<h;i++) { - /* Read potentially unaligned pixels into t1 and t2 - Since we're reading 16 pixels, and actually only want 8, - mask out the last 8 pixels. The 0s don't change the sum. */ - perm1 = vec_lvsl(0, pix1); - pix1v = (vector unsigned char *) pix1; - perm2 = vec_lvsl(0, pix2); - pix2v = (vector unsigned char *) pix2; - t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); - t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); - - /* - Since we want to use unsigned chars, we can take advantage - of the fact that abs(a-b)^2 = (a-b)^2. - */ - - /* Calculate abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Square the values and add them to our sum */ - sum = vec_msum(t5, t5, sum); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); - sumsqr = vec_splat(sumsqr, 3); - vec_ste(sumsqr, 0, &s); - - return s; -} - -/** - * Sum of Squared Errors for a 16x16 block. - * AltiVec-enhanced. - * It's the sad16_altivec code above w/ squaring added. 
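Both sse8_altivec above and sse16_altivec below lean on the identity (a-b)^2 = |a-b|^2, which lets them square the unsigned absolute difference with a single vec_msum. A plain-C reference sketch of the 16x16 case (hypothetical name sse16_c, shown only to make the arithmetic explicit):

#include <stdint.h>

static int sse16_c(const uint8_t *pix1, const uint8_t *pix2,
                   int line_size, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < 16; x++) {
            int d = pix1[x] - pix2[x];
            sum += d * d;          /* (a-b)^2 == (|a-b|)^2 */
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}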
- */ -int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) -{ - int i; - DECLARE_ALIGNED_16(int, s); - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char perm1, perm2, *pix1v, *pix2v; - vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sum; - vector signed int sumsqr; - - sum = (vector unsigned int)vec_splat_u32(0); - - for(i=0;i<h;i++) { - /* Read potentially unaligned pixels into t1 and t2 */ - perm1 = vec_lvsl(0, pix1); - pix1v = (vector unsigned char *) pix1; - perm2 = vec_lvsl(0, pix2); - pix2v = (vector unsigned char *) pix2; - t1 = vec_perm(pix1v[0], pix1v[1], perm1); - t2 = vec_perm(pix2v[0], pix2v[1], perm2); - - /* - Since we want to use unsigned chars, we can take advantage - of the fact that abs(a-b)^2 = (a-b)^2. - */ - - /* Calculate abs differences vector */ - t3 = vec_max(t1, t2); - t4 = vec_min(t1, t2); - t5 = vec_sub(t3, t4); - - /* Square the values and add them to our sum */ - sum = vec_msum(t5, t5, sum); - - pix1 += line_size; - pix2 += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); - sumsqr = vec_splat(sumsqr, 3); - vec_ste(sumsqr, 0, &s); - - return s; -} - -int pix_sum_altivec(uint8_t * pix, int line_size) -{ - const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); - vector unsigned char perm, *pixv; - vector unsigned char t1; - vector unsigned int sad; - vector signed int sumdiffs; - - int i; - DECLARE_ALIGNED_16(int, s); - - sad = (vector unsigned int)vec_splat_u32(0); - - for (i = 0; i < 16; i++) { - /* Read the potentially unaligned 16 pixels into t1 */ - perm = vec_lvsl(0, pix); - pixv = (vector unsigned char *) pix; - t1 = vec_perm(pixv[0], pixv[1], perm); - - /* Add each 4 pixel group together and put 4 results into sad */ - sad = vec_sum4s(t1, sad); - - pix += line_size; - } - - /* Sum up the four partial sums, and put the result into s */ - sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); - sumdiffs = vec_splat(sumdiffs, 3); - vec_ste(sumdiffs, 0, &s); - - return s; -} - -void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size) -{ - int i; - vector unsigned char perm, bytes, *pixv; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector signed short shorts; - - for(i=0;i<8;i++) - { - // Read potentially unaligned pixels. - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. - perm = vec_lvsl(0, pixels); - pixv = (vector unsigned char *) pixels; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts = (vector signed short)vec_mergeh(zero, bytes); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts, i*16, (vector signed short*)block); - - pixels += line_size; - } -} - -void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, - const uint8_t *s2, int stride) -{ - int i; - vector unsigned char perm, bytes, *pixv; - const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); - vector signed short shorts1, shorts2; - - for(i=0;i<4;i++) - { - // Read potentially unaligned pixels - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. 
- perm = vec_lvsl(0, s1); - pixv = (vector unsigned char *) s1; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts1 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the same for the second block of pixels - perm = vec_lvsl(0, s2); - pixv = (vector unsigned char *) s2; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts2 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the subtraction - shorts1 = vec_sub(shorts1, shorts2); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts1, 0, (vector signed short*)block); - - s1 += stride; - s2 += stride; - block += 8; - - - // The code below is a copy of the code above... This is a manual - // unroll. - - // Read potentially unaligned pixels - // We're reading 16 pixels, and actually only want 8, - // but we simply ignore the extras. - perm = vec_lvsl(0, s1); - pixv = (vector unsigned char *) s1; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts1 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the same for the second block of pixels - perm = vec_lvsl(0, s2); - pixv = (vector unsigned char *) s2; - bytes = vec_perm(pixv[0], pixv[1], perm); - - // convert the bytes into shorts - shorts2 = (vector signed short)vec_mergeh(zero, bytes); - - // Do the subtraction - shorts1 = vec_sub(shorts1, shorts2); - - // save the data to the block, we assume the block is 16-byte aligned - vec_st(shorts1, 0, (vector signed short*)block); - - s1 += stride; - s2 += stride; - block += 8; - } -} - -void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { - register int i; - register vector unsigned char vdst, vsrc; - - /* dst and src are 16 bytes-aligned (guaranteed) */ - for(i = 0 ; (i + 15) < w ; i+=16) - { - vdst = vec_ld(i, (unsigned char*)dst); - vsrc = vec_ld(i, (unsigned char*)src); - vdst = vec_add(vsrc, vdst); - vec_st(vdst, i, (unsigned char*)dst); - } - /* if w is not a multiple of 16 */ - for (; (i < w) ; i++) - { - dst[i] = src[i]; - } -} - -/* next one assumes that ((line_size % 16) == 0) */ -void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1); - register vector unsigned char pixelsv1, pixelsv2; - register vector unsigned char pixelsv1B, pixelsv2B; - register vector unsigned char pixelsv1C, pixelsv2C; - register vector unsigned char pixelsv1D, pixelsv2D; - - register vector unsigned char perm = vec_lvsl(0, pixels); - int i; - register int line_size_2 = line_size << 1; - register int line_size_3 = line_size + line_size_2; - register int line_size_4 = line_size << 2; - -POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1); -// hand-unrolling the loop by 4 gains about 15% -// mininum execution time goes from 74 to 60 cycles -// it's faster than -funroll-loops, but using -// -funroll-loops w/ this is bad - 74 cycles again. 
-// all this is on a 7450, tuning for the 7450 -#if 0 - for(i=0; i<h; i++) { - pixelsv1 = vec_ld(0, (unsigned char*)pixels); - pixelsv2 = vec_ld(16, (unsigned char*)pixels); - vec_st(vec_perm(pixelsv1, pixelsv2, perm), - 0, (unsigned char*)block); - pixels+=line_size; - block +=line_size; - } -#else - for(i=0; i<h; i+=4) { - pixelsv1 = vec_ld(0, (unsigned char*)pixels); - pixelsv2 = vec_ld(15, (unsigned char*)pixels); - pixelsv1B = vec_ld(line_size, (unsigned char*)pixels); - pixelsv2B = vec_ld(15 + line_size, (unsigned char*)pixels); - pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels); - pixelsv2C = vec_ld(15 + line_size_2, (unsigned char*)pixels); - pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels); - pixelsv2D = vec_ld(15 + line_size_3, (unsigned char*)pixels); - vec_st(vec_perm(pixelsv1, pixelsv2, perm), - 0, (unsigned char*)block); - vec_st(vec_perm(pixelsv1B, pixelsv2B, perm), - line_size, (unsigned char*)block); - vec_st(vec_perm(pixelsv1C, pixelsv2C, perm), - line_size_2, (unsigned char*)block); - vec_st(vec_perm(pixelsv1D, pixelsv2D, perm), - line_size_3, (unsigned char*)block); - pixels+=line_size_4; - block +=line_size_4; - } -#endif -POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1); -} - -/* next one assumes that ((line_size % 16) == 0) */ -#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) -void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1); - register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; - register vector unsigned char perm = vec_lvsl(0, pixels); - int i; - -POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1); - - for(i=0; i<h; i++) { - pixelsv1 = vec_ld(0, (unsigned char*)pixels); - pixelsv2 = vec_ld(16, (unsigned char*)pixels); - blockv = vec_ld(0, block); - pixelsv = vec_perm(pixelsv1, pixelsv2, perm); - blockv = vec_avg(blockv,pixelsv); - vec_st(blockv, 0, (unsigned char*)block); - pixels+=line_size; - block +=line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1); -} - -/* next one assumes that ((line_size % 8) == 0) */ -void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1); - register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; - int i; - -POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1); - - for (i = 0; i < h; i++) { - /* - block is 8 bytes-aligned, so we're either in the - left block (16 bytes-aligned) or in the right block (not) - */ - int rightside = ((unsigned long)block & 0x0000000F); - - blockv = vec_ld(0, block); - pixelsv1 = vec_ld(0, (unsigned char*)pixels); - pixelsv2 = vec_ld(16, (unsigned char*)pixels); - pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); - - if (rightside) - { - pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); - } - else - { - pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); - } - - blockv = vec_avg(blockv, pixelsv); - - vec_st(blockv, 0, block); - - pixels += line_size; - block += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1); -} - -/* next one assumes that ((line_size % 8) == 0) */ -void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); - register int i; - register vector unsigned char - pixelsv1, pixelsv2, - pixelsavg; - register vector unsigned char - blockv, temp1, temp2; - register vector unsigned short - pixelssum1, pixelssum2, 
temp3; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - -POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1); - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) - { - blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } - else - { - blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); -} - -/* next one assumes that ((line_size % 8) == 0) */ -void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); - register int i; - register vector unsigned char - pixelsv1, pixelsv2, - pixelsavg; - register vector unsigned char - blockv, temp1, temp2; - register vector unsigned short - pixelssum1, pixelssum2, temp3; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vcone); - -POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, 
temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vcone); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) - { - blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } - else - { - blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); -} - -/* next one assumes that ((line_size % 16) == 0) */ -void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); - register int i; - register vector unsigned char - pixelsv1, pixelsv2, pixelsv3, pixelsv4; - register vector unsigned char - blockv, temp1, temp2; - register vector unsigned short - pixelssum1, pixelssum2, temp3, - pixelssum3, pixelssum4, temp4; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - -POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum3 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum3 = vec_add(pixelssum3, vctwo); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - - for (i = 0; i < h ; i++) { - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - - pixelssum4 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp4 = vec_add(pixelssum3, pixelssum4); - temp4 = vec_sra(temp4, vctwo); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - - pixelssum3 = vec_add(pixelssum4, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - - blockv = vec_packsu(temp3, temp4); - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); -} - -/* next one assumes that ((line_size % 16) == 0) */ -void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); - 
register int i; - register vector unsigned char - pixelsv1, pixelsv2, pixelsv3, pixelsv4; - register vector unsigned char - blockv, temp1, temp2; - register vector unsigned short - pixelssum1, pixelssum2, temp3, - pixelssum3, pixelssum4, temp4; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); - register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); - -POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum3 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum3 = vec_add(pixelssum3, vcone); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vcone); - - for (i = 0; i < h ; i++) { - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } - else - { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv3 = vec_mergel(vczero, pixelsv1); - pixelsv4 = vec_mergel(vczero, pixelsv2); - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - - pixelssum4 = vec_add((vector unsigned short)pixelsv3, - (vector unsigned short)pixelsv4); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp4 = vec_add(pixelssum3, pixelssum4); - temp4 = vec_sra(temp4, vctwo); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - - pixelssum3 = vec_add(pixelssum4, vcone); - pixelssum1 = vec_add(pixelssum2, vcone); - - blockv = vec_packsu(temp3, temp4); - - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); -} - -int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ -POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1); - int sum; - register const vector unsigned char vzero = - (const vector unsigned char)vec_splat_u8(0); - register vector signed short temp0, temp1, temp2, temp3, temp4, - temp5, temp6, temp7; -POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); - { - register const vector signed short vprod1 =(const vector signed short) - AVV( 1,-1, 1,-1, 1,-1, 1,-1); - register const vector signed short vprod2 =(const vector signed short) - AVV( 1, 1,-1,-1, 1, 1,-1,-1); - register const vector signed short vprod3 =(const vector signed short) - AVV( 1, 1, 1, 1,-1,-1,-1,-1); - register const vector unsigned char perm1 = (const vector unsigned char) - AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, - 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D); - register const vector unsigned char perm2 = (const vector unsigned char) - AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, - 
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B); - register const vector unsigned char perm3 = (const vector unsigned char) - AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - -#define ONEITERBUTTERFLY(i, res) \ - { \ - register vector unsigned char src1, src2, srcO; \ - register vector unsigned char dst1, dst2, dstO; \ - register vector signed short srcV, dstV; \ - register vector signed short but0, but1, but2, op1, op2, op3; \ - src1 = vec_ld(stride * i, src); \ - src2 = vec_ld((stride * i) + 15, src); \ - srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ - dst1 = vec_ld(stride * i, dst); \ - dst2 = vec_ld((stride * i) + 15, dst); \ - dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ - /* promote the unsigned chars to signed shorts */ \ - /* we're in the 8x8 function, we only care for the first 8 */ \ - srcV = \ - (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)srcO); \ - dstV = \ - (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)dstO); \ - /* subtractions inside the first butterfly */ \ - but0 = vec_sub(srcV, dstV); \ - op1 = vec_perm(but0, but0, perm1); \ - but1 = vec_mladd(but0, vprod1, op1); \ - op2 = vec_perm(but1, but1, perm2); \ - but2 = vec_mladd(but1, vprod2, op2); \ - op3 = vec_perm(but2, but2, perm3); \ - res = vec_mladd(but2, vprod3, op3); \ - } - ONEITERBUTTERFLY(0, temp0); - ONEITERBUTTERFLY(1, temp1); - ONEITERBUTTERFLY(2, temp2); - ONEITERBUTTERFLY(3, temp3); - ONEITERBUTTERFLY(4, temp4); - ONEITERBUTTERFLY(5, temp5); - ONEITERBUTTERFLY(6, temp6); - ONEITERBUTTERFLY(7, temp7); - } -#undef ONEITERBUTTERFLY - { - register vector signed int vsum; - register vector signed short line0 = vec_add(temp0, temp1); - register vector signed short line1 = vec_sub(temp0, temp1); - register vector signed short line2 = vec_add(temp2, temp3); - register vector signed short line3 = vec_sub(temp2, temp3); - register vector signed short line4 = vec_add(temp4, temp5); - register vector signed short line5 = vec_sub(temp4, temp5); - register vector signed short line6 = vec_add(temp6, temp7); - register vector signed short line7 = vec_sub(temp6, temp7); - - register vector signed short line0B = vec_add(line0, line2); - register vector signed short line2B = vec_sub(line0, line2); - register vector signed short line1B = vec_add(line1, line3); - register vector signed short line3B = vec_sub(line1, line3); - register vector signed short line4B = vec_add(line4, line6); - register vector signed short line6B = vec_sub(line4, line6); - register vector signed short line5B = vec_add(line5, line7); - register vector signed short line7B = vec_sub(line5, line7); - - register vector signed short line0C = vec_add(line0B, line4B); - register vector signed short line4C = vec_sub(line0B, line4B); - register vector signed short line1C = vec_add(line1B, line5B); - register vector signed short line5C = vec_sub(line1B, line5B); - register vector signed short line2C = vec_add(line2B, line6B); - register vector signed short line6C = vec_sub(line2B, line6B); - register vector signed short line3C = vec_add(line3B, line7B); - register vector signed short line7C = vec_sub(line3B, line7B); - - vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); - vsum = vec_sum4s(vec_abs(line1C), vsum); - vsum = vec_sum4s(vec_abs(line2C), vsum); - vsum = vec_sum4s(vec_abs(line3C), vsum); - vsum = vec_sum4s(vec_abs(line4C), vsum); - vsum = vec_sum4s(vec_abs(line5C), vsum); - vsum = 
vec_sum4s(vec_abs(line6C), vsum); - vsum = vec_sum4s(vec_abs(line7C), vsum); - vsum = vec_sums(vsum, (vector signed int)vzero); - vsum = vec_splat(vsum, 3); - vec_ste(vsum, 0, &sum); - } -POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1); - return sum; -} - -/* - 16x8 works with 16 elements ; it allows to avoid replicating - loads, and give the compiler more rooms for scheduling. - It's only used from inside hadamard8_diff16_altivec. - - Unfortunately, it seems gcc-3.3 is a bit dumb, and - the compiled code has a LOT of spill code, it seems - gcc (unlike xlc) cannot keep everything in registers - by itself. The following code include hand-made - registers allocation. It's not clean, but on - a 7450 the resulting code is much faster (best case - fall from 700+ cycles to 550). - - xlc doesn't add spill code, but it doesn't know how to - schedule for the 7450, and its code isn't much faster than - gcc-3.3 on the 7450 (but uses 25% less instructions...) - - On the 970, the hand-made RA is still a win (around 690 - vs. around 780), but xlc goes to around 660 on the - regular C code... -*/ - -static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) { - int sum; - register vector signed short - temp0 REG_v(v0), - temp1 REG_v(v1), - temp2 REG_v(v2), - temp3 REG_v(v3), - temp4 REG_v(v4), - temp5 REG_v(v5), - temp6 REG_v(v6), - temp7 REG_v(v7); - register vector signed short - temp0S REG_v(v8), - temp1S REG_v(v9), - temp2S REG_v(v10), - temp3S REG_v(v11), - temp4S REG_v(v12), - temp5S REG_v(v13), - temp6S REG_v(v14), - temp7S REG_v(v15); - register const vector unsigned char vzero REG_v(v31)= - (const vector unsigned char)vec_splat_u8(0); - { - register const vector signed short vprod1 REG_v(v16)= - (const vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1); - register const vector signed short vprod2 REG_v(v17)= - (const vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1); - register const vector signed short vprod3 REG_v(v18)= - (const vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1); - register const vector unsigned char perm1 REG_v(v19)= - (const vector unsigned char) - AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, - 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D); - register const vector unsigned char perm2 REG_v(v20)= - (const vector unsigned char) - AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, - 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B); - register const vector unsigned char perm3 REG_v(v21)= - (const vector unsigned char) - AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); - -#define ONEITERBUTTERFLY(i, res1, res2) \ - { \ - register vector unsigned char src1 REG_v(v22), \ - src2 REG_v(v23), \ - dst1 REG_v(v24), \ - dst2 REG_v(v25), \ - srcO REG_v(v22), \ - dstO REG_v(v23); \ - \ - register vector signed short srcV REG_v(v24), \ - dstV REG_v(v25), \ - srcW REG_v(v26), \ - dstW REG_v(v27), \ - but0 REG_v(v28), \ - but0S REG_v(v29), \ - op1 REG_v(v30), \ - but1 REG_v(v22), \ - op1S REG_v(v23), \ - but1S REG_v(v24), \ - op2 REG_v(v25), \ - but2 REG_v(v26), \ - op2S REG_v(v27), \ - but2S REG_v(v28), \ - op3 REG_v(v29), \ - op3S REG_v(v30); \ - \ - src1 = vec_ld(stride * i, src); \ - src2 = vec_ld((stride * i) + 16, src); \ - srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \ - dst1 = vec_ld(stride * i, dst); \ - dst2 = vec_ld((stride * i) + 16, dst); \ - dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \ - /* promote the unsigned chars to signed shorts 
*/ \ - srcV = \ - (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)srcO); \ - dstV = \ - (vector signed short)vec_mergeh((vector signed char)vzero, \ - (vector signed char)dstO); \ - srcW = \ - (vector signed short)vec_mergel((vector signed char)vzero, \ - (vector signed char)srcO); \ - dstW = \ - (vector signed short)vec_mergel((vector signed char)vzero, \ - (vector signed char)dstO); \ - /* subtractions inside the first butterfly */ \ - but0 = vec_sub(srcV, dstV); \ - but0S = vec_sub(srcW, dstW); \ - op1 = vec_perm(but0, but0, perm1); \ - but1 = vec_mladd(but0, vprod1, op1); \ - op1S = vec_perm(but0S, but0S, perm1); \ - but1S = vec_mladd(but0S, vprod1, op1S); \ - op2 = vec_perm(but1, but1, perm2); \ - but2 = vec_mladd(but1, vprod2, op2); \ - op2S = vec_perm(but1S, but1S, perm2); \ - but2S = vec_mladd(but1S, vprod2, op2S); \ - op3 = vec_perm(but2, but2, perm3); \ - res1 = vec_mladd(but2, vprod3, op3); \ - op3S = vec_perm(but2S, but2S, perm3); \ - res2 = vec_mladd(but2S, vprod3, op3S); \ - } - ONEITERBUTTERFLY(0, temp0, temp0S); - ONEITERBUTTERFLY(1, temp1, temp1S); - ONEITERBUTTERFLY(2, temp2, temp2S); - ONEITERBUTTERFLY(3, temp3, temp3S); - ONEITERBUTTERFLY(4, temp4, temp4S); - ONEITERBUTTERFLY(5, temp5, temp5S); - ONEITERBUTTERFLY(6, temp6, temp6S); - ONEITERBUTTERFLY(7, temp7, temp7S); - } -#undef ONEITERBUTTERFLY - { - register vector signed int vsum; - register vector signed short line0S, line1S, line2S, line3S, line4S, - line5S, line6S, line7S, line0BS,line2BS, - line1BS,line3BS,line4BS,line6BS,line5BS, - line7BS,line0CS,line4CS,line1CS,line5CS, - line2CS,line6CS,line3CS,line7CS; - - register vector signed short line0 = vec_add(temp0, temp1); - register vector signed short line1 = vec_sub(temp0, temp1); - register vector signed short line2 = vec_add(temp2, temp3); - register vector signed short line3 = vec_sub(temp2, temp3); - register vector signed short line4 = vec_add(temp4, temp5); - register vector signed short line5 = vec_sub(temp4, temp5); - register vector signed short line6 = vec_add(temp6, temp7); - register vector signed short line7 = vec_sub(temp6, temp7); - - register vector signed short line0B = vec_add(line0, line2); - register vector signed short line2B = vec_sub(line0, line2); - register vector signed short line1B = vec_add(line1, line3); - register vector signed short line3B = vec_sub(line1, line3); - register vector signed short line4B = vec_add(line4, line6); - register vector signed short line6B = vec_sub(line4, line6); - register vector signed short line5B = vec_add(line5, line7); - register vector signed short line7B = vec_sub(line5, line7); - - register vector signed short line0C = vec_add(line0B, line4B); - register vector signed short line4C = vec_sub(line0B, line4B); - register vector signed short line1C = vec_add(line1B, line5B); - register vector signed short line5C = vec_sub(line1B, line5B); - register vector signed short line2C = vec_add(line2B, line6B); - register vector signed short line6C = vec_sub(line2B, line6B); - register vector signed short line3C = vec_add(line3B, line7B); - register vector signed short line7C = vec_sub(line3B, line7B); - - vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0)); - vsum = vec_sum4s(vec_abs(line1C), vsum); - vsum = vec_sum4s(vec_abs(line2C), vsum); - vsum = vec_sum4s(vec_abs(line3C), vsum); - vsum = vec_sum4s(vec_abs(line4C), vsum); - vsum = vec_sum4s(vec_abs(line5C), vsum); - vsum = vec_sum4s(vec_abs(line6C), vsum); - vsum = vec_sum4s(vec_abs(line7C), vsum); - - line0S = 
vec_add(temp0S, temp1S); - line1S = vec_sub(temp0S, temp1S); - line2S = vec_add(temp2S, temp3S); - line3S = vec_sub(temp2S, temp3S); - line4S = vec_add(temp4S, temp5S); - line5S = vec_sub(temp4S, temp5S); - line6S = vec_add(temp6S, temp7S); - line7S = vec_sub(temp6S, temp7S); - - line0BS = vec_add(line0S, line2S); - line2BS = vec_sub(line0S, line2S); - line1BS = vec_add(line1S, line3S); - line3BS = vec_sub(line1S, line3S); - line4BS = vec_add(line4S, line6S); - line6BS = vec_sub(line4S, line6S); - line5BS = vec_add(line5S, line7S); - line7BS = vec_sub(line5S, line7S); - - line0CS = vec_add(line0BS, line4BS); - line4CS = vec_sub(line0BS, line4BS); - line1CS = vec_add(line1BS, line5BS); - line5CS = vec_sub(line1BS, line5BS); - line2CS = vec_add(line2BS, line6BS); - line6CS = vec_sub(line2BS, line6BS); - line3CS = vec_add(line3BS, line7BS); - line7CS = vec_sub(line3BS, line7BS); - - vsum = vec_sum4s(vec_abs(line0CS), vsum); - vsum = vec_sum4s(vec_abs(line1CS), vsum); - vsum = vec_sum4s(vec_abs(line2CS), vsum); - vsum = vec_sum4s(vec_abs(line3CS), vsum); - vsum = vec_sum4s(vec_abs(line4CS), vsum); - vsum = vec_sum4s(vec_abs(line5CS), vsum); - vsum = vec_sum4s(vec_abs(line6CS), vsum); - vsum = vec_sum4s(vec_abs(line7CS), vsum); - vsum = vec_sums(vsum, (vector signed int)vzero); - vsum = vec_splat(vsum, 3); - vec_ste(vsum, 0, &sum); - } - return sum; -} - -int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ -POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1); - int score; -POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1); - score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8); - if (h==16) { - dst += 8*stride; - src += 8*stride; - score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8); - } -POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1); - return score; -} - -static void vorbis_inverse_coupling_altivec(float *mag, float *ang, - int blocksize) -{ - int i; - vector float m, a; - vector bool int t0, t1; - const vector unsigned int v_31 = //XXX - vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1)); - for(i=0; i<blocksize; i+=4) { - m = vec_ld(0, mag+i); - a = vec_ld(0, ang+i); - t0 = vec_cmple(m, (vector float)vec_splat_u32(0)); - t1 = vec_cmple(a, (vector float)vec_splat_u32(0)); - a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31)); - t0 = (vector bool int)vec_and(a, t1); - t1 = (vector bool int)vec_andc(a, t1); - a = vec_sub(m, (vector float)t1); - m = vec_add(m, (vector float)t0); - vec_stl(a, 0, ang+i); - vec_stl(m, 0, mag+i); - } -} - -/* next one assumes that ((line_size % 8) == 0) */ -void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) -{ -POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1); - register int i; - register vector unsigned char pixelsv1, pixelsv2, pixelsavg; - register vector unsigned char blockv, temp1, temp2, blocktemp; - register vector unsigned short pixelssum1, pixelssum2, temp3; - - register const vector unsigned char vczero = (const vector unsigned char) - vec_splat_u8(0); - register const vector unsigned short vctwo = (const vector unsigned short) - vec_splat_u16(2); - - temp1 = vec_ld(0, pixels); - temp2 = vec_ld(16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); - if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); - } - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, 
pixelsv2); - pixelssum1 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - pixelssum1 = vec_add(pixelssum1, vctwo); - -POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1); - for (i = 0; i < h ; i++) { - int rightside = ((unsigned long)block & 0x0000000F); - blockv = vec_ld(0, block); - - temp1 = vec_ld(line_size, pixels); - temp2 = vec_ld(line_size + 16, pixels); - pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); - if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) - { - pixelsv2 = temp2; - } else { - pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); - } - - pixelsv1 = vec_mergeh(vczero, pixelsv1); - pixelsv2 = vec_mergeh(vczero, pixelsv2); - pixelssum2 = vec_add((vector unsigned short)pixelsv1, - (vector unsigned short)pixelsv2); - temp3 = vec_add(pixelssum1, pixelssum2); - temp3 = vec_sra(temp3, vctwo); - pixelssum1 = vec_add(pixelssum2, vctwo); - pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); - - if (rightside) { - blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); - } else { - blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); - } - - blockv = vec_avg(blocktemp, blockv); - vec_st(blockv, 0, block); - - block += line_size; - pixels += line_size; - } - -POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1); -} - -void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx) -{ - c->pix_abs[0][1] = sad16_x2_altivec; - c->pix_abs[0][2] = sad16_y2_altivec; - c->pix_abs[0][3] = sad16_xy2_altivec; - c->pix_abs[0][0] = sad16_altivec; - c->pix_abs[1][0] = sad8_altivec; - c->sad[0]= sad16_altivec; - c->sad[1]= sad8_altivec; - c->pix_norm1 = pix_norm1_altivec; - c->sse[1]= sse8_altivec; - c->sse[0]= sse16_altivec; - c->pix_sum = pix_sum_altivec; - c->diff_pixels = diff_pixels_altivec; - c->get_pixels = get_pixels_altivec; - c->add_bytes= add_bytes_altivec; - c->put_pixels_tab[0][0] = put_pixels16_altivec; - /* the two functions do the same thing, so use the same code */ - c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec; - c->avg_pixels_tab[0][0] = avg_pixels16_altivec; - c->avg_pixels_tab[1][0] = avg_pixels8_altivec; - c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec; - c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec; - c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec; - c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec; - c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec; - - c->hadamard8_diff[0] = hadamard8_diff16_altivec; - c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; - if (ENABLE_VORBIS_DECODER) - c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec; -} diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h deleted file mode 100644 index 43bd5abab..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef FFMPEG_DSPUTIL_ALTIVEC_H -#define FFMPEG_DSPUTIL_ALTIVEC_H - -#include <stdint.h> - -extern int has_altivec(void); - -void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); - -#endif /* FFMPEG_DSPUTIL_ALTIVEC_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c deleted file mode 100644 index 13dea06a1..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (c) 2002 Brian Foley - * Copyright (c) 2002 Dieter Shirley - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "dsputil.h" - -#include "dsputil_ppc.h" - -#ifdef HAVE_ALTIVEC -#include "dsputil_altivec.h" - -extern void fdct_altivec(int16_t *block); -extern void gmc1_altivec(uint8_t *dst, uint8_t *src, int stride, int h, - int x16, int y16, int rounder); -extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); -extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); - -void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx); - -void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx); -void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx); -void snow_init_altivec(DSPContext* c, AVCodecContext *avctx); -void float_init_altivec(DSPContext* c, AVCodecContext *avctx); -void int_init_altivec(DSPContext* c, AVCodecContext *avctx); - -#endif - -int mm_flags = 0; - -int mm_support(void) -{ - int result = 0; -#ifdef HAVE_ALTIVEC - if (has_altivec()) { - result |= MM_ALTIVEC; - } -#endif /* result */ - return result; -} - -#ifdef CONFIG_POWERPC_PERF -unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; -/* list below must match enum in dsputil_ppc.h */ -static unsigned char* perfname[] = { - "ff_fft_calc_altivec", - "gmc1_altivec", - "dct_unquantize_h263_altivec", - "fdct_altivec", - "idct_add_altivec", - "idct_put_altivec", - "put_pixels16_altivec", - "avg_pixels16_altivec", - "avg_pixels8_altivec", - "put_pixels8_xy2_altivec", - "put_no_rnd_pixels8_xy2_altivec", - "put_pixels16_xy2_altivec", - "put_no_rnd_pixels16_xy2_altivec", - 
"hadamard8_diff8x8_altivec", - "hadamard8_diff16_altivec", - "avg_pixels8_xy2_altivec", - "clear_blocks_dcbz32_ppc", - "clear_blocks_dcbz128_ppc", - "put_h264_chroma_mc8_altivec", - "avg_h264_chroma_mc8_altivec", - "put_h264_qpel16_h_lowpass_altivec", - "avg_h264_qpel16_h_lowpass_altivec", - "put_h264_qpel16_v_lowpass_altivec", - "avg_h264_qpel16_v_lowpass_altivec", - "put_h264_qpel16_hv_lowpass_altivec", - "avg_h264_qpel16_hv_lowpass_altivec", - "" -}; -#include <stdio.h> -#endif - -#ifdef CONFIG_POWERPC_PERF -void powerpc_display_perf_report(void) -{ - int i, j; - av_log(NULL, AV_LOG_INFO, "PowerPC performance report\n Values are from the PMC registers, and represent whatever the registers are set to record.\n"); - for(i = 0 ; i < powerpc_perf_total ; i++) - { - for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) - { - if (perfdata[j][i][powerpc_data_num] != (unsigned long long)0) - av_log(NULL, AV_LOG_INFO, - " Function \"%s\" (pmc%d):\n\tmin: %"PRIu64"\n\tmax: %"PRIu64"\n\tavg: %1.2lf (%"PRIu64")\n", - perfname[i], - j+1, - perfdata[j][i][powerpc_data_min], - perfdata[j][i][powerpc_data_max], - (double)perfdata[j][i][powerpc_data_sum] / - (double)perfdata[j][i][powerpc_data_num], - perfdata[j][i][powerpc_data_num]); - } - } -} -#endif /* CONFIG_POWERPC_PERF */ - -/* ***** WARNING ***** WARNING ***** WARNING ***** */ -/* - clear_blocks_dcbz32_ppc will not work properly - on PowerPC processors with a cache line size - not equal to 32 bytes. - Fortunately all processor used by Apple up to - at least the 7450 (aka second generation G4) - use 32 bytes cache line. - This is due to the use of the 'dcbz' instruction. - It simply clear to zero a single cache line, - so you need to know the cache line size to use it ! - It's absurd, but it's fast... - - update 24/06/2003 : Apple released yesterday the G5, - with a PPC970. cache line size : 128 bytes. Oups. - The semantic of dcbz was changed, it always clear - 32 bytes. so the function below will work, but will - be slow. So I fixed check_dcbz_effect to use dcbzl, - which is defined to clear a cache line (as dcbz before). - So we still can distinguish, and use dcbz (32 bytes) - or dcbzl (one cache line) as required. - - see <http://developer.apple.com/technotes/tn/tn2087.html> - and <http://developer.apple.com/technotes/tn/tn2086.html> -*/ -void clear_blocks_dcbz32_ppc(DCTELEM *blocks) -{ -POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz32, 1); - register int misal = ((unsigned long)blocks & 0x00000010); - register int i = 0; -POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1); -#if 1 - if (misal) { - ((unsigned long*)blocks)[0] = 0L; - ((unsigned long*)blocks)[1] = 0L; - ((unsigned long*)blocks)[2] = 0L; - ((unsigned long*)blocks)[3] = 0L; - i += 16; - } - for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) { - asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); - } - if (misal) { - ((unsigned long*)blocks)[188] = 0L; - ((unsigned long*)blocks)[189] = 0L; - ((unsigned long*)blocks)[190] = 0L; - ((unsigned long*)blocks)[191] = 0L; - i += 16; - } -#else - memset(blocks, 0, sizeof(DCTELEM)*6*64); -#endif -POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz32, 1); -} - -/* same as above, when dcbzl clear a whole 128B cache line - i.e. 
the PPC970 aka G5 */ -#ifdef HAVE_DCBZL -void clear_blocks_dcbz128_ppc(DCTELEM *blocks) -{ -POWERPC_PERF_DECLARE(powerpc_clear_blocks_dcbz128, 1); - register int misal = ((unsigned long)blocks & 0x0000007f); - register int i = 0; -POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz128, 1); -#if 1 - if (misal) { - // we could probably also optimize this case, - // but there's not much point as the machines - // aren't available yet (2003-06-26) - memset(blocks, 0, sizeof(DCTELEM)*6*64); - } - else - for ( ; i < sizeof(DCTELEM)*6*64 ; i += 128) { - asm volatile("dcbzl %0,%1" : : "b" (blocks), "r" (i) : "memory"); - } -#else - memset(blocks, 0, sizeof(DCTELEM)*6*64); -#endif -POWERPC_PERF_STOP_COUNT(powerpc_clear_blocks_dcbz128, 1); -} -#else -void clear_blocks_dcbz128_ppc(DCTELEM *blocks) -{ - memset(blocks, 0, sizeof(DCTELEM)*6*64); -} -#endif - -#ifdef HAVE_DCBZL -/* check dcbz report how many bytes are set to 0 by dcbz */ -/* update 24/06/2003 : replace dcbz by dcbzl to get - the intended effect (Apple "fixed" dcbz) - unfortunately this cannot be used unless the assembler - knows about dcbzl ... */ -long check_dcbzl_effect(void) -{ - register char *fakedata = av_malloc(1024); - register char *fakedata_middle; - register long zero = 0; - register long i = 0; - long count = 0; - - if (!fakedata) - { - return 0L; - } - - fakedata_middle = (fakedata + 512); - - memset(fakedata, 0xFF, 1024); - - /* below the constraint "b" seems to mean "Address base register" - in gcc-3.3 / RS/6000 speaks. seems to avoid using r0, so.... */ - asm volatile("dcbzl %0, %1" : : "b" (fakedata_middle), "r" (zero)); - - for (i = 0; i < 1024 ; i ++) - { - if (fakedata[i] == (char)0) - count++; - } - - av_free(fakedata); - - return count; -} -#else -long check_dcbzl_effect(void) -{ - return 0; -} -#endif - -static void prefetch_ppc(void *mem, int stride, int h) -{ - register const uint8_t *p = mem; - do { - asm volatile ("dcbt 0,%0" : : "r" (p)); - p+= stride; - } while(--h); -} - -void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) -{ - // Common optimizations whether AltiVec is available or not - c->prefetch = prefetch_ppc; - switch (check_dcbzl_effect()) { - case 32: - c->clear_blocks = clear_blocks_dcbz32_ppc; - break; - case 128: - c->clear_blocks = clear_blocks_dcbz128_ppc; - break; - default: - break; - } - -#ifdef HAVE_ALTIVEC - if(ENABLE_H264_DECODER) dsputil_h264_init_ppc(c, avctx); - - if (has_altivec()) { - mm_flags |= MM_ALTIVEC; - - dsputil_init_altivec(c, avctx); - if(ENABLE_SNOW_DECODER) snow_init_altivec(c, avctx); - if(ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER) - vc1dsp_init_altivec(c, avctx); - float_init_altivec(c, avctx); - int_init_altivec(c, avctx); - c->gmc1 = gmc1_altivec; - -#ifdef CONFIG_ENCODERS - if (avctx->dct_algo == FF_DCT_AUTO || - avctx->dct_algo == FF_DCT_ALTIVEC) - { - c->fdct = fdct_altivec; - } -#endif //CONFIG_ENCODERS - - if (avctx->lowres==0) - { - if ((avctx->idct_algo == FF_IDCT_AUTO) || - (avctx->idct_algo == FF_IDCT_ALTIVEC)) - { - c->idct_put = idct_put_altivec; - c->idct_add = idct_add_altivec; - c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - } - } - -#ifdef CONFIG_POWERPC_PERF - { - int i, j; - for (i = 0 ; i < powerpc_perf_total ; i++) - { - for (j = 0; j < POWERPC_NUM_PMC_ENABLED ; j++) - { - perfdata[j][i][powerpc_data_min] = 0xFFFFFFFFFFFFFFFFULL; - perfdata[j][i][powerpc_data_max] = 0x0000000000000000ULL; - perfdata[j][i][powerpc_data_sum] = 0x0000000000000000ULL; - perfdata[j][i][powerpc_data_num] = 0x0000000000000000ULL; - } - } - } -#endif /* 
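check_dcbzl_effect() above detects the effective cache-line size empirically: fill a buffer with 0xFF, clear a single line in the middle with dcbzl, then count how many bytes actually became zero (32 on a G4-class line, 128 on a PPC970/G5). A portable sketch of the same probe follows; because dcbzl is PowerPC-only inline assembly, the zeroing primitive is stubbed with a 32-byte memset purely so the harness runs anywhere — that 32 is an assumption for illustration, not a measurement.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Stand-in for the real primitive: the deleted code issues a single
     * "dcbzl 0,ptr" here.  Zeroing 32 bytes keeps the probe runnable on
     * any host. */
    static void zero_one_cache_line(unsigned char *p)
    {
        memset(p, 0, 32);
    }

    /* Same idea as check_dcbzl_effect(): how many bytes does one
     * cache-line clear really touch? */
    static long probe_cache_line_bytes(void)
    {
        unsigned char *buf = malloc(1024);
        long count = 0;
        if (!buf)
            return 0;
        memset(buf, 0xFF, 1024);
        zero_one_cache_line(buf + 512);
        for (long i = 0; i < 1024; i++)
            if (buf[i] == 0)
                count++;
        free(buf);
        return count;
    }

    int main(void)
    {
        printf("dcbz-style clear touched %ld bytes\n", probe_cache_line_bytes());
        return 0;
    }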
CONFIG_POWERPC_PERF */ - } -#endif /* HAVE_ALTIVEC */ -} diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h deleted file mode 100644 index d8f6b27f9..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef FFMPEG_DSPUTIL_PPC_H -#define FFMPEG_DSPUTIL_PPC_H - -#ifdef CONFIG_POWERPC_PERF -void powerpc_display_perf_report(void); -/* the 604* have 2, the G3* have 4, the G4s have 6, - and the G5 are completely different (they MUST use - HAVE_PPC64, and let's hope all future 64 bis PPC - will use the same PMCs... */ -#define POWERPC_NUM_PMC_ENABLED 6 -/* if you add to the enum below, also add to the perfname array - in dsputil_ppc.c */ -enum powerpc_perf_index { - altivec_fft_num = 0, - altivec_gmc1_num, - altivec_dct_unquantize_h263_num, - altivec_fdct, - altivec_idct_add_num, - altivec_idct_put_num, - altivec_put_pixels16_num, - altivec_avg_pixels16_num, - altivec_avg_pixels8_num, - altivec_put_pixels8_xy2_num, - altivec_put_no_rnd_pixels8_xy2_num, - altivec_put_pixels16_xy2_num, - altivec_put_no_rnd_pixels16_xy2_num, - altivec_hadamard8_diff8x8_num, - altivec_hadamard8_diff16_num, - altivec_avg_pixels8_xy2_num, - powerpc_clear_blocks_dcbz32, - powerpc_clear_blocks_dcbz128, - altivec_put_h264_chroma_mc8_num, - altivec_avg_h264_chroma_mc8_num, - altivec_put_h264_qpel16_h_lowpass_num, - altivec_avg_h264_qpel16_h_lowpass_num, - altivec_put_h264_qpel16_v_lowpass_num, - altivec_avg_h264_qpel16_v_lowpass_num, - altivec_put_h264_qpel16_hv_lowpass_num, - altivec_avg_h264_qpel16_hv_lowpass_num, - powerpc_perf_total -}; -enum powerpc_data_index { - powerpc_data_min = 0, - powerpc_data_max, - powerpc_data_sum, - powerpc_data_num, - powerpc_data_total -}; -extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; - -#ifndef HAVE_PPC64 -#define POWERP_PMC_DATATYPE unsigned long -#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 937" : "=r" (a)) -#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 938" : "=r" (a)) -#if (POWERPC_NUM_PMC_ENABLED > 2) -#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 941" : "=r" (a)) -#define POWERPC_GET_PMC4(a) asm volatile("mfspr %0, 942" : "=r" (a)) -#else -#define POWERPC_GET_PMC3(a) do {} while (0) -#define POWERPC_GET_PMC4(a) do {} while (0) -#endif -#if (POWERPC_NUM_PMC_ENABLED > 4) -#define POWERPC_GET_PMC5(a) asm volatile("mfspr %0, 929" : "=r" (a)) -#define POWERPC_GET_PMC6(a) asm volatile("mfspr %0, 930" : "=r" (a)) -#else -#define POWERPC_GET_PMC5(a) do {} while (0) -#define POWERPC_GET_PMC6(a) do {} while (0) -#endif -#else /* HAVE_PPC64 */ -#define POWERP_PMC_DATATYPE unsigned long long 
-#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 771" : "=r" (a)) -#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 772" : "=r" (a)) -#if (POWERPC_NUM_PMC_ENABLED > 2) -#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 773" : "=r" (a)) -#define POWERPC_GET_PMC4(a) asm volatile("mfspr %0, 774" : "=r" (a)) -#else -#define POWERPC_GET_PMC3(a) do {} while (0) -#define POWERPC_GET_PMC4(a) do {} while (0) -#endif -#if (POWERPC_NUM_PMC_ENABLED > 4) -#define POWERPC_GET_PMC5(a) asm volatile("mfspr %0, 775" : "=r" (a)) -#define POWERPC_GET_PMC6(a) asm volatile("mfspr %0, 776" : "=r" (a)) -#else -#define POWERPC_GET_PMC5(a) do {} while (0) -#define POWERPC_GET_PMC6(a) do {} while (0) -#endif -#endif /* HAVE_PPC64 */ -#define POWERPC_PERF_DECLARE(a, cond) \ - POWERP_PMC_DATATYPE \ - pmc_start[POWERPC_NUM_PMC_ENABLED], \ - pmc_stop[POWERPC_NUM_PMC_ENABLED], \ - pmc_loop_index; -#define POWERPC_PERF_START_COUNT(a, cond) do { \ - POWERPC_GET_PMC6(pmc_start[5]); \ - POWERPC_GET_PMC5(pmc_start[4]); \ - POWERPC_GET_PMC4(pmc_start[3]); \ - POWERPC_GET_PMC3(pmc_start[2]); \ - POWERPC_GET_PMC2(pmc_start[1]); \ - POWERPC_GET_PMC1(pmc_start[0]); \ - } while (0) -#define POWERPC_PERF_STOP_COUNT(a, cond) do { \ - POWERPC_GET_PMC1(pmc_stop[0]); \ - POWERPC_GET_PMC2(pmc_stop[1]); \ - POWERPC_GET_PMC3(pmc_stop[2]); \ - POWERPC_GET_PMC4(pmc_stop[3]); \ - POWERPC_GET_PMC5(pmc_stop[4]); \ - POWERPC_GET_PMC6(pmc_stop[5]); \ - if (cond) \ - { \ - for(pmc_loop_index = 0; \ - pmc_loop_index < POWERPC_NUM_PMC_ENABLED; \ - pmc_loop_index++) \ - { \ - if (pmc_stop[pmc_loop_index] >= pmc_start[pmc_loop_index]) \ - { \ - POWERP_PMC_DATATYPE diff = \ - pmc_stop[pmc_loop_index] - pmc_start[pmc_loop_index]; \ - if (diff < perfdata[pmc_loop_index][a][powerpc_data_min]) \ - perfdata[pmc_loop_index][a][powerpc_data_min] = diff; \ - if (diff > perfdata[pmc_loop_index][a][powerpc_data_max]) \ - perfdata[pmc_loop_index][a][powerpc_data_max] = diff; \ - perfdata[pmc_loop_index][a][powerpc_data_sum] += diff; \ - perfdata[pmc_loop_index][a][powerpc_data_num] ++; \ - } \ - } \ - } \ -} while (0) -#else /* CONFIG_POWERPC_PERF */ -// those are needed to avoid empty statements. -#define POWERPC_PERF_DECLARE(a, cond) int altivec_placeholder __attribute__ ((unused)) -#define POWERPC_PERF_START_COUNT(a, cond) do {} while (0) -#define POWERPC_PERF_STOP_COUNT(a, cond) do {} while (0) -#endif /* CONFIG_POWERPC_PERF */ - -#endif /* FFMPEG_DSPUTIL_PPC_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c b/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c deleted file mode 100644 index 6b9a35ba8..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c +++ /dev/null @@ -1,493 +0,0 @@ -/* ffmpeg/libavcodec/ppc/fdct_altivec.c, this file is part of the - * AltiVec optimized library for the FFMPEG Multimedia System - * Copyright (C) 2003 James Klicman <james@klicman.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
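The POWERPC_PERF_STOP_COUNT macro above folds each measured delta into per-function min/max/sum/count slots, which powerpc_display_perf_report() later prints as min, max and average. Here is a compact, portable sketch of that accumulation pattern, with clock() standing in for the performance-counter reads (the real header reads the PMC special-purpose registers with mfspr); names and the toy workload are illustrative.

    #include <stdio.h>
    #include <time.h>

    /* Portable stand-in for reading a performance counter. */
    static unsigned long long read_counter(void)
    {
        return (unsigned long long)clock();
    }

    struct perf_stats {
        unsigned long long min, max, sum, num;
    };

    static void perf_update(struct perf_stats *s, unsigned long long diff)
    {
        if (s->num == 0 || diff < s->min) s->min = diff;
        if (diff > s->max)                s->max = diff;
        s->sum += diff;
        s->num++;
    }

    int main(void)
    {
        struct perf_stats s = { 0, 0, 0, 0 };
        for (int run = 0; run < 8; run++) {
            unsigned long long start = read_counter();
            volatile double x = 0.0;                    /* toy workload */
            for (int i = 0; i < 100000; i++)
                x += i * 0.5;
            unsigned long long stop = read_counter();
            if (stop >= start)                          /* same guard as the macro */
                perf_update(&s, stop - start);
        }
        if (s.num)
            printf("min %llu max %llu avg %.2f (%llu runs)\n",
                   s.min, s.max, (double)s.sum / (double)s.num, s.num);
        return 0;
    }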
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - - -#include "common.h" -#include "dsputil.h" -#include "dsputil_ppc.h" -#include "gcc_fixes.h" - - -#define vs16(v) ((vector signed short)(v)) -#define vs32(v) ((vector signed int)(v)) -#define vu8(v) ((vector unsigned char)(v)) -#define vu16(v) ((vector unsigned short)(v)) -#define vu32(v) ((vector unsigned int)(v)) - - -#define C1 0.98078525066375732421875000 /* cos(1*PI/16) */ -#define C2 0.92387950420379638671875000 /* cos(2*PI/16) */ -#define C3 0.83146959543228149414062500 /* cos(3*PI/16) */ -#define C4 0.70710676908493041992187500 /* cos(4*PI/16) */ -#define C5 0.55557024478912353515625000 /* cos(5*PI/16) */ -#define C6 0.38268342614173889160156250 /* cos(6*PI/16) */ -#define C7 0.19509032368659973144531250 /* cos(7*PI/16) */ -#define SQRT_2 1.41421353816986083984375000 /* sqrt(2) */ - - -#define W0 -(2 * C2) -#define W1 (2 * C6) -#define W2 (SQRT_2 * C6) -#define W3 (SQRT_2 * C3) -#define W4 (SQRT_2 * (-C1 + C3 + C5 - C7)) -#define W5 (SQRT_2 * ( C1 + C3 - C5 + C7)) -#define W6 (SQRT_2 * ( C1 + C3 + C5 - C7)) -#define W7 (SQRT_2 * ( C1 + C3 - C5 - C7)) -#define W8 (SQRT_2 * ( C7 - C3)) -#define W9 (SQRT_2 * (-C1 - C3)) -#define WA (SQRT_2 * (-C3 - C5)) -#define WB (SQRT_2 * ( C5 - C3)) - - -static vector float fdctconsts[3] = { - (vector float)AVV( W0, W1, W2, W3 ), - (vector float)AVV( W4, W5, W6, W7 ), - (vector float)AVV( W8, W9, WA, WB ) -}; - -#define LD_W0 vec_splat(cnsts0, 0) -#define LD_W1 vec_splat(cnsts0, 1) -#define LD_W2 vec_splat(cnsts0, 2) -#define LD_W3 vec_splat(cnsts0, 3) -#define LD_W4 vec_splat(cnsts1, 0) -#define LD_W5 vec_splat(cnsts1, 1) -#define LD_W6 vec_splat(cnsts1, 2) -#define LD_W7 vec_splat(cnsts1, 3) -#define LD_W8 vec_splat(cnsts2, 0) -#define LD_W9 vec_splat(cnsts2, 1) -#define LD_WA vec_splat(cnsts2, 2) -#define LD_WB vec_splat(cnsts2, 3) - - -#define FDCTROW(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \ - x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \ - x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \ - x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \ - x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \ - x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \ - x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \ - x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \ - x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \ - \ - b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \ - b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \ - b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \ - b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \ - \ - b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \ - b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \ - b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \ - cnst = LD_W2; \ - b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \ - cnst = LD_W1; \ - b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \ - cnst = LD_W0; \ - b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \ - \ - x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \ - x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \ - x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \ - x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \ - x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \ - cnst = LD_W3; \ - x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \ - \ - cnst = LD_W8; \ - x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \ - cnst = LD_W9; \ - x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \ - cnst = LD_WA; \ - x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \ - cnst 
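The W0..WB constants above are fixed combinations of cos(k*pi/16) scaled by sqrt(2); the long decimal literals in the #defines are simply the single-precision roundings of those expressions. A small check program that recomputes them (compile with -lm):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const double pi = 3.14159265358979323846;
        double c[8], s2 = sqrt(2.0);
        for (int k = 1; k <= 7; k++)
            c[k] = cos(k * pi / 16.0);

        /* Same expressions as the W0..WB macros, printed as the
         * single-precision values the vector code actually loads. */
        printf("W0 = %.8f\n", (float)(-(2.0 * c[2])));
        printf("W1 = %.8f\n", (float)(2.0 * c[6]));
        printf("W2 = %.8f\n", (float)(s2 * c[6]));
        printf("W3 = %.8f\n", (float)(s2 * c[3]));
        printf("W4 = %.8f\n", (float)(s2 * (-c[1] + c[3] + c[5] - c[7])));
        printf("W5 = %.8f\n", (float)(s2 * ( c[1] + c[3] - c[5] + c[7])));
        printf("W6 = %.8f\n", (float)(s2 * ( c[1] + c[3] + c[5] - c[7])));
        printf("W7 = %.8f\n", (float)(s2 * ( c[1] + c[3] - c[5] - c[7])));
        printf("W8 = %.8f\n", (float)(s2 * ( c[7] - c[3])));
        printf("W9 = %.8f\n", (float)(s2 * (-c[1] - c[3])));
        printf("WA = %.8f\n", (float)(s2 * (-c[3] - c[5])));
        printf("WB = %.8f\n", (float)(s2 * ( c[5] - c[3])));
        return 0;
    }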
= LD_WB; \ - x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \ - \ - cnst = LD_W4; \ - b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \ - cnst = LD_W5; \ - b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \ - cnst = LD_W6; \ - b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \ - cnst = LD_W7; \ - b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \ - \ - b7 = vec_add(b7, x2); /* b7 = b7 + x2; */ \ - b5 = vec_add(b5, x3); /* b5 = b5 + x3; */ \ - b3 = vec_add(b3, x2); /* b3 = b3 + x2; */ \ - b1 = vec_add(b1, x3); /* b1 = b1 + x3; */ \ - /* }}} */ - -#define FDCTCOL(b0,b1,b2,b3,b4,b5,b6,b7) /* {{{ */ \ - x0 = vec_add(b0, b7); /* x0 = b0 + b7; */ \ - x7 = vec_sub(b0, b7); /* x7 = b0 - b7; */ \ - x1 = vec_add(b1, b6); /* x1 = b1 + b6; */ \ - x6 = vec_sub(b1, b6); /* x6 = b1 - b6; */ \ - x2 = vec_add(b2, b5); /* x2 = b2 + b5; */ \ - x5 = vec_sub(b2, b5); /* x5 = b2 - b5; */ \ - x3 = vec_add(b3, b4); /* x3 = b3 + b4; */ \ - x4 = vec_sub(b3, b4); /* x4 = b3 - b4; */ \ - \ - b7 = vec_add(x0, x3); /* b7 = x0 + x3; */ \ - b1 = vec_add(x1, x2); /* b1 = x1 + x2; */ \ - b0 = vec_add(b7, b1); /* b0 = b7 + b1; */ \ - b4 = vec_sub(b7, b1); /* b4 = b7 - b1; */ \ - \ - b2 = vec_sub(x0, x3); /* b2 = x0 - x3; */ \ - b6 = vec_sub(x1, x2); /* b6 = x1 - x2; */ \ - b5 = vec_add(b6, b2); /* b5 = b6 + b2; */ \ - cnst = LD_W2; \ - b5 = vec_madd(cnst, b5, mzero); /* b5 = b5 * W2; */ \ - cnst = LD_W1; \ - b2 = vec_madd(cnst, b2, b5); /* b2 = b5 + b2 * W1; */ \ - cnst = LD_W0; \ - b6 = vec_madd(cnst, b6, b5); /* b6 = b5 + b6 * W0; */ \ - \ - x0 = vec_add(x4, x7); /* x0 = x4 + x7; */ \ - x1 = vec_add(x5, x6); /* x1 = x5 + x6; */ \ - x2 = vec_add(x4, x6); /* x2 = x4 + x6; */ \ - x3 = vec_add(x5, x7); /* x3 = x5 + x7; */ \ - x8 = vec_add(x2, x3); /* x8 = x2 + x3; */ \ - cnst = LD_W3; \ - x8 = vec_madd(cnst, x8, mzero); /* x8 = x8 * W3; */ \ - \ - cnst = LD_W8; \ - x0 = vec_madd(cnst, x0, mzero); /* x0 *= W8; */ \ - cnst = LD_W9; \ - x1 = vec_madd(cnst, x1, mzero); /* x1 *= W9; */ \ - cnst = LD_WA; \ - x2 = vec_madd(cnst, x2, x8); /* x2 = x2 * WA + x8; */ \ - cnst = LD_WB; \ - x3 = vec_madd(cnst, x3, x8); /* x3 = x3 * WB + x8; */ \ - \ - cnst = LD_W4; \ - b7 = vec_madd(cnst, x4, x0); /* b7 = x4 * W4 + x0; */ \ - cnst = LD_W5; \ - b5 = vec_madd(cnst, x5, x1); /* b5 = x5 * W5 + x1; */ \ - cnst = LD_W6; \ - b3 = vec_madd(cnst, x6, x1); /* b3 = x6 * W6 + x1; */ \ - cnst = LD_W7; \ - b1 = vec_madd(cnst, x7, x0); /* b1 = x7 * W7 + x0; */ \ - \ - b7 = vec_add(b7, x2); /* b7 += x2; */ \ - b5 = vec_add(b5, x3); /* b5 += x3; */ \ - b3 = vec_add(b3, x2); /* b3 += x2; */ \ - b1 = vec_add(b1, x3); /* b1 += x3; */ \ - /* }}} */ - - - -/* two dimensional discrete cosine transform */ - -void fdct_altivec(int16_t *block) -{ -POWERPC_PERF_DECLARE(altivec_fdct, 1); - vector signed short *bp; - vector float *cp; - vector float b00, b10, b20, b30, b40, b50, b60, b70; - vector float b01, b11, b21, b31, b41, b51, b61, b71; - vector float mzero, cnst, cnsts0, cnsts1, cnsts2; - vector float x0, x1, x2, x3, x4, x5, x6, x7, x8; - - POWERPC_PERF_START_COUNT(altivec_fdct, 1); - - - /* setup constants {{{ */ - /* mzero = -0.0 */ - mzero = ((vector float)vec_splat_u32(-1)); - mzero = ((vector float)vec_sl(vu32(mzero), vu32(mzero))); - cp = fdctconsts; - cnsts0 = vec_ld(0, cp); cp++; - cnsts1 = vec_ld(0, cp); cp++; - cnsts2 = vec_ld(0, cp); - /* }}} */ - - - /* 8x8 matrix transpose (vector short[8]) {{{ */ -#define MERGE_S16(hl,a,b) vec_merge##hl(vs16(a), vs16(b)) - - bp = (vector signed short*)block; - b00 = ((vector 
float)vec_ld(0, bp)); - b40 = ((vector float)vec_ld(16*4, bp)); - b01 = ((vector float)MERGE_S16(h, b00, b40)); - b11 = ((vector float)MERGE_S16(l, b00, b40)); - bp++; - b10 = ((vector float)vec_ld(0, bp)); - b50 = ((vector float)vec_ld(16*4, bp)); - b21 = ((vector float)MERGE_S16(h, b10, b50)); - b31 = ((vector float)MERGE_S16(l, b10, b50)); - bp++; - b20 = ((vector float)vec_ld(0, bp)); - b60 = ((vector float)vec_ld(16*4, bp)); - b41 = ((vector float)MERGE_S16(h, b20, b60)); - b51 = ((vector float)MERGE_S16(l, b20, b60)); - bp++; - b30 = ((vector float)vec_ld(0, bp)); - b70 = ((vector float)vec_ld(16*4, bp)); - b61 = ((vector float)MERGE_S16(h, b30, b70)); - b71 = ((vector float)MERGE_S16(l, b30, b70)); - - x0 = ((vector float)MERGE_S16(h, b01, b41)); - x1 = ((vector float)MERGE_S16(l, b01, b41)); - x2 = ((vector float)MERGE_S16(h, b11, b51)); - x3 = ((vector float)MERGE_S16(l, b11, b51)); - x4 = ((vector float)MERGE_S16(h, b21, b61)); - x5 = ((vector float)MERGE_S16(l, b21, b61)); - x6 = ((vector float)MERGE_S16(h, b31, b71)); - x7 = ((vector float)MERGE_S16(l, b31, b71)); - - b00 = ((vector float)MERGE_S16(h, x0, x4)); - b10 = ((vector float)MERGE_S16(l, x0, x4)); - b20 = ((vector float)MERGE_S16(h, x1, x5)); - b30 = ((vector float)MERGE_S16(l, x1, x5)); - b40 = ((vector float)MERGE_S16(h, x2, x6)); - b50 = ((vector float)MERGE_S16(l, x2, x6)); - b60 = ((vector float)MERGE_S16(h, x3, x7)); - b70 = ((vector float)MERGE_S16(l, x3, x7)); - -#undef MERGE_S16 - /* }}} */ - - -/* Some of the initial calculations can be done as vector short before - * conversion to vector float. The following code section takes advantage - * of this. - */ -#if 1 - /* fdct rows {{{ */ - x0 = ((vector float)vec_add(vs16(b00), vs16(b70))); - x7 = ((vector float)vec_sub(vs16(b00), vs16(b70))); - x1 = ((vector float)vec_add(vs16(b10), vs16(b60))); - x6 = ((vector float)vec_sub(vs16(b10), vs16(b60))); - x2 = ((vector float)vec_add(vs16(b20), vs16(b50))); - x5 = ((vector float)vec_sub(vs16(b20), vs16(b50))); - x3 = ((vector float)vec_add(vs16(b30), vs16(b40))); - x4 = ((vector float)vec_sub(vs16(b30), vs16(b40))); - - b70 = ((vector float)vec_add(vs16(x0), vs16(x3))); - b10 = ((vector float)vec_add(vs16(x1), vs16(x2))); - - b00 = ((vector float)vec_add(vs16(b70), vs16(b10))); - b40 = ((vector float)vec_sub(vs16(b70), vs16(b10))); - -#define CTF0(n) \ - b##n##1 = ((vector float)vec_unpackl(vs16(b##n##0))); \ - b##n##0 = ((vector float)vec_unpackh(vs16(b##n##0))); \ - b##n##1 = vec_ctf(vs32(b##n##1), 0); \ - b##n##0 = vec_ctf(vs32(b##n##0), 0); - - CTF0(0); - CTF0(4); - - b20 = ((vector float)vec_sub(vs16(x0), vs16(x3))); - b60 = ((vector float)vec_sub(vs16(x1), vs16(x2))); - - CTF0(2); - CTF0(6); - -#undef CTF0 - - x0 = vec_add(b60, b20); - x1 = vec_add(b61, b21); - - cnst = LD_W2; - x0 = vec_madd(cnst, x0, mzero); - x1 = vec_madd(cnst, x1, mzero); - cnst = LD_W1; - b20 = vec_madd(cnst, b20, x0); - b21 = vec_madd(cnst, b21, x1); - cnst = LD_W0; - b60 = vec_madd(cnst, b60, x0); - b61 = vec_madd(cnst, b61, x1); - -#define CTFX(x,b) \ - b##0 = ((vector float)vec_unpackh(vs16(x))); \ - b##1 = ((vector float)vec_unpackl(vs16(x))); \ - b##0 = vec_ctf(vs32(b##0), 0); \ - b##1 = vec_ctf(vs32(b##1), 0); \ - - CTFX(x4, b7); - CTFX(x5, b5); - CTFX(x6, b3); - CTFX(x7, b1); - -#undef CTFX - - - x0 = vec_add(b70, b10); - x1 = vec_add(b50, b30); - x2 = vec_add(b70, b30); - x3 = vec_add(b50, b10); - x8 = vec_add(x2, x3); - cnst = LD_W3; - x8 = vec_madd(cnst, x8, mzero); - - cnst = LD_W8; - x0 = vec_madd(cnst, x0, mzero); - cnst = 
LD_W9; - x1 = vec_madd(cnst, x1, mzero); - cnst = LD_WA; - x2 = vec_madd(cnst, x2, x8); - cnst = LD_WB; - x3 = vec_madd(cnst, x3, x8); - - cnst = LD_W4; - b70 = vec_madd(cnst, b70, x0); - cnst = LD_W5; - b50 = vec_madd(cnst, b50, x1); - cnst = LD_W6; - b30 = vec_madd(cnst, b30, x1); - cnst = LD_W7; - b10 = vec_madd(cnst, b10, x0); - - b70 = vec_add(b70, x2); - b50 = vec_add(b50, x3); - b30 = vec_add(b30, x2); - b10 = vec_add(b10, x3); - - - x0 = vec_add(b71, b11); - x1 = vec_add(b51, b31); - x2 = vec_add(b71, b31); - x3 = vec_add(b51, b11); - x8 = vec_add(x2, x3); - cnst = LD_W3; - x8 = vec_madd(cnst, x8, mzero); - - cnst = LD_W8; - x0 = vec_madd(cnst, x0, mzero); - cnst = LD_W9; - x1 = vec_madd(cnst, x1, mzero); - cnst = LD_WA; - x2 = vec_madd(cnst, x2, x8); - cnst = LD_WB; - x3 = vec_madd(cnst, x3, x8); - - cnst = LD_W4; - b71 = vec_madd(cnst, b71, x0); - cnst = LD_W5; - b51 = vec_madd(cnst, b51, x1); - cnst = LD_W6; - b31 = vec_madd(cnst, b31, x1); - cnst = LD_W7; - b11 = vec_madd(cnst, b11, x0); - - b71 = vec_add(b71, x2); - b51 = vec_add(b51, x3); - b31 = vec_add(b31, x2); - b11 = vec_add(b11, x3); - /* }}} */ -#else - /* convert to float {{{ */ -#define CTF(n) \ - vs32(b##n##1) = vec_unpackl(vs16(b##n##0)); \ - vs32(b##n##0) = vec_unpackh(vs16(b##n##0)); \ - b##n##1 = vec_ctf(vs32(b##n##1), 0); \ - b##n##0 = vec_ctf(vs32(b##n##0), 0); \ - - CTF(0); - CTF(1); - CTF(2); - CTF(3); - CTF(4); - CTF(5); - CTF(6); - CTF(7); - -#undef CTF - /* }}} */ - - FDCTROW(b00, b10, b20, b30, b40, b50, b60, b70); - FDCTROW(b01, b11, b21, b31, b41, b51, b61, b71); -#endif - - - /* 8x8 matrix transpose (vector float[8][2]) {{{ */ - x0 = vec_mergel(b00, b20); - x1 = vec_mergeh(b00, b20); - x2 = vec_mergel(b10, b30); - x3 = vec_mergeh(b10, b30); - - b00 = vec_mergeh(x1, x3); - b10 = vec_mergel(x1, x3); - b20 = vec_mergeh(x0, x2); - b30 = vec_mergel(x0, x2); - - x4 = vec_mergel(b41, b61); - x5 = vec_mergeh(b41, b61); - x6 = vec_mergel(b51, b71); - x7 = vec_mergeh(b51, b71); - - b41 = vec_mergeh(x5, x7); - b51 = vec_mergel(x5, x7); - b61 = vec_mergeh(x4, x6); - b71 = vec_mergel(x4, x6); - - x0 = vec_mergel(b01, b21); - x1 = vec_mergeh(b01, b21); - x2 = vec_mergel(b11, b31); - x3 = vec_mergeh(b11, b31); - - x4 = vec_mergel(b40, b60); - x5 = vec_mergeh(b40, b60); - x6 = vec_mergel(b50, b70); - x7 = vec_mergeh(b50, b70); - - b40 = vec_mergeh(x1, x3); - b50 = vec_mergel(x1, x3); - b60 = vec_mergeh(x0, x2); - b70 = vec_mergel(x0, x2); - - b01 = vec_mergeh(x5, x7); - b11 = vec_mergel(x5, x7); - b21 = vec_mergeh(x4, x6); - b31 = vec_mergel(x4, x6); - /* }}} */ - - - FDCTCOL(b00, b10, b20, b30, b40, b50, b60, b70); - FDCTCOL(b01, b11, b21, b31, b41, b51, b61, b71); - - - /* round, convert back to short {{{ */ -#define CTS(n) \ - b##n##0 = vec_round(b##n##0); \ - b##n##1 = vec_round(b##n##1); \ - b##n##0 = ((vector float)vec_cts(b##n##0, 0)); \ - b##n##1 = ((vector float)vec_cts(b##n##1, 0)); \ - b##n##0 = ((vector float)vec_pack(vs32(b##n##0), vs32(b##n##1))); \ - vec_st(vs16(b##n##0), 0, bp); - - bp = (vector signed short*)block; - CTS(0); bp++; - CTS(1); bp++; - CTS(2); bp++; - CTS(3); bp++; - CTS(4); bp++; - CTS(5); bp++; - CTS(6); bp++; - CTS(7); - -#undef CTS - /* }}} */ - -POWERPC_PERF_STOP_COUNT(altivec_fdct, 1); -} - -/* vim:set foldmethod=marker foldlevel=0: */ diff --git a/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c b/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c deleted file mode 100644 index e0b77807f..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * 
FFT/IFFT transforms - * AltiVec-enabled - * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> - * Based on code Copyright (c) 2002 Fabrice Bellard. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ -#include "dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_ppc.h" -#include "util_altivec.h" -/* - those three macros are from libavcodec/fft.c - and are required for the reference C code -*/ -/* butter fly op */ -#define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \ -{\ - FFTSample ax, ay, bx, by;\ - bx=pre1;\ - by=pim1;\ - ax=qre1;\ - ay=qim1;\ - pre = (bx + ax);\ - pim = (by + ay);\ - qre = (bx - ax);\ - qim = (by - ay);\ -} -#define MUL16(a,b) ((a) * (b)) -#define CMUL(pre, pim, are, aim, bre, bim) \ -{\ - pre = (MUL16(are, bre) - MUL16(aim, bim));\ - pim = (MUL16(are, bim) + MUL16(bre, aim));\ -} - - -/** - * Do a complex FFT with the parameters defined in ff_fft_init(). The - * input data must be permuted before with s->revtab table. No - * 1.0/sqrt(n) normalization is done. - * AltiVec-enabled - * This code assumes that the 'z' pointer is 16 bytes-aligned - * It also assumes all FFTComplex are 8 bytes-aligned pair of float - * The code is exactly the same as the SSE version, except - * that successive MUL + ADD/SUB have been merged into - * fused multiply-add ('vec_madd' in altivec) - */ -void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z) -{ -POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6); - register const vector float vczero = (const vector float)vec_splat_u32(0.); - - int ln = s->nbits; - int j, np, np2; - int nblocks, nloops; - register FFTComplex *p, *q; - FFTComplex *cptr, *cptr1; - int k; - -POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6); - - np = 1 << ln; - - { - vector float *r, a, b, a1, c1, c2; - - r = (vector float *)&z[0]; - - c1 = vcii(p,p,n,n); - - if (s->inverse) - { - c2 = vcii(p,p,n,p); - } - else - { - c2 = vcii(p,p,p,n); - } - - j = (np >> 2); - do { - a = vec_ld(0, r); - a1 = vec_ld(sizeof(vector float), r); - - b = vec_perm(a,a,vcprmle(1,0,3,2)); - a = vec_madd(a,c1,b); - /* do the pass 0 butterfly */ - - b = vec_perm(a1,a1,vcprmle(1,0,3,2)); - b = vec_madd(a1,c1,b); - /* do the pass 0 butterfly */ - - /* multiply third by -i */ - b = vec_perm(b,b,vcprmle(2,3,1,0)); - - /* do the pass 1 butterfly */ - vec_st(vec_madd(b,c2,a), 0, r); - vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r); - - r += 2; - } while (--j != 0); - } - /* pass 2 .. 
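The BF and CMUL macros quoted above carry the whole arithmetic content of the transform: BF is the radix-2 butterfly and CMUL a complex multiply by a twiddle factor. As a sanity check, here is a 4-point forward FFT built only from those two macros, with the input already in the bit-reversed order the routine expects. This is a scalar illustration of the algorithm, not the AltiVec path.

    #include <stdio.h>

    typedef float FFTSample;
    typedef struct { FFTSample re, im; } FFTComplex;

    /* Butterfly and complex multiply, as in the reference-C macros above. */
    #define BF(pre, pim, qre, qim, pre1, pim1, qre1, qim1) \
    {\
        FFTSample ax, ay, bx, by;\
        bx = pre1; by = pim1; ax = qre1; ay = qim1;\
        pre = (bx + ax); pim = (by + ay);\
        qre = (bx - ax); qim = (by - ay);\
    }
    #define MUL16(a, b) ((a) * (b))
    #define CMUL(pre, pim, are, aim, bre, bim) \
    {\
        pre = (MUL16(are, bre) - MUL16(aim, bim));\
        pim = (MUL16(are, bim) + MUL16(bre, aim));\
    }

    /* 4-point forward FFT: input stored bit-reversed as x0, x2, x1, x3. */
    static void fft4(FFTComplex *z)
    {
        FFTComplex t1, t2, t3, t4, t4w;

        /* pass 0: two 2-point butterflies */
        BF(t1.re, t1.im, t2.re, t2.im, z[0].re, z[0].im, z[1].re, z[1].im);
        BF(t3.re, t3.im, t4.re, t4.im, z[2].re, z[2].im, z[3].re, z[3].im);

        /* pass 1: twiddle the odd leg by -i (forward transform) */
        CMUL(t4w.re, t4w.im, t4.re, t4.im, 0.0f, -1.0f);

        BF(z[0].re, z[0].im, z[2].re, z[2].im, t1.re, t1.im, t3.re, t3.im);
        BF(z[1].re, z[1].im, z[3].re, z[3].im, t2.re, t2.im, t4w.re, t4w.im);
    }

    int main(void)
    {
        /* time-domain samples 1,2,3,4 stored in bit-reversed order 0,2,1,3 */
        FFTComplex z[4] = { {1,0}, {3,0}, {2,0}, {4,0} };
        fft4(z);
        for (int i = 0; i < 4; i++)
            printf("X[%d] = %6.2f %+6.2fi\n", i, z[i].re, z[i].im);
        /* expected: 10, -2+2i, -2, -2-2i */
        return 0;
    }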
ln-1 */ - - nblocks = np >> 3; - nloops = 1 << 2; - np2 = np >> 1; - - cptr1 = s->exptab1; - do { - p = z; - q = z + nloops; - j = nblocks; - do { - cptr = cptr1; - k = nloops >> 1; - do { - vector float a,b,c,t1; - - a = vec_ld(0, (float*)p); - b = vec_ld(0, (float*)q); - - /* complex mul */ - c = vec_ld(0, (float*)cptr); - /* cre*re cim*re */ - t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero); - c = vec_ld(sizeof(vector float), (float*)cptr); - /* -cim*im cre*im */ - b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1); - - /* butterfly */ - vec_st(vec_add(a,b), 0, (float*)p); - vec_st(vec_sub(a,b), 0, (float*)q); - - p += 2; - q += 2; - cptr += 4; - } while (--k); - - p += nloops; - q += nloops; - } while (--j); - cptr1 += nloops * 2; - nblocks = nblocks >> 1; - nloops = nloops << 1; - } while (nblocks != 0); - -POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6); -} diff --git a/contrib/ffmpeg/libavcodec/ppc/float_altivec.c b/contrib/ffmpeg/libavcodec/ppc/float_altivec.c deleted file mode 100644 index 750e6d7f9..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/float_altivec.c +++ /dev/null @@ -1,193 +0,0 @@ -/* - * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_altivec.h" - -static void vector_fmul_altivec(float *dst, const float *src, int len) -{ - int i; - vector float d0, d1, s, zero = (vector float)vec_splat_u32(0); - for(i=0; i<len-7; i+=8) { - d0 = vec_ld(0, dst+i); - s = vec_ld(0, src+i); - d1 = vec_ld(16, dst+i); - d0 = vec_madd(d0, s, zero); - d1 = vec_madd(d1, vec_ld(16,src+i), zero); - vec_st(d0, 0, dst+i); - vec_st(d1, 16, dst+i); - } -} - -static void vector_fmul_reverse_altivec(float *dst, const float *src0, - const float *src1, int len) -{ - int i; - vector float d, s0, s1, h0, l0, - s2, s3, zero = (vector float)vec_splat_u32(0); - src1 += len-4; - for(i=0; i<len-7; i+=8) { - s1 = vec_ld(0, src1-i); // [a,b,c,d] - s0 = vec_ld(0, src0+i); - l0 = vec_mergel(s1, s1); // [c,c,d,d] - s3 = vec_ld(-16, src1-i); - h0 = vec_mergeh(s1, s1); // [a,a,b,b] - s2 = vec_ld(16, src0+i); - s1 = vec_mergeh(vec_mergel(l0,h0), // [d,b,d,b] - vec_mergeh(l0,h0)); // [c,a,c,a] - // [d,c,b,a] - l0 = vec_mergel(s3, s3); - d = vec_madd(s0, s1, zero); - h0 = vec_mergeh(s3, s3); - vec_st(d, 0, dst+i); - s3 = vec_mergeh(vec_mergel(l0,h0), - vec_mergeh(l0,h0)); - d = vec_madd(s2, s3, zero); - vec_st(d, 16, dst+i); - } -} - -static void vector_fmul_add_add_altivec(float *dst, const float *src0, - const float *src1, const float *src2, - int src3, int len, int step) -{ - int i; - vector float d, s0, s1, s2, t0, t1, edges; - vector unsigned char align = vec_lvsr(0,dst), - mask = vec_lvsl(0, dst); - -#if 0 //FIXME: there is still something wrong - if (step == 
2) { - int y; - vector float d0, d1, s3, t2; - vector unsigned int sel = - vec_mergeh(vec_splat_u32(-1), vec_splat_u32(0)); - t1 = vec_ld(16, dst); - for (i=0,y=0; i<len-3; i+=4,y+=8) { - - s0 = vec_ld(0,src0+i); - s1 = vec_ld(0,src1+i); - s2 = vec_ld(0,src2+i); - -// t0 = vec_ld(0, dst+y); //[x x x|a] -// t1 = vec_ld(16, dst+y); //[b c d|e] - t2 = vec_ld(31, dst+y); //[f g h|x] - - d = vec_madd(s0,s1,s2); // [A B C D] - - // [A A B B] - - // [C C D D] - - d0 = vec_perm(t0, t1, mask); // [a b c d] - - d0 = vec_sel(vec_mergeh(d, d), d0, sel); // [A b B d] - - edges = vec_perm(t1, t0, mask); - - t0 = vec_perm(edges, d0, align); // [x x x|A] - - t1 = vec_perm(d0, edges, align); // [b B d|e] - - vec_stl(t0, 0, dst+y); - - d1 = vec_perm(t1, t2, mask); // [e f g h] - - d1 = vec_sel(vec_mergel(d, d), d1, sel); // [C f D h] - - edges = vec_perm(t2, t1, mask); - - t1 = vec_perm(edges, d1, align); // [b B d|C] - - t2 = vec_perm(d1, edges, align); // [f D h|x] - - vec_stl(t1, 16, dst+y); - - t0 = t1; - - vec_stl(t2, 31, dst+y); - - t1 = t2; - } - } else - #endif - if (step == 1 && src3 == 0) - for (i=0; i<len-3; i+=4) { - t0 = vec_ld(0, dst+i); - t1 = vec_ld(15, dst+i); - s0 = vec_ld(0, src0+i); - s1 = vec_ld(0, src1+i); - s2 = vec_ld(0, src2+i); - edges = vec_perm(t1 ,t0, mask); - d = vec_madd(s0,s1,s2); - t1 = vec_perm(d, edges, align); - t0 = vec_perm(edges, d, align); - vec_st(t1, 15, dst+i); - vec_st(t0, 0, dst+i); - } - else - ff_vector_fmul_add_add_c(dst, src0, src1, src2, src3, len, step); -} - -void float_to_int16_altivec(int16_t *dst, const float *src, int len) -{ - int i; - vector float s0, s1; - vector signed int t0, t1; - vector signed short d0, d1, d; - vector unsigned char align; - if(((long)dst)&15) //FIXME - for(i=0; i<len-7; i+=8) { - s0 = vec_ld(0, src+i); - s1 = vec_ld(16, src+i); - t0 = vec_cts(s0, 0); - d0 = vec_ld(0, dst+i); - t1 = vec_cts(s1, 0); - d1 = vec_ld(15, dst+i); - d = vec_packs(t0,t1); - d1 = vec_perm(d1, d0, vec_lvsl(0,dst+i)); - align = vec_lvsr(0, dst+i); - d0 = vec_perm(d1, d, align); - d1 = vec_perm(d, d1, align); - vec_st(d0, 0, dst+i); - vec_st(d1,15, dst+i); - } - else - for(i=0; i<len-7; i+=8) { - s0 = vec_ld(0, src+i); - s1 = vec_ld(16, src+i); - t0 = vec_cts(s0, 0); - t1 = vec_cts(s1, 0); - d = vec_packs(t0,t1); - vec_st(d, 0, dst+i); - } -} - -void float_init_altivec(DSPContext* c, AVCodecContext *avctx) -{ - c->vector_fmul = vector_fmul_altivec; - c->vector_fmul_reverse = vector_fmul_reverse_altivec; - c->vector_fmul_add_add = vector_fmul_add_add_altivec; - if(!(avctx->flags & CODEC_FLAG_BITEXACT)) - c->float_to_int16 = float_to_int16_altivec; -} diff --git a/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h b/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h deleted file mode 100644 index b8a908a61..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * gcc fixes for altivec. - * Used to workaround broken gcc (FSF gcc-3 pre gcc-3.3) - * and to stay somewhat compatible with Darwin. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
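For reference, scalar models of what vector_fmul_altivec() and vector_fmul_reverse_altivec() above compute, with len a multiple of 8 as the vector loops assume. This is an illustrative sketch, not code from the deleted file.

    #include <stdio.h>

    /* dst[i] *= src[i] */
    static void vector_fmul_c(float *dst, const float *src, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] *= src[i];
    }

    /* dst[i] = src0[i] * src1[len - 1 - i]  (second operand read backwards) */
    static void vector_fmul_reverse_c(float *dst, const float *src0,
                                      const float *src1, int len)
    {
        for (int i = 0; i < len; i++)
            dst[i] = src0[i] * src1[len - 1 - i];
    }

    int main(void)
    {
        float a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        float b[8] = {8, 7, 6, 5, 4, 3, 2, 1};
        float d[8];

        vector_fmul_reverse_c(d, a, b, 8);
        for (int i = 0; i < 8; i++)
            printf("%g ", d[i]);          /* 1 4 9 ... 64 */
        printf("\n");

        vector_fmul_c(d, a, 8);
        for (int i = 0; i < 8; i++)
            printf("%g ", d[i]);          /* 1 8 27 ... 512 */
        printf("\n");
        return 0;
    }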
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef FFMPEG_GCC_FIXES_H -#define FFMPEG_GCC_FIXES_H - -#include "config.h" - -#ifdef HAVE_ALTIVEC_H -#include <altivec.h> -#endif - -#if (__GNUC__ < 4) -# define REG_v(a) -#else -# define REG_v(a) asm ( #a ) -#endif - -#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3) - -/* This code was provided to me by Bartosch Pixa - * as a separate header file (broken_mergel.h). - * thanks to lu_zero for the workaround. - * - * See this mail for more information: - * http://gcc.gnu.org/ml/gcc/2003-04/msg00967.html - */ - -static inline vector signed char ff_vmrglb (vector signed char const A, - vector signed char const B) -{ - static const vector unsigned char lowbyte = { - 0x08, 0x18, 0x09, 0x19, 0x0a, 0x1a, 0x0b, 0x1b, - 0x0c, 0x1c, 0x0d, 0x1d, 0x0e, 0x1e, 0x0f, 0x1f - }; - return vec_perm (A, B, lowbyte); -} - -static inline vector signed short ff_vmrglh (vector signed short const A, - vector signed short const B) -{ - static const vector unsigned char lowhalf = { - 0x08, 0x09, 0x18, 0x19, 0x0a, 0x0b, 0x1a, 0x1b, - 0x0c, 0x0d, 0x1c, 0x1d, 0x0e, 0x0f, 0x1e, 0x1f - }; - return vec_perm (A, B, lowhalf); -} - -static inline vector signed int ff_vmrglw (vector signed int const A, - vector signed int const B) -{ - static const vector unsigned char lowword = { - 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, - 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f - }; - return vec_perm (A, B, lowword); -} -/*#define ff_vmrglb ff_vmrglb -#define ff_vmrglh ff_vmrglh -#define ff_vmrglw ff_vmrglw -*/ -#undef vec_mergel - -#define vec_mergel(a1, a2) \ -__ch (__bin_args_eq (vector signed char, (a1), vector signed char, (a2)), \ - ((vector signed char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \ -__ch (__bin_args_eq (vector unsigned char, (a1), vector unsigned char, (a2)), \ - ((vector unsigned char) ff_vmrglb ((vector signed char) (a1), (vector signed char) (a2))), \ -__ch (__bin_args_eq (vector signed short, (a1), vector signed short, (a2)), \ - ((vector signed short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \ -__ch (__bin_args_eq (vector unsigned short, (a1), vector unsigned short, (a2)), \ - ((vector unsigned short) ff_vmrglh ((vector signed short) (a1), (vector signed short) (a2))), \ -__ch (__bin_args_eq (vector float, (a1), vector float, (a2)), \ - ((vector float) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ -__ch (__bin_args_eq (vector signed int, (a1), vector signed int, (a2)), \ - ((vector signed int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ -__ch (__bin_args_eq (vector unsigned int, (a1), vector unsigned int, (a2)), \ - ((vector unsigned int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ - __altivec_link_error_invalid_argument ()))))))) - -#endif /* (__GNUC__ == 3 && __GNUC_MINOR__ < 3) */ - -#endif /* FFMPEG_GCC_FIXES_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c b/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c deleted file mode 100644 index 8151410d4..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c +++ /dev/null @@ -1,147 +0,0 @@ -/* - * GMC (Global Motion Compensation) - * AltiVec-enabled - * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. 
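The ff_vmrglb() workaround above emulates vec_mergel for bytes with an explicit permute table: indices 0x08..0x0f select from the first operand and 0x18..0x1f from the second, interleaving the two low halves. A scalar illustration of that permute, using the same "lowbyte" pattern (vec_perm semantics modelled by hand, for illustration only):

    #include <stdio.h>

    int main(void)
    {
        const unsigned char lowbyte[16] = {
            0x08, 0x18, 0x09, 0x19, 0x0a, 0x1a, 0x0b, 0x1b,
            0x0c, 0x1c, 0x0d, 0x1d, 0x0e, 0x1e, 0x0f, 0x1f
        };
        unsigned char a[16], b[16], out[16];
        for (int i = 0; i < 16; i++) { a[i] = (unsigned char)i; b[i] = (unsigned char)(0x10 + i); }

        /* vec_perm(a, b, lowbyte): index < 16 picks from a, otherwise from b */
        for (int i = 0; i < 16; i++)
            out[i] = lowbyte[i] < 16 ? a[lowbyte[i]] : b[lowbyte[i] - 16];

        for (int i = 0; i < 16; i++)
            printf("%02x ", out[i]);    /* 08 18 09 19 ... 0f 1f */
        printf("\n");
        return 0;
    }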
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_ppc.h" -#include "util_altivec.h" - -/* - altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8, - to preserve proper dst alignment. -*/ -#define GMC1_PERF_COND (h==8) -void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) -{ -POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); - const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) = - {rounder, rounder, rounder, rounder, - rounder, rounder, rounder, rounder}; - const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) = - { - (16-x16)*(16-y16), /* A */ - ( x16)*(16-y16), /* B */ - (16-x16)*( y16), /* C */ - ( x16)*( y16), /* D */ - 0, 0, 0, 0 /* padding */ - }; - register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); - register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8); - register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD; - register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD; - int i; - unsigned long dst_odd = (unsigned long)dst & 0x0000000F; - unsigned long src_really_odd = (unsigned long)src & 0x0000000F; - - -POWERPC_PERF_START_COUNT(altivec_gmc1_num, GMC1_PERF_COND); - - tempA = vec_ld(0, (unsigned short*)ABCD); - Av = vec_splat(tempA, 0); - Bv = vec_splat(tempA, 1); - Cv = vec_splat(tempA, 2); - Dv = vec_splat(tempA, 3); - - rounderV = vec_ld(0, (unsigned short*)rounder_a); - - // we'll be able to pick-up our 9 char elements - // at src from those 32 bytes - // we load the first batch here, as inside the loop - // we can re-use 'src+stride' from one iteration - // as the 'src' of the next. - src_0 = vec_ld(0, src); - src_1 = vec_ld(16, src); - srcvA = vec_perm(src_0, src_1, vec_lvsl(0, src)); - - if (src_really_odd != 0x0000000F) - { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. - srcvB = vec_perm(src_0, src_1, vec_lvsl(1, src)); - } - else - { - srcvB = src_1; - } - srcvA = vec_mergeh(vczero, srcvA); - srcvB = vec_mergeh(vczero, srcvB); - - for(i=0; i<h; i++) - { - dst_odd = (unsigned long)dst & 0x0000000F; - src_really_odd = (((unsigned long)src) + stride) & 0x0000000F; - - dstv = vec_ld(0, dst); - - // we we'll be able to pick-up our 9 char elements - // at src + stride from those 32 bytes - // then reuse the resulting 2 vectors srvcC and srcvD - // as the next srcvA and srcvB - src_0 = vec_ld(stride + 0, src); - src_1 = vec_ld(stride + 16, src); - srcvC = vec_perm(src_0, src_1, vec_lvsl(stride + 0, src)); - - if (src_really_odd != 0x0000000F) - { // if src & 0xF == 0xF, then (src+1) is properly aligned on the second vector. 
- srcvD = vec_perm(src_0, src_1, vec_lvsl(stride + 1, src)); - } - else - { - srcvD = src_1; - } - - srcvC = vec_mergeh(vczero, srcvC); - srcvD = vec_mergeh(vczero, srcvD); - - - // OK, now we (finally) do the math :-) - // those four instructions replaces 32 int muls & 32 int adds. - // isn't AltiVec nice ? - tempA = vec_mladd((vector unsigned short)srcvA, Av, rounderV); - tempB = vec_mladd((vector unsigned short)srcvB, Bv, tempA); - tempC = vec_mladd((vector unsigned short)srcvC, Cv, tempB); - tempD = vec_mladd((vector unsigned short)srcvD, Dv, tempC); - - srcvA = srcvC; - srcvB = srcvD; - - tempD = vec_sr(tempD, vcsr8); - - dstv2 = vec_pack(tempD, (vector unsigned short)vczero); - - if (dst_odd) - { - dstv2 = vec_perm(dstv, dstv2, vcprm(0,1,s0,s1)); - } - else - { - dstv2 = vec_perm(dstv, dstv2, vcprm(s0,s1,2,3)); - } - - vec_st(dstv2, 0, dst); - - dst += stride; - src += stride; - } - -POWERPC_PERF_STOP_COUNT(altivec_gmc1_num, GMC1_PERF_COND); -} diff --git a/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c b/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c deleted file mode 100644 index c716b1e33..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c +++ /dev/null @@ -1,904 +0,0 @@ -/* - * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
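gmc1_altivec() above is a 2-D bilinear interpolation: the four weights A..D are built from the 4-bit fractional offsets x16/y16, sum to 256, and the weighted sum plus the rounder is scaled back with >> 8 (the vcsr8 shift). A scalar model of that computation, 8 pixels per row as in the vector code; purely for illustration, not the deleted routine itself.

    #include <stdio.h>
    #include <stdint.h>

    static void gmc1_c_model(uint8_t *dst, const uint8_t *src, int stride, int h,
                             int x16, int y16, int rounder)
    {
        const int A = (16 - x16) * (16 - y16);
        const int B = (     x16) * (16 - y16);
        const int C = (16 - x16) * (     y16);
        const int D = (     x16) * (     y16);   /* A + B + C + D == 256 */

        for (int i = 0; i < h; i++) {
            for (int x = 0; x < 8; x++)
                dst[x] = (A * src[x]          + B * src[x + 1] +
                          C * src[x + stride] + D * src[x + stride + 1] +
                          rounder) >> 8;
            dst += stride;
            src += stride;
        }
    }

    int main(void)
    {
        uint8_t src[16 * 9], dst[16 * 8];
        for (int i = 0; i < 16 * 9; i++)
            src[i] = (uint8_t)i;
        gmc1_c_model(dst, src, 16, 8, 8, 8, 128);   /* half-pel in both directions */
        printf("dst[0..3] = %d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);
        return 0;
    }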
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_ppc.h" -#include "dsputil_altivec.h" -#include "util_altivec.h" -#include "types_altivec.h" - -#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s -#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) - -#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC -#define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec -#define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num -#define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec -#define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num -#define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec -#define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num -#define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec -#define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num -#include "h264_template_altivec.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_h264_chroma_mc8_altivec -#undef PREFIX_h264_chroma_mc8_num -#undef PREFIX_h264_qpel16_h_lowpass_altivec -#undef PREFIX_h264_qpel16_h_lowpass_num -#undef PREFIX_h264_qpel16_v_lowpass_altivec -#undef PREFIX_h264_qpel16_v_lowpass_num -#undef PREFIX_h264_qpel16_hv_lowpass_altivec -#undef PREFIX_h264_qpel16_hv_lowpass_num - -#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC -#define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec -#define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num -#define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec -#define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num -#define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec -#define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num -#define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec -#define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num -#include "h264_template_altivec.c" -#undef OP_U8_ALTIVEC -#undef PREFIX_h264_chroma_mc8_altivec -#undef PREFIX_h264_chroma_mc8_num -#undef PREFIX_h264_qpel16_h_lowpass_altivec -#undef PREFIX_h264_qpel16_h_lowpass_num -#undef PREFIX_h264_qpel16_v_lowpass_altivec -#undef PREFIX_h264_qpel16_v_lowpass_num -#undef PREFIX_h264_qpel16_hv_lowpass_altivec -#undef PREFIX_h264_qpel16_hv_lowpass_num - -#define H264_MC(OPNAME, SIZE, CODETYPE) \ -static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \ - DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## 
CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\ - OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## 
SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\ - put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ -}\ -\ -static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\ - DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\ - DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\ - put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\ - put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\ - OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\ -}\ - -/* this code assume that stride % 16 == 0 */ -void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { - DECLARE_ALIGNED_16(signed int, ABCD[4]) = - {((8 - x) * (8 - y)), - ((x) * (8 - y)), - ((8 - x) * (y)), - ((x) * (y))}; - register int i; - vec_u8_t fperm; - const vec_s32_t vABCD = vec_ld(0, ABCD); - const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); - const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); - const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); - const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); - LOAD_ZERO; - const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); - const vec_u16_t v6us = vec_splat_u16(6); - register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; - register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 
1 : 0; - - vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; - vec_u8_t vsrc0uc, vsrc1uc; - vec_s16_t vsrc0ssH, vsrc1ssH; - vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; - vec_s16_t vsrc2ssH, vsrc3ssH, psum; - vec_u8_t vdst, ppsum, fsum; - - if (((unsigned long)dst) % 16 == 0) { - fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, - 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F); - } else { - fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, - 0x18, 0x19, 0x1A, 0x1B, - 0x1C, 0x1D, 0x1E, 0x1F); - } - - vsrcAuc = vec_ld(0, src); - - if (loadSecond) - vsrcBuc = vec_ld(16, src); - vsrcperm0 = vec_lvsl(0, src); - vsrcperm1 = vec_lvsl(1, src); - - vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcBuc; - else - vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); - - vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc); - vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc); - - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - - - vsrcCuc = vec_ld(stride + 0, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc); - vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc); - - psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); - psum = vec_mladd(vB, vsrc1ssH, psum); - psum = vec_mladd(vC, vsrc2ssH, psum); - psum = vec_mladd(vD, vsrc3ssH, psum); - psum = vec_add(v28ss, psum); - psum = vec_sra(psum, v6us); - - vdst = vec_ld(0, dst); - ppsum = (vec_u8_t)vec_packsu(psum, psum); - fsum = vec_perm(vdst, ppsum, fperm); - - vec_st(fsum, 0, dst); - - vsrc0ssH = vsrc2ssH; - vsrc1ssH = vsrc3ssH; - - dst += stride; - src += stride; - } - } else { - vec_u8_t vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 16, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc3uc = vsrcDuc; - else - vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc); - vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc); - - psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); - psum = vec_mladd(vB, vsrc1ssH, psum); - psum = vec_mladd(vC, vsrc2ssH, psum); - psum = vec_mladd(vD, vsrc3ssH, psum); - psum = vec_add(v28ss, psum); - psum = vec_sr(psum, v6us); - - vdst = vec_ld(0, dst); - ppsum = (vec_u8_t)vec_pack(psum, psum); - fsum = vec_perm(vdst, ppsum, fperm); - - vec_st(fsum, 0, dst); - - vsrc0ssH = vsrc2ssH; - vsrc1ssH = vsrc3ssH; - - dst += stride; - src += stride; - } - } -} - -static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, - const uint8_t * src2, int dst_stride, - int src_stride1, int h) -{ - int i; - vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align; - - mask_ = vec_lvsl(0, src2); - - for (i = 0; i < h; i++) { - - tmp1 = vec_ld(i * src_stride1, src1); - mask = vec_lvsl(i * src_stride1, src1); - tmp2 = vec_ld(i * src_stride1 + 15, src1); - - a = vec_perm(tmp1, tmp2, mask); - - tmp1 = vec_ld(i * 16, src2); - tmp2 = vec_ld(i * 16 + 15, src2); - - b = vec_perm(tmp1, tmp2, mask_); - - tmp1 = vec_ld(0, dst); - mask = vec_lvsl(0, dst); - tmp2 = vec_ld(15, dst); - - d = vec_avg(a, b); - - edges = vec_perm(tmp2, tmp1, mask); - - align = vec_lvsr(0, dst); - - tmp2 = vec_perm(d, edges, align); - tmp1 = vec_perm(edges, d, align); - - vec_st(tmp2, 15, dst); - vec_st(tmp1, 0 , dst); - - dst += dst_stride; - } -} - -static 
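put_no_rnd_h264_chroma_mc8_altivec() above performs the analogous 2-D bilinear filter for chroma, with 3-bit fractions x/y, a bias of 28 (the v28ss constant, 32 - 4) and a final shift by 6. The sketch below is a scalar model of that arithmetic, ignoring all the unaligned-load handling; it is illustrative only, not the deleted code.

    #include <stdio.h>
    #include <stdint.h>

    static void chroma_mc8_no_rnd_model(uint8_t *dst, const uint8_t *src,
                                        int stride, int h, int x, int y)
    {
        const int A = (8 - x) * (8 - y);
        const int B = (    x) * (8 - y);
        const int C = (8 - x) * (    y);
        const int D = (    x) * (    y);   /* A + B + C + D == 64 */

        for (int i = 0; i < h; i++) {
            for (int j = 0; j < 8; j++)
                dst[j] = (A * src[j]          + B * src[j + 1] +
                          C * src[j + stride] + D * src[j + stride + 1] +
                          28) >> 6;
            dst += stride;
            src += stride;
        }
    }

    int main(void)
    {
        uint8_t src[16 * 9], dst[16 * 8] = {0};
        for (int i = 0; i < 16 * 9; i++)
            src[i] = (uint8_t)(i * 3);
        chroma_mc8_no_rnd_model(dst, src, 16, 8, 4, 4);
        printf("dst[0..3] = %d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);
        return 0;
    }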
inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, - const uint8_t * src2, int dst_stride, - int src_stride1, int h) -{ - int i; - vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align; - - mask_ = vec_lvsl(0, src2); - - for (i = 0; i < h; i++) { - - tmp1 = vec_ld(i * src_stride1, src1); - mask = vec_lvsl(i * src_stride1, src1); - tmp2 = vec_ld(i * src_stride1 + 15, src1); - - a = vec_perm(tmp1, tmp2, mask); - - tmp1 = vec_ld(i * 16, src2); - tmp2 = vec_ld(i * 16 + 15, src2); - - b = vec_perm(tmp1, tmp2, mask_); - - tmp1 = vec_ld(0, dst); - mask = vec_lvsl(0, dst); - tmp2 = vec_ld(15, dst); - - d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b)); - - edges = vec_perm(tmp2, tmp1, mask); - - align = vec_lvsr(0, dst); - - tmp2 = vec_perm(d, edges, align); - tmp1 = vec_perm(edges, d, align); - - vec_st(tmp2, 15, dst); - vec_st(tmp1, 0 , dst); - - dst += dst_stride; - } -} - -/* Implemented but could be faster -#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h) -#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h) - */ - - H264_MC(put_, 16, altivec) - H264_MC(avg_, 16, altivec) - - -/**************************************************************************** - * IDCT transform: - ****************************************************************************/ - -#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \ - /* 1st stage */ \ - vz0 = vec_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \ - vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \ - vz2 = vec_sra(vb1,vec_splat_u16(1)); \ - vz2 = vec_sub(vz2,vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \ - vz3 = vec_sra(vb3,vec_splat_u16(1)); \ - vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \ - /* 2nd stage: output */ \ - va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \ - va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \ - va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \ - va3 = vec_sub(vz0,vz3) /* x[3] = temp[0] - temp[3] */ - -#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \ - b0 = vec_mergeh( a0, a0 ); \ - b1 = vec_mergeh( a1, a0 ); \ - b2 = vec_mergeh( a2, a0 ); \ - b3 = vec_mergeh( a3, a0 ); \ - a0 = vec_mergeh( b0, b2 ); \ - a1 = vec_mergel( b0, b2 ); \ - a2 = vec_mergeh( b1, b3 ); \ - a3 = vec_mergel( b1, b3 ); \ - b0 = vec_mergeh( a0, a2 ); \ - b1 = vec_mergel( a0, a2 ); \ - b2 = vec_mergeh( a1, a3 ); \ - b3 = vec_mergel( a1, a3 ) - -#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \ - vdst_orig = vec_ld(0, dst); \ - vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \ - vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst); \ - va = vec_add(va, vdst_ss); \ - va_u8 = vec_packsu(va, zero_s16v); \ - va_u32 = vec_splat((vec_u32_t)va_u8, 0); \ - vec_ste(va_u32, element, (uint32_t*)dst); - -static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride) -{ - vec_s16_t va0, va1, va2, va3; - vec_s16_t vz0, vz1, vz2, vz3; - vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3; - vec_u8_t va_u8; - vec_u32_t va_u32; - vec_s16_t vdst_ss; - const vec_u16_t v6us = vec_splat_u16(6); - vec_u8_t vdst, vdst_orig; - vec_u8_t vdst_mask = vec_lvsl(0, dst); - int element = ((unsigned long)dst & 0xf) >> 2; - LOAD_ZERO; - - block[0] += 32; /* add 32 as a DC-level for rounding */ - - vtmp0 = vec_ld(0,block); - vtmp1 = vec_sld(vtmp0, vtmp0, 8); - vtmp2 = vec_ld(16,block); - vtmp3 = vec_sld(vtmp2, vtmp2, 8); - - VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); - VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3); - 
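The VEC_1D_DCT macro above is the usual H.264 4x4 inverse-transform butterfly, applied to whole rows at once; per element it amounts to the sketch below (illustrative names). ff_h264_idct_add_altivec applies it twice with a transpose in between; the +32 added to block[0] beforehand provides the rounding for the final >> 6 before the result is added to dst.

#include <stdint.h>

/* One 1-D pass of the H.264 4x4 inverse transform (cf. the macro comments). */
static void h264_idct4_1d_sketch(const int16_t y[4], int16_t x[4])
{
    int t0 = y[0] + y[2];
    int t1 = y[0] - y[2];
    int t2 = (y[1] >> 1) - y[3];
    int t3 = y[1] + (y[3] >> 1);

    x[0] = t0 + t3;
    x[1] = t1 + t2;
    x[2] = t1 - t2;
    x[3] = t0 - t3;
}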
VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); - - va0 = vec_sra(va0,v6us); - va1 = vec_sra(va1,v6us); - va2 = vec_sra(va2,v6us); - va3 = vec_sra(va3,v6us); - - VEC_LOAD_U8_ADD_S16_STORE_U8(va0); - dst += stride; - VEC_LOAD_U8_ADD_S16_STORE_U8(va1); - dst += stride; - VEC_LOAD_U8_ADD_S16_STORE_U8(va2); - dst += stride; - VEC_LOAD_U8_ADD_S16_STORE_U8(va3); -} - -#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\ - /* a0 = SRC(0) + SRC(4); */ \ - vec_s16_t a0v = vec_add(s0, s4); \ - /* a2 = SRC(0) - SRC(4); */ \ - vec_s16_t a2v = vec_sub(s0, s4); \ - /* a4 = (SRC(2)>>1) - SRC(6); */ \ - vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \ - /* a6 = (SRC(6)>>1) + SRC(2); */ \ - vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \ - /* b0 = a0 + a6; */ \ - vec_s16_t b0v = vec_add(a0v, a6v); \ - /* b2 = a2 + a4; */ \ - vec_s16_t b2v = vec_add(a2v, a4v); \ - /* b4 = a2 - a4; */ \ - vec_s16_t b4v = vec_sub(a2v, a4v); \ - /* b6 = a0 - a6; */ \ - vec_s16_t b6v = vec_sub(a0v, a6v); \ - /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \ - /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \ - vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \ - /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \ - /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \ - vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\ - /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \ - /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \ - vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\ - /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \ - vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\ - /* b1 = (a7>>2) + a1; */ \ - vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \ - /* b3 = a3 + (a5>>2); */ \ - vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \ - /* b5 = (a3>>2) - a5; */ \ - vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \ - /* b7 = a7 - (a1>>2); */ \ - vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \ - /* DST(0, b0 + b7); */ \ - d0 = vec_add(b0v, b7v); \ - /* DST(1, b2 + b5); */ \ - d1 = vec_add(b2v, b5v); \ - /* DST(2, b4 + b3); */ \ - d2 = vec_add(b4v, b3v); \ - /* DST(3, b6 + b1); */ \ - d3 = vec_add(b6v, b1v); \ - /* DST(4, b6 - b1); */ \ - d4 = vec_sub(b6v, b1v); \ - /* DST(5, b4 - b3); */ \ - d5 = vec_sub(b4v, b3v); \ - /* DST(6, b2 - b5); */ \ - d6 = vec_sub(b2v, b5v); \ - /* DST(7, b0 - b7); */ \ - d7 = vec_sub(b0v, b7v); \ -} - -#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \ - /* unaligned load */ \ - vec_u8_t hv = vec_ld( 0, dest ); \ - vec_u8_t lv = vec_ld( 7, dest ); \ - vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \ - vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \ - vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \ - vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \ - vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \ - vec_u8_t edgehv; \ - /* unaligned store */ \ - vec_u8_t bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\ - vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv ); \ - lv = vec_sel( lv, bodyv, edgelv ); \ - vec_st( lv, 7, dest ); \ - hv = vec_ld( 0, dest ); \ - edgehv = vec_perm( zero_u8v, sel, perm_stv ); \ - hv = vec_sel( hv, bodyv, edgehv ); \ - vec_st( hv, 0, dest ); \ - } - -void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { - vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7; - vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7; - vec_s16_t idct0, idct1, idct2, 
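IDCT8_1D_ALTIVEC above is the 8-point counterpart; written out per element it is the following sketch (illustrative names). The function runs it on the row vectors, transposes with TRANSPOSE8, runs it again, and ALTIVEC_STORE_SUM_CLIP shifts the transformed values right by 6 (with the +32 rounding folded into dct[0]) and adds them to the destination pixels with clipping.

#include <stdint.h>

/* One 1-D pass of the H.264 8x8 inverse transform (cf. the a0..b7 comments). */
static void h264_idct8_1d_sketch(const int16_t s[8], int16_t d[8])
{
    int a0 = s[0] + s[4],        a2 = s[0] - s[4];
    int a4 = (s[2] >> 1) - s[6], a6 = (s[6] >> 1) + s[2];
    int b0 = a0 + a6, b2 = a2 + a4, b4 = a2 - a4, b6 = a0 - a6;

    int a1 = (s[5] - s[3]) - (s[7] + (s[7] >> 1));
    int a3 = (s[7] + s[1]) - (s[3] + (s[3] >> 1));
    int a5 = (s[7] - s[1]) + (s[5] + (s[5] >> 1));
    int a7 = (s[5] + s[3]) + (s[1] + (s[1] >> 1));
    int b1 = (a7 >> 2) + a1, b3 = a3 + (a5 >> 2);
    int b5 = (a3 >> 2) - a5, b7 = a7 - (a1 >> 2);

    d[0] = b0 + b7;  d[1] = b2 + b5;  d[2] = b4 + b3;  d[3] = b6 + b1;
    d[4] = b6 - b1;  d[5] = b4 - b3;  d[6] = b2 - b5;  d[7] = b0 - b7;
}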
idct3, idct4, idct5, idct6, idct7; - - vec_u8_t perm_ldv = vec_lvsl(0, dst); - vec_u8_t perm_stv = vec_lvsr(8, dst); - - const vec_u16_t onev = vec_splat_u16(1); - const vec_u16_t twov = vec_splat_u16(2); - const vec_u16_t sixv = vec_splat_u16(6); - - const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1); - LOAD_ZERO; - - dct[0] += 32; // rounding for the >>6 at the end - - s0 = vec_ld(0x00, (int16_t*)dct); - s1 = vec_ld(0x10, (int16_t*)dct); - s2 = vec_ld(0x20, (int16_t*)dct); - s3 = vec_ld(0x30, (int16_t*)dct); - s4 = vec_ld(0x40, (int16_t*)dct); - s5 = vec_ld(0x50, (int16_t*)dct); - s6 = vec_ld(0x60, (int16_t*)dct); - s7 = vec_ld(0x70, (int16_t*)dct); - - IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, - d0, d1, d2, d3, d4, d5, d6, d7); - - TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 ); - - IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7, - idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7); - - ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel); - ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); -} - -#define transpose4x16(r0, r1, r2, r3) { \ - register vec_u8_t r4; \ - register vec_u8_t r5; \ - register vec_u8_t r6; \ - register vec_u8_t r7; \ - \ - r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ - r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ - r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \ - r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \ - \ - r0 = vec_mergeh(r4, r6); /*all set 0*/ \ - r1 = vec_mergel(r4, r6); /*all set 1*/ \ - r2 = vec_mergeh(r5, r7); /*all set 2*/ \ - r3 = vec_mergel(r5, r7); /*all set 3*/ \ -} - -static inline void write16x4(uint8_t *dst, int dst_stride, - register vec_u8_t r0, register vec_u8_t r1, - register vec_u8_t r2, register vec_u8_t r3) { - DECLARE_ALIGNED_16(unsigned char, result[64]); - uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; - int int_dst_stride = dst_stride/4; - - vec_st(r0, 0, result); - vec_st(r1, 16, result); - vec_st(r2, 32, result); - vec_st(r3, 48, result); - /* FIXME: there has to be a better way!!!! 
*/ - *dst_int = *src_int; - *(dst_int+ int_dst_stride) = *(src_int + 1); - *(dst_int+ 2*int_dst_stride) = *(src_int + 2); - *(dst_int+ 3*int_dst_stride) = *(src_int + 3); - *(dst_int+ 4*int_dst_stride) = *(src_int + 4); - *(dst_int+ 5*int_dst_stride) = *(src_int + 5); - *(dst_int+ 6*int_dst_stride) = *(src_int + 6); - *(dst_int+ 7*int_dst_stride) = *(src_int + 7); - *(dst_int+ 8*int_dst_stride) = *(src_int + 8); - *(dst_int+ 9*int_dst_stride) = *(src_int + 9); - *(dst_int+10*int_dst_stride) = *(src_int + 10); - *(dst_int+11*int_dst_stride) = *(src_int + 11); - *(dst_int+12*int_dst_stride) = *(src_int + 12); - *(dst_int+13*int_dst_stride) = *(src_int + 13); - *(dst_int+14*int_dst_stride) = *(src_int + 14); - *(dst_int+15*int_dst_stride) = *(src_int + 15); -} - -/** \brief performs a 6x16 transpose of data in src, and stores it to dst - \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing - out of unaligned_load() */ -#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ - register vec_u8_t r0 = unaligned_load(0, src); \ - register vec_u8_t r1 = unaligned_load( src_stride, src); \ - register vec_u8_t r2 = unaligned_load(2* src_stride, src); \ - register vec_u8_t r3 = unaligned_load(3* src_stride, src); \ - register vec_u8_t r4 = unaligned_load(4* src_stride, src); \ - register vec_u8_t r5 = unaligned_load(5* src_stride, src); \ - register vec_u8_t r6 = unaligned_load(6* src_stride, src); \ - register vec_u8_t r7 = unaligned_load(7* src_stride, src); \ - register vec_u8_t r14 = unaligned_load(14*src_stride, src); \ - register vec_u8_t r15 = unaligned_load(15*src_stride, src); \ - \ - r8 = unaligned_load( 8*src_stride, src); \ - r9 = unaligned_load( 9*src_stride, src); \ - r10 = unaligned_load(10*src_stride, src); \ - r11 = unaligned_load(11*src_stride, src); \ - r12 = unaligned_load(12*src_stride, src); \ - r13 = unaligned_load(13*src_stride, src); \ - \ - /*Merge first pairs*/ \ - r0 = vec_mergeh(r0, r8); /*0, 8*/ \ - r1 = vec_mergeh(r1, r9); /*1, 9*/ \ - r2 = vec_mergeh(r2, r10); /*2,10*/ \ - r3 = vec_mergeh(r3, r11); /*3,11*/ \ - r4 = vec_mergeh(r4, r12); /*4,12*/ \ - r5 = vec_mergeh(r5, r13); /*5,13*/ \ - r6 = vec_mergeh(r6, r14); /*6,14*/ \ - r7 = vec_mergeh(r7, r15); /*7,15*/ \ - \ - /*Merge second pairs*/ \ - r8 = vec_mergeh(r0, r4); /*0,4, 8,12 set 0*/ \ - r9 = vec_mergel(r0, r4); /*0,4, 8,12 set 1*/ \ - r10 = vec_mergeh(r1, r5); /*1,5, 9,13 set 0*/ \ - r11 = vec_mergel(r1, r5); /*1,5, 9,13 set 1*/ \ - r12 = vec_mergeh(r2, r6); /*2,6,10,14 set 0*/ \ - r13 = vec_mergel(r2, r6); /*2,6,10,14 set 1*/ \ - r14 = vec_mergeh(r3, r7); /*3,7,11,15 set 0*/ \ - r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \ - \ - /*Third merge*/ \ - r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \ - r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \ - r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \ - r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \ - r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \ - r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \ - /* Don't need to compute 3 and 7*/ \ - \ - /*Final merge*/ \ - r8 = vec_mergeh(r0, r4); /*all set 0*/ \ - r9 = vec_mergel(r0, r4); /*all set 1*/ \ - r10 = vec_mergeh(r1, r5); /*all set 2*/ \ - r11 = vec_mergel(r1, r5); /*all set 3*/ \ - r12 = vec_mergeh(r2, r6); /*all set 4*/ \ - r13 = vec_mergel(r2, r6); /*all set 5*/ \ - /* Don't need to compute 14 and 15*/ \ - \ -} - -// out: o = |x-y| < a -static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x, - register vec_u8_t y, 
- register vec_u8_t a) { - - register vec_u8_t diff = vec_subs(x, y); - register vec_u8_t diffneg = vec_subs(y, x); - register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */ - o = (vec_u8_t)vec_cmplt(o, a); - return o; -} - -static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0, - register vec_u8_t p1, - register vec_u8_t q0, - register vec_u8_t q1, - register vec_u8_t alpha, - register vec_u8_t beta) { - - register vec_u8_t mask; - register vec_u8_t tempmask; - - mask = diff_lt_altivec(p0, q0, alpha); - tempmask = diff_lt_altivec(p1, p0, beta); - mask = vec_and(mask, tempmask); - tempmask = diff_lt_altivec(q1, q0, beta); - mask = vec_and(mask, tempmask); - - return mask; -} - -// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) -static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0, - register vec_u8_t p1, - register vec_u8_t p2, - register vec_u8_t q0, - register vec_u8_t tc0) { - - register vec_u8_t average = vec_avg(p0, q0); - register vec_u8_t temp; - register vec_u8_t uncliped; - register vec_u8_t ones; - register vec_u8_t max; - register vec_u8_t min; - register vec_u8_t newp1; - - temp = vec_xor(average, p2); - average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ - ones = vec_splat_u8(1); - temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ - uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ - max = vec_adds(p1, tc0); - min = vec_subs(p1, tc0); - newp1 = vec_max(min, uncliped); - newp1 = vec_min(max, newp1); - return newp1; -} - -#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ - \ - const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ - \ - register vec_u8_t pq0bit = vec_xor(p0,q0); \ - register vec_u8_t q1minus; \ - register vec_u8_t p0minus; \ - register vec_u8_t stage1; \ - register vec_u8_t stage2; \ - register vec_u8_t vec160; \ - register vec_u8_t delta; \ - register vec_u8_t deltaneg; \ - \ - q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ - stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ - stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ - p0minus = vec_nor(p0, p0); /* 255 - p0 */ \ - stage1 = vec_avg(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \ - pq0bit = vec_and(pq0bit, vec_splat_u8(1)); \ - stage2 = vec_avg(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \ - stage2 = vec_adds(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \ - vec160 = vec_ld(0, &A0v); \ - deltaneg = vec_subs(vec160, stage2); /* -d */ \ - delta = vec_subs(stage2, vec160); /* d */ \ - deltaneg = vec_min(tc0masked, deltaneg); \ - delta = vec_min(tc0masked, delta); \ - p0 = vec_subs(p0, deltaneg); \ - q0 = vec_subs(q0, delta); \ - p0 = vec_adds(p0, delta); \ - q0 = vec_adds(q0, deltaneg); \ -} - -#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ - DECLARE_ALIGNED_16(unsigned char, temp[16]); \ - register vec_u8_t alphavec; \ - register vec_u8_t betavec; \ - register vec_u8_t mask; \ - register vec_u8_t p1mask; \ - register vec_u8_t q1mask; \ - register vector signed char tc0vec; \ - register vec_u8_t finaltc0; \ - register vec_u8_t tc0masked; \ - register vec_u8_t newp1; \ - register vec_u8_t newq1; \ - \ - temp[0] = alpha; \ - temp[1] = beta; \ - alphavec = vec_ld(0, temp); \ - betavec = vec_splat(alphavec, 0x1); \ - alphavec = vec_splat(alphavec, 0x0); \ - mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */ \ - \ - *((int *)temp) = *((int *)tc0); \ - tc0vec = vec_ld(0, (signed 
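In scalar terms the helpers above implement the normal-strength H.264 luma edge filter; the vector macros handle 16 edge positions at once and keep everything in saturating unsigned bytes (hence the 160 bias trick in h264_deblock_p0_q0). A per-pixel sketch, with clip3 as an illustrative helper:

#include <stdint.h>
#include <stdlib.h>

static int clip3(int x, int lo, int hi) { return x < lo ? lo : (x > hi ? hi : x); }

/* One pixel position across an edge: p2 p1 p0 | q0 q1 q2 */
static void deblock_edge_sketch(int *p1, int *p0, int *q0, int *q1,
                                int p2, int q2, int alpha, int beta, int tc0)
{
    int tc = tc0, newp1 = *p1, newq1 = *q1, delta;

    /* h264_deblock_mask: only filter when the edge is weak enough */
    if (abs(*p0 - *q0) >= alpha || abs(*p1 - *p0) >= beta || abs(*q1 - *q0) >= beta)
        return;

    if (abs(p2 - *p0) < beta) {        /* h264_deblock_q1 on the p side, tc++ */
        newp1 = clip3((p2 + ((*p0 + *q0 + 1) >> 1)) >> 1, *p1 - tc0, *p1 + tc0);
        tc++;
    }
    if (abs(q2 - *q0) < beta) {        /* h264_deblock_q1 on the q side, tc++ */
        newq1 = clip3((q2 + ((*p0 + *q0 + 1) >> 1)) >> 1, *q1 - tc0, *q1 + tc0);
        tc++;
    }

    /* h264_deblock_p0_q0, using the original p1/q1 values */
    delta = clip3((((*q0 - *p0) * 4) + (*p1 - *q1) + 4) >> 3, -tc, tc);
    *p0   = clip3(*p0 + delta, 0, 255);
    *q0   = clip3(*q0 - delta, 0, 255);
    *p1   = newp1;
    *q1   = newq1;
}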
char*)temp); \ - tc0vec = vec_mergeh(tc0vec, tc0vec); \ - tc0vec = vec_mergeh(tc0vec, tc0vec); \ - mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ - finaltc0 = vec_and((vec_u8_t)tc0vec, mask); /* tc = tc0 */ \ - \ - p1mask = diff_lt_altivec(p2, p0, betavec); \ - p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \ - tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec); \ - finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ - newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ - /*end if*/ \ - \ - q1mask = diff_lt_altivec(q2, q0, betavec); \ - q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ - tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec); \ - finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ - newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ - /*end if*/ \ - \ - h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ - p1 = newp1; \ - q1 = newq1; \ -} - -static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { - - if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { - register vec_u8_t p2 = vec_ld(-3*stride, pix); - register vec_u8_t p1 = vec_ld(-2*stride, pix); - register vec_u8_t p0 = vec_ld(-1*stride, pix); - register vec_u8_t q0 = vec_ld(0, pix); - register vec_u8_t q1 = vec_ld(stride, pix); - register vec_u8_t q2 = vec_ld(2*stride, pix); - h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); - vec_st(p1, -2*stride, pix); - vec_st(p0, -1*stride, pix); - vec_st(q0, 0, pix); - vec_st(q1, stride, pix); - } -} - -static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { - - register vec_u8_t line0, line1, line2, line3, line4, line5; - if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) - return; - readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); - h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); - transpose4x16(line1, line2, line3, line4); - write16x4(pix-2, stride, line1, line2, line3, line4); -} - -void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { - - if (has_altivec()) { - c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; - c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec; - c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; - c->h264_idct_add = ff_h264_idct_add_altivec; - c->h264_idct8_add = ff_h264_idct8_add_altivec; - c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; - c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; - -#define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \ - c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \ - c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \ - c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \ - c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \ - c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \ - c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \ - c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \ - c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \ - c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \ - c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \ - c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \ - c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \ - c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \ - c->PFX 
## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \ - c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec - - dspfunc(put_h264_qpel, 0, 16); - dspfunc(avg_h264_qpel, 0, 16); -#undef dspfunc - } -} diff --git a/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c b/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c deleted file mode 100644 index d8ad96419..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c +++ /dev/null @@ -1,694 +0,0 @@ -/* - * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -//#define DEBUG_ALIGNMENT -#ifdef DEBUG_ALIGNMENT -#define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F)); -#else -#define ASSERT_ALIGNED(ptr) ; -#endif - -/* this code assume that stride % 16 == 0 */ - -#define CHROMA_MC8_ALTIVEC_CORE \ - vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\ - vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);\ -\ - psum = vec_mladd(vA, vsrc0ssH, v32ss);\ - psum = vec_mladd(vB, vsrc1ssH, psum);\ - psum = vec_mladd(vC, vsrc2ssH, psum);\ - psum = vec_mladd(vD, vsrc3ssH, psum);\ - psum = vec_sr(psum, v6us);\ -\ - vdst = vec_ld(0, dst);\ - ppsum = (vec_u8_t)vec_pack(psum, psum);\ - vfdst = vec_perm(vdst, ppsum, fperm);\ -\ - OP_U8_ALTIVEC(fsum, vfdst, vdst);\ -\ - vec_st(fsum, 0, dst);\ -\ - vsrc0ssH = vsrc2ssH;\ - vsrc1ssH = vsrc3ssH;\ -\ - dst += stride;\ - src += stride; - -#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ -\ - vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\ - vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\ -\ - psum = vec_mladd(vA, vsrc0ssH, v32ss);\ - psum = vec_mladd(vE, vsrc1ssH, psum);\ - psum = vec_sr(psum, v6us);\ -\ - vdst = vec_ld(0, dst);\ - ppsum = (vec_u8_t)vec_pack(psum, psum);\ - vfdst = vec_perm(vdst, ppsum, fperm);\ -\ - OP_U8_ALTIVEC(fsum, vfdst, vdst);\ -\ - vec_st(fsum, 0, dst);\ -\ - dst += stride;\ - src += stride; - -void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, - int stride, int h, int x, int y) { - POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); - DECLARE_ALIGNED_16(signed int, ABCD[4]) = - {((8 - x) * (8 - y)), - (( x) * (8 - y)), - ((8 - x) * ( y)), - (( x) * ( y))}; - register int i; - vec_u8_t fperm; - const vec_s32_t vABCD = vec_ld(0, ABCD); - const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); - const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); - const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); - const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); - LOAD_ZERO; - const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); - const vec_u16_t v6us = vec_splat_u16(6); - register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; - register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 
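The ABCD weights above are the standard 1/8-pel bilinear chroma weights; per output pixel the put variant amounts to the sketch below (illustrative names, +32 rounding as in v32ss). When x or y is zero, two of the weights vanish and the code switches to the 2-tap CHROMA_MC8_ALTIVEC_CORE_SIMPLE path with vE = vB + vC.

#include <stdint.h>

/* 8-wide H.264 chroma motion compensation, put variant. */
static void chroma_mc8_put_sketch(uint8_t *dst, const uint8_t *src,
                                  int stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y), B = x * (8 - y);
    const int C = (8 - x) * y,       D = x * y;

    for (int i = 0; i < h; i++) {
        for (int j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}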
1 : 0; - - vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; - vec_u8_t vsrc0uc, vsrc1uc; - vec_s16_t vsrc0ssH, vsrc1ssH; - vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; - vec_s16_t vsrc2ssH, vsrc3ssH, psum; - vec_u8_t vdst, ppsum, vfdst, fsum; - - POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); - - if (((unsigned long)dst) % 16 == 0) { - fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, - 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F); - } else { - fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, - 0x18, 0x19, 0x1A, 0x1B, - 0x1C, 0x1D, 0x1E, 0x1F); - } - - vsrcAuc = vec_ld(0, src); - - if (loadSecond) - vsrcBuc = vec_ld(16, src); - vsrcperm0 = vec_lvsl(0, src); - vsrcperm1 = vec_lvsl(1, src); - - vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcBuc; - else - vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); - - vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc); - vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc); - - if (ABCD[3]) { - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE - } - } else { - vec_u8_t vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 16, src); - vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc3uc = vsrcDuc; - else - vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE - } - } - } else { - const vec_s16_t vE = vec_add(vB, vC); - if (ABCD[2]) { // x == 0 B == 0 - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - - vsrc0uc = vsrc1uc; - } - } else { - vec_u8_t vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 15, src); - vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - - vsrc0uc = vsrc1uc; - } - } - } else { // y == 0 C == 0 - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(0, src); - vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - } - } else { - vec_u8_t vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(0, src); - vsrcDuc = vec_ld(15, src); - vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc1uc = vsrcDuc; - else - vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - CHROMA_MC8_ALTIVEC_CORE_SIMPLE - } - } - } - } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); -} - -#undef CHROMA_MC8_ALTIVEC_CORE - -/* this code assume stride % 16 == 0 */ -static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); - register int i; - - LOAD_ZERO; - const vec_u8_t permM2 = vec_lvsl(-2, src); - const vec_u8_t permM1 = vec_lvsl(-1, src); - const vec_u8_t permP0 = vec_lvsl(+0, src); - const vec_u8_t permP1 = vec_lvsl(+1, src); - const vec_u8_t permP2 = vec_lvsl(+2, src); - const vec_u8_t permP3 = vec_lvsl(+3, src); - const vec_s16_t v5ss = vec_splat_s16(5); - const vec_u16_t v5us = vec_splat_u16(5); - const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vec_s16_t 
v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); - - vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - - register int align = ((((unsigned long)src) - 2) % 16); - - vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, - srcP2A, srcP2B, srcP3A, srcP3B, - srcM1A, srcM1B, srcM2A, srcM2B, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, - pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, - psumA, psumB, sumA, sumB; - - vec_u8_t sum, vdst, fsum; - - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); - - for (i = 0 ; i < 16 ; i ++) { - vec_u8_t srcR1 = vec_ld(-2, src); - vec_u8_t srcR2 = vec_ld(14, src); - - switch (align) { - default: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = vec_perm(srcR1, srcR2, permP3); - } break; - case 11: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = srcR2; - } break; - case 12: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = srcR2; - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 13: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = srcR2; - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 14: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = srcR2; - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 15: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = srcR2; - srcP0 = vec_perm(srcR2, srcR3, permP0); - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - } - - srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); - srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); - srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); - srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); - - srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); - srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); - srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); - srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); - - srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); - srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); - srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); - srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); - - sum1A = vec_adds(srcP0A, srcP1A); - sum1B = vec_adds(srcP0B, srcP1B); - sum2A = vec_adds(srcM1A, srcP2A); - sum2B = vec_adds(srcM1B, srcP2B); - sum3A = vec_adds(srcM2A, srcP3A); - sum3B = vec_adds(srcM2B, srcP3B); - - pp1A = vec_mladd(sum1A, v20ss, v16ss); - pp1B = vec_mladd(sum1B, v20ss, v16ss); - - pp2A = vec_mladd(sum2A, v5ss, zero_s16v); - pp2B = vec_mladd(sum2B, v5ss, zero_s16v); - - pp3A = vec_add(sum3A, pp1A); - pp3B = vec_add(sum3B, pp1B); - - psumA = vec_sub(pp3A, pp2A); - psumB = vec_sub(pp3B, pp2B); - - sumA = vec_sra(psumA, v5us); - sumB = vec_sra(psumB, v5us); - - sum = vec_packsu(sumA, sumB); - - 
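The arithmetic in the loop above is the standard 6-tap H.264 half-pel luma filter, 20*(P0+P1) - 5*(M1+P2) + (M2+P3), with +16 rounding and a shift by 5; OP_U8_ALTIVEC then either stores the packed result (put) or averages it with the existing destination (avg). One row of it in scalar form, as a sketch; clip_uint8 is an illustrative helper, and src needs 2 valid pixels to the left and 3 to the right:

#include <stdint.h>

static int clip_uint8(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }

static void qpel_h_lowpass_row_sketch(uint8_t *dst, const uint8_t *src, int width)
{
    for (int x = 0; x < width; x++)
        dst[x] = clip_uint8((20 * (src[x]     + src[x + 1]) -
                              5 * (src[x - 1] + src[x + 2]) +
                                  (src[x - 2] + src[x + 3]) + 16) >> 5);
}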
ASSERT_ALIGNED(dst); - vdst = vec_ld(0, dst); - - OP_U8_ALTIVEC(fsum, sum, vdst); - - vec_st(fsum, 0, dst); - - src += srcStride; - dst += dstStride; - } -POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); -} - -/* this code assume stride % 16 == 0 */ -static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1); - - register int i; - - LOAD_ZERO; - const vec_u8_t perm = vec_lvsl(0, src); - const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vec_u16_t v5us = vec_splat_u16(5); - const vec_s16_t v5ss = vec_splat_s16(5); - const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); - - uint8_t *srcbis = src - (srcStride * 2); - - const vec_u8_t srcM2a = vec_ld(0, srcbis); - const vec_u8_t srcM2b = vec_ld(16, srcbis); - const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm); -// srcbis += srcStride; - const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride); - const vec_u8_t srcM1b = vec_ld(16, srcbis); - const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm); -// srcbis += srcStride; - const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride); - const vec_u8_t srcP0b = vec_ld(16, srcbis); - const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm); -// srcbis += srcStride; - const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride); - const vec_u8_t srcP1b = vec_ld(16, srcbis); - const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm); -// srcbis += srcStride; - const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride); - const vec_u8_t srcP2b = vec_ld(16, srcbis); - const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm); -// srcbis += srcStride; - - vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); - vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2); - vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); - vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1); - vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); - vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0); - vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); - vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1); - vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); - vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2); - - vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, - psumA, psumB, sumA, sumB, - srcP3ssA, srcP3ssB, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; - - vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3; - - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); - - for (i = 0 ; i < 16 ; i++) { - srcP3a = vec_ld(0, srcbis += srcStride); - srcP3b = vec_ld(16, srcbis); - srcP3 = vec_perm(srcP3a, srcP3b, perm); - srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); - srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3); -// srcbis += srcStride; - - sum1A = vec_adds(srcP0ssA, srcP1ssA); - sum1B = vec_adds(srcP0ssB, srcP1ssB); - sum2A = vec_adds(srcM1ssA, srcP2ssA); - sum2B = vec_adds(srcM1ssB, srcP2ssB); - sum3A = vec_adds(srcM2ssA, srcP3ssA); - sum3B = vec_adds(srcM2ssB, srcP3ssB); - - srcM2ssA = srcM1ssA; - srcM2ssB = srcM1ssB; - srcM1ssA = srcP0ssA; - srcM1ssB = srcP0ssB; - srcP0ssA = srcP1ssA; - srcP0ssB = srcP1ssB; - srcP1ssA = srcP2ssA; - srcP1ssB = srcP2ssB; - srcP2ssA = srcP3ssA; - srcP2ssB = srcP3ssB; - - pp1A = vec_mladd(sum1A, v20ss, v16ss); - pp1B = vec_mladd(sum1B, v20ss, v16ss); - - pp2A = vec_mladd(sum2A, v5ss, zero_s16v); - pp2B = vec_mladd(sum2B, v5ss, zero_s16v); - - pp3A = 
vec_add(sum3A, pp1A); - pp3B = vec_add(sum3B, pp1B); - - psumA = vec_sub(pp3A, pp2A); - psumB = vec_sub(pp3B, pp2B); - - sumA = vec_sra(psumA, v5us); - sumB = vec_sra(psumB, v5us); - - sum = vec_packsu(sumA, sumB); - - ASSERT_ALIGNED(dst); - vdst = vec_ld(0, dst); - - OP_U8_ALTIVEC(fsum, sum, vdst); - - vec_st(fsum, 0, dst); - - dst += dstStride; - } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); -} - -/* this code assume stride % 16 == 0 *and* tmp is properly aligned */ -static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { - POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); - register int i; - LOAD_ZERO; - const vec_u8_t permM2 = vec_lvsl(-2, src); - const vec_u8_t permM1 = vec_lvsl(-1, src); - const vec_u8_t permP0 = vec_lvsl(+0, src); - const vec_u8_t permP1 = vec_lvsl(+1, src); - const vec_u8_t permP2 = vec_lvsl(+2, src); - const vec_u8_t permP3 = vec_lvsl(+3, src); - const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vec_u32_t v10ui = vec_splat_u32(10); - const vec_s16_t v5ss = vec_splat_s16(5); - const vec_s16_t v1ss = vec_splat_s16(1); - const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); - const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); - - register int align = ((((unsigned long)src) - 2) % 16); - - vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, - srcP2A, srcP2B, srcP3A, srcP3B, - srcM1A, srcM1B, srcM2A, srcM2B, - sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, - pp1A, pp1B, pp2A, pp2B, psumA, psumB; - - const vec_u8_t mperm = (const vec_u8_t) - AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, - 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); - int16_t *tmpbis = tmp; - - vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, - tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, - tmpP2ssA, tmpP2ssB; - - vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, - pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, - pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, - ssumAe, ssumAo, ssumBe, ssumBo; - vec_u8_t fsum, sumv, sum, vdst; - vec_s16_t ssume, ssumo; - - POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); - src -= (2 * srcStride); - for (i = 0 ; i < 21 ; i ++) { - vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - vec_u8_t srcR1 = vec_ld(-2, src); - vec_u8_t srcR2 = vec_ld(14, src); - - switch (align) { - default: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = vec_perm(srcR1, srcR2, permP3); - } break; - case 11: { - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = vec_perm(srcR1, srcR2, permP2); - srcP3 = srcR2; - } break; - case 12: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = vec_perm(srcR1, srcR2, permP1); - srcP2 = srcR2; - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 13: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = vec_perm(srcR1, srcR2, permP0); - srcP1 = srcR2; - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - 
case 14: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = vec_perm(srcR1, srcR2, permM1); - srcP0 = srcR2; - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - case 15: { - vec_u8_t srcR3 = vec_ld(30, src); - srcM2 = vec_perm(srcR1, srcR2, permM2); - srcM1 = srcR2; - srcP0 = vec_perm(srcR2, srcR3, permP0); - srcP1 = vec_perm(srcR2, srcR3, permP1); - srcP2 = vec_perm(srcR2, srcR3, permP2); - srcP3 = vec_perm(srcR2, srcR3, permP3); - } break; - } - - srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); - srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); - srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); - srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); - - srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); - srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); - srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); - srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); - - srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); - srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); - srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); - srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); - - sum1A = vec_adds(srcP0A, srcP1A); - sum1B = vec_adds(srcP0B, srcP1B); - sum2A = vec_adds(srcM1A, srcP2A); - sum2B = vec_adds(srcM1B, srcP2B); - sum3A = vec_adds(srcM2A, srcP3A); - sum3B = vec_adds(srcM2B, srcP3B); - - pp1A = vec_mladd(sum1A, v20ss, sum3A); - pp1B = vec_mladd(sum1B, v20ss, sum3B); - - pp2A = vec_mladd(sum2A, v5ss, zero_s16v); - pp2B = vec_mladd(sum2B, v5ss, zero_s16v); - - psumA = vec_sub(pp1A, pp2A); - psumB = vec_sub(pp1B, pp2B); - - vec_st(psumA, 0, tmp); - vec_st(psumB, 16, tmp); - - src += srcStride; - tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */ - } - - tmpM2ssA = vec_ld(0, tmpbis); - tmpM2ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpM1ssA = vec_ld(0, tmpbis); - tmpM1ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP0ssA = vec_ld(0, tmpbis); - tmpP0ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP1ssA = vec_ld(0, tmpbis); - tmpP1ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - tmpP2ssA = vec_ld(0, tmpbis); - tmpP2ssB = vec_ld(16, tmpbis); - tmpbis += tmpStride; - - for (i = 0 ; i < 16 ; i++) { - const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis); - const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis); - - const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA); - const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB); - const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA); - const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB); - const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA); - const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB); - - tmpbis += tmpStride; - - tmpM2ssA = tmpM1ssA; - tmpM2ssB = tmpM1ssB; - tmpM1ssA = tmpP0ssA; - tmpM1ssB = tmpP0ssB; - tmpP0ssA = tmpP1ssA; - tmpP0ssB = tmpP1ssB; - tmpP1ssA = tmpP2ssA; - tmpP1ssB = tmpP2ssB; - tmpP2ssA = tmpP3ssA; - tmpP2ssB = tmpP3ssB; - - pp1Ae = vec_mule(sum1A, v20ss); - pp1Ao = vec_mulo(sum1A, v20ss); - pp1Be = vec_mule(sum1B, v20ss); - pp1Bo = vec_mulo(sum1B, v20ss); - - pp2Ae = vec_mule(sum2A, v5ss); - pp2Ao = vec_mulo(sum2A, v5ss); - pp2Be = vec_mule(sum2B, v5ss); - pp2Bo = vec_mulo(sum2B, v5ss); - - pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui); - pp3Ao = vec_mulo(sum3A, v1ss); - pp3Be = vec_sra((vec_s32_t)sum3B, v16ui); - pp3Bo = vec_mulo(sum3B, v1ss); - - pp1cAe = vec_add(pp1Ae, v512si); - pp1cAo = vec_add(pp1Ao, v512si); - pp1cBe = vec_add(pp1Be, v512si); - pp1cBo = vec_add(pp1Bo, v512si); - - 
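The hv (center) case applies the same 6-tap filter twice: the first loop above stores the unrounded horizontal result into the 16-bit tmp buffer, and the second loop filters tmp vertically and normalizes with +512 and a shift by 10. Both passes together in scalar form, as a sketch with illustrative names (the caller provides the usual 2/3-pixel margins):

#include <stdint.h>

static int clip_u8(int x) { return x < 0 ? 0 : (x > 255 ? 255 : x); }

static void qpel_hv_lowpass_sketch(uint8_t *dst, int16_t *tmp, const uint8_t *src,
                                   int dstStride, int tmpStride, int srcStride,
                                   int w, int h)
{
    src -= 2 * srcStride;
    for (int i = 0; i < h + 5; i++) {            /* horizontal pass, no rounding */
        for (int x = 0; x < w; x++)
            tmp[x] = 20 * (src[x]     + src[x + 1]) -
                      5 * (src[x - 1] + src[x + 2]) +
                          (src[x - 2] + src[x + 3]);
        src += srcStride;
        tmp += tmpStride;
    }
    tmp -= (h + 3) * tmpStride;                  /* back to the row for output 0 */
    for (int i = 0; i < h; i++) {                /* vertical pass on tmp */
        for (int x = 0; x < w; x++)
            dst[x] = clip_u8((20 * (tmp[x]                 + tmp[x + tmpStride]) -
                               5 * (tmp[x - tmpStride]     + tmp[x + 2 * tmpStride]) +
                                   (tmp[x - 2 * tmpStride] + tmp[x + 3 * tmpStride]) +
                              512) >> 10);
        dst += dstStride;
        tmp += tmpStride;
    }
}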
pp32Ae = vec_sub(pp3Ae, pp2Ae); - pp32Ao = vec_sub(pp3Ao, pp2Ao); - pp32Be = vec_sub(pp3Be, pp2Be); - pp32Bo = vec_sub(pp3Bo, pp2Bo); - - sumAe = vec_add(pp1cAe, pp32Ae); - sumAo = vec_add(pp1cAo, pp32Ao); - sumBe = vec_add(pp1cBe, pp32Be); - sumBo = vec_add(pp1cBo, pp32Bo); - - ssumAe = vec_sra(sumAe, v10ui); - ssumAo = vec_sra(sumAo, v10ui); - ssumBe = vec_sra(sumBe, v10ui); - ssumBo = vec_sra(sumBo, v10ui); - - ssume = vec_packs(ssumAe, ssumBe); - ssumo = vec_packs(ssumAo, ssumBo); - - sumv = vec_packsu(ssume, ssumo); - sum = vec_perm(sumv, sumv, mperm); - - ASSERT_ALIGNED(dst); - vdst = vec_ld(0, dst); - - OP_U8_ALTIVEC(fsum, sum, vdst); - - vec_st(fsum, 0, dst); - - dst += dstStride; - } - POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); -} diff --git a/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c b/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c deleted file mode 100644 index 37b2f62c3..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c +++ /dev/null @@ -1,236 +0,0 @@ -/* - * Copyright (c) 2001 Michel Lespinasse - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * NOTE: This code is based on GPL code from the libmpeg2 project. The - * author, Michel Lespinasses, has given explicit permission to release - * under LGPL as part of ffmpeg. - * - */ - -/* - * FFMpeg integration by Dieter Shirley - * - * This file is a direct copy of the altivec idct module from the libmpeg2 - * project. I've deleted all of the libmpeg2 specific code, renamed the functions and - * re-ordered the function parameters. The only change to the IDCT function - * itself was to factor out the partial transposition, and to perform a full - * transpose at the end of the function. 
- */ - - -#include <stdlib.h> /* malloc(), free() */ -#include <string.h> -#include "dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_ppc.h" - -#define vector_s16_t vector signed short -#define const_vector_s16_t const vector signed short -#define vector_u16_t vector unsigned short -#define vector_s8_t vector signed char -#define vector_u8_t vector unsigned char -#define vector_s32_t vector signed int -#define vector_u32_t vector unsigned int - -#define IDCT_HALF \ - /* 1st stage */ \ - t1 = vec_mradds (a1, vx7, vx1 ); \ - t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \ - t7 = vec_mradds (a2, vx5, vx3); \ - t3 = vec_mradds (ma2, vx3, vx5); \ - \ - /* 2nd stage */ \ - t5 = vec_adds (vx0, vx4); \ - t0 = vec_subs (vx0, vx4); \ - t2 = vec_mradds (a0, vx6, vx2); \ - t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \ - t6 = vec_adds (t8, t3); \ - t3 = vec_subs (t8, t3); \ - t8 = vec_subs (t1, t7); \ - t1 = vec_adds (t1, t7); \ - \ - /* 3rd stage */ \ - t7 = vec_adds (t5, t2); \ - t2 = vec_subs (t5, t2); \ - t5 = vec_adds (t0, t4); \ - t0 = vec_subs (t0, t4); \ - t4 = vec_subs (t8, t3); \ - t3 = vec_adds (t8, t3); \ - \ - /* 4th stage */ \ - vy0 = vec_adds (t7, t1); \ - vy7 = vec_subs (t7, t1); \ - vy1 = vec_mradds (c4, t3, t5); \ - vy6 = vec_mradds (mc4, t3, t5); \ - vy2 = vec_mradds (c4, t4, t0); \ - vy5 = vec_mradds (mc4, t4, t0); \ - vy3 = vec_adds (t2, t6); \ - vy4 = vec_subs (t2, t6); - - -#define IDCT \ - vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \ - vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ - vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias; \ - vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \ - vector_u16_t shift; \ - \ - c4 = vec_splat (constants[0], 0); \ - a0 = vec_splat (constants[0], 1); \ - a1 = vec_splat (constants[0], 2); \ - a2 = vec_splat (constants[0], 3); \ - mc4 = vec_splat (constants[0], 4); \ - ma2 = vec_splat (constants[0], 5); \ - bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \ - \ - zero = vec_splat_s16 (0); \ - shift = vec_splat_u16 (4); \ - \ - vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \ - vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \ - vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \ - vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \ - vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \ - vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \ - vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \ - vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \ - \ - IDCT_HALF \ - \ - vx0 = vec_mergeh (vy0, vy4); \ - vx1 = vec_mergel (vy0, vy4); \ - vx2 = vec_mergeh (vy1, vy5); \ - vx3 = vec_mergel (vy1, vy5); \ - vx4 = vec_mergeh (vy2, vy6); \ - vx5 = vec_mergel (vy2, vy6); \ - vx6 = vec_mergeh (vy3, vy7); \ - vx7 = vec_mergel (vy3, vy7); \ - \ - vy0 = vec_mergeh (vx0, vx4); \ - vy1 = vec_mergel (vx0, vx4); \ - vy2 = vec_mergeh (vx1, vx5); \ - vy3 = vec_mergel (vx1, vx5); \ - vy4 = vec_mergeh (vx2, vx6); \ - vy5 = vec_mergel (vx2, vx6); \ - vy6 = vec_mergeh (vx3, vx7); \ - vy7 = vec_mergel (vx3, vx7); \ - \ - vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \ - vx1 = vec_mergel (vy0, vy4); \ - vx2 = vec_mergeh (vy1, vy5); \ - vx3 = vec_mergel (vy1, vy5); \ - vx4 = vec_mergeh (vy2, vy6); \ - vx5 = vec_mergel (vy2, vy6); \ - vx6 = vec_mergeh (vy3, vy7); \ - vx7 = vec_mergel (vy3, vy7); \ - \ - IDCT_HALF \ - \ - shift = vec_splat_u16 (6); \ - vx0 = vec_sra (vy0, shift); \ - vx1 = vec_sra (vy1, 
shift); \ - vx2 = vec_sra (vy2, shift); \ - vx3 = vec_sra (vy3, shift); \ - vx4 = vec_sra (vy4, shift); \ - vx5 = vec_sra (vy5, shift); \ - vx6 = vec_sra (vy6, shift); \ - vx7 = vec_sra (vy7, shift); - - -static const_vector_s16_t constants[5] = { - (vector_s16_t) AVV(23170, 13573, 6518, 21895, -23170, -21895, 32, 31), - (vector_s16_t) AVV(16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725), - (vector_s16_t) AVV(22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521), - (vector_s16_t) AVV(21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692), - (vector_s16_t) AVV(19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722) -}; - -void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block) -{ -POWERPC_PERF_DECLARE(altivec_idct_put_num, 1); - vector_u8_t tmp; - -#ifdef CONFIG_POWERPC_PERF -POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); -#endif - IDCT - -#define COPY(dest,src) \ - tmp = vec_packsu (src, src); \ - vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \ - vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest); - - COPY (dest, vx0) dest += stride; - COPY (dest, vx1) dest += stride; - COPY (dest, vx2) dest += stride; - COPY (dest, vx3) dest += stride; - COPY (dest, vx4) dest += stride; - COPY (dest, vx5) dest += stride; - COPY (dest, vx6) dest += stride; - COPY (dest, vx7) - -POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1); -} - -void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block) -{ -POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); - vector_u8_t tmp; - vector_s16_t tmp2, tmp3; - vector_u8_t perm0; - vector_u8_t perm1; - vector_u8_t p0, p1, p; - -#ifdef CONFIG_POWERPC_PERF -POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); -#endif - - IDCT - - p0 = vec_lvsl (0, dest); - p1 = vec_lvsl (stride, dest); - p = vec_splat_u8 (-1); - perm0 = vec_mergeh (p, p0); - perm1 = vec_mergeh (p, p1); - -#define ADD(dest,src,perm) \ - /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ - tmp = vec_ld (0, dest); \ - tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm); \ - tmp3 = vec_adds (tmp2, src); \ - tmp = vec_packsu (tmp3, tmp3); \ - vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \ - vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest); - - ADD (dest, vx0, perm0) dest += stride; - ADD (dest, vx1, perm1) dest += stride; - ADD (dest, vx2, perm0) dest += stride; - ADD (dest, vx3, perm1) dest += stride; - ADD (dest, vx4, perm0) dest += stride; - ADD (dest, vx5, perm1) dest += stride; - ADD (dest, vx6, perm0) dest += stride; - ADD (dest, vx7, perm1) - -POWERPC_PERF_STOP_COUNT(altivec_idct_add_num, 1); -} - diff --git a/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.c b/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.c deleted file mode 100644 index 3b161c5a6..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.c +++ /dev/null @@ -1,153 +0,0 @@ -/* - * High quality image resampling with polyphase filters - * Copyright (c) 2001 Fabrice Bellard. - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file imgresample_altivec.c - * High quality image resampling with polyphase filters - AltiVec bits - */ - -#include "gcc_fixes.h" - -typedef union { - vector unsigned char v; - unsigned char c[16]; -} vec_uc_t; - -typedef union { - vector signed short v; - signed short s[8]; -} vec_ss_t; - -void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, - int wrap, int16_t *filter) -{ - int sum, i; - const uint8_t *s; - vector unsigned char *tv, tmp, dstv, zero; - vec_ss_t srchv[4], srclv[4], fv[4]; - vector signed short zeros, sumhv, sumlv; - s = src; - - for(i=0;i<4;i++) - { - /* - The vec_madds later on does an implicit >>15 on the result. - Since FILTER_BITS is 8, and we have 15 bits of magnitude in - a signed short, we have just enough bits to pre-shift our - filter constants <<7 to compensate for vec_madds. - */ - fv[i].s[0] = filter[i] << (15-FILTER_BITS); - fv[i].v = vec_splat(fv[i].v, 0); - } - - zero = vec_splat_u8(0); - zeros = vec_splat_s16(0); - - - /* - When we're resampling, we'd ideally like both our input buffers, - and output buffers to be 16-byte aligned, so we can do both aligned - reads and writes. Sadly we can't always have this at the moment, so - we opt for aligned writes, as unaligned writes have a huge overhead. - To do this, do enough scalar resamples to get dst 16-byte aligned. - */ - i = (-(int)dst) & 0xf; - while(i>0) { - sum = s[0 * wrap] * filter[0] + - s[1 * wrap] * filter[1] + - s[2 * wrap] * filter[2] + - s[3 * wrap] * filter[3]; - sum = sum >> FILTER_BITS; - if (sum<0) sum = 0; else if (sum>255) sum=255; - dst[0] = sum; - dst++; - s++; - dst_width--; - i--; - } - - /* Do our altivec resampling on 16 pixels at once. */ - while(dst_width>=16) { - /* - Read 16 (potentially unaligned) bytes from each of - 4 lines into 4 vectors, and split them into shorts. - Interleave the multipy/accumulate for the resample - filter with the loads to hide the 3 cycle latency - the vec_madds have. 
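The << (15 - FILTER_BITS) applied to the taps above exists because vec_madds keeps only the product bits above bit 15 (an implicit >> 15) before adding the accumulator; pre-scaling the 8-bit taps by 2^7 therefore reproduces the scalar >> FILTER_BITS result. A small check of that identity, as a sketch; filter_bits stands in for FILTER_BITS, which the comment above says is 8:

#include <assert.h>
#include <stdint.h>

static void madds_preshift_check(void)
{
    const int filter_bits = 8;             /* FILTER_BITS in the code above */
    int16_t tap = 37, sample = 200;        /* arbitrary example values */
    int16_t pre = (int16_t)(tap << (15 - filter_bits));

    /* (sample * (tap << 7)) >> 15 equals (sample * tap) >> 8 */
    assert(((sample * pre) >> 15) == ((sample * tap) >> filter_bits));
}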
- */ - tv = (vector unsigned char *) &s[0 * wrap]; - tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap])); - srchv[0].v = (vector signed short) vec_mergeh(zero, tmp); - srclv[0].v = (vector signed short) vec_mergel(zero, tmp); - sumhv = vec_madds(srchv[0].v, fv[0].v, zeros); - sumlv = vec_madds(srclv[0].v, fv[0].v, zeros); - - tv = (vector unsigned char *) &s[1 * wrap]; - tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap])); - srchv[1].v = (vector signed short) vec_mergeh(zero, tmp); - srclv[1].v = (vector signed short) vec_mergel(zero, tmp); - sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv); - sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv); - - tv = (vector unsigned char *) &s[2 * wrap]; - tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap])); - srchv[2].v = (vector signed short) vec_mergeh(zero, tmp); - srclv[2].v = (vector signed short) vec_mergel(zero, tmp); - sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv); - sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv); - - tv = (vector unsigned char *) &s[3 * wrap]; - tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap])); - srchv[3].v = (vector signed short) vec_mergeh(zero, tmp); - srclv[3].v = (vector signed short) vec_mergel(zero, tmp); - sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv); - sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv); - - /* - Pack the results into our destination vector, - and do an aligned write of that back to memory. - */ - dstv = vec_packsu(sumhv, sumlv) ; - vec_st(dstv, 0, (vector unsigned char *) dst); - - dst+=16; - s+=16; - dst_width-=16; - } - - /* - If there are any leftover pixels, resample them - with the slow scalar method. - */ - while(dst_width>0) { - sum = s[0 * wrap] * filter[0] + - s[1 * wrap] * filter[1] + - s[2 * wrap] * filter[2] + - s[3 * wrap] * filter[3]; - sum = sum >> FILTER_BITS; - if (sum<0) sum = 0; else if (sum>255) sum=255; - dst[0] = sum; - dst++; - s++; - dst_width--; - } -} - diff --git a/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.h b/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.h deleted file mode 100644 index 538c1bee6..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef FFMPEG_IMGRESAMPLE_ALTIVEC_H -#define FFMPEG_IMGRESAMPLE_ALTIVEC_H - -#include <stdint.h> - -void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, - int wrap, int16_t *filter); -#endif /* FFMPEG_IMGRESAMPLE_ALTIVEC_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/int_altivec.c b/contrib/ffmpeg/libavcodec/ppc/int_altivec.c deleted file mode 100644 index 95497c99a..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/int_altivec.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org> - * - * This file is part of FFmpeg. 
- * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - ** @file int_altivec.c - ** integer misc ops. - **/ - -#include "dsputil.h" - -#include "gcc_fixes.h" - -#include "dsputil_altivec.h" - -static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2, - int size) { - int i, size16; - vector signed char vpix1; - vector signed short vpix2, vdiff, vpix1l,vpix1h; - union { vector signed int vscore; - int32_t score[4]; - } u; - u.vscore = vec_splat_s32(0); -// -//XXX lazy way, fix it later - -#define vec_unaligned_load(b) \ - vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b)); - - size16 = size >> 4; - while(size16) { -// score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]); - //load pix1 and the first batch of pix2 - - vpix1 = vec_unaligned_load(pix1); - vpix2 = vec_unaligned_load(pix2); - pix2 += 8; - //unpack - vpix1h = vec_unpackh(vpix1); - vdiff = vec_sub(vpix1h, vpix2); - vpix1l = vec_unpackl(vpix1); - // load another batch from pix2 - vpix2 = vec_unaligned_load(pix2); - u.vscore = vec_msum(vdiff, vdiff, u.vscore); - vdiff = vec_sub(vpix1l, vpix2); - u.vscore = vec_msum(vdiff, vdiff, u.vscore); - pix1 += 16; - pix2 += 8; - size16--; - } - u.vscore = vec_sums(u.vscore, vec_splat_s32(0)); - - size %= 16; - for (i = 0; i < size; i++) { - u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]); - } - return u.score[3]; -} - -void int_init_altivec(DSPContext* c, AVCodecContext *avctx) -{ - c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec; -} diff --git a/contrib/ffmpeg/libavcodec/ppc/mathops.h b/contrib/ffmpeg/libavcodec/ppc/mathops.h deleted file mode 100644 index d7cc85365..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/mathops.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * simple math operations - * Copyright (c) 2001, 2002 Fabrice Bellard. - * Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
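The vec_msum()/vec_sums() pair above accumulates the same sum of squared differences as the scalar tail loop: each 32-bit lane of the accumulator picks up two adjacent 16-bit products, and vec_sums then folds the four lanes into the last element. A per-element model, as a sketch:

#include <stdint.h>

/* Model of vec_msum() on signed shorts: two products per 32-bit lane. */
static void msum_s16_model(const int16_t a[8], const int16_t b[8], int32_t acc[4])
{
    for (int lane = 0; lane < 4; lane++)
        acc[lane] += a[2 * lane]     * b[2 * lane] +
                     a[2 * lane + 1] * b[2 * lane + 1];
}

/* Model of vec_sums(acc, zero): the total ends up in element 3. */
static int32_t sums_model(const int32_t acc[4])
{
    return acc[0] + acc[1] + acc[2] + acc[3];
}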
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef FFMPEG_PPC_MATHOPS_H -#define FFMPEG_PPC_MATHOPS_H - -#if defined(ARCH_POWERPC_405) -/* signed 16x16 -> 32 multiply add accumulate */ -# define MAC16(rt, ra, rb) \ - asm ("maclhw %0, %2, %3" : "=r" (rt) : "0" (rt), "r" (ra), "r" (rb)); - -/* signed 16x16 -> 32 multiply */ -# define MUL16(ra, rb) \ - ({ int __rt; - asm ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); - __rt; }) -#endif - -#endif /* FFMPEG_PPC_MATHOPS_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c b/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c deleted file mode 100644 index a2ba5e125..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c +++ /dev/null @@ -1,645 +0,0 @@ -/* - * Copyright (c) 2002 Dieter Shirley - * - * dct_unquantize_h263_altivec: - * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stdlib.h> -#include <stdio.h> -#include "dsputil.h" -#include "mpegvideo.h" - -#include "gcc_fixes.h" - -#include "dsputil_ppc.h" -#include "util_altivec.h" -// Swaps two variables (used for altivec registers) -#define SWAP(a,b) \ -do { \ - __typeof__(a) swap_temp=a; \ - a=b; \ - b=swap_temp; \ -} while (0) - -// transposes a matrix consisting of four vectors with four elements each -#define TRANSPOSE4(a,b,c,d) \ -do { \ - __typeof__(a) _trans_ach = vec_mergeh(a, c); \ - __typeof__(a) _trans_acl = vec_mergel(a, c); \ - __typeof__(a) _trans_bdh = vec_mergeh(b, d); \ - __typeof__(a) _trans_bdl = vec_mergel(b, d); \ - \ - a = vec_mergeh(_trans_ach, _trans_bdh); \ - b = vec_mergel(_trans_ach, _trans_bdh); \ - c = vec_mergeh(_trans_acl, _trans_bdl); \ - d = vec_mergel(_trans_acl, _trans_bdl); \ -} while (0) - - -// Loads a four-byte value (int or float) from the target address -// into every element in the target vector. Only works if the -// target address is four-byte aligned (which should be always). 
-#define LOAD4(vec, address) \ -{ \ - __typeof__(vec)* _load_addr = (__typeof__(vec)*)(address); \ - vector unsigned char _perm_vec = vec_lvsl(0,(address)); \ - vec = vec_ld(0, _load_addr); \ - vec = vec_perm(vec, vec, _perm_vec); \ - vec = vec_splat(vec, 0); \ -} - - -#define FOUROF(a) AVV(a,a,a,a) - -int dct_quantize_altivec(MpegEncContext* s, - DCTELEM* data, int n, - int qscale, int* overflow) -{ - int lastNonZero; - vector float row0, row1, row2, row3, row4, row5, row6, row7; - vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7; - const vector float zero = (const vector float)FOUROF(0.); - // used after quantize step - int oldBaseValue = 0; - - // Load the data into the row/alt vectors - { - vector signed short data0, data1, data2, data3, data4, data5, data6, data7; - - data0 = vec_ld(0, data); - data1 = vec_ld(16, data); - data2 = vec_ld(32, data); - data3 = vec_ld(48, data); - data4 = vec_ld(64, data); - data5 = vec_ld(80, data); - data6 = vec_ld(96, data); - data7 = vec_ld(112, data); - - // Transpose the data before we start - TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); - - // load the data into floating point vectors. We load - // the high half of each row into the main row vectors - // and the low half into the alt vectors. - row0 = vec_ctf(vec_unpackh(data0), 0); - alt0 = vec_ctf(vec_unpackl(data0), 0); - row1 = vec_ctf(vec_unpackh(data1), 0); - alt1 = vec_ctf(vec_unpackl(data1), 0); - row2 = vec_ctf(vec_unpackh(data2), 0); - alt2 = vec_ctf(vec_unpackl(data2), 0); - row3 = vec_ctf(vec_unpackh(data3), 0); - alt3 = vec_ctf(vec_unpackl(data3), 0); - row4 = vec_ctf(vec_unpackh(data4), 0); - alt4 = vec_ctf(vec_unpackl(data4), 0); - row5 = vec_ctf(vec_unpackh(data5), 0); - alt5 = vec_ctf(vec_unpackl(data5), 0); - row6 = vec_ctf(vec_unpackh(data6), 0); - alt6 = vec_ctf(vec_unpackl(data6), 0); - row7 = vec_ctf(vec_unpackh(data7), 0); - alt7 = vec_ctf(vec_unpackl(data7), 0); - } - - // The following block could exist as a separate an altivec dct - // function. However, if we put it inline, the DCT data can remain - // in the vector local variables, as floats, which we'll use during the - // quantize step... 
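For orientation, the even half of the jfdctint-style forward-DCT pass that the vector code below implements can be written in scalar form as follows. This is a reference sketch only (the helper name is not part of the original file); it uses the FIX_ constants named in the inline comments and omits the DESCALE/PASS1_BITS scaling.

/* Scalar reference for the even part of one forward-DCT pass:
 * produces coefficients 0, 4, 2 and 6. The odd part (tmp4..tmp7) is omitted. */
static void fdct_even_part_ref(const float d[8], float out[8])
{
    float tmp0 = d[0] + d[7];
    float tmp1 = d[1] + d[6];
    float tmp2 = d[2] + d[5];
    float tmp3 = d[3] + d[4];

    float tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3;
    float tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;

    out[0] = tmp10 + tmp11;                      /* row0 */
    out[4] = tmp10 - tmp11;                      /* row4 */

    float z1 = (tmp12 + tmp13) * 0.541196100f;   /* FIX_0_541196100 */
    out[2]  = z1 + tmp13 * 0.765366865f;         /* FIX_0_765366865  -> row2 */
    out[6]  = z1 - tmp12 * 1.847759065f;         /* -FIX_1_847759065 -> row6 */
}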
- { - const vector float vec_0_298631336 = (vector float)FOUROF(0.298631336f); - const vector float vec_0_390180644 = (vector float)FOUROF(-0.390180644f); - const vector float vec_0_541196100 = (vector float)FOUROF(0.541196100f); - const vector float vec_0_765366865 = (vector float)FOUROF(0.765366865f); - const vector float vec_0_899976223 = (vector float)FOUROF(-0.899976223f); - const vector float vec_1_175875602 = (vector float)FOUROF(1.175875602f); - const vector float vec_1_501321110 = (vector float)FOUROF(1.501321110f); - const vector float vec_1_847759065 = (vector float)FOUROF(-1.847759065f); - const vector float vec_1_961570560 = (vector float)FOUROF(-1.961570560f); - const vector float vec_2_053119869 = (vector float)FOUROF(2.053119869f); - const vector float vec_2_562915447 = (vector float)FOUROF(-2.562915447f); - const vector float vec_3_072711026 = (vector float)FOUROF(3.072711026f); - - - int whichPass, whichHalf; - - for(whichPass = 1; whichPass<=2; whichPass++) - { - for(whichHalf = 1; whichHalf<=2; whichHalf++) - { - vector float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - vector float tmp10, tmp11, tmp12, tmp13; - vector float z1, z2, z3, z4, z5; - - tmp0 = vec_add(row0, row7); // tmp0 = dataptr[0] + dataptr[7]; - tmp7 = vec_sub(row0, row7); // tmp7 = dataptr[0] - dataptr[7]; - tmp3 = vec_add(row3, row4); // tmp3 = dataptr[3] + dataptr[4]; - tmp4 = vec_sub(row3, row4); // tmp4 = dataptr[3] - dataptr[4]; - tmp1 = vec_add(row1, row6); // tmp1 = dataptr[1] + dataptr[6]; - tmp6 = vec_sub(row1, row6); // tmp6 = dataptr[1] - dataptr[6]; - tmp2 = vec_add(row2, row5); // tmp2 = dataptr[2] + dataptr[5]; - tmp5 = vec_sub(row2, row5); // tmp5 = dataptr[2] - dataptr[5]; - - tmp10 = vec_add(tmp0, tmp3); // tmp10 = tmp0 + tmp3; - tmp13 = vec_sub(tmp0, tmp3); // tmp13 = tmp0 - tmp3; - tmp11 = vec_add(tmp1, tmp2); // tmp11 = tmp1 + tmp2; - tmp12 = vec_sub(tmp1, tmp2); // tmp12 = tmp1 - tmp2; - - - // dataptr[0] = (DCTELEM) ((tmp10 + tmp11) << PASS1_BITS); - row0 = vec_add(tmp10, tmp11); - - // dataptr[4] = (DCTELEM) ((tmp10 - tmp11) << PASS1_BITS); - row4 = vec_sub(tmp10, tmp11); - - - // z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100); - z1 = vec_madd(vec_add(tmp12, tmp13), vec_0_541196100, (vector float)zero); - - // dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865), - // CONST_BITS-PASS1_BITS); - row2 = vec_madd(tmp13, vec_0_765366865, z1); - - // dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065), - // CONST_BITS-PASS1_BITS); - row6 = vec_madd(tmp12, vec_1_847759065, z1); - - z1 = vec_add(tmp4, tmp7); // z1 = tmp4 + tmp7; - z2 = vec_add(tmp5, tmp6); // z2 = tmp5 + tmp6; - z3 = vec_add(tmp4, tmp6); // z3 = tmp4 + tmp6; - z4 = vec_add(tmp5, tmp7); // z4 = tmp5 + tmp7; - - // z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */ - z5 = vec_madd(vec_add(z3, z4), vec_1_175875602, (vector float)zero); - - // z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */ - z3 = vec_madd(z3, vec_1_961570560, z5); - - // z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */ - z4 = vec_madd(z4, vec_0_390180644, z5); - - // The following adds are rolled into the multiplies above - // z3 = vec_add(z3, z5); // z3 += z5; - // z4 = vec_add(z4, z5); // z4 += z5; - - // z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */ - // Wow! It's actually more effecient to roll this multiply - // into the adds below, even thought the multiply gets done twice! 
- // z2 = vec_madd(z2, vec_2_562915447, (vector float)zero); - - // z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */ - // Same with this one... - // z1 = vec_madd(z1, vec_0_899976223, (vector float)zero); - - // tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */ - // dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS); - row7 = vec_madd(tmp4, vec_0_298631336, vec_madd(z1, vec_0_899976223, z3)); - - // tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */ - // dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS); - row5 = vec_madd(tmp5, vec_2_053119869, vec_madd(z2, vec_2_562915447, z4)); - - // tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */ - // dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS); - row3 = vec_madd(tmp6, vec_3_072711026, vec_madd(z2, vec_2_562915447, z3)); - - // tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */ - // dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS); - row1 = vec_madd(z1, vec_0_899976223, vec_madd(tmp7, vec_1_501321110, z4)); - - // Swap the row values with the alts. If this is the first half, - // this sets up the low values to be acted on in the second half. - // If this is the second half, it puts the high values back in - // the row values where they are expected to be when we're done. - SWAP(row0, alt0); - SWAP(row1, alt1); - SWAP(row2, alt2); - SWAP(row3, alt3); - SWAP(row4, alt4); - SWAP(row5, alt5); - SWAP(row6, alt6); - SWAP(row7, alt7); - } - - if (whichPass == 1) - { - // transpose the data for the second pass - - // First, block transpose the upper right with lower left. - SWAP(row4, alt0); - SWAP(row5, alt1); - SWAP(row6, alt2); - SWAP(row7, alt3); - - // Now, transpose each block of four - TRANSPOSE4(row0, row1, row2, row3); - TRANSPOSE4(row4, row5, row6, row7); - TRANSPOSE4(alt0, alt1, alt2, alt3); - TRANSPOSE4(alt4, alt5, alt6, alt7); - } - } - } - - // perform the quantize step, using the floating point data - // still in the row/alt registers - { - const int* biasAddr; - const vector signed int* qmat; - vector float bias, negBias; - - if (s->mb_intra) - { - vector signed int baseVector; - - // We must cache element 0 in the intra case - // (it needs special handling). - baseVector = vec_cts(vec_splat(row0, 0), 0); - vec_ste(baseVector, 0, &oldBaseValue); - - qmat = (vector signed int*)s->q_intra_matrix[qscale]; - biasAddr = &(s->intra_quant_bias); - } - else - { - qmat = (vector signed int*)s->q_inter_matrix[qscale]; - biasAddr = &(s->inter_quant_bias); - } - - // Load the bias vector (We add 0.5 to the bias so that we're - // rounding when we convert to int, instead of flooring.) 
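The bias/negBias pair loaded below implements sign-aware rounding: a positive coefficient gets +bias added before the truncating float-to-int conversion and a negative one gets -bias, so the conversion rounds symmetrically instead of biasing results toward zero. In scalar terms the pattern is roughly as follows (a sketch with a hypothetical helper name, not code from this file):

/* Hypothetical scalar equivalent of the vec_sel(negBias, bias, vec_cmpgt(x, zero))
 * pattern used below: add +bias for positive coefficients, -bias for negative
 * ones, then truncate. */
static inline int quantize_round(float coeff, float qmat_coef, float bias)
{
    float scaled = coeff * qmat_coef;
    return (int)(scaled + (coeff > 0.0f ? bias : -bias));
}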
- { - vector signed int biasInt; - const vector float negOneFloat = (vector float)FOUROF(-1.0f); - LOAD4(biasInt, biasAddr); - bias = vec_ctf(biasInt, QUANT_BIAS_SHIFT); - negBias = vec_madd(bias, negOneFloat, zero); - } - - { - vector float q0, q1, q2, q3, q4, q5, q6, q7; - - q0 = vec_ctf(qmat[0], QMAT_SHIFT); - q1 = vec_ctf(qmat[2], QMAT_SHIFT); - q2 = vec_ctf(qmat[4], QMAT_SHIFT); - q3 = vec_ctf(qmat[6], QMAT_SHIFT); - q4 = vec_ctf(qmat[8], QMAT_SHIFT); - q5 = vec_ctf(qmat[10], QMAT_SHIFT); - q6 = vec_ctf(qmat[12], QMAT_SHIFT); - q7 = vec_ctf(qmat[14], QMAT_SHIFT); - - row0 = vec_sel(vec_madd(row0, q0, negBias), vec_madd(row0, q0, bias), - vec_cmpgt(row0, zero)); - row1 = vec_sel(vec_madd(row1, q1, negBias), vec_madd(row1, q1, bias), - vec_cmpgt(row1, zero)); - row2 = vec_sel(vec_madd(row2, q2, negBias), vec_madd(row2, q2, bias), - vec_cmpgt(row2, zero)); - row3 = vec_sel(vec_madd(row3, q3, negBias), vec_madd(row3, q3, bias), - vec_cmpgt(row3, zero)); - row4 = vec_sel(vec_madd(row4, q4, negBias), vec_madd(row4, q4, bias), - vec_cmpgt(row4, zero)); - row5 = vec_sel(vec_madd(row5, q5, negBias), vec_madd(row5, q5, bias), - vec_cmpgt(row5, zero)); - row6 = vec_sel(vec_madd(row6, q6, negBias), vec_madd(row6, q6, bias), - vec_cmpgt(row6, zero)); - row7 = vec_sel(vec_madd(row7, q7, negBias), vec_madd(row7, q7, bias), - vec_cmpgt(row7, zero)); - - q0 = vec_ctf(qmat[1], QMAT_SHIFT); - q1 = vec_ctf(qmat[3], QMAT_SHIFT); - q2 = vec_ctf(qmat[5], QMAT_SHIFT); - q3 = vec_ctf(qmat[7], QMAT_SHIFT); - q4 = vec_ctf(qmat[9], QMAT_SHIFT); - q5 = vec_ctf(qmat[11], QMAT_SHIFT); - q6 = vec_ctf(qmat[13], QMAT_SHIFT); - q7 = vec_ctf(qmat[15], QMAT_SHIFT); - - alt0 = vec_sel(vec_madd(alt0, q0, negBias), vec_madd(alt0, q0, bias), - vec_cmpgt(alt0, zero)); - alt1 = vec_sel(vec_madd(alt1, q1, negBias), vec_madd(alt1, q1, bias), - vec_cmpgt(alt1, zero)); - alt2 = vec_sel(vec_madd(alt2, q2, negBias), vec_madd(alt2, q2, bias), - vec_cmpgt(alt2, zero)); - alt3 = vec_sel(vec_madd(alt3, q3, negBias), vec_madd(alt3, q3, bias), - vec_cmpgt(alt3, zero)); - alt4 = vec_sel(vec_madd(alt4, q4, negBias), vec_madd(alt4, q4, bias), - vec_cmpgt(alt4, zero)); - alt5 = vec_sel(vec_madd(alt5, q5, negBias), vec_madd(alt5, q5, bias), - vec_cmpgt(alt5, zero)); - alt6 = vec_sel(vec_madd(alt6, q6, negBias), vec_madd(alt6, q6, bias), - vec_cmpgt(alt6, zero)); - alt7 = vec_sel(vec_madd(alt7, q7, negBias), vec_madd(alt7, q7, bias), - vec_cmpgt(alt7, zero)); - } - - - } - - // Store the data back into the original block - { - vector signed short data0, data1, data2, data3, data4, data5, data6, data7; - - data0 = vec_pack(vec_cts(row0, 0), vec_cts(alt0, 0)); - data1 = vec_pack(vec_cts(row1, 0), vec_cts(alt1, 0)); - data2 = vec_pack(vec_cts(row2, 0), vec_cts(alt2, 0)); - data3 = vec_pack(vec_cts(row3, 0), vec_cts(alt3, 0)); - data4 = vec_pack(vec_cts(row4, 0), vec_cts(alt4, 0)); - data5 = vec_pack(vec_cts(row5, 0), vec_cts(alt5, 0)); - data6 = vec_pack(vec_cts(row6, 0), vec_cts(alt6, 0)); - data7 = vec_pack(vec_cts(row7, 0), vec_cts(alt7, 0)); - - { - // Clamp for overflow - vector signed int max_q_int, min_q_int; - vector signed short max_q, min_q; - - LOAD4(max_q_int, &(s->max_qcoeff)); - LOAD4(min_q_int, &(s->min_qcoeff)); - - max_q = vec_pack(max_q_int, max_q_int); - min_q = vec_pack(min_q_int, min_q_int); - - data0 = vec_max(vec_min(data0, max_q), min_q); - data1 = vec_max(vec_min(data1, max_q), min_q); - data2 = vec_max(vec_min(data2, max_q), min_q); - data4 = vec_max(vec_min(data4, max_q), min_q); - data5 = vec_max(vec_min(data5, max_q), 
min_q); - data6 = vec_max(vec_min(data6, max_q), min_q); - data7 = vec_max(vec_min(data7, max_q), min_q); - } - - { - vector bool char zero_01, zero_23, zero_45, zero_67; - vector signed char scanIndices_01, scanIndices_23, scanIndices_45, scanIndices_67; - vector signed char negOne = vec_splat_s8(-1); - vector signed char* scanPtr = - (vector signed char*)(s->intra_scantable.inverse); - signed char lastNonZeroChar; - - // Determine the largest non-zero index. - zero_01 = vec_pack(vec_cmpeq(data0, (vector signed short)zero), - vec_cmpeq(data1, (vector signed short)zero)); - zero_23 = vec_pack(vec_cmpeq(data2, (vector signed short)zero), - vec_cmpeq(data3, (vector signed short)zero)); - zero_45 = vec_pack(vec_cmpeq(data4, (vector signed short)zero), - vec_cmpeq(data5, (vector signed short)zero)); - zero_67 = vec_pack(vec_cmpeq(data6, (vector signed short)zero), - vec_cmpeq(data7, (vector signed short)zero)); - - // 64 biggest values - scanIndices_01 = vec_sel(scanPtr[0], negOne, zero_01); - scanIndices_23 = vec_sel(scanPtr[1], negOne, zero_23); - scanIndices_45 = vec_sel(scanPtr[2], negOne, zero_45); - scanIndices_67 = vec_sel(scanPtr[3], negOne, zero_67); - - // 32 largest values - scanIndices_01 = vec_max(scanIndices_01, scanIndices_23); - scanIndices_45 = vec_max(scanIndices_45, scanIndices_67); - - // 16 largest values - scanIndices_01 = vec_max(scanIndices_01, scanIndices_45); - - // 8 largest values - scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), - vec_mergel(scanIndices_01, negOne)); - - // 4 largest values - scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), - vec_mergel(scanIndices_01, negOne)); - - // 2 largest values - scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), - vec_mergel(scanIndices_01, negOne)); - - // largest value - scanIndices_01 = vec_max(vec_mergeh(scanIndices_01, negOne), - vec_mergel(scanIndices_01, negOne)); - - scanIndices_01 = vec_splat(scanIndices_01, 0); - - - vec_ste(scanIndices_01, 0, &lastNonZeroChar); - - lastNonZero = lastNonZeroChar; - - // While the data is still in vectors we check for the transpose IDCT permute - // and handle it using the vector unit if we can. This is the permute used - // by the altivec idct, so it is common when using the altivec dct. - - if ((lastNonZero > 0) && (s->dsp.idct_permutation_type == FF_TRANSPOSE_IDCT_PERM)) - { - TRANSPOSE8(data0, data1, data2, data3, data4, data5, data6, data7); - } - - vec_st(data0, 0, data); - vec_st(data1, 16, data); - vec_st(data2, 32, data); - vec_st(data3, 48, data); - vec_st(data4, 64, data); - vec_st(data5, 80, data); - vec_st(data6, 96, data); - vec_st(data7, 112, data); - } - } - - // special handling of block[0] - if (s->mb_intra) - { - if (!s->h263_aic) - { - if (n < 4) - oldBaseValue /= s->y_dc_scale; - else - oldBaseValue /= s->c_dc_scale; - } - - // Divide by 8, rounding the result - data[0] = (oldBaseValue + 4) >> 3; - } - - // We handled the transpose permutation above and we don't - // need to permute the "no" permutation case. 
- if ((lastNonZero > 0) && - (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) && - (s->dsp.idct_permutation_type != FF_NO_IDCT_PERM)) - { - ff_block_permute(data, s->dsp.idct_permutation, - s->intra_scantable.scantable, lastNonZero); - } - - return lastNonZero; -} - -/* - AltiVec version of dct_unquantize_h263 - this code assumes `block' is 16 bytes-aligned -*/ -void dct_unquantize_h263_altivec(MpegEncContext *s, - DCTELEM *block, int n, int qscale) -{ -POWERPC_PERF_DECLARE(altivec_dct_unquantize_h263_num, 1); - int i, level, qmul, qadd; - int nCoeffs; - - assert(s->block_last_index[n]>=0); - -POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1); - - qadd = (qscale - 1) | 1; - qmul = qscale << 1; - - if (s->mb_intra) { - if (!s->h263_aic) { - if (n < 4) - block[0] = block[0] * s->y_dc_scale; - else - block[0] = block[0] * s->c_dc_scale; - }else - qadd = 0; - i = 1; - nCoeffs= 63; //does not always use zigzag table - } else { - i = 0; - nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; - } - - { - register const vector signed short vczero = (const vector signed short)vec_splat_s16(0); - DECLARE_ALIGNED_16(short, qmul8[]) = - { - qmul, qmul, qmul, qmul, - qmul, qmul, qmul, qmul - }; - DECLARE_ALIGNED_16(short, qadd8[]) = - { - qadd, qadd, qadd, qadd, - qadd, qadd, qadd, qadd - }; - DECLARE_ALIGNED_16(short, nqadd8[]) = - { - -qadd, -qadd, -qadd, -qadd, - -qadd, -qadd, -qadd, -qadd - }; - register vector signed short blockv, qmulv, qaddv, nqaddv, temp1; - register vector bool short blockv_null, blockv_neg; - register short backup_0 = block[0]; - register int j = 0; - - qmulv = vec_ld(0, qmul8); - qaddv = vec_ld(0, qadd8); - nqaddv = vec_ld(0, nqadd8); - -#if 0 // block *is* 16 bytes-aligned, it seems. - // first make sure block[j] is 16 bytes-aligned - for(j = 0; (j <= nCoeffs) && ((((unsigned long)block) + (j << 1)) & 0x0000000F) ; j++) { - level = block[j]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; - } - block[j] = level; - } - } -#endif - - // vectorize all the 16 bytes-aligned blocks - // of 8 elements - for(; (j + 7) <= nCoeffs ; j+=8) - { - blockv = vec_ld(j << 1, block); - blockv_neg = vec_cmplt(blockv, vczero); - blockv_null = vec_cmpeq(blockv, vczero); - // choose between +qadd or -qadd as the third operand - temp1 = vec_sel(qaddv, nqaddv, blockv_neg); - // multiply & add (block{i,i+7} * qmul [+-] qadd) - temp1 = vec_mladd(blockv, qmulv, temp1); - // put 0 where block[{i,i+7} used to have 0 - blockv = vec_sel(temp1, blockv, blockv_null); - vec_st(blockv, j << 1, block); - } - - // if nCoeffs isn't a multiple of 8, finish the job - // using good old scalar units. - // (we could do it using a truncated vector, - // but I'm not sure it's worth the hassle) - for(; j <= nCoeffs ; j++) { - level = block[j]; - if (level) { - if (level < 0) { - level = level * qmul - qadd; - } else { - level = level * qmul + qadd; - } - block[j] = level; - } - } - - if (i == 1) - { // cheat. 
this avoid special-casing the first iteration - block[0] = backup_0; - } - } -POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63); -} - - -extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); -extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); - -void MPV_common_init_altivec(MpegEncContext *s) -{ - if ((mm_flags & MM_ALTIVEC) == 0) return; - - if (s->avctx->lowres==0) - { - if ((s->avctx->idct_algo == FF_IDCT_AUTO) || - (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) - { - s->dsp.idct_put = idct_put_altivec; - s->dsp.idct_add = idct_add_altivec; - s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - } - } - - // Test to make sure that the dct required alignments are met. - if ((((long)(s->q_intra_matrix) & 0x0f) != 0) || - (((long)(s->q_inter_matrix) & 0x0f) != 0)) - { - av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned " - "to use AltiVec DCT. Reverting to non-AltiVec version.\n"); - return; - } - - if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) - { - av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned " - "to use AltiVec DCT. Reverting to non-AltiVec version.\n"); - return; - } - - - if ((s->avctx->dct_algo == FF_DCT_AUTO) || - (s->avctx->dct_algo == FF_DCT_ALTIVEC)) - { -#if 0 /* seems to cause trouble under some circumstances */ - s->dct_quantize = dct_quantize_altivec; -#endif - s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec; - s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec; - } -} diff --git a/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c b/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c deleted file mode 100644 index 8770f05f5..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c +++ /dev/null @@ -1,789 +0,0 @@ -/* - * AltiVec-optimized snow DSP utils - * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "dsputil.h" - -#include "gcc_fixes.h" -#include "dsputil_altivec.h" -#include "snow.h" - -#undef NDEBUG -#include <assert.h> - - - -//FIXME remove this replication -#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? 
(slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num))) - -static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line) -{ - int offset; - DWTELEM * buffer; - -// av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line); - - assert(buf->data_stack_top >= 0); -// assert(!buf->line[line]); - if (buf->line[line]) - return buf->line[line]; - - offset = buf->line_width * line; - buffer = buf->data_stack[buf->data_stack_top]; - buf->data_stack_top--; - buf->line[line] = buffer; - -// av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1); - - return buffer; -} - - -//altivec code - -void ff_snow_horizontal_compose97i_altivec(IDWTELEM *b, int width) -{ -#if 0 - const int w2= (width+1)>>1; - DECLARE_ALIGNED_16(IDWTELEM, temp[(width>>1)]); - const int w_l= (width>>1); - const int w_r= w2 - 1; - int i; - vector signed short t1, t2, x, y, tmp1, tmp2; - vector signed short *vbuf, *vtmp; - vector unsigned char align; - - { // Lift 0 - IDWTELEM * const ref = b + w2 - 1; - IDWTELEM b_0 = b[0]; - vector signed short v7 = vec_splat_s16(7); - vbuf = (vector signed short *)b; - - tmp1 = vec_ld (0, ref); - align = vec_lvsl (0, ref); - tmp2 = vec_ld (15, ref); - t1 = vec_perm(tmp1, tmp2, align); - - for (i=0; i<w_l-15; i+=16) { -#if 0 -/* b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3); - b[i+1] = b[i+1] - ((3 * (ref[i+1] + ref[i+2]) + 4) >> 3); - b[i+2] = b[i+2] - ((3 * (ref[i+2] + ref[i+3]) + 4) >> 3); - b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3);*/ - b[i+0] = b[i+0] + ((7 * (ref[i+0] + ref[i+1])-1) >> 8); -#else - - tmp1 = vec_ld (0, ref+8+i); - tmp2 = vec_ld (15, ref+8+i); - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1, vec_sld(t1,t2,2)); -// y = vec_add(vec_add(y,y),y); - - tmp1 = vec_ld (0, ref+12+i); - - y = vec_add(y, vec_splat_s32(4)); - y = vec_sra(y, vec_splat_u32(3)); - - tmp2 = vec_ld (15, ref+12+i); - - *vbuf = vec_sub(*vbuf, y); - - t1 = t2; - - vbuf++; - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_add(vec_add(y,y),y); - - tmp1 = vec_ld (0, ref+12+i); - - y = vec_add(y, vec_splat_s32(4)); - y = vec_sra(y, vec_splat_u32(3)); - - tmp2 = vec_ld (15, ref+12+i); - - *vbuf = vec_sub(*vbuf, y); - - t1=t2; - - vbuf++; - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_add(vec_add(y,y),y); - - tmp1 = vec_ld (0, ref+16+i); - - y = vec_add(y, vec_splat_s32(4)); - y = vec_sra(y, vec_splat_u32(3)); - - tmp2 = vec_ld (15, ref+16+i); - - *vbuf = vec_sub(*vbuf, y); - - t1=t2; - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_add(vec_add(y,y),y); - - vbuf++; - - y = vec_add(y, vec_splat_s32(4)); - y = vec_sra(y, vec_splat_u32(3)); - *vbuf = vec_sub(*vbuf, y); - - t1=t2; - - vbuf++; - -#endif - - } - - snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); - b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); - } - - { // Lift 1 - DWTELEM * const dst = b+w2; - - i = 0; - for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){ - dst[i] = dst[i] - (b[i] + b[i + 1]); - } - - align = vec_lvsl(0, b+i); - tmp1 = vec_ld(0, b+i); - vbuf = (vector signed int*) (dst + i); - tmp2 = vec_ld(15, b+i); - - t1 = vec_perm(tmp1, tmp2, align); - - for (; i<w_r-3; i+=4) { - -#if 0 - dst[i] = dst[i] - (b[i] + b[i + 1]); - dst[i+1] = dst[i+1] - (b[i+1] + b[i + 2]); - dst[i+2] = dst[i+2] - (b[i+2] + b[i + 3]); - dst[i+3] = dst[i+3] - (b[i+3] + b[i + 4]); -#else - - tmp1 = vec_ld(0, b+4+i); - tmp2 = 
vec_ld(15, b+4+i); - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1, vec_sld(t1,t2,4)); - *vbuf = vec_sub (*vbuf, y); - - vbuf++; - - t1 = t2; - -#endif - - } - - snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); - } - - { // Lift 2 - DWTELEM * const ref = b+w2 - 1; - DWTELEM b_0 = b[0]; - vbuf= (vector signed int *) b; - - tmp1 = vec_ld (0, ref); - align = vec_lvsl (0, ref); - tmp2 = vec_ld (15, ref); - t1= vec_perm(tmp1, tmp2, align); - - i = 0; - for (; i<w_l-15; i+=16) { -#if 0 - b[i] = b[i] - (((8 -(ref[i] + ref[i+1])) - (b[i] <<2)) >> 4); - b[i+1] = b[i+1] - (((8 -(ref[i+1] + ref[i+2])) - (b[i+1]<<2)) >> 4); - b[i+2] = b[i+2] - (((8 -(ref[i+2] + ref[i+3])) - (b[i+2]<<2)) >> 4); - b[i+3] = b[i+3] - (((8 -(ref[i+3] + ref[i+4])) - (b[i+3]<<2)) >> 4); -#else - tmp1 = vec_ld (0, ref+4+i); - tmp2 = vec_ld (15, ref+4+i); - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_sub(vec_splat_s32(8),y); - - tmp1 = vec_ld (0, ref+8+i); - - x = vec_sl(*vbuf,vec_splat_u32(2)); - y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); - - tmp2 = vec_ld (15, ref+8+i); - - *vbuf = vec_sub( *vbuf, y); - - t1 = t2; - - vbuf++; - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_sub(vec_splat_s32(8),y); - - tmp1 = vec_ld (0, ref+12+i); - - x = vec_sl(*vbuf,vec_splat_u32(2)); - y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); - - tmp2 = vec_ld (15, ref+12+i); - - *vbuf = vec_sub( *vbuf, y); - - t1 = t2; - - vbuf++; - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_sub(vec_splat_s32(8),y); - - tmp1 = vec_ld (0, ref+16+i); - - x = vec_sl(*vbuf,vec_splat_u32(2)); - y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); - - tmp2 = vec_ld (15, ref+16+i); - - *vbuf = vec_sub( *vbuf, y); - - t1 = t2; - - vbuf++; - - t2 = vec_perm(tmp1, tmp2, align); - - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_sub(vec_splat_s32(8),y); - - t1 = t2; - - x = vec_sl(*vbuf,vec_splat_u32(2)); - y = vec_sra(vec_sub(y,x),vec_splat_u32(4)); - *vbuf = vec_sub( *vbuf, y); - - vbuf++; - -#endif - } - - snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); - b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS); - } - - { // Lift 3 - DWTELEM * const src = b+w2; - - vbuf = (vector signed int *)b; - vtmp = (vector signed int *)temp; - - i = 0; - align = vec_lvsl(0, src); - - for (; i<w_r-3; i+=4) { -#if 0 - temp[i] = src[i] - ((-3*(b[i] + b[i+1]))>>1); - temp[i+1] = src[i+1] - ((-3*(b[i+1] + b[i+2]))>>1); - temp[i+2] = src[i+2] - ((-3*(b[i+2] + b[i+3]))>>1); - temp[i+3] = src[i+3] - ((-3*(b[i+3] + b[i+4]))>>1); -#else - tmp1 = vec_ld(0,src+i); - t1 = vec_add(vbuf[0],vec_sld(vbuf[0],vbuf[1],4)); - tmp2 = vec_ld(15,src+i); - t1 = vec_sub(vec_splat_s32(0),t1); //bad! 
- t1 = vec_add(t1,vec_add(t1,t1)); - t2 = vec_perm(tmp1 ,tmp2 ,align); - t1 = vec_sra(t1,vec_splat_u32(1)); - vbuf++; - *vtmp = vec_sub(t2,t1); - vtmp++; - -#endif - - } - - snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -3, 0, 1); - } - - { - //Interleave - int a; - vector signed int *t = (vector signed int *)temp, - *v = (vector signed int *)b; - - snow_interleave_line_header(&i, width, b, temp); - - for (; (i & 0xE) != 0xE; i-=2){ - b[i+1] = temp[i>>1]; - b[i] = b[i>>1]; - } - for (i-=14; i>=0; i-=16){ - a=i/4; - - v[a+3]=vec_mergel(v[(a>>1)+1],t[(a>>1)+1]); - v[a+2]=vec_mergeh(v[(a>>1)+1],t[(a>>1)+1]); - v[a+1]=vec_mergel(v[a>>1],t[a>>1]); - v[a]=vec_mergeh(v[a>>1],t[a>>1]); - - } - - } -#endif -} - -void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width) -{ - int i, w4 = width/4; - vector signed int *v0, *v1,*v2,*v3,*v4,*v5; - vector signed int t1, t2; - - v0=(vector signed int *)b0; - v1=(vector signed int *)b1; - v2=(vector signed int *)b2; - v3=(vector signed int *)b3; - v4=(vector signed int *)b4; - v5=(vector signed int *)b5; - - for (i=0; i< w4;i++) - { - - #if 0 - b4[i] -= (3*(b3[i] + b5[i])+4)>>3; - b3[i] -= ((b2[i] + b4[i])); - b2[i] += ((b1[i] + b3[i])+4*b2[i]+8)>>4; - b1[i] += (3*(b0[i] + b2[i]))>>1; - #else - t1 = vec_add(v3[i], v5[i]); - t2 = vec_add(t1, vec_add(t1,t1)); - t1 = vec_add(t2, vec_splat_s32(4)); - v4[i] = vec_sub(v4[i], vec_sra(t1,vec_splat_u32(3))); - - v3[i] = vec_sub(v3[i], vec_add(v2[i], v4[i])); - - t1 = vec_add(vec_splat_s32(8), vec_add(v1[i], v3[i])); - t2 = vec_sl(v2[i], vec_splat_u32(2)); - v2[i] = vec_add(v2[i], vec_sra(vec_add(t1,t2),vec_splat_u32(4))); - t1 = vec_add(v0[i], v2[i]); - t2 = vec_add(t1, vec_add(t1,t1)); - v1[i] = vec_add(v1[i], vec_sra(t2,vec_splat_u32(1))); - - #endif - } - - for(i*=4; i < width; i++) - { - b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; - b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; - b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; - b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; - } -} - -#define LOAD_BLOCKS \ - tmp1 = vec_ld(0, &block[3][y*src_stride]);\ - align = vec_lvsl(0, &block[3][y*src_stride]);\ - tmp2 = vec_ld(15, &block[3][y*src_stride]);\ -\ - b3 = vec_perm(tmp1,tmp2,align);\ -\ - tmp1 = vec_ld(0, &block[2][y*src_stride]);\ - align = vec_lvsl(0, &block[2][y*src_stride]);\ - tmp2 = vec_ld(15, &block[2][y*src_stride]);\ -\ - b2 = vec_perm(tmp1,tmp2,align);\ -\ - tmp1 = vec_ld(0, &block[1][y*src_stride]);\ - align = vec_lvsl(0, &block[1][y*src_stride]);\ - tmp2 = vec_ld(15, &block[1][y*src_stride]);\ -\ - b1 = vec_perm(tmp1,tmp2,align);\ -\ - tmp1 = vec_ld(0, &block[0][y*src_stride]);\ - align = vec_lvsl(0, &block[0][y*src_stride]);\ - tmp2 = vec_ld(15, &block[0][y*src_stride]);\ -\ - b0 = vec_perm(tmp1,tmp2,align); - -#define LOAD_OBMCS \ - tmp1 = vec_ld(0, obmc1);\ - align = vec_lvsl(0, obmc1);\ - tmp2 = vec_ld(15, obmc1);\ -\ - ob1 = vec_perm(tmp1,tmp2,align);\ -\ - tmp1 = vec_ld(0, obmc2);\ - align = vec_lvsl(0, obmc2);\ - tmp2 = vec_ld(15, obmc2);\ -\ - ob2 = vec_perm(tmp1,tmp2,align);\ -\ - tmp1 = vec_ld(0, obmc3);\ - align = vec_lvsl(0, obmc3);\ - tmp2 = vec_ld(15, obmc3);\ -\ - ob3 = vec_perm(tmp1,tmp2,align);\ -\ - tmp1 = vec_ld(0, obmc4);\ - align = vec_lvsl(0, obmc4);\ - tmp2 = vec_ld(15, obmc4);\ -\ - ob4 = vec_perm(tmp1,tmp2,align); - -/* interleave logic - * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ] - * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ] - * h <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ] - 
*/ - -#define STEPS_0_1\ - h1 = (vector unsigned short)\ - vec_mergeh(ob1, ob2);\ -\ - h2 = (vector unsigned short)\ - vec_mergeh(ob3, ob4);\ -\ - ih = (vector unsigned char)\ - vec_mergeh(h1,h2);\ -\ - l1 = (vector unsigned short) vec_mergeh(b3, b2);\ -\ - ih1 = (vector unsigned char) vec_mergel(h1, h2);\ -\ - l2 = (vector unsigned short) vec_mergeh(b1, b0);\ -\ - il = (vector unsigned char) vec_mergeh(l1, l2);\ -\ - v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\ -\ - il1 = (vector unsigned char) vec_mergel(l1, l2);\ -\ - v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0)); - -#define FINAL_STEP_SCALAR\ - for(x=0; x<b_w; x++)\ - if(add){\ - vbuf[x] += dst[x + src_x];\ - vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\ - if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\ - dst8[x + y*src_stride] = vbuf[x];\ - }else{\ - dst[x + src_x] -= vbuf[x];\ - } - -static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc, - const int obmc_stride, - uint8_t * * block, int b_w, - int b_h, int src_x, int src_y, - int src_stride, slice_buffer * sb, - int add, uint8_t * dst8) -{ - int y, x; - DWTELEM * dst; - vector unsigned short h1, h2, l1, l2; - vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; - vector unsigned char b0,b1,b2,b3; - vector unsigned char ob1,ob2,ob3,ob4; - - DECLARE_ALIGNED_16(int, vbuf[16]); - vector signed int *v = (vector signed int *)vbuf, *d; - - for(y=0; y<b_h; y++){ - //FIXME ugly misuse of obmc_stride - - uint8_t *obmc1= obmc + y*obmc_stride; - uint8_t *obmc2= obmc1+ (obmc_stride>>1); - uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); - uint8_t *obmc4= obmc3+ (obmc_stride>>1); - - dst = slice_buffer_get_line(sb, src_y + y); - d = (vector signed int *)(dst + src_x); - -//FIXME i could avoid some loads! 
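One detail of FINAL_STEP_SCALAR above is the branch-light clamp "if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);". A more explicit scalar form of the same trick is sketched below (illustrative only, assuming 32-bit int with arithmetic right shift, as the original code does):

/* Same clamp as in FINAL_STEP_SCALAR: out-of-range values become 0 or 255.
 * For v < 0:   v >> 31 is -1, so ~(v >> 31) is 0.
 * For v > 255: v >> 31 is 0, so ~(v >> 31) is all ones, i.e. 255 in the low byte. */
static inline int clamp_to_uint8(int v)
{
    if (v & ~255)
        v = ~(v >> 31);
    return v & 255;
}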
- - // load blocks - LOAD_BLOCKS - - // load obmcs - LOAD_OBMCS - - // steps 0 1 - STEPS_0_1 - - FINAL_STEP_SCALAR - - } - -} - -#define STEPS_2_3\ - h1 = (vector unsigned short) vec_mergel(ob1, ob2);\ -\ - h2 = (vector unsigned short) vec_mergel(ob3, ob4);\ -\ - ih = (vector unsigned char) vec_mergeh(h1,h2);\ -\ - l1 = (vector unsigned short) vec_mergel(b3, b2);\ -\ - l2 = (vector unsigned short) vec_mergel(b1, b0);\ -\ - ih1 = (vector unsigned char) vec_mergel(h1,h2);\ -\ - il = (vector unsigned char) vec_mergeh(l1,l2);\ -\ - v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\ -\ - il1 = (vector unsigned char) vec_mergel(l1,l2);\ -\ - v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0)); - - -static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc, - const int obmc_stride, - uint8_t * * block, int b_w, - int b_h, int src_x, int src_y, - int src_stride, slice_buffer * sb, - int add, uint8_t * dst8) -{ - int y, x; - DWTELEM * dst; - vector unsigned short h1, h2, l1, l2; - vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; - vector unsigned char b0,b1,b2,b3; - vector unsigned char ob1,ob2,ob3,ob4; - DECLARE_ALIGNED_16(int, vbuf[b_w]); - vector signed int *v = (vector signed int *)vbuf, *d; - - for(y=0; y<b_h; y++){ - //FIXME ugly misuse of obmc_stride - - uint8_t *obmc1= obmc + y*obmc_stride; - uint8_t *obmc2= obmc1+ (obmc_stride>>1); - uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); - uint8_t *obmc4= obmc3+ (obmc_stride>>1); - - dst = slice_buffer_get_line(sb, src_y + y); - d = (vector signed int *)(dst + src_x); - - // load blocks - LOAD_BLOCKS - - // load obmcs - LOAD_OBMCS - - // steps 0 1 2 3 - STEPS_0_1 - - STEPS_2_3 - - FINAL_STEP_SCALAR - - } -} - -#define FINAL_STEP_VEC \ -\ - if(add)\ - {\ - for(x=0; x<b_w/4; x++)\ - {\ - v[x] = vec_add(v[x], d[x]);\ - v[x] = vec_sra(vec_add(v[x],\ - vec_sl( vec_splat_s32(1),\ - vec_splat_u32(7))),\ - vec_splat_u32(8));\ -\ - mask = (vector bool int) vec_sl((vector signed int)\ - vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\ - mask = (vector bool int) vec_and(v[x],vec_nor(mask,mask));\ -\ - mask = (vector bool int)\ - vec_cmpeq((vector signed int)mask,\ - (vector signed int)vec_splat_u32(0));\ -\ - vs = vec_sra(v[x],vec_splat_u32(8));\ - vs = vec_sra(v[x],vec_splat_u32(8));\ - vs = vec_sra(v[x],vec_splat_u32(15));\ -\ - vs = vec_nor(vs,vs);\ -\ - v[x]= vec_sel(v[x],vs,mask);\ - }\ -\ - for(x=0; x<b_w; x++)\ - dst8[x + y*src_stride] = vbuf[x];\ -\ - }\ - else\ - for(x=0; x<b_w/4; x++)\ - d[x] = vec_sub(d[x], v[x]); - -static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc, - const int obmc_stride, - uint8_t * * block, int b_w, - int b_h, int src_x, int src_y, - int src_stride, slice_buffer * sb, - int add, uint8_t * dst8) -{ - int y, x; - DWTELEM * dst; - vector bool int mask; - vector signed int vs; - vector unsigned short h1, h2, l1, l2; - vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; - vector unsigned char b0,b1,b2,b3; - vector unsigned char ob1,ob2,ob3,ob4; - - DECLARE_ALIGNED_16(int, vbuf[16]); - vector signed int *v = (vector signed int *)vbuf, *d; - - for(y=0; y<b_h; y++){ - //FIXME ugly misuse of obmc_stride - - uint8_t *obmc1= obmc + y*obmc_stride; - uint8_t *obmc2= obmc1+ (obmc_stride>>1); - uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); - uint8_t *obmc4= obmc3+ (obmc_stride>>1); - - dst = slice_buffer_get_line(sb, src_y + y); - d = (vector signed int *)(dst + src_x); - -//FIXME i could avoid some loads! 
- - // load blocks - LOAD_BLOCKS - - // load obmcs - LOAD_OBMCS - - // steps 0 1 - STEPS_0_1 - - FINAL_STEP_VEC - - } - -} - -static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc, - const int obmc_stride, - uint8_t * * block, int b_w, - int b_h, int src_x, int src_y, - int src_stride, slice_buffer * sb, - int add, uint8_t * dst8) -{ - int y, x; - DWTELEM * dst; - vector bool int mask; - vector signed int vs; - vector unsigned short h1, h2, l1, l2; - vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align; - vector unsigned char b0,b1,b2,b3; - vector unsigned char ob1,ob2,ob3,ob4; - DECLARE_ALIGNED_16(int, vbuf[b_w]); - vector signed int *v = (vector signed int *)vbuf, *d; - - for(y=0; y<b_h; y++){ - //FIXME ugly misuse of obmc_stride - - uint8_t *obmc1= obmc + y*obmc_stride; - uint8_t *obmc2= obmc1+ (obmc_stride>>1); - uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1); - uint8_t *obmc4= obmc3+ (obmc_stride>>1); - - dst = slice_buffer_get_line(sb, src_y + y); - d = (vector signed int *)(dst + src_x); - - // load blocks - LOAD_BLOCKS - - // load obmcs - LOAD_OBMCS - - // steps 0 1 2 3 - STEPS_0_1 - - STEPS_2_3 - - FINAL_STEP_VEC - - } -} - - -void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride, - uint8_t * * block, int b_w, int b_h, - int src_x, int src_y, int src_stride, - slice_buffer * sb, int add, - uint8_t * dst8) -{ - if (src_x&15) { - if (b_w == 16) - inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block, - b_w, b_h, src_x, src_y, - src_stride, sb, add, dst8); - else if (b_w == 8) - inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block, - b_w, b_h, src_x, src_y, - src_stride, sb, add, dst8); - else - ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, - src_y, src_stride, sb, add, dst8); - } else { - if (b_w == 16) - inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block, - b_w, b_h, src_x, src_y, - src_stride, sb, add, dst8); - else if (b_w == 8) - inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block, - b_w, b_h, src_x, src_y, - src_stride, sb, add, dst8); - else - ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x, - src_y, src_stride, sb, add, dst8); - } -} - - -void snow_init_altivec(DSPContext* c, AVCodecContext *avctx) -{ -#if 0 - c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec; - c->vertical_compose97i = ff_snow_vertical_compose97i_altivec; - c->inner_add_yblock = ff_snow_inner_add_yblock_altivec; -#endif -} diff --git a/contrib/ffmpeg/libavcodec/ppc/types_altivec.h b/contrib/ffmpeg/libavcodec/ppc/types_altivec.h deleted file mode 100644 index 6d41a928b..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/types_altivec.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (c) 2006 Guillaume Poirier <gpoirier@mplayerhq.hu> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#ifndef FFMPEG_TYPES_ALTIVEC_H -#define FFMPEG_TYPES_ALTIVEC_H - -/*********************************************************************** - * Vector types - **********************************************************************/ -#define vec_u8_t vector unsigned char -#define vec_s8_t vector signed char -#define vec_u16_t vector unsigned short -#define vec_s16_t vector signed short -#define vec_u32_t vector unsigned int -#define vec_s32_t vector signed int - -/*********************************************************************** - * Null vector - **********************************************************************/ -#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 ) - -#define zero_u8v (vec_u8_t) zerov -#define zero_s8v (vec_s8_t) zerov -#define zero_u16v (vec_u16_t) zerov -#define zero_s16v (vec_s16_t) zerov -#define zero_u32v (vec_u32_t) zerov -#define zero_s32v (vec_s32_t) zerov - -#endif /* FFMPEG_TYPES_ALTIVEC_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/util_altivec.h b/contrib/ffmpeg/libavcodec/ppc/util_altivec.h deleted file mode 100644 index 6a8afb1b2..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/util_altivec.h +++ /dev/null @@ -1,105 +0,0 @@ -/* - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/** - * @file util_altivec.h - * Contains misc utility macros and inline functions - */ - -#ifndef FFMPEG_UTIL_ALTIVEC_H -#define FFMPEG_UTIL_ALTIVEC_H - -#include <stdint.h> - -#include "config.h" - -#ifdef HAVE_ALTIVEC_H -#include <altivec.h> -#endif - -// used to build registers permutation vectors (vcprm) -// the 's' are for words in the _s_econd vector -#define WORD_0 0x00,0x01,0x02,0x03 -#define WORD_1 0x04,0x05,0x06,0x07 -#define WORD_2 0x08,0x09,0x0a,0x0b -#define WORD_3 0x0c,0x0d,0x0e,0x0f -#define WORD_s0 0x10,0x11,0x12,0x13 -#define WORD_s1 0x14,0x15,0x16,0x17 -#define WORD_s2 0x18,0x19,0x1a,0x1b -#define WORD_s3 0x1c,0x1d,0x1e,0x1f - -#define vcprm(a,b,c,d) (const vector unsigned char)AVV(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d) -#define vcii(a,b,c,d) (const vector float)AVV(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) - -// vcprmle is used to keep the same index as in the SSE version. -// it's the same as vcprm, with the index inversed -// ('le' is Little Endian) -#define vcprmle(a,b,c,d) vcprm(d,c,b,a) - -// used to build inverse/identity vectors (vcii) -// n is _n_egative, p is _p_ositive -#define FLOAT_n -1. -#define FLOAT_p 1. 
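The WORD_* and FLOAT_* tokens above exist only to be pasted into vcprm() and vcii(). A typical use is to reorder the 32-bit words of a vector and flip the sign of selected lanes. The sketch below assumes the AVV initializer macro from gcc_fixes.h is available; the helper name is illustrative and not part of this header:

/* Illustrative only: swap the two word halves of a float vector with a
 * vcprm()-built permute, then negate the last two lanes of the result with
 * a vcii()-built sign vector. */
static inline vector float swap_halves_negate_hi(vector float v)
{
    const vector float zero = (const vector float)vec_splat_u32(0);
    vector float swapped = vec_perm(v, v, vcprm(2, 3, 0, 1));
    return vec_madd(swapped, vcii(p, p, n, n), zero);
}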
- - -// Transpose 8x8 matrix of 16-bit elements (in-place) -#define TRANSPOSE8(a,b,c,d,e,f,g,h) \ -do { \ - vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \ - vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \ - \ - A1 = vec_mergeh (a, e); \ - B1 = vec_mergel (a, e); \ - C1 = vec_mergeh (b, f); \ - D1 = vec_mergel (b, f); \ - E1 = vec_mergeh (c, g); \ - F1 = vec_mergel (c, g); \ - G1 = vec_mergeh (d, h); \ - H1 = vec_mergel (d, h); \ - \ - A2 = vec_mergeh (A1, E1); \ - B2 = vec_mergel (A1, E1); \ - C2 = vec_mergeh (B1, F1); \ - D2 = vec_mergel (B1, F1); \ - E2 = vec_mergeh (C1, G1); \ - F2 = vec_mergel (C1, G1); \ - G2 = vec_mergeh (D1, H1); \ - H2 = vec_mergel (D1, H1); \ - \ - a = vec_mergeh (A2, E2); \ - b = vec_mergel (A2, E2); \ - c = vec_mergeh (B2, F2); \ - d = vec_mergel (B2, F2); \ - e = vec_mergeh (C2, G2); \ - f = vec_mergel (C2, G2); \ - g = vec_mergeh (D2, H2); \ - h = vec_mergel (D2, H2); \ -} while (0) - - -/** \brief loads unaligned vector \a *src with offset \a offset - and returns it */ -static inline vector unsigned char unaligned_load(int offset, uint8_t *src) -{ - register vector unsigned char first = vec_ld(offset, src); - register vector unsigned char second = vec_ld(offset+15, src); - register vector unsigned char mask = vec_lvsl(offset, src); - return vec_perm(first, second, mask); -} - -#endif /* FFMPEG_UTIL_ALTIVEC_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c b/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c deleted file mode 100644 index 87bef808e..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c +++ /dev/null @@ -1,330 +0,0 @@ -/* - * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized - * Copyright (c) 2006 Konstantin Shishkov - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "dsputil.h" - -#include "gcc_fixes.h" - -#include "util_altivec.h" - -// main steps of 8x8 transform -#define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \ -do { \ - t0 = vec_sl(vec_add(s0, s4), vec_2); \ - t0 = vec_add(vec_sl(t0, vec_1), t0); \ - t0 = vec_add(t0, vec_rnd); \ - t1 = vec_sl(vec_sub(s0, s4), vec_2); \ - t1 = vec_add(vec_sl(t1, vec_1), t1); \ - t1 = vec_add(t1, vec_rnd); \ - t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \ - t2 = vec_add(t2, vec_sl(s2, vec_4)); \ - t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \ - t3 = vec_sub(t3, vec_sl(s6, vec_4)); \ - t4 = vec_add(t0, t2); \ - t5 = vec_add(t1, t3); \ - t6 = vec_sub(t1, t3); \ - t7 = vec_sub(t0, t2); \ -\ - t0 = vec_sl(vec_add(s1, s3), vec_4); \ - t0 = vec_add(t0, vec_sl(s5, vec_3)); \ - t0 = vec_add(t0, vec_sl(s7, vec_2)); \ - t0 = vec_add(t0, vec_sub(s5, s3)); \ -\ - t1 = vec_sl(vec_sub(s1, s5), vec_4); \ - t1 = vec_sub(t1, vec_sl(s7, vec_3)); \ - t1 = vec_sub(t1, vec_sl(s3, vec_2)); \ - t1 = vec_sub(t1, vec_add(s1, s7)); \ -\ - t2 = vec_sl(vec_sub(s7, s3), vec_4); \ - t2 = vec_add(t2, vec_sl(s1, vec_3)); \ - t2 = vec_add(t2, vec_sl(s5, vec_2)); \ - t2 = vec_add(t2, vec_sub(s1, s7)); \ -\ - t3 = vec_sl(vec_sub(s5, s7), vec_4); \ - t3 = vec_sub(t3, vec_sl(s3, vec_3)); \ - t3 = vec_add(t3, vec_sl(s1, vec_2)); \ - t3 = vec_sub(t3, vec_add(s3, s5)); \ -\ - s0 = vec_add(t4, t0); \ - s1 = vec_add(t5, t1); \ - s2 = vec_add(t6, t2); \ - s3 = vec_add(t7, t3); \ - s4 = vec_sub(t7, t3); \ - s5 = vec_sub(t6, t2); \ - s6 = vec_sub(t5, t1); \ - s7 = vec_sub(t4, t0); \ -}while(0) - -#define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \ -do { \ - s0 = vec_sra(s0, vec_3); \ - s1 = vec_sra(s1, vec_3); \ - s2 = vec_sra(s2, vec_3); \ - s3 = vec_sra(s3, vec_3); \ - s4 = vec_sra(s4, vec_3); \ - s5 = vec_sra(s5, vec_3); \ - s6 = vec_sra(s6, vec_3); \ - s7 = vec_sra(s7, vec_3); \ -}while(0) - -#define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \ -do { \ - s0 = vec_sra(s0, vec_7); \ - s1 = vec_sra(s1, vec_7); \ - s2 = vec_sra(s2, vec_7); \ - s3 = vec_sra(s3, vec_7); \ - s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \ - s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \ - s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \ - s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \ -}while(0) - -/* main steps of 4x4 transform */ -#define STEP4(s0, s1, s2, s3, vec_rnd) \ -do { \ - t1 = vec_add(vec_sl(s0, vec_4), s0); \ - t1 = vec_add(t1, vec_rnd); \ - t2 = vec_add(vec_sl(s2, vec_4), s2); \ - t0 = vec_add(t1, t2); \ - t1 = vec_sub(t1, t2); \ - t3 = vec_sl(vec_sub(s3, s1), vec_1); \ - t3 = vec_add(t3, vec_sl(t3, vec_2)); \ - t2 = vec_add(t3, vec_sl(s1, vec_5)); \ - t3 = vec_add(t3, vec_sl(s3, vec_3)); \ - t3 = vec_add(t3, vec_sl(s3, vec_2)); \ - s0 = vec_add(t0, t2); \ - s1 = vec_sub(t1, t3); \ - s2 = vec_add(t1, t3); \ - s3 = vec_sub(t0, t2); \ -}while (0) - -#define SHIFT_HOR4(s0, s1, s2, s3) \ - s0 = vec_sra(s0, vec_3); \ - s1 = vec_sra(s1, vec_3); \ - s2 = vec_sra(s2, vec_3); \ - s3 = vec_sra(s3, vec_3); - -#define SHIFT_VERT4(s0, s1, s2, s3) \ - s0 = vec_sra(s0, vec_7); \ - s1 = vec_sra(s1, vec_7); \ - s2 = vec_sra(s2, vec_7); \ - s3 = vec_sra(s3, vec_7); - -/** Do inverse transform on 8x8 block -*/ -static void vc1_inv_trans_8x8_altivec(DCTELEM block[64]) -{ - vector signed short src0, src1, src2, src3, src4, src5, src6, src7; - 
vector signed int s0, s1, s2, s3, s4, s5, s6, s7; - vector signed int s8, s9, sA, sB, sC, sD, sE, sF; - vector signed int t0, t1, t2, t3, t4, t5, t6, t7; - const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4)); - const vector unsigned int vec_7 = vec_splat_u32(7); - const vector unsigned int vec_4 = vec_splat_u32(4); - const vector signed int vec_4s = vec_splat_s32(4); - const vector unsigned int vec_3 = vec_splat_u32(3); - const vector unsigned int vec_2 = vec_splat_u32(2); - const vector signed int vec_1s = vec_splat_s32(1); - const vector unsigned int vec_1 = vec_splat_u32(1); - - - src0 = vec_ld( 0, block); - src1 = vec_ld( 16, block); - src2 = vec_ld( 32, block); - src3 = vec_ld( 48, block); - src4 = vec_ld( 64, block); - src5 = vec_ld( 80, block); - src6 = vec_ld( 96, block); - src7 = vec_ld(112, block); - - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - s0 = vec_unpackl(src0); - s1 = vec_unpackl(src1); - s2 = vec_unpackl(src2); - s3 = vec_unpackl(src3); - s4 = vec_unpackl(src4); - s5 = vec_unpackl(src5); - s6 = vec_unpackl(src6); - s7 = vec_unpackl(src7); - s8 = vec_unpackh(src0); - s9 = vec_unpackh(src1); - sA = vec_unpackh(src2); - sB = vec_unpackh(src3); - sC = vec_unpackh(src4); - sD = vec_unpackh(src5); - sE = vec_unpackh(src6); - sF = vec_unpackh(src7); - STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s); - SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7); - STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s); - SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF); - src0 = vec_pack(s8, s0); - src1 = vec_pack(s9, s1); - src2 = vec_pack(sA, s2); - src3 = vec_pack(sB, s3); - src4 = vec_pack(sC, s4); - src5 = vec_pack(sD, s5); - src6 = vec_pack(sE, s6); - src7 = vec_pack(sF, s7); - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - - s0 = vec_unpackl(src0); - s1 = vec_unpackl(src1); - s2 = vec_unpackl(src2); - s3 = vec_unpackl(src3); - s4 = vec_unpackl(src4); - s5 = vec_unpackl(src5); - s6 = vec_unpackl(src6); - s7 = vec_unpackl(src7); - s8 = vec_unpackh(src0); - s9 = vec_unpackh(src1); - sA = vec_unpackh(src2); - sB = vec_unpackh(src3); - sC = vec_unpackh(src4); - sD = vec_unpackh(src5); - sE = vec_unpackh(src6); - sF = vec_unpackh(src7); - STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64); - SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7); - STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64); - SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF); - src0 = vec_pack(s8, s0); - src1 = vec_pack(s9, s1); - src2 = vec_pack(sA, s2); - src3 = vec_pack(sB, s3); - src4 = vec_pack(sC, s4); - src5 = vec_pack(sD, s5); - src6 = vec_pack(sE, s6); - src7 = vec_pack(sF, s7); - - vec_st(src0, 0, block); - vec_st(src1, 16, block); - vec_st(src2, 32, block); - vec_st(src3, 48, block); - vec_st(src4, 64, block); - vec_st(src5, 80, block); - vec_st(src6, 96, block); - vec_st(src7,112, block); -} - -/** Do inverse transform on 8x4 part of block -*/ -static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block) -{ - vector signed short src0, src1, src2, src3, src4, src5, src6, src7; - vector signed int s0, s1, s2, s3, s4, s5, s6, s7; - vector signed int s8, s9, sA, sB, sC, sD, sE, sF; - vector signed int t0, t1, t2, t3, t4, t5, t6, t7; - const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4)); - const vector unsigned int vec_7 = vec_splat_u32(7); - const vector unsigned int vec_5 = vec_splat_u32(5); - const vector unsigned int vec_4 = vec_splat_u32(4); - const vector signed int vec_4s = vec_splat_s32(4); - const vector unsigned int vec_3 = vec_splat_u32(3); - 
const vector unsigned int vec_2 = vec_splat_u32(2); - const vector unsigned int vec_1 = vec_splat_u32(1); - vector unsigned char tmp; - vector signed short tmp2, tmp3; - vector unsigned char perm0, perm1, p0, p1, p; - - src0 = vec_ld( 0, block); - src1 = vec_ld( 16, block); - src2 = vec_ld( 32, block); - src3 = vec_ld( 48, block); - src4 = vec_ld( 64, block); - src5 = vec_ld( 80, block); - src6 = vec_ld( 96, block); - src7 = vec_ld(112, block); - - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - s0 = vec_unpackl(src0); - s1 = vec_unpackl(src1); - s2 = vec_unpackl(src2); - s3 = vec_unpackl(src3); - s4 = vec_unpackl(src4); - s5 = vec_unpackl(src5); - s6 = vec_unpackl(src6); - s7 = vec_unpackl(src7); - s8 = vec_unpackh(src0); - s9 = vec_unpackh(src1); - sA = vec_unpackh(src2); - sB = vec_unpackh(src3); - sC = vec_unpackh(src4); - sD = vec_unpackh(src5); - sE = vec_unpackh(src6); - sF = vec_unpackh(src7); - STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s); - SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7); - STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s); - SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF); - src0 = vec_pack(s8, s0); - src1 = vec_pack(s9, s1); - src2 = vec_pack(sA, s2); - src3 = vec_pack(sB, s3); - src4 = vec_pack(sC, s4); - src5 = vec_pack(sD, s5); - src6 = vec_pack(sE, s6); - src7 = vec_pack(sF, s7); - TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); - - s0 = vec_unpackh(src0); - s1 = vec_unpackh(src1); - s2 = vec_unpackh(src2); - s3 = vec_unpackh(src3); - s8 = vec_unpackl(src0); - s9 = vec_unpackl(src1); - sA = vec_unpackl(src2); - sB = vec_unpackl(src3); - STEP4(s0, s1, s2, s3, vec_64); - SHIFT_VERT4(s0, s1, s2, s3); - STEP4(s8, s9, sA, sB, vec_64); - SHIFT_VERT4(s8, s9, sA, sB); - src0 = vec_pack(s0, s8); - src1 = vec_pack(s1, s9); - src2 = vec_pack(s2, sA); - src3 = vec_pack(s3, sB); - - p0 = vec_lvsl (0, dest); - p1 = vec_lvsl (stride, dest); - p = vec_splat_u8 (-1); - perm0 = vec_mergeh (p, p0); - perm1 = vec_mergeh (p, p1); - -#define ADD(dest,src,perm) \ - /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ - tmp = vec_ld (0, dest); \ - tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm); \ - tmp3 = vec_adds (tmp2, src); \ - tmp = vec_packsu (tmp3, tmp3); \ - vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest); \ - vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest); - - ADD (dest, src0, perm0) dest += stride; - ADD (dest, src1, perm1) dest += stride; - ADD (dest, src2, perm0) dest += stride; - ADD (dest, src3, perm1) -} - - -void vc1dsp_init_altivec(DSPContext* dsp, AVCodecContext *avctx) { - dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec; - dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec; -} |
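Note that STEP8 and STEP4 above contain no real multiplications: every VC-1 transform constant is decomposed into vec_sl shifts and vec_add additions. For example, the factor 12 applied to (s0 + s4) at the top of STEP8, and the 6*s6 + 16*s2 term that follows it, reduce to the scalar sketch below (helper names are illustrative only):

/* 12*x exactly as STEP8 builds it: t = x<<2 (vec_sl by vec_2), then
 * t = (t<<1) + t (vec_sl by vec_1 plus vec_add), i.e. 4*x * 3 = 12*x. */
static inline int mul12(int x)
{
    int t = x << 2;
    return (t << 1) + t;
}

/* 6*a + 16*b as in "t2 = (s6<<2) + (s6<<1); t2 += s2<<4;" */
static inline int mul6_plus_mul16(int a, int b)
{
    return (a << 2) + (a << 1) + (b << 4);
}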
