| field | value | date |
|---|---|---|
| author | Diego 'Flameeyes' Pettenò <flameeyes@gmail.com> | 2008-03-01 03:05:13 +0100 |
| committer | Diego 'Flameeyes' Pettenò <flameeyes@gmail.com> | 2008-03-01 03:05:13 +0100 |
| commit | 1d0b3b20c34517b9d1ddf3ea347776304b0c4b44 (patch) | |
| tree | 89f4fc640c2becc6f00ae08996754952ecf149c1 /contrib/ffmpeg/libavcodec/ppc | |
| parent | 09496ad3469a0ade8dbd9a351e639b78f20b7942 (diff) | |
| download | xine-lib-1d0b3b20c34517b9d1ddf3ea347776304b0c4b44.tar.gz, xine-lib-1d0b3b20c34517b9d1ddf3ea347776304b0c4b44.tar.bz2 | |
Update internal FFmpeg copy.
Diffstat (limited to 'contrib/ffmpeg/libavcodec/ppc')
23 files changed, 1450 insertions, 901 deletions
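The most visible change in this import is the new `check_altivec.c`, which moves runtime AltiVec detection out of `dsputil_altivec.c` and drops the old `signal()`/`sigsetjmp` probe in favour of `sysctl()` on Darwin, `GetCPUInfoTags()` on AmigaOS 4, and a PVR read via `mfspr` when `RUNTIME_CPUDETECT` is set. The sketch below only illustrates how such a check is consumed by init code like `dsputil_h264_init_ppc()` at the end of `h264_altivec.c`, which gates its function-pointer assignments on `has_altivec()`; the `MyDSPContext` type and the `*_stub` functions are invented for the example and are not part of the patch.

```c
/* Minimal sketch (not from the patch): dispatching on a runtime AltiVec
 * check, in the style of the dsputil init functions in this diff.
 * The context struct and both stub implementations are hypothetical. */
#include <stdio.h>

typedef struct {
    int (*sad16)(const unsigned char *a, const unsigned char *b,
                 int line_size, int h);
} MyDSPContext;

/* Stand-ins for the real C and AltiVec implementations. */
static int sad16_c_stub(const unsigned char *a, const unsigned char *b,
                        int line_size, int h)
{
    (void)a; (void)b; (void)line_size; (void)h;
    return 0;
}

static int sad16_altivec_stub(const unsigned char *a, const unsigned char *b,
                              int line_size, int h)
{
    (void)a; (void)b; (void)line_size; (void)h;
    return 0;
}

static int has_altivec_stub(void)
{
    /* In the real tree this would be has_altivec() from check_altivec.c;
     * hard-wired here so the sketch builds on any host. */
    return 0;
}

static void my_dsputil_init(MyDSPContext *c)
{
    c->sad16 = sad16_c_stub;            /* portable default */
    if (has_altivec_stub())
        c->sad16 = sad16_altivec_stub;  /* override when a vector unit is present */
}

int main(void)
{
    MyDSPContext c;
    my_dsputil_init(&c);
    printf("AltiVec path selected: %s\n",
           c.sad16 == sad16_altivec_stub ? "yes" : "no");
    return 0;
}
```

The PVR-based path in the new file only works because, as its own comment notes, Linux 2.6.17 and later emulate the `mfspr` read of the PVR from user space; that is why it is guarded by `RUNTIME_CPUDETECT` and the fallback simply assumes AltiVec when the code was compiled for it.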
diff --git a/contrib/ffmpeg/libavcodec/ppc/check_altivec.c b/contrib/ffmpeg/libavcodec/ppc/check_altivec.c
new file mode 100644
index 000000000..cf55b9a1d
--- /dev/null
+++ b/contrib/ffmpeg/libavcodec/ppc/check_altivec.c
@@ -0,0 +1,75 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+/**
+ * @file check_altivec.c
+ * Checks for AltiVec presence.
+ */
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#elif __AMIGAOS4__
+#include <exec/exec.h>
+#include <interfaces/exec.h>
+#include <proto/exec.h>
+#endif /* __APPLE__ */
+
+/**
+ * This function MAY rely on signal() or fork() in order to make sure altivec
+ * is present
+ */
+
+int has_altivec(void)
+{
+#ifdef __AMIGAOS4__
+    ULONG result = 0;
+    extern struct ExecIFace *IExec;
+
+    IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
+    if (result == VECTORTYPE_ALTIVEC) return 1;
+    return 0;
+#elif __APPLE__
+    int sels[2] = {CTL_HW, HW_VECTORUNIT};
+    int has_vu = 0;
+    size_t len = sizeof(has_vu);
+    int err;
+
+    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
+
+    if (err == 0) return (has_vu != 0);
+    return 0;
+#elif defined(RUNTIME_CPUDETECT)
+    int proc_ver;
+    // support of mfspr PVR emulation added in Linux 2.6.17
+    asm volatile("mfspr %0, 287" : "=r" (proc_ver));
+    proc_ver >>= 16;
+    if (proc_ver & 0x8000 ||
+        proc_ver == 0x000c ||
+        proc_ver == 0x0039 || proc_ver == 0x003c ||
+        proc_ver == 0x0044 || proc_ver == 0x0045 ||
+        proc_ver == 0x0070)
+        return 1;
+    return 0;
+#else
+    // since we were compiled for altivec, just assume we have it
+    // until someone comes up with a proper way (not involving signal hacks).
+ return 1; +#endif /* __AMIGAOS4__ */ +} + diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c index bbc53d761..3d79c3ab5 100644 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.c @@ -20,44 +20,18 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "../dsputil.h" +#include "dsputil.h" #include "gcc_fixes.h" -#include "dsputil_altivec.h" - -#ifdef CONFIG_DARWIN -#include <sys/sysctl.h> -#else /* CONFIG_DARWIN */ -#ifdef __AMIGAOS4__ -#include <exec/exec.h> -#include <interfaces/exec.h> -#include <proto/exec.h> -#else /* __AMIGAOS4__ */ -#include <signal.h> -#include <setjmp.h> - -static sigjmp_buf jmpbuf; -static volatile sig_atomic_t canjump = 0; - -static void sigill_handler (int sig) -{ - if (!canjump) { - signal (sig, SIG_DFL); - raise (sig); - } - - canjump = 0; - siglongjmp (jmpbuf, 1); -} -#endif /* CONFIG_DARWIN */ -#endif /* __AMIGAOS4__ */ +#include "dsputil_ppc.h" +#include "util_altivec.h" int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; - int s __attribute__((aligned(16))); - const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); + DECLARE_ALIGNED_16(int, s); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned int sad; @@ -103,8 +77,8 @@ int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; - int s __attribute__((aligned(16))); - const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); + DECLARE_ALIGNED_16(int, s); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix3v, avgv, t5; vector unsigned int sad; @@ -163,10 +137,10 @@ int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; - int s __attribute__((aligned(16))); + DECLARE_ALIGNED_16(int, s); uint8_t *pix3 = pix2 + line_size; - const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); - const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); + const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); vector unsigned char *tv, avgv, t5; vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; @@ -218,7 +192,7 @@ int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1])); /* - Note that Altivec does have vec_avg, but this works on vector pairs + Note that AltiVec does have vec_avg, but this works on vector pairs and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding would mean that, for example, avg(3,0,0,1) = 2, when it should be 1. 
Instead, we have to split the pixel vectors into vectors of shorts, @@ -264,8 +238,8 @@ int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; - int s __attribute__((aligned(16))); - const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); + DECLARE_ALIGNED_16(int, s); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sad; @@ -306,8 +280,8 @@ int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; - int s __attribute__((aligned(16))); - const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); + DECLARE_ALIGNED_16(int, s); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sad; @@ -351,8 +325,8 @@ int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) int pix_norm1_altivec(uint8_t *pix, int line_size) { int i; - int s __attribute__((aligned(16))); - const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); + DECLARE_ALIGNED_16(int, s); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char *tv; vector unsigned char pixv; vector unsigned int sv; @@ -387,8 +361,8 @@ int pix_norm1_altivec(uint8_t *pix, int line_size) int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; - int s __attribute__((aligned(16))); - const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); + DECLARE_ALIGNED_16(int, s); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sum; @@ -443,8 +417,8 @@ int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) { int i; - int s __attribute__((aligned(16))); - const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); + DECLARE_ALIGNED_16(int, s); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; vector unsigned int sum; @@ -488,14 +462,14 @@ int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h) int pix_sum_altivec(uint8_t * pix, int line_size) { - const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm, *pixv; vector unsigned char t1; vector unsigned int sad; vector signed int sumdiffs; int i; - int s __attribute__((aligned(16))); + DECLARE_ALIGNED_16(int, s); sad = (vector unsigned int)vec_splat_u32(0); @@ -523,7 +497,7 @@ void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line { int i; vector unsigned char perm, bytes, *pixv; - const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector signed short shorts; for(i=0;i<8;i++) @@ 
-550,7 +524,7 @@ void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1, { int i; vector unsigned char perm, bytes, *pixv; - const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector signed short shorts1, shorts2; for(i=0;i<4;i++) @@ -769,8 +743,8 @@ POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1); blockv, temp1, temp2; register vector unsigned short pixelssum1, pixelssum2, temp3; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); temp1 = vec_ld(0, pixels); temp2 = vec_ld(16, pixels); @@ -845,9 +819,9 @@ POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); blockv, temp1, temp2; register vector unsigned short pixelssum1, pixelssum2, temp3; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1); - register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); temp1 = vec_ld(0, pixels); temp2 = vec_ld(16, pixels); @@ -922,8 +896,8 @@ POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1); register vector unsigned short pixelssum1, pixelssum2, temp3, pixelssum3, pixelssum4, temp4; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1); @@ -1004,9 +978,9 @@ POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); register vector unsigned short pixelssum1, pixelssum2, temp3, pixelssum3, pixelssum4, temp4; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1); - register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2); + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); @@ -1078,25 +1052,25 @@ POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){ POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1); int sum; - register const_vector unsigned char vzero = - (const_vector unsigned char)vec_splat_u8(0); + register const vector unsigned char vzero = + (const vector unsigned 
char)vec_splat_u8(0); register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); { - register const_vector signed short vprod1 =(const_vector signed short) + register const vector signed short vprod1 =(const vector signed short) AVV( 1,-1, 1,-1, 1,-1, 1,-1); - register const_vector signed short vprod2 =(const_vector signed short) + register const vector signed short vprod2 =(const vector signed short) AVV( 1, 1,-1,-1, 1, 1,-1,-1); - register const_vector signed short vprod3 =(const_vector signed short) + register const vector signed short vprod3 =(const vector signed short) AVV( 1, 1, 1, 1,-1,-1,-1,-1); - register const_vector unsigned char perm1 = (const_vector unsigned char) + register const vector unsigned char perm1 = (const vector unsigned char) AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D); - register const_vector unsigned char perm2 = (const_vector unsigned char) + register const vector unsigned char perm2 = (const vector unsigned char) AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B); - register const_vector unsigned char perm3 = (const_vector unsigned char) + register const vector unsigned char perm3 = (const vector unsigned char) AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); @@ -1120,7 +1094,7 @@ POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1); dstV = \ (vector signed short)vec_mergeh((vector signed char)vzero, \ (vector signed char)dstO); \ - /* substractions inside the first butterfly */ \ + /* subtractions inside the first butterfly */ \ but0 = vec_sub(srcV, dstV); \ op1 = vec_perm(but0, but0, perm1); \ but1 = vec_mladd(but0, vprod1, op1); \ @@ -1201,7 +1175,7 @@ POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1); schedule for the 7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses 25% less instructions...) - On the 970, the hand-made RA is still a win (arount 690 + On the 970, the hand-made RA is still a win (around 690 vs. around 780), but xlc goes to around 660 on the regular C code... 
*/ @@ -1226,25 +1200,25 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, temp5S REG_v(v13), temp6S REG_v(v14), temp7S REG_v(v15); - register const_vector unsigned char vzero REG_v(v31)= - (const_vector unsigned char)vec_splat_u8(0); + register const vector unsigned char vzero REG_v(v31)= + (const vector unsigned char)vec_splat_u8(0); { - register const_vector signed short vprod1 REG_v(v16)= - (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1); - register const_vector signed short vprod2 REG_v(v17)= - (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1); - register const_vector signed short vprod3 REG_v(v18)= - (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1); - register const_vector unsigned char perm1 REG_v(v19)= - (const_vector unsigned char) + register const vector signed short vprod1 REG_v(v16)= + (const vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1); + register const vector signed short vprod2 REG_v(v17)= + (const vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1); + register const vector signed short vprod3 REG_v(v18)= + (const vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1); + register const vector unsigned char perm1 REG_v(v19)= + (const vector unsigned char) AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05, 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D); - register const_vector unsigned char perm2 REG_v(v20)= - (const_vector unsigned char) + register const vector unsigned char perm2 REG_v(v20)= + (const vector unsigned char) AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B); - register const_vector unsigned char perm3 REG_v(v21)= - (const_vector unsigned char) + register const vector unsigned char perm3 REG_v(v21)= + (const vector unsigned char) AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07); @@ -1293,7 +1267,7 @@ static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, dstW = \ (vector signed short)vec_mergel((vector signed char)vzero, \ (vector signed char)dstO); \ - /* substractions inside the first butterfly */ \ + /* subtractions inside the first butterfly */ \ but0 = vec_sub(srcV, dstV); \ but0S = vec_sub(srcW, dstW); \ op1 = vec_perm(but0, but0, perm1); \ @@ -1419,50 +1393,6 @@ POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1); return score; } -int has_altivec(void) -{ -#ifdef __AMIGAOS4__ - ULONG result = 0; - extern struct ExecIFace *IExec; - - IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE); - if (result == VECTORTYPE_ALTIVEC) return 1; - return 0; -#else /* __AMIGAOS4__ */ - -#ifdef CONFIG_DARWIN - int sels[2] = {CTL_HW, HW_VECTORUNIT}; - int has_vu = 0; - size_t len = sizeof(has_vu); - int err; - - err = sysctl(sels, 2, &has_vu, &len, NULL, 0); - - if (err == 0) return (has_vu != 0); -#else /* CONFIG_DARWIN */ -/* no Darwin, do it the brute-force way */ -/* this is borrowed from the libmpeg2 library */ - { - signal (SIGILL, sigill_handler); - if (sigsetjmp (jmpbuf, 1)) { - signal (SIGILL, SIG_DFL); - } else { - canjump = 1; - - asm volatile ("mtspr 256, %0\n\t" - "vand %%v0, %%v0, %%v0" - : - : "r" (-1)); - - signal (SIGILL, SIG_DFL); - return 1; - } - } -#endif /* CONFIG_DARWIN */ - return 0; -#endif /* __AMIGAOS4__ */ -} - static void vorbis_inverse_coupling_altivec(float *mag, float *ang, int blocksize) { @@ -1495,9 +1425,9 @@ POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1); register vector unsigned char blockv, temp1, temp2, blocktemp; register vector unsigned short 
pixelssum1, pixelssum2, temp3; - register const_vector unsigned char vczero = (const_vector unsigned char) + register const vector unsigned char vczero = (const vector unsigned char) vec_splat_u8(0); - register const_vector unsigned short vctwo = (const_vector unsigned short) + register const vector unsigned short vctwo = (const vector unsigned short) vec_splat_u16(2); temp1 = vec_ld(0, pixels); @@ -1583,7 +1513,6 @@ void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx) c->hadamard8_diff[0] = hadamard8_diff16_altivec; c->hadamard8_diff[1] = hadamard8_diff8x8_altivec; -#ifdef CONFIG_VORBIS_DECODER - c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec; -#endif + if (ENABLE_VORBIS_DECODER) + c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec; } diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h index 560d778bb..43bd5abab 100644 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h +++ b/contrib/ffmpeg/libavcodec/ppc/dsputil_altivec.h @@ -20,12 +20,10 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef _DSPUTIL_ALTIVEC_ -#define _DSPUTIL_ALTIVEC_ +#ifndef FFMPEG_DSPUTIL_ALTIVEC_H +#define FFMPEG_DSPUTIL_ALTIVEC_H -#include "dsputil_ppc.h" - -#ifdef HAVE_ALTIVEC +#include <stdint.h> extern int has_altivec(void); @@ -33,74 +31,4 @@ void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h); -// used to build registers permutation vectors (vcprm) -// the 's' are for words in the _s_econd vector -#define WORD_0 0x00,0x01,0x02,0x03 -#define WORD_1 0x04,0x05,0x06,0x07 -#define WORD_2 0x08,0x09,0x0a,0x0b -#define WORD_3 0x0c,0x0d,0x0e,0x0f -#define WORD_s0 0x10,0x11,0x12,0x13 -#define WORD_s1 0x14,0x15,0x16,0x17 -#define WORD_s2 0x18,0x19,0x1a,0x1b -#define WORD_s3 0x1c,0x1d,0x1e,0x1f - -#ifdef CONFIG_DARWIN -#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d) -#else -#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d} -#endif - -// vcprmle is used to keep the same index as in the SSE version. -// it's the same as vcprm, with the index inversed -// ('le' is Little Endian) -#define vcprmle(a,b,c,d) vcprm(d,c,b,a) - -// used to build inverse/identity vectors (vcii) -// n is _n_egative, p is _p_ositive -#define FLOAT_n -1. -#define FLOAT_p 1. 
- - -#ifdef CONFIG_DARWIN -#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) -#else -#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} -#endif - -// Transpose 8x8 matrix of 16-bit elements (in-place) -#define TRANSPOSE8(a,b,c,d,e,f,g,h) \ -do { \ - vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \ - vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \ - \ - A1 = vec_mergeh (a, e); \ - B1 = vec_mergel (a, e); \ - C1 = vec_mergeh (b, f); \ - D1 = vec_mergel (b, f); \ - E1 = vec_mergeh (c, g); \ - F1 = vec_mergel (c, g); \ - G1 = vec_mergeh (d, h); \ - H1 = vec_mergel (d, h); \ - \ - A2 = vec_mergeh (A1, E1); \ - B2 = vec_mergel (A1, E1); \ - C2 = vec_mergeh (B1, F1); \ - D2 = vec_mergel (B1, F1); \ - E2 = vec_mergeh (C1, G1); \ - F2 = vec_mergel (C1, G1); \ - G2 = vec_mergeh (D1, H1); \ - H2 = vec_mergel (D1, H1); \ - \ - a = vec_mergeh (A2, E2); \ - b = vec_mergel (A2, E2); \ - c = vec_mergeh (B2, F2); \ - d = vec_mergel (B2, F2); \ - e = vec_mergeh (C2, G2); \ - f = vec_mergel (C2, G2); \ - g = vec_mergeh (D2, H2); \ - h = vec_mergel (D2, H2); \ -} while (0) - -#endif /* HAVE_ALTIVEC */ - -#endif /* _DSPUTIL_ALTIVEC_ */ +#endif /* FFMPEG_DSPUTIL_ALTIVEC_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c index 117a7adf1..13dea06a1 100644 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c +++ b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.c @@ -20,7 +20,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "../dsputil.h" +#include "dsputil.h" #include "dsputil_ppc.h" @@ -39,6 +39,7 @@ void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx); void vc1dsp_init_altivec(DSPContext* c, AVCodecContext *avctx); void snow_init_altivec(DSPContext* c, AVCodecContext *avctx); void float_init_altivec(DSPContext* c, AVCodecContext *avctx); +void int_init_altivec(DSPContext* c, AVCodecContext *avctx); #endif @@ -154,11 +155,7 @@ POWERPC_PERF_START_COUNT(powerpc_clear_blocks_dcbz32, 1); i += 16; } for ( ; i < sizeof(DCTELEM)*6*64-31 ; i += 32) { -#ifndef __MWERKS__ asm volatile("dcbz %0,%1" : : "b" (blocks), "r" (i) : "memory"); -#else - __dcbz( blocks, i ); -#endif } if (misal) { ((unsigned long*)blocks)[188] = 0L; @@ -213,7 +210,7 @@ void clear_blocks_dcbz128_ppc(DCTELEM *blocks) knows about dcbzl ... 
*/ long check_dcbzl_effect(void) { - register char *fakedata = (char*)av_malloc(1024); + register char *fakedata = av_malloc(1024); register char *fakedata_middle; register long zero = 0; register long i = 0; @@ -260,7 +257,7 @@ static void prefetch_ppc(void *mem, int stride, int h) void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) { - // Common optimizations whether Altivec is available or not + // Common optimizations whether AltiVec is available or not c->prefetch = prefetch_ppc; switch (check_dcbzl_effect()) { case 32: @@ -284,6 +281,7 @@ void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx) if(ENABLE_VC1_DECODER || ENABLE_WMV3_DECODER) vc1dsp_init_altivec(c, avctx); float_init_altivec(c, avctx); + int_init_altivec(c, avctx); c->gmc1 = gmc1_altivec; #ifdef CONFIG_ENCODERS diff --git a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h index 5b25732b2..d8f6b27f9 100644 --- a/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h +++ b/contrib/ffmpeg/libavcodec/ppc/dsputil_ppc.h @@ -18,14 +18,14 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef _DSPUTIL_PPC_ -#define _DSPUTIL_PPC_ +#ifndef FFMPEG_DSPUTIL_PPC_H +#define FFMPEG_DSPUTIL_PPC_H #ifdef CONFIG_POWERPC_PERF void powerpc_display_perf_report(void); /* the 604* have 2, the G3* have 4, the G4s have 6, and the G5 are completely different (they MUST use - POWERPC_MODE_64BITS, and let's hope all future 64 bis PPC + HAVE_PPC64, and let's hope all future 64 bis PPC will use the same PMCs... */ #define POWERPC_NUM_PMC_ENABLED 6 /* if you add to the enum below, also add to the perfname array @@ -68,7 +68,7 @@ enum powerpc_data_index { }; extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][powerpc_data_total]; -#ifndef POWERPC_MODE_64BITS +#ifndef HAVE_PPC64 #define POWERP_PMC_DATATYPE unsigned long #define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 937" : "=r" (a)) #define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 938" : "=r" (a)) @@ -86,7 +86,7 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][ #define POWERPC_GET_PMC5(a) do {} while (0) #define POWERPC_GET_PMC6(a) do {} while (0) #endif -#else /* POWERPC_MODE_64BITS */ +#else /* HAVE_PPC64 */ #define POWERP_PMC_DATATYPE unsigned long long #define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 771" : "=r" (a)) #define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 772" : "=r" (a)) @@ -104,7 +104,7 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][ #define POWERPC_GET_PMC5(a) do {} while (0) #define POWERPC_GET_PMC6(a) do {} while (0) #endif -#endif /* POWERPC_MODE_64BITS */ +#endif /* HAVE_PPC64 */ #define POWERPC_PERF_DECLARE(a, cond) \ POWERP_PMC_DATATYPE \ pmc_start[POWERPC_NUM_PMC_ENABLED], \ @@ -152,4 +152,4 @@ extern unsigned long long perfdata[POWERPC_NUM_PMC_ENABLED][powerpc_perf_total][ #define POWERPC_PERF_STOP_COUNT(a, cond) do {} while (0) #endif /* CONFIG_POWERPC_PERF */ -#endif /* _DSPUTIL_PPC_ */ +#endif /* FFMPEG_DSPUTIL_PPC_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c b/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c index 2418c32bb..6b9a35ba8 100644 --- a/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/fdct_altivec.c @@ -21,8 +21,8 @@ #include "common.h" -#include "../dsputil.h" -#include "dsputil_altivec.h" +#include "dsputil.h" +#include "dsputil_ppc.h" #include "gcc_fixes.h" diff --git a/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c 
b/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c index 384a774ff..e0b77807f 100644 --- a/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/fft_altivec.c @@ -20,12 +20,12 @@ * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "../dsputil.h" +#include "dsputil.h" #include "gcc_fixes.h" -#include "dsputil_altivec.h" - +#include "dsputil_ppc.h" +#include "util_altivec.h" /* those three macros are from libavcodec/fft.c and are required for the reference C code diff --git a/contrib/ffmpeg/libavcodec/ppc/float_altivec.c b/contrib/ffmpeg/libavcodec/ppc/float_altivec.c index 22c2de61a..750e6d7f9 100644 --- a/contrib/ffmpeg/libavcodec/ppc/float_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/float_altivec.c @@ -18,7 +18,7 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "../dsputil.h" +#include "dsputil.h" #include "gcc_fixes.h" diff --git a/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h b/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h index 5a4a55188..b8a908a61 100644 --- a/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h +++ b/contrib/ffmpeg/libavcodec/ppc/gcc_fixes.h @@ -20,31 +20,22 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef _GCC_FIXES_ -#define _GCC_FIXES_ +#ifndef FFMPEG_GCC_FIXES_H +#define FFMPEG_GCC_FIXES_H + +#include "config.h" #ifdef HAVE_ALTIVEC_H #include <altivec.h> #endif -#ifdef CONFIG_DARWIN -# ifndef __MWERKS__ -# define AVV(x...) (x) -# else -# define AVV -# endif -#define REG_v(a) asm ( #a ) -#else - -#define AVV(x...) {x} - #if (__GNUC__ < 4) # define REG_v(a) #else # define REG_v(a) asm ( #a ) #endif -#if (__GNUC__ * 100 + __GNUC_MINOR__ < 303) +#if (__GNUC__ == 3 && __GNUC_MINOR__ < 3) /* This code was provided to me by Bartosch Pixa * as a separate header file (broken_mergel.h). @@ -106,14 +97,6 @@ __ch (__bin_args_eq (vector unsigned int, (a1), vector unsigned int, (a2)), \ ((vector unsigned int) ff_vmrglw ((vector signed int) (a1), (vector signed int) (a2))), \ __altivec_link_error_invalid_argument ()))))))) -#endif - -#endif /* CONFIG_DARWIN */ - -#ifndef __MWERKS__ -#define const_vector const vector -#else -#define const_vector vector -#endif +#endif /* (__GNUC__ == 3 && __GNUC_MINOR__ < 3) */ -#endif /* _GCC_FIXES_ */ +#endif /* FFMPEG_GCC_FIXES_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c b/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c index 42c936bb3..8151410d4 100644 --- a/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/gmc_altivec.c @@ -20,24 +20,25 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "../dsputil.h" +#include "dsputil.h" #include "gcc_fixes.h" -#include "dsputil_altivec.h" +#include "dsputil_ppc.h" +#include "util_altivec.h" /* altivec-enhanced gmc1. ATM this code assume stride is a multiple of 8, - to preserve proper dst alignement. + to preserve proper dst alignment. 
*/ #define GMC1_PERF_COND (h==8) void gmc1_altivec(uint8_t *dst /* align 8 */, uint8_t *src /* align1 */, int stride, int h, int x16, int y16, int rounder) { POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); - const unsigned short __attribute__ ((aligned(16))) rounder_a[8] = + const DECLARE_ALIGNED_16(unsigned short, rounder_a[8]) = {rounder, rounder, rounder, rounder, rounder, rounder, rounder, rounder}; - const unsigned short __attribute__ ((aligned(16))) ABCD[8] = + const DECLARE_ALIGNED_16(unsigned short, ABCD[8]) = { (16-x16)*(16-y16), /* A */ ( x16)*(16-y16), /* B */ @@ -45,8 +46,8 @@ POWERPC_PERF_DECLARE(altivec_gmc1_num, GMC1_PERF_COND); ( x16)*( y16), /* D */ 0, 0, 0, 0 /* padding */ }; - register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0); - register const_vector unsigned short vcsr8 = (const_vector unsigned short)vec_splat_u16(8); + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vcsr8 = (const vector unsigned short)vec_splat_u16(8); register vector unsigned char dstv, dstv2, src_0, src_1, srcvA, srcvB, srcvC, srcvD; register vector unsigned short Av, Bv, Cv, Dv, rounderV, tempA, tempB, tempC, tempD; int i; diff --git a/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c b/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c index bac620e82..c716b1e33 100644 --- a/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/h264_altivec.c @@ -18,11 +18,13 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#include "../dsputil.h" +#include "dsputil.h" #include "gcc_fixes.h" +#include "dsputil_ppc.h" #include "dsputil_altivec.h" +#include "util_altivec.h" #include "types_altivec.h" #define PUT_OP_U8_ALTIVEC(d, s, dst) d = s @@ -180,130 +182,124 @@ static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint /* this code assume that stride % 16 == 0 */ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { - signed int ABCD[4] __attribute__((aligned(16))) = + DECLARE_ALIGNED_16(signed int, ABCD[4]) = {((8 - x) * (8 - y)), - ((x) * (8 - y)), - ((8 - x) * (y)), - ((x) * (y))}; + ((x) * (8 - y)), + ((8 - x) * (y)), + ((x) * (y))}; register int i; - vector unsigned char fperm; - const vector signed int vABCD = vec_ld(0, ABCD); - const vector signed short vA = vec_splat((vector signed short)vABCD, 1); - const vector signed short vB = vec_splat((vector signed short)vABCD, 3); - const vector signed short vC = vec_splat((vector signed short)vABCD, 5); - const vector signed short vD = vec_splat((vector signed short)vABCD, 7); - const vector signed int vzero = vec_splat_s32(0); - const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); - const vector unsigned short v6us = vec_splat_u16(6); - register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; + vec_u8_t fperm; + const vec_s32_t vABCD = vec_ld(0, ABCD); + const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); + const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); + const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); + const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); + LOAD_ZERO; + const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); + const vec_u16_t v6us = vec_splat_u16(6); + register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 
1 : 0; - vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; - vector unsigned char vsrc0uc, vsrc1uc; - vector signed short vsrc0ssH, vsrc1ssH; - vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; - vector signed short vsrc2ssH, vsrc3ssH, psum; - vector unsigned char vdst, ppsum, fsum; + vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; + vec_u8_t vsrc0uc, vsrc1uc; + vec_s16_t vsrc0ssH, vsrc1ssH; + vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; + vec_s16_t vsrc2ssH, vsrc3ssH, psum; + vec_u8_t vdst, ppsum, fsum; if (((unsigned long)dst) % 16 == 0) { - fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, - 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F); + fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, + 0x08, 0x09, 0x0A, 0x0B, + 0x0C, 0x0D, 0x0E, 0x0F); } else { - fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, - 0x18, 0x19, 0x1A, 0x1B, - 0x1C, 0x1D, 0x1E, 0x1F); + fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, + 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F); } vsrcAuc = vec_ld(0, src); if (loadSecond) - vsrcBuc = vec_ld(16, src); + vsrcBuc = vec_ld(16, src); vsrcperm0 = vec_lvsl(0, src); vsrcperm1 = vec_lvsl(1, src); vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); if (reallyBadAlign) - vsrc1uc = vsrcBuc; + vsrc1uc = vsrcBuc; else - vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); + vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); - vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc0uc); - vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc1uc); + vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc); + vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc); if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { + for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); + vsrcCuc = vec_ld(stride + 0, src); - vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); + vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); + vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc2uc); - vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc3uc); + vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc); + vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc); - psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); - psum = vec_mladd(vB, vsrc1ssH, psum); - psum = vec_mladd(vC, vsrc2ssH, psum); - psum = vec_mladd(vD, vsrc3ssH, psum); - psum = vec_add(v28ss, psum); - psum = vec_sra(psum, v6us); + psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); + psum = vec_mladd(vB, vsrc1ssH, psum); + psum = vec_mladd(vC, vsrc2ssH, psum); + psum = vec_mladd(vD, vsrc3ssH, psum); + psum = vec_add(v28ss, psum); + psum = vec_sra(psum, v6us); - vdst = vec_ld(0, dst); - ppsum = (vector unsigned char)vec_packsu(psum, psum); - fsum = vec_perm(vdst, ppsum, fperm); + vdst = vec_ld(0, dst); + ppsum = (vec_u8_t)vec_packsu(psum, psum); + fsum = vec_perm(vdst, ppsum, fperm); - vec_st(fsum, 0, dst); + vec_st(fsum, 0, dst); - vsrc0ssH = vsrc2ssH; - vsrc1ssH = vsrc3ssH; + vsrc0ssH = vsrc2ssH; + vsrc1ssH = vsrc3ssH; - dst += stride; - src += stride; - } + dst += stride; + src += stride; + } } else { - vector unsigned char vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, 
src); - vsrcDuc = vec_ld(stride + 16, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc3uc = vsrcDuc; - else - vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc2uc); - vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc3uc); - - psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); - psum = vec_mladd(vB, vsrc1ssH, psum); - psum = vec_mladd(vC, vsrc2ssH, psum); - psum = vec_mladd(vD, vsrc3ssH, psum); - psum = vec_add(v28ss, psum); - psum = vec_sr(psum, v6us); - - vdst = vec_ld(0, dst); - ppsum = (vector unsigned char)vec_pack(psum, psum); - fsum = vec_perm(vdst, ppsum, fperm); - - vec_st(fsum, 0, dst); - - vsrc0ssH = vsrc2ssH; - vsrc1ssH = vsrc3ssH; - - dst += stride; - src += stride; - } + vec_u8_t vsrcDuc; + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(stride + 0, src); + vsrcDuc = vec_ld(stride + 16, src); + + vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); + if (reallyBadAlign) + vsrc3uc = vsrcDuc; + else + vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); + + vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc); + vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc); + + psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); + psum = vec_mladd(vB, vsrc1ssH, psum); + psum = vec_mladd(vC, vsrc2ssH, psum); + psum = vec_mladd(vD, vsrc3ssH, psum); + psum = vec_add(v28ss, psum); + psum = vec_sr(psum, v6us); + + vdst = vec_ld(0, dst); + ppsum = (vec_u8_t)vec_pack(psum, psum); + fsum = vec_perm(vdst, ppsum, fperm); + + vec_st(fsum, 0, dst); + + vsrc0ssH = vsrc2ssH; + vsrc1ssH = vsrc3ssH; + + dst += stride; + src += stride; + } } } @@ -312,7 +308,7 @@ static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, int src_stride1, int h) { int i; - vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align; + vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align; mask_ = vec_lvsl(0, src2); @@ -354,7 +350,7 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, int src_stride1, int h) { int i; - vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align; + vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align; mask_ = vec_lvsl(0, src2); @@ -404,6 +400,82 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1, * IDCT transform: ****************************************************************************/ +#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \ + /* 1st stage */ \ + vz0 = vec_add(vb0,vb2); /* temp[0] = Y[0] + Y[2] */ \ + vz1 = vec_sub(vb0,vb2); /* temp[1] = Y[0] - Y[2] */ \ + vz2 = vec_sra(vb1,vec_splat_u16(1)); \ + vz2 = vec_sub(vz2,vb3); /* temp[2] = Y[1].1/2 - Y[3] */ \ + vz3 = vec_sra(vb3,vec_splat_u16(1)); \ + vz3 = vec_add(vb1,vz3); /* temp[3] = Y[1] + Y[3].1/2 */ \ + /* 2nd stage: output */ \ + va0 = vec_add(vz0,vz3); /* x[0] = temp[0] + temp[3] */ \ + va1 = vec_add(vz1,vz2); /* x[1] = temp[1] + temp[2] */ \ + va2 = vec_sub(vz1,vz2); /* x[2] = temp[1] - temp[2] */ \ + va3 = vec_sub(vz0,vz3) /* x[3] = temp[0] - temp[3] */ + +#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \ + b0 = vec_mergeh( a0, a0 ); \ + b1 = vec_mergeh( a1, a0 ); \ + b2 = vec_mergeh( a2, a0 ); \ + b3 = vec_mergeh( a3, a0 ); \ + a0 = vec_mergeh( b0, b2 ); \ + a1 = vec_mergel( b0, b2 ); \ + a2 = vec_mergeh( b1, b3 ); \ + a3 = vec_mergel( b1, b3 ); \ + b0 = vec_mergeh( a0, a2 ); \ + b1 = vec_mergel( a0, a2 ); \ + b2 
= vec_mergeh( a1, a3 ); \ + b3 = vec_mergel( a1, a3 ) + +#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \ + vdst_orig = vec_ld(0, dst); \ + vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \ + vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst); \ + va = vec_add(va, vdst_ss); \ + va_u8 = vec_packsu(va, zero_s16v); \ + va_u32 = vec_splat((vec_u32_t)va_u8, 0); \ + vec_ste(va_u32, element, (uint32_t*)dst); + +static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride) +{ + vec_s16_t va0, va1, va2, va3; + vec_s16_t vz0, vz1, vz2, vz3; + vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3; + vec_u8_t va_u8; + vec_u32_t va_u32; + vec_s16_t vdst_ss; + const vec_u16_t v6us = vec_splat_u16(6); + vec_u8_t vdst, vdst_orig; + vec_u8_t vdst_mask = vec_lvsl(0, dst); + int element = ((unsigned long)dst & 0xf) >> 2; + LOAD_ZERO; + + block[0] += 32; /* add 32 as a DC-level for rounding */ + + vtmp0 = vec_ld(0,block); + vtmp1 = vec_sld(vtmp0, vtmp0, 8); + vtmp2 = vec_ld(16,block); + vtmp3 = vec_sld(vtmp2, vtmp2, 8); + + VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); + VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3); + VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); + + va0 = vec_sra(va0,v6us); + va1 = vec_sra(va1,v6us); + va2 = vec_sra(va2,v6us); + va3 = vec_sra(va3,v6us); + + VEC_LOAD_U8_ADD_S16_STORE_U8(va0); + dst += stride; + VEC_LOAD_U8_ADD_S16_STORE_U8(va1); + dst += stride; + VEC_LOAD_U8_ADD_S16_STORE_U8(va2); + dst += stride; + VEC_LOAD_U8_ADD_S16_STORE_U8(va3); +} + #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\ /* a0 = SRC(0) + SRC(4); */ \ vec_s16_t a0v = vec_add(s0, s4); \ @@ -491,8 +563,7 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { const vec_u16_t twov = vec_splat_u16(2); const vec_u16_t sixv = vec_splat_u16(6); - const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0, - -1,-1,-1,-1,-1,-1,-1,-1); + const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1); LOAD_ZERO; dct[0] += 32; // rounding for the >>6 at the end @@ -524,42 +595,310 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel); } +#define transpose4x16(r0, r1, r2, r3) { \ + register vec_u8_t r4; \ + register vec_u8_t r5; \ + register vec_u8_t r6; \ + register vec_u8_t r7; \ + \ + r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ + r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ + r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \ + r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \ + \ + r0 = vec_mergeh(r4, r6); /*all set 0*/ \ + r1 = vec_mergel(r4, r6); /*all set 1*/ \ + r2 = vec_mergeh(r5, r7); /*all set 2*/ \ + r3 = vec_mergel(r5, r7); /*all set 3*/ \ +} + +static inline void write16x4(uint8_t *dst, int dst_stride, + register vec_u8_t r0, register vec_u8_t r1, + register vec_u8_t r2, register vec_u8_t r3) { + DECLARE_ALIGNED_16(unsigned char, result[64]); + uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; + int int_dst_stride = dst_stride/4; + + vec_st(r0, 0, result); + vec_st(r1, 16, result); + vec_st(r2, 32, result); + vec_st(r3, 48, result); + /* FIXME: there has to be a better way!!!! 
*/ + *dst_int = *src_int; + *(dst_int+ int_dst_stride) = *(src_int + 1); + *(dst_int+ 2*int_dst_stride) = *(src_int + 2); + *(dst_int+ 3*int_dst_stride) = *(src_int + 3); + *(dst_int+ 4*int_dst_stride) = *(src_int + 4); + *(dst_int+ 5*int_dst_stride) = *(src_int + 5); + *(dst_int+ 6*int_dst_stride) = *(src_int + 6); + *(dst_int+ 7*int_dst_stride) = *(src_int + 7); + *(dst_int+ 8*int_dst_stride) = *(src_int + 8); + *(dst_int+ 9*int_dst_stride) = *(src_int + 9); + *(dst_int+10*int_dst_stride) = *(src_int + 10); + *(dst_int+11*int_dst_stride) = *(src_int + 11); + *(dst_int+12*int_dst_stride) = *(src_int + 12); + *(dst_int+13*int_dst_stride) = *(src_int + 13); + *(dst_int+14*int_dst_stride) = *(src_int + 14); + *(dst_int+15*int_dst_stride) = *(src_int + 15); +} + +/** \brief performs a 6x16 transpose of data in src, and stores it to dst + \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing + out of unaligned_load() */ +#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ + register vec_u8_t r0 = unaligned_load(0, src); \ + register vec_u8_t r1 = unaligned_load( src_stride, src); \ + register vec_u8_t r2 = unaligned_load(2* src_stride, src); \ + register vec_u8_t r3 = unaligned_load(3* src_stride, src); \ + register vec_u8_t r4 = unaligned_load(4* src_stride, src); \ + register vec_u8_t r5 = unaligned_load(5* src_stride, src); \ + register vec_u8_t r6 = unaligned_load(6* src_stride, src); \ + register vec_u8_t r7 = unaligned_load(7* src_stride, src); \ + register vec_u8_t r14 = unaligned_load(14*src_stride, src); \ + register vec_u8_t r15 = unaligned_load(15*src_stride, src); \ + \ + r8 = unaligned_load( 8*src_stride, src); \ + r9 = unaligned_load( 9*src_stride, src); \ + r10 = unaligned_load(10*src_stride, src); \ + r11 = unaligned_load(11*src_stride, src); \ + r12 = unaligned_load(12*src_stride, src); \ + r13 = unaligned_load(13*src_stride, src); \ + \ + /*Merge first pairs*/ \ + r0 = vec_mergeh(r0, r8); /*0, 8*/ \ + r1 = vec_mergeh(r1, r9); /*1, 9*/ \ + r2 = vec_mergeh(r2, r10); /*2,10*/ \ + r3 = vec_mergeh(r3, r11); /*3,11*/ \ + r4 = vec_mergeh(r4, r12); /*4,12*/ \ + r5 = vec_mergeh(r5, r13); /*5,13*/ \ + r6 = vec_mergeh(r6, r14); /*6,14*/ \ + r7 = vec_mergeh(r7, r15); /*7,15*/ \ + \ + /*Merge second pairs*/ \ + r8 = vec_mergeh(r0, r4); /*0,4, 8,12 set 0*/ \ + r9 = vec_mergel(r0, r4); /*0,4, 8,12 set 1*/ \ + r10 = vec_mergeh(r1, r5); /*1,5, 9,13 set 0*/ \ + r11 = vec_mergel(r1, r5); /*1,5, 9,13 set 1*/ \ + r12 = vec_mergeh(r2, r6); /*2,6,10,14 set 0*/ \ + r13 = vec_mergel(r2, r6); /*2,6,10,14 set 1*/ \ + r14 = vec_mergeh(r3, r7); /*3,7,11,15 set 0*/ \ + r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \ + \ + /*Third merge*/ \ + r0 = vec_mergeh(r8, r12); /*0,2,4,6,8,10,12,14 set 0*/ \ + r1 = vec_mergel(r8, r12); /*0,2,4,6,8,10,12,14 set 1*/ \ + r2 = vec_mergeh(r9, r13); /*0,2,4,6,8,10,12,14 set 2*/ \ + r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \ + r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \ + r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \ + /* Don't need to compute 3 and 7*/ \ + \ + /*Final merge*/ \ + r8 = vec_mergeh(r0, r4); /*all set 0*/ \ + r9 = vec_mergel(r0, r4); /*all set 1*/ \ + r10 = vec_mergeh(r1, r5); /*all set 2*/ \ + r11 = vec_mergel(r1, r5); /*all set 3*/ \ + r12 = vec_mergeh(r2, r6); /*all set 4*/ \ + r13 = vec_mergel(r2, r6); /*all set 5*/ \ + /* Don't need to compute 14 and 15*/ \ + \ +} + +// out: o = |x-y| < a +static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x, + register vec_u8_t y, 
+ register vec_u8_t a) { + + register vec_u8_t diff = vec_subs(x, y); + register vec_u8_t diffneg = vec_subs(y, x); + register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */ + o = (vec_u8_t)vec_cmplt(o, a); + return o; +} + +static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0, + register vec_u8_t p1, + register vec_u8_t q0, + register vec_u8_t q1, + register vec_u8_t alpha, + register vec_u8_t beta) { + + register vec_u8_t mask; + register vec_u8_t tempmask; + + mask = diff_lt_altivec(p0, q0, alpha); + tempmask = diff_lt_altivec(p1, p0, beta); + mask = vec_and(mask, tempmask); + tempmask = diff_lt_altivec(q1, q0, beta); + mask = vec_and(mask, tempmask); + + return mask; +} + +// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) +static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0, + register vec_u8_t p1, + register vec_u8_t p2, + register vec_u8_t q0, + register vec_u8_t tc0) { + + register vec_u8_t average = vec_avg(p0, q0); + register vec_u8_t temp; + register vec_u8_t uncliped; + register vec_u8_t ones; + register vec_u8_t max; + register vec_u8_t min; + register vec_u8_t newp1; + + temp = vec_xor(average, p2); + average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ + ones = vec_splat_u8(1); + temp = vec_and(temp, ones); /*(p2^avg(p0, q0)) & 1 */ + uncliped = vec_subs(average, temp); /*(p2+((p0+q0+1)>>1))>>1 */ + max = vec_adds(p1, tc0); + min = vec_subs(p1, tc0); + newp1 = vec_max(min, uncliped); + newp1 = vec_min(max, newp1); + return newp1; +} + +#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ + \ + const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ + \ + register vec_u8_t pq0bit = vec_xor(p0,q0); \ + register vec_u8_t q1minus; \ + register vec_u8_t p0minus; \ + register vec_u8_t stage1; \ + register vec_u8_t stage2; \ + register vec_u8_t vec160; \ + register vec_u8_t delta; \ + register vec_u8_t deltaneg; \ + \ + q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ + stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ + stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \ + p0minus = vec_nor(p0, p0); /* 255 - p0 */ \ + stage1 = vec_avg(q0, p0minus); /* (q0 - p0 + 256)>>1 */ \ + pq0bit = vec_and(pq0bit, vec_splat_u8(1)); \ + stage2 = vec_avg(stage2, pq0bit); /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \ + stage2 = vec_adds(stage2, stage1); /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \ + vec160 = vec_ld(0, &A0v); \ + deltaneg = vec_subs(vec160, stage2); /* -d */ \ + delta = vec_subs(stage2, vec160); /* d */ \ + deltaneg = vec_min(tc0masked, deltaneg); \ + delta = vec_min(tc0masked, delta); \ + p0 = vec_subs(p0, deltaneg); \ + q0 = vec_subs(q0, delta); \ + p0 = vec_adds(p0, delta); \ + q0 = vec_adds(q0, deltaneg); \ +} + +#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ + DECLARE_ALIGNED_16(unsigned char, temp[16]); \ + register vec_u8_t alphavec; \ + register vec_u8_t betavec; \ + register vec_u8_t mask; \ + register vec_u8_t p1mask; \ + register vec_u8_t q1mask; \ + register vector signed char tc0vec; \ + register vec_u8_t finaltc0; \ + register vec_u8_t tc0masked; \ + register vec_u8_t newp1; \ + register vec_u8_t newq1; \ + \ + temp[0] = alpha; \ + temp[1] = beta; \ + alphavec = vec_ld(0, temp); \ + betavec = vec_splat(alphavec, 0x1); \ + alphavec = vec_splat(alphavec, 0x0); \ + mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /*if in block */ \ + \ + *((int *)temp) = *((int *)tc0); \ + tc0vec = vec_ld(0, (signed 
char*)temp); \ + tc0vec = vec_mergeh(tc0vec, tc0vec); \ + tc0vec = vec_mergeh(tc0vec, tc0vec); \ + mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ + finaltc0 = vec_and((vec_u8_t)tc0vec, mask); /* tc = tc0 */ \ + \ + p1mask = diff_lt_altivec(p2, p0, betavec); \ + p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \ + tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec); \ + finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ + newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ + /*end if*/ \ + \ + q1mask = diff_lt_altivec(q2, q0, betavec); \ + q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ + tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec); \ + finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ + newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ + /*end if*/ \ + \ + h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \ + p1 = newp1; \ + q1 = newq1; \ +} + +static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { + + if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { + register vec_u8_t p2 = vec_ld(-3*stride, pix); + register vec_u8_t p1 = vec_ld(-2*stride, pix); + register vec_u8_t p0 = vec_ld(-1*stride, pix); + register vec_u8_t q0 = vec_ld(0, pix); + register vec_u8_t q1 = vec_ld(stride, pix); + register vec_u8_t q2 = vec_ld(2*stride, pix); + h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); + vec_st(p1, -2*stride, pix); + vec_st(p0, -1*stride, pix); + vec_st(q0, 0, pix); + vec_st(q1, stride, pix); + } +} + +static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { + + register vec_u8_t line0, line1, line2, line3, line4, line5; + if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) + return; + readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); + h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0); + transpose4x16(line1, line2, line3, line4); + write16x4(pix-2, stride, line1, line2, line3, line4); +} + void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) { -#ifdef HAVE_ALTIVEC - if (has_altivec()) { - c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; - c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec; - c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; - c->h264_idct8_add = ff_h264_idct8_add_altivec; + if (has_altivec()) { + c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec; + c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec; + c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec; + c->h264_idct_add = ff_h264_idct_add_altivec; + c->h264_idct8_add = ff_h264_idct8_add_altivec; + c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec; + c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec; #define dspfunc(PFX, IDX, NUM) \ - c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \ - c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \ - c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \ - c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \ - c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \ - c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \ - c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \ - c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \ - c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \ - c->PFX ## _pixels_tab[IDX][ 9] = PFX 
## NUM ## _mc12_altivec; \ - c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \ - c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \ - c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \ - c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \ - c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \ - c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec - - dspfunc(put_h264_qpel, 0, 16); - dspfunc(avg_h264_qpel, 0, 16); + c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \ + c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \ + c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \ + c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \ + c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \ + c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \ + c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \ + c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \ + c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \ + c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \ + c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \ + c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \ + c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \ + c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \ + c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \ + c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec + + dspfunc(put_h264_qpel, 0, 16); + dspfunc(avg_h264_qpel, 0, 16); #undef dspfunc - - } else -#endif /* HAVE_ALTIVEC */ - { - // Non-AltiVec PPC optimisations - - // ... pending ... - } + } } diff --git a/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c b/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c index e8ad67f2f..d8ad96419 100644 --- a/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/h264_template_altivec.c @@ -18,186 +18,227 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +//#define DEBUG_ALIGNMENT +#ifdef DEBUG_ALIGNMENT +#define ASSERT_ALIGNED(ptr) assert(((unsigned long)ptr&0x0000000F)); +#else +#define ASSERT_ALIGNED(ptr) ; +#endif + /* this code assume that stride % 16 == 0 */ -void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) { + +#define CHROMA_MC8_ALTIVEC_CORE \ + vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\ + vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);\ +\ + psum = vec_mladd(vA, vsrc0ssH, v32ss);\ + psum = vec_mladd(vB, vsrc1ssH, psum);\ + psum = vec_mladd(vC, vsrc2ssH, psum);\ + psum = vec_mladd(vD, vsrc3ssH, psum);\ + psum = vec_sr(psum, v6us);\ +\ + vdst = vec_ld(0, dst);\ + ppsum = (vec_u8_t)vec_pack(psum, psum);\ + vfdst = vec_perm(vdst, ppsum, fperm);\ +\ + OP_U8_ALTIVEC(fsum, vfdst, vdst);\ +\ + vec_st(fsum, 0, dst);\ +\ + vsrc0ssH = vsrc2ssH;\ + vsrc1ssH = vsrc3ssH;\ +\ + dst += stride;\ + src += stride; + +#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ +\ + vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\ + vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\ +\ + psum = vec_mladd(vA, vsrc0ssH, v32ss);\ + psum = vec_mladd(vE, vsrc1ssH, psum);\ + psum = vec_sr(psum, v6us);\ +\ + vdst = vec_ld(0, dst);\ + ppsum = (vec_u8_t)vec_pack(psum, psum);\ + vfdst = vec_perm(vdst, ppsum, fperm);\ +\ + OP_U8_ALTIVEC(fsum, vfdst, vdst);\ +\ + vec_st(fsum, 0, dst);\ +\ + dst += 
stride;\ + src += stride; + +void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, + int stride, int h, int x, int y) { POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1); - signed int ABCD[4] __attribute__((aligned(16))) = + DECLARE_ALIGNED_16(signed int, ABCD[4]) = {((8 - x) * (8 - y)), - ((x) * (8 - y)), - ((8 - x) * (y)), - ((x) * (y))}; + (( x) * (8 - y)), + ((8 - x) * ( y)), + (( x) * ( y))}; register int i; - vector unsigned char fperm; - const vector signed int vABCD = vec_ld(0, ABCD); - const vector signed short vA = vec_splat((vector signed short)vABCD, 1); - const vector signed short vB = vec_splat((vector signed short)vABCD, 3); - const vector signed short vC = vec_splat((vector signed short)vABCD, 5); - const vector signed short vD = vec_splat((vector signed short)vABCD, 7); - const vector signed int vzero = vec_splat_s32(0); - const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); - const vector unsigned short v6us = vec_splat_u16(6); + vec_u8_t fperm; + const vec_s32_t vABCD = vec_ld(0, ABCD); + const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); + const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); + const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); + const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); + LOAD_ZERO; + const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); + const vec_u16_t v6us = vec_splat_u16(6); register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; - vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; - vector unsigned char vsrc0uc, vsrc1uc; - vector signed short vsrc0ssH, vsrc1ssH; - vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc; - vector signed short vsrc2ssH, vsrc3ssH, psum; - vector unsigned char vdst, ppsum, vfdst, fsum; + vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; + vec_u8_t vsrc0uc, vsrc1uc; + vec_s16_t vsrc0ssH, vsrc1ssH; + vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; + vec_s16_t vsrc2ssH, vsrc3ssH, psum; + vec_u8_t vdst, ppsum, vfdst, fsum; POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); if (((unsigned long)dst) % 16 == 0) { - fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, - 0x14, 0x15, 0x16, 0x17, - 0x08, 0x09, 0x0A, 0x0B, - 0x0C, 0x0D, 0x0E, 0x0F); + fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, + 0x08, 0x09, 0x0A, 0x0B, + 0x0C, 0x0D, 0x0E, 0x0F); } else { - fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, - 0x04, 0x05, 0x06, 0x07, - 0x18, 0x19, 0x1A, 0x1B, - 0x1C, 0x1D, 0x1E, 0x1F); + fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07, + 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F); } vsrcAuc = vec_ld(0, src); if (loadSecond) - vsrcBuc = vec_ld(16, src); + vsrcBuc = vec_ld(16, src); vsrcperm0 = vec_lvsl(0, src); vsrcperm1 = vec_lvsl(1, src); vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0); if (reallyBadAlign) - vsrc1uc = vsrcBuc; + vsrc1uc = vsrcBuc; else - vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); - - vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc0uc); - vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc1uc); - - if (!loadSecond) {// -> !reallyBadAlign - for (i = 0 ; i < h ; i++) { - - - vsrcCuc = vec_ld(stride + 0, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); - vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); - - vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector 
unsigned char)vsrc2uc); - vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc3uc); - - psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); - psum = vec_mladd(vB, vsrc1ssH, psum); - psum = vec_mladd(vC, vsrc2ssH, psum); - psum = vec_mladd(vD, vsrc3ssH, psum); - psum = vec_add(v32ss, psum); - psum = vec_sra(psum, v6us); - - vdst = vec_ld(0, dst); - ppsum = (vector unsigned char)vec_packsu(psum, psum); - vfdst = vec_perm(vdst, ppsum, fperm); - - OP_U8_ALTIVEC(fsum, vfdst, vdst); - - vec_st(fsum, 0, dst); - - vsrc0ssH = vsrc2ssH; - vsrc1ssH = vsrc3ssH; - - dst += stride; - src += stride; - } + vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); + + vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc); + vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc); + + if (ABCD[3]) { + if (!loadSecond) {// -> !reallyBadAlign + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(stride + 0, src); + vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); + vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); + + CHROMA_MC8_ALTIVEC_CORE + } + } else { + vec_u8_t vsrcDuc; + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(stride + 0, src); + vsrcDuc = vec_ld(stride + 16, src); + vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); + if (reallyBadAlign) + vsrc3uc = vsrcDuc; + else + vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); + + CHROMA_MC8_ALTIVEC_CORE + } + } } else { - vector unsigned char vsrcDuc; - for (i = 0 ; i < h ; i++) { - vsrcCuc = vec_ld(stride + 0, src); - vsrcDuc = vec_ld(stride + 16, src); - - vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); - if (reallyBadAlign) - vsrc3uc = vsrcDuc; - else - vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); - - vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc2uc); - vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, - (vector unsigned char)vsrc3uc); - - psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); - psum = vec_mladd(vB, vsrc1ssH, psum); - psum = vec_mladd(vC, vsrc2ssH, psum); - psum = vec_mladd(vD, vsrc3ssH, psum); - psum = vec_add(v32ss, psum); - psum = vec_sr(psum, v6us); - - vdst = vec_ld(0, dst); - ppsum = (vector unsigned char)vec_pack(psum, psum); - vfdst = vec_perm(vdst, ppsum, fperm); - - OP_U8_ALTIVEC(fsum, vfdst, vdst); - - vec_st(fsum, 0, dst); - - vsrc0ssH = vsrc2ssH; - vsrc1ssH = vsrc3ssH; - - dst += stride; - src += stride; - } + const vec_s16_t vE = vec_add(vB, vC); + if (ABCD[2]) { // x == 0 B == 0 + if (!loadSecond) {// -> !reallyBadAlign + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(stride + 0, src); + vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); + CHROMA_MC8_ALTIVEC_CORE_SIMPLE + + vsrc0uc = vsrc1uc; + } + } else { + vec_u8_t vsrcDuc; + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(stride + 0, src); + vsrcDuc = vec_ld(stride + 15, src); + vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); + CHROMA_MC8_ALTIVEC_CORE_SIMPLE + + vsrc0uc = vsrc1uc; + } + } + } else { // y == 0 C == 0 + if (!loadSecond) {// -> !reallyBadAlign + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(0, src); + vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); + vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); + + CHROMA_MC8_ALTIVEC_CORE_SIMPLE + } + } else { + vec_u8_t vsrcDuc; + for (i = 0 ; i < h ; i++) { + vsrcCuc = vec_ld(0, src); + vsrcDuc = vec_ld(15, src); + vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0); + if (reallyBadAlign) + vsrc1uc = vsrcDuc; + else + vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); + + CHROMA_MC8_ALTIVEC_CORE_SIMPLE + 
} + } + } } POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1); } +#undef CHROMA_MC8_ALTIVEC_CORE + /* this code assume stride % 16 == 0 */ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) { POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); register int i; - const vector signed int vzero = vec_splat_s32(0); - const vector unsigned char permM2 = vec_lvsl(-2, src); - const vector unsigned char permM1 = vec_lvsl(-1, src); - const vector unsigned char permP0 = vec_lvsl(+0, src); - const vector unsigned char permP1 = vec_lvsl(+1, src); - const vector unsigned char permP2 = vec_lvsl(+2, src); - const vector unsigned char permP3 = vec_lvsl(+3, src); - const vector signed short v5ss = vec_splat_s16(5); - const vector unsigned short v5us = vec_splat_u16(5); - const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); - const vector unsigned char dstperm = vec_lvsr(0, dst); - const vector unsigned char neg1 = - (const vector unsigned char) vec_splat_s8(-1); - - const vector unsigned char dstmask = - vec_perm((const vector unsigned char)vzero, - neg1, dstperm); - - vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; + LOAD_ZERO; + const vec_u8_t permM2 = vec_lvsl(-2, src); + const vec_u8_t permM1 = vec_lvsl(-1, src); + const vec_u8_t permP0 = vec_lvsl(+0, src); + const vec_u8_t permP1 = vec_lvsl(+1, src); + const vec_u8_t permP2 = vec_lvsl(+2, src); + const vec_u8_t permP3 = vec_lvsl(+3, src); + const vec_s16_t v5ss = vec_splat_s16(5); + const vec_u16_t v5us = vec_splat_u16(5); + const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); + const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); + + vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; register int align = ((((unsigned long)src) - 2) % 16); - vector signed short srcP0A, srcP0B, srcP1A, srcP1B, + vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, srcP2A, srcP2B, srcP3A, srcP3B, srcM1A, srcM1B, srcM2A, srcM2B, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, psumA, psumB, sumA, sumB; - vector unsigned char sum, dst1, dst2, vdst, fsum, - rsum, fdst1, fdst2; + vec_u8_t sum, vdst, fsum; POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); for (i = 0 ; i < 16 ; i ++) { - vector unsigned char srcR1 = vec_ld(-2, src); - vector unsigned char srcR2 = vec_ld(14, src); + vec_u8_t srcR1 = vec_ld(-2, src); + vec_u8_t srcR2 = vec_ld(14, src); switch (align) { default: { @@ -217,7 +258,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i srcP3 = srcR2; } break; case 12: { - vector unsigned char srcR3 = vec_ld(30, src); + vec_u8_t srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); @@ -226,7 +267,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 13: { - vector unsigned char srcR3 = vec_ld(30, src); + vec_u8_t srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); @@ -235,7 +276,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 14: { - vector unsigned char srcR3 = vec_ld(30, src); + vec_u8_t srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, 
srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = srcR2; @@ -244,7 +285,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 15: { - vector unsigned char srcR3 = vec_ld(30, src); + vec_u8_t srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = srcR2; srcP0 = vec_perm(srcR2, srcR3, permP0); @@ -254,32 +295,20 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i } break; } - srcP0A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP0); - srcP0B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP0); - srcP1A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP1); - srcP1B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP1); - - srcP2A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP2); - srcP2B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP2); - srcP3A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP3); - srcP3B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP3); - - srcM1A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcM1); - srcM1B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcM1); - srcM2A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcM2); - srcM2B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcM2); + srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); + srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); + srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); + srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); + + srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); + srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); + srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); + srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); + + srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); + srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); + srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); + srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); sum1A = vec_adds(srcP0A, srcP1A); sum1B = vec_adds(srcP0B, srcP1B); @@ -291,8 +320,8 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i pp1A = vec_mladd(sum1A, v20ss, v16ss); pp1B = vec_mladd(sum1B, v20ss, v16ss); - pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); - pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); + pp2A = vec_mladd(sum2A, v5ss, zero_s16v); + pp2B = vec_mladd(sum2B, v5ss, zero_s16v); pp3A = vec_add(sum3A, pp1A); pp3B = vec_add(sum3B, pp1B); @@ -305,18 +334,12 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i sum = vec_packsu(sumA, sumB); - dst1 = vec_ld(0, dst); - dst2 = vec_ld(16, dst); - vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); + ASSERT_ALIGNED(dst); + vdst = vec_ld(0, dst); OP_U8_ALTIVEC(fsum, sum, vdst); - rsum = vec_perm(fsum, fsum, dstperm); - fdst1 = vec_sel(dst1, rsum, dstmask); - fdst2 = vec_sel(rsum, dst2, dstmask); - - vec_st(fdst1, 0, dst); - vec_st(fdst2, 16, dst); + vec_st(fsum, 0, dst); src += srcStride; dst += dstStride; @@ -330,67 +353,53 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i register int i; - const vector signed int vzero = vec_splat_s32(0); - const vector unsigned char perm = vec_lvsl(0, src); - const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vector unsigned short v5us = 
vec_splat_u16(5); - const vector signed short v5ss = vec_splat_s16(5); - const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); - const vector unsigned char dstperm = vec_lvsr(0, dst); - const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1); - const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); + LOAD_ZERO; + const vec_u8_t perm = vec_lvsl(0, src); + const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); + const vec_u16_t v5us = vec_splat_u16(5); + const vec_s16_t v5ss = vec_splat_s16(5); + const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); uint8_t *srcbis = src - (srcStride * 2); - const vector unsigned char srcM2a = vec_ld(0, srcbis); - const vector unsigned char srcM2b = vec_ld(16, srcbis); - const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm); + const vec_u8_t srcM2a = vec_ld(0, srcbis); + const vec_u8_t srcM2b = vec_ld(16, srcbis); + const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm); // srcbis += srcStride; - const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride); - const vector unsigned char srcM1b = vec_ld(16, srcbis); - const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm); + const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride); + const vec_u8_t srcM1b = vec_ld(16, srcbis); + const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm); // srcbis += srcStride; - const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride); - const vector unsigned char srcP0b = vec_ld(16, srcbis); - const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm); + const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride); + const vec_u8_t srcP0b = vec_ld(16, srcbis); + const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm); // srcbis += srcStride; - const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride); - const vector unsigned char srcP1b = vec_ld(16, srcbis); - const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm); + const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride); + const vec_u8_t srcP1b = vec_ld(16, srcbis); + const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm); // srcbis += srcStride; - const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride); - const vector unsigned char srcP2b = vec_ld(16, srcbis); - const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm); + const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride); + const vec_u8_t srcP2b = vec_ld(16, srcbis); + const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm); // srcbis += srcStride; - vector signed short srcM2ssA = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcM2); - vector signed short srcM2ssB = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcM2); - vector signed short srcM1ssA = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcM1); - vector signed short srcM1ssB = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcM1); - vector signed short srcP0ssA = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP0); - vector signed short srcP0ssB = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP0); - vector signed short srcP1ssA = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP1); - vector signed short srcP1ssB = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP1); - vector signed short srcP2ssA = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP2); - vector signed 
short srcP2ssB = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP2); - - vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, + vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); + vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2); + vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); + vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1); + vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); + vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0); + vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); + vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1); + vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); + vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2); + + vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, psumA, psumB, sumA, sumB, srcP3ssA, srcP3ssB, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; - vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, - srcP3a, srcP3b, srcP3; + vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3; POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); @@ -398,10 +407,8 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i srcP3a = vec_ld(0, srcbis += srcStride); srcP3b = vec_ld(16, srcbis); srcP3 = vec_perm(srcP3a, srcP3b, perm); - srcP3ssA = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP3); - srcP3ssB = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP3); + srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); + srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3); // srcbis += srcStride; sum1A = vec_adds(srcP0ssA, srcP1ssA); @@ -425,8 +432,8 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i pp1A = vec_mladd(sum1A, v20ss, v16ss); pp1B = vec_mladd(sum1B, v20ss, v16ss); - pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); - pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); + pp2A = vec_mladd(sum2A, v5ss, zero_s16v); + pp2B = vec_mladd(sum2B, v5ss, zero_s16v); pp3A = vec_add(sum3A, pp1A); pp3B = vec_add(sum3B, pp1B); @@ -439,18 +446,12 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i sum = vec_packsu(sumA, sumB); - dst1 = vec_ld(0, dst); - dst2 = vec_ld(16, dst); - vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); + ASSERT_ALIGNED(dst); + vdst = vec_ld(0, dst); OP_U8_ALTIVEC(fsum, sum, vdst); - rsum = vec_perm(fsum, fsum, dstperm); - fdst1 = vec_sel(dst1, rsum, dstmask); - fdst2 = vec_sel(rsum, dst2, dstmask); - - vec_st(fdst1, 0, dst); - vec_st(fdst2, 16, dst); + vec_st(fsum, 0, dst); dst += dstStride; } @@ -461,58 +462,50 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); register int i; - const vector signed int vzero = vec_splat_s32(0); - const vector unsigned char permM2 = vec_lvsl(-2, src); - const vector unsigned char permM1 = vec_lvsl(-1, src); - const vector unsigned char permP0 = vec_lvsl(+0, src); - const vector unsigned char permP1 = vec_lvsl(+1, src); - const vector unsigned char permP2 = vec_lvsl(+2, src); - const vector unsigned char permP3 = vec_lvsl(+3, src); - const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); - const vector unsigned int v10ui = vec_splat_u32(10); - const vector signed short v5ss = 
vec_splat_s16(5); - const vector signed short v1ss = vec_splat_s16(1); - const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); - const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); + LOAD_ZERO; + const vec_u8_t permM2 = vec_lvsl(-2, src); + const vec_u8_t permM1 = vec_lvsl(-1, src); + const vec_u8_t permP0 = vec_lvsl(+0, src); + const vec_u8_t permP1 = vec_lvsl(+1, src); + const vec_u8_t permP2 = vec_lvsl(+2, src); + const vec_u8_t permP3 = vec_lvsl(+3, src); + const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); + const vec_u32_t v10ui = vec_splat_u32(10); + const vec_s16_t v5ss = vec_splat_s16(5); + const vec_s16_t v1ss = vec_splat_s16(1); + const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); + const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); register int align = ((((unsigned long)src) - 2) % 16); - const vector unsigned char neg1 = (const vector unsigned char) - vec_splat_s8(-1); - - vector signed short srcP0A, srcP0B, srcP1A, srcP1B, + vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, srcP2A, srcP2B, srcP3A, srcP3B, srcM1A, srcM1B, srcM2A, srcM2B, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, pp1A, pp1B, pp2A, pp2B, psumA, psumB; - const vector unsigned char dstperm = vec_lvsr(0, dst); - - const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm); - - const vector unsigned char mperm = (const vector unsigned char) + const vec_u8_t mperm = (const vec_u8_t) AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); int16_t *tmpbis = tmp; - vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, + vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, tmpP2ssA, tmpP2ssB; - vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, + vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, ssumAe, ssumAo, ssumBe, ssumBo; - vector unsigned char fsum, sumv, sum, dst1, dst2, vdst, - rsum, fdst1, fdst2; - vector signed short ssume, ssumo; + vec_u8_t fsum, sumv, sum, vdst; + vec_s16_t ssume, ssumo; POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); src -= (2 * srcStride); for (i = 0 ; i < 21 ; i ++) { - vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; - vector unsigned char srcR1 = vec_ld(-2, src); - vector unsigned char srcR2 = vec_ld(14, src); + vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; + vec_u8_t srcR1 = vec_ld(-2, src); + vec_u8_t srcR2 = vec_ld(14, src); switch (align) { default: { @@ -532,7 +525,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, srcP3 = srcR2; } break; case 12: { - vector unsigned char srcR3 = vec_ld(30, src); + vec_u8_t srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); @@ -541,7 +534,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 13: { - vector unsigned char srcR3 = vec_ld(30, src); + vec_u8_t srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = vec_perm(srcR1, srcR2, permP0); @@ -550,7 +543,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, srcP3 = vec_perm(srcR2, srcR3, permP3); } 
break; case 14: { - vector unsigned char srcR3 = vec_ld(30, src); + vec_u8_t srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = vec_perm(srcR1, srcR2, permM1); srcP0 = srcR2; @@ -559,7 +552,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, srcP3 = vec_perm(srcR2, srcR3, permP3); } break; case 15: { - vector unsigned char srcR3 = vec_ld(30, src); + vec_u8_t srcR3 = vec_ld(30, src); srcM2 = vec_perm(srcR1, srcR2, permM2); srcM1 = srcR2; srcP0 = vec_perm(srcR2, srcR3, permP0); @@ -569,32 +562,20 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, } break; } - srcP0A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP0); - srcP0B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP0); - srcP1A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP1); - srcP1B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP1); - - srcP2A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP2); - srcP2B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP2); - srcP3A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcP3); - srcP3B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcP3); - - srcM1A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcM1); - srcM1B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcM1); - srcM2A = (vector signed short) - vec_mergeh((vector unsigned char)vzero, srcM2); - srcM2B = (vector signed short) - vec_mergel((vector unsigned char)vzero, srcM2); + srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); + srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); + srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); + srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); + + srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); + srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); + srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); + srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); + + srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); + srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); + srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); + srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); sum1A = vec_adds(srcP0A, srcP1A); sum1B = vec_adds(srcP0B, srcP1B); @@ -606,8 +587,8 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, pp1A = vec_mladd(sum1A, v20ss, sum3A); pp1B = vec_mladd(sum1B, v20ss, sum3B); - pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero); - pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero); + pp2A = vec_mladd(sum2A, v5ss, zero_s16v); + pp2B = vec_mladd(sum2B, v5ss, zero_s16v); psumA = vec_sub(pp1A, pp2A); psumB = vec_sub(pp1B, pp2B); @@ -636,15 +617,15 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, tmpbis += tmpStride; for (i = 0 ; i < 16 ; i++) { - const vector signed short tmpP3ssA = vec_ld(0, tmpbis); - const vector signed short tmpP3ssB = vec_ld(16, tmpbis); + const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis); + const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis); - const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA); - const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB); - const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA); - const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB); - const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA); - const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB); + 
const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA); + const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB); + const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA); + const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB); + const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA); + const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB); tmpbis += tmpStride; @@ -669,9 +650,9 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, pp2Be = vec_mule(sum2B, v5ss); pp2Bo = vec_mulo(sum2B, v5ss); - pp3Ae = vec_sra((vector signed int)sum3A, v16ui); + pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui); pp3Ao = vec_mulo(sum3A, v1ss); - pp3Be = vec_sra((vector signed int)sum3B, v16ui); + pp3Be = vec_sra((vec_s32_t)sum3B, v16ui); pp3Bo = vec_mulo(sum3B, v1ss); pp1cAe = vec_add(pp1Ae, v512si); @@ -700,18 +681,12 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, sumv = vec_packsu(ssume, ssumo); sum = vec_perm(sumv, sumv, mperm); - dst1 = vec_ld(0, dst); - dst2 = vec_ld(16, dst); - vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst)); + ASSERT_ALIGNED(dst); + vdst = vec_ld(0, dst); OP_U8_ALTIVEC(fsum, sum, vdst); - rsum = vec_perm(fsum, fsum, dstperm); - fdst1 = vec_sel(dst1, rsum, dstmask); - fdst2 = vec_sel(rsum, dst2, dstmask); - - vec_st(fdst1, 0, dst); - vec_st(fdst2, 16, dst); + vec_st(fsum, 0, dst); dst += dstStride; } diff --git a/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c b/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c index 66c8082f7..37b2f62c3 100644 --- a/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/idct_altivec.c @@ -16,7 +16,6 @@ * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * */ /* @@ -39,14 +38,14 @@ #include <stdlib.h> /* malloc(), free() */ #include <string.h> -#include "../dsputil.h" +#include "dsputil.h" #include "gcc_fixes.h" -#include "dsputil_altivec.h" +#include "dsputil_ppc.h" #define vector_s16_t vector signed short -#define const_vector_s16_t const_vector signed short +#define const_vector_s16_t const vector signed short #define vector_u16_t vector unsigned short #define vector_s8_t vector signed char #define vector_u8_t vector unsigned char diff --git a/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.c b/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.c new file mode 100644 index 000000000..3b161c5a6 --- /dev/null +++ b/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.c @@ -0,0 +1,153 @@ +/* + * High quality image resampling with polyphase filters + * Copyright (c) 2001 Fabrice Bellard. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file imgresample_altivec.c + * High quality image resampling with polyphase filters - AltiVec bits + */ + +#include "gcc_fixes.h" + +typedef union { + vector unsigned char v; + unsigned char c[16]; +} vec_uc_t; + +typedef union { + vector signed short v; + signed short s[8]; +} vec_ss_t; + +void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, + int wrap, int16_t *filter) +{ + int sum, i; + const uint8_t *s; + vector unsigned char *tv, tmp, dstv, zero; + vec_ss_t srchv[4], srclv[4], fv[4]; + vector signed short zeros, sumhv, sumlv; + s = src; + + for(i=0;i<4;i++) + { + /* + The vec_madds later on does an implicit >>15 on the result. + Since FILTER_BITS is 8, and we have 15 bits of magnitude in + a signed short, we have just enough bits to pre-shift our + filter constants <<7 to compensate for vec_madds. + */ + fv[i].s[0] = filter[i] << (15-FILTER_BITS); + fv[i].v = vec_splat(fv[i].v, 0); + } + + zero = vec_splat_u8(0); + zeros = vec_splat_s16(0); + + + /* + When we're resampling, we'd ideally like both our input buffers, + and output buffers to be 16-byte aligned, so we can do both aligned + reads and writes. Sadly we can't always have this at the moment, so + we opt for aligned writes, as unaligned writes have a huge overhead. + To do this, do enough scalar resamples to get dst 16-byte aligned. + */ + i = (-(int)dst) & 0xf; + while(i>0) { + sum = s[0 * wrap] * filter[0] + + s[1 * wrap] * filter[1] + + s[2 * wrap] * filter[2] + + s[3 * wrap] * filter[3]; + sum = sum >> FILTER_BITS; + if (sum<0) sum = 0; else if (sum>255) sum=255; + dst[0] = sum; + dst++; + s++; + dst_width--; + i--; + } + + /* Do our altivec resampling on 16 pixels at once. */ + while(dst_width>=16) { + /* + Read 16 (potentially unaligned) bytes from each of + 4 lines into 4 vectors, and split them into shorts. + Interleave the multipy/accumulate for the resample + filter with the loads to hide the 3 cycle latency + the vec_madds have. 
+ */ + tv = (vector unsigned char *) &s[0 * wrap]; + tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap])); + srchv[0].v = (vector signed short) vec_mergeh(zero, tmp); + srclv[0].v = (vector signed short) vec_mergel(zero, tmp); + sumhv = vec_madds(srchv[0].v, fv[0].v, zeros); + sumlv = vec_madds(srclv[0].v, fv[0].v, zeros); + + tv = (vector unsigned char *) &s[1 * wrap]; + tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap])); + srchv[1].v = (vector signed short) vec_mergeh(zero, tmp); + srclv[1].v = (vector signed short) vec_mergel(zero, tmp); + sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv); + sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv); + + tv = (vector unsigned char *) &s[2 * wrap]; + tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap])); + srchv[2].v = (vector signed short) vec_mergeh(zero, tmp); + srclv[2].v = (vector signed short) vec_mergel(zero, tmp); + sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv); + sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv); + + tv = (vector unsigned char *) &s[3 * wrap]; + tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap])); + srchv[3].v = (vector signed short) vec_mergeh(zero, tmp); + srclv[3].v = (vector signed short) vec_mergel(zero, tmp); + sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv); + sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv); + + /* + Pack the results into our destination vector, + and do an aligned write of that back to memory. + */ + dstv = vec_packsu(sumhv, sumlv) ; + vec_st(dstv, 0, (vector unsigned char *) dst); + + dst+=16; + s+=16; + dst_width-=16; + } + + /* + If there are any leftover pixels, resample them + with the slow scalar method. + */ + while(dst_width>0) { + sum = s[0 * wrap] * filter[0] + + s[1 * wrap] * filter[1] + + s[2 * wrap] * filter[2] + + s[3 * wrap] * filter[3]; + sum = sum >> FILTER_BITS; + if (sum<0) sum = 0; else if (sum>255) sum=255; + dst[0] = sum; + dst++; + s++; + dst_width--; + } +} + diff --git a/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.h b/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.h new file mode 100644 index 000000000..538c1bee6 --- /dev/null +++ b/contrib/ffmpeg/libavcodec/ppc/imgresample_altivec.h @@ -0,0 +1,26 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef FFMPEG_IMGRESAMPLE_ALTIVEC_H +#define FFMPEG_IMGRESAMPLE_ALTIVEC_H + +#include <stdint.h> + +void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src, + int wrap, int16_t *filter); +#endif /* FFMPEG_IMGRESAMPLE_ALTIVEC_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/int_altivec.c b/contrib/ffmpeg/libavcodec/ppc/int_altivec.c new file mode 100644 index 000000000..95497c99a --- /dev/null +++ b/contrib/ffmpeg/libavcodec/ppc/int_altivec.c @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2007 Luca Barbato <lu_zero@gentoo.org> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + ** @file int_altivec.c + ** integer misc ops. + **/ + +#include "dsputil.h" + +#include "gcc_fixes.h" + +#include "dsputil_altivec.h" + +static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2, + int size) { + int i, size16; + vector signed char vpix1; + vector signed short vpix2, vdiff, vpix1l,vpix1h; + union { vector signed int vscore; + int32_t score[4]; + } u; + u.vscore = vec_splat_s32(0); +// +//XXX lazy way, fix it later + +#define vec_unaligned_load(b) \ + vec_perm(vec_ld(0,b),vec_ld(15,b),vec_lvsl(0, b)); + + size16 = size >> 4; + while(size16) { +// score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]); + //load pix1 and the first batch of pix2 + + vpix1 = vec_unaligned_load(pix1); + vpix2 = vec_unaligned_load(pix2); + pix2 += 8; + //unpack + vpix1h = vec_unpackh(vpix1); + vdiff = vec_sub(vpix1h, vpix2); + vpix1l = vec_unpackl(vpix1); + // load another batch from pix2 + vpix2 = vec_unaligned_load(pix2); + u.vscore = vec_msum(vdiff, vdiff, u.vscore); + vdiff = vec_sub(vpix1l, vpix2); + u.vscore = vec_msum(vdiff, vdiff, u.vscore); + pix1 += 16; + pix2 += 8; + size16--; + } + u.vscore = vec_sums(u.vscore, vec_splat_s32(0)); + + size %= 16; + for (i = 0; i < size; i++) { + u.score[3] += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]); + } + return u.score[3]; +} + +void int_init_altivec(DSPContext* c, AVCodecContext *avctx) +{ + c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec; +} diff --git a/contrib/ffmpeg/libavcodec/ppc/mathops.h b/contrib/ffmpeg/libavcodec/ppc/mathops.h index 6af23f246..d7cc85365 100644 --- a/contrib/ffmpeg/libavcodec/ppc/mathops.h +++ b/contrib/ffmpeg/libavcodec/ppc/mathops.h @@ -20,6 +20,9 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#ifndef FFMPEG_PPC_MATHOPS_H +#define FFMPEG_PPC_MATHOPS_H + #if defined(ARCH_POWERPC_405) /* signed 16x16 -> 32 multiply add accumulate */ # define MAC16(rt, ra, rb) \ @@ -31,3 +34,5 @@ asm ("mullhw %0, %1, %2" : "=r" (__rt) : "r" (ra), "r" (rb)); __rt; }) #endif + +#endif /* FFMPEG_PPC_MATHOPS_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c b/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c index 3822cb20e..a2ba5e125 100644 --- a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/mpegvideo_altivec.c @@ -23,13 +23,13 @@ #include <stdlib.h> #include <stdio.h> -#include "../dsputil.h" -#include "../mpegvideo.h" +#include "dsputil.h" +#include "mpegvideo.h" #include "gcc_fixes.h" -#include "dsputil_altivec.h" - +#include "dsputil_ppc.h" +#include "util_altivec.h" // Swaps two variables (used for altivec registers) #define SWAP(a,b) \ do { \ @@ -66,12 +66,8 @@ do { \ } -#ifdef CONFIG_DARWIN -#define FOUROF(a) (a) -#else -// slower, for dumb non-apple GCC -#define FOUROF(a) {a,a,a,a} -#endif +#define 
FOUROF(a) AVV(a,a,a,a) + int dct_quantize_altivec(MpegEncContext* s, DCTELEM* data, int n, int qscale, int* overflow) @@ -79,8 +75,8 @@ int dct_quantize_altivec(MpegEncContext* s, int lastNonZero; vector float row0, row1, row2, row3, row4, row5, row6, row7; vector float alt0, alt1, alt2, alt3, alt4, alt5, alt6, alt7; - const_vector float zero = (const_vector float)FOUROF(0.); - // used after quantise step + const vector float zero = (const vector float)FOUROF(0.); + // used after quantize step int oldBaseValue = 0; // Load the data into the row/alt vectors @@ -258,7 +254,7 @@ int dct_quantize_altivec(MpegEncContext* s, } } - // perform the quantise step, using the floating point data + // perform the quantize step, using the floating point data // still in the row/alt registers { const int* biasAddr; @@ -474,7 +470,7 @@ int dct_quantize_altivec(MpegEncContext* s, data[0] = (oldBaseValue + 4) >> 3; } - // We handled the tranpose permutation above and we don't + // We handled the transpose permutation above and we don't // need to permute the "no" permutation case. if ((lastNonZero > 0) && (s->dsp.idct_permutation_type != FF_TRANSPOSE_IDCT_PERM) && @@ -486,7 +482,6 @@ int dct_quantize_altivec(MpegEncContext* s, return lastNonZero; } -#undef FOUROF /* AltiVec version of dct_unquantize_h263 @@ -515,25 +510,25 @@ POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1); }else qadd = 0; i = 1; - nCoeffs= 63; //does not allways use zigzag table + nCoeffs= 63; //does not always use zigzag table } else { i = 0; nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; } { - register const_vector signed short vczero = (const_vector signed short)vec_splat_s16(0); - short __attribute__ ((aligned(16))) qmul8[] = + register const vector signed short vczero = (const vector signed short)vec_splat_s16(0); + DECLARE_ALIGNED_16(short, qmul8[]) = { qmul, qmul, qmul, qmul, qmul, qmul, qmul, qmul }; - short __attribute__ ((aligned(16))) qadd8[] = + DECLARE_ALIGNED_16(short, qadd8[]) = { qadd, qadd, qadd, qadd, qadd, qadd, qadd, qadd }; - short __attribute__ ((aligned(16))) nqadd8[] = + DECLARE_ALIGNED_16(short, nqadd8[]) = { -qadd, -qadd, -qadd, -qadd, -qadd, -qadd, -qadd, -qadd @@ -601,3 +596,50 @@ POWERPC_PERF_START_COUNT(altivec_dct_unquantize_h263_num, 1); } POWERPC_PERF_STOP_COUNT(altivec_dct_unquantize_h263_num, nCoeffs == 63); } + + +extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); +extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); + +void MPV_common_init_altivec(MpegEncContext *s) +{ + if ((mm_flags & MM_ALTIVEC) == 0) return; + + if (s->avctx->lowres==0) + { + if ((s->avctx->idct_algo == FF_IDCT_AUTO) || + (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) + { + s->dsp.idct_put = idct_put_altivec; + s->dsp.idct_add = idct_add_altivec; + s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; + } + } + + // Test to make sure that the dct required alignments are met. + if ((((long)(s->q_intra_matrix) & 0x0f) != 0) || + (((long)(s->q_inter_matrix) & 0x0f) != 0)) + { + av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned " + "to use AltiVec DCT. Reverting to non-AltiVec version.\n"); + return; + } + + if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) + { + av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned " + "to use AltiVec DCT. 
Reverting to non-AltiVec version.\n"); + return; + } + + + if ((s->avctx->dct_algo == FF_DCT_AUTO) || + (s->avctx->dct_algo == FF_DCT_ALTIVEC)) + { +#if 0 /* seems to cause trouble under some circumstances */ + s->dct_quantize = dct_quantize_altivec; +#endif + s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec; + s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec; + } +} diff --git a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c b/contrib/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c deleted file mode 100644 index c5e822f77..000000000 --- a/contrib/ffmpeg/libavcodec/ppc/mpegvideo_ppc.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2002 Dieter Shirley - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "../dsputil.h" -#include "../mpegvideo.h" -#include <time.h> - -#ifdef HAVE_ALTIVEC -#include "dsputil_altivec.h" -#endif - -extern int dct_quantize_altivec(MpegEncContext *s, - DCTELEM *block, int n, - int qscale, int *overflow); -extern void dct_unquantize_h263_altivec(MpegEncContext *s, - DCTELEM *block, int n, int qscale); - -extern void idct_put_altivec(uint8_t *dest, int line_size, int16_t *block); -extern void idct_add_altivec(uint8_t *dest, int line_size, int16_t *block); - - -void MPV_common_init_ppc(MpegEncContext *s) -{ -#ifdef HAVE_ALTIVEC - if (has_altivec()) - { - if (s->avctx->lowres==0) - { - if ((s->avctx->idct_algo == FF_IDCT_AUTO) || - (s->avctx->idct_algo == FF_IDCT_ALTIVEC)) - { - s->dsp.idct_put = idct_put_altivec; - s->dsp.idct_add = idct_add_altivec; - s->dsp.idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; - } - } - - // Test to make sure that the dct required alignments are met. - if ((((long)(s->q_intra_matrix) & 0x0f) != 0) || - (((long)(s->q_inter_matrix) & 0x0f) != 0)) - { - av_log(s->avctx, AV_LOG_INFO, "Internal Error: q-matrix blocks must be 16-byte aligned " - "to use Altivec DCT. Reverting to non-altivec version.\n"); - return; - } - - if (((long)(s->intra_scantable.inverse) & 0x0f) != 0) - { - av_log(s->avctx, AV_LOG_INFO, "Internal Error: scan table blocks must be 16-byte aligned " - "to use Altivec DCT. 
Reverting to non-altivec version.\n"); - return; - } - - - if ((s->avctx->dct_algo == FF_DCT_AUTO) || - (s->avctx->dct_algo == FF_DCT_ALTIVEC)) - { -#if 0 /* seems to cause trouble under some circumstances */ - s->dct_quantize = dct_quantize_altivec; -#endif - s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec; - s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec; - } - } else -#endif - { - /* Non-AltiVec PPC optimisations here */ - } -} - diff --git a/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c b/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c index b15672ffe..8770f05f5 100644 --- a/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/snow_altivec.c @@ -1,5 +1,5 @@ /* - * Altivec optimized snow DSP utils + * AltiVec-optimized snow DSP utils * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org> * * This file is part of FFmpeg. @@ -17,15 +17,13 @@ * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * - * */ -#include "../dsputil.h" +#include "dsputil.h" #include "gcc_fixes.h" #include "dsputil_altivec.h" -#include "../snow.h" +#include "snow.h" #undef NDEBUG #include <assert.h> @@ -60,57 +58,56 @@ static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line) //altivec code -void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width) +void ff_snow_horizontal_compose97i_altivec(IDWTELEM *b, int width) { +#if 0 const int w2= (width+1)>>1; - DECLARE_ALIGNED_16(DWTELEM, temp[(width>>1)]); + DECLARE_ALIGNED_16(IDWTELEM, temp[(width>>1)]); const int w_l= (width>>1); const int w_r= w2 - 1; int i; - vector signed int t1, t2, x, y, tmp1, tmp2; - vector signed int *vbuf, *vtmp; + vector signed short t1, t2, x, y, tmp1, tmp2; + vector signed short *vbuf, *vtmp; vector unsigned char align; - - { // Lift 0 - DWTELEM * const ref = b + w2 - 1; - DWTELEM b_0 = b[0]; - vbuf = (vector signed int *)b; + IDWTELEM * const ref = b + w2 - 1; + IDWTELEM b_0 = b[0]; + vector signed short v7 = vec_splat_s16(7); + vbuf = (vector signed short *)b; tmp1 = vec_ld (0, ref); align = vec_lvsl (0, ref); tmp2 = vec_ld (15, ref); - t1= vec_perm(tmp1, tmp2, align); - - i = 0; + t1 = vec_perm(tmp1, tmp2, align); for (i=0; i<w_l-15; i+=16) { #if 0 - b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3); +/* b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3); b[i+1] = b[i+1] - ((3 * (ref[i+1] + ref[i+2]) + 4) >> 3); b[i+2] = b[i+2] - ((3 * (ref[i+2] + ref[i+3]) + 4) >> 3); - b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3); + b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3);*/ + b[i+0] = b[i+0] + ((7 * (ref[i+0] + ref[i+1])-1) >> 8); #else - tmp1 = vec_ld (0, ref+4+i); - tmp2 = vec_ld (15, ref+4+i); + tmp1 = vec_ld (0, ref+8+i); + tmp2 = vec_ld (15, ref+8+i); t2 = vec_perm(tmp1, tmp2, align); - y = vec_add(t1,vec_sld(t1,t2,4)); - y = vec_add(vec_add(y,y),y); + y = vec_add(t1, vec_sld(t1,t2,2)); +// y = vec_add(vec_add(y,y),y); - tmp1 = vec_ld (0, ref+8+i); + tmp1 = vec_ld (0, ref+12+i); y = vec_add(y, vec_splat_s32(4)); y = vec_sra(y, vec_splat_u32(3)); - tmp2 = vec_ld (15, ref+8+i); + tmp2 = vec_ld (15, ref+12+i); *vbuf = vec_sub(*vbuf, y); - t1=t2; + t1 = t2; vbuf++; @@ -164,6 +161,7 @@ void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width) vbuf++; #endif + } snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); @@ -365,6 +363,7 @@ void 
ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width) } } +#endif } void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width) @@ -524,7 +523,7 @@ static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc, vector signed int *v = (vector signed int *)vbuf, *d; for(y=0; y<b_h; y++){ - //FIXME ugly missue of obmc_stride + //FIXME ugly misuse of obmc_stride uint8_t *obmc1= obmc + y*obmc_stride; uint8_t *obmc2= obmc1+ (obmc_stride>>1); @@ -590,7 +589,7 @@ static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc, vector signed int *v = (vector signed int *)vbuf, *d; for(y=0; y<b_h; y++){ - //FIXME ugly missue of obmc_stride + //FIXME ugly misuse of obmc_stride uint8_t *obmc1= obmc + y*obmc_stride; uint8_t *obmc2= obmc1+ (obmc_stride>>1); @@ -673,7 +672,7 @@ static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc, vector signed int *v = (vector signed int *)vbuf, *d; for(y=0; y<b_h; y++){ - //FIXME ugly missue of obmc_stride + //FIXME ugly misuse of obmc_stride uint8_t *obmc1= obmc + y*obmc_stride; uint8_t *obmc2= obmc1+ (obmc_stride>>1); @@ -719,7 +718,7 @@ static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc, vector signed int *v = (vector signed int *)vbuf, *d; for(y=0; y<b_h; y++){ - //FIXME ugly missue of obmc_stride + //FIXME ugly misuse of obmc_stride uint8_t *obmc1= obmc + y*obmc_stride; uint8_t *obmc2= obmc1+ (obmc_stride>>1); @@ -782,7 +781,9 @@ void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride, void snow_init_altivec(DSPContext* c, AVCodecContext *avctx) { +#if 0 c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec; c->vertical_compose97i = ff_snow_vertical_compose97i_altivec; c->inner_add_yblock = ff_snow_inner_add_yblock_altivec; +#endif } diff --git a/contrib/ffmpeg/libavcodec/ppc/types_altivec.h b/contrib/ffmpeg/libavcodec/ppc/types_altivec.h index f29026e04..6d41a928b 100644 --- a/contrib/ffmpeg/libavcodec/ppc/types_altivec.h +++ b/contrib/ffmpeg/libavcodec/ppc/types_altivec.h @@ -18,6 +18,9 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#ifndef FFMPEG_TYPES_ALTIVEC_H +#define FFMPEG_TYPES_ALTIVEC_H + /*********************************************************************** * Vector types **********************************************************************/ @@ -39,3 +42,5 @@ #define zero_s16v (vec_s16_t) zerov #define zero_u32v (vec_u32_t) zerov #define zero_s32v (vec_s32_t) zerov + +#endif /* FFMPEG_TYPES_ALTIVEC_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/util_altivec.h b/contrib/ffmpeg/libavcodec/ppc/util_altivec.h new file mode 100644 index 000000000..6a8afb1b2 --- /dev/null +++ b/contrib/ffmpeg/libavcodec/ppc/util_altivec.h @@ -0,0 +1,105 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file util_altivec.h + * Contains misc utility macros and inline functions + */ + +#ifndef FFMPEG_UTIL_ALTIVEC_H +#define FFMPEG_UTIL_ALTIVEC_H + +#include <stdint.h> + +#include "config.h" + +#ifdef HAVE_ALTIVEC_H +#include <altivec.h> +#endif + +// used to build registers permutation vectors (vcprm) +// the 's' are for words in the _s_econd vector +#define WORD_0 0x00,0x01,0x02,0x03 +#define WORD_1 0x04,0x05,0x06,0x07 +#define WORD_2 0x08,0x09,0x0a,0x0b +#define WORD_3 0x0c,0x0d,0x0e,0x0f +#define WORD_s0 0x10,0x11,0x12,0x13 +#define WORD_s1 0x14,0x15,0x16,0x17 +#define WORD_s2 0x18,0x19,0x1a,0x1b +#define WORD_s3 0x1c,0x1d,0x1e,0x1f + +#define vcprm(a,b,c,d) (const vector unsigned char)AVV(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d) +#define vcii(a,b,c,d) (const vector float)AVV(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) + +// vcprmle is used to keep the same index as in the SSE version. +// it's the same as vcprm, with the index inversed +// ('le' is Little Endian) +#define vcprmle(a,b,c,d) vcprm(d,c,b,a) + +// used to build inverse/identity vectors (vcii) +// n is _n_egative, p is _p_ositive +#define FLOAT_n -1. +#define FLOAT_p 1. + + +// Transpose 8x8 matrix of 16-bit elements (in-place) +#define TRANSPOSE8(a,b,c,d,e,f,g,h) \ +do { \ + vector signed short A1, B1, C1, D1, E1, F1, G1, H1; \ + vector signed short A2, B2, C2, D2, E2, F2, G2, H2; \ + \ + A1 = vec_mergeh (a, e); \ + B1 = vec_mergel (a, e); \ + C1 = vec_mergeh (b, f); \ + D1 = vec_mergel (b, f); \ + E1 = vec_mergeh (c, g); \ + F1 = vec_mergel (c, g); \ + G1 = vec_mergeh (d, h); \ + H1 = vec_mergel (d, h); \ + \ + A2 = vec_mergeh (A1, E1); \ + B2 = vec_mergel (A1, E1); \ + C2 = vec_mergeh (B1, F1); \ + D2 = vec_mergel (B1, F1); \ + E2 = vec_mergeh (C1, G1); \ + F2 = vec_mergel (C1, G1); \ + G2 = vec_mergeh (D1, H1); \ + H2 = vec_mergel (D1, H1); \ + \ + a = vec_mergeh (A2, E2); \ + b = vec_mergel (A2, E2); \ + c = vec_mergeh (B2, F2); \ + d = vec_mergel (B2, F2); \ + e = vec_mergeh (C2, G2); \ + f = vec_mergel (C2, G2); \ + g = vec_mergeh (D2, H2); \ + h = vec_mergel (D2, H2); \ +} while (0) + + +/** \brief loads unaligned vector \a *src with offset \a offset + and returns it */ +static inline vector unsigned char unaligned_load(int offset, uint8_t *src) +{ + register vector unsigned char first = vec_ld(offset, src); + register vector unsigned char second = vec_ld(offset+15, src); + register vector unsigned char mask = vec_lvsl(offset, src); + return vec_perm(first, second, mask); +} + +#endif /* FFMPEG_UTIL_ALTIVEC_H */ diff --git a/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c b/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c index 114c9d41f..87bef808e 100644 --- a/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c +++ b/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c @@ -17,14 +17,13 @@ * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - * */ -#include "../dsputil.h" +#include "dsputil.h" #include "gcc_fixes.h" -#include "dsputil_altivec.h" +#include "util_altivec.h" // main steps of 8x8 transform #define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \ @@ -139,7 +138,6 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM 
diff --git a/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c b/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c
index 114c9d41f..87bef808e 100644
--- a/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c
+++ b/contrib/ffmpeg/libavcodec/ppc/vc1dsp_altivec.c
@@ -17,14 +17,13 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with FFmpeg; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- *
  */
 
-#include "../dsputil.h"
+#include "dsputil.h"
 
 #include "gcc_fixes.h"
 
-#include "dsputil_altivec.h"
+#include "util_altivec.h"
 
 // main steps of 8x8 transform
 #define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \
@@ -139,7 +138,6 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
     vector signed int t0, t1, t2, t3, t4, t5, t6, t7;
     const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4));
     const vector unsigned int vec_7 = vec_splat_u32(7);
-    const vector unsigned int vec_5 = vec_splat_u32(5);
     const vector unsigned int vec_4 = vec_splat_u32(4);
     const vector signed int vec_4s = vec_splat_s32(4);
     const vector unsigned int vec_3 = vec_splat_u32(3);
@@ -229,7 +227,7 @@ static void vc1_inv_trans_8x8_altivec(DCTELEM block[64])
 
 /** Do inverse transform on 8x4 part of block
 */
-static void vc1_inv_trans_8x4_altivec(DCTELEM block[64], int n)
+static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block)
 {
     vector signed short src0, src1, src2, src3, src4, src5, src6, src7;
     vector signed int s0, s1, s2, s3, s4, s5, s6, s7;
@@ -243,6 +241,9 @@ static void vc1_inv_trans_8x4_altivec(DCTELEM block[64], int n)
     const vector unsigned int vec_3 = vec_splat_u32(3);
     const vector unsigned int vec_2 = vec_splat_u32(2);
     const vector unsigned int vec_1 = vec_splat_u32(1);
+    vector unsigned char tmp;
+    vector signed short tmp2, tmp3;
+    vector unsigned char perm0, perm1, p0, p1, p;
 
     src0 = vec_ld(  0, block);
     src1 = vec_ld( 16, block);
@@ -284,51 +285,42 @@ static void vc1_inv_trans_8x4_altivec(DCTELEM block[64], int n)
     src7 = vec_pack(sF, s7);
     TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7);
 
-    if(!n){ // upper half of block
-        s0 = vec_unpackh(src0);
-        s1 = vec_unpackh(src1);
-        s2 = vec_unpackh(src2);
-        s3 = vec_unpackh(src3);
-        s8 = vec_unpackl(src0);
-        s9 = vec_unpackl(src1);
-        sA = vec_unpackl(src2);
-        sB = vec_unpackl(src3);
-        STEP4(s0, s1, s2, s3, vec_64);
-        SHIFT_VERT4(s0, s1, s2, s3);
-        STEP4(s8, s9, sA, sB, vec_64);
-        SHIFT_VERT4(s8, s9, sA, sB);
-        src0 = vec_pack(s0, s8);
-        src1 = vec_pack(s1, s9);
-        src2 = vec_pack(s2, sA);
-        src3 = vec_pack(s3, sB);
+    s0 = vec_unpackh(src0);
+    s1 = vec_unpackh(src1);
+    s2 = vec_unpackh(src2);
+    s3 = vec_unpackh(src3);
+    s8 = vec_unpackl(src0);
+    s9 = vec_unpackl(src1);
+    sA = vec_unpackl(src2);
+    sB = vec_unpackl(src3);
+    STEP4(s0, s1, s2, s3, vec_64);
+    SHIFT_VERT4(s0, s1, s2, s3);
+    STEP4(s8, s9, sA, sB, vec_64);
+    SHIFT_VERT4(s8, s9, sA, sB);
+    src0 = vec_pack(s0, s8);
+    src1 = vec_pack(s1, s9);
+    src2 = vec_pack(s2, sA);
+    src3 = vec_pack(s3, sB);
+
+    p0 = vec_lvsl (0, dest);
+    p1 = vec_lvsl (stride, dest);
+    p = vec_splat_u8 (-1);
+    perm0 = vec_mergeh (p, p0);
+    perm1 = vec_mergeh (p, p1);
 
-        vec_st(src0, 0, block);
-        vec_st(src1, 16, block);
-        vec_st(src2, 32, block);
-        vec_st(src3, 48, block);
-    } else { //lower half of block
-        s0 = vec_unpackh(src4);
-        s1 = vec_unpackh(src5);
-        s2 = vec_unpackh(src6);
-        s3 = vec_unpackh(src7);
-        s8 = vec_unpackl(src4);
-        s9 = vec_unpackl(src5);
-        sA = vec_unpackl(src6);
-        sB = vec_unpackl(src7);
-        STEP4(s0, s1, s2, s3, vec_64);
-        SHIFT_VERT4(s0, s1, s2, s3);
-        STEP4(s8, s9, sA, sB, vec_64);
-        SHIFT_VERT4(s8, s9, sA, sB);
-        src4 = vec_pack(s0, s8);
-        src5 = vec_pack(s1, s9);
-        src6 = vec_pack(s2, sA);
-        src7 = vec_pack(s3, sB);
+#define ADD(dest,src,perm) \
+    /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
+    tmp = vec_ld (0, dest); \
+    tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm); \
+    tmp3 = vec_adds (tmp2, src); \
+    tmp = vec_packsu (tmp3, tmp3); \
+    vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest); \
+    vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest);
 
-        vec_st(src4, 64, block);
-        vec_st(src5, 80, block);
-        vec_st(src6, 96, block);
-        vec_st(src7,112, block);
-    }
+    ADD (dest, src0, perm0) dest += stride;
+    ADD (dest, src1, perm1) dest += stride;
+    ADD (dest, src2, perm0) dest += stride;
+    ADD (dest, src3, perm1)
 }
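Editor's note on the vc1dsp_altivec.c hunks above: with the new signature, vc1_inv_trans_8x4_altivec() no longer writes the transformed coefficients back into block[]; the ADD() macro loads eight destination pixels, widens them to 16 bits, adds the residual with vec_adds, re-packs with unsigned saturation via vec_packsu, and stores the result with two vec_ste word stores. A scalar sketch of the same per-row operation follows; it is illustrative only, not FFmpeg code.

/* Scalar sketch (not from the commit) of what one ADD(dest, srcN, permN)
 * invocation effectively does: add a row of 16-bit inverse-transform
 * residuals onto the existing 8-bit destination pixels, clamping to 0..255
 * just as the vec_adds/vec_packsu pair does in the AltiVec version. */
#include <stdint.h>

static uint8_t clip_uint8(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
}

static void add_residual_row(uint8_t *dest, const int16_t *residual)
{
    for (int i = 0; i < 8; i++)
        dest[i] = clip_uint8(dest[i] + residual[i]);
}

/* The 8x4 transform then walks the four rows of the block:
 *
 *     add_residual_row(dest, row0); dest += stride;
 *     add_residual_row(dest, row1); dest += stride;
 *     add_residual_row(dest, row2); dest += stride;
 *     add_residual_row(dest, row3);
 */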
