author    | Miguel Freitas <miguelfreitas@users.sourceforge.net> | 2003-01-31 18:29:43 +0000
committer | Miguel Freitas <miguelfreitas@users.sourceforge.net> | 2003-01-31 18:29:43 +0000
commit    | 5350f2b7701f01bc4f234d3971fb8a623a8cd72a (patch)
tree      | 5f6cd350778863ad8d2612bce4ac2f6270919115 /src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
parent    | 8b0e8647a0d0c279b6a355362452dff4bd6f5c05 (diff)
update ffmpeg
CVS patchset: 4068
CVS date: 2003/01/31 18:29:43
Diffstat (limited to 'src/libffmpeg/libavcodec/ppc/dsputil_altivec.c')
-rw-r--r-- | src/libffmpeg/libavcodec/ppc/dsputil_altivec.c | 902 |
1 files changed, 861 insertions, 41 deletions
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c index 5f14ed0eb..dc62e70f4 100644 --- a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c +++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2002 Brian Foley * Copyright (c) 2002 Dieter Shirley + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -20,21 +21,39 @@ #include "../dsputil.h" #include "dsputil_altivec.h" -#if CONFIG_DARWIN +#ifdef CONFIG_DARWIN #include <sys/sysctl.h> -#endif +#else /* CONFIG_DARWIN */ +#include <signal.h> +#include <setjmp.h> + +static sigjmp_buf jmpbuf; +static volatile sig_atomic_t canjump = 0; + +static void sigill_handler (int sig) +{ + if (!canjump) { + signal (sig, SIG_DFL); + raise (sig); + } + + canjump = 0; + siglongjmp (jmpbuf, 1); +} +#endif /* CONFIG_DARWIN */ int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int s, i; - vector unsigned char *tv, zero; + int i; + int s __attribute__((aligned(16))); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); + vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; s = 0; - zero = vec_splat_u8(0); - sad = vec_splat_u32(0); + sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: @@ -72,16 +91,17 @@ int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int s, i; - vector unsigned char *tv, zero; + int i; + int s __attribute__((aligned(16))); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); + vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix3v, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; uint8_t *pix3 = pix2 + line_size; s = 0; - zero = vec_splat_u8(0); - sad = vec_splat_u32(0); + sad = (vector unsigned int)vec_splat_u32(0); /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one @@ -131,20 +151,21 @@ int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int s, i; + int i; + int s __attribute__((aligned(16))); uint8_t *pix3 = pix2 + line_size; - vector unsigned char *tv, avgv, t5, zero; + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); + const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); + vector unsigned char *tv, avgv, t5; vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; - vector unsigned short avghv, avglv, two; + vector unsigned short avghv, avglv; vector unsigned short t1, t2, t3, t4; vector unsigned int sad; vector signed int sumdiffs; - zero = vec_splat_u8(0); - two = vec_splat_u16(2); - sad = vec_splat_u32(0); + sad = (vector unsigned int)vec_splat_u32(0); s = 0; @@ -231,14 +252,15 @@ int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int i, s; + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector 
unsigned char perm1, perm2, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad, zero; + vector unsigned int sad; vector signed int sumdiffs; - zero = (vector unsigned int) (0); - sad = (vector unsigned int) (0); + sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { @@ -272,15 +294,20 @@ int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int i, s; + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad, zero; + vector unsigned int sad; vector signed int sumdiffs; - zero = (vector unsigned int) (0); - sad = (vector unsigned int) (0); - permclear = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); + sad = (vector unsigned int)vec_splat_u32(0); +#ifdef CONFIG_DARWIN + permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); +#else + permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; +#endif for(i=0;i<8;i++) { /* Read potentially unaligned pixels into t1 and t2 @@ -315,14 +342,15 @@ int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_norm1_altivec(uint8_t *pix, int line_size) { - int s, i; - vector unsigned char *tv, zero; + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); + vector unsigned char *tv; vector unsigned char pixv; vector unsigned int sv; vector signed int sum; - - zero = vec_splat_u8(0); - sv = vec_splat_u32(0); + + sv = (vector unsigned int)vec_splat_u32(0); s = 0; for (i = 0; i < 16; i++) { @@ -343,18 +371,127 @@ int pix_norm1_altivec(uint8_t *pix, int line_size) return s; } -int pix_sum_altivec(UINT8 * pix, int line_size) +/** + * Sum of Squared Errors for a 8x8 block. + * AltiVec-enhanced. + * It's the pix_abs8x8_altivec code above w/ squaring added. + */ +int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) { + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); + vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; + vector unsigned char t1, t2, t3,t4, t5; + vector unsigned int sum; + vector signed int sumsqr; + + sum = (vector unsigned int)vec_splat_u32(0); +#ifdef CONFIG_DARWIN + permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); +#else + permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; +#endif + + for(i=0;i<8;i++) { + /* Read potentially unaligned pixels into t1 and t2 + Since we're reading 16 pixels, and actually only want 8, + mask out the last 8 pixels. The 0s don't change the sum. */ + perm1 = vec_lvsl(0, pix1); + pix1v = (vector unsigned char *) pix1; + perm2 = vec_lvsl(0, pix2); + pix2v = (vector unsigned char *) pix2; + t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); + t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); + /* + Since we want to use unsigned chars, we can take advantage + of the fact that abs(a-b)^2 = (a-b)^2. 
+ */ + + /* Calculate abs differences vector */ + t3 = vec_max(t1, t2); + t4 = vec_min(t1, t2); + t5 = vec_sub(t3, t4); + + /* Square the values and add them to our sum */ + sum = vec_msum(t5, t5, sum); + + pix1 += line_size; + pix2 += line_size; + } + + /* Sum up the four partial sums, and put the result into s */ + sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); + sumsqr = vec_splat(sumsqr, 3); + vec_ste(sumsqr, 0, &s); + + return s; +} + +/** + * Sum of Squared Errors for a 16x16 block. + * AltiVec-enhanced. + * It's the pix_abs16x16_altivec code above w/ squaring added. + */ +int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) +{ + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); + vector unsigned char perm1, perm2, *pix1v, *pix2v; + vector unsigned char t1, t2, t3,t4, t5; + vector unsigned int sum; + vector signed int sumsqr; + + sum = (vector unsigned int)vec_splat_u32(0); + + for(i=0;i<16;i++) { + /* Read potentially unaligned pixels into t1 and t2 */ + perm1 = vec_lvsl(0, pix1); + pix1v = (vector unsigned char *) pix1; + perm2 = vec_lvsl(0, pix2); + pix2v = (vector unsigned char *) pix2; + t1 = vec_perm(pix1v[0], pix1v[1], perm1); + t2 = vec_perm(pix2v[0], pix2v[1], perm2); + + /* + Since we want to use unsigned chars, we can take advantage + of the fact that abs(a-b)^2 = (a-b)^2. + */ + + /* Calculate abs differences vector */ + t3 = vec_max(t1, t2); + t4 = vec_min(t1, t2); + t5 = vec_sub(t3, t4); + + /* Square the values and add them to our sum */ + sum = vec_msum(t5, t5, sum); + + pix1 += line_size; + pix2 += line_size; + } + + /* Sum up the four partial sums, and put the result into s */ + sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); + sumsqr = vec_splat(sumsqr, 3); + vec_ste(sumsqr, 0, &s); + + return s; +} + +int pix_sum_altivec(UINT8 * pix, int line_size) +{ + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm, *pixv; vector unsigned char t1; - vector unsigned int sad, zero; + vector unsigned int sad; vector signed int sumdiffs; - int s, i; - - zero = (vector unsigned int) (0); - sad = (vector unsigned int) (0); + int i; + int s __attribute__((aligned(16))); + + sad = (vector unsigned int)vec_splat_u32(0); for (i = 0; i < 16; i++) { /* Read the potentially unaligned 16 pixels into t1 */ @@ -380,7 +517,7 @@ void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_s { int i; vector unsigned char perm, bytes, *pixv; - vector unsigned char zero = (vector unsigned char) (0); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector signed short shorts; for(i=0;i<8;i++) @@ -407,7 +544,7 @@ void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1, { int i; vector unsigned char perm, bytes, *pixv; - vector unsigned char zero = (vector unsigned char) (0); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector signed short shorts1, shorts2; for(i=0;i<4;i++) @@ -474,10 +611,675 @@ void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1, } } +int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { + return pix_abs16x16_altivec(a,b,stride); +} + +int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { + return pix_abs8x8_altivec(a,b,stride); +} + +void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; + for(i=0; i+7<w; 
i++){ + dst[i+0] += src[i+0]; + dst[i+1] += src[i+1]; + dst[i+2] += src[i+2]; + dst[i+3] += src[i+3]; + dst[i+4] += src[i+4]; + dst[i+5] += src[i+5]; + dst[i+6] += src[i+6]; + dst[i+7] += src[i+7]; + } + for(; i<w; i++) + dst[i+0] += src[i+0]; +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char vdst, vsrc; + + /* dst and src are 16 bytes-aligned (guaranteed) */ + for(i = 0 ; (i + 15) < w ; i++) + { + vdst = vec_ld(i << 4, (unsigned char*)dst); + vsrc = vec_ld(i << 4, (unsigned char*)src); + vdst = vec_add(vsrc, vdst); + vec_st(vdst, i << 4, (unsigned char*)dst); + } + /* if w is not a multiple of 16 */ + for (; (i < w) ; i++) + { + dst[i] = src[i]; + } +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 16) == 0) */ +void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_pixels16_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; + +POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1); + + for(i=0; i<h; i++) { + *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l); + *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l); + *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l); + *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l); + pixels+=line_size; + block +=line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register vector unsigned char pixelsv1, pixelsv2; + register vector unsigned char perm = vec_lvsl(0, pixels); + int i; + +POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1); + + for(i=0; i<h; i++) { + pixelsv1 = vec_ld(0, (unsigned char*)pixels); + pixelsv2 = vec_ld(16, (unsigned char*)pixels); + vec_st(vec_perm(pixelsv1, pixelsv2, perm), + 0, (unsigned char*)block); + pixels+=line_size; + block +=line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 16) == 0) */ +#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_avg_pixels16_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; + +POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); + + for(i=0; i<h; i++) { + op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l)); + op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l)); + op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l)); + op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l)); + pixels+=line_size; + block +=line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; + register vector unsigned char perm = vec_lvsl(0, pixels); + int i; + +POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); + + for(i=0; i<h; i++) { + pixelsv1 = vec_ld(0, (unsigned char*)pixels); + pixelsv2 = vec_ld(16, (unsigned char*)pixels); + blockv = vec_ld(0, block); + pixelsv = vec_perm(pixelsv1, pixelsv2, perm); + blockv = vec_avg(blockv,pixelsv); + vec_st(blockv, 0, (unsigned char*)block); + pixels+=line_size; + block +=line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next 
one assumes that ((line_size % 8) == 0) */ +void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_avg_pixels8_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; +POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1); + for (i = 0; i < h; i++) { + *((uint32_t *) (block)) = + (((*((uint32_t *) (block))) | + ((((const struct unaligned_32 *) (pixels))->l))) - + ((((*((uint32_t *) (block))) ^ + ((((const struct unaligned_32 *) (pixels))-> + l))) & 0xFEFEFEFEUL) >> 1)); + *((uint32_t *) (block + 4)) = + (((*((uint32_t *) (block + 4))) | + ((((const struct unaligned_32 *) (pixels + 4))->l))) - + ((((*((uint32_t *) (block + 4))) ^ + ((((const struct unaligned_32 *) (pixels + + 4))-> + l))) & 0xFEFEFEFEUL) >> 1)); + pixels += line_size; + block += line_size; + } +POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; + int i; + +POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1); + + for (i = 0; i < h; i++) { + /* + block is 8 bytes-aligned, so we're either in the + left block (16 bytes-aligned) or in the right block (not) + */ + int rightside = ((unsigned long)block & 0x0000000F); + + blockv = vec_ld(0, block); + pixelsv1 = vec_ld(0, (unsigned char*)pixels); + pixelsv2 = vec_ld(16, (unsigned char*)pixels); + pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); + + if (rightside) + { + pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); + } + else + { + pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); + } + + blockv = vec_avg(blockv, pixelsv); + + vec_st(blockv, 0, block); + + pixels += line_size; + block += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 8) == 0) */ +void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_pixels8_xy2_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int j; +POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1); + for (j = 0; j < 2; j++) { + int i; + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + const uint32_t b = + (((const struct unaligned_32 *) (pixels + 1))->l); + uint32_t l0 = + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; + uint32_t h0 = + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = (((const struct unaligned_32 *) (pixels))->l); + b = (((const struct unaligned_32 *) (pixels + 1))->l); + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char + pixelsv1, pixelsv2, + pixelsavg; + register vector unsigned char + 
blockv, temp1, temp2; + register vector unsigned short + pixelssum1, pixelssum2, temp3; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vctwo); + +POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1); + for (i = 0; i < h ; i++) { + int rightside = ((unsigned long)block & 0x0000000F); + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + pixelssum1 = vec_add(pixelssum2, vctwo); + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); + + if (rightside) + { + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); + } + else + { + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); + } + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 8) == 0) */ +void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int j; +POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); + for (j = 0; j < 2; j++) { + int i; + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + const uint32_t b = + (((const struct unaligned_32 *) (pixels + 1))->l); + uint32_t l0 = + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; + uint32_t h0 = + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = (((const struct unaligned_32 *) (pixels))->l); + b = (((const struct unaligned_32 *) (pixels + 1))->l); + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } + 
+POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char + pixelsv1, pixelsv2, + pixelsavg; + register vector unsigned char + blockv, temp1, temp2; + register vector unsigned short + pixelssum1, pixelssum2, temp3; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vcone); + +POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); + for (i = 0; i < h ; i++) { + int rightside = ((unsigned long)block & 0x0000000F); + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + pixelssum1 = vec_add(pixelssum2, vcone); + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); + + if (rightside) + { + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); + } + else + { + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); + } + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 16) == 0) */ +void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_pixels16_xy2_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int j; +POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); + for (j = 0; j < 4; j++) { + int i; + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + const uint32_t b = + (((const struct unaligned_32 *) (pixels + 1))->l); + uint32_t l0 = + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; + uint32_t h0 = + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = (((const struct unaligned_32 *) (pixels))->l); + b = (((const struct unaligned_32 *) (pixels + 1))->l); + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) 
+ 0x02020202UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char + pixelsv1, pixelsv2, pixelsv3, pixelsv4; + register vector unsigned char + blockv, temp1, temp2; + register vector unsigned short + pixelssum1, pixelssum2, temp3, + pixelssum3, pixelssum4, temp4; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum3 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum3 = vec_add(pixelssum3, vctwo); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vctwo); + +POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); + for (i = 0; i < h ; i++) { + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + + pixelssum4 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp4 = vec_add(pixelssum3, pixelssum4); + temp4 = vec_sra(temp4, vctwo); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + + pixelssum3 = vec_add(pixelssum4, vctwo); + pixelssum1 = vec_add(pixelssum2, vctwo); + + blockv = vec_packsu(temp3, temp4); + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 16) == 0) */ +void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int j; +POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); + for (j = 0; j < 4; j++) { + int i; + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + const uint32_t b = + (((const struct unaligned_32 *) (pixels + 1))->l); + uint32_t l0 = + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; + uint32_t h0 = + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = 
(((const struct unaligned_32 *) (pixels))->l); + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = (((const struct unaligned_32 *) (pixels))->l); + b = (((const struct unaligned_32 *) (pixels + 1))->l); + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char + pixelsv1, pixelsv2, pixelsv3, pixelsv4; + register vector unsigned char + blockv, temp1, temp2; + register vector unsigned short + pixelssum1, pixelssum2, temp3, + pixelssum3, pixelssum4, temp4; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum3 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum3 = vec_add(pixelssum3, vcone); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vcone); + +POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); + for (i = 0; i < h ; i++) { + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + + pixelssum4 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp4 = vec_add(pixelssum3, pixelssum4); + temp4 = vec_sra(temp4, vctwo); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + + pixelssum3 = vec_add(pixelssum4, vcone); + pixelssum1 = vec_add(pixelssum2, vcone); + + blockv = vec_packsu(temp3, temp4); + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} int has_altivec(void) { -#if CONFIG_DARWIN +#ifdef CONFIG_DARWIN int sels[2] = {CTL_HW, HW_VECTORUNIT}; int has_vu = 0; size_t len = 
sizeof(has_vu); @@ -486,7 +1288,25 @@ int has_altivec(void) err = sysctl(sels, 2, &has_vu, &len, NULL, 0); if (err == 0) return (has_vu != 0); -#endif +#else /* CONFIG_DARWIN */ +/* no Darwin, do it the brute-force way */ +/* this is borrowed from the libmpeg2 library */ + { + signal (SIGILL, sigill_handler); + if (sigsetjmp (jmpbuf, 1)) { + signal (SIGILL, SIG_DFL); + } else { + canjump = 1; + + asm volatile ("mtspr 256, %0\n\t" + "vand %%v0, %%v0, %%v0" + : + : "r" (-1)); + + signal (SIGILL, SIG_DFL); + return 1; + } + } +#endif /* CONFIG_DARWIN */ return 0; } - |
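
Note on the runtime CPU detection added in this patch: on non-Darwin systems, has_altivec() now probes for AltiVec by installing a SIGILL handler and executing a single AltiVec instruction, a technique the diff credits to libmpeg2 (Darwin keeps the sysctl HW_VECTORUNIT query). Below is a minimal standalone sketch of that probe for reference; it assumes a 32-bit PowerPC target with GCC-style inline asm, and the probe_altivec() wrapper name is illustrative rather than part of the patch.

    #include <signal.h>
    #include <setjmp.h>

    static sigjmp_buf jmpbuf;
    static volatile sig_atomic_t canjump = 0;

    static void sigill_handler(int sig)
    {
        if (!canjump) {
            /* SIGILL not caused by our probe: restore default action and re-raise */
            signal(sig, SIG_DFL);
            raise(sig);
        }
        canjump = 0;
        /* the probe instruction trapped: jump back and report "no AltiVec" */
        siglongjmp(jmpbuf, 1);
    }

    int probe_altivec(void)
    {
        int result = 0;

        signal(SIGILL, sigill_handler);
        if (sigsetjmp(jmpbuf, 1) == 0) {
            canjump = 1;
            /* enable the vector unit (VRSAVE, SPR 256) and issue a trivial
               AltiVec instruction; on CPUs without AltiVec this raises SIGILL */
            asm volatile ("mtspr 256, %0\n\t"
                          "vand %%v0, %%v0, %%v0"
                          :
                          : "r" (-1));
            result = 1;
        }
        signal(SIGILL, SIG_DFL);
        return result;
    }

The same pattern appears inline in the patched has_altivec(); the handler re-raises any SIGILL that arrives before the probe is armed so unrelated illegal-instruction faults still terminate the process normally.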