author    | Miguel Freitas <miguelfreitas@users.sourceforge.net> | 2003-01-31 18:29:43 +0000
committer | Miguel Freitas <miguelfreitas@users.sourceforge.net> | 2003-01-31 18:29:43 +0000
commit    | 5350f2b7701f01bc4f234d3971fb8a623a8cd72a (patch)
tree      | 5f6cd350778863ad8d2612bce4ac2f6270919115 /src/libffmpeg/libavcodec/ppc/dsputil_altivec.c
parent    | 8b0e8647a0d0c279b6a355362452dff4bd6f5c05 (diff)
update ffmpeg
CVS patchset: 4068
CVS date: 2003/01/31 18:29:43
Diffstat (limited to 'src/libffmpeg/libavcodec/ppc/dsputil_altivec.c')
-rw-r--r-- | src/libffmpeg/libavcodec/ppc/dsputil_altivec.c | 902 |
1 files changed, 861 insertions, 41 deletions
diff --git a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c index 5f14ed0eb..dc62e70f4 100644 --- a/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c +++ b/src/libffmpeg/libavcodec/ppc/dsputil_altivec.c @@ -1,6 +1,7 @@ /* * Copyright (c) 2002 Brian Foley * Copyright (c) 2002 Dieter Shirley + * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -20,21 +21,39 @@ #include "../dsputil.h" #include "dsputil_altivec.h" -#if CONFIG_DARWIN +#ifdef CONFIG_DARWIN #include <sys/sysctl.h> -#endif +#else /* CONFIG_DARWIN */ +#include <signal.h> +#include <setjmp.h> + +static sigjmp_buf jmpbuf; +static volatile sig_atomic_t canjump = 0; + +static void sigill_handler (int sig) +{ + if (!canjump) { + signal (sig, SIG_DFL); + raise (sig); + } + + canjump = 0; + siglongjmp (jmpbuf, 1); +} +#endif /* CONFIG_DARWIN */ int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int s, i; - vector unsigned char *tv, zero; + int i; + int s __attribute__((aligned(16))); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); + vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix2iv, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; s = 0; - zero = vec_splat_u8(0); - sad = vec_splat_u32(0); + sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { /* Read unaligned pixels into our vectors. The vectors are as follows: @@ -72,16 +91,17 @@ int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int s, i; - vector unsigned char *tv, zero; + int i; + int s __attribute__((aligned(16))); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); + vector unsigned char *tv; vector unsigned char pix1v, pix2v, pix3v, avgv, t5; vector unsigned int sad; vector signed int sumdiffs; uint8_t *pix3 = pix2 + line_size; s = 0; - zero = vec_splat_u8(0); - sad = vec_splat_u32(0); + sad = (vector unsigned int)vec_splat_u32(0); /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one @@ -131,20 +151,21 @@ int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int s, i; + int i; + int s __attribute__((aligned(16))); uint8_t *pix3 = pix2 + line_size; - vector unsigned char *tv, avgv, t5, zero; + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); + const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2); + vector unsigned char *tv, avgv, t5; vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv; vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv; vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv; - vector unsigned short avghv, avglv, two; + vector unsigned short avghv, avglv; vector unsigned short t1, t2, t3, t4; vector unsigned int sad; vector signed int sumdiffs; - zero = vec_splat_u8(0); - two = vec_splat_u16(2); - sad = vec_splat_u32(0); + sad = (vector unsigned int)vec_splat_u32(0); s = 0; @@ -231,14 +252,15 @@ int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int i, s; + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector 
unsigned char perm1, perm2, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad, zero; + vector unsigned int sad; vector signed int sumdiffs; - zero = (vector unsigned int) (0); - sad = (vector unsigned int) (0); + sad = (vector unsigned int)vec_splat_u32(0); for(i=0;i<16;i++) { @@ -272,15 +294,20 @@ int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) { - int i, s; + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; vector unsigned char t1, t2, t3,t4, t5; - vector unsigned int sad, zero; + vector unsigned int sad; vector signed int sumdiffs; - zero = (vector unsigned int) (0); - sad = (vector unsigned int) (0); - permclear = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); + sad = (vector unsigned int)vec_splat_u32(0); +#ifdef CONFIG_DARWIN + permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); +#else + permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; +#endif for(i=0;i<8;i++) { /* Read potentially unaligned pixels into t1 and t2 @@ -315,14 +342,15 @@ int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size) int pix_norm1_altivec(uint8_t *pix, int line_size) { - int s, i; - vector unsigned char *tv, zero; + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); + vector unsigned char *tv; vector unsigned char pixv; vector unsigned int sv; vector signed int sum; - - zero = vec_splat_u8(0); - sv = vec_splat_u32(0); + + sv = (vector unsigned int)vec_splat_u32(0); s = 0; for (i = 0; i < 16; i++) { @@ -343,18 +371,127 @@ int pix_norm1_altivec(uint8_t *pix, int line_size) return s; } -int pix_sum_altivec(UINT8 * pix, int line_size) +/** + * Sum of Squared Errors for a 8x8 block. + * AltiVec-enhanced. + * It's the pix_abs8x8_altivec code above w/ squaring added. + */ +int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) { + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); + vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v; + vector unsigned char t1, t2, t3,t4, t5; + vector unsigned int sum; + vector signed int sumsqr; + + sum = (vector unsigned int)vec_splat_u32(0); +#ifdef CONFIG_DARWIN + permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0); +#else + permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0}; +#endif + + for(i=0;i<8;i++) { + /* Read potentially unaligned pixels into t1 and t2 + Since we're reading 16 pixels, and actually only want 8, + mask out the last 8 pixels. The 0s don't change the sum. */ + perm1 = vec_lvsl(0, pix1); + pix1v = (vector unsigned char *) pix1; + perm2 = vec_lvsl(0, pix2); + pix2v = (vector unsigned char *) pix2; + t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear); + t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear); + /* + Since we want to use unsigned chars, we can take advantage + of the fact that abs(a-b)^2 = (a-b)^2. 
+ */ + + /* Calculate abs differences vector */ + t3 = vec_max(t1, t2); + t4 = vec_min(t1, t2); + t5 = vec_sub(t3, t4); + + /* Square the values and add them to our sum */ + sum = vec_msum(t5, t5, sum); + + pix1 += line_size; + pix2 += line_size; + } + + /* Sum up the four partial sums, and put the result into s */ + sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); + sumsqr = vec_splat(sumsqr, 3); + vec_ste(sumsqr, 0, &s); + + return s; +} + +/** + * Sum of Squared Errors for a 16x16 block. + * AltiVec-enhanced. + * It's the pix_abs16x16_altivec code above w/ squaring added. + */ +int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size) +{ + int i; + int s __attribute__((aligned(16))); + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); + vector unsigned char perm1, perm2, *pix1v, *pix2v; + vector unsigned char t1, t2, t3,t4, t5; + vector unsigned int sum; + vector signed int sumsqr; + + sum = (vector unsigned int)vec_splat_u32(0); + + for(i=0;i<16;i++) { + /* Read potentially unaligned pixels into t1 and t2 */ + perm1 = vec_lvsl(0, pix1); + pix1v = (vector unsigned char *) pix1; + perm2 = vec_lvsl(0, pix2); + pix2v = (vector unsigned char *) pix2; + t1 = vec_perm(pix1v[0], pix1v[1], perm1); + t2 = vec_perm(pix2v[0], pix2v[1], perm2); + + /* + Since we want to use unsigned chars, we can take advantage + of the fact that abs(a-b)^2 = (a-b)^2. + */ + + /* Calculate abs differences vector */ + t3 = vec_max(t1, t2); + t4 = vec_min(t1, t2); + t5 = vec_sub(t3, t4); + + /* Square the values and add them to our sum */ + sum = vec_msum(t5, t5, sum); + + pix1 += line_size; + pix2 += line_size; + } + + /* Sum up the four partial sums, and put the result into s */ + sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero); + sumsqr = vec_splat(sumsqr, 3); + vec_ste(sumsqr, 0, &s); + + return s; +} + +int pix_sum_altivec(UINT8 * pix, int line_size) +{ + const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0); vector unsigned char perm, *pixv; vector unsigned char t1; - vector unsigned int sad, zero; + vector unsigned int sad; vector signed int sumdiffs; - int s, i; - - zero = (vector unsigned int) (0); - sad = (vector unsigned int) (0); + int i; + int s __attribute__((aligned(16))); + + sad = (vector unsigned int)vec_splat_u32(0); for (i = 0; i < 16; i++) { /* Read the potentially unaligned 16 pixels into t1 */ @@ -380,7 +517,7 @@ void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_s { int i; vector unsigned char perm, bytes, *pixv; - vector unsigned char zero = (vector unsigned char) (0); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector signed short shorts; for(i=0;i<8;i++) @@ -407,7 +544,7 @@ void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1, { int i; vector unsigned char perm, bytes, *pixv; - vector unsigned char zero = (vector unsigned char) (0); + const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0); vector signed short shorts1, shorts2; for(i=0;i<4;i++) @@ -474,10 +611,675 @@ void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1, } } +int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { + return pix_abs16x16_altivec(a,b,stride); +} + +int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) { + return pix_abs8x8_altivec(a,b,stride); +} + +void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) { +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; + for(i=0; i+7<w; 
i++){ + dst[i+0] += src[i+0]; + dst[i+1] += src[i+1]; + dst[i+2] += src[i+2]; + dst[i+3] += src[i+3]; + dst[i+4] += src[i+4]; + dst[i+5] += src[i+5]; + dst[i+6] += src[i+6]; + dst[i+7] += src[i+7]; + } + for(; i<w; i++) + dst[i+0] += src[i+0]; +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char vdst, vsrc; + + /* dst and src are 16 bytes-aligned (guaranteed) */ + for(i = 0 ; (i + 15) < w ; i++) + { + vdst = vec_ld(i << 4, (unsigned char*)dst); + vsrc = vec_ld(i << 4, (unsigned char*)src); + vdst = vec_add(vsrc, vdst); + vec_st(vdst, i << 4, (unsigned char*)dst); + } + /* if w is not a multiple of 16 */ + for (; (i < w) ; i++) + { + dst[i] = src[i]; + } +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 16) == 0) */ +void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_pixels16_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; + +POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1); + + for(i=0; i<h; i++) { + *((uint32_t*)(block )) = (((const struct unaligned_32 *) (pixels))->l); + *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l); + *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l); + *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l); + pixels+=line_size; + block +=line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register vector unsigned char pixelsv1, pixelsv2; + register vector unsigned char perm = vec_lvsl(0, pixels); + int i; + +POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1); + + for(i=0; i<h; i++) { + pixelsv1 = vec_ld(0, (unsigned char*)pixels); + pixelsv2 = vec_ld(16, (unsigned char*)pixels); + vec_st(vec_perm(pixelsv1, pixelsv2, perm), + 0, (unsigned char*)block); + pixels+=line_size; + block +=line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 16) == 0) */ +#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) +void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_avg_pixels16_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; + +POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); + + for(i=0; i<h; i++) { + op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l)); + op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l)); + op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l)); + op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l)); + pixels+=line_size; + block +=line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; + register vector unsigned char perm = vec_lvsl(0, pixels); + int i; + +POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1); + + for(i=0; i<h; i++) { + pixelsv1 = vec_ld(0, (unsigned char*)pixels); + pixelsv2 = vec_ld(16, (unsigned char*)pixels); + blockv = vec_ld(0, block); + pixelsv = vec_perm(pixelsv1, pixelsv2, perm); + blockv = vec_avg(blockv,pixelsv); + vec_st(blockv, 0, (unsigned char*)block); + pixels+=line_size; + block +=line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next 
one assumes that ((line_size % 8) == 0) */ +void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_avg_pixels8_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int i; +POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1); + for (i = 0; i < h; i++) { + *((uint32_t *) (block)) = + (((*((uint32_t *) (block))) | + ((((const struct unaligned_32 *) (pixels))->l))) - + ((((*((uint32_t *) (block))) ^ + ((((const struct unaligned_32 *) (pixels))-> + l))) & 0xFEFEFEFEUL) >> 1)); + *((uint32_t *) (block + 4)) = + (((*((uint32_t *) (block + 4))) | + ((((const struct unaligned_32 *) (pixels + 4))->l))) - + ((((*((uint32_t *) (block + 4))) ^ + ((((const struct unaligned_32 *) (pixels + + 4))-> + l))) & 0xFEFEFEFEUL) >> 1)); + pixels += line_size; + block += line_size; + } +POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv; + int i; + +POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1); + + for (i = 0; i < h; i++) { + /* + block is 8 bytes-aligned, so we're either in the + left block (16 bytes-aligned) or in the right block (not) + */ + int rightside = ((unsigned long)block & 0x0000000F); + + blockv = vec_ld(0, block); + pixelsv1 = vec_ld(0, (unsigned char*)pixels); + pixelsv2 = vec_ld(16, (unsigned char*)pixels); + pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels)); + + if (rightside) + { + pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1)); + } + else + { + pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3)); + } + + blockv = vec_avg(blockv, pixelsv); + + vec_st(blockv, 0, block); + + pixels += line_size; + block += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1); + +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 8) == 0) */ +void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_pixels8_xy2_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int j; +POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1); + for (j = 0; j < 2; j++) { + int i; + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + const uint32_t b = + (((const struct unaligned_32 *) (pixels + 1))->l); + uint32_t l0 = + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; + uint32_t h0 = + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = (((const struct unaligned_32 *) (pixels))->l); + b = (((const struct unaligned_32 *) (pixels + 1))->l); + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char + pixelsv1, pixelsv2, + pixelsavg; + register vector unsigned char + 
blockv, temp1, temp2; + register vector unsigned short + pixelssum1, pixelssum2, temp3; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vctwo); + +POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1); + for (i = 0; i < h ; i++) { + int rightside = ((unsigned long)block & 0x0000000F); + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + pixelssum1 = vec_add(pixelssum2, vctwo); + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); + + if (rightside) + { + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); + } + else + { + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); + } + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 8) == 0) */ +void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int j; +POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); + for (j = 0; j < 2; j++) { + int i; + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + const uint32_t b = + (((const struct unaligned_32 *) (pixels + 1))->l); + uint32_t l0 = + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; + uint32_t h0 = + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = (((const struct unaligned_32 *) (pixels))->l); + b = (((const struct unaligned_32 *) (pixels + 1))->l); + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } + 
+POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char + pixelsv1, pixelsv2, + pixelsavg; + register vector unsigned char + blockv, temp1, temp2; + register vector unsigned short + pixelssum1, pixelssum2, temp3; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vcone); + +POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); + for (i = 0; i < h ; i++) { + int rightside = ((unsigned long)block & 0x0000000F); + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + pixelssum1 = vec_add(pixelssum2, vcone); + pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero); + + if (rightside) + { + blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1)); + } + else + { + blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3)); + } + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 16) == 0) */ +void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_pixels16_xy2_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int j; +POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); + for (j = 0; j < 4; j++) { + int i; + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + const uint32_t b = + (((const struct unaligned_32 *) (pixels + 1))->l); + uint32_t l0 = + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL; + uint32_t h0 = + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = (((const struct unaligned_32 *) (pixels))->l); + b = (((const struct unaligned_32 *) (pixels + 1))->l); + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) 
+ 0x02020202UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char + pixelsv1, pixelsv2, pixelsv3, pixelsv4; + register vector unsigned char + blockv, temp1, temp2; + register vector unsigned short + pixelssum1, pixelssum2, temp3, + pixelssum3, pixelssum4, temp4; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum3 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum3 = vec_add(pixelssum3, vctwo); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vctwo); + +POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1); + for (i = 0; i < h ; i++) { + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + + pixelssum4 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp4 = vec_add(pixelssum3, pixelssum4); + temp4 = vec_sra(temp4, vctwo); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + + pixelssum3 = vec_add(pixelssum4, vctwo); + pixelssum1 = vec_add(pixelssum2, vctwo); + + blockv = vec_packsu(temp3, temp4); + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} + +/* next one assumes that ((line_size % 16) == 0) */ +void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h) +{ +POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1); +#ifdef ALTIVEC_USE_REFERENCE_C_CODE + int j; +POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); + for (j = 0; j < 4; j++) { + int i; + const uint32_t a = (((const struct unaligned_32 *) (pixels))->l); + const uint32_t b = + (((const struct unaligned_32 *) (pixels + 1))->l); + uint32_t l0 = + (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; + uint32_t h0 = + ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + uint32_t l1, h1; + pixels += line_size; + for (i = 0; i < h; i += 2) { + uint32_t a = 
(((const struct unaligned_32 *) (pixels))->l); + uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l); + l1 = (a & 0x03030303UL) + (b & 0x03030303UL); + h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + a = (((const struct unaligned_32 *) (pixels))->l); + b = (((const struct unaligned_32 *) (pixels + 1))->l); + l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL; + h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2); + *((uint32_t *) block) = + h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL); + pixels += line_size; + block += line_size; + } pixels += 4 - line_size * (h + 1); + block += 4 - line_size * h; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); + +#else /* ALTIVEC_USE_REFERENCE_C_CODE */ + register int i; + register vector unsigned char + pixelsv1, pixelsv2, pixelsv3, pixelsv4; + register vector unsigned char + blockv, temp1, temp2; + register vector unsigned short + pixelssum1, pixelssum2, temp3, + pixelssum3, pixelssum4, temp4; + register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0); + register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1); + register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2); + + temp1 = vec_ld(0, pixels); + temp2 = vec_ld(16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels)); + if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels)); + } + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + pixelssum3 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum3 = vec_add(pixelssum3, vcone); + pixelssum1 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + pixelssum1 = vec_add(pixelssum1, vcone); + +POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); + for (i = 0; i < h ; i++) { + blockv = vec_ld(0, block); + + temp1 = vec_ld(line_size, pixels); + temp2 = vec_ld(line_size + 16, pixels); + pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels)); + if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) + { + pixelsv2 = temp2; + } + else + { + pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels)); + } + + pixelsv3 = vec_mergel(vczero, pixelsv1); + pixelsv4 = vec_mergel(vczero, pixelsv2); + pixelsv1 = vec_mergeh(vczero, pixelsv1); + pixelsv2 = vec_mergeh(vczero, pixelsv2); + + pixelssum4 = vec_add((vector unsigned short)pixelsv3, + (vector unsigned short)pixelsv4); + pixelssum2 = vec_add((vector unsigned short)pixelsv1, + (vector unsigned short)pixelsv2); + temp4 = vec_add(pixelssum3, pixelssum4); + temp4 = vec_sra(temp4, vctwo); + temp3 = vec_add(pixelssum1, pixelssum2); + temp3 = vec_sra(temp3, vctwo); + + pixelssum3 = vec_add(pixelssum4, vcone); + pixelssum1 = vec_add(pixelssum2, vcone); + + blockv = vec_packsu(temp3, temp4); + + vec_st(blockv, 0, block); + + block += line_size; + pixels += line_size; + } + +POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1); +#endif /* ALTIVEC_USE_REFERENCE_C_CODE */ +} int has_altivec(void) { -#if CONFIG_DARWIN +#ifdef CONFIG_DARWIN int sels[2] = {CTL_HW, HW_VECTORUNIT}; int has_vu = 0; size_t len = 
sizeof(has_vu); @@ -486,7 +1288,25 @@ int has_altivec(void) err = sysctl(sels, 2, &has_vu, &len, NULL, 0); if (err == 0) return (has_vu != 0); -#endif +#else /* CONFIG_DARWIN */ +/* no Darwin, do it the brute-force way */ +/* this is borrowed from the libmpeg2 library */ + { + signal (SIGILL, sigill_handler); + if (sigsetjmp (jmpbuf, 1)) { + signal (SIGILL, SIG_DFL); + } else { + canjump = 1; + + asm volatile ("mtspr 256, %0\n\t" + "vand %%v0, %%v0, %%v0" + : + : "r" (-1)); + + signal (SIGILL, SIG_DFL); + return 1; + } + } +#endif /* CONFIG_DARWIN */ return 0; } - |
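
Note on the runtime CPU detection added in this patch: on non-Darwin systems, has_altivec() now probes for AltiVec by installing a SIGILL handler and executing a single AltiVec instruction, a technique the diff credits to libmpeg2 (Darwin keeps the sysctl HW_VECTORUNIT query). Below is a minimal standalone sketch of that probe for reference; it assumes a 32-bit PowerPC target with GCC-style inline asm, and the probe_altivec() wrapper name is illustrative rather than part of the patch.

    #include <signal.h>
    #include <setjmp.h>

    static sigjmp_buf jmpbuf;
    static volatile sig_atomic_t canjump = 0;

    static void sigill_handler(int sig)
    {
        if (!canjump) {
            /* SIGILL not caused by our probe: restore default action and re-raise */
            signal(sig, SIG_DFL);
            raise(sig);
        }
        canjump = 0;
        /* the probe instruction trapped: jump back and report "no AltiVec" */
        siglongjmp(jmpbuf, 1);
    }

    int probe_altivec(void)
    {
        int result = 0;

        signal(SIGILL, sigill_handler);
        if (sigsetjmp(jmpbuf, 1) == 0) {
            canjump = 1;
            /* enable the vector unit (VRSAVE, SPR 256) and issue a trivial
               AltiVec instruction; on CPUs without AltiVec this raises SIGILL */
            asm volatile ("mtspr 256, %0\n\t"
                          "vand %%v0, %%v0, %%v0"
                          :
                          : "r" (-1));
            result = 1;
        }
        signal(SIGILL, SIG_DFL);
        return result;
    }

The same pattern appears inline in the patched has_altivec(); the handler re-raises any SIGILL that arrives before the probe is armed so unrelated illegal-instruction faults still terminate the process normally.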