Diffstat (limited to 'src/libffmpeg/libavcodec/alpha/dsputil_alpha.c')
-rw-r--r--  src/libffmpeg/libavcodec/alpha/dsputil_alpha.c | 246
1 file changed, 143 insertions, 103 deletions
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
index 5e1aa2093..9a3fb1eac 100644
--- a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
+++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
@@ -22,6 +22,8 @@
 void simple_idct_axp(DCTELEM *block);
 
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+                        int line_size, int h);
 void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                                 int line_size);
 void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
@@ -103,145 +105,183 @@ void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
 }
 #endif
 
-/* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
-   Since the immediate result could be greater than 255, we do the
-   shift first. The result is too low by one if the bytes were both
-   odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */
-static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2)
-{
-    UINT64 correction = (l1 & l2) & BYTE_VEC(0x01);
-    l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
-    l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
-    return l1 + l2 + correction;
+static void clear_blocks_axp(DCTELEM *blocks) {
+    uint64_t *p = (uint64_t *) blocks;
+    int n = sizeof(DCTELEM) * 6 * 64;
+
+    do {
+        p[0] = 0;
+        p[1] = 0;
+        p[2] = 0;
+        p[3] = 0;
+        p[4] = 0;
+        p[5] = 0;
+        p[6] = 0;
+        p[7] = 0;
+        p += 8;
+        n -= 8 * 8;
+    } while (n);
 }
 
-/* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1
-   The '1' only has an effect when one byte is even and the other odd,
-   i. e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01).
-   Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01). */
-static inline UINT64 avg2(UINT64 l1, UINT64 l2)
+static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
 {
-    UINT64 correction = (l1 | l2) & BYTE_VEC(0x01);
-    l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
-    l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
-    return l1 + l2 + correction;
+    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
 }
 
-static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
+static inline uint64_t avg2(uint64_t a, uint64_t b)
 {
-    UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l2 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l3 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l4 & ~BYTE_VEC(0x03)) >> 2);
-    UINT64 r2 = ((  (l1 & BYTE_VEC(0x03))
-                  + (l2 & BYTE_VEC(0x03))
-                  + (l3 & BYTE_VEC(0x03))
-                  + (l4 & BYTE_VEC(0x03))
-                  + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
-    return r1 + r2;
+    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
 }
 
-static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
+#if 0
+/* The XY2 routines basically utilize this scheme, but reuse parts in
+   each iteration. */
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
 {
-    UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l2 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l3 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l4 & ~BYTE_VEC(0x03)) >> 2);
-    UINT64 r2 = ((  (l1 & BYTE_VEC(0x03))
-                  + (l2 & BYTE_VEC(0x03))
-                  + (l3 & BYTE_VEC(0x03))
-                  + (l4 & BYTE_VEC(0x03))
-                  + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);
+    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
+                    + (l2 & BYTE_VEC(0x03))
+                    + (l3 & BYTE_VEC(0x03))
+                    + (l4 & BYTE_VEC(0x03))
+                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
     return r1 + r2;
 }
+#endif
 
-#define PIXOPNAME(suffix) put ## suffix
-#define BTYPE UINT8
-#define AVG2 avg2
-#define AVG4 avg4
-#define STORE(l, b) stq(l, b)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define OP(LOAD, STORE) \
+    do { \
+        STORE(LOAD(pixels), block); \
+        pixels += line_size; \
+        block += line_size; \
+    } while (--h)
 
-#define PIXOPNAME(suffix) put_no_rnd ## suffix
-#define BTYPE UINT8
-#define AVG2 avg2_no_rnd
-#define AVG4 avg4_no_rnd
-#define STORE(l, b) stq(l, b)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define OP_X2(LOAD, STORE) \
+    do { \
+        uint64_t pix1, pix2; \
+ \
+        pix1 = LOAD(pixels); \
+        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
+        STORE(AVG2(pix1, pix2), block); \
+        pixels += line_size; \
+        block += line_size; \
+    } while (--h)
 
-/* The following functions are untested. */
-#if 0
+#define OP_Y2(LOAD, STORE) \
+    do { \
+        uint64_t pix = LOAD(pixels); \
+        do { \
+            uint64_t next_pix; \
+ \
+            pixels += line_size; \
+            next_pix = LOAD(pixels); \
+            STORE(AVG2(pix, next_pix), block); \
+            block += line_size; \
+            pix = next_pix; \
+        } while (--h); \
+    } while (0)
+
+#define OP_XY2(LOAD, STORE) \
+    do { \
+        uint64_t pix1 = LOAD(pixels); \
+        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
+        uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \
+                       + (pix2 & BYTE_VEC(0x03)); \
+        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \
+                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \
+ \
+        do { \
+            uint64_t npix1, npix2; \
+            uint64_t npix_l, npix_h; \
+            uint64_t avg; \
+ \
+            pixels += line_size; \
+            npix1 = LOAD(pixels); \
+            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \
+            npix_l = (npix1 & BYTE_VEC(0x03)) \
+                   + (npix2 & BYTE_VEC(0x03)); \
+            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \
+                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \
+            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
+                + pix_h + npix_h; \
+            STORE(avg, block); \
+ \
+            block += line_size; \
+            pix_l = npix_l; \
+            pix_h = npix_h; \
+        } while (--h); \
+    } while (0)
+
+#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \
+static void OPNAME ## _pixels ## SUFF ## _axp \
+    (uint8_t *restrict block, const uint8_t *restrict pixels, \
+     int line_size, int h) \
+{ \
+    if ((size_t) pixels & 0x7) { \
+        OPKIND(uldq, STORE); \
+    } else { \
+        OPKIND(ldq, STORE); \
+    } \
+}
 
-#define PIXOPNAME(suffix) avg ## suffix
-#define BTYPE UINT8
+#define PIXOP(OPNAME, STORE) \
+    MAKE_OP(OPNAME, ,     OP,     STORE) \
+    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE) \
+    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE) \
+    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
+
+/* Rounding primitives. */
 #define AVG2 avg2
 #define AVG4 avg4
+#define AVG4_ROUNDER BYTE_VEC(0x02)
+#define STORE(l, b) stq(l, b)
+PIXOP(put, STORE);
+
+#undef STORE
 #define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
+PIXOP(avg, STORE);
+
+/* Not rounding primitives. */
 #undef AVG2
 #undef AVG4
+#undef AVG4_ROUNDER
 #undef STORE
-
-#define PIXOPNAME(suffix) avg_no_rnd ## suffix
-#define BTYPE UINT8
 #define AVG2 avg2_no_rnd
 #define AVG4 avg4_no_rnd
-#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define AVG4_ROUNDER BYTE_VEC(0x01)
+#define STORE(l, b) stq(l, b)
+PIXOP(put_no_rnd, STORE);
 
-#define PIXOPNAME(suffix) sub ## suffix
-#define BTYPE DCTELEM
-#define AVG2 avg2
-#define AVG4 avg4
-#define STORE(l, block) do { \
-    UINT64 xxx = l; \
-    (block)[0] -= (xxx >> 0) & 0xff; \
-    (block)[1] -= (xxx >> 8) & 0xff; \
-    (block)[2] -= (xxx >> 16) & 0xff; \
-    (block)[3] -= (xxx >> 24) & 0xff; \
-    (block)[4] -= (xxx >> 32) & 0xff; \
-    (block)[5] -= (xxx >> 40) & 0xff; \
-    (block)[6] -= (xxx >> 48) & 0xff; \
-    (block)[7] -= (xxx >> 56) & 0xff; \
-} while (0)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
 #undef STORE
-
-#endif
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+PIXOP(avg_no_rnd, STORE);
 
 void dsputil_init_alpha(void)
 {
-    put_pixels_tab[0] = put_pixels_axp;
+    put_pixels_tab[0] = put_pixels_axp_asm;
     put_pixels_tab[1] = put_pixels_x2_axp;
     put_pixels_tab[2] = put_pixels_y2_axp;
     put_pixels_tab[3] = put_pixels_xy2_axp;
 
-    put_no_rnd_pixels_tab[0] = put_pixels_axp;
+    put_no_rnd_pixels_tab[0] = put_pixels_axp_asm;
     put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
     put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
     put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
 
+    avg_pixels_tab[0] = avg_pixels_axp;
+    avg_pixels_tab[1] = avg_pixels_x2_axp;
+    avg_pixels_tab[2] = avg_pixels_y2_axp;
+    avg_pixels_tab[3] = avg_pixels_xy2_axp;
+
+    avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_axp;
+    avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_axp;
+    avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_axp;
+    avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_axp;
+
+    clear_blocks = clear_blocks_axp;
+
     /* amask clears all bits that correspond to present features. */
     if (amask(AMASK_MVI) == 0) {
         put_pixels_clamped = put_pixels_clamped_mvi_asm;
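
The rewritten avg2() and avg2_no_rnd() rely on the carry-free byte-lane identities (x | y) - ((x ^ y) >> 1) == (x + y + 1) >> 1 and (x & y) + ((x ^ y) >> 1) == (x + y) >> 1; masking the XOR with BYTE_VEC(0xfe) before the shift keeps the bit shifted out of each byte from leaking into the lane below, and no lane can carry or borrow into its neighbour. Below is a minimal standalone sketch that checks both packed averages against their scalar definitions; the BYTE_VEC macro and the xorshift helper are local assumptions made for the test, not taken from the Alpha headers in this tree.

/* Standalone check of the packed-byte average identities used by the new
   avg2()/avg2_no_rnd().  BYTE_VEC here is a local stand-in that replicates
   a byte value into all eight lanes of a 64-bit word. */
#include <stdint.h>
#include <stdio.h>

#define BYTE_VEC(x) ((uint64_t)(x) * 0x0101010101010101ULL)

/* Rounding average, per byte lane: (x + y + 1) >> 1. */
static uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

/* Truncating average, per byte lane: (x + y) >> 1. */
static uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

/* Small deterministic generator so the test needs no libc RNG state. */
static uint64_t xorshift64(uint64_t *s)
{
    *s ^= *s << 13;
    *s ^= *s >> 7;
    *s ^= *s << 17;
    return *s;
}

int main(void)
{
    uint64_t seed = 0x123456789abcdef0ULL;

    for (int i = 0; i < 1000000; i++) {
        uint64_t a = xorshift64(&seed);
        uint64_t b = xorshift64(&seed);
        uint64_t r = avg2(a, b);
        uint64_t n = avg2_no_rnd(a, b);

        /* Compare each byte lane against the scalar definition. */
        for (int k = 0; k < 8; k++) {
            unsigned x = (a >> (8 * k)) & 0xff;
            unsigned y = (b >> (8 * k)) & 0xff;

            if (((r >> (8 * k)) & 0xff) != ((x + y + 1) >> 1) ||
                ((n >> (8 * k)) & 0xff) != ((x + y) >> 1)) {
                printf("mismatch in lane %d\n", k);
                return 1;
            }
        }
    }
    printf("all byte lanes match\n");
    return 0;
}

Compiled on its own, the check exercises random 64-bit inputs and reports a mismatch if either packed average ever differs from its per-byte reference; the same no-carry argument is what lets the OP_X2/OP_Y2/OP_XY2 macros average eight pixels per iteration without unpacking them.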