Diffstat (limited to 'src/libffmpeg/libavcodec/alpha')
-rw-r--r-- | src/libffmpeg/libavcodec/alpha/asm.h             | 181
-rw-r--r-- | src/libffmpeg/libavcodec/alpha/dsputil_alpha.c   | 113
-rw-r--r-- | src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c |   3
3 files changed, 160 insertions, 137 deletions
diff --git a/src/libffmpeg/libavcodec/alpha/asm.h b/src/libffmpeg/libavcodec/alpha/asm.h
index 0f4685f11..ceaf0be4a 100644
--- a/src/libffmpeg/libavcodec/alpha/asm.h
+++ b/src/libffmpeg/libavcodec/alpha/asm.h
@@ -20,122 +20,115 @@
 #ifndef LIBAVCODEC_ALPHA_ASM_H
 #define LIBAVCODEC_ALPHA_ASM_H
 
-#include <stdint.h>
+#include <inttypes.h>
 
 #define AMASK_BWX (1 << 0)
 #define AMASK_FIX (1 << 1)
+#define AMASK_CIX (1 << 2)
 #define AMASK_MVI (1 << 8)
 
-static inline uint64_t BYTE_VEC(uint64_t x)
+inline static uint64_t BYTE_VEC(uint64_t x)
 {
     x |= x << 8;
     x |= x << 16;
     x |= x << 32;
     return x;
 }
-static inline uint64_t WORD_VEC(uint64_t x)
+inline static uint64_t WORD_VEC(uint64_t x)
 {
     x |= x << 16;
     x |= x << 32;
     return x;
 }
 
-static inline int32_t ldl(const void* p)
-{
-    return *(const int32_t*) p;
-}
-static inline uint64_t ldq(const void* p)
-{
-    return *(const uint64_t*) p;
-}
-/* FIXME ccc doesn't seem to get it? Use inline asm? */
-static inline uint64_t ldq_u(const void* p)
-{
-    return *(const uint64_t*) ((uintptr_t) p & ~7ul);
-}
-static inline void stl(uint32_t l, void* p)
-{
-    *(uint32_t*) p = l;
-}
-static inline void stq(uint64_t l, void* p)
-{
-    *(uint64_t*) p = l;
-}
+#define ldq(p) (*(const uint64_t *) (p))
+#define ldl(p) (*(const int32_t *) (p))
+#define stl(l, p) do { *(uint32_t *) (p) = (l); } while (0)
+#define stq(l, p) do { *(uint64_t *) (p) = (l); } while (0)
 
 #ifdef __GNUC__
-#define OPCODE1(name) \
-static inline uint64_t name(uint64_t l) \
-{ \
-    uint64_t r; \
-    asm (#name " %1, %0" : "=r" (r) : "r" (l)); \
-    return r; \
-}
-
-#define OPCODE2(name) \
-static inline uint64_t name(uint64_t l1, uint64_t l2) \
-{ \
-    uint64_t r; \
-    asm (#name " %1, %2, %0" : "=r" (r) : "r" (l1), "rI" (l2)); \
-    return r; \
-}
-
-/* We don't want gcc to move this around or combine it with another
-   rpcc, so mark it volatile. */
-static inline uint64_t rpcc(void)
-{
-    uint64_t r;
-    asm volatile ("rpcc %0" : "=r" (r));
-    return r;
-}
-
-static inline uint64_t uldq(const void* v)
-{
-    struct foo {
-        unsigned long l;
-    } __attribute__((packed));
-
-    return ((const struct foo*) v)->l;
-}
+#define ASM_ACCEPT_MVI asm (".arch pca56")
+struct unaligned_long { uint64_t l; } __attribute__((packed));
+#define ldq_u(p) (*(const uint64_t *) (((uint64_t) (p)) & ~7ul))
+#define uldq(a) (((const struct unaligned_long *) (a))->l)
+
+#if __GNUC__ >= 3 && __GNUC_MINOR__ >= 2
+#define cmpbge __builtin_alpha_cmpbge
+/* Avoid warnings. */
+#define extql(a, b) __builtin_alpha_extql(a, (uint64_t) (b))
+#define extqh(a, b) __builtin_alpha_extqh(a, (uint64_t) (b))
+#define zap __builtin_alpha_zap
+#define zapnot __builtin_alpha_zapnot
+#define amask __builtin_alpha_amask
+#define implver __builtin_alpha_implver
+#define rpcc __builtin_alpha_rpcc
+#define minub8 __builtin_alpha_minub8
+#define minsb8 __builtin_alpha_minsb8
+#define minuw4 __builtin_alpha_minuw4
+#define minsw4 __builtin_alpha_minsw4
+#define maxub8 __builtin_alpha_maxub8
+#define maxsb8 __builtin_alpha_maxsb8
+#define maxuw4 __builtin_alpha_maxuw4
+#define maxsw4 __builtin_alpha_maxsw4
+#define perr __builtin_alpha_perr
+#define pklb __builtin_alpha_pklb
+#define pkwb __builtin_alpha_pkwb
+#define unpkbl __builtin_alpha_unpkbl
+#define unpkbw __builtin_alpha_unpkbw
+#else
+#define cmpbge(a, b) ({ uint64_t __r; asm ("cmpbge %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define extql(a, b) ({ uint64_t __r; asm ("extql %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define extqh(a, b) ({ uint64_t __r; asm ("extqh %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define zap(a, b) ({ uint64_t __r; asm ("zap %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define zapnot(a, b) ({ uint64_t __r; asm ("zapnot %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define amask(a) ({ uint64_t __r; asm ("amask %1,%0" : "=r" (__r) : "rI" (a)); __r; })
+#define implver() ({ uint64_t __r; asm ("implver %0" : "=r" (__r)); __r; })
+#define rpcc() ({ uint64_t __r; asm volatile ("rpcc %0" : "=r" (__r)); __r; })
+#define minub8(a, b) ({ uint64_t __r; asm ("minub8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsb8(a, b) ({ uint64_t __r; asm ("minsb8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minuw4(a, b) ({ uint64_t __r; asm ("minuw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define minsw4(a, b) ({ uint64_t __r; asm ("minsw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxub8(a, b) ({ uint64_t __r; asm ("maxub8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsb8(a, b) ({ uint64_t __r; asm ("maxsb8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxuw4(a, b) ({ uint64_t __r; asm ("maxuw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define maxsw4(a, b) ({ uint64_t __r; asm ("maxsw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define perr(a, b) ({ uint64_t __r; asm ("perr %r1,%r2,%0" : "=r" (__r) : "%rJ" (a), "rJ" (b)); __r; })
+#define pklb(a) ({ uint64_t __r; asm ("pklb %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#define pkwb(a) ({ uint64_t __r; asm ("pkwb %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#define unpkbl(a) ({ uint64_t __r; asm ("unpkbl %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#define unpkbw(a) ({ uint64_t __r; asm ("unpkbw %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#endif
 
-#elif defined(__DECC) /* Compaq "ccc" compiler */
+#elif defined(__DECC) /* Digital/Compaq "ccc" compiler */
 
 #include <c_asm.h>
 
-#define OPCODE1(name) \
-static inline uint64_t name(uint64_t l) \
-{ \
-    return asm (#name " %a0, %v0", l); \
-}
-
-#define OPCODE2(name) \
-static inline uint64_t name(uint64_t l1, uint64_t l2) \
-{ \
-    return asm (#name " %a0, %a1, %v0", l1, l2); \
-}
-
-static inline uint64_t rpcc(void)
-{
-    return asm ("rpcc %v0");
-}
-
-static inline uint64_t uldq(const void* v)
-{
-    return *(const __unaligned uint64_t *) v;
-}
-
+#define ASM_ACCEPT_MVI
+#define ldq_u(a) asm ("ldq_u %v0,0(%a0)", a)
+#define uldq(a) (*(const __unaligned uint64_t *) (a))
+#define cmpbge(a, b) asm ("cmpbge %a0,%a1,%v0", a, b)
+#define extql(a, b) asm ("extql %a0,%a1,%v0", a, b)
+#define extqh(a, b) asm ("extqh %a0,%a1,%v0", a, b)
+#define zap(a, b) asm ("zap %a0,%a1,%v0", a, b)
+#define zapnot(a, b) asm ("zapnot %a0,%a1,%v0", a, b)
+#define amask(a) asm ("amask %a0,%v0", a)
+#define implver() asm ("implver %v0")
+#define rpcc() asm ("rpcc %v0")
+#define minub8(a, b) asm ("minub8 %a0,%a1,%v0", a, b)
+#define minsb8(a, b) asm ("minsb8 %a0,%a1,%v0", a, b)
+#define minuw4(a, b) asm ("minuw4 %a0,%a1,%v0", a, b)
+#define minsw4(a, b) asm ("minsw4 %a0,%a1,%v0", a, b)
+#define maxub8(a, b) asm ("maxub8 %a0,%a1,%v0", a, b)
+#define maxsb8(a, b) asm ("maxsb8 %a0,%a1,%v0", a, b)
+#define maxuw4(a, b) asm ("maxuw4 %a0,%a1,%v0", a, b)
+#define maxsw4(a, b) asm ("maxsw4 %a0,%a1,%v0", a, b)
+#define perr(a, b) asm ("perr %a0,%a1,%v0", a, b)
+#define pklb(a) asm ("pklb %a0,%v0", a)
+#define pkwb(a) asm ("pkwb %a0,%v0", a)
+#define unpkbl(a) asm ("unpkbl %a0,%v0", a)
+#define unpkbw(a) asm ("unpkbw %a0,%v0", a)
+
+#else
+#error "Unknown compiler!"
 #endif
 
-OPCODE1(amask);
-OPCODE1(unpkbw);
-OPCODE1(pkwb);
-OPCODE2(extql);
-OPCODE2(extqh);
-OPCODE2(zap);
-OPCODE2(cmpbge);
-OPCODE2(minsw4);
-OPCODE2(minuw4);
-OPCODE2(minub8);
-OPCODE2(maxsw4);
-OPCODE2(maxuw4);
-OPCODE2(perr);
-
 #endif /* LIBAVCODEC_ALPHA_ASM_H */
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
index 3a54904f4..5e1aa2093 100644
--- a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
+++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
@@ -22,58 +22,86 @@
 void simple_idct_axp(DCTELEM *block);
 
-static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
-                                   int line_size)
+void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+                                int line_size);
+
+#if 0
+/* These functions were the base for the optimized assembler routines,
+   and remain here for documentation purposes. */
+static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
+                                   int line_size)
 {
     int i = 8;
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+
+    ASM_ACCEPT_MVI;
+
     do {
-        UINT64 shorts;
+        uint64_t shorts0, shorts1;
 
-        shorts = ldq(block);
-        shorts = maxsw4(shorts, 0);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels);
+        shorts0 = ldq(block);
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+        stl(pkwb(shorts0), pixels);
 
-        shorts = ldq(block + 4);
-        shorts = maxsw4(shorts, 0);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels + 4);
+        shorts1 = ldq(block + 4);
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+        stl(pkwb(shorts1), pixels + 4);
 
-        pixels += line_size;
-        block += 8;
+        pixels += line_size;
+        block += 8;
     } while (--i);
 }
 
-static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
-                                   int line_size)
+void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
+                            int line_size)
 {
-    int i = 8;
+    int h = 8;
+    /* Keep this function a leaf function by generating the constants
+       manually (mainly for the hack value ;-). */
+    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
+    uint64_t signmask = zap(-1, 0x33);
+    signmask ^= signmask >> 1; /* 0x8000800080008000 */
+
+    ASM_ACCEPT_MVI;
+
     do {
-        UINT64 shorts;
-
-        shorts = ldq(block);
-        shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
-        shorts += unpkbw(ldl(pixels));
-        shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
-        shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
-        shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
-        shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
-        stl(pkwb(shorts), pixels);
-
-        /* next 4 */
-        shorts = ldq(block + 4);
-        shorts &= ~WORD_VEC(0x8000);
-        shorts += unpkbw(ldl(pixels + 4));
-        shorts &= ~WORD_VEC(0x8000);
-        shorts = minuw4(shorts, WORD_VEC(0x4000));
-        shorts &= ~WORD_VEC(0x4000);
-        shorts = minsw4(shorts, WORD_VEC(0x00ff));
-        stl(pkwb(shorts), pixels + 4);
-
-        pixels += line_size;
-        block += 8;
-    } while (--i);
+        uint64_t shorts0, pix0, signs0;
+        uint64_t shorts1, pix1, signs1;
+
+        shorts0 = ldq(block);
+        shorts1 = ldq(block + 4);
+
+        pix0 = unpkbw(ldl(pixels));
+        /* Signed subword add (MMX paddw). */
+        signs0 = shorts0 & signmask;
+        shorts0 &= ~signmask;
+        shorts0 += pix0;
+        shorts0 ^= signs0;
+        /* Clamp. */
+        shorts0 = maxsw4(shorts0, 0);
+        shorts0 = minsw4(shorts0, clampmask);
+
+        /* Next 4. */
+        pix1 = unpkbw(ldl(pixels + 4));
+        signs1 = shorts1 & signmask;
+        shorts1 &= ~signmask;
+        shorts1 += pix1;
+        shorts1 ^= signs1;
+        shorts1 = maxsw4(shorts1, 0);
+        shorts1 = minsw4(shorts1, clampmask);
+
+        stl(pkwb(shorts0), pixels);
+        stl(pkwb(shorts1), pixels + 4);
+
+        pixels += line_size;
+        block += 8;
+    } while (--h);
 }
+#endif
 
 /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
    Since the immediate result could be greater than 255, we do the
@@ -216,8 +244,7 @@ void dsputil_init_alpha(void)
 
     /* amask clears all bits that correspond to present features. */
     if (amask(AMASK_MVI) == 0) {
-        fprintf(stderr, "MVI extension detected\n");
-        put_pixels_clamped = put_pixels_clamped_axp;
-        add_pixels_clamped = add_pixels_clamped_axp;
+        put_pixels_clamped = put_pixels_clamped_mvi_asm;
+        add_pixels_clamped = add_pixels_clamped_mvi_asm;
     }
 }
diff --git a/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c b/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
index d0af5e1d3..eb1997eee 100644
--- a/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
+++ b/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
@@ -28,6 +28,9 @@ static void dct_unquantize_h263_axp(MpegEncContext *s,
 {
     int i, level;
     UINT64 qmul, qadd;
+
+    ASM_ACCEPT_MVI;
+
     if (s->mb_intra) {
         if (n < 4)
             block[0] = block[0] * s->y_dc_scale;
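
Two constants in add_pixels_clamped_mvi above are built with zap, which clears the bytes of its first operand selected by the bits of an 8-bit mask: zap(-1, 0xaa) leaves 0x00ff00ff00ff00ff, and zap(-1, 0x33) followed by signmask ^= signmask >> 1 leaves 0x8000800080008000. The "signed subword add (MMX paddw)" sequence then performs four independent 16-bit additions inside one 64-bit register: the top bit of each word is saved and cleared so the plain 64-bit add cannot carry across word boundaries, and the saved bits are folded back in with an XOR. The portable C sketch below demonstrates the same trick without Alpha intrinsics; the helper names are invented for illustration only, and it relies on the same precondition as the patch, namely that one operand holds unpacked pixel bytes no larger than 255.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for Alpha zap: clear byte i of x wherever bit i of mask is set. */
static uint64_t zap_emul(uint64_t x, unsigned mask)
{
    for (int i = 0; i < 8; i++)
        if (mask & (1u << i))
            x &= ~((uint64_t) 0xff << (8 * i));
    return x;
}

/* Add four packed 16-bit words without carries crossing word boundaries.
   Exact whenever every word of b fits in 15 bits, which holds for
   unpacked pixel bytes (at most 255). */
static uint64_t paddw_swar(uint64_t a, uint64_t b)
{
    uint64_t signmask = zap_emul(~0ull, 0x33);
    signmask ^= signmask >> 1;          /* 0x8000800080008000 */

    uint64_t signs = a & signmask;      /* save each word's top bit */
    a &= ~signmask;                     /* clear it so the add stays in-lane */
    a += b;
    return a ^ signs;                   /* fold the saved bits back in */
}

int main(void)
{
    assert(zap_emul(~0ull, 0xaa) == 0x00ff00ff00ff00ffull); /* the clamp mask */

    uint64_t blockw = 0x7fff800012340001ull; /* four arbitrary 16-bit words */
    uint64_t pixw   = 0x00ff000000ff00ffull; /* words <= 255, like unpkbw output */
    uint64_t sum    = paddw_swar(blockw, pixw);

    for (int w = 0; w < 4; w++) {
        uint16_t bw = blockw >> (16 * w), pw = pixw >> (16 * w);
        assert((uint16_t) (sum >> (16 * w)) == (uint16_t) (bw + pw));
    }
    printf("%016llx\n", (unsigned long long) sum);
    return 0;
}

After this modular per-word add, maxsw4/minsw4 clamp each word to 0..255 and pkwb packs the four words back into four bytes, which is what the stl at the end of the loop stores.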
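The first dsputil_alpha.c hunk ends in the middle of a comment about averaging 8 unsigned bytes in parallel as (b1 + b2) >> 1; the code it describes lies outside this diff. A common way to compute such a per-byte average without the intermediate sum overflowing a byte is the identity a + b = (a & b) * 2 + (a ^ b), which gives the truncating average as (a & b) + (((a ^ b) & 0xfefefefefefefefe) >> 1). The sketch below illustrates that general technique in portable C; it is not necessarily the exact formulation the file goes on to use.

#include <assert.h>
#include <stdint.h>

/* Truncating per-byte average of eight unsigned bytes packed in a and b.
   a + b = (a & b) * 2 + (a ^ b), so (a + b) >> 1 = (a & b) + ((a ^ b) >> 1).
   Masking with 0xfe.. keeps the shift from dragging bits across byte lanes,
   and each result byte is at most 255, so the final add cannot carry either. */
static uint64_t avg8_down(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & 0xfefefefefefefefeull) >> 1);
}

int main(void)
{
    uint64_t a = 0xff00804213feab01ull;
    uint64_t b = 0x01ff7f4214ff5403ull;
    uint64_t r = avg8_down(a, b);

    for (int i = 0; i < 8; i++) {
        unsigned ab = (a >> (8 * i)) & 0xff;
        unsigned bb = (b >> (8 * i)) & 0xff;
        assert(((r >> (8 * i)) & 0xff) == (ab + bb) / 2);
    }
    return 0;
}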