Diffstat (limited to 'src/libffmpeg/libavcodec/alpha')
-rw-r--r--  src/libffmpeg/libavcodec/alpha/dsputil_alpha.c      246
-rw-r--r--  src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S  306
-rw-r--r--  src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c    110
-rw-r--r--  src/libffmpeg/libavcodec/alpha/regdef.h              45
4 files changed, 552 insertions(+), 155 deletions(-)
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
index 5e1aa2093..9a3fb1eac 100644
--- a/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
+++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha.c
@@ -22,6 +22,8 @@
void simple_idct_axp(DCTELEM *block);
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+ int line_size, int h);
void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
int line_size);
void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
@@ -103,145 +105,183 @@ void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
}
#endif
-/* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
- Since the immediate result could be greater than 255, we do the
- shift first. The result is too low by one if the bytes were both
- odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */
-static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2)
-{
- UINT64 correction = (l1 & l2) & BYTE_VEC(0x01);
- l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
- l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
- return l1 + l2 + correction;
+static void clear_blocks_axp(DCTELEM *blocks) {
+ uint64_t *p = (uint64_t *) blocks;
+ int n = sizeof(DCTELEM) * 6 * 64;
+
+ do {
+ p[0] = 0;
+ p[1] = 0;
+ p[2] = 0;
+ p[3] = 0;
+ p[4] = 0;
+ p[5] = 0;
+ p[6] = 0;
+ p[7] = 0;
+ p += 8;
+ n -= 8 * 8;
+ } while (n);
}
-/* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1
- The '1' only has an effect when one byte is even and the other odd,
- i. e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01).
- Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01). */
-static inline UINT64 avg2(UINT64 l1, UINT64 l2)
+static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
- UINT64 correction = (l1 | l2) & BYTE_VEC(0x01);
- l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
- l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
- return l1 + l2 + correction;
+ return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
-static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
+static inline uint64_t avg2(uint64_t a, uint64_t b)
{
- UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
- + ((l2 & ~BYTE_VEC(0x03)) >> 2)
- + ((l3 & ~BYTE_VEC(0x03)) >> 2)
- + ((l4 & ~BYTE_VEC(0x03)) >> 2);
- UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
- + (l2 & BYTE_VEC(0x03))
- + (l3 & BYTE_VEC(0x03))
- + (l4 & BYTE_VEC(0x03))
- + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
- return r1 + r2;
+ return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
-static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
+#if 0
+/* The XY2 routines basically utilize this scheme, but reuse parts in
+ each iteration. */
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
- UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
- + ((l2 & ~BYTE_VEC(0x03)) >> 2)
- + ((l3 & ~BYTE_VEC(0x03)) >> 2)
- + ((l4 & ~BYTE_VEC(0x03)) >> 2);
- UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
- + (l2 & BYTE_VEC(0x03))
- + (l3 & BYTE_VEC(0x03))
- + (l4 & BYTE_VEC(0x03))
- + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);
+ uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+ + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+ + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+ + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+ uint64_t r2 = (( (l1 & BYTE_VEC(0x03))
+ + (l2 & BYTE_VEC(0x03))
+ + (l3 & BYTE_VEC(0x03))
+ + (l4 & BYTE_VEC(0x03))
+ + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
return r1 + r2;
}
+#endif
-#define PIXOPNAME(suffix) put ## suffix
-#define BTYPE UINT8
-#define AVG2 avg2
-#define AVG4 avg4
-#define STORE(l, b) stq(l, b)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define OP(LOAD, STORE) \
+ do { \
+ STORE(LOAD(pixels), block); \
+ pixels += line_size; \
+ block += line_size; \
+ } while (--h)
-#define PIXOPNAME(suffix) put_no_rnd ## suffix
-#define BTYPE UINT8
-#define AVG2 avg2_no_rnd
-#define AVG4 avg4_no_rnd
-#define STORE(l, b) stq(l, b)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define OP_X2(LOAD, STORE) \
+ do { \
+ uint64_t pix1, pix2; \
+ \
+ pix1 = LOAD(pixels); \
+ pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
+ STORE(AVG2(pix1, pix2), block); \
+ pixels += line_size; \
+ block += line_size; \
+ } while (--h)
-/* The following functions are untested. */
-#if 0
+#define OP_Y2(LOAD, STORE) \
+ do { \
+ uint64_t pix = LOAD(pixels); \
+ do { \
+ uint64_t next_pix; \
+ \
+ pixels += line_size; \
+ next_pix = LOAD(pixels); \
+ STORE(AVG2(pix, next_pix), block); \
+ block += line_size; \
+ pix = next_pix; \
+ } while (--h); \
+ } while (0)
+
+#define OP_XY2(LOAD, STORE) \
+ do { \
+ uint64_t pix1 = LOAD(pixels); \
+ uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
+ uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \
+ + (pix2 & BYTE_VEC(0x03)); \
+ uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \
+ + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \
+ \
+ do { \
+ uint64_t npix1, npix2; \
+ uint64_t npix_l, npix_h; \
+ uint64_t avg; \
+ \
+ pixels += line_size; \
+ npix1 = LOAD(pixels); \
+ npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \
+ npix_l = (npix1 & BYTE_VEC(0x03)) \
+ + (npix2 & BYTE_VEC(0x03)); \
+ npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \
+ + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \
+ avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
+ + pix_h + npix_h; \
+ STORE(avg, block); \
+ \
+ block += line_size; \
+ pix_l = npix_l; \
+ pix_h = npix_h; \
+ } while (--h); \
+ } while (0)
+
+#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \
+static void OPNAME ## _pixels ## SUFF ## _axp \
+ (uint8_t *restrict block, const uint8_t *restrict pixels, \
+ int line_size, int h) \
+{ \
+ if ((size_t) pixels & 0x7) { \
+ OPKIND(uldq, STORE); \
+ } else { \
+ OPKIND(ldq, STORE); \
+ } \
+}
-#define PIXOPNAME(suffix) avg ## suffix
-#define BTYPE UINT8
+#define PIXOP(OPNAME, STORE) \
+ MAKE_OP(OPNAME, , OP, STORE) \
+ MAKE_OP(OPNAME, _x2, OP_X2, STORE) \
+ MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \
+ MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
+
+/* Rounding primitives. */
#define AVG2 avg2
#define AVG4 avg4
+#define AVG4_ROUNDER BYTE_VEC(0x02)
+#define STORE(l, b) stq(l, b)
+PIXOP(put, STORE);
+
+#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
+PIXOP(avg, STORE);
+
+/* Not rounding primitives. */
#undef AVG2
#undef AVG4
+#undef AVG4_ROUNDER
#undef STORE
-
-#define PIXOPNAME(suffix) avg_no_rnd ## suffix
-#define BTYPE UINT8
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
-#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define AVG4_ROUNDER BYTE_VEC(0x01)
+#define STORE(l, b) stq(l, b)
+PIXOP(put_no_rnd, STORE);
-#define PIXOPNAME(suffix) sub ## suffix
-#define BTYPE DCTELEM
-#define AVG2 avg2
-#define AVG4 avg4
-#define STORE(l, block) do { \
- UINT64 xxx = l; \
- (block)[0] -= (xxx >> 0) & 0xff; \
- (block)[1] -= (xxx >> 8) & 0xff; \
- (block)[2] -= (xxx >> 16) & 0xff; \
- (block)[3] -= (xxx >> 24) & 0xff; \
- (block)[4] -= (xxx >> 32) & 0xff; \
- (block)[5] -= (xxx >> 40) & 0xff; \
- (block)[6] -= (xxx >> 48) & 0xff; \
- (block)[7] -= (xxx >> 56) & 0xff; \
-} while (0)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
#undef STORE
-
-#endif
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+PIXOP(avg_no_rnd, STORE);
void dsputil_init_alpha(void)
{
- put_pixels_tab[0] = put_pixels_axp;
+ put_pixels_tab[0] = put_pixels_axp_asm;
put_pixels_tab[1] = put_pixels_x2_axp;
put_pixels_tab[2] = put_pixels_y2_axp;
put_pixels_tab[3] = put_pixels_xy2_axp;
- put_no_rnd_pixels_tab[0] = put_pixels_axp;
+ put_no_rnd_pixels_tab[0] = put_pixels_axp_asm;
put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
+ avg_pixels_tab[0] = avg_pixels_axp;
+ avg_pixels_tab[1] = avg_pixels_x2_axp;
+ avg_pixels_tab[2] = avg_pixels_y2_axp;
+ avg_pixels_tab[3] = avg_pixels_xy2_axp;
+
+ avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_axp;
+ avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_axp;
+ avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_axp;
+ avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_axp;
+
+ clear_blocks = clear_blocks_axp;
+
/* amask clears all bits that correspond to present features. */
if (amask(AMASK_MVI) == 0) {
put_pixels_clamped = put_pixels_clamped_mvi_asm;
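[Note, not part of the patch] The rewritten avg2/avg2_no_rnd above replace the old mask-and-shift averaging with the carry-free identities floor((a+b)/2) = (a & b) + ((a ^ b) >> 1) and ceil((a+b)/2) = (a | b) - ((a ^ b) >> 1), applied to eight packed bytes at once by masking off the low bit of each byte before the shift. A minimal C sketch that spot-checks both identities against a per-byte reference; the BYTE_VEC macro here is a local stand-in for the one used in the patch:

    #include <stdint.h>
    #include <stdio.h>

    #define BYTE_VEC(x) ((x) * 0x0101010101010101ULL)

    /* Rounded byte-wise average: per byte, (b1 + b2 + 1) >> 1. */
    static uint64_t avg2(uint64_t a, uint64_t b)
    {
        return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
    }

    /* Truncating byte-wise average: per byte, (b1 + b2) >> 1. */
    static uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
    {
        return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
    }

    int main(void)
    {
        uint64_t a = 0x00ff7f80fe01aa55ULL;
        uint64_t b = 0xff000180017f55aaULL;
        for (int i = 0; i < 8; i++) {
            unsigned x = (a >> (8 * i)) & 0xff, y = (b >> (8 * i)) & 0xff;
            unsigned r  = (avg2(a, b)        >> (8 * i)) & 0xff;
            unsigned nr = (avg2_no_rnd(a, b) >> (8 * i)) & 0xff;
            printf("%3u %3u -> rnd %3u (want %3u), no_rnd %3u (want %3u)\n",
                   x, y, r, (x + y + 1) >> 1, nr, (x + y) >> 1);
        }
        return 0;
    }

Because each byte's average never exceeds 255, neither the addition nor the subtraction can carry or borrow into the neighbouring byte, which is what makes the trick safe on packed quadwords.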
diff --git a/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S b/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S
new file mode 100644
index 000000000..5349e443c
--- /dev/null
+++ b/src/libffmpeg/libavcodec/alpha/dsputil_alpha_asm.S
@@ -0,0 +1,306 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * These functions are scheduled for pca56. They should work
+ * reasonably on ev6, though.
+ */
+
+#include "regdef.h"
+#ifdef HAVE_AV_CONFIG_H
+#include "config.h"
+#endif
+
+/* Some nicer register names. */
+#define ta t10
+#define tb t11
+#define tc t12
+#define td AT
+/* Danger: these overlap with the argument list and the return value */
+#define te a5
+#define tf a4
+#define tg a3
+#define th v0
+
+ .set noat
+ .set noreorder
+ .arch pca56
+ .text
+
+/************************************************************************
+ * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+ * int line_size, int h)
+ */
+ .align 6
+ .globl put_pixels_axp_asm
+ .ent put_pixels_axp_asm
+put_pixels_axp_asm:
+ .frame sp, 0, ra
+ .prologue 0
+
+#ifdef HAVE_GPROF
+ lda AT, _mcount
+ jsr AT, (AT), _mcount
+#endif
+
+ and a1, 7, t0
+ beq t0, $aligned
+
+ .align 4
+$unaligned:
+ ldq_u t0, 0(a1)
+ ldq_u t1, 8(a1)
+ addq a1, a2, a1
+ nop
+
+ ldq_u t2, 0(a1)
+ ldq_u t3, 8(a1)
+ addq a1, a2, a1
+ nop
+
+ ldq_u t4, 0(a1)
+ ldq_u t5, 8(a1)
+ addq a1, a2, a1
+ nop
+
+ ldq_u t6, 0(a1)
+ ldq_u t7, 8(a1)
+ extql t0, a1, t0
+ addq a1, a2, a1
+
+ extqh t1, a1, t1
+ addq a0, a2, t8
+ extql t2, a1, t2
+ addq t8, a2, t9
+
+ extqh t3, a1, t3
+ addq t9, a2, ta
+ extql t4, a1, t4
+ or t0, t1, t0
+
+ extqh t5, a1, t5
+ or t2, t3, t2
+ extql t6, a1, t6
+ or t4, t5, t4
+
+ extqh t7, a1, t7
+ or t6, t7, t6
+ stq t0, 0(a0)
+ stq t2, 0(t8)
+
+ stq t4, 0(t9)
+ subq a3, 4, a3
+ stq t6, 0(ta)
+ addq ta, a2, a0
+
+ bne a3, $unaligned
+ ret
+
+ .align 4
+$aligned:
+ ldq t0, 0(a1)
+ addq a1, a2, a1
+ ldq t1, 0(a1)
+ addq a1, a2, a1
+
+ ldq t2, 0(a1)
+ addq a1, a2, a1
+ ldq t3, 0(a1)
+ addq a1, a2, a1
+
+ ldq t4, 0(a1)
+ addq a1, a2, a1
+ ldq t5, 0(a1)
+ addq a1, a2, a1
+
+ ldq t6, 0(a1)
+ addq a1, a2, a1
+ ldq t7, 0(a1)
+ addq a1, a2, a1
+
+ addq a0, a2, t8
+ stq t0, 0(a0)
+ addq t8, a2, t9
+ stq t1, 0(t8)
+
+ addq t9, a2, ta
+ stq t2, 0(t9)
+ addq ta, a2, tb
+ stq t3, 0(ta)
+
+ addq tb, a2, tc
+ stq t4, 0(tb)
+ addq tc, a2, td
+ stq t5, 0(tc)
+
+ addq td, a2, te
+ stq t6, 0(td)
+ addq te, a2, a0
+ stq t7, 0(te)
+
+ subq a3, 8, a3
+ bne a3, $aligned
+
+ ret
+ .end put_pixels_axp_asm
+
+/************************************************************************
+ * void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+ * int line_size)
+ */
+ .align 6
+ .globl put_pixels_clamped_mvi_asm
+ .ent put_pixels_clamped_mvi_asm
+put_pixels_clamped_mvi_asm:
+ .frame sp, 0, ra
+ .prologue 0
+
+#ifdef HAVE_GPROF
+ lda AT, _mcount
+ jsr AT, (AT), _mcount
+#endif
+
+ lda t8, -1
+ lda t9, 8 # loop counter
+ zap t8, 0xaa, t8 # 00ff00ff00ff00ff
+
+ .align 4
+1: ldq t0, 0(a0)
+ ldq t1, 8(a0)
+ ldq t2, 16(a0)
+ ldq t3, 24(a0)
+
+ maxsw4 t0, zero, t0
+ subq t9, 2, t9
+ maxsw4 t1, zero, t1
+ lda a0, 32(a0)
+
+ maxsw4 t2, zero, t2
+ addq a1, a2, ta
+ maxsw4 t3, zero, t3
+ minsw4 t0, t8, t0
+
+ minsw4 t1, t8, t1
+ minsw4 t2, t8, t2
+ minsw4 t3, t8, t3
+ pkwb t0, t0
+
+ pkwb t1, t1
+ pkwb t2, t2
+ pkwb t3, t3
+ stl t0, 0(a1)
+
+ stl t1, 4(a1)
+ addq ta, a2, a1
+ stl t2, 0(ta)
+ stl t3, 4(ta)
+
+ bne t9, 1b
+ ret
+ .end put_pixels_clamped_mvi_asm
+
+/************************************************************************
+ * void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
+ * int line_size)
+ */
+ .align 6
+ .globl add_pixels_clamped_mvi_asm
+ .ent add_pixels_clamped_mvi_asm
+add_pixels_clamped_mvi_asm:
+ .frame sp, 0, ra
+ .prologue 0
+
+#ifdef HAVE_GPROF
+ lda AT, _mcount
+ jsr AT, (AT), _mcount
+#endif
+
+ lda t1, -1
+ lda th, 8
+ zap t1, 0x33, tg
+ nop
+
+ srl tg, 1, t0
+ xor tg, t0, tg # 0x8000800080008000
+ zap t1, 0xaa, tf # 0x00ff00ff00ff00ff
+
+ .align 4
+1: ldl t1, 0(a1) # pix0 (try to hit cache line soon)
+ ldl t4, 4(a1) # pix1
+ addq a1, a2, te # pixels += line_size
+ ldq t0, 0(a0) # shorts0
+
+ ldl t7, 0(te) # pix2 (try to hit cache line soon)
+ ldl ta, 4(te) # pix3
+ ldq t3, 8(a0) # shorts1
+ ldq t6, 16(a0) # shorts2
+
+ ldq t9, 24(a0) # shorts3
+ unpkbw t1, t1 # 0 0 (quarter/op no.)
+ and t0, tg, t2 # 0 1
+ unpkbw t4, t4 # 1 0
+
+ bic t0, tg, t0 # 0 2
+ unpkbw t7, t7 # 2 0
+ and t3, tg, t5 # 1 1
+ addq t0, t1, t0 # 0 3
+
+ xor t0, t2, t0 # 0 4
+ unpkbw ta, ta # 3 0
+ and t6, tg, t8 # 2 1
+ maxsw4 t0, zero, t0 # 0 5
+
+ bic t3, tg, t3 # 1 2
+ bic t6, tg, t6 # 2 2
+ minsw4 t0, tf, t0 # 0 6
+ addq t3, t4, t3 # 1 3
+
+ pkwb t0, t0 # 0 7
+ xor t3, t5, t3 # 1 4
+ maxsw4 t3, zero, t3 # 1 5
+ addq t6, t7, t6 # 2 3
+
+ xor t6, t8, t6 # 2 4
+ and t9, tg, tb # 3 1
+ minsw4 t3, tf, t3 # 1 6
+ bic t9, tg, t9 # 3 2
+
+ maxsw4 t6, zero, t6 # 2 5
+ addq t9, ta, t9 # 3 3
+ stl t0, 0(a1) # 0 8
+ minsw4 t6, tf, t6 # 2 6
+
+ xor t9, tb, t9 # 3 4
+ maxsw4 t9, zero, t9 # 3 5
+ lda a0, 32(a0) # block += 16;
+ pkwb t3, t3 # 1 7
+
+ minsw4 t9, tf, t9 # 3 6
+ subq th, 2, th
+ pkwb t6, t6 # 2 7
+ pkwb t9, t9 # 3 7
+
+ stl t3, 4(a1) # 1 8
+ addq te, a2, a1 # pixels += line_size
+ stl t6, 0(te) # 2 8
+ stl t9, 4(te) # 3 8
+
+ bne th, 1b
+ ret
+ .end add_pixels_clamped_mvi_asm
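[Note, not part of the patch] put_pixels_clamped_mvi_asm above clamps four signed 16-bit coefficients at a time with maxsw4/minsw4 against 0 and 0x00ff00ff00ff00ff and packs them to bytes with pkwb. A plain-C sketch of the same semantics for an 8x8 block, assuming DCTELEM is a 16-bit integer as elsewhere in this libavcodec:

    #include <stdint.h>

    typedef int16_t DCTELEM;

    /* Scalar equivalent of put_pixels_clamped_mvi_asm: clamp each
       coefficient to 0..255 and store it as one byte of the
       destination row; advance by line_size between rows. */
    static void put_pixels_clamped_ref(const DCTELEM *block, uint8_t *pixels,
                                       int line_size)
    {
        for (int i = 0; i < 8; i++) {
            for (int j = 0; j < 8; j++) {
                int v = block[i * 8 + j];
                if (v < 0)   v = 0;
                if (v > 255) v = 255;
                pixels[j] = (uint8_t) v;
            }
            pixels += line_size;
        }
    }

add_pixels_clamped_mvi_asm does the same, except that the clamped value is the sum of the coefficient and the existing pixel rather than the coefficient alone.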
diff --git a/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c b/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
index eb1997eee..0be327079 100644
--- a/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
+++ b/src/libffmpeg/libavcodec/alpha/mpegvideo_alpha.c
@@ -23,69 +23,75 @@
extern UINT8 zigzag_end[64];
-static void dct_unquantize_h263_axp(MpegEncContext *s,
- DCTELEM *block, int n, int qscale)
+static void dct_unquantize_h263_axp(MpegEncContext *s, DCTELEM *block,
+ int n, int qscale)
{
- int i, level;
- UINT64 qmul, qadd;
+ int i, n_coeffs;
+ uint64_t qmul, qadd;
+ uint64_t correction;
+ DCTELEM *orig_block = block;
+ DCTELEM block0;
- ASM_ACCEPT_MVI;
-
if (s->mb_intra) {
- if (n < 4)
- block[0] = block[0] * s->y_dc_scale;
- else
- block[0] = block[0] * s->c_dc_scale;
- /* Catch up to aligned point. */
- qmul = s->qscale << 1;
- qadd = (s->qscale - 1) | 1;
- for (i = 1; i < 4; ++i) {
- level = block[i];
- if (level) {
- if (level < 0) {
- level = level * qmul - qadd;
- } else {
- level = level * qmul + qadd;
- }
- block[i] = level;
- }
- }
- block += 4;
- i = 60 / 4;
+ if (!s->h263_aic) {
+ if (n < 4)
+ block0 = block[0] * s->y_dc_scale;
+ else
+ block0 = block[0] * s->c_dc_scale;
+ }
+ n_coeffs = 64; // does not always use zigzag table
} else {
- i = zigzag_end[s->block_last_index[n]] / 4;
+ n_coeffs = zigzag_end[s->block_last_index[n]];
}
- qmul = s->qscale << 1;
+
+ qmul = qscale << 1;
qadd = WORD_VEC((qscale - 1) | 1);
- do {
- UINT64 levels, negmask, zeromask, corr;
- levels = ldq(block);
- if (levels == 0)
- continue;
- zeromask = cmpbge(0, levels);
- zeromask &= zeromask >> 1;
- /* Negate all negative words. */
- negmask = maxsw4(levels, WORD_VEC(0xffff)); /* negative -> ffff (-1) */
- negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */
- corr = negmask & WORD_VEC(0x0001); /* twos-complement correction */
- levels ^= negmask;
- levels += corr;
+ /* This mask kills spill from negative subwords to the next subword. */
+ correction = WORD_VEC((qmul - 1) + 1); /* multiplication / addition */
+
+ for(i = 0; i < n_coeffs; block += 4, i += 4) {
+ uint64_t levels, negmask, zeros, add;
+
+ levels = ldq(block);
+ if (levels == 0)
+ continue;
+
+#ifdef __alpha_max__
+ /* I don't think the speed difference justifies runtime
+ detection. */
+ ASM_ACCEPT_MVI;
+ negmask = maxsw4(levels, -1); /* negative -> ffff (-1) */
+ negmask = minsw4(negmask, 0); /* positive -> 0000 (0) */
+#else
+ negmask = cmpbge(WORD_VEC(0x7fff), levels);
+ negmask &= (negmask >> 1) | (1 << 7);
+ negmask = zap(-1, negmask);
+#endif
+
+ zeros = cmpbge(0, levels);
+ zeros &= zeros >> 1;
+ /* zeros |= zeros << 1 is not needed since qadd <= 255, so
+ zapping the lower byte suffices. */
- levels = levels * qmul;
- levels += zap(qadd, zeromask);
+ levels *= qmul;
+ levels -= correction & (negmask << 16);
- /* Re-negate negative words. */
- levels -= corr;
- levels ^= negmask;
+ /* Negate qadd for negative levels. */
+ add = qadd ^ negmask;
+ add += WORD_VEC(0x0001) & negmask;
+ /* Set qadd to 0 for levels == 0. */
+ add = zap(add, zeros);
- stq(levels, block);
- } while (block += 4, --i);
+ levels += add;
+
+ stq(levels, block);
+ }
+
+ if (s->mb_intra && !s->h263_aic)
+ orig_block[0] = block0;
}
void MPV_common_init_axp(MpegEncContext *s)
{
- if (amask(AMASK_MVI) == 0) {
- if (s->out_format == FMT_H263)
- s->dct_unquantize = dct_unquantize_h263_axp;
- }
+ s->dct_unquantize_h263 = dct_unquantize_h263_axp;
}
diff --git a/src/libffmpeg/libavcodec/alpha/regdef.h b/src/libffmpeg/libavcodec/alpha/regdef.h
new file mode 100644
index 000000000..7e7fc06b2
--- /dev/null
+++ b/src/libffmpeg/libavcodec/alpha/regdef.h
@@ -0,0 +1,45 @@
+/* Some BSDs don't seem to have regdef.h... sigh */
+#ifndef alpha_regdef_h
+#define alpha_regdef_h
+
+#define v0 $0 /* function return value */
+
+#define t0 $1 /* temporary registers (caller-saved) */
+#define t1 $2
+#define t2 $3
+#define t3 $4
+#define t4 $5
+#define t5 $6
+#define t6 $7
+#define t7 $8
+
+#define s0 $9 /* saved-registers (callee-saved registers) */
+#define s1 $10
+#define s2 $11
+#define s3 $12
+#define s4 $13
+#define s5 $14
+#define s6 $15
+#define fp s6 /* frame-pointer (s6 in frame-less procedures) */
+
+#define a0 $16 /* argument registers (caller-saved) */
+#define a1 $17
+#define a2 $18
+#define a3 $19
+#define a4 $20
+#define a5 $21
+
+#define t8 $22 /* more temps (caller-saved) */
+#define t9 $23
+#define t10 $24
+#define t11 $25
+#define ra $26 /* return address register */
+#define t12 $27
+
+#define pv t12 /* procedure-variable register */
+#define AT $at /* assembler temporary */
+#define gp $29 /* global pointer */
+#define sp $30 /* stack pointer */
+#define zero $31 /* reads as zero, writes are noops */
+
+#endif /* alpha_regdef_h */